Commit 3f0de7ddf2ecadd35cfb1915abe205be37a89e97

Authored by Laila Saadatifard
1 parent e0edbe13

upload the vote and update_dir kernels that are used bounding box

stim/cuda/ivote/david_update_dir_global.cuh deleted
1   -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
2   -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
3   -
4   -# include <iostream>
5   -# include <cuda.h>
6   -#include <stim/cuda/cudatools.h>
7   -#include <stim/cuda/sharedmem.cuh>
8   -#include <math.h>
9   -#include "cpyToshare.cuh"
10   -
11   -#define RMAX_TEST 8
12   -
13   -namespace stim{
14   - namespace cuda{
15   -
16   - // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
17   - template<typename T>
18   - __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
19   - extern __shared__ T atan2_table[];
20   -
21   - //calculate the start point for this block
22   - //int bxi = blockIdx.x * blockDim.x;
23   -
24   - stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x);
25   -
26   - __syncthreads();
27   -
28   - // calculate the 2D coordinates for this current thread.
29   - //int xi = bxi + threadIdx.x;
30   - int xi = blockIdx.x * blockDim.x + threadIdx.x;
31   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
32   - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel
33   -
34   - int i = yi * x + xi; // convert 2D coordinates to 1D
35   -
36   - float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch
37   - gpuDir[i] = 0; //initialize the vote direction to zero
38   - float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter
39   - int id_x = 0; // define two local variables for the x and y position of the maximum
40   - int id_y = 0;
41   -
42   - int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter
43   - int rmax_sq = rmax * rmax;
44   - int tx_rmax = threadIdx.x + rmax;
45   - float atan_angle;
46   - float vote_c;
47   - unsigned int ind_t;
48   - for(int yr = -rmax; yr <= rmax; yr++){ //for each counter in the y direction
49   - if (yi+yr >= 0 && yi + yr < y){ //if the counter exists (we aren't looking outside of the image)
50   - for(int xr = -rmax; xr <= rmax; xr++){ //for each counter in the x direction
51   - if((xr * xr + yr *yr)< rmax_sq){ //if the counter is within range of the voter
52   -
53   - ind_t = (rmax - yr) * x_table + rmax - xr; //calculate the index to the atan2 table
54   - atan_angle = atan2_table[ind_t]; //retrieve the direction vector from the table
55   -
56   - //atan_angle = atan2((float)yr, (float)xr);
57   -
58   - if (abs(atan_angle - theta) <phi){ // check if the current pixel is located in the voting angle of this voter.
59   - vote_c = gpuVote[(yi+yr)*x + (xi+xr)]; // find the vote value for the current counter
60   - if(vote_c>max) { // compare the vote value of this pixel with the max value to find the maxima and its index.
61   - max = vote_c;
62   - id_x = xr;
63   - id_y = yr;
64   - }
65   - }
66   - }
67   - }
68   - }
69   - }
70   -
71   - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
72   - float new_angle = gpuTable[ind_m];
73   -
74   - if(xi < x && yi < y)
75   - gpuDir[i] = new_angle;
76   - } //end kernel
77   -
78   - // this kernel updates the gradient direction by the calculated voting direction.
79   - template<typename T>
80   - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
81   -
82   - // calculate the 2D coordinates for this current thread.
83   - int xi = blockIdx.x * blockDim.x + threadIdx.x;
84   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
85   -
86   - // convert 2D coordinates to 1D
87   - int i = yi * x + xi;
88   -
89   - //update the gradient image with the vote direction
90   - gpuGrad[2*i] = gpuDir[i];
91   - }
92   -
93   - template<typename T>
94   - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
95   -
96   -
97   -
98   - //calculate the number of bytes in the array
99   - unsigned int bytes = x * y * sizeof(T);
100   -
101   - unsigned int max_threads = stim::maxThreadsPerBlock();
102   -
103   - dim3 threads(sqrt(max_threads), sqrt(max_threads));
104   - dim3 blocks(x/threads.x + 1, y/threads.y + 1);
105   -
106   -
107   -
108   - // allocate space on the GPU for the updated vote direction
109   - T* gpuDir;
110   - cudaMalloc(&gpuDir, bytes);
111   -
112   - size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2);
113   - std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl;
114   -
115   - //call the kernel to calculate the new voting direction
116   - cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
117   -
118   - //call the kernel to update the gradient direction
119   - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
120   -
121   - //free allocated memory
122   - cudaFree(gpuDir);
123   -
124   - }
125   -
126   - template<typename T>
127   - void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
128   -
129   - //calculate the number of bytes in the array
130   - unsigned int bytes = x * y * sizeof(T);
131   -
132   - //calculate the number of bytes in the atan2 table
133   - unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
134   -
135   - //allocate space on the GPU for the Vote Image
136   - T* gpuVote;
137   - cudaMalloc(&gpuVote, bytes);
138   -
139   - //copy the input vote image to the GPU
140   - HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
141   -
142   - //allocate space on the GPU for the input Gradient image
143   - T* gpuGrad;
144   - HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
145   -
146   - //copy the Gradient data to the GPU
147   - HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
148   -
149   - //allocate space on the GPU for the atan2 table
150   - T* gpuTable;
151   - HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
152   -
153   - //copy the atan2 values to the GPU
154   - HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
155   -
156   - //call the GPU version of the update direction function
157   - gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
158   -
159   - //copy the new gradient image back to the CPU
160   - cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
161   -
162   - //free allocated memory
163   - cudaFree(gpuTable);
164   - cudaFree(gpuVote);
165   - cudaFree(gpuGrad);
166   - }
167   -
168   - }
169   -}
170   -
171   -#endif
172 0 \ No newline at end of file
stim/cuda/ivote/re_sample.cuh 0 โ†’ 100644
  1 +#ifndef STIM_CUDA_RE_SAMPLE_H
  2 +#define STIM_CUDA_RE_SAMPLE_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/templates/gaussian_blur.cuh>
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + template<typename T>
  13 + __global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
  14 +
  15 + unsigned int sigma_ds = 1/resize;
  16 + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  17 + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  18 +
  19 +
  20 + // calculate the 2D coordinates for this current thread.
  21 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  22 + int yi = blockIdx.y;
  23 + // convert 2D coordinates to 1D
  24 + int i = yi * x + xi;
  25 +
  26 + if(xi< x && yi< y){
  27 + if(xi%sigma_ds==0){
  28 + if(yi%sigma_ds==0){
  29 + gpuI[i] = gpuI0[(yi/sigma_ds)*x_ds + xi/sigma_ds];
  30 + }
  31 + }
  32 + else gpuI[i] = 0;
  33 +
  34 + //int x_org = xi * sigma_ds ;
  35 + //int y_org = yi * sigma_ds ;
  36 + //int i_org = y_org * x + x_org;
  37 + //gpuI[i] = gpuI0[i_org];
  38 + }
  39 +
  40 + }
  41 +
  42 +
  43 + /// Applies a Gaussian blur to a 2D image stored on the GPU
  44 + template<typename T>
  45 + void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
  46 +
  47 +
  48 + //unsigned int sigma_ds = 1/resize;
  49 + //unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  50 + //unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  51 +
  52 + //get the number of pixels in the image
  53 + //unsigned int pixels_ds = x_ds * y_ds;
  54 +
  55 + unsigned int max_threads = stim::maxThreadsPerBlock();
  56 + dim3 threads(max_threads, 1);
  57 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  58 +
  59 + //stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y);
  60 +
  61 + //resample the image
  62 + cuda_re_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
  63 +
  64 + }
  65 +
  66 + /// Applies a Gaussian blur to a 2D image stored on the CPU
  67 + template<typename T>
  68 + void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){
  69 +
  70 + //get the number of pixels in the image
  71 + unsigned int pixels = x*y;
  72 + unsigned int bytes = sizeof(T) * pixels;
  73 +
  74 + unsigned int sigma_ds = 1/resize;
  75 + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  76 + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  77 + unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
  78 +
  79 +
  80 +
  81 + //allocate space on the GPU for the original image
  82 + T* gpuI0;
  83 + cudaMalloc(&gpuI0, bytes_ds);
  84 +
  85 +
  86 + //copy the image data to the GPU
  87 + cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice);
  88 +
  89 + //allocate space on the GPU for the down sampled image
  90 + T* gpuI;
  91 + cudaMalloc(&gpuI, bytes);
  92 +
  93 + //run the GPU-based version of the algorithm
  94 + gpu_re_sample<T>(gpuI, gpuI0, resize, x, y);
  95 +
  96 + //copy the image data to the GPU
  97 + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
  98 +
  99 + cudaFree(gpuI0);
  100 + cudeFree(gpuI);
  101 + }
  102 +
  103 + }
  104 +}
  105 +
  106 +#endif
0 107 \ No newline at end of file
... ...
stim/cuda/ivote/update_dir_global.cuh renamed to stim/cuda/ivote/update_dir_bb.cuh
1   -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
2   -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
  1 +#ifndef STIM_CUDA_UPDATE_DIR_BB_H
  2 +#define STIM_CUDA_UPDATE_DIR_BB_H
3 3  
4 4 # include <iostream>
5 5 # include <cuda.h>
... ...
stim/cuda/ivote/vote_atomic_global.cuh renamed to stim/cuda/ivote/vote_atomic_bb.cuh
1   -#ifndef STIM_CUDA_VOTE_ATOMIC_GLOBAL_H
2   -#define STIM_CUDA_VOTE_ATOMIC_GLOBAL_H
  1 +#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
  2 +#define STIM_CUDA_VOTE_ATOMIC_BB_H
3 3  
4 4 # include <iostream>
5 5 # include <cuda.h>
... ...
stim/cuda/ivote_atomic.cuh renamed to stim/cuda/ivote_atomic_bb.cuh
1   -#ifndef STIM_CUDA_IVOTE_ATOMIC_H
2   -#define STIM_CUDA_IVOTE_ATOMIC_H
  1 +#ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H
  2 +#define STIM_CUDA_IVOTE_ATOMIC_BB_H
3 3  
4 4 #include <stim/cuda/ivote/down_sample.cuh>
5 5 #include <stim/cuda/ivote/local_max.cuh>
6   -#include <stim/cuda/ivote/update_dir_global.cuh>
7   -//#include <stim/cuda/ivote/vote_shared_32-32.cuh>
8   -#include <stim/cuda/ivote/vote_atomic_global.cuh>
9   -//#include <stim/cuda/ivote/re_sample.cuh>
  6 +#include <stim/cuda/ivote/update_dir_bb.cuh>
  7 +#include <stim/cuda/ivote/vote_atomic_bb.cuh>
  8 +
10 9 namespace stim{
11 10 namespace cuda{
12 11  
... ...