Commit 3f0de7ddf2ecadd35cfb1915abe205be37a89e97

Authored by Laila Saadatifard
1 parent e0edbe13

upload the vote and update_dir kernels that are used bounding box

stim/cuda/ivote/david_update_dir_global.cuh deleted
1 -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H  
2 -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H  
3 -  
4 -# include <iostream>  
5 -# include <cuda.h>  
6 -#include <stim/cuda/cudatools.h>  
7 -#include <stim/cuda/sharedmem.cuh>  
8 -#include <math.h>  
9 -#include "cpyToshare.cuh"  
10 -  
11 -#define RMAX_TEST 8  
12 -  
13 -namespace stim{  
14 - namespace cuda{  
15 -  
16 - // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.  
17 - template<typename T>  
18 - __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){  
19 - extern __shared__ T atan2_table[];  
20 -  
21 - //calculate the start point for this block  
22 - //int bxi = blockIdx.x * blockDim.x;  
23 -  
24 - stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x);  
25 -  
26 - __syncthreads();  
27 -  
28 - // calculate the 2D coordinates for this current thread.  
29 - //int xi = bxi + threadIdx.x;  
30 - int xi = blockIdx.x * blockDim.x + threadIdx.x;  
31 - int yi = blockIdx.y * blockDim.y + threadIdx.y;  
32 - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel  
33 -  
34 - int i = yi * x + xi; // convert 2D coordinates to 1D  
35 -  
36 - float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch  
37 - gpuDir[i] = 0; //initialize the vote direction to zero  
38 - float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter  
39 - int id_x = 0; // define two local variables for the x and y position of the maximum  
40 - int id_y = 0;  
41 -  
42 - int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter  
43 - int rmax_sq = rmax * rmax;  
44 - int tx_rmax = threadIdx.x + rmax;  
45 - float atan_angle;  
46 - float vote_c;  
47 - unsigned int ind_t;  
48 - for(int yr = -rmax; yr <= rmax; yr++){ //for each counter in the y direction  
49 - if (yi+yr >= 0 && yi + yr < y){ //if the counter exists (we aren't looking outside of the image)  
50 - for(int xr = -rmax; xr <= rmax; xr++){ //for each counter in the x direction  
51 - if((xr * xr + yr *yr)< rmax_sq){ //if the counter is within range of the voter  
52 -  
53 - ind_t = (rmax - yr) * x_table + rmax - xr; //calculate the index to the atan2 table  
54 - atan_angle = atan2_table[ind_t]; //retrieve the direction vector from the table  
55 -  
56 - //atan_angle = atan2((float)yr, (float)xr);  
57 -  
58 - if (abs(atan_angle - theta) <phi){ // check if the current pixel is located in the voting angle of this voter.  
59 - vote_c = gpuVote[(yi+yr)*x + (xi+xr)]; // find the vote value for the current counter  
60 - if(vote_c>max) { // compare the vote value of this pixel with the max value to find the maxima and its index.  
61 - max = vote_c;  
62 - id_x = xr;  
63 - id_y = yr;  
64 - }  
65 - }  
66 - }  
67 - }  
68 - }  
69 - }  
70 -  
71 - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);  
72 - float new_angle = gpuTable[ind_m];  
73 -  
74 - if(xi < x && yi < y)  
75 - gpuDir[i] = new_angle;  
76 - } //end kernel  
77 -  
78 - // this kernel updates the gradient direction by the calculated voting direction.  
79 - template<typename T>  
80 - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){  
81 -  
82 - // calculate the 2D coordinates for this current thread.  
83 - int xi = blockIdx.x * blockDim.x + threadIdx.x;  
84 - int yi = blockIdx.y * blockDim.y + threadIdx.y;  
85 -  
86 - // convert 2D coordinates to 1D  
87 - int i = yi * x + xi;  
88 -  
89 - //update the gradient image with the vote direction  
90 - gpuGrad[2*i] = gpuDir[i];  
91 - }  
92 -  
93 - template<typename T>  
94 - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){  
95 -  
96 -  
97 -  
98 - //calculate the number of bytes in the array  
99 - unsigned int bytes = x * y * sizeof(T);  
100 -  
101 - unsigned int max_threads = stim::maxThreadsPerBlock();  
102 -  
103 - dim3 threads(sqrt(max_threads), sqrt(max_threads));  
104 - dim3 blocks(x/threads.x + 1, y/threads.y + 1);  
105 -  
106 -  
107 -  
108 - // allocate space on the GPU for the updated vote direction  
109 - T* gpuDir;  
110 - cudaMalloc(&gpuDir, bytes);  
111 -  
112 - size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2);  
113 - std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl;  
114 -  
115 - //call the kernel to calculate the new voting direction  
116 - cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);  
117 -  
118 - //call the kernel to update the gradient direction  
119 - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);  
120 -  
121 - //free allocated memory  
122 - cudaFree(gpuDir);  
123 -  
124 - }  
125 -  
126 - template<typename T>  
127 - void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){  
128 -  
129 - //calculate the number of bytes in the array  
130 - unsigned int bytes = x * y * sizeof(T);  
131 -  
132 - //calculate the number of bytes in the atan2 table  
133 - unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);  
134 -  
135 - //allocate space on the GPU for the Vote Image  
136 - T* gpuVote;  
137 - cudaMalloc(&gpuVote, bytes);  
138 -  
139 - //copy the input vote image to the GPU  
140 - HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));  
141 -  
142 - //allocate space on the GPU for the input Gradient image  
143 - T* gpuGrad;  
144 - HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));  
145 -  
146 - //copy the Gradient data to the GPU  
147 - HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));  
148 -  
149 - //allocate space on the GPU for the atan2 table  
150 - T* gpuTable;  
151 - HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));  
152 -  
153 - //copy the atan2 values to the GPU  
154 - HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));  
155 -  
156 - //call the GPU version of the update direction function  
157 - gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);  
158 -  
159 - //copy the new gradient image back to the CPU  
160 - cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;  
161 -  
162 - //free allocated memory  
163 - cudaFree(gpuTable);  
164 - cudaFree(gpuVote);  
165 - cudaFree(gpuGrad);  
166 - }  
167 -  
168 - }  
169 -}  
170 -  
171 -#endif  
172 \ No newline at end of file 0 \ No newline at end of file
stim/cuda/ivote/re_sample.cuh 0 โ†’ 100644
  1 +#ifndef STIM_CUDA_RE_SAMPLE_H
  2 +#define STIM_CUDA_RE_SAMPLE_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/templates/gaussian_blur.cuh>
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + template<typename T>
  13 + __global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
  14 +
  15 + unsigned int sigma_ds = 1/resize;
  16 + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  17 + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  18 +
  19 +
  20 + // calculate the 2D coordinates for this current thread.
  21 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  22 + int yi = blockIdx.y;
  23 + // convert 2D coordinates to 1D
  24 + int i = yi * x + xi;
  25 +
  26 + if(xi< x && yi< y){
  27 + if(xi%sigma_ds==0){
  28 + if(yi%sigma_ds==0){
  29 + gpuI[i] = gpuI0[(yi/sigma_ds)*x_ds + xi/sigma_ds];
  30 + }
  31 + }
  32 + else gpuI[i] = 0;
  33 +
  34 + //int x_org = xi * sigma_ds ;
  35 + //int y_org = yi * sigma_ds ;
  36 + //int i_org = y_org * x + x_org;
  37 + //gpuI[i] = gpuI0[i_org];
  38 + }
  39 +
  40 + }
  41 +
  42 +
  43 + /// Applies a Gaussian blur to a 2D image stored on the GPU
  44 + template<typename T>
  45 + void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
  46 +
  47 +
  48 + //unsigned int sigma_ds = 1/resize;
  49 + //unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  50 + //unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  51 +
  52 + //get the number of pixels in the image
  53 + //unsigned int pixels_ds = x_ds * y_ds;
  54 +
  55 + unsigned int max_threads = stim::maxThreadsPerBlock();
  56 + dim3 threads(max_threads, 1);
  57 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  58 +
  59 + //stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y);
  60 +
  61 + //resample the image
  62 + cuda_re_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
  63 +
  64 + }
  65 +
  66 + /// Applies a Gaussian blur to a 2D image stored on the CPU
  67 + template<typename T>
  68 + void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){
  69 +
  70 + //get the number of pixels in the image
  71 + unsigned int pixels = x*y;
  72 + unsigned int bytes = sizeof(T) * pixels;
  73 +
  74 + unsigned int sigma_ds = 1/resize;
  75 + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  76 + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  77 + unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
  78 +
  79 +
  80 +
  81 + //allocate space on the GPU for the original image
  82 + T* gpuI0;
  83 + cudaMalloc(&gpuI0, bytes_ds);
  84 +
  85 +
  86 + //copy the image data to the GPU
  87 + cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice);
  88 +
  89 + //allocate space on the GPU for the down sampled image
  90 + T* gpuI;
  91 + cudaMalloc(&gpuI, bytes);
  92 +
  93 + //run the GPU-based version of the algorithm
  94 + gpu_re_sample<T>(gpuI, gpuI0, resize, x, y);
  95 +
  96 + //copy the image data to the GPU
  97 + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
  98 +
  99 + cudaFree(gpuI0);
  100 + cudeFree(gpuI);
  101 + }
  102 +
  103 + }
  104 +}
  105 +
  106 +#endif
0 \ No newline at end of file 107 \ No newline at end of file
stim/cuda/ivote/update_dir_global.cuh renamed to stim/cuda/ivote/update_dir_bb.cuh
1 -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H  
2 -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H 1 +#ifndef STIM_CUDA_UPDATE_DIR_BB_H
  2 +#define STIM_CUDA_UPDATE_DIR_BB_H
3 3
4 # include <iostream> 4 # include <iostream>
5 # include <cuda.h> 5 # include <cuda.h>
stim/cuda/ivote/vote_atomic_global.cuh renamed to stim/cuda/ivote/vote_atomic_bb.cuh
1 -#ifndef STIM_CUDA_VOTE_ATOMIC_GLOBAL_H  
2 -#define STIM_CUDA_VOTE_ATOMIC_GLOBAL_H 1 +#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
  2 +#define STIM_CUDA_VOTE_ATOMIC_BB_H
3 3
4 # include <iostream> 4 # include <iostream>
5 # include <cuda.h> 5 # include <cuda.h>
stim/cuda/ivote_atomic.cuh renamed to stim/cuda/ivote_atomic_bb.cuh
1 -#ifndef STIM_CUDA_IVOTE_ATOMIC_H  
2 -#define STIM_CUDA_IVOTE_ATOMIC_H 1 +#ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H
  2 +#define STIM_CUDA_IVOTE_ATOMIC_BB_H
3 3
4 #include <stim/cuda/ivote/down_sample.cuh> 4 #include <stim/cuda/ivote/down_sample.cuh>
5 #include <stim/cuda/ivote/local_max.cuh> 5 #include <stim/cuda/ivote/local_max.cuh>
6 -#include <stim/cuda/ivote/update_dir_global.cuh>  
7 -//#include <stim/cuda/ivote/vote_shared_32-32.cuh>  
8 -#include <stim/cuda/ivote/vote_atomic_global.cuh>  
9 -//#include <stim/cuda/ivote/re_sample.cuh> 6 +#include <stim/cuda/ivote/update_dir_bb.cuh>
  7 +#include <stim/cuda/ivote/vote_atomic_bb.cuh>
  8 +
10 namespace stim{ 9 namespace stim{
11 namespace cuda{ 10 namespace cuda{
12 11