Commit 3f0de7ddf2ecadd35cfb1915abe205be37a89e97
1 parent
e0edbe13
upload the vote and update_dir kernels that are used bounding box
Showing
5 changed files
with
115 additions
and
181 deletions
Show diff stats
stim/cuda/ivote/david_update_dir_global.cuh deleted
1 | -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H | ||
2 | -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H | ||
3 | - | ||
4 | -# include <iostream> | ||
5 | -# include <cuda.h> | ||
6 | -#include <stim/cuda/cudatools.h> | ||
7 | -#include <stim/cuda/sharedmem.cuh> | ||
8 | -#include <math.h> | ||
9 | -#include "cpyToshare.cuh" | ||
10 | - | ||
11 | -#define RMAX_TEST 8 | ||
12 | - | ||
13 | -namespace stim{ | ||
14 | - namespace cuda{ | ||
15 | - | ||
16 | - // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. | ||
17 | - template<typename T> | ||
18 | - __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | ||
19 | - extern __shared__ T atan2_table[]; | ||
20 | - | ||
21 | - //calculate the start point for this block | ||
22 | - //int bxi = blockIdx.x * blockDim.x; | ||
23 | - | ||
24 | - stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x); | ||
25 | - | ||
26 | - __syncthreads(); | ||
27 | - | ||
28 | - // calculate the 2D coordinates for this current thread. | ||
29 | - //int xi = bxi + threadIdx.x; | ||
30 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
31 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
32 | - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel | ||
33 | - | ||
34 | - int i = yi * x + xi; // convert 2D coordinates to 1D | ||
35 | - | ||
36 | - float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch | ||
37 | - gpuDir[i] = 0; //initialize the vote direction to zero | ||
38 | - float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter | ||
39 | - int id_x = 0; // define two local variables for the x and y position of the maximum | ||
40 | - int id_y = 0; | ||
41 | - | ||
42 | - int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter | ||
43 | - int rmax_sq = rmax * rmax; | ||
44 | - int tx_rmax = threadIdx.x + rmax; | ||
45 | - float atan_angle; | ||
46 | - float vote_c; | ||
47 | - unsigned int ind_t; | ||
48 | - for(int yr = -rmax; yr <= rmax; yr++){ //for each counter in the y direction | ||
49 | - if (yi+yr >= 0 && yi + yr < y){ //if the counter exists (we aren't looking outside of the image) | ||
50 | - for(int xr = -rmax; xr <= rmax; xr++){ //for each counter in the x direction | ||
51 | - if((xr * xr + yr *yr)< rmax_sq){ //if the counter is within range of the voter | ||
52 | - | ||
53 | - ind_t = (rmax - yr) * x_table + rmax - xr; //calculate the index to the atan2 table | ||
54 | - atan_angle = atan2_table[ind_t]; //retrieve the direction vector from the table | ||
55 | - | ||
56 | - //atan_angle = atan2((float)yr, (float)xr); | ||
57 | - | ||
58 | - if (abs(atan_angle - theta) <phi){ // check if the current pixel is located in the voting angle of this voter. | ||
59 | - vote_c = gpuVote[(yi+yr)*x + (xi+xr)]; // find the vote value for the current counter | ||
60 | - if(vote_c>max) { // compare the vote value of this pixel with the max value to find the maxima and its index. | ||
61 | - max = vote_c; | ||
62 | - id_x = xr; | ||
63 | - id_y = yr; | ||
64 | - } | ||
65 | - } | ||
66 | - } | ||
67 | - } | ||
68 | - } | ||
69 | - } | ||
70 | - | ||
71 | - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | ||
72 | - float new_angle = gpuTable[ind_m]; | ||
73 | - | ||
74 | - if(xi < x && yi < y) | ||
75 | - gpuDir[i] = new_angle; | ||
76 | - } //end kernel | ||
77 | - | ||
78 | - // this kernel updates the gradient direction by the calculated voting direction. | ||
79 | - template<typename T> | ||
80 | - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ | ||
81 | - | ||
82 | - // calculate the 2D coordinates for this current thread. | ||
83 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
84 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
85 | - | ||
86 | - // convert 2D coordinates to 1D | ||
87 | - int i = yi * x + xi; | ||
88 | - | ||
89 | - //update the gradient image with the vote direction | ||
90 | - gpuGrad[2*i] = gpuDir[i]; | ||
91 | - } | ||
92 | - | ||
93 | - template<typename T> | ||
94 | - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
95 | - | ||
96 | - | ||
97 | - | ||
98 | - //calculate the number of bytes in the array | ||
99 | - unsigned int bytes = x * y * sizeof(T); | ||
100 | - | ||
101 | - unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
102 | - | ||
103 | - dim3 threads(sqrt(max_threads), sqrt(max_threads)); | ||
104 | - dim3 blocks(x/threads.x + 1, y/threads.y + 1); | ||
105 | - | ||
106 | - | ||
107 | - | ||
108 | - // allocate space on the GPU for the updated vote direction | ||
109 | - T* gpuDir; | ||
110 | - cudaMalloc(&gpuDir, bytes); | ||
111 | - | ||
112 | - size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2); | ||
113 | - std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl; | ||
114 | - | ||
115 | - //call the kernel to calculate the new voting direction | ||
116 | - cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
117 | - | ||
118 | - //call the kernel to update the gradient direction | ||
119 | - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | ||
120 | - | ||
121 | - //free allocated memory | ||
122 | - cudaFree(gpuDir); | ||
123 | - | ||
124 | - } | ||
125 | - | ||
126 | - template<typename T> | ||
127 | - void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
128 | - | ||
129 | - //calculate the number of bytes in the array | ||
130 | - unsigned int bytes = x * y * sizeof(T); | ||
131 | - | ||
132 | - //calculate the number of bytes in the atan2 table | ||
133 | - unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | ||
134 | - | ||
135 | - //allocate space on the GPU for the Vote Image | ||
136 | - T* gpuVote; | ||
137 | - cudaMalloc(&gpuVote, bytes); | ||
138 | - | ||
139 | - //copy the input vote image to the GPU | ||
140 | - HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); | ||
141 | - | ||
142 | - //allocate space on the GPU for the input Gradient image | ||
143 | - T* gpuGrad; | ||
144 | - HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | ||
145 | - | ||
146 | - //copy the Gradient data to the GPU | ||
147 | - HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | ||
148 | - | ||
149 | - //allocate space on the GPU for the atan2 table | ||
150 | - T* gpuTable; | ||
151 | - HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | ||
152 | - | ||
153 | - //copy the atan2 values to the GPU | ||
154 | - HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | ||
155 | - | ||
156 | - //call the GPU version of the update direction function | ||
157 | - gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
158 | - | ||
159 | - //copy the new gradient image back to the CPU | ||
160 | - cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; | ||
161 | - | ||
162 | - //free allocated memory | ||
163 | - cudaFree(gpuTable); | ||
164 | - cudaFree(gpuVote); | ||
165 | - cudaFree(gpuGrad); | ||
166 | - } | ||
167 | - | ||
168 | - } | ||
169 | -} | ||
170 | - | ||
171 | -#endif | ||
172 | \ No newline at end of file | 0 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_RE_SAMPLE_H | ||
2 | +#define STIM_CUDA_RE_SAMPLE_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <stim/cuda/cudatools.h> | ||
7 | +#include <stim/cuda/templates/gaussian_blur.cuh> | ||
8 | + | ||
9 | +namespace stim{ | ||
10 | + namespace cuda{ | ||
11 | + | ||
12 | + template<typename T> | ||
13 | + __global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ | ||
14 | + | ||
15 | + unsigned int sigma_ds = 1/resize; | ||
16 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | ||
17 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | ||
18 | + | ||
19 | + | ||
20 | + // calculate the 2D coordinates for this current thread. | ||
21 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
22 | + int yi = blockIdx.y; | ||
23 | + // convert 2D coordinates to 1D | ||
24 | + int i = yi * x + xi; | ||
25 | + | ||
26 | + if(xi< x && yi< y){ | ||
27 | + if(xi%sigma_ds==0){ | ||
28 | + if(yi%sigma_ds==0){ | ||
29 | + gpuI[i] = gpuI0[(yi/sigma_ds)*x_ds + xi/sigma_ds]; | ||
30 | + } | ||
31 | + } | ||
32 | + else gpuI[i] = 0; | ||
33 | + | ||
34 | + //int x_org = xi * sigma_ds ; | ||
35 | + //int y_org = yi * sigma_ds ; | ||
36 | + //int i_org = y_org * x + x_org; | ||
37 | + //gpuI[i] = gpuI0[i_org]; | ||
38 | + } | ||
39 | + | ||
40 | + } | ||
41 | + | ||
42 | + | ||
43 | + /// Applies a Gaussian blur to a 2D image stored on the GPU | ||
44 | + template<typename T> | ||
45 | + void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ | ||
46 | + | ||
47 | + | ||
48 | + //unsigned int sigma_ds = 1/resize; | ||
49 | + //unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | ||
50 | + //unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | ||
51 | + | ||
52 | + //get the number of pixels in the image | ||
53 | + //unsigned int pixels_ds = x_ds * y_ds; | ||
54 | + | ||
55 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
56 | + dim3 threads(max_threads, 1); | ||
57 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | ||
58 | + | ||
59 | + //stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y); | ||
60 | + | ||
61 | + //resample the image | ||
62 | + cuda_re_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y); | ||
63 | + | ||
64 | + } | ||
65 | + | ||
66 | + /// Applies a Gaussian blur to a 2D image stored on the CPU | ||
67 | + template<typename T> | ||
68 | + void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){ | ||
69 | + | ||
70 | + //get the number of pixels in the image | ||
71 | + unsigned int pixels = x*y; | ||
72 | + unsigned int bytes = sizeof(T) * pixels; | ||
73 | + | ||
74 | + unsigned int sigma_ds = 1/resize; | ||
75 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | ||
76 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | ||
77 | + unsigned int bytes_ds = sizeof(T) * x_ds * y_ds; | ||
78 | + | ||
79 | + | ||
80 | + | ||
81 | + //allocate space on the GPU for the original image | ||
82 | + T* gpuI0; | ||
83 | + cudaMalloc(&gpuI0, bytes_ds); | ||
84 | + | ||
85 | + | ||
86 | + //copy the image data to the GPU | ||
87 | + cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice); | ||
88 | + | ||
89 | + //allocate space on the GPU for the down sampled image | ||
90 | + T* gpuI; | ||
91 | + cudaMalloc(&gpuI, bytes); | ||
92 | + | ||
93 | + //run the GPU-based version of the algorithm | ||
94 | + gpu_re_sample<T>(gpuI, gpuI0, resize, x, y); | ||
95 | + | ||
96 | + //copy the image data to the GPU | ||
97 | + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice); | ||
98 | + | ||
99 | + cudaFree(gpuI0); | ||
100 | + cudeFree(gpuI); | ||
101 | + } | ||
102 | + | ||
103 | + } | ||
104 | +} | ||
105 | + | ||
106 | +#endif | ||
0 | \ No newline at end of file | 107 | \ No newline at end of file |
stim/cuda/ivote/update_dir_global.cuh renamed to stim/cuda/ivote/update_dir_bb.cuh
stim/cuda/ivote/vote_atomic_global.cuh renamed to stim/cuda/ivote/vote_atomic_bb.cuh
stim/cuda/ivote_atomic.cuh renamed to stim/cuda/ivote_atomic_bb.cuh
1 | -#ifndef STIM_CUDA_IVOTE_ATOMIC_H | ||
2 | -#define STIM_CUDA_IVOTE_ATOMIC_H | 1 | +#ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H |
2 | +#define STIM_CUDA_IVOTE_ATOMIC_BB_H | ||
3 | 3 | ||
4 | #include <stim/cuda/ivote/down_sample.cuh> | 4 | #include <stim/cuda/ivote/down_sample.cuh> |
5 | #include <stim/cuda/ivote/local_max.cuh> | 5 | #include <stim/cuda/ivote/local_max.cuh> |
6 | -#include <stim/cuda/ivote/update_dir_global.cuh> | ||
7 | -//#include <stim/cuda/ivote/vote_shared_32-32.cuh> | ||
8 | -#include <stim/cuda/ivote/vote_atomic_global.cuh> | ||
9 | -//#include <stim/cuda/ivote/re_sample.cuh> | 6 | +#include <stim/cuda/ivote/update_dir_bb.cuh> |
7 | +#include <stim/cuda/ivote/vote_atomic_bb.cuh> | ||
8 | + | ||
10 | namespace stim{ | 9 | namespace stim{ |
11 | namespace cuda{ | 10 | namespace cuda{ |
12 | 11 |