Merged
Merge Request #4
·
created by
Ivote_bounding_box
I found out the ivote directory has useless functions, so I removed them, and also I had not push the last changes for vote kernel that used bounding box.
From
ivote_bb
into
master
-
Status changed to closed
-
Status changed to reopened
-
mentioned in commit 5b108e44feef75db062958f2b18e87abeb8d5010
-
Status changed to merged
-
…es for vote and update dir, and shared memory and atomic operation
Showing
9 changed files
Show diff stats
stim/cuda/ivote/local_max.cuh
@@ -14,7 +14,7 @@ namespace stim{ | @@ -14,7 +14,7 @@ namespace stim{ | ||
14 | 14 | ||
15 | // calculate the 2D coordinates for this current thread. | 15 | // calculate the 2D coordinates for this current thread. |
16 | int xi = blockIdx.x * blockDim.x + threadIdx.x; | 16 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
17 | - int yi = blockIdx.y; | 17 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; |
18 | 18 | ||
19 | if(xi >= x || yi >= y) | 19 | if(xi >= x || yi >= y) |
20 | return; | 20 | return; |
@@ -63,8 +63,10 @@ namespace stim{ | @@ -63,8 +63,10 @@ namespace stim{ | ||
63 | void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ | 63 | void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ |
64 | 64 | ||
65 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 65 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
66 | - dim3 threads(max_threads, 1); | ||
67 | - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | 66 | + /*dim3 threads(max_threads, 1); |
67 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/ | ||
68 | + dim3 threads( sqrt(max_threads), sqrt(max_threads) ); | ||
69 | + dim3 blocks(x/threads.x + 1, y/threads.y + 1); | ||
68 | 70 | ||
69 | //call the kernel to find the local maximum. | 71 | //call the kernel to find the local maximum. |
70 | cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); | 72 | cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); |
1 | +#ifndef STIM_CUDA_RE_SAMPLE_H | ||
2 | +#define STIM_CUDA_RE_SAMPLE_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <stim/cuda/cudatools.h> | ||
7 | +#include <stim/cuda/templates/gaussian_blur.cuh> | ||
8 | + | ||
9 | +namespace stim{ | ||
10 | + namespace cuda{ | ||
11 | + | ||
12 | + template<typename T> | ||
13 | + __global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ | ||
14 | + | ||
15 | + unsigned int sigma_ds = 1/resize; | ||
16 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | ||
17 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | ||
18 | + | ||
19 | + | ||
20 | + // calculate the 2D coordinates for this current thread. | ||
21 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
22 | + int yi = blockIdx.y; | ||
23 | + // convert 2D coordinates to 1D | ||
24 | + int i = yi * x + xi; | ||
25 | + | ||
26 | + if(xi< x && yi< y){ | ||
27 | + if(xi%sigma_ds==0){ | ||
28 | + if(yi%sigma_ds==0){ | ||
29 | + gpuI[i] = gpuI0[(yi/sigma_ds)*x_ds + xi/sigma_ds]; | ||
30 | + } | ||
31 | + } | ||
32 | + else gpuI[i] = 0; | ||
33 | + | ||
34 | + //int x_org = xi * sigma_ds ; | ||
35 | + //int y_org = yi * sigma_ds ; | ||
36 | + //int i_org = y_org * x + x_org; | ||
37 | + //gpuI[i] = gpuI0[i_org]; | ||
38 | + } | ||
39 | + | ||
40 | + } | ||
41 | + | ||
42 | + | ||
43 | + /// Applies a Gaussian blur to a 2D image stored on the GPU | ||
44 | + template<typename T> | ||
45 | + void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ | ||
46 | + | ||
47 | + | ||
48 | + //unsigned int sigma_ds = 1/resize; | ||
49 | + //unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | ||
50 | + //unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | ||
51 | + | ||
52 | + //get the number of pixels in the image | ||
53 | + //unsigned int pixels_ds = x_ds * y_ds; | ||
54 | + | ||
55 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
56 | + dim3 threads(max_threads, 1); | ||
57 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | ||
58 | + | ||
59 | + //stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y); | ||
60 | + | ||
61 | + //resample the image | ||
62 | + cuda_re_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y); | ||
63 | + | ||
64 | + } | ||
65 | + | ||
66 | + /// Applies a Gaussian blur to a 2D image stored on the CPU | ||
67 | + template<typename T> | ||
68 | + void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){ | ||
69 | + | ||
70 | + //get the number of pixels in the image | ||
71 | + unsigned int pixels = x*y; | ||
72 | + unsigned int bytes = sizeof(T) * pixels; | ||
73 | + | ||
74 | + unsigned int sigma_ds = 1/resize; | ||
75 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | ||
76 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | ||
77 | + unsigned int bytes_ds = sizeof(T) * x_ds * y_ds; | ||
78 | + | ||
79 | + | ||
80 | + | ||
81 | + //allocate space on the GPU for the original image | ||
82 | + T* gpuI0; | ||
83 | + cudaMalloc(&gpuI0, bytes_ds); | ||
84 | + | ||
85 | + | ||
86 | + //copy the image data to the GPU | ||
87 | + cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice); | ||
88 | + | ||
89 | + //allocate space on the GPU for the down sampled image | ||
90 | + T* gpuI; | ||
91 | + cudaMalloc(&gpuI, bytes); | ||
92 | + | ||
93 | + //run the GPU-based version of the algorithm | ||
94 | + gpu_re_sample<T>(gpuI, gpuI0, resize, x, y); | ||
95 | + | ||
96 | + //copy the image data to the GPU | ||
97 | + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice); | ||
98 | + | ||
99 | + cudaFree(gpuI0); | ||
100 | + cudeFree(gpuI); | ||
101 | + } | ||
102 | + | ||
103 | + } | ||
104 | +} | ||
105 | + | ||
106 | +#endif | ||
0 | \ No newline at end of file | 107 | \ No newline at end of file |
stim/cuda/ivote/update_dir_global.cuh renamed to stim/cuda/ivote/update_dir_bb.cuh
1 | -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H | ||
2 | -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H | 1 | +#ifndef STIM_CUDA_UPDATE_DIR_BB_H |
2 | +#define STIM_CUDA_UPDATE_DIR_BB_H | ||
3 | 3 | ||
4 | # include <iostream> | 4 | # include <iostream> |
5 | # include <cuda.h> | 5 | # include <cuda.h> |
@@ -7,8 +7,7 @@ | @@ -7,8 +7,7 @@ | ||
7 | #include <stim/cuda/sharedmem.cuh> | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | #include <stim/visualization/aabb2.h> | 8 | #include <stim/visualization/aabb2.h> |
9 | #include <stim/visualization/colormap.h> | 9 | #include <stim/visualization/colormap.h> |
10 | -#include <math.h> | ||
11 | -#include "cpyToshare.cuh" | 10 | +#include <math.h> |
12 | 11 | ||
13 | //#define RMAX_TEST 8 | 12 | //#define RMAX_TEST 8 |
14 | 13 | ||
@@ -76,68 +75,6 @@ namespace stim{ | @@ -76,68 +75,6 @@ namespace stim{ | ||
76 | gpuDir[i] = atan2((T)max_dy, (T)max_dx); | 75 | gpuDir[i] = atan2((T)max_dy, (T)max_dx); |
77 | } | 76 | } |
78 | 77 | ||
79 | - // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. | ||
80 | - template<typename T> | ||
81 | - __global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | ||
82 | - | ||
83 | - | ||
84 | - // calculate the 2D coordinates for this current thread. | ||
85 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
86 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
87 | - | ||
88 | - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel | ||
89 | - | ||
90 | - int i = yi * x + xi; // convert 2D coordinates to 1D | ||
91 | - | ||
92 | - float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch | ||
93 | - gpuDir[i] = 0; //initialize the vote direction to zero | ||
94 | - float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter | ||
95 | - int id_x = 0; // define two local variables for the x and y position of the maximum | ||
96 | - int id_y = 0; | ||
97 | - | ||
98 | - int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter | ||
99 | - int rmax_sq = rmax * rmax; | ||
100 | - int tx_rmax = threadIdx.x + rmax; | ||
101 | - float atan_angle; | ||
102 | - float vote_c; | ||
103 | - int xidx, yidx, yr_sq, xr_sq; | ||
104 | - for(int yr = -rmax; yr <= rmax; yr++){ | ||
105 | - yidx = yi + yr; //compute the index into the image | ||
106 | - if (yidx >= 0 && yidx < y){ //if the current y-index is inside the image | ||
107 | - yr_sq = yr * yr; //compute the square of yr, to save time later | ||
108 | - for(int xr = -rmax; xr <= rmax; xr++){ | ||
109 | - xidx = xi + xr; | ||
110 | - if(xidx >= 0 && xidx < x){ | ||
111 | - xr_sq = xr * xr; | ||
112 | - unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | ||
113 | - | ||
114 | - // calculate the angle between the voter and the current pixel in x and y directions | ||
115 | - atan_angle = gpuTable[ind_t]; | ||
116 | - //atan_angle = atan2((T)yr, (T)xr); | ||
117 | - | ||
118 | - // check if the current pixel is located in the voting area of this voter. | ||
119 | - if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | ||
120 | - | ||
121 | - vote_c = gpuVote[yidx * x + xidx]; // find the vote value for the current counter | ||
122 | - // compare the vote value of this pixel with the max value to find the maxima and its index. | ||
123 | - if (vote_c>max) { | ||
124 | - | ||
125 | - max = vote_c; | ||
126 | - id_x = xr; | ||
127 | - id_y = yr; | ||
128 | - } | ||
129 | - } | ||
130 | - } | ||
131 | - } | ||
132 | - } | ||
133 | - } | ||
134 | - | ||
135 | - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | ||
136 | - float new_angle = gpuTable[ind_m]; | ||
137 | - | ||
138 | - if(xi < x && yi < y) | ||
139 | - gpuDir[i] = new_angle; | ||
140 | - } //end kernel | ||
141 | 78 | ||
142 | 79 | ||
143 | // this kernel updates the gradient direction by the calculated voting direction. | 80 | // this kernel updates the gradient direction by the calculated voting direction. |
@@ -168,9 +105,7 @@ namespace stim{ | @@ -168,9 +105,7 @@ namespace stim{ | ||
168 | HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) ); | 105 | HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) ); |
169 | 106 | ||
170 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 107 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
171 | - //dim3 threads(min(x, max_threads), 1); | ||
172 | - //dim3 blocks(x/threads.x, y); | ||
173 | - | 108 | + |
174 | dim3 threads( sqrt(max_threads), sqrt(max_threads) ); | 109 | dim3 threads( sqrt(max_threads), sqrt(max_threads) ); |
175 | dim3 blocks(x/threads.x + 1, y/threads.y + 1); | 110 | dim3 blocks(x/threads.x + 1, y/threads.y + 1); |
176 | 111 | ||
@@ -188,12 +123,12 @@ namespace stim{ | @@ -188,12 +123,12 @@ namespace stim{ | ||
188 | 123 | ||
189 | //call the kernel to calculate the new voting direction | 124 | //call the kernel to calculate the new voting direction |
190 | cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | 125 | cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); |
191 | - stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer); | 126 | + //stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer); |
192 | 127 | ||
193 | //exit(0); | 128 | //exit(0); |
194 | 129 | ||
195 | - threads = dim3( sqrt(max_threads), sqrt(max_threads) ); | ||
196 | - blocks = dim3(x/threads.x + 1, y/threads.y + 1); | 130 | + //threads = dim3( sqrt(max_threads), sqrt(max_threads) ); |
131 | + //blocks = dim3(x/threads.x + 1, y/threads.y + 1); | ||
197 | 132 | ||
198 | //call the kernel to update the gradient direction | 133 | //call the kernel to update the gradient direction |
199 | cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | 134 | cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); |
stim/cuda/ivote/david_update_dir_global.cuh renamed to stim/cuda/ivote/update_dir_threshold_global.cuh
1 | -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H | ||
2 | -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H | 1 | +#ifndef STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBALD_H |
2 | +#define STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H | ||
3 | 3 | ||
4 | # include <iostream> | 4 | # include <iostream> |
5 | # include <cuda.h> | 5 | # include <cuda.h> |
6 | #include <stim/cuda/cudatools.h> | 6 | #include <stim/cuda/cudatools.h> |
7 | #include <stim/cuda/sharedmem.cuh> | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | -#include <math.h> | ||
9 | -#include "cpyToshare.cuh" | ||
10 | - | ||
11 | -#define RMAX_TEST 8 | 8 | +#include "cpyToshare.cuh" |
12 | 9 | ||
13 | namespace stim{ | 10 | namespace stim{ |
14 | namespace cuda{ | 11 | namespace cuda{ |
15 | 12 | ||
16 | // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. | 13 | // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. |
17 | template<typename T> | 14 | template<typename T> |
18 | - __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | ||
19 | - extern __shared__ T atan2_table[]; | ||
20 | - | ||
21 | - //calculate the start point for this block | ||
22 | - //int bxi = blockIdx.x * blockDim.x; | ||
23 | - | ||
24 | - stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x); | 15 | + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){ |
25 | 16 | ||
26 | - __syncthreads(); | ||
27 | 17 | ||
28 | - // calculate the 2D coordinates for this current thread. | ||
29 | - //int xi = bxi + threadIdx.x; | 18 | + |
19 | + // calculate the coordinate for this current thread. | ||
30 | int xi = blockIdx.x * blockDim.x + threadIdx.x; | 20 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
31 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
32 | - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel | ||
33 | - | ||
34 | - int i = yi * x + xi; // convert 2D coordinates to 1D | 21 | + // calculate the voting direction based on the grtadient direction |
22 | + float theta = gpuTh[3*xi]; | ||
35 | 23 | ||
36 | - float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch | ||
37 | - gpuDir[i] = 0; //initialize the vote direction to zero | ||
38 | - float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter | ||
39 | - int id_x = 0; // define two local variables for the x and y position of the maximum | ||
40 | - int id_y = 0; | 24 | + //calculate the position and x, y coordinations of this voter in the original image |
25 | + unsigned int i_v = gpuTh[3*xi+2]; | ||
26 | + unsigned int y_v = i_v/x; | ||
27 | + unsigned int x_v = i_v - (y_v*x); | ||
41 | 28 | ||
42 | - int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter | 29 | + //initialize the vote direction to zero |
30 | + gpuDir[xi] = 0; | ||
31 | + | ||
32 | + // define a local variable to maximum value of the vote image in the voting area for this voter | ||
33 | + float max = 0; | ||
34 | + | ||
35 | + // define two local variables for the x and y coordinations where the maximum happened | ||
36 | + int id_x = 0; | ||
37 | + int id_y = 0; | ||
38 | + | ||
39 | + // compute the size of window which will be checked for finding the voting area for this voter | ||
40 | + int x_table = 2*rmax +1; | ||
43 | int rmax_sq = rmax * rmax; | 41 | int rmax_sq = rmax * rmax; |
44 | int tx_rmax = threadIdx.x + rmax; | 42 | int tx_rmax = threadIdx.x + rmax; |
45 | - float atan_angle; | ||
46 | - float vote_c; | ||
47 | - unsigned int ind_t; | ||
48 | - for(int yr = -rmax; yr <= rmax; yr++){ //for each counter in the y direction | ||
49 | - if (yi+yr >= 0 && yi + yr < y){ //if the counter exists (we aren't looking outside of the image) | ||
50 | - for(int xr = -rmax; xr <= rmax; xr++){ //for each counter in the x direction | ||
51 | - if((xr * xr + yr *yr)< rmax_sq){ //if the counter is within range of the voter | ||
52 | - | ||
53 | - ind_t = (rmax - yr) * x_table + rmax - xr; //calculate the index to the atan2 table | ||
54 | - atan_angle = atan2_table[ind_t]; //retrieve the direction vector from the table | ||
55 | - | ||
56 | - //atan_angle = atan2((float)yr, (float)xr); | ||
57 | - | ||
58 | - if (abs(atan_angle - theta) <phi){ // check if the current pixel is located in the voting angle of this voter. | ||
59 | - vote_c = gpuVote[(yi+yr)*x + (xi+xr)]; // find the vote value for the current counter | ||
60 | - if(vote_c>max) { // compare the vote value of this pixel with the max value to find the maxima and its index. | ||
61 | - max = vote_c; | ||
62 | - id_x = xr; | ||
63 | - id_y = yr; | ||
64 | - } | 43 | + if(xi < th_size){ |
44 | + | ||
45 | + for(int yr = -rmax; yr <= rmax; yr++){ | ||
46 | + | ||
47 | + for(int xr = -rmax; xr <= rmax; xr++){ | ||
48 | + | ||
49 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | ||
50 | + | ||
51 | + // find the angle between the voter and the current pixel in x and y directions | ||
52 | + float atan_angle = gpuTable[ind_t]; | ||
53 | + | ||
54 | + // check if the current pixel is located in the voting area of this voter. | ||
55 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | ||
56 | + // find the vote value for the current counter | ||
57 | + float vote_c = gpuVote[(y_v+yr)*x + (x_v+xr)]; | ||
58 | + // compare the vote value of this pixel with the max value to find the maxima and its index. | ||
59 | + if (vote_c>max) { | ||
60 | + | ||
61 | + max = vote_c; | ||
62 | + id_x = xr; | ||
63 | + id_y = yr; | ||
65 | } | 64 | } |
66 | } | 65 | } |
67 | } | 66 | } |
68 | } | 67 | } |
69 | - } | 68 | + |
70 | 69 | ||
71 | - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | ||
72 | - float new_angle = gpuTable[ind_m]; | 70 | + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); |
71 | + float new_angle = gpuTable[ind_m]; | ||
72 | + gpuDir[xi] = new_angle; | ||
73 | + } | ||
73 | 74 | ||
74 | - if(xi < x && yi < y) | ||
75 | - gpuDir[i] = new_angle; | ||
76 | - } //end kernel | 75 | + } |
77 | 76 | ||
78 | // this kernel updates the gradient direction by the calculated voting direction. | 77 | // this kernel updates the gradient direction by the calculated voting direction. |
79 | template<typename T> | 78 | template<typename T> |
80 | - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ | 79 | + __global__ void cuda_update_grad(T* gpuTh, T* gpuDir, int th_size, int x, int y){ |
81 | 80 | ||
82 | - // calculate the 2D coordinates for this current thread. | 81 | + // calculate the coordinate for this current thread. |
83 | int xi = blockIdx.x * blockDim.x + threadIdx.x; | 82 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
84 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
85 | - | ||
86 | - // convert 2D coordinates to 1D | ||
87 | - int i = yi * x + xi; | ||
88 | 83 | ||
84 | + | ||
89 | //update the gradient image with the vote direction | 85 | //update the gradient image with the vote direction |
90 | - gpuGrad[2*i] = gpuDir[i]; | 86 | + gpuTh[3*xi] = gpuDir[xi]; |
91 | } | 87 | } |
92 | 88 | ||
93 | template<typename T> | 89 | template<typename T> |
94 | - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
95 | - | ||
96 | - | 90 | + void gpu_update_dir(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){ |
97 | 91 | ||
98 | //calculate the number of bytes in the array | 92 | //calculate the number of bytes in the array |
99 | - unsigned int bytes = x * y * sizeof(T); | 93 | + unsigned int bytes_th = th_size* sizeof(T); |
100 | 94 | ||
101 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 95 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
102 | - | ||
103 | - dim3 threads(sqrt(max_threads), sqrt(max_threads)); | ||
104 | - dim3 blocks(x/threads.x + 1, y/threads.y + 1); | ||
105 | - | ||
106 | - | 96 | + dim3 threads(max_threads); |
97 | + dim3 blocks(th_size/threads.x+1); | ||
107 | 98 | ||
108 | // allocate space on the GPU for the updated vote direction | 99 | // allocate space on the GPU for the updated vote direction |
109 | T* gpuDir; | 100 | T* gpuDir; |
110 | - cudaMalloc(&gpuDir, bytes); | ||
111 | - | ||
112 | - size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2); | ||
113 | - std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl; | 101 | + cudaMalloc(&gpuDir, bytes_th); |
114 | 102 | ||
115 | //call the kernel to calculate the new voting direction | 103 | //call the kernel to calculate the new voting direction |
116 | - cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | 104 | + cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y); |
117 | 105 | ||
118 | //call the kernel to update the gradient direction | 106 | //call the kernel to update the gradient direction |
119 | - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | 107 | + cuda_update_grad <<< blocks, threads >>>(gpuTh, gpuDir, th_size, x , y); |
120 | 108 | ||
121 | //free allocated memory | 109 | //free allocated memory |
122 | cudaFree(gpuDir); | 110 | cudaFree(gpuDir); |
1 | +#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H | ||
2 | +#define STIM_CUDA_VOTE_ATOMIC_BB_H | ||
3 | + | ||
4 | +# include <iostream> | ||
5 | +# include <cuda.h> | ||
6 | +#include <stim/cuda/cudatools.h> | ||
7 | +#include <stim/cuda/sharedmem.cuh> | ||
8 | +#include <stim/visualization/aabb2.h> | ||
9 | +#include <stim/visualization/colormap.h> | ||
10 | +#include <math.h> | ||
11 | + | ||
12 | +namespace stim{ | ||
13 | + namespace cuda{ | ||
14 | + | ||
15 | + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | ||
16 | + template<typename T> | ||
17 | + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | ||
18 | + | ||
19 | + extern __shared__ T S[]; | ||
20 | + T* shared_atan = S; | ||
21 | + size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1); | ||
22 | + stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); | ||
23 | + | ||
24 | + // calculate the 2D coordinates for this current thread. | ||
25 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
26 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
27 | + | ||
28 | + if(xi >= x || yi >= y) return; | ||
29 | + // convert 2D coordinates to 1D | ||
30 | + int i = yi * x + xi; | ||
31 | + | ||
32 | + // calculate the voting direction based on the grtadient direction | ||
33 | + float theta = gpuGrad[2*i]; | ||
34 | + //calculate the amount of vote for the voter | ||
35 | + float mag = gpuGrad[2*i + 1]; | ||
36 | + | ||
37 | + | ||
38 | + stim::aabb2<int> bb(xi, yi); //initialize a bounding box at the current point | ||
39 | + bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta))); | ||
40 | + bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box | ||
41 | + bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box | ||
42 | + | ||
43 | + // compute the size of window which will be checked for finding the proper voters for this pixel | ||
44 | + int x_table = 2*rmax +1; | ||
45 | + int rmax_sq = rmax * rmax; | ||
46 | + | ||
47 | + int lut_i; | ||
48 | + T dx_sq, dy_sq; | ||
49 | + | ||
50 | + bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image | ||
51 | + bb.trim_high(x-1, y-1); | ||
52 | + | ||
53 | + int by, bx; | ||
54 | + int dx, dy; | ||
55 | + | ||
56 | + unsigned int ind_g; //initialize the maximum vote value to zero | ||
57 | + T alpha; | ||
58 | + | ||
59 | + for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box | ||
60 | + dy = by - yi; //calculate the y coordinate of the current point relative to yi | ||
61 | + dy_sq = dy * dy; | ||
62 | + for(bx = bb.low[0]; bx <= bb.high[0]; bx++){ | ||
63 | + dx = bx - xi; | ||
64 | + dx_sq = dx * dx; | ||
65 | + lut_i = (rmax - dy) * x_table + rmax - dx; | ||
66 | + alpha = shared_atan[lut_i]; | ||
67 | + if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ | ||
68 | + ind_g = (by)*x + (bx); | ||
69 | + atomicAdd(&gpuVote[ind_g], mag); | ||
70 | + | ||
71 | + } | ||
72 | + } | ||
73 | + } | ||
74 | + | ||
75 | + } | ||
76 | + | ||
77 | + | ||
78 | + template<typename T> | ||
79 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
80 | + | ||
81 | + | ||
82 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
83 | + dim3 threads( sqrt(max_threads), sqrt(max_threads) ); | ||
84 | + dim3 blocks(x/threads.x + 1, y/threads.y + 1); | ||
85 | + size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); | ||
86 | + size_t shared_mem_req = table_bytes;// + template_bytes; | ||
87 | + std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl; | ||
88 | + size_t shared_mem = stim::sharedMemPerBlock(); | ||
89 | + if(shared_mem_req > shared_mem){ | ||
90 | + std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl; | ||
91 | + exit(1); | ||
92 | + } | ||
93 | + | ||
94 | + //call the kernel to do the voting | ||
95 | + cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
96 | + | ||
97 | + } | ||
98 | + | ||
99 | + | ||
100 | + template<typename T> | ||
101 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
102 | + | ||
103 | + //calculate the number of bytes in the array | ||
104 | + unsigned int bytes = x * y * sizeof(T); | ||
105 | + | ||
106 | + //calculate the number of bytes in the atan2 table | ||
107 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | ||
108 | + | ||
109 | + //allocate space on the GPU for the Vote Image | ||
110 | + T* gpuVote; | ||
111 | + cudaMalloc(&gpuVote, bytes); | ||
112 | + | ||
113 | + //allocate space on the GPU for the input Gradient image | ||
114 | + T* gpuGrad; | ||
115 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | ||
116 | + | ||
117 | + //copy the Gradient Magnitude data to the GPU | ||
118 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | ||
119 | + | ||
120 | + //allocate space on the GPU for the atan2 table | ||
121 | + T* gpuTable; | ||
122 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | ||
123 | + | ||
124 | + //copy the atan2 values to the GPU | ||
125 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | ||
126 | + | ||
127 | + //call the GPU version of the vote calculation function | ||
128 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
129 | + | ||
130 | + //copy the Vote Data back to the CPU | ||
131 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | ||
132 | + | ||
133 | + //free allocated memory | ||
134 | + cudaFree(gpuTable); | ||
135 | + cudaFree(gpuVote); | ||
136 | + cudaFree(gpuGrad); | ||
137 | + } | ||
138 | + | ||
139 | + } | ||
140 | +} | ||
141 | + | ||
142 | +#endif | ||
0 | \ No newline at end of file | 143 | \ No newline at end of file |
stim/cuda/ivote/vote_atomic_shared.cuh
@@ -5,7 +5,7 @@ | @@ -5,7 +5,7 @@ | ||
5 | # include <cuda.h> | 5 | # include <cuda.h> |
6 | #include <stim/cuda/cudatools.h> | 6 | #include <stim/cuda/cudatools.h> |
7 | #include <stim/cuda/sharedmem.cuh> | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | -#include "cpyToshare.cuh" | 8 | + |
9 | //#include "writebackshared.cuh" | 9 | //#include "writebackshared.cuh" |
10 | namespace stim{ | 10 | namespace stim{ |
11 | namespace cuda{ | 11 | namespace cuda{ |
1 | +#ifndef STIM_CUDA_VOTE_SHARED_H | ||
2 | +#define STIM_CUDA_VOTE_SHARED | ||
3 | +# include <iostream> | ||
4 | +# include <cuda.h> | ||
5 | +#include <stim/cuda/cudatools.h> | ||
6 | +#include <stim/cuda/sharedmem.cuh> | ||
7 | +#include "cpyToshare.cuh" | ||
8 | + | ||
9 | +namespace stim{ | ||
10 | + namespace cuda{ | ||
11 | + | ||
12 | + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | ||
13 | + template<typename T> | ||
14 | + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | ||
15 | + | ||
16 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | ||
17 | + extern __shared__ float s_grad[]; | ||
18 | + | ||
19 | + //calculate the start point for this block | ||
20 | + int bxi = blockIdx.x * blockDim.x; | ||
21 | + | ||
22 | + // calculate the 2D coordinates for this current thread. | ||
23 | + int xi = bxi + threadIdx.x; | ||
24 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
25 | + // convert 2D coordinates to 1D | ||
26 | + int i = yi * x + xi; | ||
27 | + | ||
28 | + // define a local variable to sum the votes from the voters | ||
29 | + float sum = 0; | ||
30 | + | ||
31 | + //calculate the width of the shared memory block | ||
32 | + int swidth = 2 * rmax + blockDim.x; | ||
33 | + | ||
34 | + // compute the size of window which will be checked for finding the proper voters for this pixel | ||
35 | + int x_table = 2*rmax +1; | ||
36 | + int rmax_sq = rmax * rmax; | ||
37 | + int tx_rmax = threadIdx.x + rmax; | ||
38 | + int bxs = bxi - rmax; | ||
39 | + | ||
40 | + //for every line (along y) | ||
41 | + for(int yr = -rmax; yr <= rmax; yr++){ | ||
42 | + if (yi+yr<y && yi+yr>=0){ | ||
43 | + //copy the portion of the image necessary for this block to shared memory | ||
44 | + __syncthreads(); | ||
45 | + cpyG2S1D2ch<float>(s_grad, gpuGrad, bxs, yi + yr , 2*swidth, 1, threadIdx, blockDim, x, y); | ||
46 | + __syncthreads(); | ||
47 | + | ||
48 | + if(xi < x && yi < y){ | ||
49 | + | ||
50 | + for(int xr = -rmax; xr <= rmax; xr++){ | ||
51 | + | ||
52 | + //find the location of this voter in the atan2 table | ||
53 | + int id_t = (yr + rmax) * x_table + xr + rmax; | ||
54 | + | ||
55 | + // calculate the angle between the pixel and the current voter in x and y directions | ||
56 | + float atan_angle = gpuTable[id_t]; | ||
57 | + | ||
58 | + // calculate the voting direction based on the grtadient direction | ||
59 | + int idx_share = xr + tx_rmax ; | ||
60 | + float theta = s_grad[idx_share*2]; | ||
61 | + float mag = s_grad[idx_share*2 + 1]; | ||
62 | + | ||
63 | + | ||
64 | + // check if the current voter is located in the voting area of this pixel. | ||
65 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | ||
66 | + sum += mag; | ||
67 | + | ||
68 | + } | ||
69 | + } | ||
70 | + | ||
71 | + } | ||
72 | + } | ||
73 | + } | ||
74 | + if(xi < x && yi < y) | ||
75 | + gpuVote[i] = sum; | ||
76 | + | ||
77 | + } | ||
78 | + | ||
79 | + template<typename T> | ||
80 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
81 | + | ||
82 | + | ||
83 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
84 | + dim3 threads(max_threads, 1); | ||
85 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | ||
86 | + | ||
87 | + | ||
88 | + // specify share memory | ||
89 | + unsigned int share_bytes = (2*rmax + threads.x)*1*2*sizeof(T); | ||
90 | + | ||
91 | + //call the kernel to do the voting | ||
92 | + cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
93 | + | ||
94 | + } | ||
95 | + | ||
96 | + | ||
97 | + template<typename T> | ||
98 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
99 | + | ||
100 | + //calculate the number of bytes in the array | ||
101 | + unsigned int bytes = x * y * sizeof(T); | ||
102 | + | ||
103 | + //calculate the number of bytes in the atan2 table | ||
104 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | ||
105 | + | ||
106 | + //allocate space on the GPU for the Vote Image | ||
107 | + T* gpuVote; | ||
108 | + cudaMalloc(&gpuVote, bytes); | ||
109 | + | ||
110 | + //allocate space on the GPU for the input Gradient image | ||
111 | + T* gpuGrad; | ||
112 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | ||
113 | + | ||
114 | + //copy the Gradient Magnitude data to the GPU | ||
115 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | ||
116 | + | ||
117 | + //allocate space on the GPU for the atan2 table | ||
118 | + T* gpuTable; | ||
119 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | ||
120 | + | ||
121 | + //copy the atan2 values to the GPU | ||
122 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | ||
123 | + | ||
124 | + //call the GPU version of the vote calculation function | ||
125 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
126 | + | ||
127 | + //copy the Vote Data back to the CPU | ||
128 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | ||
129 | + | ||
130 | + //free allocated memory | ||
131 | + cudaFree(gpuTable); | ||
132 | + cudaFree(gpuVote); | ||
133 | + cudaFree(gpuGrad); | ||
134 | + } | ||
135 | + | ||
136 | + } | ||
137 | +} | ||
138 | + | ||
139 | +#endif | ||
0 | \ No newline at end of file | 140 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H | ||
2 | +#define STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H | ||
3 | +# include <iostream> | ||
4 | +# include <cuda.h> | ||
5 | +#include <stim/cuda/cudatools.h> | ||
6 | +#include <stim/cuda/sharedmem.cuh> | ||
7 | +#include "cpyToshare.cuh" | ||
8 | + | ||
9 | +namespace stim{ | ||
10 | + namespace cuda{ | ||
11 | + | ||
12 | + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | ||
13 | + template<typename T> | ||
14 | + __global__ void cuda_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){ | ||
15 | + | ||
16 | + | ||
17 | + // calculate the x coordinate for this current thread. | ||
18 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
19 | + | ||
20 | + // calculate the voting direction based on the grtadient direction | ||
21 | + float theta = gpuTh[3*xi]; | ||
22 | + //find the gradient magnitude for the current voter | ||
23 | + float mag = gpuTh[3*xi + 1]; | ||
24 | + //calculate the position and x, y coordinations of this voter in the original image | ||
25 | + unsigned int i_v = gpuTh[3*xi+2]; | ||
26 | + unsigned int y_v = i_v/x; | ||
27 | + unsigned int x_v = i_v - (y_v*x); | ||
28 | + | ||
29 | + // compute the size of window which will be checked for finding the proper voters for this pixel | ||
30 | + int x_table = 2*rmax +1; | ||
31 | + int rmax_sq = rmax * rmax; | ||
32 | + if(xi < th_size){ | ||
33 | + for(int yr = -rmax; yr <= rmax; yr++){ | ||
34 | + for(int xr = -rmax; xr <= rmax; xr++){ | ||
35 | + if ((y_v+yr)>=0 && (y_v+yr)<y && (x_v+xr)>=0 && (x_v+xr)<x){ | ||
36 | + | ||
37 | + //find the location of the current pixel in the atan2 table | ||
38 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | ||
39 | + | ||
40 | + // calculate the angle between the voter and the current pixel in x and y directions | ||
41 | + float atan_angle = gpuTable[ind_t]; | ||
42 | + | ||
43 | + // check if the current pixel is located in the voting area of this voter. | ||
44 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | ||
45 | + // calculate the 1D index for the current pixel in global memory | ||
46 | + unsigned int ind_g = (y_v+yr)*x + (x_v+xr); | ||
47 | + atomicAdd(&gpuVote[ind_g], mag); | ||
48 | + | ||
49 | + } | ||
50 | + } | ||
51 | + } | ||
52 | + } | ||
53 | + } | ||
54 | + } | ||
55 | + | ||
56 | + template<typename T> | ||
57 | + void gpu_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){ | ||
58 | + | ||
59 | + | ||
60 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
61 | + dim3 threads(max_threads); | ||
62 | + dim3 blocks(th_size/threads.x + 1); | ||
63 | + | ||
64 | + //call the kernel to do the voting | ||
65 | + cuda_vote <<< blocks, threads>>>(gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y); | ||
66 | + | ||
67 | + } | ||
68 | + | ||
69 | + | ||
70 | + template<typename T> | ||
71 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
72 | + | ||
73 | + //calculate the number of bytes in the array | ||
74 | + unsigned int bytes = x * y * sizeof(T); | ||
75 | + | ||
76 | + //calculate the number of bytes in the atan2 table | ||
77 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | ||
78 | + | ||
79 | + //allocate space on the GPU for the Vote Image | ||
80 | + T* gpuVote; | ||
81 | + cudaMalloc(&gpuVote, bytes); | ||
82 | + | ||
83 | + //allocate space on the GPU for the input Gradient image | ||
84 | + T* gpuGrad; | ||
85 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | ||
86 | + | ||
87 | + //copy the Gradient Magnitude data to the GPU | ||
88 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | ||
89 | + | ||
90 | + //allocate space on the GPU for the atan2 table | ||
91 | + T* gpuTable; | ||
92 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | ||
93 | + | ||
94 | + //copy the atan2 values to the GPU | ||
95 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | ||
96 | + | ||
97 | + //call the GPU version of the vote calculation function | ||
98 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
99 | + | ||
100 | + //copy the Vote Data back to the CPU | ||
101 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | ||
102 | + | ||
103 | + //free allocated memory | ||
104 | + cudaFree(gpuTable); | ||
105 | + cudaFree(gpuVote); | ||
106 | + cudaFree(gpuGrad); | ||
107 | + } | ||
108 | + | ||
109 | + } | ||
110 | +} | ||
111 | + | ||
112 | +#endif | ||
0 | \ No newline at end of file | 113 | \ No newline at end of file |
stim/cuda/ivote_atomic.cuh renamed to stim/cuda/ivote_atomic_bb.cuh
1 | -#ifndef STIM_CUDA_IVOTE_ATOMIC_H | ||
2 | -#define STIM_CUDA_IVOTE_ATOMIC_H | 1 | +#ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H |
2 | +#define STIM_CUDA_IVOTE_ATOMIC_BB_H | ||
3 | 3 | ||
4 | #include <stim/cuda/ivote/down_sample.cuh> | 4 | #include <stim/cuda/ivote/down_sample.cuh> |
5 | #include <stim/cuda/ivote/local_max.cuh> | 5 | #include <stim/cuda/ivote/local_max.cuh> |
6 | -#include <stim/cuda/ivote/update_dir_global.cuh> | ||
7 | -//#include <stim/cuda/ivote/vote_shared_32-32.cuh> | ||
8 | -#include <stim/cuda/ivote/vote_atomic_shared.cuh> | ||
9 | -//#include <stim/cuda/ivote/re_sample.cuh> | 6 | +#include <stim/cuda/ivote/update_dir_bb.cuh> |
7 | +#include <stim/cuda/ivote/vote_atomic_bb.cuh> | ||
8 | + | ||
10 | namespace stim{ | 9 | namespace stim{ |
11 | namespace cuda{ | 10 | namespace cuda{ |
12 | 11 |