Commit 63d349748fe796c11620ca470fb3e49bee47ff5c
Merge branch 'ivote-profiling' into 'master'
Leila's ivote profiling push This is a test See merge request !1
Showing
8 changed files
with
814 additions
and
4 deletions
Show diff stats
stim/cuda/arraymath.cuh
... | ... | @@ -10,7 +10,8 @@ |
10 | 10 | #include <stim/cuda/arraymath/array_atan.cuh> |
11 | 11 | #include <stim/cuda/arraymath/array_abs.cuh> |
12 | 12 | #include <stim/cuda/arraymath/array_cart2polar.cuh> |
13 | - | |
13 | +#include <stim/cuda/arraymath/array_threshold.cuh> | |
14 | +#include <stim/cuda/arraymath/array_add3.cuh> | |
14 | 15 | namespace stim{ |
15 | 16 | namespace cuda{ |
16 | 17 | ... | ... |
stim/cuda/ivote/local_max.cuh
... | ... | @@ -29,17 +29,33 @@ namespace stim{ |
29 | 29 | //compare to the threshold |
30 | 30 | if(val < final_t) return; |
31 | 31 | |
32 | + //define an array to store indices with same vote value | |
33 | + /*int * IdxEq; | |
34 | + IdxEq = new int [2*conn]; | |
35 | + int n = 0;*/ | |
36 | + | |
32 | 37 | for(int xl = xi - conn; xl < xi + conn; xl++){ |
33 | 38 | for(int yl = yi - conn; yl < yi + conn; yl++){ |
34 | 39 | if(xl >= 0 && xl < x && yl >= 0 && yl < y){ |
35 | 40 | int il = yl * x + xl; |
36 | 41 | if(gpuVote[il] > val){ |
37 | 42 | return; |
38 | - } | |
39 | - } | |
43 | + } | |
44 | + if (gpuVote[il] == val){ | |
45 | + /*IdxEq[n] = il; | |
46 | + n = n+1;*/ | |
47 | + if( il > i){ | |
48 | + return; | |
49 | + } | |
50 | + } | |
51 | + } | |
40 | 52 | } |
41 | 53 | } |
42 | - | |
54 | + /*if (n!=0){ | |
55 | + if(IdxEq[n/2] !=i){ | |
56 | + return; | |
57 | + } | |
58 | + } */ | |
43 | 59 | gpuCenters[i] = 1; |
44 | 60 | } |
45 | 61 | ... | ... |
1 | +#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H | |
2 | +#define STIM_CUDA_UPDATE_DIR_GLOBAL_H | |
3 | + | |
4 | +# include <iostream> | |
5 | +# include <cuda.h> | |
6 | +#include <stim/cuda/cudatools.h> | |
7 | +#include <stim/cuda/sharedmem.cuh> | |
8 | +#include "cpyToshare.cuh" | |
9 | + | |
10 | +#define RMAX_TEST 8 | |
11 | + | |
12 | +namespace stim{ | |
13 | + namespace cuda{ | |
14 | + | |
15 | + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. | |
16 | + template<typename T> | |
17 | + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | |
18 | + | |
19 | + | |
20 | + //calculate the start point for this block | |
21 | + int bxi = blockIdx.x * blockDim.x; | |
22 | + | |
23 | + // calculate the 2D coordinates for this current thread. | |
24 | + int xi = bxi + threadIdx.x; | |
25 | + if(xi >= x) return; //if the index is outside of the image, terminate the kernel | |
26 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
27 | + int i = yi * x + xi; // convert 2D coordinates to 1D | |
28 | + | |
29 | + float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch | |
30 | + gpuDir[i] = 0; //initialize the vote direction to zero | |
31 | + float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter | |
32 | + int id_x = 0; // define two local variables for the x and y position of the maximum | |
33 | + int id_y = 0; | |
34 | + | |
35 | + int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter | |
36 | + int rmax_sq = rmax * rmax; | |
37 | + int tx_rmax = threadIdx.x + rmax; | |
38 | + float atan_angle; | |
39 | + float vote_c; | |
40 | + for(int yr = -RMAX_TEST; yr <= RMAX_TEST; yr++){ | |
41 | + if (yi+yr >= 0 && yi + yr < y){ | |
42 | + for(int xr = -RMAX_TEST; xr <= RMAX_TEST; xr++){ | |
43 | + | |
44 | + unsigned int ind_t = (RMAX_TEST - yr) * x_table + RMAX_TEST - xr; | |
45 | + | |
46 | + // calculate the angle between the voter and the current pixel in x and y directions | |
47 | + atan_angle = gpuTable[ind_t]; | |
48 | + | |
49 | + // find the vote value for the current counter | |
50 | + vote_c = gpuVote[(yi+yr)*x + (xi+xr)]; | |
51 | + | |
52 | + // check if the current pixel is located in the voting area of this voter. | |
53 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | |
54 | + | |
55 | + // compare the vote value of this pixel with the max value to find the maxima and its index. | |
56 | + if (vote_c>max) { | |
57 | + | |
58 | + max = vote_c; | |
59 | + id_x = xr; | |
60 | + id_y = yr; | |
61 | + } | |
62 | + } | |
63 | + } | |
64 | + } | |
65 | + } | |
66 | + | |
67 | + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | |
68 | + float new_angle = gpuTable[ind_m]; | |
69 | + | |
70 | + if(xi < x && yi < y) | |
71 | + gpuDir[i] = new_angle; | |
72 | + } //end kernel | |
73 | + | |
74 | + // this kernel updates the gradient direction by the calculated voting direction. | |
75 | + template<typename T> | |
76 | + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ | |
77 | + | |
78 | + // calculate the 2D coordinates for this current thread. | |
79 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
80 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
81 | + | |
82 | + // convert 2D coordinates to 1D | |
83 | + int i = yi * x + xi; | |
84 | + | |
85 | + //update the gradient image with the vote direction | |
86 | + gpuGrad[2*i] = gpuDir[i]; | |
87 | + } | |
88 | + | |
89 | + template<typename T> | |
90 | + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
91 | + | |
92 | + //calculate the number of bytes in the array | |
93 | + unsigned int bytes = x * y * sizeof(T); | |
94 | + | |
95 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
96 | + dim3 threads(max_threads, 1); | |
97 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
98 | + | |
99 | + // allocate space on the GPU for the updated vote direction | |
100 | + T* gpuDir; | |
101 | + cudaMalloc(&gpuDir, bytes); | |
102 | + | |
103 | + //call the kernel to calculate the new voting direction | |
104 | + cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
105 | + | |
106 | + //call the kernel to update the gradient direction | |
107 | + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | |
108 | + | |
109 | + //free allocated memory | |
110 | + cudaFree(gpuDir); | |
111 | + | |
112 | + } | |
113 | + | |
114 | + template<typename T> | |
115 | + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
116 | + | |
117 | + //calculate the number of bytes in the array | |
118 | + unsigned int bytes = x * y * sizeof(T); | |
119 | + | |
120 | + //calculate the number of bytes in the atan2 table | |
121 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | |
122 | + | |
123 | + //allocate space on the GPU for the Vote Image | |
124 | + T* gpuVote; | |
125 | + cudaMalloc(&gpuVote, bytes); | |
126 | + | |
127 | + //copy the input vote image to the GPU | |
128 | + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); | |
129 | + | |
130 | + //allocate space on the GPU for the input Gradient image | |
131 | + T* gpuGrad; | |
132 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | |
133 | + | |
134 | + //copy the Gradient data to the GPU | |
135 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | |
136 | + | |
137 | + //allocate space on the GPU for the atan2 table | |
138 | + T* gpuTable; | |
139 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | |
140 | + | |
141 | + //copy the atan2 values to the GPU | |
142 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | |
143 | + | |
144 | + //call the GPU version of the update direction function | |
145 | + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
146 | + | |
147 | + //copy the new gradient image back to the CPU | |
148 | + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; | |
149 | + | |
150 | + //free allocated memory | |
151 | + cudaFree(gpuTable); | |
152 | + cudaFree(gpuVote); | |
153 | + cudaFree(gpuGrad); | |
154 | + } | |
155 | + | |
156 | + } | |
157 | +} | |
158 | + | |
159 | +#endif | |
0 | 160 | \ No newline at end of file | ... | ... |
1 | +#ifndef STIM_CUDA_UPDATE_DIR_SHARED_H | |
2 | +#define STIM_CUDA_UPDATE_DIR_SHARED_H | |
3 | + | |
4 | +# include <iostream> | |
5 | +# include <cuda.h> | |
6 | +#include <stim/cuda/cudatools.h> | |
7 | +#include <stim/cuda/sharedmem.cuh> | |
8 | +#include "cpyToshare.cuh" | |
9 | + | |
10 | +namespace stim{ | |
11 | + namespace cuda{ | |
12 | + | |
13 | + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. | |
14 | + template<typename T> | |
15 | + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | |
16 | + | |
17 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
18 | + extern __shared__ float s_vote[]; | |
19 | + | |
20 | + //calculate the start point for this block | |
21 | + int bxi = blockIdx.x * blockDim.x; | |
22 | + | |
23 | + // calculate the 2D coordinates for this current thread. | |
24 | + int xi = bxi + threadIdx.x; | |
25 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
26 | + // convert 2D coordinates to 1D | |
27 | + int i = yi * x + xi; | |
28 | + | |
29 | + // calculate the voting direction based on the grtadient direction | |
30 | + float theta = gpuGrad[2*i]; | |
31 | + | |
32 | + //initialize the vote direction to zero | |
33 | + gpuDir[i] = 0; | |
34 | + | |
35 | + // define a local variable to maximum value of the vote image in the voting area for this voter | |
36 | + float max = 0; | |
37 | + | |
38 | + // define two local variables for the x and y coordinations where the maximum happened | |
39 | + int id_x = 0; | |
40 | + int id_y = 0; | |
41 | + | |
42 | + //calculate the width of the shared memory block | |
43 | + int swidth = 2 * rmax + blockDim.x; | |
44 | + | |
45 | + // compute the size of window which will be checked for finding the voting area for this voter | |
46 | + int x_table = 2*rmax +1; | |
47 | + int rmax_sq = rmax * rmax; | |
48 | + int tx_rmax = threadIdx.x + rmax; | |
49 | + int bxs = bxi - rmax; | |
50 | + | |
51 | + for(int yr = -rmax; yr <= rmax; yr++){ | |
52 | + //if (yi+yr >= 0 && yi + yr < y){ | |
53 | + //copy the portion of the image necessary for this block to shared memory | |
54 | + __syncthreads(); | |
55 | + cpyG2S1D<float>(s_vote, gpuVote, bxs, yi + yr , swidth, 1, threadIdx, blockDim, x, y); | |
56 | + __syncthreads(); | |
57 | + | |
58 | + //if the current thread is outside of the image, it doesn't have to be computed | |
59 | + if(xi < x && yi < y){ | |
60 | + | |
61 | + for(int xr = -rmax; xr <= rmax; xr++){ | |
62 | + | |
63 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | |
64 | + | |
65 | + // calculate the angle between the voter and the current pixel in x and y directions | |
66 | + float atan_angle = gpuTable[ind_t]; | |
67 | + | |
68 | + // calculate the voting direction based on the grtadient direction | |
69 | + int idx_share_update = xr + tx_rmax ; | |
70 | + float share_vote = s_vote[idx_share_update]; | |
71 | + | |
72 | + // check if the current pixel is located in the voting area of this voter. | |
73 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | |
74 | + | |
75 | + // compare the vote value of this pixel with the max value to find the maxima and its index. | |
76 | + if (share_vote>max) { | |
77 | + | |
78 | + max = share_vote; | |
79 | + id_x = xr; | |
80 | + id_y = yr; | |
81 | + } | |
82 | + } | |
83 | + } | |
84 | + } | |
85 | + //} | |
86 | + } | |
87 | + | |
88 | + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | |
89 | + float new_angle = gpuTable[ind_m]; | |
90 | + | |
91 | + if(xi < x && yi < y) | |
92 | + gpuDir[i] = new_angle; | |
93 | + | |
94 | + } | |
95 | + | |
96 | + // this kernel updates the gradient direction by the calculated voting direction. | |
97 | + template<typename T> | |
98 | + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ | |
99 | + | |
100 | + // calculate the 2D coordinates for this current thread. | |
101 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
102 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
103 | + | |
104 | + // convert 2D coordinates to 1D | |
105 | + int i = yi * x + xi; | |
106 | + | |
107 | + //update the gradient image with the vote direction | |
108 | + gpuGrad[2*i] = gpuDir[i]; | |
109 | + } | |
110 | + | |
111 | + template<typename T> | |
112 | + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
113 | + | |
114 | + //calculate the number of bytes in the array | |
115 | + unsigned int bytes = x * y * sizeof(T); | |
116 | + | |
117 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
118 | + dim3 threads(max_threads, 1); | |
119 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
120 | + | |
121 | + // specify share memory | |
122 | + unsigned int share_bytes = (2*rmax + threads.x)*(1)*4; | |
123 | + | |
124 | + // allocate space on the GPU for the updated vote direction | |
125 | + T* gpuDir; | |
126 | + cudaMalloc(&gpuDir, bytes); | |
127 | + | |
128 | + //call the kernel to calculate the new voting direction | |
129 | + cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
130 | + | |
131 | + //call the kernel to update the gradient direction | |
132 | + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | |
133 | + | |
134 | + //free allocated memory | |
135 | + cudaFree(gpuDir); | |
136 | + | |
137 | + } | |
138 | + | |
139 | + template<typename T> | |
140 | + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
141 | + | |
142 | + //calculate the number of bytes in the array | |
143 | + unsigned int bytes = x * y * sizeof(T); | |
144 | + | |
145 | + //calculate the number of bytes in the atan2 table | |
146 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | |
147 | + | |
148 | + //allocate space on the GPU for the Vote Image | |
149 | + T* gpuVote; | |
150 | + cudaMalloc(&gpuVote, bytes); | |
151 | + | |
152 | + //copy the input vote image to the GPU | |
153 | + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); | |
154 | + | |
155 | + //allocate space on the GPU for the input Gradient image | |
156 | + T* gpuGrad; | |
157 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | |
158 | + | |
159 | + //copy the Gradient data to the GPU | |
160 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | |
161 | + | |
162 | + //allocate space on the GPU for the atan2 table | |
163 | + T* gpuTable; | |
164 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | |
165 | + | |
166 | + //copy the atan2 values to the GPU | |
167 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | |
168 | + | |
169 | + //call the GPU version of the update direction function | |
170 | + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
171 | + | |
172 | + //copy the new gradient image back to the CPU | |
173 | + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; | |
174 | + | |
175 | + //free allocated memory | |
176 | + cudaFree(gpuTable); | |
177 | + cudaFree(gpuVote); | |
178 | + cudaFree(gpuGrad); | |
179 | + } | |
180 | + | |
181 | + } | |
182 | +} | |
183 | + | |
184 | +#endif | |
0 | 185 | \ No newline at end of file | ... | ... |
1 | +#ifndef STIM_CUDA_VOTE_ATOMIC_H | |
2 | +#define STIM_CUDA_VOTE_ATOMIC_H | |
3 | + | |
4 | +# include <iostream> | |
5 | +# include <cuda.h> | |
6 | +#include <stim/cuda/cudatools.h> | |
7 | +#include <stim/cuda/sharedmem.cuh> | |
8 | +#include "cpyToshare.cuh" | |
9 | + | |
10 | +namespace stim{ | |
11 | + namespace cuda{ | |
12 | + | |
13 | + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | |
14 | + template<typename T> | |
15 | + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | |
16 | + | |
17 | + | |
18 | + // calculate the 2D coordinates for this current thread. | |
19 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
20 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
21 | + // convert 2D coordinates to 1D | |
22 | + int i = yi * x + xi; | |
23 | + | |
24 | + // calculate the voting direction based on the grtadient direction | |
25 | + float theta = gpuGrad[2*i]; | |
26 | + //calculate the amount of vote for the voter | |
27 | + float mag = gpuGrad[2*i + 1]; | |
28 | + | |
29 | + // compute the size of window which will be checked for finding the proper voters for this pixel | |
30 | + int x_table = 2*rmax +1; | |
31 | + int rmax_sq = rmax * rmax; | |
32 | + if(xi < x && yi < y){ | |
33 | + //for every line (along y) | |
34 | + for(int yr = -rmax; yr <= rmax; yr++){ | |
35 | + for(int xr = -rmax; xr <= rmax; xr++){ | |
36 | + if ((yi+yr)>=0 && (yi+yr)<y && (xi+xr)>=0 && (xi+xr)<x){ | |
37 | + | |
38 | + //find the location of the current pixel in the atan2 table | |
39 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | |
40 | + | |
41 | + // calculate the angle between the voter and the current pixel in x and y directions | |
42 | + float atan_angle = gpuTable[ind_t]; | |
43 | + | |
44 | + // check if the current pixel is located in the voting area of this voter. | |
45 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | |
46 | + // calculate the 1D index for the current pixel in global memory | |
47 | + unsigned int ind_g = (yi+yr)*x + (xi+xr); | |
48 | + atomicAdd(&gpuVote[ind_g], mag); | |
49 | + | |
50 | + } | |
51 | + } | |
52 | + } | |
53 | + } | |
54 | + } | |
55 | + } | |
56 | + | |
57 | + template<typename T> | |
58 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
59 | + | |
60 | + | |
61 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
62 | + dim3 threads(max_threads, 1); | |
63 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
64 | + | |
65 | + // specify share memory | |
66 | + //unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4; | |
67 | + | |
68 | + //call the kernel to do the voting | |
69 | + cuda_vote <<< blocks, threads>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
70 | + | |
71 | + } | |
72 | + | |
73 | + | |
74 | + template<typename T> | |
75 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
76 | + | |
77 | + //calculate the number of bytes in the array | |
78 | + unsigned int bytes = x * y * sizeof(T); | |
79 | + | |
80 | + //calculate the number of bytes in the atan2 table | |
81 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | |
82 | + | |
83 | + //allocate space on the GPU for the Vote Image | |
84 | + T* gpuVote; | |
85 | + cudaMalloc(&gpuVote, bytes); | |
86 | + | |
87 | + //allocate space on the GPU for the input Gradient image | |
88 | + T* gpuGrad; | |
89 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | |
90 | + | |
91 | + //copy the Gradient Magnitude data to the GPU | |
92 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | |
93 | + | |
94 | + //allocate space on the GPU for the atan2 table | |
95 | + T* gpuTable; | |
96 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | |
97 | + | |
98 | + //copy the atan2 values to the GPU | |
99 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | |
100 | + | |
101 | + //call the GPU version of the vote calculation function | |
102 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
103 | + | |
104 | + //copy the Vote Data back to the CPU | |
105 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | |
106 | + | |
107 | + //free allocated memory | |
108 | + cudaFree(gpuTable); | |
109 | + cudaFree(gpuVote); | |
110 | + cudaFree(gpuGrad); | |
111 | + } | |
112 | + | |
113 | + } | |
114 | +} | |
115 | + | |
116 | +#endif | |
0 | 117 | \ No newline at end of file | ... | ... |
1 | +#ifndef STIM_CUDA_VOTE_ATOMIC_SHARED_H | |
2 | +#define STIM_CUDA_VOTE_ATOMIC_SHARED_H | |
3 | + | |
4 | +# include <iostream> | |
5 | +# include <cuda.h> | |
6 | +#include <stim/cuda/cudatools.h> | |
7 | +#include <stim/cuda/sharedmem.cuh> | |
8 | +#include "cpyToshare.cuh" | |
9 | +//#include "writebackshared.cuh" | |
10 | +namespace stim{ | |
11 | + namespace cuda{ | |
12 | + | |
13 | + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | |
14 | + template<typename T> | |
15 | + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | |
16 | + | |
17 | + //generate a pointer to the shared memory | |
18 | + extern __shared__ float s_vote[]; | |
19 | + // calculate the 2D coordinates for this current thread. | |
20 | + int bxi = blockIdx.x * blockDim.x; | |
21 | + int byi = blockIdx.y * blockDim.y; | |
22 | + int xi = bxi + threadIdx.x; | |
23 | + int yi = byi + threadIdx.y; | |
24 | + // convert 2D coordinates to 1D | |
25 | + int i = yi * x + xi; | |
26 | + | |
27 | + // calculate the voting direction based on the gradient direction | |
28 | + float theta = gpuGrad[2*i]; | |
29 | + //calculate the amount of vote for the voter | |
30 | + float mag = gpuGrad[2*i + 1]; | |
31 | + | |
32 | + //find the starting points and size of window, wich will be copied to the shared memory | |
33 | + int bxs = bxi - rmax; | |
34 | + int bys = byi - rmax; | |
35 | + int xwidth = 2*rmax + blockDim.x; | |
36 | + int ywidth = 2*rmax + blockDim.y; | |
37 | + //compute the coordinations of this pixel in the 2D-shared memory. | |
38 | + int sx_rx = threadIdx.x + rmax; | |
39 | + int sy_ry = threadIdx.y + rmax; | |
40 | + // compute the size of window which will be checked for finding the counters for this voter | |
41 | + int x_table = 2*rmax +1; | |
42 | + int rmax_sq = rmax * rmax; | |
43 | + //calculate some parameters for indexing shared memory | |
44 | + //calculate the total number of threads available | |
45 | + unsigned int tThreads = blockDim.x * blockDim.y; | |
46 | + //calculate the current 1D thread ID | |
47 | + unsigned int ti = threadIdx.y * (blockDim.x) + threadIdx.x; | |
48 | + //calculate the number of iteration required | |
49 | + unsigned int In = xwidth*ywidth/tThreads + 1; | |
50 | + if(xi < x && yi < y){ | |
51 | + __syncthreads(); | |
52 | + //initialize the shared memory to zero | |
53 | + for (unsigned int i = 0; i < In; i++){ | |
54 | + unsigned int sIdx0 = i * tThreads + ti; | |
55 | + if (sIdx0< xwidth*ywidth) { | |
56 | + s_vote[sIdx0] = 0; | |
57 | + } | |
58 | + } | |
59 | + __syncthreads(); | |
60 | + //for every line (along y) | |
61 | + for(int yr = -rmax; yr <= rmax; yr++){ | |
62 | + //compute the position of the current voter in the shared memory along the y axis. | |
63 | + unsigned int sIdx_y1d = (sy_ry + yr)* xwidth; | |
64 | + for(int xr = -rmax; xr <= rmax; xr++){ | |
65 | + | |
66 | + //find the location of the current pixel in the atan2 table | |
67 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | |
68 | + | |
69 | + // calculate the angle between the voter and the current pixel in x and y directions | |
70 | + float atan_angle = gpuTable[ind_t]; | |
71 | + | |
72 | + // check if the current pixel is located in the voting area of this voter. | |
73 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | |
74 | + //compute the position of the current voter in the 2D-shared memory along the x axis. | |
75 | + unsigned int sIdx_x = (sx_rx + xr); | |
76 | + //find the 1D index of this voter in the 2D-shared memory. | |
77 | + unsigned int s_Idx = (sIdx_y1d + sIdx_x); | |
78 | + | |
79 | + atomicAdd(&s_vote[s_Idx], mag); | |
80 | + } | |
81 | + } | |
82 | + } | |
83 | + //write shared memory back to global memory | |
84 | + | |
85 | + __syncthreads(); | |
86 | + for (unsigned int i = 0; i < In; i++){ | |
87 | + | |
88 | + unsigned int sIdx = i * tThreads + ti; | |
89 | + if (sIdx>= xwidth*ywidth) return; | |
90 | + | |
91 | + unsigned int sy = sIdx/xwidth; | |
92 | + unsigned int sx = sIdx - (sy * xwidth); | |
93 | + | |
94 | + unsigned int gx = bxs + sx; | |
95 | + unsigned int gy = bys + sy; | |
96 | + if (gx<x&& gy<y){ | |
97 | + unsigned int gIdx = gy * x + gx; | |
98 | + //write shared to global memory | |
99 | + atomicAdd(&gpuVote[gIdx], s_vote[sIdx]); | |
100 | + | |
101 | + } | |
102 | + } | |
103 | + | |
104 | + } | |
105 | + } | |
106 | + | |
107 | + template<typename T> | |
108 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
109 | + | |
110 | + | |
111 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
112 | + dim3 threads(sqrt(max_threads), sqrt(max_threads)); | |
113 | + dim3 blocks(x/threads.x + 1 , y/threads.y+1); | |
114 | + | |
115 | + // specify share memory | |
116 | + unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*sizeof(T); | |
117 | + | |
118 | + //call the kernel to do the voting | |
119 | + cuda_vote <<< blocks, threads, share_bytes>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
120 | + | |
121 | + } | |
122 | + | |
123 | + | |
124 | + template<typename T> | |
125 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
126 | + | |
127 | + //calculate the number of bytes in the array | |
128 | + unsigned int bytes = x * y * sizeof(T); | |
129 | + | |
130 | + //calculate the number of bytes in the atan2 table | |
131 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | |
132 | + | |
133 | + //allocate space on the GPU for the Vote Image | |
134 | + T* gpuVote; | |
135 | + cudaMalloc(&gpuVote, bytes); | |
136 | + | |
137 | + //allocate space on the GPU for the input Gradient image | |
138 | + T* gpuGrad; | |
139 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | |
140 | + | |
141 | + //copy the Gradient Magnitude data to the GPU | |
142 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | |
143 | + | |
144 | + //allocate space on the GPU for the atan2 table | |
145 | + T* gpuTable; | |
146 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | |
147 | + | |
148 | + //copy the atan2 values to the GPU | |
149 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | |
150 | + | |
151 | + //call the GPU version of the vote calculation function | |
152 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
153 | + | |
154 | + //copy the Vote Data back to the CPU | |
155 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | |
156 | + | |
157 | + //free allocated memory | |
158 | + cudaFree(gpuTable); | |
159 | + cudaFree(gpuVote); | |
160 | + cudaFree(gpuGrad); | |
161 | + } | |
162 | + | |
163 | + } | |
164 | +} | |
165 | + | |
166 | +#endif | |
0 | 167 | \ No newline at end of file | ... | ... |
1 | +#ifndef STIM_CUDA_VOTE_SHARED_H | |
2 | +#define STIM_CUDA_VOTE_SHARED | |
3 | +# include <iostream> | |
4 | +# include <cuda.h> | |
5 | +#include <stim/cuda/cudatools.h> | |
6 | +#include <stim/cuda/sharedmem.cuh> | |
7 | +#include "cpyToshare.cuh" | |
8 | + | |
9 | +namespace stim{ | |
10 | + namespace cuda{ | |
11 | + | |
12 | + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | |
13 | + template<typename T> | |
14 | + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | |
15 | + | |
16 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
17 | + extern __shared__ float s_grad[]; | |
18 | + | |
19 | + //calculate the start point for this block | |
20 | + int bxi = blockIdx.x * blockDim.x; | |
21 | + int byi = blockIdx.y * blockDim.y; | |
22 | + // calculate the 2D coordinates for this current thread. | |
23 | + int xi = bxi + threadIdx.x; | |
24 | + int yi = byi + threadIdx.y; | |
25 | + // convert 2D coordinates to 1D | |
26 | + int i = yi * x + xi; | |
27 | + | |
28 | + // define a local variable to sum the votes from the voters | |
29 | + float sum = 0; | |
30 | + | |
31 | + //calculate the width of the shared memory block | |
32 | + int xwidth = 2 * rmax + blockDim.x; | |
33 | + int ywidth = 2 * rmax + blockDim.y; | |
34 | + // compute the size of window which will be checked for finding the proper voters for this pixel | |
35 | + int x_table = 2*rmax +1; | |
36 | + int rmax_sq = rmax * rmax; | |
37 | + int tx_rmax = threadIdx.x + rmax; | |
38 | + int bxs = bxi - rmax; | |
39 | + int bys = byi - rmax; | |
40 | + //compute the coordinations of this pixel in the 2D-shared memory. | |
41 | + int sx_rx = threadIdx.x + rmax; | |
42 | + int sy_ry = threadIdx.y + rmax; | |
43 | + //copy the portion of the image necessary for this block to shared memory | |
44 | + __syncthreads(); | |
45 | + cpyG2S2D2ch<float>(s_grad, gpuGrad, bxs, bys, 2*xwidth, ywidth, threadIdx, blockDim, x, y); | |
46 | + __syncthreads(); | |
47 | + | |
48 | + for(int yr = -rmax; yr <= rmax; yr++){ | |
49 | + int yi_v = (yi + yr) ; | |
50 | + //compute the position of the current voter in the shared memory along the y axis. | |
51 | + unsigned int sIdx_y1d = (sy_ry + yr)* xwidth; | |
52 | + //if (yi+yr<y && yi+yr>=0){ | |
53 | + if(xi < x && yi < y){ | |
54 | + | |
55 | + for(int xr = -rmax; xr <= rmax; xr++){ | |
56 | + | |
57 | + //compute the position of the current voter in the 2D-shared memory along the x axis. | |
58 | + unsigned int sIdx_x = (sx_rx + xr); | |
59 | + //find the 1D index of this voter in the 2D-shared memory. | |
60 | + unsigned int s_Idx = (sIdx_y1d + sIdx_x); | |
61 | + unsigned int s_Idx2 = s_Idx * 2; | |
62 | + | |
63 | + //find the location of this voter in the atan2 table | |
64 | + int id_t = (yr + rmax) * x_table + xr + rmax; | |
65 | + | |
66 | + // calculate the angle between the pixel and the current voter in x and y directions | |
67 | + float atan_angle = gpuTable[id_t]; | |
68 | + | |
69 | + // calculate the voting direction based on the grtadient direction | |
70 | + //int idx_share = xr + tx_rmax ; | |
71 | + float theta = s_grad[s_Idx2]; | |
72 | + float mag = s_grad[s_Idx2 + 1]; | |
73 | + | |
74 | + | |
75 | + // check if the current voter is located in the voting area of this pixel. | |
76 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | |
77 | + sum += mag; | |
78 | + | |
79 | + } | |
80 | + } | |
81 | + | |
82 | + } | |
83 | + //} | |
84 | + } | |
85 | + if(xi < x && yi < y) | |
86 | + gpuVote[i] = sum; | |
87 | + | |
88 | + } | |
89 | + | |
90 | + template<typename T> | |
91 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
92 | + | |
93 | + | |
94 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
95 | + dim3 threads(sqrt(max_threads), sqrt(max_threads)); | |
96 | + dim3 blocks(x/threads.x + 1 , y/threads.y+1); | |
97 | + | |
98 | + | |
99 | + // specify share memory | |
100 | + unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*2*sizeof(T); | |
101 | + | |
102 | + //call the kernel to do the voting | |
103 | + cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
104 | + | |
105 | + } | |
106 | + | |
107 | + | |
108 | + template<typename T> | |
109 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
110 | + | |
111 | + //calculate the number of bytes in the array | |
112 | + unsigned int bytes = x * y * sizeof(T); | |
113 | + | |
114 | + //calculate the number of bytes in the atan2 table | |
115 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | |
116 | + | |
117 | + //allocate space on the GPU for the Vote Image | |
118 | + T* gpuVote; | |
119 | + cudaMalloc(&gpuVote, bytes); | |
120 | + | |
121 | + //allocate space on the GPU for the input Gradient image | |
122 | + T* gpuGrad; | |
123 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | |
124 | + | |
125 | + //copy the Gradient Magnitude data to the GPU | |
126 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | |
127 | + | |
128 | + //allocate space on the GPU for the atan2 table | |
129 | + T* gpuTable; | |
130 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | |
131 | + | |
132 | + //copy the atan2 values to the GPU | |
133 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | |
134 | + | |
135 | + //call the GPU version of the vote calculation function | |
136 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
137 | + | |
138 | + //copy the Vote Data back to the CPU | |
139 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | |
140 | + | |
141 | + //free allocated memory | |
142 | + cudaFree(gpuTable); | |
143 | + cudaFree(gpuVote); | |
144 | + cudaFree(gpuGrad); | |
145 | + } | |
146 | + | |
147 | + } | |
148 | +} | |
149 | + | |
150 | +#endif | |
0 | 151 | \ No newline at end of file | ... | ... |
1 | +#ifndef STIM_CUDA_IVOTE_ATOMIC_H | |
2 | +#define STIM_CUDA_IVOTE_ATOMIC_H | |
3 | + | |
4 | +#include <stim/cuda/ivote/down_sample.cuh> | |
5 | +#include <stim/cuda/ivote/local_max.cuh> | |
6 | +#include <stim/cuda/ivote/update_dir_global.cuh> | |
7 | +//#include <stim/cuda/ivote/vote_shared_32-32.cuh> | |
8 | +#include <stim/cuda/ivote/vote_atomic_shared.cuh> | |
9 | +#include <stim/cuda/ivote/re_sample.cuh> | |
10 | +namespace stim{ | |
11 | + namespace cuda{ | |
12 | + | |
13 | + } | |
14 | +} | |
15 | + | |
16 | + | |
17 | + | |
18 | +#endif | |
0 | 19 | \ No newline at end of file | ... | ... |