Commit bf73197014983c55343330462d4f78d2492f6b0b
1 parent
84ca9bba
fix some bugs in the gradient, conv2sep and local_max codes to get the determini…
…stic results in every run and update the vote and update_dir codes
Showing
6 changed files
with
62 additions
and
195 deletions
Show diff stats
stim/cuda/arraymath/array_cart2polar.cuh
@@ -4,42 +4,47 @@ | @@ -4,42 +4,47 @@ | ||
4 | namespace stim{ | 4 | namespace stim{ |
5 | namespace cuda{ | 5 | namespace cuda{ |
6 | template<typename T> | 6 | template<typename T> |
7 | - __global__ void cuda_cart2polar(T* a, unsigned int N){ | 7 | + __global__ void cuda_cart2polar(T* a, int x, int y){ |
8 | 8 | ||
9 | 9 | ||
10 | - //calculate the 1D index for this thread | ||
11 | - int i = blockIdx.x * blockDim.x + threadIdx.x; | ||
12 | - | ||
13 | - if(i < N){ | ||
14 | - float x = a[i * 2 + 0]; | ||
15 | - float y = a[i * 2 + 1]; | ||
16 | - float theta = atan2( y, x ) ; | ||
17 | - float r = sqrt(x * x + y * y); | 10 | + // calculate the 2D coordinates for this current thread. |
11 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
12 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
13 | + // convert 2D coordinates to 1D | ||
14 | + int i = yi * x + xi; | ||
15 | + | ||
16 | + | ||
17 | + if(xi >= x|| yi >= y) return; | ||
18 | + | ||
19 | + float xl = a[i * 2 + 0]; | ||
20 | + float yl = a[i * 2 + 1]; | ||
21 | + float theta = atan2( yl, xl ) ; | ||
22 | + float r = sqrt(xl * xl + yl * yl); | ||
18 | a[i * 2 + 0] = theta; | 23 | a[i * 2 + 0] = theta; |
19 | a[i * 2 + 1] = r; | 24 | a[i * 2 + 1] = r; |
20 | - } | 25 | + |
21 | } | 26 | } |
22 | 27 | ||
23 | 28 | ||
24 | template<typename T> | 29 | template<typename T> |
25 | - void gpu_cart2polar(T* gpuGrad, unsigned int N){ | ||
26 | - | ||
27 | - //get the maximum number of threads per block for the CUDA device | ||
28 | - int threads = stim::maxThreadsPerBlock(); | 30 | + void gpu_cart2polar(T* gpuGrad, unsigned int x, unsigned int y){ |
29 | 31 | ||
30 | - //calculate the number of blocks | ||
31 | - int blocks = N / threads + (N % threads == 0 ? 0:1); | 32 | + unsigned int max_threads = stim::maxThreadsPerBlock(); |
33 | + dim3 threads(max_threads, 1); | ||
34 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | ||
35 | + | ||
32 | 36 | ||
33 | //call the kernel to do the multiplication | 37 | //call the kernel to do the multiplication |
34 | - cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N); | 38 | + cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y); |
35 | 39 | ||
36 | } | 40 | } |
37 | 41 | ||
38 | 42 | ||
39 | template<typename T> | 43 | template<typename T> |
40 | - void cpu_cart2polar(T* a, unsigned int N){ | 44 | + void cpu_cart2polar(T* a, unsigned int x, unsigned int y){ |
41 | 45 | ||
42 | //calculate the number of bytes in the array | 46 | //calculate the number of bytes in the array |
47 | + unsigned int N = x *y; | ||
43 | unsigned int bytes = N * sizeof(T) * 2; | 48 | unsigned int bytes = N * sizeof(T) * 2; |
44 | 49 | ||
45 | //allocate memory on the GPU for the array | 50 | //allocate memory on the GPU for the array |
@@ -50,7 +55,7 @@ namespace stim{ | @@ -50,7 +55,7 @@ namespace stim{ | ||
50 | HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); | 55 | HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); |
51 | 56 | ||
52 | //call the GPU version of this function | 57 | //call the GPU version of this function |
53 | - gpu_cart2polar<T>(gpuA, N); | 58 | + gpu_cart2polar<T>(gpuA, x, y); |
54 | 59 | ||
55 | //copy the array back to the CPU | 60 | //copy the array back to the CPU |
56 | HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); | 61 | HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); |
stim/cuda/ivote/local_max.cuh
1 | #ifndef STIM_CUDA_LOCAL_MAX_H | 1 | #ifndef STIM_CUDA_LOCAL_MAX_H |
2 | #define STIM_CUDA_LOCAL_MAX_H | 2 | #define STIM_CUDA_LOCAL_MAX_H |
3 | 3 | ||
4 | - | ||
5 | # include <iostream> | 4 | # include <iostream> |
6 | # include <cuda.h> | 5 | # include <cuda.h> |
7 | #include <stim/cuda/cudatools.h> | 6 | #include <stim/cuda/cudatools.h> |
@@ -9,7 +8,6 @@ | @@ -9,7 +8,6 @@ | ||
9 | namespace stim{ | 8 | namespace stim{ |
10 | namespace cuda{ | 9 | namespace cuda{ |
11 | 10 | ||
12 | - | ||
13 | // this kernel calculates the local maximum for finding the cell centers | 11 | // this kernel calculates the local maximum for finding the cell centers |
14 | template<typename T> | 12 | template<typename T> |
15 | __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){ | 13 | __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){ |
@@ -20,13 +18,10 @@ namespace stim{ | @@ -20,13 +18,10 @@ namespace stim{ | ||
20 | 18 | ||
21 | if(xi >= x || yi >= y) | 19 | if(xi >= x || yi >= y) |
22 | return; | 20 | return; |
23 | - | ||
24 | - | 21 | + |
25 | // convert 2D coordinates to 1D | 22 | // convert 2D coordinates to 1D |
26 | int i = yi * x + xi; | 23 | int i = yi * x + xi; |
27 | 24 | ||
28 | - // START DAVID | ||
29 | - | ||
30 | gpuCenters[i] = 0; //initialize the value at this location to zero | 25 | gpuCenters[i] = 0; //initialize the value at this location to zero |
31 | 26 | ||
32 | T val = gpuVote[i]; | 27 | T val = gpuVote[i]; |
@@ -46,100 +41,22 @@ namespace stim{ | @@ -46,100 +41,22 @@ namespace stim{ | ||
46 | } | 41 | } |
47 | 42 | ||
48 | gpuCenters[i] = 1; | 43 | gpuCenters[i] = 1; |
49 | - | ||
50 | - // END DAVID | ||
51 | - /* | ||
52 | - //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. | ||
53 | - int xl = xi - conn; | ||
54 | - int yl = yi - conn; | ||
55 | - | ||
56 | - | ||
57 | - // use zero for the lowest limits if the xi or yi is less than conn. | ||
58 | - if (xi <= conn) | ||
59 | - xl = 0; | ||
60 | - if (yi <= conn) | ||
61 | - yl = 0; | ||
62 | - | ||
63 | - //calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. | ||
64 | - int xh = xi + conn; | ||
65 | - int yh = yi + conn; | ||
66 | - | ||
67 | - // use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn. | ||
68 | - if (xi >= x - conn) | ||
69 | - xh = x; | ||
70 | - if (yi>= y - conn) | ||
71 | - yh = y; | ||
72 | - | ||
73 | - // calculate the limits for finding the local maximum location in the connected neighbors for the current pixel | ||
74 | - int n_l = yl * x + xl; | ||
75 | - int n_h = yh * x + xh; | ||
76 | - | ||
77 | - //initial the centers image to zero | ||
78 | - gpuCenters[i] = 0; | ||
79 | - | ||
80 | - | ||
81 | - int n = n_l; | ||
82 | - | ||
83 | - float l_value = 0; | ||
84 | - | ||
85 | - if (i < x * y) | ||
86 | - | ||
87 | - // check if the vote value for this pixel is greater than threshold, so this pixel may be a local max. | ||
88 | - if (gpuVote[i]>final_t){ | ||
89 | - | ||
90 | - // compare the vote value for this pixel with the vote values with its neighbors. | ||
91 | - while (n<=n_h){ | ||
92 | - | ||
93 | - // check if this vote value is a local max in its neighborhood. | ||
94 | - if (gpuVote[i] < gpuVote[n]){ | ||
95 | - l_value = 0; | ||
96 | - n =n_h+1; | ||
97 | - } | ||
98 | - else if (n == n_h){ | ||
99 | - l_value = 1; | ||
100 | - n = n+1; | ||
101 | - } | ||
102 | - // check if the current neighbor is the last one at the current row | ||
103 | - else if ((n - n_l - 2*conn)% x ==0){ | ||
104 | - n = n + x - 2*conn -1; | ||
105 | - n ++; | ||
106 | - } | ||
107 | - else | ||
108 | - n ++; | ||
109 | - } | ||
110 | - // set the center value for this pixel to high if it's a local max ,and to low if not. | ||
111 | - gpuCenters[i] = l_value ; | ||
112 | - } | ||
113 | - */ | ||
114 | - | ||
115 | } | 44 | } |
116 | - | ||
117 | - | ||
118 | - | 45 | + |
119 | template<typename T> | 46 | template<typename T> |
120 | void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ | 47 | void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ |
121 | 48 | ||
122 | - | ||
123 | - | ||
124 | - | ||
125 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 49 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
126 | dim3 threads(max_threads, 1); | 50 | dim3 threads(max_threads, 1); |
127 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | 51 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); |
128 | 52 | ||
129 | - | ||
130 | - | ||
131 | //call the kernel to find the local maximum. | 53 | //call the kernel to find the local maximum. |
132 | cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); | 54 | cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); |
133 | - | ||
134 | - | ||
135 | } | 55 | } |
136 | 56 | ||
137 | - | ||
138 | - | ||
139 | template<typename T> | 57 | template<typename T> |
140 | void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ | 58 | void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ |
141 | 59 | ||
142 | - | ||
143 | //calculate the number of bytes in the array | 60 | //calculate the number of bytes in the array |
144 | unsigned int bytes = x * y * sizeof(T); | 61 | unsigned int bytes = x * y * sizeof(T); |
145 | 62 | ||
@@ -147,28 +64,22 @@ namespace stim{ | @@ -147,28 +64,22 @@ namespace stim{ | ||
147 | T* gpuCenters; | 64 | T* gpuCenters; |
148 | cudaMalloc(&gpuCenters, bytes); | 65 | cudaMalloc(&gpuCenters, bytes); |
149 | 66 | ||
150 | - | ||
151 | //allocate space on the GPU for the input Vote Image | 67 | //allocate space on the GPU for the input Vote Image |
152 | T* gpuVote; | 68 | T* gpuVote; |
153 | cudaMalloc(&gpuVote, bytes); | 69 | cudaMalloc(&gpuVote, bytes); |
154 | 70 | ||
155 | - | ||
156 | //copy the Vote image data to the GPU | 71 | //copy the Vote image data to the GPU |
157 | HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); | 72 | HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); |
158 | - | ||
159 | - | 73 | + |
160 | //call the GPU version of the local max function | 74 | //call the GPU version of the local max function |
161 | gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y); | 75 | gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y); |
162 | - | ||
163 | - | 76 | + |
164 | //copy the cell centers data to the CPU | 77 | //copy the cell centers data to the CPU |
165 | cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ; | 78 | cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ; |
166 | - | ||
167 | - | 79 | + |
168 | //free allocated memory | 80 | //free allocated memory |
169 | cudaFree(gpuCenters); | 81 | cudaFree(gpuCenters); |
170 | cudaFree(gpuVote); | 82 | cudaFree(gpuVote); |
171 | - cudaFree(gpuGrad); | ||
172 | } | 83 | } |
173 | 84 | ||
174 | } | 85 | } |
stim/cuda/ivote/update_dir.cuh
1 | #ifndef STIM_CUDA_UPDATE_DIR_H | 1 | #ifndef STIM_CUDA_UPDATE_DIR_H |
2 | #define STIM_CUDA_UPDATE_DIR_H | 2 | #define STIM_CUDA_UPDATE_DIR_H |
3 | 3 | ||
4 | - | ||
5 | # include <iostream> | 4 | # include <iostream> |
6 | # include <cuda.h> | 5 | # include <cuda.h> |
7 | #include <stim/cuda/cudatools.h> | 6 | #include <stim/cuda/cudatools.h> |
@@ -48,8 +47,7 @@ namespace stim{ | @@ -48,8 +47,7 @@ namespace stim{ | ||
48 | int rmax_sq = rmax * rmax; | 47 | int rmax_sq = rmax * rmax; |
49 | int tx_rmax = threadIdx.x + rmax; | 48 | int tx_rmax = threadIdx.x + rmax; |
50 | int bxs = bxi - rmax; | 49 | int bxs = bxi - rmax; |
51 | - | ||
52 | - | 50 | + |
53 | for(int yr = -rmax; yr <= rmax; yr++){ | 51 | for(int yr = -rmax; yr <= rmax; yr++){ |
54 | 52 | ||
55 | //copy the portion of the image necessary for this block to shared memory | 53 | //copy the portion of the image necessary for this block to shared memory |
@@ -66,8 +64,7 @@ namespace stim{ | @@ -66,8 +64,7 @@ namespace stim{ | ||
66 | 64 | ||
67 | // calculate the angle between the voter and the current pixel in x and y directions | 65 | // calculate the angle between the voter and the current pixel in x and y directions |
68 | float atan_angle = gpuTable[ind_t]; | 66 | float atan_angle = gpuTable[ind_t]; |
69 | - | ||
70 | - | 67 | + |
71 | // calculate the voting direction based on the grtadient direction | 68 | // calculate the voting direction based on the grtadient direction |
72 | int idx_share_update = xr + tx_rmax ; | 69 | int idx_share_update = xr + tx_rmax ; |
73 | float share_vote = s_vote[idx_share_update]; | 70 | float share_vote = s_vote[idx_share_update]; |
@@ -86,11 +83,8 @@ namespace stim{ | @@ -86,11 +83,8 @@ namespace stim{ | ||
86 | } | 83 | } |
87 | } | 84 | } |
88 | } | 85 | } |
89 | - | ||
90 | - | ||
91 | - //float new_angle = atan2(dy, dx); | 86 | + |
92 | unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | 87 | unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); |
93 | - | ||
94 | float new_angle = gpuTable[ind_m]; | 88 | float new_angle = gpuTable[ind_m]; |
95 | 89 | ||
96 | if(xi < x && yi < y) | 90 | if(xi < x && yi < y) |
@@ -102,35 +96,27 @@ namespace stim{ | @@ -102,35 +96,27 @@ namespace stim{ | ||
102 | template<typename T> | 96 | template<typename T> |
103 | __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ | 97 | __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ |
104 | 98 | ||
105 | - //************ when the number of threads are (1024,1) ************* | ||
106 | - | ||
107 | // calculate the 2D coordinates for this current thread. | 99 | // calculate the 2D coordinates for this current thread. |
108 | int xi = blockIdx.x * blockDim.x + threadIdx.x; | 100 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
109 | int yi = blockIdx.y * blockDim.y + threadIdx.y; | 101 | int yi = blockIdx.y * blockDim.y + threadIdx.y; |
110 | 102 | ||
111 | // convert 2D coordinates to 1D | 103 | // convert 2D coordinates to 1D |
112 | int i = yi * x + xi; | 104 | int i = yi * x + xi; |
113 | - | ||
114 | - | 105 | + |
115 | //update the gradient image with the vote direction | 106 | //update the gradient image with the vote direction |
116 | gpuGrad[2*i] = gpuDir[i]; | 107 | gpuGrad[2*i] = gpuDir[i]; |
117 | } | 108 | } |
118 | - | ||
119 | - | 109 | + |
120 | template<typename T> | 110 | template<typename T> |
121 | void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | 111 | void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ |
122 | 112 | ||
123 | //get the number of pixels in the image | 113 | //get the number of pixels in the image |
124 | unsigned int pixels = x * y; | 114 | unsigned int pixels = x * y; |
125 | unsigned int bytes = sizeof(T) * pixels; | 115 | unsigned int bytes = sizeof(T) * pixels; |
126 | - | ||
127 | - | 116 | + |
128 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 117 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
129 | dim3 threads(max_threads, 1); | 118 | dim3 threads(max_threads, 1); |
130 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | 119 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); |
131 | - //dim3 threads(1, 1); | ||
132 | - //dim3 blocks(x, y); | ||
133 | - // Allocate CUDA array in device memory | ||
134 | 120 | ||
135 | //define a channel descriptor for a single 32-bit channel | 121 | //define a channel descriptor for a single 32-bit channel |
136 | cudaChannelFormatDesc channelDesc = | 122 | cudaChannelFormatDesc channelDesc = |
@@ -174,14 +160,12 @@ namespace stim{ | @@ -174,14 +160,12 @@ namespace stim{ | ||
174 | 160 | ||
175 | //call the kernel to update the gradient direction | 161 | //call the kernel to update the gradient direction |
176 | cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | 162 | cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); |
177 | - | ||
178 | - | 163 | + |
179 | //free allocated memory | 164 | //free allocated memory |
180 | cudaFree(gpuDir); | 165 | cudaFree(gpuDir); |
181 | 166 | ||
182 | } | 167 | } |
183 | - | ||
184 | - | 168 | + |
185 | template<typename T> | 169 | template<typename T> |
186 | void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | 170 | void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ |
187 | 171 | ||
@@ -211,12 +195,10 @@ namespace stim{ | @@ -211,12 +195,10 @@ namespace stim{ | ||
211 | 195 | ||
212 | //copy the atan2 values to the GPU | 196 | //copy the atan2 values to the GPU |
213 | HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | 197 | HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); |
214 | - | ||
215 | - | 198 | + |
216 | //call the GPU version of the update direction function | 199 | //call the GPU version of the update direction function |
217 | gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | 200 | gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); |
218 | - | ||
219 | - | 201 | + |
220 | //copy the new gradient image back to the CPU | 202 | //copy the new gradient image back to the CPU |
221 | cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; | 203 | cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; |
222 | 204 | ||
@@ -229,6 +211,4 @@ namespace stim{ | @@ -229,6 +211,4 @@ namespace stim{ | ||
229 | } | 211 | } |
230 | } | 212 | } |
231 | 213 | ||
232 | - | ||
233 | - | ||
234 | #endif | 214 | #endif |
235 | \ No newline at end of file | 215 | \ No newline at end of file |
stim/cuda/ivote/vote.cuh
1 | #ifndef STIM_CUDA_VOTE_H | 1 | #ifndef STIM_CUDA_VOTE_H |
2 | #define STIM_CUDA_VOTE_H | 2 | #define STIM_CUDA_VOTE_H |
3 | 3 | ||
4 | - | ||
5 | # include <iostream> | 4 | # include <iostream> |
6 | # include <cuda.h> | 5 | # include <cuda.h> |
7 | #include <stim/cuda/cudatools.h> | 6 | #include <stim/cuda/cudatools.h> |
8 | #include <stim/cuda/sharedmem.cuh> | 7 | #include <stim/cuda/sharedmem.cuh> |
9 | 8 | ||
10 | - | ||
11 | namespace stim{ | 9 | namespace stim{ |
12 | namespace cuda{ | 10 | namespace cuda{ |
13 | 11 | ||
@@ -26,13 +24,10 @@ namespace stim{ | @@ -26,13 +24,10 @@ namespace stim{ | ||
26 | int yi = blockIdx.y * blockDim.y + threadIdx.y; | 24 | int yi = blockIdx.y * blockDim.y + threadIdx.y; |
27 | // convert 2D coordinates to 1D | 25 | // convert 2D coordinates to 1D |
28 | int i = yi * x + xi; | 26 | int i = yi * x + xi; |
29 | - | ||
30 | - | ||
31 | - | 27 | + |
32 | // define a local variable to sum the votes from the voters | 28 | // define a local variable to sum the votes from the voters |
33 | float sum = 0; | 29 | float sum = 0; |
34 | 30 | ||
35 | - | ||
36 | //calculate the width of the shared memory block | 31 | //calculate the width of the shared memory block |
37 | int swidth = 2 * rmax + blockDim.x; | 32 | int swidth = 2 * rmax + blockDim.x; |
38 | 33 | ||
@@ -59,8 +54,7 @@ namespace stim{ | @@ -59,8 +54,7 @@ namespace stim{ | ||
59 | 54 | ||
60 | // calculate the angle between the pixel and the current voter in x and y directions | 55 | // calculate the angle between the pixel and the current voter in x and y directions |
61 | float atan_angle = gpuTable[id_t]; | 56 | float atan_angle = gpuTable[id_t]; |
62 | - | ||
63 | - | 57 | + |
64 | // calculate the voting direction based on the grtadient direction | 58 | // calculate the voting direction based on the grtadient direction |
65 | int idx_share = xr + tx_rmax ; | 59 | int idx_share = xr + tx_rmax ; |
66 | float2 g = s_grad[idx_share]; | 60 | float2 g = s_grad[idx_share]; |
@@ -71,7 +65,6 @@ namespace stim{ | @@ -71,7 +65,6 @@ namespace stim{ | ||
71 | sum += g.y; | 65 | sum += g.y; |
72 | 66 | ||
73 | } | 67 | } |
74 | - | ||
75 | } | 68 | } |
76 | 69 | ||
77 | } | 70 | } |
@@ -87,15 +80,11 @@ namespace stim{ | @@ -87,15 +80,11 @@ namespace stim{ | ||
87 | //get the number of pixels in the image | 80 | //get the number of pixels in the image |
88 | unsigned int pixels = x * y; | 81 | unsigned int pixels = x * y; |
89 | unsigned int bytes = sizeof(T) * pixels; | 82 | unsigned int bytes = sizeof(T) * pixels; |
90 | - | ||
91 | - | 83 | + |
92 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 84 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
93 | - //unsigned int thread_dim = sqrt(max_threads); | ||
94 | dim3 threads(max_threads, 1); | 85 | dim3 threads(max_threads, 1); |
95 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | 86 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); |
96 | - //dim3 threads(1,1); | ||
97 | - //dim3 blocks(x, y); | ||
98 | - | 87 | + |
99 | // Allocate CUDA array in device memory | 88 | // Allocate CUDA array in device memory |
100 | 89 | ||
101 | //define a channel descriptor for a single 32-bit channel | 90 | //define a channel descriptor for a single 32-bit channel |
@@ -131,7 +120,6 @@ namespace stim{ | @@ -131,7 +120,6 @@ namespace stim{ | ||
131 | // specify share memory | 120 | // specify share memory |
132 | unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4; | 121 | unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4; |
133 | 122 | ||
134 | - | ||
135 | //call the kernel to do the voting | 123 | //call the kernel to do the voting |
136 | 124 | ||
137 | cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y); | 125 | cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y); |
@@ -165,14 +153,10 @@ namespace stim{ | @@ -165,14 +153,10 @@ namespace stim{ | ||
165 | 153 | ||
166 | //copy the atan2 values to the GPU | 154 | //copy the atan2 values to the GPU |
167 | HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | 155 | HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); |
168 | - | ||
169 | - //cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table ); | ||
170 | - | ||
171 | - | 156 | + |
172 | //call the GPU version of the vote calculation function | 157 | //call the GPU version of the vote calculation function |
173 | gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | 158 | gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); |
174 | - | ||
175 | - | 159 | + |
176 | //copy the Vote Data back to the CPU | 160 | //copy the Vote Data back to the CPU |
177 | cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | 161 | cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; |
178 | 162 | ||
@@ -185,6 +169,4 @@ namespace stim{ | @@ -185,6 +169,4 @@ namespace stim{ | ||
185 | } | 169 | } |
186 | } | 170 | } |
187 | 171 | ||
188 | - | ||
189 | - | ||
190 | #endif | 172 | #endif |
191 | \ No newline at end of file | 173 | \ No newline at end of file |
stim/cuda/templates/conv2sep.cuh
@@ -49,7 +49,7 @@ namespace stim{ | @@ -49,7 +49,7 @@ namespace stim{ | ||
49 | __syncthreads(); | 49 | __syncthreads(); |
50 | 50 | ||
51 | //if the current pixel is outside of the image | 51 | //if the current pixel is outside of the image |
52 | - if(xi > x || yi > y) | 52 | + if(xi >= x || yi >= y) |
53 | return; | 53 | return; |
54 | 54 | ||
55 | 55 | ||
@@ -61,7 +61,7 @@ namespace stim{ | @@ -61,7 +61,7 @@ namespace stim{ | ||
61 | 61 | ||
62 | //for each element of the kernel | 62 | //for each element of the kernel |
63 | for(int ki = -kr; ki <= kr; ki++){ | 63 | for(int ki = -kr; ki <= kr; ki++){ |
64 | - sum += g[ki + kr] * s[si + ki]; | 64 | + sum += s[si + ki] * g[ki + kr]; |
65 | } | 65 | } |
66 | 66 | ||
67 | //calculate the 1D image index for this thread | 67 | //calculate the 1D image index for this thread |
stim/cuda/templates/gradient.cuh
@@ -11,12 +11,12 @@ namespace stim{ | @@ -11,12 +11,12 @@ namespace stim{ | ||
11 | template<typename T> | 11 | template<typename T> |
12 | __global__ void gradient_2d(T* out, T* in, int x, int y){ | 12 | __global__ void gradient_2d(T* out, T* in, int x, int y){ |
13 | 13 | ||
14 | - //calculate the 1D image index for this thread | ||
15 | - int i = blockIdx.x * blockDim.x + threadIdx.x; | ||
16 | - | ||
17 | - //convert this to 2D pixel coordinates | ||
18 | - int yi = i / x; | ||
19 | - int xi = i - (yi * x); | 14 | + |
15 | + // calculate the 2D coordinates for this current thread. | ||
16 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
17 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
18 | + // convert 2D coordinates to 1D | ||
19 | + int i = yi * x + xi; | ||
20 | 20 | ||
21 | //return if the pixel is outside of the image | 21 | //return if the pixel is outside of the image |
22 | if(xi >= x || yi >= y) return; | 22 | if(xi >= x || yi >= y) return; |
@@ -25,16 +25,16 @@ namespace stim{ | @@ -25,16 +25,16 @@ namespace stim{ | ||
25 | int i_xp = yi * x + (xi + 1); | 25 | int i_xp = yi * x + (xi + 1); |
26 | int i_yp = (yi + 1) * x + xi; | 26 | int i_yp = (yi + 1) * x + xi; |
27 | 27 | ||
28 | + //calculate indices for the backward difference | ||
29 | + int i_xn = yi * x + (xi - 1); | ||
30 | + int i_yn = (yi - 1) * x + xi; | ||
31 | + | ||
28 | //use forward differences if a coordinate is zero | 32 | //use forward differences if a coordinate is zero |
29 | if(xi == 0) | 33 | if(xi == 0) |
30 | out[i * 2 + 0] = in[i_xp] - in[i]; | 34 | out[i * 2 + 0] = in[i_xp] - in[i]; |
31 | if(yi == 0) | 35 | if(yi == 0) |
32 | out[i * 2 + 1] = in[i_yp] - in[i]; | 36 | out[i * 2 + 1] = in[i_yp] - in[i]; |
33 | 37 | ||
34 | - //calculate indices for the backward difference | ||
35 | - int i_xn = yi * x + (xi - 1); | ||
36 | - int i_yn = (yi - 1) * x + xi; | ||
37 | - | ||
38 | //use backward differences if the coordinate is at the maximum edge | 38 | //use backward differences if the coordinate is at the maximum edge |
39 | if(xi == x-1) | 39 | if(xi == x-1) |
40 | out[i * 2 + 0] = in[i] - in[i_xn]; | 40 | out[i * 2 + 0] = in[i] - in[i_xn]; |
@@ -51,28 +51,16 @@ namespace stim{ | @@ -51,28 +51,16 @@ namespace stim{ | ||
51 | } | 51 | } |
52 | 52 | ||
53 | template<typename T> | 53 | template<typename T> |
54 | - //void gpu_gradient_2d(T* gpuOut, T* gpuIn, unsigned int x, unsigned int y){ | ||
55 | void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ | 54 | void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ |
56 | 55 | ||
57 | //get the number of pixels in the image | 56 | //get the number of pixels in the image |
58 | unsigned int pixels = x * y; | 57 | unsigned int pixels = x * y; |
59 | 58 | ||
60 | - //allocate space on the GPU for the input image | ||
61 | - //T* gpuI; | ||
62 | - //HANDLE_ERROR(cudaMalloc(&gpuI, bytes)); | ||
63 | - | ||
64 | - //cudaMemcpy(gpuI, gpuI0, bytes, cudaMemcpyDeviceToDevice); | ||
65 | - | ||
66 | - | ||
67 | - //allocate space on the GPU for the output gradient image | ||
68 | - //T* gpuGrad; | ||
69 | - //cudaMalloc(&gpuGrad, bytes * 2); //the output image will have two channels (x, y) | ||
70 | - | ||
71 | //get the maximum number of threads per block for the CUDA device | 59 | //get the maximum number of threads per block for the CUDA device |
72 | - int threads = stim::maxThreadsPerBlock(); | ||
73 | - | ||
74 | - //calculate the number of blocks | ||
75 | - int blocks = pixels / threads + (pixels%threads == 0 ? 0:1); | 60 | + unsigned int max_threads = stim::maxThreadsPerBlock(); |
61 | + dim3 threads(max_threads, 1); | ||
62 | + dim3 blocks(x/threads.x + 1 , y); | ||
63 | + | ||
76 | 64 | ||
77 | //call the GPU kernel to determine the gradient | 65 | //call the GPU kernel to determine the gradient |
78 | gradient_2d<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y); | 66 | gradient_2d<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y); |
@@ -105,6 +93,7 @@ namespace stim{ | @@ -105,6 +93,7 @@ namespace stim{ | ||
105 | 93 | ||
106 | //free allocated memory | 94 | //free allocated memory |
107 | cudaFree(gpuOut); | 95 | cudaFree(gpuOut); |
96 | + cudaFree(gpuIn); | ||
108 | } | 97 | } |
109 | 98 | ||
110 | } | 99 | } |