Commit bf73197014983c55343330462d4f78d2492f6b0b

Authored by Laila Saadatifard
1 parent 84ca9bba

fix some bugs in the gradient, conv2sep and local_max codes to get the determini…

…stic results in every run and update the vote and update_dir codes
stim/cuda/arraymath/array_cart2polar.cuh
@@ -4,42 +4,47 @@ @@ -4,42 +4,47 @@
4 namespace stim{ 4 namespace stim{
5 namespace cuda{ 5 namespace cuda{
6 template<typename T> 6 template<typename T>
7 - __global__ void cuda_cart2polar(T* a, unsigned int N){ 7 + __global__ void cuda_cart2polar(T* a, int x, int y){
8 8
9 9
10 - //calculate the 1D index for this thread  
11 - int i = blockIdx.x * blockDim.x + threadIdx.x;  
12 -  
13 - if(i < N){  
14 - float x = a[i * 2 + 0];  
15 - float y = a[i * 2 + 1];  
16 - float theta = atan2( y, x ) ;  
17 - float r = sqrt(x * x + y * y); 10 + // calculate the 2D coordinates for this current thread.
  11 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  12 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  13 + // convert 2D coordinates to 1D
  14 + int i = yi * x + xi;
  15 +
  16 +
  17 + if(xi >= x|| yi >= y) return;
  18 +
  19 + float xl = a[i * 2 + 0];
  20 + float yl = a[i * 2 + 1];
  21 + float theta = atan2( yl, xl ) ;
  22 + float r = sqrt(xl * xl + yl * yl);
18 a[i * 2 + 0] = theta; 23 a[i * 2 + 0] = theta;
19 a[i * 2 + 1] = r; 24 a[i * 2 + 1] = r;
20 - } 25 +
21 } 26 }
22 27
23 28
24 template<typename T> 29 template<typename T>
25 - void gpu_cart2polar(T* gpuGrad, unsigned int N){  
26 -  
27 - //get the maximum number of threads per block for the CUDA device  
28 - int threads = stim::maxThreadsPerBlock(); 30 + void gpu_cart2polar(T* gpuGrad, unsigned int x, unsigned int y){
29 31
30 - //calculate the number of blocks  
31 - int blocks = N / threads + (N % threads == 0 ? 0:1); 32 + unsigned int max_threads = stim::maxThreadsPerBlock();
  33 + dim3 threads(max_threads, 1);
  34 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  35 +
32 36
33 //call the kernel to do the multiplication 37 //call the kernel to do the multiplication
34 - cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N); 38 + cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y);
35 39
36 } 40 }
37 41
38 42
39 template<typename T> 43 template<typename T>
40 - void cpu_cart2polar(T* a, unsigned int N){ 44 + void cpu_cart2polar(T* a, unsigned int x, unsigned int y){
41 45
42 //calculate the number of bytes in the array 46 //calculate the number of bytes in the array
  47 + unsigned int N = x *y;
43 unsigned int bytes = N * sizeof(T) * 2; 48 unsigned int bytes = N * sizeof(T) * 2;
44 49
45 //allocate memory on the GPU for the array 50 //allocate memory on the GPU for the array
@@ -50,7 +55,7 @@ namespace stim{ @@ -50,7 +55,7 @@ namespace stim{
50 HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); 55 HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) );
51 56
52 //call the GPU version of this function 57 //call the GPU version of this function
53 - gpu_cart2polar<T>(gpuA, N); 58 + gpu_cart2polar<T>(gpuA, x, y);
54 59
55 //copy the array back to the CPU 60 //copy the array back to the CPU
56 HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); 61 HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) );
stim/cuda/ivote/local_max.cuh
1 #ifndef STIM_CUDA_LOCAL_MAX_H 1 #ifndef STIM_CUDA_LOCAL_MAX_H
2 #define STIM_CUDA_LOCAL_MAX_H 2 #define STIM_CUDA_LOCAL_MAX_H
3 3
4 -  
5 # include <iostream> 4 # include <iostream>
6 # include <cuda.h> 5 # include <cuda.h>
7 #include <stim/cuda/cudatools.h> 6 #include <stim/cuda/cudatools.h>
@@ -9,7 +8,6 @@ @@ -9,7 +8,6 @@
9 namespace stim{ 8 namespace stim{
10 namespace cuda{ 9 namespace cuda{
11 10
12 -  
13 // this kernel calculates the local maximum for finding the cell centers 11 // this kernel calculates the local maximum for finding the cell centers
14 template<typename T> 12 template<typename T>
15 __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){ 13 __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){
@@ -20,13 +18,10 @@ namespace stim{ @@ -20,13 +18,10 @@ namespace stim{
20 18
21 if(xi >= x || yi >= y) 19 if(xi >= x || yi >= y)
22 return; 20 return;
23 -  
24 - 21 +
25 // convert 2D coordinates to 1D 22 // convert 2D coordinates to 1D
26 int i = yi * x + xi; 23 int i = yi * x + xi;
27 24
28 - // START DAVID  
29 -  
30 gpuCenters[i] = 0; //initialize the value at this location to zero 25 gpuCenters[i] = 0; //initialize the value at this location to zero
31 26
32 T val = gpuVote[i]; 27 T val = gpuVote[i];
@@ -46,100 +41,22 @@ namespace stim{ @@ -46,100 +41,22 @@ namespace stim{
46 } 41 }
47 42
48 gpuCenters[i] = 1; 43 gpuCenters[i] = 1;
49 -  
50 - // END DAVID  
51 - /*  
52 - //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.  
53 - int xl = xi - conn;  
54 - int yl = yi - conn;  
55 -  
56 -  
57 - // use zero for the lowest limits if the xi or yi is less than conn.  
58 - if (xi <= conn)  
59 - xl = 0;  
60 - if (yi <= conn)  
61 - yl = 0;  
62 -  
63 - //calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.  
64 - int xh = xi + conn;  
65 - int yh = yi + conn;  
66 -  
67 - // use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn.  
68 - if (xi >= x - conn)  
69 - xh = x;  
70 - if (yi>= y - conn)  
71 - yh = y;  
72 -  
73 - // calculate the limits for finding the local maximum location in the connected neighbors for the current pixel  
74 - int n_l = yl * x + xl;  
75 - int n_h = yh * x + xh;  
76 -  
77 - //initial the centers image to zero  
78 - gpuCenters[i] = 0;  
79 -  
80 -  
81 - int n = n_l;  
82 -  
83 - float l_value = 0;  
84 -  
85 - if (i < x * y)  
86 -  
87 - // check if the vote value for this pixel is greater than threshold, so this pixel may be a local max.  
88 - if (gpuVote[i]>final_t){  
89 -  
90 - // compare the vote value for this pixel with the vote values with its neighbors.  
91 - while (n<=n_h){  
92 -  
93 - // check if this vote value is a local max in its neighborhood.  
94 - if (gpuVote[i] < gpuVote[n]){  
95 - l_value = 0;  
96 - n =n_h+1;  
97 - }  
98 - else if (n == n_h){  
99 - l_value = 1;  
100 - n = n+1;  
101 - }  
102 - // check if the current neighbor is the last one at the current row  
103 - else if ((n - n_l - 2*conn)% x ==0){  
104 - n = n + x - 2*conn -1;  
105 - n ++;  
106 - }  
107 - else  
108 - n ++;  
109 - }  
110 - // set the center value for this pixel to high if it's a local max ,and to low if not.  
111 - gpuCenters[i] = l_value ;  
112 - }  
113 - */  
114 -  
115 } 44 }
116 -  
117 -  
118 - 45 +
119 template<typename T> 46 template<typename T>
120 void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ 47 void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
121 48
122 -  
123 -  
124 -  
125 unsigned int max_threads = stim::maxThreadsPerBlock(); 49 unsigned int max_threads = stim::maxThreadsPerBlock();
126 dim3 threads(max_threads, 1); 50 dim3 threads(max_threads, 1);
127 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); 51 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
128 52
129 -  
130 -  
131 //call the kernel to find the local maximum. 53 //call the kernel to find the local maximum.
132 cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); 54 cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
133 -  
134 -  
135 } 55 }
136 56
137 -  
138 -  
139 template<typename T> 57 template<typename T>
140 void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ 58 void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
141 59
142 -  
143 //calculate the number of bytes in the array 60 //calculate the number of bytes in the array
144 unsigned int bytes = x * y * sizeof(T); 61 unsigned int bytes = x * y * sizeof(T);
145 62
@@ -147,28 +64,22 @@ namespace stim{ @@ -147,28 +64,22 @@ namespace stim{
147 T* gpuCenters; 64 T* gpuCenters;
148 cudaMalloc(&gpuCenters, bytes); 65 cudaMalloc(&gpuCenters, bytes);
149 66
150 -  
151 //allocate space on the GPU for the input Vote Image 67 //allocate space on the GPU for the input Vote Image
152 T* gpuVote; 68 T* gpuVote;
153 cudaMalloc(&gpuVote, bytes); 69 cudaMalloc(&gpuVote, bytes);
154 70
155 -  
156 //copy the Vote image data to the GPU 71 //copy the Vote image data to the GPU
157 HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); 72 HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
158 -  
159 - 73 +
160 //call the GPU version of the local max function 74 //call the GPU version of the local max function
161 gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y); 75 gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y);
162 -  
163 - 76 +
164 //copy the cell centers data to the CPU 77 //copy the cell centers data to the CPU
165 cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ; 78 cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ;
166 -  
167 - 79 +
168 //free allocated memory 80 //free allocated memory
169 cudaFree(gpuCenters); 81 cudaFree(gpuCenters);
170 cudaFree(gpuVote); 82 cudaFree(gpuVote);
171 - cudaFree(gpuGrad);  
172 } 83 }
173 84
174 } 85 }
stim/cuda/ivote/update_dir.cuh
1 #ifndef STIM_CUDA_UPDATE_DIR_H 1 #ifndef STIM_CUDA_UPDATE_DIR_H
2 #define STIM_CUDA_UPDATE_DIR_H 2 #define STIM_CUDA_UPDATE_DIR_H
3 3
4 -  
5 # include <iostream> 4 # include <iostream>
6 # include <cuda.h> 5 # include <cuda.h>
7 #include <stim/cuda/cudatools.h> 6 #include <stim/cuda/cudatools.h>
@@ -48,8 +47,7 @@ namespace stim{ @@ -48,8 +47,7 @@ namespace stim{
48 int rmax_sq = rmax * rmax; 47 int rmax_sq = rmax * rmax;
49 int tx_rmax = threadIdx.x + rmax; 48 int tx_rmax = threadIdx.x + rmax;
50 int bxs = bxi - rmax; 49 int bxs = bxi - rmax;
51 -  
52 - 50 +
53 for(int yr = -rmax; yr <= rmax; yr++){ 51 for(int yr = -rmax; yr <= rmax; yr++){
54 52
55 //copy the portion of the image necessary for this block to shared memory 53 //copy the portion of the image necessary for this block to shared memory
@@ -66,8 +64,7 @@ namespace stim{ @@ -66,8 +64,7 @@ namespace stim{
66 64
67 // calculate the angle between the voter and the current pixel in x and y directions 65 // calculate the angle between the voter and the current pixel in x and y directions
68 float atan_angle = gpuTable[ind_t]; 66 float atan_angle = gpuTable[ind_t];
69 -  
70 - 67 +
71 // calculate the voting direction based on the grtadient direction 68 // calculate the voting direction based on the grtadient direction
72 int idx_share_update = xr + tx_rmax ; 69 int idx_share_update = xr + tx_rmax ;
73 float share_vote = s_vote[idx_share_update]; 70 float share_vote = s_vote[idx_share_update];
@@ -86,11 +83,8 @@ namespace stim{ @@ -86,11 +83,8 @@ namespace stim{
86 } 83 }
87 } 84 }
88 } 85 }
89 -  
90 -  
91 - //float new_angle = atan2(dy, dx); 86 +
92 unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); 87 unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
93 -  
94 float new_angle = gpuTable[ind_m]; 88 float new_angle = gpuTable[ind_m];
95 89
96 if(xi < x && yi < y) 90 if(xi < x && yi < y)
@@ -102,35 +96,27 @@ namespace stim{ @@ -102,35 +96,27 @@ namespace stim{
102 template<typename T> 96 template<typename T>
103 __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ 97 __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
104 98
105 - //************ when the number of threads are (1024,1) *************  
106 -  
107 // calculate the 2D coordinates for this current thread. 99 // calculate the 2D coordinates for this current thread.
108 int xi = blockIdx.x * blockDim.x + threadIdx.x; 100 int xi = blockIdx.x * blockDim.x + threadIdx.x;
109 int yi = blockIdx.y * blockDim.y + threadIdx.y; 101 int yi = blockIdx.y * blockDim.y + threadIdx.y;
110 102
111 // convert 2D coordinates to 1D 103 // convert 2D coordinates to 1D
112 int i = yi * x + xi; 104 int i = yi * x + xi;
113 -  
114 - 105 +
115 //update the gradient image with the vote direction 106 //update the gradient image with the vote direction
116 gpuGrad[2*i] = gpuDir[i]; 107 gpuGrad[2*i] = gpuDir[i];
117 } 108 }
118 -  
119 - 109 +
120 template<typename T> 110 template<typename T>
121 void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ 111 void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
122 112
123 //get the number of pixels in the image 113 //get the number of pixels in the image
124 unsigned int pixels = x * y; 114 unsigned int pixels = x * y;
125 unsigned int bytes = sizeof(T) * pixels; 115 unsigned int bytes = sizeof(T) * pixels;
126 -  
127 - 116 +
128 unsigned int max_threads = stim::maxThreadsPerBlock(); 117 unsigned int max_threads = stim::maxThreadsPerBlock();
129 dim3 threads(max_threads, 1); 118 dim3 threads(max_threads, 1);
130 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); 119 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
131 - //dim3 threads(1, 1);  
132 - //dim3 blocks(x, y);  
133 - // Allocate CUDA array in device memory  
134 120
135 //define a channel descriptor for a single 32-bit channel 121 //define a channel descriptor for a single 32-bit channel
136 cudaChannelFormatDesc channelDesc = 122 cudaChannelFormatDesc channelDesc =
@@ -174,14 +160,12 @@ namespace stim{ @@ -174,14 +160,12 @@ namespace stim{
174 160
175 //call the kernel to update the gradient direction 161 //call the kernel to update the gradient direction
176 cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); 162 cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
177 -  
178 - 163 +
179 //free allocated memory 164 //free allocated memory
180 cudaFree(gpuDir); 165 cudaFree(gpuDir);
181 166
182 } 167 }
183 -  
184 - 168 +
185 template<typename T> 169 template<typename T>
186 void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ 170 void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
187 171
@@ -211,12 +195,10 @@ namespace stim{ @@ -211,12 +195,10 @@ namespace stim{
211 195
212 //copy the atan2 values to the GPU 196 //copy the atan2 values to the GPU
213 HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); 197 HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
214 -  
215 - 198 +
216 //call the GPU version of the update direction function 199 //call the GPU version of the update direction function
217 gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); 200 gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
218 -  
219 - 201 +
220 //copy the new gradient image back to the CPU 202 //copy the new gradient image back to the CPU
221 cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; 203 cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
222 204
@@ -229,6 +211,4 @@ namespace stim{ @@ -229,6 +211,4 @@ namespace stim{
229 } 211 }
230 } 212 }
231 213
232 -  
233 -  
234 #endif 214 #endif
235 \ No newline at end of file 215 \ No newline at end of file
stim/cuda/ivote/vote.cuh
1 #ifndef STIM_CUDA_VOTE_H 1 #ifndef STIM_CUDA_VOTE_H
2 #define STIM_CUDA_VOTE_H 2 #define STIM_CUDA_VOTE_H
3 3
4 -  
5 # include <iostream> 4 # include <iostream>
6 # include <cuda.h> 5 # include <cuda.h>
7 #include <stim/cuda/cudatools.h> 6 #include <stim/cuda/cudatools.h>
8 #include <stim/cuda/sharedmem.cuh> 7 #include <stim/cuda/sharedmem.cuh>
9 8
10 -  
11 namespace stim{ 9 namespace stim{
12 namespace cuda{ 10 namespace cuda{
13 11
@@ -26,13 +24,10 @@ namespace stim{ @@ -26,13 +24,10 @@ namespace stim{
26 int yi = blockIdx.y * blockDim.y + threadIdx.y; 24 int yi = blockIdx.y * blockDim.y + threadIdx.y;
27 // convert 2D coordinates to 1D 25 // convert 2D coordinates to 1D
28 int i = yi * x + xi; 26 int i = yi * x + xi;
29 -  
30 -  
31 - 27 +
32 // define a local variable to sum the votes from the voters 28 // define a local variable to sum the votes from the voters
33 float sum = 0; 29 float sum = 0;
34 30
35 -  
36 //calculate the width of the shared memory block 31 //calculate the width of the shared memory block
37 int swidth = 2 * rmax + blockDim.x; 32 int swidth = 2 * rmax + blockDim.x;
38 33
@@ -59,8 +54,7 @@ namespace stim{ @@ -59,8 +54,7 @@ namespace stim{
59 54
60 // calculate the angle between the pixel and the current voter in x and y directions 55 // calculate the angle between the pixel and the current voter in x and y directions
61 float atan_angle = gpuTable[id_t]; 56 float atan_angle = gpuTable[id_t];
62 -  
63 - 57 +
64 // calculate the voting direction based on the grtadient direction 58 // calculate the voting direction based on the grtadient direction
65 int idx_share = xr + tx_rmax ; 59 int idx_share = xr + tx_rmax ;
66 float2 g = s_grad[idx_share]; 60 float2 g = s_grad[idx_share];
@@ -71,7 +65,6 @@ namespace stim{ @@ -71,7 +65,6 @@ namespace stim{
71 sum += g.y; 65 sum += g.y;
72 66
73 } 67 }
74 -  
75 } 68 }
76 69
77 } 70 }
@@ -87,15 +80,11 @@ namespace stim{ @@ -87,15 +80,11 @@ namespace stim{
87 //get the number of pixels in the image 80 //get the number of pixels in the image
88 unsigned int pixels = x * y; 81 unsigned int pixels = x * y;
89 unsigned int bytes = sizeof(T) * pixels; 82 unsigned int bytes = sizeof(T) * pixels;
90 -  
91 - 83 +
92 unsigned int max_threads = stim::maxThreadsPerBlock(); 84 unsigned int max_threads = stim::maxThreadsPerBlock();
93 - //unsigned int thread_dim = sqrt(max_threads);  
94 dim3 threads(max_threads, 1); 85 dim3 threads(max_threads, 1);
95 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); 86 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
96 - //dim3 threads(1,1);  
97 - //dim3 blocks(x, y);  
98 - 87 +
99 // Allocate CUDA array in device memory 88 // Allocate CUDA array in device memory
100 89
101 //define a channel descriptor for a single 32-bit channel 90 //define a channel descriptor for a single 32-bit channel
@@ -131,7 +120,6 @@ namespace stim{ @@ -131,7 +120,6 @@ namespace stim{
131 // specify share memory 120 // specify share memory
132 unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4; 121 unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
133 122
134 -  
135 //call the kernel to do the voting 123 //call the kernel to do the voting
136 124
137 cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y); 125 cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y);
@@ -165,14 +153,10 @@ namespace stim{ @@ -165,14 +153,10 @@ namespace stim{
165 153
166 //copy the atan2 values to the GPU 154 //copy the atan2 values to the GPU
167 HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); 155 HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
168 -  
169 - //cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table );  
170 -  
171 - 156 +
172 //call the GPU version of the vote calculation function 157 //call the GPU version of the vote calculation function
173 gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); 158 gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
174 -  
175 - 159 +
176 //copy the Vote Data back to the CPU 160 //copy the Vote Data back to the CPU
177 cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; 161 cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
178 162
@@ -185,6 +169,4 @@ namespace stim{ @@ -185,6 +169,4 @@ namespace stim{
185 } 169 }
186 } 170 }
187 171
188 -  
189 -  
190 #endif 172 #endif
191 \ No newline at end of file 173 \ No newline at end of file
stim/cuda/templates/conv2sep.cuh
@@ -49,7 +49,7 @@ namespace stim{ @@ -49,7 +49,7 @@ namespace stim{
49 __syncthreads(); 49 __syncthreads();
50 50
51 //if the current pixel is outside of the image 51 //if the current pixel is outside of the image
52 - if(xi > x || yi > y) 52 + if(xi >= x || yi >= y)
53 return; 53 return;
54 54
55 55
@@ -61,7 +61,7 @@ namespace stim{ @@ -61,7 +61,7 @@ namespace stim{
61 61
62 //for each element of the kernel 62 //for each element of the kernel
63 for(int ki = -kr; ki <= kr; ki++){ 63 for(int ki = -kr; ki <= kr; ki++){
64 - sum += g[ki + kr] * s[si + ki]; 64 + sum += s[si + ki] * g[ki + kr];
65 } 65 }
66 66
67 //calculate the 1D image index for this thread 67 //calculate the 1D image index for this thread
stim/cuda/templates/gradient.cuh
@@ -11,12 +11,12 @@ namespace stim{ @@ -11,12 +11,12 @@ namespace stim{
11 template<typename T> 11 template<typename T>
12 __global__ void gradient_2d(T* out, T* in, int x, int y){ 12 __global__ void gradient_2d(T* out, T* in, int x, int y){
13 13
14 - //calculate the 1D image index for this thread  
15 - int i = blockIdx.x * blockDim.x + threadIdx.x;  
16 -  
17 - //convert this to 2D pixel coordinates  
18 - int yi = i / x;  
19 - int xi = i - (yi * x); 14 +
  15 + // calculate the 2D coordinates for this current thread.
  16 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  17 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  18 + // convert 2D coordinates to 1D
  19 + int i = yi * x + xi;
20 20
21 //return if the pixel is outside of the image 21 //return if the pixel is outside of the image
22 if(xi >= x || yi >= y) return; 22 if(xi >= x || yi >= y) return;
@@ -25,16 +25,16 @@ namespace stim{ @@ -25,16 +25,16 @@ namespace stim{
25 int i_xp = yi * x + (xi + 1); 25 int i_xp = yi * x + (xi + 1);
26 int i_yp = (yi + 1) * x + xi; 26 int i_yp = (yi + 1) * x + xi;
27 27
  28 + //calculate indices for the backward difference
  29 + int i_xn = yi * x + (xi - 1);
  30 + int i_yn = (yi - 1) * x + xi;
  31 +
28 //use forward differences if a coordinate is zero 32 //use forward differences if a coordinate is zero
29 if(xi == 0) 33 if(xi == 0)
30 out[i * 2 + 0] = in[i_xp] - in[i]; 34 out[i * 2 + 0] = in[i_xp] - in[i];
31 if(yi == 0) 35 if(yi == 0)
32 out[i * 2 + 1] = in[i_yp] - in[i]; 36 out[i * 2 + 1] = in[i_yp] - in[i];
33 37
34 - //calculate indices for the backward difference  
35 - int i_xn = yi * x + (xi - 1);  
36 - int i_yn = (yi - 1) * x + xi;  
37 -  
38 //use backward differences if the coordinate is at the maximum edge 38 //use backward differences if the coordinate is at the maximum edge
39 if(xi == x-1) 39 if(xi == x-1)
40 out[i * 2 + 0] = in[i] - in[i_xn]; 40 out[i * 2 + 0] = in[i] - in[i_xn];
@@ -51,28 +51,16 @@ namespace stim{ @@ -51,28 +51,16 @@ namespace stim{
51 } 51 }
52 52
53 template<typename T> 53 template<typename T>
54 - //void gpu_gradient_2d(T* gpuOut, T* gpuIn, unsigned int x, unsigned int y){  
55 void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ 54 void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){
56 55
57 //get the number of pixels in the image 56 //get the number of pixels in the image
58 unsigned int pixels = x * y; 57 unsigned int pixels = x * y;
59 58
60 - //allocate space on the GPU for the input image  
61 - //T* gpuI;  
62 - //HANDLE_ERROR(cudaMalloc(&gpuI, bytes));  
63 -  
64 - //cudaMemcpy(gpuI, gpuI0, bytes, cudaMemcpyDeviceToDevice);  
65 -  
66 -  
67 - //allocate space on the GPU for the output gradient image  
68 - //T* gpuGrad;  
69 - //cudaMalloc(&gpuGrad, bytes * 2); //the output image will have two channels (x, y)  
70 -  
71 //get the maximum number of threads per block for the CUDA device 59 //get the maximum number of threads per block for the CUDA device
72 - int threads = stim::maxThreadsPerBlock();  
73 -  
74 - //calculate the number of blocks  
75 - int blocks = pixels / threads + (pixels%threads == 0 ? 0:1); 60 + unsigned int max_threads = stim::maxThreadsPerBlock();
  61 + dim3 threads(max_threads, 1);
  62 + dim3 blocks(x/threads.x + 1 , y);
  63 +
76 64
77 //call the GPU kernel to determine the gradient 65 //call the GPU kernel to determine the gradient
78 gradient_2d<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y); 66 gradient_2d<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y);
@@ -105,6 +93,7 @@ namespace stim{ @@ -105,6 +93,7 @@ namespace stim{
105 93
106 //free allocated memory 94 //free allocated memory
107 cudaFree(gpuOut); 95 cudaFree(gpuOut);
  96 + cudaFree(gpuIn);
108 } 97 }
109 98
110 } 99 }