Commit bf73197014983c55343330462d4f78d2492f6b0b

Authored by Laila Saadatifard
1 parent 84ca9bba

fix some bugs in the gradient, conv2sep and local_max codes to get the determini…

…stic results in every run and update the vote and update_dir codes
stim/cuda/arraymath/array_cart2polar.cuh
... ... @@ -4,42 +4,47 @@
4 4 namespace stim{
5 5 namespace cuda{
6 6 template<typename T>
7   - __global__ void cuda_cart2polar(T* a, unsigned int N){
  7 + __global__ void cuda_cart2polar(T* a, int x, int y){
8 8  
9 9  
10   - //calculate the 1D index for this thread
11   - int i = blockIdx.x * blockDim.x + threadIdx.x;
12   -
13   - if(i < N){
14   - float x = a[i * 2 + 0];
15   - float y = a[i * 2 + 1];
16   - float theta = atan2( y, x ) ;
17   - float r = sqrt(x * x + y * y);
  10 + // calculate the 2D coordinates for this current thread.
  11 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  12 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  13 + // convert 2D coordinates to 1D
  14 + int i = yi * x + xi;
  15 +
  16 +
  17 + if(xi >= x|| yi >= y) return;
  18 +
  19 + float xl = a[i * 2 + 0];
  20 + float yl = a[i * 2 + 1];
  21 + float theta = atan2( yl, xl ) ;
  22 + float r = sqrt(xl * xl + yl * yl);
18 23 a[i * 2 + 0] = theta;
19 24 a[i * 2 + 1] = r;
20   - }
  25 +
21 26 }
22 27  
23 28  
24 29 template<typename T>
25   - void gpu_cart2polar(T* gpuGrad, unsigned int N){
26   -
27   - //get the maximum number of threads per block for the CUDA device
28   - int threads = stim::maxThreadsPerBlock();
  30 + void gpu_cart2polar(T* gpuGrad, unsigned int x, unsigned int y){
29 31  
30   - //calculate the number of blocks
31   - int blocks = N / threads + (N % threads == 0 ? 0:1);
  32 + unsigned int max_threads = stim::maxThreadsPerBlock();
  33 + dim3 threads(max_threads, 1);
  34 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  35 +
32 36  
33 37 //call the kernel to do the multiplication
34   - cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N);
  38 + cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y);
35 39  
36 40 }
37 41  
38 42  
39 43 template<typename T>
40   - void cpu_cart2polar(T* a, unsigned int N){
  44 + void cpu_cart2polar(T* a, unsigned int x, unsigned int y){
41 45  
42 46 //calculate the number of bytes in the array
  47 + unsigned int N = x *y;
43 48 unsigned int bytes = N * sizeof(T) * 2;
44 49  
45 50 //allocate memory on the GPU for the array
... ... @@ -50,7 +55,7 @@ namespace stim{
50 55 HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) );
51 56  
52 57 //call the GPU version of this function
53   - gpu_cart2polar<T>(gpuA, N);
  58 + gpu_cart2polar<T>(gpuA, x, y);
54 59  
55 60 //copy the array back to the CPU
56 61 HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) );
... ...
stim/cuda/ivote/local_max.cuh
1 1 #ifndef STIM_CUDA_LOCAL_MAX_H
2 2 #define STIM_CUDA_LOCAL_MAX_H
3 3  
4   -
5 4 # include <iostream>
6 5 # include <cuda.h>
7 6 #include <stim/cuda/cudatools.h>
... ... @@ -9,7 +8,6 @@
9 8 namespace stim{
10 9 namespace cuda{
11 10  
12   -
13 11 // this kernel calculates the local maximum for finding the cell centers
14 12 template<typename T>
15 13 __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){
... ... @@ -20,13 +18,10 @@ namespace stim{
20 18  
21 19 if(xi >= x || yi >= y)
22 20 return;
23   -
24   -
  21 +
25 22 // convert 2D coordinates to 1D
26 23 int i = yi * x + xi;
27 24  
28   - // START DAVID
29   -
30 25 gpuCenters[i] = 0; //initialize the value at this location to zero
31 26  
32 27 T val = gpuVote[i];
... ... @@ -46,100 +41,22 @@ namespace stim{
46 41 }
47 42  
48 43 gpuCenters[i] = 1;
49   -
50   - // END DAVID
51   - /*
52   - //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
53   - int xl = xi - conn;
54   - int yl = yi - conn;
55   -
56   -
57   - // use zero for the lowest limits if the xi or yi is less than conn.
58   - if (xi <= conn)
59   - xl = 0;
60   - if (yi <= conn)
61   - yl = 0;
62   -
63   - //calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
64   - int xh = xi + conn;
65   - int yh = yi + conn;
66   -
67   - // use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn.
68   - if (xi >= x - conn)
69   - xh = x;
70   - if (yi>= y - conn)
71   - yh = y;
72   -
73   - // calculate the limits for finding the local maximum location in the connected neighbors for the current pixel
74   - int n_l = yl * x + xl;
75   - int n_h = yh * x + xh;
76   -
77   - //initial the centers image to zero
78   - gpuCenters[i] = 0;
79   -
80   -
81   - int n = n_l;
82   -
83   - float l_value = 0;
84   -
85   - if (i < x * y)
86   -
87   - // check if the vote value for this pixel is greater than threshold, so this pixel may be a local max.
88   - if (gpuVote[i]>final_t){
89   -
90   - // compare the vote value for this pixel with the vote values with its neighbors.
91   - while (n<=n_h){
92   -
93   - // check if this vote value is a local max in its neighborhood.
94   - if (gpuVote[i] < gpuVote[n]){
95   - l_value = 0;
96   - n =n_h+1;
97   - }
98   - else if (n == n_h){
99   - l_value = 1;
100   - n = n+1;
101   - }
102   - // check if the current neighbor is the last one at the current row
103   - else if ((n - n_l - 2*conn)% x ==0){
104   - n = n + x - 2*conn -1;
105   - n ++;
106   - }
107   - else
108   - n ++;
109   - }
110   - // set the center value for this pixel to high if it's a local max ,and to low if not.
111   - gpuCenters[i] = l_value ;
112   - }
113   - */
114   -
115 44 }
116   -
117   -
118   -
  45 +
119 46 template<typename T>
120 47 void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
121 48  
122   -
123   -
124   -
125 49 unsigned int max_threads = stim::maxThreadsPerBlock();
126 50 dim3 threads(max_threads, 1);
127 51 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
128 52  
129   -
130   -
131 53 //call the kernel to find the local maximum.
132 54 cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
133   -
134   -
135 55 }
136 56  
137   -
138   -
139 57 template<typename T>
140 58 void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
141 59  
142   -
143 60 //calculate the number of bytes in the array
144 61 unsigned int bytes = x * y * sizeof(T);
145 62  
... ... @@ -147,28 +64,22 @@ namespace stim{
147 64 T* gpuCenters;
148 65 cudaMalloc(&gpuCenters, bytes);
149 66  
150   -
151 67 //allocate space on the GPU for the input Vote Image
152 68 T* gpuVote;
153 69 cudaMalloc(&gpuVote, bytes);
154 70  
155   -
156 71 //copy the Vote image data to the GPU
157 72 HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
158   -
159   -
  73 +
160 74 //call the GPU version of the local max function
161 75 gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y);
162   -
163   -
  76 +
164 77 //copy the cell centers data to the CPU
165 78 cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ;
166   -
167   -
  79 +
168 80 //free allocated memory
169 81 cudaFree(gpuCenters);
170 82 cudaFree(gpuVote);
171   - cudaFree(gpuGrad);
172 83 }
173 84  
174 85 }
... ...
stim/cuda/ivote/update_dir.cuh
1 1 #ifndef STIM_CUDA_UPDATE_DIR_H
2 2 #define STIM_CUDA_UPDATE_DIR_H
3 3  
4   -
5 4 # include <iostream>
6 5 # include <cuda.h>
7 6 #include <stim/cuda/cudatools.h>
... ... @@ -48,8 +47,7 @@ namespace stim{
48 47 int rmax_sq = rmax * rmax;
49 48 int tx_rmax = threadIdx.x + rmax;
50 49 int bxs = bxi - rmax;
51   -
52   -
  50 +
53 51 for(int yr = -rmax; yr <= rmax; yr++){
54 52  
55 53 //copy the portion of the image necessary for this block to shared memory
... ... @@ -66,8 +64,7 @@ namespace stim{
66 64  
67 65 // calculate the angle between the voter and the current pixel in x and y directions
68 66 float atan_angle = gpuTable[ind_t];
69   -
70   -
  67 +
71 68 // calculate the voting direction based on the grtadient direction
72 69 int idx_share_update = xr + tx_rmax ;
73 70 float share_vote = s_vote[idx_share_update];
... ... @@ -86,11 +83,8 @@ namespace stim{
86 83 }
87 84 }
88 85 }
89   -
90   -
91   - //float new_angle = atan2(dy, dx);
  86 +
92 87 unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
93   -
94 88 float new_angle = gpuTable[ind_m];
95 89  
96 90 if(xi < x && yi < y)
... ... @@ -102,35 +96,27 @@ namespace stim{
102 96 template<typename T>
103 97 __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
104 98  
105   - //************ when the number of threads are (1024,1) *************
106   -
107 99 // calculate the 2D coordinates for this current thread.
108 100 int xi = blockIdx.x * blockDim.x + threadIdx.x;
109 101 int yi = blockIdx.y * blockDim.y + threadIdx.y;
110 102  
111 103 // convert 2D coordinates to 1D
112 104 int i = yi * x + xi;
113   -
114   -
  105 +
115 106 //update the gradient image with the vote direction
116 107 gpuGrad[2*i] = gpuDir[i];
117 108 }
118   -
119   -
  109 +
120 110 template<typename T>
121 111 void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
122 112  
123 113 //get the number of pixels in the image
124 114 unsigned int pixels = x * y;
125 115 unsigned int bytes = sizeof(T) * pixels;
126   -
127   -
  116 +
128 117 unsigned int max_threads = stim::maxThreadsPerBlock();
129 118 dim3 threads(max_threads, 1);
130 119 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
131   - //dim3 threads(1, 1);
132   - //dim3 blocks(x, y);
133   - // Allocate CUDA array in device memory
134 120  
135 121 //define a channel descriptor for a single 32-bit channel
136 122 cudaChannelFormatDesc channelDesc =
... ... @@ -174,14 +160,12 @@ namespace stim{
174 160  
175 161 //call the kernel to update the gradient direction
176 162 cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
177   -
178   -
  163 +
179 164 //free allocated memory
180 165 cudaFree(gpuDir);
181 166  
182 167 }
183   -
184   -
  168 +
185 169 template<typename T>
186 170 void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
187 171  
... ... @@ -211,12 +195,10 @@ namespace stim{
211 195  
212 196 //copy the atan2 values to the GPU
213 197 HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
214   -
215   -
  198 +
216 199 //call the GPU version of the update direction function
217 200 gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
218   -
219   -
  201 +
220 202 //copy the new gradient image back to the CPU
221 203 cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
222 204  
... ... @@ -229,6 +211,4 @@ namespace stim{
229 211 }
230 212 }
231 213  
232   -
233   -
234 214 #endif
235 215 \ No newline at end of file
... ...
stim/cuda/ivote/vote.cuh
1 1 #ifndef STIM_CUDA_VOTE_H
2 2 #define STIM_CUDA_VOTE_H
3 3  
4   -
5 4 # include <iostream>
6 5 # include <cuda.h>
7 6 #include <stim/cuda/cudatools.h>
8 7 #include <stim/cuda/sharedmem.cuh>
9 8  
10   -
11 9 namespace stim{
12 10 namespace cuda{
13 11  
... ... @@ -26,13 +24,10 @@ namespace stim{
26 24 int yi = blockIdx.y * blockDim.y + threadIdx.y;
27 25 // convert 2D coordinates to 1D
28 26 int i = yi * x + xi;
29   -
30   -
31   -
  27 +
32 28 // define a local variable to sum the votes from the voters
33 29 float sum = 0;
34 30  
35   -
36 31 //calculate the width of the shared memory block
37 32 int swidth = 2 * rmax + blockDim.x;
38 33  
... ... @@ -59,8 +54,7 @@ namespace stim{
59 54  
60 55 // calculate the angle between the pixel and the current voter in x and y directions
61 56 float atan_angle = gpuTable[id_t];
62   -
63   -
  57 +
64 58 // calculate the voting direction based on the grtadient direction
65 59 int idx_share = xr + tx_rmax ;
66 60 float2 g = s_grad[idx_share];
... ... @@ -71,7 +65,6 @@ namespace stim{
71 65 sum += g.y;
72 66  
73 67 }
74   -
75 68 }
76 69  
77 70 }
... ... @@ -87,15 +80,11 @@ namespace stim{
87 80 //get the number of pixels in the image
88 81 unsigned int pixels = x * y;
89 82 unsigned int bytes = sizeof(T) * pixels;
90   -
91   -
  83 +
92 84 unsigned int max_threads = stim::maxThreadsPerBlock();
93   - //unsigned int thread_dim = sqrt(max_threads);
94 85 dim3 threads(max_threads, 1);
95 86 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
96   - //dim3 threads(1,1);
97   - //dim3 blocks(x, y);
98   -
  87 +
99 88 // Allocate CUDA array in device memory
100 89  
101 90 //define a channel descriptor for a single 32-bit channel
... ... @@ -131,7 +120,6 @@ namespace stim{
131 120 // specify share memory
132 121 unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
133 122  
134   -
135 123 //call the kernel to do the voting
136 124  
137 125 cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y);
... ... @@ -165,14 +153,10 @@ namespace stim{
165 153  
166 154 //copy the atan2 values to the GPU
167 155 HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
168   -
169   - //cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table );
170   -
171   -
  156 +
172 157 //call the GPU version of the vote calculation function
173 158 gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
174   -
175   -
  159 +
176 160 //copy the Vote Data back to the CPU
177 161 cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
178 162  
... ... @@ -185,6 +169,4 @@ namespace stim{
185 169 }
186 170 }
187 171  
188   -
189   -
190 172 #endif
191 173 \ No newline at end of file
... ...
stim/cuda/templates/conv2sep.cuh
... ... @@ -49,7 +49,7 @@ namespace stim{
49 49 __syncthreads();
50 50  
51 51 //if the current pixel is outside of the image
52   - if(xi > x || yi > y)
  52 + if(xi >= x || yi >= y)
53 53 return;
54 54  
55 55  
... ... @@ -61,7 +61,7 @@ namespace stim{
61 61  
62 62 //for each element of the kernel
63 63 for(int ki = -kr; ki <= kr; ki++){
64   - sum += g[ki + kr] * s[si + ki];
  64 + sum += s[si + ki] * g[ki + kr];
65 65 }
66 66  
67 67 //calculate the 1D image index for this thread
... ...
stim/cuda/templates/gradient.cuh
... ... @@ -11,12 +11,12 @@ namespace stim{
11 11 template<typename T>
12 12 __global__ void gradient_2d(T* out, T* in, int x, int y){
13 13  
14   - //calculate the 1D image index for this thread
15   - int i = blockIdx.x * blockDim.x + threadIdx.x;
16   -
17   - //convert this to 2D pixel coordinates
18   - int yi = i / x;
19   - int xi = i - (yi * x);
  14 +
  15 + // calculate the 2D coordinates for this current thread.
  16 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  17 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  18 + // convert 2D coordinates to 1D
  19 + int i = yi * x + xi;
20 20  
21 21 //return if the pixel is outside of the image
22 22 if(xi >= x || yi >= y) return;
... ... @@ -25,16 +25,16 @@ namespace stim{
25 25 int i_xp = yi * x + (xi + 1);
26 26 int i_yp = (yi + 1) * x + xi;
27 27  
  28 + //calculate indices for the backward difference
  29 + int i_xn = yi * x + (xi - 1);
  30 + int i_yn = (yi - 1) * x + xi;
  31 +
28 32 //use forward differences if a coordinate is zero
29 33 if(xi == 0)
30 34 out[i * 2 + 0] = in[i_xp] - in[i];
31 35 if(yi == 0)
32 36 out[i * 2 + 1] = in[i_yp] - in[i];
33 37  
34   - //calculate indices for the backward difference
35   - int i_xn = yi * x + (xi - 1);
36   - int i_yn = (yi - 1) * x + xi;
37   -
38 38 //use backward differences if the coordinate is at the maximum edge
39 39 if(xi == x-1)
40 40 out[i * 2 + 0] = in[i] - in[i_xn];
... ... @@ -51,28 +51,16 @@ namespace stim{
51 51 }
52 52  
53 53 template<typename T>
54   - //void gpu_gradient_2d(T* gpuOut, T* gpuIn, unsigned int x, unsigned int y){
55 54 void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){
56 55  
57 56 //get the number of pixels in the image
58 57 unsigned int pixels = x * y;
59 58  
60   - //allocate space on the GPU for the input image
61   - //T* gpuI;
62   - //HANDLE_ERROR(cudaMalloc(&gpuI, bytes));
63   -
64   - //cudaMemcpy(gpuI, gpuI0, bytes, cudaMemcpyDeviceToDevice);
65   -
66   -
67   - //allocate space on the GPU for the output gradient image
68   - //T* gpuGrad;
69   - //cudaMalloc(&gpuGrad, bytes * 2); //the output image will have two channels (x, y)
70   -
71 59 //get the maximum number of threads per block for the CUDA device
72   - int threads = stim::maxThreadsPerBlock();
73   -
74   - //calculate the number of blocks
75   - int blocks = pixels / threads + (pixels%threads == 0 ? 0:1);
  60 + unsigned int max_threads = stim::maxThreadsPerBlock();
  61 + dim3 threads(max_threads, 1);
  62 + dim3 blocks(x/threads.x + 1 , y);
  63 +
76 64  
77 65 //call the GPU kernel to determine the gradient
78 66 gradient_2d<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y);
... ... @@ -105,6 +93,7 @@ namespace stim{
105 93  
106 94 //free allocated memory
107 95 cudaFree(gpuOut);
  96 + cudaFree(gpuIn);
108 97 }
109 98  
110 99 }
... ...