Commit bf73197014983c55343330462d4f78d2492f6b0b
1 parent
84ca9bba
fix some bugs in the gradient, conv2sep and local_max codes to get the determini…
…stic results in every run and update the vote and update_dir codes
Showing
6 changed files
with
62 additions
and
195 deletions
Show diff stats
stim/cuda/arraymath/array_cart2polar.cuh
... | ... | @@ -4,42 +4,47 @@ |
4 | 4 | namespace stim{ |
5 | 5 | namespace cuda{ |
6 | 6 | template<typename T> |
7 | - __global__ void cuda_cart2polar(T* a, unsigned int N){ | |
7 | + __global__ void cuda_cart2polar(T* a, int x, int y){ | |
8 | 8 | |
9 | 9 | |
10 | - //calculate the 1D index for this thread | |
11 | - int i = blockIdx.x * blockDim.x + threadIdx.x; | |
12 | - | |
13 | - if(i < N){ | |
14 | - float x = a[i * 2 + 0]; | |
15 | - float y = a[i * 2 + 1]; | |
16 | - float theta = atan2( y, x ) ; | |
17 | - float r = sqrt(x * x + y * y); | |
10 | + // calculate the 2D coordinates for this current thread. | |
11 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
12 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
13 | + // convert 2D coordinates to 1D | |
14 | + int i = yi * x + xi; | |
15 | + | |
16 | + | |
17 | + if(xi >= x|| yi >= y) return; | |
18 | + | |
19 | + float xl = a[i * 2 + 0]; | |
20 | + float yl = a[i * 2 + 1]; | |
21 | + float theta = atan2( yl, xl ) ; | |
22 | + float r = sqrt(xl * xl + yl * yl); | |
18 | 23 | a[i * 2 + 0] = theta; |
19 | 24 | a[i * 2 + 1] = r; |
20 | - } | |
25 | + | |
21 | 26 | } |
22 | 27 | |
23 | 28 | |
24 | 29 | template<typename T> |
25 | - void gpu_cart2polar(T* gpuGrad, unsigned int N){ | |
26 | - | |
27 | - //get the maximum number of threads per block for the CUDA device | |
28 | - int threads = stim::maxThreadsPerBlock(); | |
30 | + void gpu_cart2polar(T* gpuGrad, unsigned int x, unsigned int y){ | |
29 | 31 | |
30 | - //calculate the number of blocks | |
31 | - int blocks = N / threads + (N % threads == 0 ? 0:1); | |
32 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
33 | + dim3 threads(max_threads, 1); | |
34 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
35 | + | |
32 | 36 | |
33 | 37 | //call the kernel to do the multiplication |
34 | - cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N); | |
38 | + cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y); | |
35 | 39 | |
36 | 40 | } |
37 | 41 | |
38 | 42 | |
39 | 43 | template<typename T> |
40 | - void cpu_cart2polar(T* a, unsigned int N){ | |
44 | + void cpu_cart2polar(T* a, unsigned int x, unsigned int y){ | |
41 | 45 | |
42 | 46 | //calculate the number of bytes in the array |
47 | + unsigned int N = x *y; | |
43 | 48 | unsigned int bytes = N * sizeof(T) * 2; |
44 | 49 | |
45 | 50 | //allocate memory on the GPU for the array |
... | ... | @@ -50,7 +55,7 @@ namespace stim{ |
50 | 55 | HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); |
51 | 56 | |
52 | 57 | //call the GPU version of this function |
53 | - gpu_cart2polar<T>(gpuA, N); | |
58 | + gpu_cart2polar<T>(gpuA, x, y); | |
54 | 59 | |
55 | 60 | //copy the array back to the CPU |
56 | 61 | HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); | ... | ... |
stim/cuda/ivote/local_max.cuh
1 | 1 | #ifndef STIM_CUDA_LOCAL_MAX_H |
2 | 2 | #define STIM_CUDA_LOCAL_MAX_H |
3 | 3 | |
4 | - | |
5 | 4 | # include <iostream> |
6 | 5 | # include <cuda.h> |
7 | 6 | #include <stim/cuda/cudatools.h> |
... | ... | @@ -9,7 +8,6 @@ |
9 | 8 | namespace stim{ |
10 | 9 | namespace cuda{ |
11 | 10 | |
12 | - | |
13 | 11 | // this kernel calculates the local maximum for finding the cell centers |
14 | 12 | template<typename T> |
15 | 13 | __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){ |
... | ... | @@ -20,13 +18,10 @@ namespace stim{ |
20 | 18 | |
21 | 19 | if(xi >= x || yi >= y) |
22 | 20 | return; |
23 | - | |
24 | - | |
21 | + | |
25 | 22 | // convert 2D coordinates to 1D |
26 | 23 | int i = yi * x + xi; |
27 | 24 | |
28 | - // START DAVID | |
29 | - | |
30 | 25 | gpuCenters[i] = 0; //initialize the value at this location to zero |
31 | 26 | |
32 | 27 | T val = gpuVote[i]; |
... | ... | @@ -46,100 +41,22 @@ namespace stim{ |
46 | 41 | } |
47 | 42 | |
48 | 43 | gpuCenters[i] = 1; |
49 | - | |
50 | - // END DAVID | |
51 | - /* | |
52 | - //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. | |
53 | - int xl = xi - conn; | |
54 | - int yl = yi - conn; | |
55 | - | |
56 | - | |
57 | - // use zero for the lowest limits if the xi or yi is less than conn. | |
58 | - if (xi <= conn) | |
59 | - xl = 0; | |
60 | - if (yi <= conn) | |
61 | - yl = 0; | |
62 | - | |
63 | - //calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. | |
64 | - int xh = xi + conn; | |
65 | - int yh = yi + conn; | |
66 | - | |
67 | - // use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn. | |
68 | - if (xi >= x - conn) | |
69 | - xh = x; | |
70 | - if (yi>= y - conn) | |
71 | - yh = y; | |
72 | - | |
73 | - // calculate the limits for finding the local maximum location in the connected neighbors for the current pixel | |
74 | - int n_l = yl * x + xl; | |
75 | - int n_h = yh * x + xh; | |
76 | - | |
77 | - //initial the centers image to zero | |
78 | - gpuCenters[i] = 0; | |
79 | - | |
80 | - | |
81 | - int n = n_l; | |
82 | - | |
83 | - float l_value = 0; | |
84 | - | |
85 | - if (i < x * y) | |
86 | - | |
87 | - // check if the vote value for this pixel is greater than threshold, so this pixel may be a local max. | |
88 | - if (gpuVote[i]>final_t){ | |
89 | - | |
90 | - // compare the vote value for this pixel with the vote values with its neighbors. | |
91 | - while (n<=n_h){ | |
92 | - | |
93 | - // check if this vote value is a local max in its neighborhood. | |
94 | - if (gpuVote[i] < gpuVote[n]){ | |
95 | - l_value = 0; | |
96 | - n =n_h+1; | |
97 | - } | |
98 | - else if (n == n_h){ | |
99 | - l_value = 1; | |
100 | - n = n+1; | |
101 | - } | |
102 | - // check if the current neighbor is the last one at the current row | |
103 | - else if ((n - n_l - 2*conn)% x ==0){ | |
104 | - n = n + x - 2*conn -1; | |
105 | - n ++; | |
106 | - } | |
107 | - else | |
108 | - n ++; | |
109 | - } | |
110 | - // set the center value for this pixel to high if it's a local max ,and to low if not. | |
111 | - gpuCenters[i] = l_value ; | |
112 | - } | |
113 | - */ | |
114 | - | |
115 | 44 | } |
116 | - | |
117 | - | |
118 | - | |
45 | + | |
119 | 46 | template<typename T> |
120 | 47 | void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ |
121 | 48 | |
122 | - | |
123 | - | |
124 | - | |
125 | 49 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
126 | 50 | dim3 threads(max_threads, 1); |
127 | 51 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); |
128 | 52 | |
129 | - | |
130 | - | |
131 | 53 | //call the kernel to find the local maximum. |
132 | 54 | cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); |
133 | - | |
134 | - | |
135 | 55 | } |
136 | 56 | |
137 | - | |
138 | - | |
139 | 57 | template<typename T> |
140 | 58 | void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ |
141 | 59 | |
142 | - | |
143 | 60 | //calculate the number of bytes in the array |
144 | 61 | unsigned int bytes = x * y * sizeof(T); |
145 | 62 | |
... | ... | @@ -147,28 +64,22 @@ namespace stim{ |
147 | 64 | T* gpuCenters; |
148 | 65 | cudaMalloc(&gpuCenters, bytes); |
149 | 66 | |
150 | - | |
151 | 67 | //allocate space on the GPU for the input Vote Image |
152 | 68 | T* gpuVote; |
153 | 69 | cudaMalloc(&gpuVote, bytes); |
154 | 70 | |
155 | - | |
156 | 71 | //copy the Vote image data to the GPU |
157 | 72 | HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); |
158 | - | |
159 | - | |
73 | + | |
160 | 74 | //call the GPU version of the local max function |
161 | 75 | gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y); |
162 | - | |
163 | - | |
76 | + | |
164 | 77 | //copy the cell centers data to the CPU |
165 | 78 | cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ; |
166 | - | |
167 | - | |
79 | + | |
168 | 80 | //free allocated memory |
169 | 81 | cudaFree(gpuCenters); |
170 | 82 | cudaFree(gpuVote); |
171 | - cudaFree(gpuGrad); | |
172 | 83 | } |
173 | 84 | |
174 | 85 | } | ... | ... |
stim/cuda/ivote/update_dir.cuh
1 | 1 | #ifndef STIM_CUDA_UPDATE_DIR_H |
2 | 2 | #define STIM_CUDA_UPDATE_DIR_H |
3 | 3 | |
4 | - | |
5 | 4 | # include <iostream> |
6 | 5 | # include <cuda.h> |
7 | 6 | #include <stim/cuda/cudatools.h> |
... | ... | @@ -48,8 +47,7 @@ namespace stim{ |
48 | 47 | int rmax_sq = rmax * rmax; |
49 | 48 | int tx_rmax = threadIdx.x + rmax; |
50 | 49 | int bxs = bxi - rmax; |
51 | - | |
52 | - | |
50 | + | |
53 | 51 | for(int yr = -rmax; yr <= rmax; yr++){ |
54 | 52 | |
55 | 53 | //copy the portion of the image necessary for this block to shared memory |
... | ... | @@ -66,8 +64,7 @@ namespace stim{ |
66 | 64 | |
67 | 65 | // calculate the angle between the voter and the current pixel in x and y directions |
68 | 66 | float atan_angle = gpuTable[ind_t]; |
69 | - | |
70 | - | |
67 | + | |
71 | 68 | // calculate the voting direction based on the grtadient direction |
72 | 69 | int idx_share_update = xr + tx_rmax ; |
73 | 70 | float share_vote = s_vote[idx_share_update]; |
... | ... | @@ -86,11 +83,8 @@ namespace stim{ |
86 | 83 | } |
87 | 84 | } |
88 | 85 | } |
89 | - | |
90 | - | |
91 | - //float new_angle = atan2(dy, dx); | |
86 | + | |
92 | 87 | unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); |
93 | - | |
94 | 88 | float new_angle = gpuTable[ind_m]; |
95 | 89 | |
96 | 90 | if(xi < x && yi < y) |
... | ... | @@ -102,35 +96,27 @@ namespace stim{ |
102 | 96 | template<typename T> |
103 | 97 | __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ |
104 | 98 | |
105 | - //************ when the number of threads are (1024,1) ************* | |
106 | - | |
107 | 99 | // calculate the 2D coordinates for this current thread. |
108 | 100 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
109 | 101 | int yi = blockIdx.y * blockDim.y + threadIdx.y; |
110 | 102 | |
111 | 103 | // convert 2D coordinates to 1D |
112 | 104 | int i = yi * x + xi; |
113 | - | |
114 | - | |
105 | + | |
115 | 106 | //update the gradient image with the vote direction |
116 | 107 | gpuGrad[2*i] = gpuDir[i]; |
117 | 108 | } |
118 | - | |
119 | - | |
109 | + | |
120 | 110 | template<typename T> |
121 | 111 | void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ |
122 | 112 | |
123 | 113 | //get the number of pixels in the image |
124 | 114 | unsigned int pixels = x * y; |
125 | 115 | unsigned int bytes = sizeof(T) * pixels; |
126 | - | |
127 | - | |
116 | + | |
128 | 117 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
129 | 118 | dim3 threads(max_threads, 1); |
130 | 119 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); |
131 | - //dim3 threads(1, 1); | |
132 | - //dim3 blocks(x, y); | |
133 | - // Allocate CUDA array in device memory | |
134 | 120 | |
135 | 121 | //define a channel descriptor for a single 32-bit channel |
136 | 122 | cudaChannelFormatDesc channelDesc = |
... | ... | @@ -174,14 +160,12 @@ namespace stim{ |
174 | 160 | |
175 | 161 | //call the kernel to update the gradient direction |
176 | 162 | cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); |
177 | - | |
178 | - | |
163 | + | |
179 | 164 | //free allocated memory |
180 | 165 | cudaFree(gpuDir); |
181 | 166 | |
182 | 167 | } |
183 | - | |
184 | - | |
168 | + | |
185 | 169 | template<typename T> |
186 | 170 | void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ |
187 | 171 | |
... | ... | @@ -211,12 +195,10 @@ namespace stim{ |
211 | 195 | |
212 | 196 | //copy the atan2 values to the GPU |
213 | 197 | HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); |
214 | - | |
215 | - | |
198 | + | |
216 | 199 | //call the GPU version of the update direction function |
217 | 200 | gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); |
218 | - | |
219 | - | |
201 | + | |
220 | 202 | //copy the new gradient image back to the CPU |
221 | 203 | cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; |
222 | 204 | |
... | ... | @@ -229,6 +211,4 @@ namespace stim{ |
229 | 211 | } |
230 | 212 | } |
231 | 213 | |
232 | - | |
233 | - | |
234 | 214 | #endif |
235 | 215 | \ No newline at end of file | ... | ... |
stim/cuda/ivote/vote.cuh
1 | 1 | #ifndef STIM_CUDA_VOTE_H |
2 | 2 | #define STIM_CUDA_VOTE_H |
3 | 3 | |
4 | - | |
5 | 4 | # include <iostream> |
6 | 5 | # include <cuda.h> |
7 | 6 | #include <stim/cuda/cudatools.h> |
8 | 7 | #include <stim/cuda/sharedmem.cuh> |
9 | 8 | |
10 | - | |
11 | 9 | namespace stim{ |
12 | 10 | namespace cuda{ |
13 | 11 | |
... | ... | @@ -26,13 +24,10 @@ namespace stim{ |
26 | 24 | int yi = blockIdx.y * blockDim.y + threadIdx.y; |
27 | 25 | // convert 2D coordinates to 1D |
28 | 26 | int i = yi * x + xi; |
29 | - | |
30 | - | |
31 | - | |
27 | + | |
32 | 28 | // define a local variable to sum the votes from the voters |
33 | 29 | float sum = 0; |
34 | 30 | |
35 | - | |
36 | 31 | //calculate the width of the shared memory block |
37 | 32 | int swidth = 2 * rmax + blockDim.x; |
38 | 33 | |
... | ... | @@ -59,8 +54,7 @@ namespace stim{ |
59 | 54 | |
60 | 55 | // calculate the angle between the pixel and the current voter in x and y directions |
61 | 56 | float atan_angle = gpuTable[id_t]; |
62 | - | |
63 | - | |
57 | + | |
64 | 58 | // calculate the voting direction based on the grtadient direction |
65 | 59 | int idx_share = xr + tx_rmax ; |
66 | 60 | float2 g = s_grad[idx_share]; |
... | ... | @@ -71,7 +65,6 @@ namespace stim{ |
71 | 65 | sum += g.y; |
72 | 66 | |
73 | 67 | } |
74 | - | |
75 | 68 | } |
76 | 69 | |
77 | 70 | } |
... | ... | @@ -87,15 +80,11 @@ namespace stim{ |
87 | 80 | //get the number of pixels in the image |
88 | 81 | unsigned int pixels = x * y; |
89 | 82 | unsigned int bytes = sizeof(T) * pixels; |
90 | - | |
91 | - | |
83 | + | |
92 | 84 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
93 | - //unsigned int thread_dim = sqrt(max_threads); | |
94 | 85 | dim3 threads(max_threads, 1); |
95 | 86 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); |
96 | - //dim3 threads(1,1); | |
97 | - //dim3 blocks(x, y); | |
98 | - | |
87 | + | |
99 | 88 | // Allocate CUDA array in device memory |
100 | 89 | |
101 | 90 | //define a channel descriptor for a single 32-bit channel |
... | ... | @@ -131,7 +120,6 @@ namespace stim{ |
131 | 120 | // specify share memory |
132 | 121 | unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4; |
133 | 122 | |
134 | - | |
135 | 123 | //call the kernel to do the voting |
136 | 124 | |
137 | 125 | cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y); |
... | ... | @@ -165,14 +153,10 @@ namespace stim{ |
165 | 153 | |
166 | 154 | //copy the atan2 values to the GPU |
167 | 155 | HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); |
168 | - | |
169 | - //cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table ); | |
170 | - | |
171 | - | |
156 | + | |
172 | 157 | //call the GPU version of the vote calculation function |
173 | 158 | gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); |
174 | - | |
175 | - | |
159 | + | |
176 | 160 | //copy the Vote Data back to the CPU |
177 | 161 | cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; |
178 | 162 | |
... | ... | @@ -185,6 +169,4 @@ namespace stim{ |
185 | 169 | } |
186 | 170 | } |
187 | 171 | |
188 | - | |
189 | - | |
190 | 172 | #endif |
191 | 173 | \ No newline at end of file | ... | ... |
stim/cuda/templates/conv2sep.cuh
... | ... | @@ -49,7 +49,7 @@ namespace stim{ |
49 | 49 | __syncthreads(); |
50 | 50 | |
51 | 51 | //if the current pixel is outside of the image |
52 | - if(xi > x || yi > y) | |
52 | + if(xi >= x || yi >= y) | |
53 | 53 | return; |
54 | 54 | |
55 | 55 | |
... | ... | @@ -61,7 +61,7 @@ namespace stim{ |
61 | 61 | |
62 | 62 | //for each element of the kernel |
63 | 63 | for(int ki = -kr; ki <= kr; ki++){ |
64 | - sum += g[ki + kr] * s[si + ki]; | |
64 | + sum += s[si + ki] * g[ki + kr]; | |
65 | 65 | } |
66 | 66 | |
67 | 67 | //calculate the 1D image index for this thread | ... | ... |
stim/cuda/templates/gradient.cuh
... | ... | @@ -11,12 +11,12 @@ namespace stim{ |
11 | 11 | template<typename T> |
12 | 12 | __global__ void gradient_2d(T* out, T* in, int x, int y){ |
13 | 13 | |
14 | - //calculate the 1D image index for this thread | |
15 | - int i = blockIdx.x * blockDim.x + threadIdx.x; | |
16 | - | |
17 | - //convert this to 2D pixel coordinates | |
18 | - int yi = i / x; | |
19 | - int xi = i - (yi * x); | |
14 | + | |
15 | + // calculate the 2D coordinates for this current thread. | |
16 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
17 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
18 | + // convert 2D coordinates to 1D | |
19 | + int i = yi * x + xi; | |
20 | 20 | |
21 | 21 | //return if the pixel is outside of the image |
22 | 22 | if(xi >= x || yi >= y) return; |
... | ... | @@ -25,16 +25,16 @@ namespace stim{ |
25 | 25 | int i_xp = yi * x + (xi + 1); |
26 | 26 | int i_yp = (yi + 1) * x + xi; |
27 | 27 | |
28 | + //calculate indices for the backward difference | |
29 | + int i_xn = yi * x + (xi - 1); | |
30 | + int i_yn = (yi - 1) * x + xi; | |
31 | + | |
28 | 32 | //use forward differences if a coordinate is zero |
29 | 33 | if(xi == 0) |
30 | 34 | out[i * 2 + 0] = in[i_xp] - in[i]; |
31 | 35 | if(yi == 0) |
32 | 36 | out[i * 2 + 1] = in[i_yp] - in[i]; |
33 | 37 | |
34 | - //calculate indices for the backward difference | |
35 | - int i_xn = yi * x + (xi - 1); | |
36 | - int i_yn = (yi - 1) * x + xi; | |
37 | - | |
38 | 38 | //use backward differences if the coordinate is at the maximum edge |
39 | 39 | if(xi == x-1) |
40 | 40 | out[i * 2 + 0] = in[i] - in[i_xn]; |
... | ... | @@ -51,28 +51,16 @@ namespace stim{ |
51 | 51 | } |
52 | 52 | |
53 | 53 | template<typename T> |
54 | - //void gpu_gradient_2d(T* gpuOut, T* gpuIn, unsigned int x, unsigned int y){ | |
55 | 54 | void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ |
56 | 55 | |
57 | 56 | //get the number of pixels in the image |
58 | 57 | unsigned int pixels = x * y; |
59 | 58 | |
60 | - //allocate space on the GPU for the input image | |
61 | - //T* gpuI; | |
62 | - //HANDLE_ERROR(cudaMalloc(&gpuI, bytes)); | |
63 | - | |
64 | - //cudaMemcpy(gpuI, gpuI0, bytes, cudaMemcpyDeviceToDevice); | |
65 | - | |
66 | - | |
67 | - //allocate space on the GPU for the output gradient image | |
68 | - //T* gpuGrad; | |
69 | - //cudaMalloc(&gpuGrad, bytes * 2); //the output image will have two channels (x, y) | |
70 | - | |
71 | 59 | //get the maximum number of threads per block for the CUDA device |
72 | - int threads = stim::maxThreadsPerBlock(); | |
73 | - | |
74 | - //calculate the number of blocks | |
75 | - int blocks = pixels / threads + (pixels%threads == 0 ? 0:1); | |
60 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
61 | + dim3 threads(max_threads, 1); | |
62 | + dim3 blocks(x/threads.x + 1 , y); | |
63 | + | |
76 | 64 | |
77 | 65 | //call the GPU kernel to determine the gradient |
78 | 66 | gradient_2d<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y); |
... | ... | @@ -105,6 +93,7 @@ namespace stim{ |
105 | 93 | |
106 | 94 | //free allocated memory |
107 | 95 | cudaFree(gpuOut); |
96 | + cudaFree(gpuIn); | |
108 | 97 | } |
109 | 98 | |
110 | 99 | } | ... | ... |