Commit 13fe3c84bbffad233bbd8728590c870364472ebc
1 parent 788862df
update the stimlib by adding some gpu-functions (abs, cart2polar, multiply, down_sample, gaussian_blur, gradient, local_max, sharedmem, update_dir, vote)
Showing 11 changed files with 1277 additions and 0 deletions
stim/cuda/array_abs.cuh
1 | +#ifndef STIM_CUDA_ARRAY_ABS_H | |
2 | +#define STIM_CUDA_ARRAY_ABS_H | |
3 | +#include <cuda.h> | |
4 | +#include <stim/cuda/devices.h> | |
5 | +#include <stim/cuda/error.h> | |
6 | + | |
4 | +namespace stim{ | |
5 | + namespace cuda{ | |
6 | + template<typename T> | |
7 | + __global__ void cuda_abs(T* a, unsigned int N){ | |
8 | + | |
9 | + //calculate the 1D index for this thread | |
10 | + int i = blockIdx.x * blockDim.x + threadIdx.x; | |
11 | + | |
12 | + if(i < N) | |
13 | + a[i] = abs(a[i]); | |
14 | + } | |
15 | + | |
16 | + | |
17 | + template<typename T> | |
18 | + void gpu_abs(T* a, unsigned int N){ | |
19 | + | |
20 | + //get the maximum number of threads per block for the CUDA device | |
21 | + int threads = stim::maxThreadsPerBlock(); | |
22 | + | |
23 | + //calculate the number of blocks | |
24 | + int blocks = N / threads + (N%threads == 0 ? 0:1); | |
25 | + | |
26 | + //call the kernel to compute the absolute value | |
27 | + cuda_abs <<< blocks, threads >>>(a, N); | |
28 | + | |
29 | + } | |
30 | + | |
31 | + | |
32 | + template<typename T> | |
33 | + void cpu_abs(T* a, unsigned int N){ | |
34 | + | |
35 | + //calculate the number of bytes in the array | |
36 | + unsigned int bytes = N * sizeof(T); | |
37 | + | |
38 | + //allocate memory on the GPU for the array | |
39 | + T* gpuA; | |
40 | + HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); | |
41 | + | |
42 | + //copy the array to the GPU | |
43 | + HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); | |
44 | + | |
45 | + //call the GPU version of this function | |
46 | + gpu_abs<T>(gpuA, N); | |
47 | + | |
48 | + //copy the array back to the CPU | |
49 | + HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); | |
50 | + | |
51 | + //free allocated memory | |
52 | + cudaFree(gpuA); | |
53 | + | |
54 | + } | |
55 | + | |
56 | + } //end namespace cuda | |
57 | +} //end namespace stim | |
58 | + | |
59 | +#endif | |
\ No newline at end of file
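A minimal host-side usage sketch for the new abs routines (assuming the headers above are on the include path and the file is compiled with nvcc):

    #include <stim/cuda/array_abs.cuh>

    int main(){
        float a[4] = { -1.0f, 2.0f, -3.5f, 0.0f };
        stim::cuda::cpu_abs(a, 4);          //a now holds { 1.0, 2.0, 3.5, 0.0 }
        return 0;
    }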
stim/cuda/array_cart2polar.cuh
1 | +#ifndef STIM_CUDA_ARRAY_CART2POLAR_H | |
2 | +#define STIM_CUDA_ARRAY_CART2POLAR_H | |
3 | +#include <cuda.h> | |
4 | +#include <stim/cuda/devices.h> | |
5 | +#include <stim/cuda/error.h> | |
6 | + | |
4 | +namespace stim{ | |
5 | + namespace cuda{ | |
6 | + template<typename T> | |
7 | + __global__ void cuda_cart2polar(T* a, unsigned int N){ | |
8 | + | |
9 | + | |
10 | + //calculate the 1D index for this thread | |
11 | + int i = blockIdx.x * blockDim.x + threadIdx.x; | |
12 | + | |
13 | + if(i < N){ | |
14 | + float x = a[i * 2 + 0]; | |
15 | + float y = a[i * 2 + 1]; | |
16 | + float theta = atan2( y, x ) ; | |
17 | + float r = sqrt(x * x + y * y); | |
18 | + a[i * 2 + 0] = theta; | |
19 | + a[i * 2 + 1] = r; | |
20 | + } | |
21 | + } | |
22 | + | |
23 | + | |
24 | + template<typename T> | |
25 | + void gpu_cart2polar(T* gpuGrad, unsigned int N){ | |
26 | + | |
27 | + //get the maximum number of threads per block for the CUDA device | |
28 | + int threads = stim::maxThreadsPerBlock(); | |
29 | + | |
30 | + //calculate the number of blocks | |
31 | + int blocks = N / threads + (N % threads == 0 ? 0:1); | |
32 | + | |
33 | + //call the kernel to do the conversion | |
34 | + cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N); | |
35 | + | |
36 | + } | |
37 | + | |
38 | + | |
39 | + template<typename T> | |
40 | + void cpu_cart2polar(T* a, unsigned int N){ | |
41 | + | |
42 | + //calculate the number of bytes in the array | |
43 | + unsigned int bytes = N * sizeof(T) * 2; | |
44 | + | |
45 | + //allocate memory on the GPU for the array | |
46 | + T* gpuA; | |
47 | + HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); | |
48 | + | |
49 | + //copy the array to the GPU | |
50 | + HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); | |
51 | + | |
52 | + //call the GPU version of this function | |
53 | + gpu_cart2polar<T>(gpuA, N); | |
54 | + | |
55 | + //copy the array back to the CPU | |
56 | + HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); | |
57 | + | |
58 | + //free allocated memory | |
59 | + cudaFree(gpuA); | |
60 | + | |
61 | + } | |
62 | + | |
63 | + } | |
64 | +} | |
65 | + | |
66 | +#endif | |
\ No newline at end of file
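Note that N here is the number of (x, y) pairs, not the number of scalars: the kernel reads interleaved Cartesian coordinates and writes (theta, r) back in place. A short usage sketch:

    #include <stim/cuda/array_cart2polar.cuh>

    int main(){
        //two interleaved 2D vectors: (1, 0) and (0, 2)
        float v[4] = { 1.0f, 0.0f, 0.0f, 2.0f };
        stim::cuda::cpu_cart2polar(v, 2);   //v now holds (theta, r) pairs: (0, 1) and (pi/2, 2)
        return 0;
    }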
stim/cuda/array_multiply.cuh
1 | +#ifndef STIM_CUDA_ARRAY_MULTIPLY_H | |
2 | +#define STIM_CUDA_ARRAY_MULTIPLY_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/devices.h> | |
7 | +#include <stim/cuda/error.h> | |
8 | + | |
9 | +namespace stim{ | |
10 | + namespace cuda{ | |
11 | + | |
12 | + template<typename T> | |
13 | + __global__ void cuda_multiply(T* lhs, T rhs, unsigned int N){ | |
14 | + | |
15 | + //calculate the 1D index for this thread | |
16 | + int i = blockIdx.x * blockDim.x + threadIdx.x; | |
17 | + | |
18 | + if(i < N) | |
19 | + lhs[i] *= rhs; | |
20 | + } | |
21 | + | |
22 | + template<typename T> | |
23 | + void gpu_multiply(T* lhs, T rhs, unsigned int N){ | |
24 | + | |
25 | + //get the maximum number of threads per block for the CUDA device | |
26 | + int threads = stim::maxThreadsPerBlock(); | |
27 | + | |
28 | + //calculate the number of blocks | |
29 | + int blocks = N / threads + (N%threads == 0 ? 0:1); | |
30 | + | |
31 | + //call the kernel to do the multiplication | |
32 | + cuda_multiply <<< blocks, threads >>>(lhs, rhs, N); | |
33 | + | |
34 | + } | |
35 | + | |
36 | + template<typename T> | |
37 | + void cpu_multiply(T* lhs, T rhs, unsigned int N){ | |
38 | + | |
39 | + //calculate the number of bytes in the array | |
40 | + unsigned int bytes = N * sizeof(T); | |
41 | + | |
42 | + //allocate memory on the GPU for the array | |
43 | + T* gpuLHS; | |
44 | + HANDLE_ERROR( cudaMalloc(&gpuLHS, bytes) ); | |
45 | + | |
46 | + //copy the array to the GPU | |
47 | + HANDLE_ERROR( cudaMemcpy(gpuLHS, lhs, bytes, cudaMemcpyHostToDevice) ); | |
48 | + | |
49 | + //call the GPU version of this function | |
50 | + gpu_multiply<T>(gpuLHS, rhs, N); | |
51 | + | |
52 | + //copy the array back to the CPU | |
53 | + HANDLE_ERROR( cudaMemcpy(lhs, gpuLHS, bytes, cudaMemcpyDeviceToHost) ); | |
54 | + | |
55 | + //free allocated memory | |
56 | + cudaFree(gpuLHS); | |
57 | + } | |
58 | + | |
59 | + } | |
60 | +} | |
61 | + | |
62 | + | |
63 | + | |
64 | +#endif | |
\ No newline at end of file
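The multiply routines scale an array in place by a scalar, e.g.:

    #include <stim/cuda/array_multiply.cuh>

    int main(){
        float a[3] = { 1.0f, 2.0f, 3.0f };
        stim::cuda::cpu_multiply(a, 0.5f, 3);   //a now holds { 0.5, 1.0, 1.5 }
        return 0;
    }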
stim/cuda/arraymath.cuh
1 | +#ifndef STIM_CUDA_ARRAYMATH_H | |
2 | +#define STIM_CUDA_ARRAYMATH_H | |
3 | + | |
4 | +#include <stim/cuda/array_multiply.cuh> | |
5 | +#include <stim/cuda/array_abs.cuh> | |
6 | +#include <stim/cuda/array_cart2polar.cuh> | |
7 | + | |
8 | +namespace stim{ | |
9 | + namespace cuda{ | |
10 | + | |
11 | + } | |
12 | +} | |
13 | + | |
14 | + | |
15 | + | |
16 | +#endif | |
\ No newline at end of file
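arraymath.cuh is a convenience umbrella: a single include pulls in all three array operations above.

    #include <stim/cuda/arraymath.cuh>   //provides the abs, multiply, and cart2polar routines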
stim/cuda/down_sample.cuh
1 | +#ifndef STIM_CUDA_DOWN_SAMPLE_H | |
2 | +#define STIM_CUDA_DOWN_SAMPLE_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/devices.h> | |
7 | +#include <stim/cuda/timer.h> | |
8 | +#include <stim/cuda/gaussian_blur.cuh> | |
9 | + | |
10 | +namespace stim{ | |
11 | + namespace cuda{ | |
12 | + | |
13 | + template<typename T> | |
14 | + __global__ void down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ | |
15 | + | |
16 | + unsigned int sigma_ds = 1/resize; | |
17 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | |
18 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | |
19 | + | |
20 | + | |
21 | + // calculate the 2D coordinates for this current thread. | |
22 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
23 | + int yi = blockIdx.y; | |
24 | + // convert 2D coordinates to 1D | |
25 | + int i = yi * x_ds + xi; | |
26 | + | |
27 | + if(xi< x_ds && yi< y_ds){ | |
28 | + | |
29 | + int x_org = xi * sigma_ds; | |
30 | + int y_org = yi * sigma_ds; | |
31 | + int i_org = y_org * x + x_org; | |
32 | + gpuI[i] = gpuI0[i_org]; | |
33 | + } | |
34 | + | |
35 | + } | |
36 | + | |
37 | + | |
38 | + /// Blurs and down-samples a 2D image stored on the GPU | |
39 | + template<typename T> | |
40 | + void gpu_down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ | |
41 | + | |
42 | + | |
43 | + unsigned int sigma_ds = 1/resize; | |
44 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | |
45 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | |
46 | + | |
47 | + //get the number of pixels in the image | |
48 | + unsigned int pixels_ds = x_ds * y_ds; | |
49 | + | |
50 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
51 | + dim3 threads(max_threads, 1); | |
52 | + dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds); | |
53 | + | |
54 | + stim::cuda::gpu_gaussian_blur_2d<T>(gpuI0, sigma_ds, x, y); | |
55 | + | |
56 | + //resample the image | |
57 | + down_sample<T> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y); | |
58 | + | |
59 | + } | |
60 | + | |
61 | + /// Blurs and down-samples a 2D image stored on the CPU | |
62 | + template<typename T> | |
63 | + void cpu_down_sample(T* re_img, T* image, T resize, unsigned int x, unsigned int y){ | |
64 | + | |
65 | + //get the number of pixels in the image | |
66 | + unsigned int pixels = x * y; | |
67 | + unsigned int bytes = sizeof(T) * pixels; | |
68 | + | |
69 | + unsigned int sigma_ds = 1/resize; | |
70 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | |
71 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | |
72 | + unsigned int bytes_ds = sizeof(T) * x_ds * y_ds; | |
73 | + | |
74 | + | |
75 | + | |
76 | + //allocate space on the GPU for the original image | |
77 | + T* gpuI0; | |
78 | + cudaMalloc(&gpuI0, bytes); | |
79 | + | |
80 | + | |
81 | + //copy the image data to the GPU | |
82 | + cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); | |
83 | + | |
84 | + //allocate space on the GPU for the down sampled image | |
85 | + T* gpuI; | |
86 | + cudaMalloc(&gpuI, bytes_ds); | |
87 | + | |
88 | + //run the GPU-based version of the algorithm | |
89 | + gpu_down_sample<T>(gpuI, gpuI0, resize, x, y); | |
90 | + | |
91 | + //copy the down-sampled image back to the CPU | |
92 | + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyDeviceToHost); | |
93 | + | |
94 | + cudaFree(gpuI0); | |
95 | + cudaFree(gpuI); | |
96 | + } | |
97 | + | |
98 | + } | |
99 | +} | |
100 | + | |
101 | +#endif | |
\ No newline at end of file
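A hedged usage sketch: the caller allocates the output at the down-sampled size, computed the same way the kernel computes x_ds and y_ds (the image size and resize factor below are illustrative):

    #include <cstdlib>
    #include <stim/cuda/down_sample.cuh>

    int main(){
        unsigned int x = 512, y = 512;                      //illustrative image size
        float resize = 0.5f;                                //down-sample by a factor of 2
        unsigned int s = (unsigned int)(1.0f / resize);     //sampling stride, as in the kernel
        unsigned int x_ds = x / s + (x % s == 0 ? 0 : 1);   //down-sampled width
        unsigned int y_ds = y / s + (y % s == 0 ? 0 : 1);   //down-sampled height

        float* img = (float*)calloc(x * y, sizeof(float));           //input image (blank here)
        float* img_ds = (float*)malloc(x_ds * y_ds * sizeof(float)); //down-sampled output

        stim::cuda::cpu_down_sample(img_ds, img, resize, x, y);

        free(img);
        free(img_ds);
        return 0;
    }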
stim/cuda/gaussian_blur.cuh
1 | +#ifndef STIM_CUDA_GAUSSIAN_BLUR_H | |
2 | +#define STIM_CUDA_GAUSSIAN_BLUR_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/devices.h> | |
7 | +#include <stim/cuda/timer.h> | |
8 | +#include <stim/cuda/sharedmem.cuh> | |
9 | + | |
10 | +#define pi 3.14159 | |
11 | + | |
12 | +namespace stim{ | |
13 | + namespace cuda{ | |
14 | + | |
15 | + template<typename T> | |
16 | + __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ | |
17 | + | |
18 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
19 | + extern __shared__ T s[]; | |
20 | + | |
21 | + int kr = sigma * 4; //calculate the kernel radius | |
22 | + | |
23 | + //get a pointer to the gaussian in memory | |
24 | + T* g = (T*)&s[blockDim.x + 2 * kr]; | |
25 | + | |
26 | + //calculate the start point for this block | |
27 | + int bxi = blockIdx.x * blockDim.x; | |
28 | + int byi = blockIdx.y; | |
29 | + | |
30 | + //copy the portion of the image necessary for this block to shared memory | |
31 | + stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); | |
32 | + | |
33 | + //calculate the thread index and block index | |
34 | + int ti = threadIdx.x; | |
35 | + | |
36 | + //calculate the spatial coordinate for this thread | |
37 | + int xi = bxi + ti; | |
38 | + | |
39 | + //pre-compute the gaussian values for each kernel point | |
40 | + T a = 1.0 / (sigma * sqrt(2 * pi)); | |
41 | + T c = - 1.0 / (2*sigma*sigma); | |
42 | + int ki; | |
43 | + | |
44 | + //use the first 2kr+1 threads to evaluate a gaussian and store the result | |
45 | + if(ti < 2 * kr + 1){ | |
46 | + ki = ti - kr; | |
47 | + g[ti] = a * exp((ki*ki) * c); | |
48 | + } | |
49 | + | |
50 | + //make sure that all writing to shared memory is done before continuing | |
51 | + __syncthreads(); | |
52 | + | |
53 | + //if the current pixel is outside of the image | |
54 | + if(xi >= x || byi >= y) | |
55 | + return; | |
56 | + | |
57 | + | |
58 | + | |
59 | + //calculate the coordinates of the current thread in shared memory | |
60 | + int si = ti + kr; | |
61 | + | |
62 | + T sum = 0; //running weighted sum across the kernel | |
63 | + | |
64 | + | |
65 | + //for each element of the kernel | |
66 | + for(int ki = -kr; ki <= kr; ki++){ | |
67 | + sum += g[ki + kr] * s[si + ki]; | |
68 | + } | |
69 | + | |
70 | + //calculate the 1D image index for this thread | |
71 | + unsigned int i = byi * x + xi; | |
72 | + | |
73 | + //output the result to global memory | |
74 | + out[i] = sum; | |
75 | + } | |
76 | + | |
77 | + template<typename T> | |
78 | + __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ | |
79 | + | |
80 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
81 | + extern __shared__ T s[]; | |
82 | + | |
83 | + int kr = sigma * 4; //calculate the kernel radius | |
84 | + | |
85 | + //get a pointer to the gaussian in memory | |
86 | + T* g = (T*)&s[blockDim.y + 2 * kr]; | |
87 | + | |
88 | + //calculate the start point for this block | |
89 | + int bxi = blockIdx.x; | |
90 | + int byi = blockIdx.y * blockDim.y; | |
91 | + | |
92 | + //copy the portion of the image necessary for this block to shared memory | |
93 | + stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); | |
94 | + | |
95 | + //calculate the thread index and block index | |
96 | + int ti = threadIdx.y; | |
97 | + | |
98 | + //calculate the spatial coordinate for this thread | |
99 | + int yi = byi + ti; | |
100 | + | |
101 | + //pre-compute the gaussian values for each kernel point | |
102 | + T a = 1.0 / (sigma * sqrt(2 * pi)); | |
103 | + T c = - 1.0 / (2*sigma*sigma); | |
104 | + int ki; | |
105 | + | |
106 | + //use the first 2kr+1 threads to evaluate a gaussian and store the result | |
107 | + if(ti < 2 * kr + 1){ | |
108 | + ki = ti - kr; | |
109 | + g[ti] = a * exp((ki*ki) * c); | |
110 | + } | |
111 | + | |
112 | + //make sure that all writing to shared memory is done before continuing | |
113 | + __syncthreads(); | |
114 | + | |
115 | + //if the current pixel is outside of the image | |
116 | + if(bxi >= x || yi >= y) | |
117 | + return; | |
118 | + | |
119 | + | |
120 | + | |
121 | + //calculate the coordinates of the current thread in shared memory | |
122 | + int si = ti + kr; | |
123 | + | |
124 | + T sum = 0; //running weighted sum across the kernel | |
125 | + | |
126 | + | |
127 | + //for each element of the kernel | |
128 | + for(int ki = -kr; ki <= kr; ki++){ | |
129 | + sum += g[ki + kr] * s[si + ki]; | |
130 | + } | |
131 | + | |
132 | + //calculate the 1D image index for this thread | |
133 | + unsigned int i = yi * x + bxi; | |
134 | + | |
135 | + //output the result to global memory | |
136 | + out[i] = sum; | |
137 | + } | |
138 | + | |
139 | + /// Applies a Gaussian blur to a 2D image stored on the GPU | |
140 | + template<typename T> | |
141 | + void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ | |
142 | + | |
143 | + //get the number of pixels in the image | |
144 | + unsigned int pixels = x * y; | |
145 | + unsigned int bytes = sizeof(T) * pixels; | |
146 | + | |
147 | + // Allocate CUDA array in device memory | |
148 | + | |
149 | + //define a channel descriptor for a single 32-bit channel | |
150 | + cudaChannelFormatDesc channelDesc = | |
151 | + cudaCreateChannelDesc(32, 0, 0, 0, | |
152 | + cudaChannelFormatKindFloat); | |
153 | + cudaArray* cuArray; //declare the cuda array | |
154 | + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array | |
155 | + | |
156 | + // Copy the image data from global memory to the array | |
157 | + cudaMemcpyToArray(cuArray, 0, 0, image, bytes, | |
158 | + cudaMemcpyDeviceToDevice); | |
159 | + | |
160 | + // Specify texture | |
161 | + struct cudaResourceDesc resDesc; //create a resource descriptor | |
162 | + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | |
163 | + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | |
164 | + resDesc.res.array.array = cuArray; //add a pointer to the cuda array | |
165 | + | |
166 | + // Specify texture object parameters | |
167 | + struct cudaTextureDesc texDesc; //create a texture descriptor | |
168 | + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | |
169 | + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) | |
170 | + texDesc.addressMode[1] = cudaAddressModeWrap; | |
171 | + texDesc.filterMode = cudaFilterModePoint; //use point (nearest-neighbor) sampling | |
172 | + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | |
173 | + texDesc.normalizedCoords = 0; //not using normalized coordinates | |
174 | + | |
175 | + // Create texture object | |
176 | + cudaTextureObject_t texObj = 0; | |
177 | + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | |
178 | + | |
179 | + | |
180 | + //get the maximum number of threads per block for the CUDA device | |
181 | + int max_threads = stim::maxThreadsPerBlock(); | |
182 | + dim3 threads(max_threads, 1); | |
183 | + | |
184 | + //calculate the number of blocks | |
185 | + dim3 blocks(x / threads.x + 1, y); | |
186 | + | |
187 | + //calculate the shared memory used in the kernel | |
188 | + unsigned int pixel_bytes = max_threads * sizeof(T); //bytes devoted to pixel data being processed | |
189 | + unsigned int apron_bytes = sigma * 8 * sizeof(T); //bytes devoted to pixels outside the window | |
190 | + unsigned int gaussian_bytes = (sigma * 8 + 1) * sizeof(T); //bytes devoted to memory used to store the pre-computed Gaussian window | |
191 | + unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes of shared memory used | |
192 | + | |
193 | + //blur the image along the x-axis | |
194 | + gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); | |
195 | + | |
196 | + // Copy the x-blurred data from global memory to the texture | |
197 | + cudaMemcpyToArray(cuArray, 0, 0, image, bytes, | |
198 | + cudaMemcpyDeviceToDevice); | |
199 | + | |
200 | + //transpose the block and thread dimensions | |
201 | + threads.x = 1; | |
202 | + threads.y = max_threads; | |
203 | + blocks.x = x; | |
204 | + blocks.y = y / threads.y + 1; | |
205 | + | |
206 | + //blur the image along the y-axis | |
207 | + gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); | |
208 | + | |
209 | + //free allocated memory | |
210 | + cudaDestroyTextureObject(texObj); | |
211 | + cudaFreeArray(cuArray); | |
211 | + | |
212 | + } | |
213 | + | |
214 | + /// Applies a Gaussian blur to a 2D image stored on the CPU | |
215 | + template<typename T> | |
216 | + void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ | |
217 | + | |
218 | + //get the number of pixels in the image | |
219 | + unsigned int pixels = x * y; | |
220 | + unsigned int bytes = sizeof(T) * pixels; | |
221 | + | |
222 | + //allocate space on the GPU | |
223 | + T* gpuI0; | |
224 | + cudaMalloc(&gpuI0, bytes); | |
225 | + | |
226 | + | |
227 | + //copy the image data to the GPU | |
228 | + cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); | |
229 | + | |
230 | + //run the GPU-based version of the algorithm | |
231 | + gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y); | |
232 | + | |
233 | + //copy the image data from the device | |
234 | + cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); | |
235 | + | |
236 | + //free allocated memory | |
237 | + cudaFree(gpuI0); | |
238 | + } | |
239 | + | |
240 | + } | |
241 | +} | |
242 | + | |
243 | +#endif | |
\ No newline at end of file
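A minimal in-place blur of a host image (sigma is in pixels; the kernel radius is 4 * sigma, so the image should be comfortably larger than the apron):

    #include <cstdlib>
    #include <stim/cuda/gaussian_blur.cuh>

    int main(){
        unsigned int x = 256, y = 256;                      //illustrative image size
        float* img = (float*)calloc(x * y, sizeof(float));  //host image (blank here)

        stim::cuda::cpu_gaussian_blur_2d(img, 2.0f, x, y);  //blur in place with sigma = 2

        free(img);
        return 0;
    }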
stim/cuda/gradient.cuh
1 | +#ifndef STIM_CUDA_GRADIENT_H | |
2 | +#define STIM_CUDA_GRADIENT_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/devices.h> | |
7 | +#include <stim/cuda/error.h> | |
8 | + | |
9 | +namespace stim{ | |
10 | + namespace cuda{ | |
11 | + | |
12 | + template<typename T> | |
13 | + __global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){ | |
14 | + | |
15 | + //calculate the 1D image index for this thread | |
16 | + int i = blockIdx.x * blockDim.x + threadIdx.x; | |
17 | + | |
18 | + //convert this to 2D pixel coordinates | |
19 | + int yi = i / x; | |
20 | + int xi = i - (yi * x); | |
21 | + | |
22 | + //return if the pixel is outside of the image | |
23 | + if(xi >= x || yi >= y) return; | |
24 | + | |
25 | + //calculate indices for the forward difference | |
26 | + int i_xp = yi * x + (xi + 1); | |
27 | + int i_yp = (yi + 1) * x + xi; | |
28 | + | |
29 | + //use forward differences if a coordinate is zero | |
30 | + if(xi == 0) | |
31 | + out[i * 2 + 0] = in[i_xp] - in[i]; | |
32 | + if(yi == 0) | |
33 | + out[i * 2 + 1] = in[i_yp] - in[i]; | |
34 | + | |
35 | + //calculate indices for the backward difference | |
36 | + int i_xn = yi * x + (xi - 1); | |
37 | + int i_yn = (yi - 1) * x + xi; | |
38 | + | |
39 | + //use backward differences if the coordinate is at the maximum edge | |
40 | + if(xi == x-1) | |
41 | + out[i * 2 + 0] = in[i] - in[i_xn]; | |
42 | + if(yi == y-1) | |
43 | + out[i * 2 + 1] = in[i] - in[i_yn]; | |
44 | + | |
45 | + //otherwise use central differences | |
46 | + if(xi > 0 && xi < x-1) | |
47 | + out[i * 2 + 0] = (in[i_xp] - in[i_xn]) / 2; | |
48 | + | |
49 | + if(yi > 0 && yi < y-1) | |
50 | + out[i * 2 + 1] = (in[i_yp] - in[i_yn]) / 2; | |
51 | + | |
52 | + } | |
53 | + | |
54 | + template<typename T> | |
55 | + //void gpu_gradient_2d(T* gpuOut, T* gpuIn, unsigned int x, unsigned int y){ | |
56 | + void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ | |
57 | + | |
58 | + //get the number of pixels in the image | |
59 | + unsigned int pixels = x * y; | |
60 | + | |
61 | + //allocate space on the GPU for the input image | |
62 | + //T* gpuI; | |
63 | + //HANDLE_ERROR(cudaMalloc(&gpuI, bytes)); | |
64 | + | |
65 | + //cudaMemcpy(gpuI, gpuI0, bytes, cudaMemcpyDeviceToDevice); | |
66 | + | |
67 | + | |
68 | + //allocate space on the GPU for the output gradient image | |
69 | + //T* gpuGrad; | |
70 | + //cudaMalloc(&gpuGrad, bytes * 2); //the output image will have two channels (x, y) | |
71 | + | |
72 | + //get the maximum number of threads per block for the CUDA device | |
73 | + int threads = stim::maxThreadsPerBlock(); | |
74 | + | |
75 | + //calculate the number of blocks | |
76 | + int blocks = pixels / threads + (pixels%threads == 0 ? 0:1); | |
77 | + | |
78 | + //call the GPU kernel to determine the gradient | |
79 | + gradient_2d<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y); | |
80 | + | |
81 | + } | |
82 | + | |
83 | + template<typename T> | |
84 | + void cpu_gradient_2d(T* out, T* in, unsigned int x, unsigned int y){ | |
85 | + | |
86 | + //get the number of pixels in the image | |
87 | + unsigned int pixels = x * y; | |
88 | + unsigned int bytes = pixels * sizeof(T); | |
89 | + | |
90 | + //allocate space on the GPU for the input image | |
91 | + T* gpuIn; | |
92 | + HANDLE_ERROR(cudaMalloc(&gpuIn, bytes)); | |
93 | + | |
94 | + //copy the image data to the GPU | |
95 | + HANDLE_ERROR(cudaMemcpy(gpuIn, in, bytes, cudaMemcpyHostToDevice)); | |
96 | + | |
97 | + //allocate space on the GPU for the output gradient image | |
98 | + T* gpuOut; | |
99 | + cudaMalloc(&gpuOut, bytes * 2); //the output image will have two channels (x, y) | |
100 | + | |
101 | + //call the GPU version of this function | |
102 | + gpu_gradient_2d(gpuOut, gpuIn, x, y); | |
103 | + | |
104 | + //copy the results to the CPU | |
105 | + cudaMemcpy(out, gpuOut, bytes * 2, cudaMemcpyDeviceToHost); | |
106 | + | |
107 | + //free allocated memory | |
108 | + cudaFree(gpuIn); | |
109 | + cudaFree(gpuOut); | |
109 | + } | |
110 | + | |
111 | + } | |
112 | +} | |
113 | + | |
114 | + | |
115 | +#endif | |
\ No newline at end of file
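The output gradient image is interleaved, with two channels per pixel. A short usage sketch:

    #include <cstdlib>
    #include <stim/cuda/gradient.cuh>

    int main(){
        unsigned int x = 256, y = 256;                           //illustrative image size
        float* img = (float*)calloc(x * y, sizeof(float));       //host input image
        float* grad = (float*)malloc(x * y * 2 * sizeof(float)); //two channels per pixel

        stim::cuda::cpu_gradient_2d(grad, img, x, y);
        //grad[2*i + 0] is the x-derivative and grad[2*i + 1] the y-derivative of pixel i

        free(img);
        free(grad);
        return 0;
    }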
stim/cuda/local_max.cuh
1 | +#ifndef STIM_CUDA_LOCAL_MAX_H | |
2 | +#define STIM_CUDA_LOCAL_MAX_H | |
3 | + | |
4 | + | |
5 | +#include <iostream> | |
6 | +#include <cuda.h> | |
7 | +#include <stim/cuda/devices.h> | |
8 | +#include <stim/cuda/error.h> | |
9 | + | |
10 | +namespace stim{ | |
11 | + namespace cuda{ | |
12 | + | |
13 | + | |
14 | + // this kernel calculates the local maximum for finding the cell centers | |
15 | + template<typename T> | |
16 | + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ | |
17 | + | |
18 | + // calculate the 2D coordinates for this current thread. | |
19 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
20 | + int yi = blockIdx.y; | |
21 | + // convert 2D coordinates to 1D | |
22 | + int i = yi * x + xi; | |
23 | + | |
24 | + //return if the pixel is outside of the image | |
25 | + if(xi >= x || yi >= y) return; | |
26 | + //calculate the lower limit of the neighborhood for this pixel; the neighborhood size is defined by 'conn' | |
27 | + int xl = xi - conn; | |
28 | + int yl = yi - conn; | |
29 | + | |
30 | + // clamp the lower limits to zero if xi or yi is less than conn | |
31 | + if (xi <= conn) | |
32 | + xl = 0; | |
33 | + if (yi <= conn) | |
34 | + yl = 0; | |
35 | + | |
36 | + //calculate the upper limit of the neighborhood for this pixel | |
37 | + int xh = xi + conn; | |
38 | + int yh = yi + conn; | |
39 | + | |
40 | + // clamp the upper limits to the last row/column if xi or yi is within conn of the image edge | |
41 | + if (xi >= x - conn) | |
42 | + xh = x - 1; | |
43 | + if (yi >= y - conn) | |
44 | + yh = y - 1; | |
45 | + | |
46 | + // calculate the limits for finding the local maximum location in the connected neighbors for the current pixel | |
47 | + int n_l = yl * x + xl; | |
48 | + int n_h = yh * x + xh; | |
49 | + | |
50 | + //initialize the output centers image to zero | |
51 | + gpuCenters[i] = 0; | |
52 | + | |
53 | + | |
54 | + int n = n_l; | |
55 | + | |
56 | + float l_value = 0; | |
57 | + | |
58 | + // check if the vote value for this pixel is greater than the threshold, in which case it may be a local max | |
61 | + if (gpuVote[i]>final_t){ | |
62 | + | |
63 | + // compare the vote value for this pixel with the vote values with its neighbors. | |
64 | + while (n<=n_h){ | |
65 | + | |
66 | + // check if this vote value is a local max in its neighborhood. | |
67 | + if (gpuVote[i] < gpuVote[n]){ | |
68 | + l_value = 0; | |
69 | + n =n_h+1; | |
70 | + } | |
71 | + else if (n == n_h){ | |
72 | + l_value = 1; | |
73 | + n = n+1; | |
74 | + } | |
75 | + // check if the current neighbor is the last one at the current row | |
76 | + else if ((n - n_l - 2*conn)% x ==0){ | |
77 | + n = n + x - 2*conn -1; | |
78 | + n ++; | |
79 | + } | |
80 | + else | |
81 | + n ++; | |
82 | + } | |
83 | + // set the center value for this pixel to 1 if it is a local max, and 0 otherwise | |
84 | + gpuCenters[i] = l_value ; | |
85 | + } | |
86 | + | |
87 | + } | |
88 | + | |
89 | + | |
90 | + | |
91 | + template<typename T> | |
92 | + void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ | |
93 | + | |
94 | + | |
95 | + | |
96 | + | |
97 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
98 | + dim3 threads(max_threads, 1); | |
99 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
100 | + | |
101 | + | |
102 | + | |
103 | + //call the kernel to find the local maximum. | |
104 | + cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); | |
105 | + | |
106 | + | |
107 | + } | |
108 | + | |
109 | + | |
110 | + | |
111 | + template<typename T> | |
112 | + void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ | |
113 | + | |
114 | + | |
115 | + //calculate the number of bytes in the array | |
116 | + unsigned int bytes = x * y * sizeof(T); | |
117 | + | |
118 | + // allocate space on the GPU for the detected cell centers | |
119 | + T* gpuCenters; | |
120 | + cudaMalloc(&gpuCenters, bytes); | |
121 | + | |
122 | + | |
123 | + //allocate space on the GPU for the input Vote Image | |
124 | + T* gpuVote; | |
125 | + cudaMalloc(&gpuVote, bytes); | |
126 | + | |
127 | + | |
128 | + //copy the Vote image data to the GPU | |
129 | + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); | |
130 | + | |
131 | + | |
132 | + //call the GPU version of the local max function | |
133 | + gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y); | |
134 | + | |
135 | + | |
136 | + //copy the cell centers data to the CPU | |
137 | + cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ; | |
138 | + | |
139 | + | |
140 | + //free allocated memory | |
141 | + cudaFree(gpuCenters); | |
142 | + cudaFree(gpuVote); | |
144 | + } | |
145 | + | |
146 | + } | |
147 | +} | |
148 | + | |
149 | + | |
150 | + | |
151 | +#endif | |
\ No newline at end of file
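A hedged sketch of calling the local-max detector on a vote image (the threshold and connectivity values are illustrative assumptions, not taken from this commit):

    #include <cstdlib>
    #include <stim/cuda/local_max.cuh>

    int main(){
        unsigned int x = 256, y = 256;                          //illustrative image size
        float* vote = (float*)calloc(x * y, sizeof(float));     //vote image (blank here)
        float* centers = (float*)malloc(x * y * sizeof(float)); //1 at detected centers, 0 elsewhere

        stim::cuda::cpu_local_max(centers, vote, 100.0f, 4u, x, y); //threshold = 100, conn = 4

        free(vote);
        free(centers);
        return 0;
    }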
stim/cuda/sharedmem.cuh
1 | + | |
2 | +#ifndef STIM_CUDA_SHAREDMEM_H | |
3 | +#define STIM_CUDA_SHAREDMEM_H | |
4 | + | |
5 | +namespace stim{ | |
6 | + namespace cuda{ | |
7 | + | |
8 | + // Copies values from global memory to shared memory, optimizing threads | |
9 | + template<typename T> | |
10 | + __device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src, | |
11 | + unsigned int x, unsigned int y, unsigned int X, unsigned int Y, | |
12 | + dim3 threadIdx, dim3 blockDim){ | |
13 | + | |
14 | + //calculate the number of iterations required for the copy | |
15 | + unsigned int xI, yI; | |
16 | + xI = X/blockDim.x + 1; //number of iterations along X | |
17 | + yI = Y/blockDim.y + 1; //number of iterations along Y | |
18 | + | |
19 | + //for each iteration | |
20 | + for(unsigned int xi = 0; xi < xI; xi++){ | |
21 | + for(unsigned int yi = 0; yi < yI; yi++){ | |
22 | + | |
23 | + //calculate the index into shared memory | |
24 | + unsigned int sx = xi * blockDim.x + threadIdx.x; | |
25 | + unsigned int sy = yi * blockDim.y + threadIdx.y; | |
26 | + | |
27 | + //calculate the index into the texture | |
28 | + unsigned int tx = x + sx; | |
29 | + unsigned int ty = y + sy; | |
30 | + | |
31 | + //perform the copy | |
32 | + if(sx < X && sy < Y) | |
33 | + dest[sy * X + sx] = tex2D<T>(src, tx, ty); | |
34 | + } | |
35 | + } | |
36 | + } | |
37 | + | |
38 | + } | |
39 | +} | |
40 | + | |
41 | + | |
42 | +#endif | |
\ No newline at end of file
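A sketch of the calling pattern this helper expects, mirroring gaussian_blur_x above (the kernel itself is a hypothetical illustration): each block loads its tile plus an apron of kr pixels on either side into dynamic shared memory, then synchronizes before reading.

    __global__ void example_kernel(cudaTextureObject_t in, int kr){
        extern __shared__ float s[];                //dynamic shared memory
        int bxi = blockIdx.x * blockDim.x;          //first pixel of this block

        //load blockDim.x + 2*kr pixels of row blockIdx.y, starting kr pixels early
        stim::cuda::sharedMemcpy_tex2D<float>(s, in, bxi - kr, blockIdx.y,
                2 * kr + blockDim.x, 1, threadIdx, blockDim);
        __syncthreads();                            //wait until the whole tile is loaded

        //each thread can now read s[threadIdx.x + kr] and its kr neighbors on either side
    }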
stim/cuda/update_dir.cuh
1 | +#ifndef STIM_CUDA_UPDATE_DIR_H | |
2 | +#define STIM_CUDA_UPDATE_DIR_H | |
3 | + | |
4 | + | |
5 | +#include <iostream> | |
6 | +#include <cuda.h> | |
7 | +#include <stim/cuda/devices.h> | |
8 | +#include <stim/cuda/error.h> | |
9 | +#include <stim/cuda/sharedmem.cuh> | |
10 | + | |
11 | +namespace stim{ | |
12 | + namespace cuda{ | |
13 | + | |
14 | + // this kernel calculates the voting direction for the next iteration, based on the direction from each voter to the maximum vote value within its voting area | |
15 | + template<typename T> | |
16 | + __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
17 | + | |
18 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
19 | + extern __shared__ float s_vote[]; | |
20 | + | |
21 | + //calculate the start point for this block | |
22 | + int bxi = blockIdx.x * blockDim.x; | |
23 | + | |
24 | + //calculate the width of the shared memory block | |
25 | + int swidth = 2 * rmax + blockDim.x; | |
26 | + | |
27 | + // calculate the 2D coordinates for this current thread. | |
28 | + int xi = bxi + threadIdx.x; | |
29 | + int yi = blockIdx.y; | |
30 | + | |
31 | + // convert 2D coordinates to 1D | |
32 | + int i = yi * x + xi; | |
33 | + | |
34 | + // read the current voting direction for this voter from the gradient image | |
35 | + float theta = 0; | |
36 | + if(xi < x && yi < y){ | |
37 | + theta = gpuGrad[2*i]; | |
38 | + //initialize the vote direction to zero | |
39 | + gpuDir[i] = 0; | |
40 | + } | |
39 | + | |
41 | + // define a local variable for the maximum vote value in the voting area of this voter | |
41 | + float max = 0; | |
42 | + | |
44 | + // define two local variables for the x and y coordinates where the maximum occurred | |
44 | + int id_x = 0; | |
45 | + int id_y = 0; | |
46 | + | |
47 | + // compute the size of window which will be checked for finding the voting area for this voter | |
48 | + unsigned int x_table = 2*rmax +1; | |
49 | + unsigned int rmax_sq = rmax * rmax; | |
50 | + int r = (int)rmax; | |
51 | + int tx_rmax = threadIdx.x + rmax; | |
52 | + int bxs = bxi - rmax; | |
53 | + | |
54 | + for(int yr = -r; yr <= r; yr++){ | |
55 | + | |
56 | + //copy the portion of the image necessary for this block to shared memory | |
57 | + __syncthreads(); | |
58 | + stim::cuda::sharedMemcpy_tex2D<float>(s_vote, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim); | |
59 | + __syncthreads(); | |
60 | + | |
61 | + //if the current thread is outside of the image, it doesn't have to be computed | |
62 | + if(xi < x && yi < y){ | |
63 | + | |
64 | + for(int xr = -r; xr <= r; xr++){ | |
65 | + | |
66 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | |
67 | + | |
68 | + // calculate the angle between the voter and the current pixel in x and y directions | |
69 | + float atan_angle = gpuTable[ind_t]; | |
70 | + | |
71 | + | |
72 | + // read the vote value for the current pixel from shared memory | |
73 | + int idx_share_update = xr + tx_rmax ; | |
74 | + float share_vote = s_vote[idx_share_update]; | |
75 | + | |
76 | + // check if the current pixel is located in the voting area of this voter. | |
77 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | |
78 | + | |
79 | + // compare the vote value of this pixel with the max value to find the maxima and its index. | |
80 | + if (share_vote>max) { | |
81 | + | |
82 | + max = share_vote; | |
83 | + id_x = xr; | |
84 | + id_y = yr; | |
85 | + } | |
86 | + } | |
87 | + } | |
88 | + } | |
89 | + } | |
90 | + | |
91 | + | |
92 | + //look up the new vote direction from the atan2 table | |
93 | + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | |
94 | + | |
95 | + float new_angle = gpuTable[ind_m]; | |
96 | + | |
97 | + if(xi < x && yi < y) | |
98 | + gpuDir[i] = new_angle; | |
98 | + | |
99 | + } | |
100 | + | |
101 | + // this kernel updates the gradient direction by the calculated voting direction. | |
102 | + template<typename T> | |
103 | + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){ | |
104 | + | |
105 | + //************ when the number of threads are (1024,1) ************* | |
106 | + | |
107 | + // calculate the 2D coordinates for this current thread. | |
108 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
109 | + int yi = blockIdx.y; | |
110 | + // convert 2D coordinates to 1D | |
111 | + int i = yi * x + xi; | |
112 | + | |
113 | + //return if the pixel is outside of the image | |
114 | + if(xi >= x || yi >= y) return; | |
115 | + | |
116 | + //update the gradient image with the vote direction | |
117 | + gpuGrad[2*i] = gpuDir[i]; | |
116 | + } | |
117 | + | |
118 | + | |
119 | + template<typename T> | |
120 | + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
121 | + | |
122 | + //get the number of pixels in the image | |
123 | + unsigned int pixels = x * y; | |
124 | + unsigned int bytes = sizeof(T) * pixels; | |
125 | + | |
126 | + | |
127 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
128 | + dim3 threads(max_threads, 1); | |
129 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
130 | + | |
131 | + // Allocate CUDA array in device memory | |
132 | + | |
133 | + //define a channel descriptor for a single 32-bit channel | |
134 | + cudaChannelFormatDesc channelDesc = | |
135 | + cudaCreateChannelDesc(32, 0, 0, 0, | |
136 | + cudaChannelFormatKindFloat); | |
137 | + cudaArray* cuArray; //declare the cuda array | |
138 | + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array | |
139 | + | |
140 | + // Copy the image data from global memory to the array | |
141 | + cudaMemcpyToArray(cuArray, 0, 0, gpuVote, bytes, | |
142 | + cudaMemcpyDeviceToDevice); | |
143 | + | |
144 | + // Specify texture | |
145 | + struct cudaResourceDesc resDesc; //create a resource descriptor | |
146 | + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | |
147 | + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | |
148 | + resDesc.res.array.array = cuArray; //add a pointer to the cuda array | |
149 | + | |
150 | + // Specify texture object parameters | |
151 | + struct cudaTextureDesc texDesc; //create a texture descriptor | |
152 | + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | |
153 | + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) | |
154 | + texDesc.addressMode[1] = cudaAddressModeWrap; | |
155 | + texDesc.filterMode = cudaFilterModePoint; //use point (nearest-neighbor) sampling | |
156 | + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | |
157 | + texDesc.normalizedCoords = 0; //not using normalized coordinates | |
158 | + | |
159 | + // Create texture object | |
160 | + cudaTextureObject_t texObj = 0; | |
161 | + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | |
162 | + | |
163 | + // specify the shared memory size (in bytes) for the kernel | |
164 | + unsigned int share_bytes = (2*rmax + threads.x)*(1)*4; | |
165 | + | |
166 | + // allocate space on the GPU for the updated vote direction | |
167 | + T* gpuDir; | |
168 | + cudaMalloc(&gpuDir, bytes); | |
169 | + | |
170 | + //call the kernel to calculate the new voting direction | |
171 | + cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, texObj, gpuGrad, gpuTable, phi, rmax, x , y); | |
172 | + | |
173 | + //call the kernel to update the gradient direction | |
174 | + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | |
175 | + | |
176 | + | |
177 | + //free allocated memory | |
178 | + cudaDestroyTextureObject(texObj); | |
179 | + cudaFreeArray(cuArray); | |
180 | + cudaFree(gpuDir); | |
179 | + | |
180 | + } | |
181 | + | |
182 | + | |
183 | + template<typename T> | |
184 | + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
185 | + | |
186 | + //calculate the number of bytes in the array | |
187 | + unsigned int bytes = x * y * sizeof(T); | |
188 | + | |
189 | + //calculate the number of bytes in the atan2 table | |
190 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | |
191 | + | |
192 | + //allocate space on the GPU for the Vote Image | |
193 | + T* gpuVote; | |
194 | + cudaMalloc(&gpuVote, bytes); | |
195 | + | |
196 | + //copy the input vote image to the GPU | |
197 | + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); | |
198 | + | |
199 | + //allocate space on the GPU for the input Gradient image | |
200 | + T* gpuGrad; | |
201 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | |
202 | + | |
203 | + //copy the Gradient data to the GPU | |
204 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | |
205 | + | |
206 | + //allocate space on the GPU for the atan2 table | |
207 | + T* gpuTable; | |
208 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | |
209 | + | |
210 | + //copy the atan2 values to the GPU | |
211 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | |
212 | + | |
213 | + | |
214 | + //call the GPU version of the update direction function | |
215 | + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
216 | + | |
217 | + | |
218 | + //copy the new gradient image back to the CPU | |
219 | + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; | |
220 | + | |
221 | + //free allocated memory | |
222 | + cudaFree(gpuTable); | |
223 | + cudaFree(gpuVote); | |
224 | + cudaFree(gpuGrad); | |
225 | + } | |
226 | + | |
227 | + } | |
228 | +} | |
229 | + | |
230 | + | |
231 | + | |
232 | +#endif | |
\ No newline at end of file
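Both cuda_update_dir and cuda_vote index a precomputed atan2 lookup table of size (2*rmax+1) x (2*rmax+1); the table itself is not built anywhere in this commit. A plausible CPU-side sketch, inferred from how ind_t and id_t are computed in the kernels (the helper name is hypothetical):

    #include <cmath>

    //build a (2*rmax+1) x (2*rmax+1) table of atan2 values (hypothetical helper)
    void build_atan2_table(float* table, unsigned int rmax){
        int r = (int)rmax;
        int w = 2 * r + 1;                          //table width
        for(int yr = -r; yr <= r; yr++)
            for(int xr = -r; xr <= r; xr++)
                //angle from the window center to (xr, yr), matching the kernel indexing
                table[(yr + r) * w + (xr + r)] = atan2f((float)yr, (float)xr);
    }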
stim/cuda/vote.cuh
1 | +#ifndef STIM_CUDA_VOTE_H | |
2 | +#define STIM_CUDA_VOTE_H | |
3 | + | |
4 | + | |
5 | +#include <iostream> | |
6 | +#include <cuda.h> | |
7 | +#include <stim/cuda/devices.h> | |
8 | +#include <stim/cuda/error.h> | |
9 | +#include <stim/cuda/sharedmem.cuh> | |
10 | + | |
11 | + | |
12 | +namespace stim{ | |
13 | + namespace cuda{ | |
14 | + | |
15 | + // this kernel calculates the vote value for each pixel by summing the gradient magnitudes of all voters whose voting area contains that pixel | |
16 | + template<typename T> | |
17 | + __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
18 | + | |
19 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
20 | + extern __shared__ float2 s_grad[]; | |
21 | + | |
22 | + //calculate the start point for this block | |
23 | + int bxi = blockIdx.x * blockDim.x; | |
24 | + | |
25 | + //calculate the width of the shared memory block | |
26 | + int swidth = 2 * rmax + blockDim.x; | |
27 | + | |
28 | + // calculate the 2D coordinates for this current thread. | |
29 | + int xi = bxi + threadIdx.x; | |
30 | + int yi = blockIdx.y; | |
31 | + // convert 2D coordinates to 1D | |
32 | + int i = yi * x + xi; | |
33 | + | |
34 | + | |
35 | + // define a local variable to sum the votes from the voters | |
36 | + float sum = 0; | |
37 | + | |
38 | + // compute the size of window which will be checked for finding the proper voters for this pixel | |
39 | + unsigned int x_table = 2*rmax +1; | |
40 | + | |
41 | + unsigned int rmax_sq = rmax * rmax; | |
42 | + int r = (int)rmax; | |
43 | + int tx_rmax = threadIdx.x + rmax; | |
44 | + int bxs = bxi - rmax; | |
45 | + | |
46 | + | |
47 | + for(int yr = -r; yr <= r; yr++){ | |
48 | + | |
49 | + //copy the portion of the image necessary for this block to shared memory | |
50 | + __syncthreads(); | |
51 | + stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim); | |
52 | + __syncthreads(); | |
53 | + | |
54 | + //if the current thread is outside of the image, it doesn't have to be computed | |
55 | + if(xi < x && yi < y){ | |
56 | + | |
57 | + for(int xr = -r; xr <= r; xr++){ | |
58 | + | |
59 | + //find the location of this voter in the atan2 table | |
60 | + unsigned int id_t = (yr + rmax) * x_table + xr + rmax; | |
61 | + | |
62 | + // calculate the angle between the pixel and the current voter in x and y directions | |
63 | + float atan_angle = gpuTable[id_t]; | |
64 | + | |
65 | + | |
66 | + // read the gradient of the current voter from shared memory | |
67 | + int idx_share = xr + tx_rmax ; | |
68 | + float2 g = s_grad[idx_share]; | |
69 | + float theta = g.x; | |
70 | + | |
71 | + // check if the current voter is located in the voting area of this pixel. | |
72 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | |
73 | + sum += g.y; | |
74 | + | |
75 | + } | |
76 | + | |
77 | + } | |
78 | + } | |
79 | + } | |
80 | + | |
81 | + if(xi < x && yi < y) | |
82 | + gpuVote[i] = sum; | |
82 | + } | |
83 | + | |
84 | + template<typename T> | |
85 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
86 | + | |
87 | + //get the number of pixels in the image | |
88 | + unsigned int pixels = x * y; | |
89 | + unsigned int bytes = sizeof(T) * pixels; | |
90 | + | |
91 | + | |
92 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
93 | + //unsigned int thread_dim = sqrt(max_threads); | |
94 | + dim3 threads(max_threads, 1); | |
95 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
96 | + | |
97 | + // Allocate CUDA array in device memory | |
98 | + | |
99 | + //define a channel descriptor for a single 32-bit channel | |
100 | + cudaChannelFormatDesc channelDesc = | |
101 | + cudaCreateChannelDesc(32, 32, 0, 0, | |
102 | + cudaChannelFormatKindFloat); | |
103 | + cudaArray* cuArray; //declare the cuda array | |
104 | + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array | |
105 | + | |
106 | + // Copy the image data from global memory to the array | |
107 | + cudaMemcpyToArray(cuArray, 0, 0, gpuGrad, bytes*2, | |
108 | + cudaMemcpyDeviceToDevice); | |
109 | + | |
110 | + // Specify texture | |
111 | + struct cudaResourceDesc resDesc; //create a resource descriptor | |
112 | + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | |
113 | + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | |
114 | + resDesc.res.array.array = cuArray; //add a pointer to the cuda array | |
115 | + | |
116 | + // Specify texture object parameters | |
117 | + struct cudaTextureDesc texDesc; //create a texture descriptor | |
118 | + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | |
119 | + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) | |
120 | + texDesc.addressMode[1] = cudaAddressModeWrap; | |
121 | + texDesc.filterMode = cudaFilterModePoint; //use point (nearest-neighbor) sampling | |
122 | + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | |
123 | + texDesc.normalizedCoords = 0; //not using normalized coordinates | |
124 | + | |
125 | + // Create texture object | |
126 | + cudaTextureObject_t texObj = 0; | |
127 | + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | |
128 | + | |
129 | + // specify the shared memory size: two 4-byte channels per shared pixel | |
130 | + unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4; | |
131 | + | |
132 | + | |
133 | + //call the kernel to do the voting | |
134 | + cuda_vote <<< blocks, threads, share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y); | |
135 | + | |
136 | + //free the texture object and cuda array | |
137 | + cudaDestroyTextureObject(texObj); | |
138 | + cudaFreeArray(cuArray); | |
137 | + } | |
138 | + | |
139 | + | |
140 | + template<typename T> | |
141 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
142 | + | |
143 | + //calculate the number of bytes in the array | |
144 | + unsigned int bytes = x * y * sizeof(T); | |
145 | + | |
146 | + //calculate the number of bytes in the atan2 table | |
147 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | |
148 | + | |
149 | + //allocate space on the GPU for the Vote Image | |
150 | + T* gpuVote; | |
151 | + cudaMalloc(&gpuVote, bytes); | |
152 | + | |
153 | + //allocate space on the GPU for the input Gradient image | |
154 | + T* gpuGrad; | |
155 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | |
156 | + | |
157 | + //copy the Gradient Magnitude data to the GPU | |
158 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | |
159 | + | |
160 | + //allocate space on the GPU for the atan2 table | |
161 | + T* gpuTable; | |
162 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | |
163 | + | |
164 | + //copy the atan2 values to the GPU | |
165 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | |
166 | + | |
167 | + //cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table ); | |
168 | + | |
169 | + | |
170 | + //call the GPU version of the vote calculation function | |
171 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
172 | + | |
173 | + | |
174 | + //copy the Vote Data back to the CPU | |
175 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | |
176 | + | |
177 | + //free allocated memory | |
178 | + cudaFree(gpuTable); | |
179 | + cudaFree(gpuVote); | |
180 | + cudaFree(gpuGrad); | |
181 | + } | |
182 | + | |
183 | + } | |
184 | +} | |
185 | + | |
186 | + | |
187 | + | |
188 | +#endif | |
\ No newline at end of file
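Taken together, these files implement one pass of an iterative voting scheme for detecting cell centers. A hedged end-to-end sketch (the iteration count, phi schedule, threshold, and conn values are illustrative assumptions, not taken from this commit; table must hold (2*rmax+1)^2 precomputed atan2 values):

    #include <cstdlib>
    #include <stim/cuda/gradient.cuh>
    #include <stim/cuda/arraymath.cuh>
    #include <stim/cuda/vote.cuh>
    #include <stim/cuda/update_dir.cuh>
    #include <stim/cuda/local_max.cuh>

    void find_centers(float* centers, float* image, float* table,
                      unsigned int rmax, unsigned int x, unsigned int y){

        float phi = 0.5f;                    //illustrative voting-cone half-angle
        float final_t = 100.0f;              //illustrative vote threshold
        unsigned int conn = 4;               //illustrative local-max neighborhood

        float* grad = (float*)malloc(x * y * 2 * sizeof(float));
        float* vote = (float*)malloc(x * y * sizeof(float));

        stim::cuda::cpu_gradient_2d(grad, image, x, y);  //(dx, dy) per pixel
        stim::cuda::cpu_cart2polar(grad, x * y);         //(theta, magnitude) per pixel

        for(int it = 0; it < 8; it++){                   //illustrative iteration count
            stim::cuda::cpu_vote(vote, grad, table, phi, rmax, x, y);
            stim::cuda::cpu_update_dir(vote, grad, table, phi, rmax, x, y);
            phi = phi * 0.5f;                            //shrink the voting cone (assumption)
        }

        stim::cuda::cpu_local_max(centers, vote, final_t, conn, x, y);

        free(grad);
        free(vote);
    }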