Commit e433d0d4d49606a052101768de0724d777e95765
Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib
Showing
12 changed files
with
1281 additions
and
2 deletions
Show diff stats
1 | +#ifndef STIM_CUDA_ARRAY_ABS_H | ||
2 | +#define STIM_CUDA_ARRAY_ABS_H | ||
3 | + | ||
4 | +namespace stim{ | ||
5 | + namespace cuda{ | ||
6 | + template<typename T> | ||
7 | + __global__ void cuda_abs(T* a, unsigned int N){ | ||
8 | + | ||
9 | + //calculate the 1D index for this thread | ||
10 | + int i = blockIdx.x * blockDim.x + threadIdx.x; | ||
11 | + | ||
12 | + if(i < N) | ||
13 | + a[i] = abs(a[i]); | ||
14 | + } | ||
15 | + | ||
16 | + | ||
17 | + template<typename T> | ||
18 | + void gpu_abs(T* a, unsigned int N){ | ||
19 | + | ||
20 | + //get the maximum number of threads per block for the CUDA device | ||
21 | + int threads = stim::maxThreadsPerBlock(); | ||
22 | + | ||
23 | + //calculate the number of blocks | ||
24 | + int blocks = N / threads + (N%threads == 0 ? 0:1); | ||
25 | + | ||
26 | + //call the kernel to do the multiplication | ||
27 | + cuda_abs <<< blocks, threads >>>(a, N); | ||
28 | + | ||
29 | + } | ||
30 | + | ||
31 | + | ||
32 | + template<typename T> | ||
33 | + void cpu_abs(T* a, unsigned int N){ | ||
34 | + | ||
35 | + //calculate the number of bytes in the array | ||
36 | + unsigned int bytes = N * sizeof(T); | ||
37 | + | ||
38 | + //allocate memory on the GPU for the array | ||
39 | + T* gpuA; | ||
40 | + HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); | ||
41 | + | ||
42 | + //copy the array to the GPU | ||
43 | + HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); | ||
44 | + | ||
45 | + //call the GPU version of this function | ||
46 | + gpu_abs<T>(gpuA, N); | ||
47 | + | ||
48 | + //copy the array back to the CPU | ||
49 | + HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); | ||
50 | + | ||
51 | + //free allocated memory | ||
52 | + cudaFree(gpuA); | ||
53 | + | ||
54 | + } | ||
55 | + | ||
56 | + } //end namespace cuda | ||
57 | +} //end namespace stim | ||
58 | + | ||
59 | +#endif | ||
0 | \ No newline at end of file | 60 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_ARRAY_CART2POLAR_H | ||
2 | +#define STIM_CUDA_ARRAY_CART2POLAR_H | ||
3 | + | ||
4 | +namespace stim{ | ||
5 | + namespace cuda{ | ||
6 | + template<typename T> | ||
7 | + __global__ void cuda_cart2polar(T* a, unsigned int N){ | ||
8 | + | ||
9 | + | ||
10 | + //calculate the 1D index for this thread | ||
11 | + int i = blockIdx.x * blockDim.x + threadIdx.x; | ||
12 | + | ||
13 | + if(i < N){ | ||
14 | + float x = a[i * 2 + 0]; | ||
15 | + float y = a[i * 2 + 1]; | ||
16 | + float theta = atan2( y, x ) ; | ||
17 | + float r = sqrt(x * x + y * y); | ||
18 | + a[i * 2 + 0] = theta; | ||
19 | + a[i * 2 + 1] = r; | ||
20 | + } | ||
21 | + } | ||
22 | + | ||
23 | + | ||
24 | + template<typename T> | ||
25 | + void gpu_cart2polar(T* gpuGrad, unsigned int N){ | ||
26 | + | ||
27 | + //get the maximum number of threads per block for the CUDA device | ||
28 | + int threads = stim::maxThreadsPerBlock(); | ||
29 | + | ||
30 | + //calculate the number of blocks | ||
31 | + int blocks = N / threads + (N % threads == 0 ? 0:1); | ||
32 | + | ||
33 | + //call the kernel to do the multiplication | ||
34 | + cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N); | ||
35 | + | ||
36 | + } | ||
37 | + | ||
38 | + | ||
39 | + template<typename T> | ||
40 | + void cpu_cart2polar(T* a, unsigned int N){ | ||
41 | + | ||
42 | + //calculate the number of bytes in the array | ||
43 | + unsigned int bytes = N * sizeof(T) * 2; | ||
44 | + | ||
45 | + //allocate memory on the GPU for the array | ||
46 | + T* gpuA; | ||
47 | + HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); | ||
48 | + | ||
49 | + //copy the array to the GPU | ||
50 | + HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); | ||
51 | + | ||
52 | + //call the GPU version of this function | ||
53 | + gpu_cart2polar<T>(gpuA, N); | ||
54 | + | ||
55 | + //copy the array back to the CPU | ||
56 | + HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); | ||
57 | + | ||
58 | + //free allocated memory | ||
59 | + cudaFree(gpuA); | ||
60 | + | ||
61 | + } | ||
62 | + | ||
63 | + } | ||
64 | +} | ||
65 | + | ||
66 | +#endif | ||
0 | \ No newline at end of file | 67 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_ARRAY_MULTIPLY_H | ||
2 | +#define STIM_CUDA_ARRAY_MULTIPLY_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <stim/cuda/devices.h> | ||
7 | +#include <stim/cuda/error.h> | ||
8 | + | ||
9 | +namespace stim{ | ||
10 | + namespace cuda{ | ||
11 | + | ||
12 | + template<typename T> | ||
13 | + __global__ void cuda_multiply(T* lhs, T rhs, unsigned int N){ | ||
14 | + | ||
15 | + //calculate the 1D index for this thread | ||
16 | + int i = blockIdx.x * blockDim.x + threadIdx.x; | ||
17 | + | ||
18 | + if(i < N) | ||
19 | + lhs[i] *= rhs; | ||
20 | + } | ||
21 | + | ||
22 | + template<typename T> | ||
23 | + void gpu_multiply(T* lhs, T rhs, unsigned int N){ | ||
24 | + | ||
25 | + //get the maximum number of threads per block for the CUDA device | ||
26 | + int threads = stim::maxThreadsPerBlock(); | ||
27 | + | ||
28 | + //calculate the number of blocks | ||
29 | + int blocks = N / threads + (N%threads == 0 ? 0:1); | ||
30 | + | ||
31 | + //call the kernel to do the multiplication | ||
32 | + cuda_multiply <<< blocks, threads >>>(lhs, rhs, N); | ||
33 | + | ||
34 | + } | ||
35 | + | ||
36 | + template<typename T> | ||
37 | + void cpu_multiply(T* lhs, T rhs, unsigned int N){ | ||
38 | + | ||
39 | + //calculate the number of bytes in the array | ||
40 | + unsigned int bytes = N * sizeof(T); | ||
41 | + | ||
42 | + //allocate memory on the GPU for the array | ||
43 | + T* gpuLHS; | ||
44 | + HANDLE_ERROR( cudaMalloc(&gpuLHS, bytes) ); | ||
45 | + | ||
46 | + //copy the array to the GPU | ||
47 | + HANDLE_ERROR( cudaMemcpy(gpuLHS, lhs, bytes, cudaMemcpyHostToDevice) ); | ||
48 | + | ||
49 | + //call the GPU version of this function | ||
50 | + gpu_multiply<T>(gpuLHS, rhs, N); | ||
51 | + | ||
52 | + //copy the array back to the CPU | ||
53 | + HANDLE_ERROR( cudaMemcpy(lhs, gpuLHS, bytes, cudaMemcpyDeviceToHost) ); | ||
54 | + | ||
55 | + //free allocated memory | ||
56 | + cudaFree(gpuLHS); | ||
57 | + } | ||
58 | + | ||
59 | + } | ||
60 | +} | ||
61 | + | ||
62 | + | ||
63 | + | ||
64 | +#endif | ||
0 | \ No newline at end of file | 65 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_ARRAYMATH_H | ||
2 | +#define STIM_CUDA_ARRAYMATH_H | ||
3 | + | ||
4 | +#include <stim/cuda/array_multiply.cuh> | ||
5 | +#include <stim/cuda/array_abs.cuh> | ||
6 | +#include <stim/cuda/array_cart2polar.cuh> | ||
7 | + | ||
8 | +namespace stim{ | ||
9 | + namespace cuda{ | ||
10 | + | ||
11 | + } | ||
12 | +} | ||
13 | + | ||
14 | + | ||
15 | + | ||
16 | +#endif | ||
0 | \ No newline at end of file | 17 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_DOWN_SAMPLE_H | ||
2 | +#define STIM_CUDA_DOWN_SAMPLE_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <stim/cuda/devices.h> | ||
7 | +#include <stim/cuda/timer.h> | ||
8 | +#include <stim/cuda/gaussian_blur.cuh> | ||
9 | + | ||
10 | +namespace stim{ | ||
11 | + namespace cuda{ | ||
12 | + | ||
13 | + template<typename T> | ||
14 | + __global__ void down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ | ||
15 | + | ||
16 | + unsigned int sigma_ds = 1/resize; | ||
17 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | ||
18 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | ||
19 | + | ||
20 | + | ||
21 | + // calculate the 2D coordinates for this current thread. | ||
22 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
23 | + int yi = blockIdx.y; | ||
24 | + // convert 2D coordinates to 1D | ||
25 | + int i = yi * x_ds + xi; | ||
26 | + | ||
27 | + if(xi< x_ds && yi< y_ds){ | ||
28 | + | ||
29 | + int x_org = xi * sigma_ds; | ||
30 | + int y_org = yi * sigma_ds; | ||
31 | + int i_org = y_org * x + x_org; | ||
32 | + gpuI[i] = gpuI0[i_org]; | ||
33 | + } | ||
34 | + | ||
35 | + } | ||
36 | + | ||
37 | + | ||
38 | + /// Applies a Gaussian blur to a 2D image stored on the GPU | ||
39 | + template<typename T> | ||
40 | + void gpu_down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ | ||
41 | + | ||
42 | + | ||
43 | + unsigned int sigma_ds = 1/resize; | ||
44 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | ||
45 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | ||
46 | + | ||
47 | + //get the number of pixels in the image | ||
48 | + unsigned int pixels_ds = x_ds * y_ds; | ||
49 | + | ||
50 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
51 | + dim3 threads(max_threads, 1); | ||
52 | + dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds); | ||
53 | + | ||
54 | + stim::cuda::gpu_gaussian_blur_2d<float>(gpuI0, sigma_ds,x ,y); | ||
55 | + | ||
56 | + //resample the image | ||
57 | + down_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y); | ||
58 | + | ||
59 | + } | ||
60 | + | ||
61 | + /// Applies a Gaussian blur to a 2D image stored on the CPU | ||
62 | + template<typename T> | ||
63 | + void cpu_down_sample(T* re_img, T* image, T resize, unsigned int x, unsigned int y){ | ||
64 | + | ||
65 | + //get the number of pixels in the image | ||
66 | + unsigned int pixels = x * y; | ||
67 | + unsigned int bytes = sizeof(T) * pixels; | ||
68 | + | ||
69 | + unsigned int sigma_ds = 1/resize; | ||
70 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | ||
71 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | ||
72 | + unsigned int bytes_ds = sizeof(T) * x_ds * y_ds; | ||
73 | + | ||
74 | + | ||
75 | + | ||
76 | + //allocate space on the GPU for the original image | ||
77 | + T* gpuI0; | ||
78 | + cudaMalloc(&gpuI0, bytes); | ||
79 | + | ||
80 | + | ||
81 | + //copy the image data to the GPU | ||
82 | + cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); | ||
83 | + | ||
84 | + //allocate space on the GPU for the down sampled image | ||
85 | + T* gpuI; | ||
86 | + cudaMalloc(&gpuI, bytes_ds); | ||
87 | + | ||
88 | + //run the GPU-based version of the algorithm | ||
89 | + gpu_down_sample<T>(gpuI, gpuI0, resize, x, y); | ||
90 | + | ||
91 | + //copy the image data to the GPU | ||
92 | + cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice); | ||
93 | + | ||
94 | + cudaFree(gpuI0); | ||
95 | + cudeFree(gpuI); | ||
96 | + } | ||
97 | + | ||
98 | + } | ||
99 | +} | ||
100 | + | ||
101 | +#endif | ||
0 | \ No newline at end of file | 102 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_GAUSSIAN_BLUR_H | ||
2 | +#define STIM_CUDA_GAUSSIAN_BLUR_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <stim/cuda/devices.h> | ||
7 | +#include <stim/cuda/timer.h> | ||
8 | +#include <stim/cuda/sharedmem.cuh> | ||
9 | + | ||
10 | +#define pi 3.14159 | ||
11 | + | ||
12 | +namespace stim{ | ||
13 | + namespace cuda{ | ||
14 | + | ||
15 | + template<typename T> | ||
16 | + __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ | ||
17 | + | ||
18 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | ||
19 | + extern __shared__ T s[]; | ||
20 | + | ||
21 | + int kr = sigma * 4; //calculate the kernel radius | ||
22 | + | ||
23 | + //get a pointer to the gaussian in memory | ||
24 | + T* g = (T*)&s[blockDim.x + 2 * kr]; | ||
25 | + | ||
26 | + //calculate the start point for this block | ||
27 | + int bxi = blockIdx.x * blockDim.x; | ||
28 | + int byi = blockIdx.y; | ||
29 | + | ||
30 | + //copy the portion of the image necessary for this block to shared memory | ||
31 | + stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); | ||
32 | + | ||
33 | + //calculate the thread index and block index | ||
34 | + int ti = threadIdx.x; | ||
35 | + | ||
36 | + //calculate the spatial coordinate for this thread | ||
37 | + int xi = bxi + ti; | ||
38 | + | ||
39 | + //pre-compute the gaussian values for each kernel point | ||
40 | + T a = 1.0 / (sigma * sqrt(2 * pi)); | ||
41 | + T c = - 1.0 / (2*sigma*sigma); | ||
42 | + int ki; | ||
43 | + | ||
44 | + //use the first 2kr+1 threads to evaluate a gaussian and store the result | ||
45 | + if(ti <= 2* kr+1){ | ||
46 | + ki = ti - kr; | ||
47 | + g[ti] = a * exp((ki*ki) * c); | ||
48 | + } | ||
49 | + | ||
50 | + //make sure that all writing to shared memory is done before continuing | ||
51 | + __syncthreads(); | ||
52 | + | ||
53 | + //if the current pixel is outside of the image | ||
54 | + if(bxi + ti > x || byi > y) | ||
55 | + return; | ||
56 | + | ||
57 | + | ||
58 | + | ||
59 | + //calculate the coordinates of the current thread in shared memory | ||
60 | + int si = ti + kr; | ||
61 | + | ||
62 | + T sum = 0; //running weighted sum across the kernel | ||
63 | + | ||
64 | + | ||
65 | + //for each element of the kernel | ||
66 | + for(int ki = -kr; ki <= kr; ki++){ | ||
67 | + sum += g[ki + kr] * s[si + ki]; | ||
68 | + } | ||
69 | + | ||
70 | + //calculate the 1D image index for this thread | ||
71 | + unsigned int i = byi * x + xi; | ||
72 | + | ||
73 | + //output the result to global memory | ||
74 | + out[i] = sum; | ||
75 | + } | ||
76 | + | ||
77 | + template<typename T> | ||
78 | + __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ | ||
79 | + | ||
80 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | ||
81 | + extern __shared__ T s[]; | ||
82 | + | ||
83 | + int kr = sigma * 4; //calculate the kernel radius | ||
84 | + | ||
85 | + //get a pointer to the gaussian in memory | ||
86 | + T* g = (T*)&s[blockDim.y + 2 * kr]; | ||
87 | + | ||
88 | + //calculate the start point for this block | ||
89 | + int bxi = blockIdx.x; | ||
90 | + int byi = blockIdx.y * blockDim.y; | ||
91 | + | ||
92 | + //copy the portion of the image necessary for this block to shared memory | ||
93 | + stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); | ||
94 | + | ||
95 | + //calculate the thread index and block index | ||
96 | + int ti = threadIdx.y; | ||
97 | + | ||
98 | + //calculate the spatial coordinate for this thread | ||
99 | + int yi = byi + ti; | ||
100 | + | ||
101 | + //pre-compute the gaussian values for each kernel point | ||
102 | + T a = 1.0 / (sigma * sqrt(2 * pi)); | ||
103 | + T c = - 1.0 / (2*sigma*sigma); | ||
104 | + int ki; | ||
105 | + | ||
106 | + //use the first 2kr+1 threads to evaluate a gaussian and store the result | ||
107 | + if(ti <= 2* kr+1){ | ||
108 | + ki = ti - kr; | ||
109 | + g[ti] = a * exp((ki*ki) * c); | ||
110 | + } | ||
111 | + | ||
112 | + //make sure that all writing to shared memory is done before continuing | ||
113 | + __syncthreads(); | ||
114 | + | ||
115 | + //if the current pixel is outside of the image | ||
116 | + if(bxi >= x || yi >= y) | ||
117 | + return; | ||
118 | + | ||
119 | + | ||
120 | + | ||
121 | + //calculate the coordinates of the current thread in shared memory | ||
122 | + int si = ti + kr; | ||
123 | + | ||
124 | + T sum = 0; //running weighted sum across the kernel | ||
125 | + | ||
126 | + | ||
127 | + //for each element of the kernel | ||
128 | + for(int ki = -kr; ki <= kr; ki++){ | ||
129 | + sum += g[ki + kr] * s[si + ki]; | ||
130 | + } | ||
131 | + | ||
132 | + //calculate the 1D image index for this thread | ||
133 | + unsigned int i = yi * x + bxi; | ||
134 | + | ||
135 | + //output the result to global memory | ||
136 | + out[i] = sum; | ||
137 | + } | ||
138 | + | ||
139 | + /// Applies a Gaussian blur to a 2D image stored on the GPU | ||
140 | + template<typename T> | ||
141 | + void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ | ||
142 | + | ||
143 | + //get the number of pixels in the image | ||
144 | + unsigned int pixels = x * y; | ||
145 | + unsigned int bytes = sizeof(T) * pixels; | ||
146 | + | ||
147 | + // Allocate CUDA array in device memory | ||
148 | + | ||
149 | + //define a channel descriptor for a single 32-bit channel | ||
150 | + cudaChannelFormatDesc channelDesc = | ||
151 | + cudaCreateChannelDesc(32, 0, 0, 0, | ||
152 | + cudaChannelFormatKindFloat); | ||
153 | + cudaArray* cuArray; //declare the cuda array | ||
154 | + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array | ||
155 | + | ||
156 | + // Copy the image data from global memory to the array | ||
157 | + cudaMemcpyToArray(cuArray, 0, 0, image, bytes, | ||
158 | + cudaMemcpyDeviceToDevice); | ||
159 | + | ||
160 | + // Specify texture | ||
161 | + struct cudaResourceDesc resDesc; //create a resource descriptor | ||
162 | + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | ||
163 | + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | ||
164 | + resDesc.res.array.array = cuArray; //add a pointer to the cuda array | ||
165 | + | ||
166 | + // Specify texture object parameters | ||
167 | + struct cudaTextureDesc texDesc; //create a texture descriptor | ||
168 | + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | ||
169 | + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) | ||
170 | + texDesc.addressMode[1] = cudaAddressModeWrap; | ||
171 | + texDesc.filterMode = cudaFilterModePoint; //use linear filtering | ||
172 | + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | ||
173 | + texDesc.normalizedCoords = 0; //not using normalized coordinates | ||
174 | + | ||
175 | + // Create texture object | ||
176 | + cudaTextureObject_t texObj = 0; | ||
177 | + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | ||
178 | + | ||
179 | + | ||
180 | + //get the maximum number of threads per block for the CUDA device | ||
181 | + int max_threads = stim::maxThreadsPerBlock(); | ||
182 | + dim3 threads(max_threads, 1); | ||
183 | + | ||
184 | + //calculate the number of blocks | ||
185 | + dim3 blocks(x / threads.x + 1, y); | ||
186 | + | ||
187 | + //calculate the shared memory used in the kernel | ||
188 | + unsigned int pixel_bytes = max_threads * 4; //bytes devoted to pixel data being processed | ||
189 | + unsigned int apron_bytes = sigma * 8 * 4; //bytes devoted to pixels outside the window | ||
190 | + unsigned int gaussian_bytes = (sigma * 8 + 1) * 4; //bytes devoted to memory used to store the pre-computed Gaussian window | ||
191 | + unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes shared memory used | ||
192 | + | ||
193 | + //blur the image along the x-axis | ||
194 | + gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); | ||
195 | + | ||
196 | + // Copy the x-blurred data from global memory to the texture | ||
197 | + cudaMemcpyToArray(cuArray, 0, 0, image, bytes, | ||
198 | + cudaMemcpyDeviceToDevice); | ||
199 | + | ||
200 | + //transpose the block and thread dimensions | ||
201 | + threads.x = 1; | ||
202 | + threads.y = max_threads; | ||
203 | + blocks.x = x; | ||
204 | + blocks.y = y / threads.y + 1; | ||
205 | + | ||
206 | + //blur the image along the y-axis | ||
207 | + gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); | ||
208 | + | ||
209 | + //free allocated memory | ||
210 | + cudaFree(cuArray); | ||
211 | + | ||
212 | + } | ||
213 | + | ||
214 | + /// Applies a Gaussian blur to a 2D image stored on the CPU | ||
215 | + template<typename T> | ||
216 | + void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ | ||
217 | + | ||
218 | + //get the number of pixels in the image | ||
219 | + unsigned int pixels = x * y; | ||
220 | + unsigned int bytes = sizeof(T) * pixels; | ||
221 | + | ||
222 | + //allocate space on the GPU | ||
223 | + T* gpuI0; | ||
224 | + cudaMalloc(&gpuI0, bytes); | ||
225 | + | ||
226 | + | ||
227 | + //copy the image data to the GPU | ||
228 | + cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); | ||
229 | + | ||
230 | + //run the GPU-based version of the algorithm | ||
231 | + gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y); | ||
232 | + | ||
233 | + //copy the image data from the device | ||
234 | + cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); | ||
235 | + | ||
236 | + //free allocated memory | ||
237 | + cudaFree(gpuI0); | ||
238 | + } | ||
239 | + | ||
240 | + } | ||
241 | +} | ||
242 | + | ||
243 | +#endif | ||
0 | \ No newline at end of file | 244 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_GRADIENT_H | ||
2 | +#define STIM_CUDA_GRADIENT_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <stim/cuda/devices.h> | ||
7 | +#include <stim/cuda/error.h> | ||
8 | + | ||
9 | +namespace stim{ | ||
10 | + namespace cuda{ | ||
11 | + | ||
12 | + template<typename T> | ||
13 | + __global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){ | ||
14 | + | ||
15 | + //calculate the 1D image index for this thread | ||
16 | + int i = blockIdx.x * blockDim.x + threadIdx.x; | ||
17 | + | ||
18 | + //convert this to 2D pixel coordinates | ||
19 | + int yi = i / x; | ||
20 | + int xi = i - (yi * x); | ||
21 | + | ||
22 | + //return if the pixel is outside of the image | ||
23 | + if(xi >= x || yi >= y) return; | ||
24 | + | ||
25 | + //calculate indices for the forward difference | ||
26 | + int i_xp = yi * x + (xi + 1); | ||
27 | + int i_yp = (yi + 1) * x + xi; | ||
28 | + | ||
29 | + //use forward differences if a coordinate is zero | ||
30 | + if(xi == 0) | ||
31 | + out[i * 2 + 0] = in[i_xp] - in[i]; | ||
32 | + if(yi == 0) | ||
33 | + out[i * 2 + 1] = in[i_yp] - in[i]; | ||
34 | + | ||
35 | + //calculate indices for the backward difference | ||
36 | + int i_xn = yi * x + (xi - 1); | ||
37 | + int i_yn = (yi - 1) * x + xi; | ||
38 | + | ||
39 | + //use backward differences if the coordinate is at the maximum edge | ||
40 | + if(xi == x-1) | ||
41 | + out[i * 2 + 0] = in[i] - in[i_xn]; | ||
42 | + if(yi == y-1) | ||
43 | + out[i * 2 + 1] = in[i] - in[i_yn]; | ||
44 | + | ||
45 | + //otherwise use central differences | ||
46 | + if(xi > 0 && xi < x-1) | ||
47 | + out[i * 2 + 0] = (in[i_xp] - in[i_xn]) / 2; | ||
48 | + | ||
49 | + if(yi > 0 && yi < y-1) | ||
50 | + out[i * 2 + 1] = (in[i_yp] - in[i_yn]) / 2; | ||
51 | + | ||
52 | + } | ||
53 | + | ||
54 | + template<typename T> | ||
55 | + //void gpu_gradient_2d(T* gpuOut, T* gpuIn, unsigned int x, unsigned int y){ | ||
56 | + void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ | ||
57 | + | ||
58 | + //get the number of pixels in the image | ||
59 | + unsigned int pixels = x * y; | ||
60 | + | ||
61 | + //allocate space on the GPU for the input image | ||
62 | + //T* gpuI; | ||
63 | + //HANDLE_ERROR(cudaMalloc(&gpuI, bytes)); | ||
64 | + | ||
65 | + //cudaMemcpy(gpuI, gpuI0, bytes, cudaMemcpyDeviceToDevice); | ||
66 | + | ||
67 | + | ||
68 | + //allocate space on the GPU for the output gradient image | ||
69 | + //T* gpuGrad; | ||
70 | + //cudaMalloc(&gpuGrad, bytes * 2); //the output image will have two channels (x, y) | ||
71 | + | ||
72 | + //get the maximum number of threads per block for the CUDA device | ||
73 | + int threads = stim::maxThreadsPerBlock(); | ||
74 | + | ||
75 | + //calculate the number of blocks | ||
76 | + int blocks = pixels / threads + (pixels%threads == 0 ? 0:1); | ||
77 | + | ||
78 | + //call the GPU kernel to determine the gradient | ||
79 | + gradient_2d<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y); | ||
80 | + | ||
81 | + } | ||
82 | + | ||
83 | + template<typename T> | ||
84 | + void cpu_gradient_2d(T* out, T* in, unsigned int x, unsigned int y){ | ||
85 | + | ||
86 | + //get the number of pixels in the image | ||
87 | + unsigned int pixels = x * y; | ||
88 | + unsigned int bytes = pixels * sizeof(T); | ||
89 | + | ||
90 | + //allocate space on the GPU for the input image | ||
91 | + T* gpuIn; | ||
92 | + HANDLE_ERROR(cudaMalloc(&gpuIn, bytes)); | ||
93 | + | ||
94 | + //copy the image data to the GPU | ||
95 | + HANDLE_ERROR(cudaMemcpy(gpuIn, in, bytes, cudaMemcpyHostToDevice)); | ||
96 | + | ||
97 | + //allocate space on the GPU for the output gradient image | ||
98 | + T* gpuOut; | ||
99 | + cudaMalloc(&gpuOut, bytes * 2); //the output image will have two channels (x, y) | ||
100 | + | ||
101 | + //call the GPU version of this function | ||
102 | + gpu_gradient_2d(gpuOut, gpuIn, x, y); | ||
103 | + | ||
104 | + //copy the results to the CPU | ||
105 | + cudaMemcpy(out, gpuOut, bytes * 2, cudaMemcpyDeviceToHost); | ||
106 | + | ||
107 | + //free allocated memory | ||
108 | + cudaFree(gpuOut); | ||
109 | + } | ||
110 | + | ||
111 | + } | ||
112 | +} | ||
113 | + | ||
114 | + | ||
115 | +#endif | ||
0 | \ No newline at end of file | 116 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_LOCAL_MAX_H | ||
2 | +#define STIM_CUDA_LOCAL_MAX_H | ||
3 | + | ||
4 | + | ||
5 | +# include <iostream> | ||
6 | +# include <cuda.h> | ||
7 | +# include <stim/cuda/devices.h> | ||
8 | +# include <stim/cuda/error.h> | ||
9 | + | ||
10 | +namespace stim{ | ||
11 | + namespace cuda{ | ||
12 | + | ||
13 | + | ||
14 | + // this kernel calculates the local maximum for finding the cell centers | ||
15 | + template<typename T> | ||
16 | + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ | ||
17 | + | ||
18 | + // calculate the 2D coordinates for this current thread. | ||
19 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
20 | + int yi = blockIdx.y; | ||
21 | + // convert 2D coordinates to 1D | ||
22 | + int i = yi * x + xi; | ||
23 | + | ||
24 | + | ||
25 | + | ||
26 | + //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. | ||
27 | + int xl = xi - conn; | ||
28 | + int yl = yi - conn; | ||
29 | + | ||
30 | + // use zero for the lowest limits if the xi or yi is less than conn. | ||
31 | + if (xi <= conn) | ||
32 | + xl = 0; | ||
33 | + if (yi <= conn) | ||
34 | + yl = 0; | ||
35 | + | ||
36 | + //calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. | ||
37 | + int xh = xi + conn; | ||
38 | + int yh = yi + conn; | ||
39 | + | ||
40 | + // use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn. | ||
41 | + if (xi >= x - conn) | ||
42 | + xh = x; | ||
43 | + if (yi>= y - conn) | ||
44 | + yh = y; | ||
45 | + | ||
46 | + // calculate the limits for finding the local maximum location in the connected neighbors for the current pixel | ||
47 | + int n_l = yl * x + xl; | ||
48 | + int n_h = yh * x + xh; | ||
49 | + | ||
50 | + //initial the centers image to zero | ||
51 | + gpuCenters[i] = 0; | ||
52 | + | ||
53 | + | ||
54 | + int n = n_l; | ||
55 | + | ||
56 | + float l_value = 0; | ||
57 | + | ||
58 | + if (i < x * y) | ||
59 | + | ||
60 | + // check if the vote value for this pixel is greater than threshold, so this pixel may be a local max. | ||
61 | + if (gpuVote[i]>final_t){ | ||
62 | + | ||
63 | + // compare the vote value for this pixel with the vote values with its neighbors. | ||
64 | + while (n<=n_h){ | ||
65 | + | ||
66 | + // check if this vote value is a local max in its neighborhood. | ||
67 | + if (gpuVote[i] < gpuVote[n]){ | ||
68 | + l_value = 0; | ||
69 | + n =n_h+1; | ||
70 | + } | ||
71 | + else if (n == n_h){ | ||
72 | + l_value = 1; | ||
73 | + n = n+1; | ||
74 | + } | ||
75 | + // check if the current neighbor is the last one at the current row | ||
76 | + else if ((n - n_l - 2*conn)% x ==0){ | ||
77 | + n = n + x - 2*conn -1; | ||
78 | + n ++; | ||
79 | + } | ||
80 | + else | ||
81 | + n ++; | ||
82 | + } | ||
83 | + // set the center value for this pixel to high if it's a local max ,and to low if not. | ||
84 | + gpuCenters[i] = l_value ; | ||
85 | + } | ||
86 | + | ||
87 | + } | ||
88 | + | ||
89 | + | ||
90 | + | ||
91 | + template<typename T> | ||
92 | + void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ | ||
93 | + | ||
94 | + | ||
95 | + | ||
96 | + | ||
97 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
98 | + dim3 threads(max_threads, 1); | ||
99 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | ||
100 | + | ||
101 | + | ||
102 | + | ||
103 | + //call the kernel to find the local maximum. | ||
104 | + cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); | ||
105 | + | ||
106 | + | ||
107 | + } | ||
108 | + | ||
109 | + | ||
110 | + | ||
111 | + template<typename T> | ||
112 | + void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ | ||
113 | + | ||
114 | + | ||
115 | + //calculate the number of bytes in the array | ||
116 | + unsigned int bytes = x * y * sizeof(T); | ||
117 | + | ||
118 | + // allocate space on the GPU for the detected cell centes | ||
119 | + T* gpuCenters; | ||
120 | + cudaMalloc(&gpuCenters, bytes); | ||
121 | + | ||
122 | + | ||
123 | + //allocate space on the GPU for the input Vote Image | ||
124 | + T* gpuVote; | ||
125 | + cudaMalloc(&gpuVote, bytes); | ||
126 | + | ||
127 | + | ||
128 | + //copy the Vote image data to the GPU | ||
129 | + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); | ||
130 | + | ||
131 | + | ||
132 | + //call the GPU version of the local max function | ||
133 | + gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y); | ||
134 | + | ||
135 | + | ||
136 | + //copy the cell centers data to the CPU | ||
137 | + cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ; | ||
138 | + | ||
139 | + | ||
140 | + //free allocated memory | ||
141 | + cudaFree(gpuCenters); | ||
142 | + cudaFree(gpuVote); | ||
143 | + cudaFree(gpuGrad); | ||
144 | + } | ||
145 | + | ||
146 | + } | ||
147 | +} | ||
148 | + | ||
149 | + | ||
150 | + | ||
151 | +#endif | ||
0 | \ No newline at end of file | 152 | \ No newline at end of file |
1 | + | ||
2 | +#ifndef STIM_CUDA_SHAREDMEM_H | ||
3 | +#define STIM_CUDA_SHAREDMEM_H | ||
4 | + | ||
5 | +namespace stim{ | ||
6 | + namespace cuda{ | ||
7 | + | ||
8 | + // Copies values from global memory to shared memory, optimizing threads | ||
9 | + template<typename T> | ||
10 | + __device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src, | ||
11 | + unsigned int x, unsigned int y, unsigned int X, unsigned int Y, | ||
12 | + dim3 threadIdx, dim3 blockDim){ | ||
13 | + | ||
14 | + //calculate the number of iterations required for the copy | ||
15 | + unsigned int xI, yI; | ||
16 | + xI = X/blockDim.x + 1; //number of iterations along X | ||
17 | + yI = Y/blockDim.y + 1; //number of iterations along Y | ||
18 | + | ||
19 | + //for each iteration | ||
20 | + for(unsigned int xi = 0; xi < xI; xi++){ | ||
21 | + for(unsigned int yi = 0; yi < yI; yi++){ | ||
22 | + | ||
23 | + //calculate the index into shared memory | ||
24 | + unsigned int sx = xi * blockDim.x + threadIdx.x; | ||
25 | + unsigned int sy = yi * blockDim.y + threadIdx.y; | ||
26 | + | ||
27 | + //calculate the index into the texture | ||
28 | + unsigned int tx = x + sx; | ||
29 | + unsigned int ty = y + sy; | ||
30 | + | ||
31 | + //perform the copy | ||
32 | + if(sx < X && sy < Y) | ||
33 | + dest[sy * X + sx] = tex2D<T>(src, tx, ty); | ||
34 | + } | ||
35 | + } | ||
36 | + } | ||
37 | + | ||
38 | + } | ||
39 | +} | ||
40 | + | ||
41 | + | ||
42 | +#endif | ||
0 | \ No newline at end of file | 43 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_UPDATE_DIR_H | ||
2 | +#define STIM_CUDA_UPDATE_DIR_H | ||
3 | + | ||
4 | + | ||
5 | +# include <iostream> | ||
6 | +# include <cuda.h> | ||
7 | +# include <stim/cuda/devices.h> | ||
8 | +# include <stim/cuda/error.h> | ||
9 | +#include <stim/cuda/sharedmem.cuh> | ||
10 | + | ||
11 | +namespace stim{ | ||
12 | + namespace cuda{ | ||
13 | + | ||
14 | + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. | ||
15 | + template<typename T> | ||
16 | + __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
17 | + | ||
18 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | ||
19 | + extern __shared__ float s_vote[]; | ||
20 | + | ||
21 | + //calculate the start point for this block | ||
22 | + int bxi = blockIdx.x * blockDim.x; | ||
23 | + | ||
24 | + //calculate the width of the shared memory block | ||
25 | + int swidth = 2 * rmax + blockDim.x; | ||
26 | + | ||
27 | + // calculate the 2D coordinates for this current thread. | ||
28 | + int xi = bxi + threadIdx.x; | ||
29 | + int yi = blockIdx.y; | ||
30 | + | ||
31 | + // convert 2D coordinates to 1D | ||
32 | + int i = yi * x + xi; | ||
33 | + | ||
34 | + // calculate the voting direction based on the grtadient direction | ||
35 | + float theta = gpuGrad[2*i]; | ||
36 | + | ||
37 | + //initialize the vote direction to zero | ||
38 | + gpuDir[i] = 0; | ||
39 | + | ||
40 | + // define a local variable to maximum value of the vote image in the voting area for this voter | ||
41 | + float max = 0; | ||
42 | + | ||
43 | + // define two local variables for the x and y coordinations where the maximum happened | ||
44 | + int id_x = 0; | ||
45 | + int id_y = 0; | ||
46 | + | ||
47 | + // compute the size of window which will be checked for finding the voting area for this voter | ||
48 | + unsigned int x_table = 2*rmax +1; | ||
49 | + unsigned int rmax_sq = rmax * rmax; | ||
50 | + int r = (int)rmax; | ||
51 | + int tx_rmax = threadIdx.x + rmax; | ||
52 | + int bxs = bxi - rmax; | ||
53 | + | ||
54 | + for(int yr = -r; yr <= r; yr++){ | ||
55 | + | ||
56 | + //copy the portion of the image necessary for this block to shared memory | ||
57 | + __syncthreads(); | ||
58 | + stim::cuda::sharedMemcpy_tex2D<float>(s_vote, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim); | ||
59 | + __syncthreads(); | ||
60 | + | ||
61 | + //if the current thread is outside of the image, it doesn't have to be computed | ||
62 | + if(xi < x && yi < y){ | ||
63 | + | ||
64 | + for(int xr = -r; xr <= r; xr++){ | ||
65 | + | ||
66 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | ||
67 | + | ||
68 | + // calculate the angle between the voter and the current pixel in x and y directions | ||
69 | + float atan_angle = gpuTable[ind_t]; | ||
70 | + | ||
71 | + | ||
72 | + // calculate the voting direction based on the grtadient direction | ||
73 | + int idx_share_update = xr + tx_rmax ; | ||
74 | + float share_vote = s_vote[idx_share_update]; | ||
75 | + | ||
76 | + // check if the current pixel is located in the voting area of this voter. | ||
77 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | ||
78 | + | ||
79 | + // compare the vote value of this pixel with the max value to find the maxima and its index. | ||
80 | + if (share_vote>max) { | ||
81 | + | ||
82 | + max = share_vote; | ||
83 | + id_x = xr; | ||
84 | + id_y = yr; | ||
85 | + } | ||
86 | + } | ||
87 | + } | ||
88 | + } | ||
89 | + } | ||
90 | + | ||
91 | + | ||
92 | + //float new_angle = atan2(dy, dx); | ||
93 | + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | ||
94 | + | ||
95 | + float new_angle = gpuTable[ind_m]; | ||
96 | + | ||
97 | + gpuDir[i] = new_angle; | ||
98 | + | ||
99 | + } | ||
100 | + | ||
101 | + // this kernel updates the gradient direction by the calculated voting direction. | ||
102 | + template<typename T> | ||
103 | + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){ | ||
104 | + | ||
105 | + //************ when the number of threads are (1024,1) ************* | ||
106 | + | ||
107 | + // calculate the 2D coordinates for this current thread. | ||
108 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
109 | + int yi = blockIdx.y; | ||
110 | + // convert 2D coordinates to 1D | ||
111 | + int i = yi * x + xi; | ||
112 | + | ||
113 | + | ||
114 | + //update the gradient image with the vote direction | ||
115 | + gpuGrad[2*i] = gpuDir[i]; | ||
116 | + } | ||
117 | + | ||
118 | + | ||
119 | + template<typename T> | ||
120 | + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
121 | + | ||
122 | + //get the number of pixels in the image | ||
123 | + unsigned int pixels = x * y; | ||
124 | + unsigned int bytes = sizeof(T) * pixels; | ||
125 | + | ||
126 | + | ||
127 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
128 | + dim3 threads(max_threads, 1); | ||
129 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | ||
130 | + | ||
131 | + // Allocate CUDA array in device memory | ||
132 | + | ||
133 | + //define a channel descriptor for a single 32-bit channel | ||
134 | + cudaChannelFormatDesc channelDesc = | ||
135 | + cudaCreateChannelDesc(32, 0, 0, 0, | ||
136 | + cudaChannelFormatKindFloat); | ||
137 | + cudaArray* cuArray; //declare the cuda array | ||
138 | + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array | ||
139 | + | ||
140 | + // Copy the image data from global memory to the array | ||
141 | + cudaMemcpyToArray(cuArray, 0, 0, gpuVote, bytes, | ||
142 | + cudaMemcpyDeviceToDevice); | ||
143 | + | ||
144 | + // Specify texture | ||
145 | + struct cudaResourceDesc resDesc; //create a resource descriptor | ||
146 | + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | ||
147 | + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | ||
148 | + resDesc.res.array.array = cuArray; //add a pointer to the cuda array | ||
149 | + | ||
150 | + // Specify texture object parameters | ||
151 | + struct cudaTextureDesc texDesc; //create a texture descriptor | ||
152 | + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | ||
153 | + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) | ||
154 | + texDesc.addressMode[1] = cudaAddressModeWrap; | ||
155 | + texDesc.filterMode = cudaFilterModePoint; //use linear filtering | ||
156 | + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | ||
157 | + texDesc.normalizedCoords = 0; //not using normalized coordinates | ||
158 | + | ||
159 | + // Create texture object | ||
160 | + cudaTextureObject_t texObj = 0; | ||
161 | + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | ||
162 | + | ||
163 | + // specify share memory | ||
164 | + unsigned int share_bytes = (2*rmax + threads.x)*(1)*4; | ||
165 | + | ||
166 | + // allocate space on the GPU for the updated vote direction | ||
167 | + T* gpuDir; | ||
168 | + cudaMalloc(&gpuDir, bytes); | ||
169 | + | ||
170 | + //call the kernel to calculate the new voting direction | ||
171 | + cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, texObj, gpuGrad, gpuTable, phi, rmax, x , y); | ||
172 | + | ||
173 | + //call the kernel to update the gradient direction | ||
174 | + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | ||
175 | + | ||
176 | + | ||
177 | + //free allocated memory | ||
178 | + cudaFree(gpuDir); | ||
179 | + | ||
180 | + } | ||
181 | + | ||
182 | + | ||
183 | + template<typename T> | ||
184 | + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
185 | + | ||
186 | + //calculate the number of bytes in the array | ||
187 | + unsigned int bytes = x * y * sizeof(T); | ||
188 | + | ||
189 | + //calculate the number of bytes in the atan2 table | ||
190 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | ||
191 | + | ||
192 | + //allocate space on the GPU for the Vote Image | ||
193 | + T* gpuVote; | ||
194 | + cudaMalloc(&gpuVote, bytes); | ||
195 | + | ||
196 | + //copy the input vote image to the GPU | ||
197 | + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); | ||
198 | + | ||
199 | + //allocate space on the GPU for the input Gradient image | ||
200 | + T* gpuGrad; | ||
201 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | ||
202 | + | ||
203 | + //copy the Gradient data to the GPU | ||
204 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | ||
205 | + | ||
206 | + //allocate space on the GPU for the atan2 table | ||
207 | + T* gpuTable; | ||
208 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | ||
209 | + | ||
210 | + //copy the atan2 values to the GPU | ||
211 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | ||
212 | + | ||
213 | + | ||
214 | + //call the GPU version of the update direction function | ||
215 | + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
216 | + | ||
217 | + | ||
218 | + //copy the new gradient image back to the CPU | ||
219 | + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; | ||
220 | + | ||
221 | + //free allocated memory | ||
222 | + cudaFree(gpuTable); | ||
223 | + cudaFree(gpuVote); | ||
224 | + cudaFree(gpuGrad); | ||
225 | + } | ||
226 | + | ||
227 | + } | ||
228 | +} | ||
229 | + | ||
230 | + | ||
231 | + | ||
232 | +#endif | ||
0 | \ No newline at end of file | 233 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_VOTE_H | ||
2 | +#define STIM_CUDA_VOTE_H | ||
3 | + | ||
4 | + | ||
5 | +# include <iostream> | ||
6 | +# include <cuda.h> | ||
7 | +# include <stim/cuda/devices.h> | ||
8 | +# include <stim/cuda/error.h> | ||
9 | +#include <stim/cuda/sharedmem.cuh> | ||
10 | + | ||
11 | + | ||
12 | +namespace stim{ | ||
13 | + namespace cuda{ | ||
14 | + | ||
15 | + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | ||
16 | + template<typename T> | ||
17 | + __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
18 | + | ||
19 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | ||
20 | + extern __shared__ float2 s_grad[]; | ||
21 | + | ||
22 | + //calculate the start point for this block | ||
23 | + int bxi = blockIdx.x * blockDim.x; | ||
24 | + | ||
25 | + //calculate the width of the shared memory block | ||
26 | + int swidth = 2 * rmax + blockDim.x; | ||
27 | + | ||
28 | + // calculate the 2D coordinates for this current thread. | ||
29 | + int xi = bxi + threadIdx.x; | ||
30 | + int yi = blockIdx.y; | ||
31 | + // convert 2D coordinates to 1D | ||
32 | + int i = yi * x + xi; | ||
33 | + | ||
34 | + | ||
35 | + // define a local variable to sum the votes from the voters | ||
36 | + float sum = 0; | ||
37 | + | ||
38 | + // compute the size of window which will be checked for finding the proper voters for this pixel | ||
39 | + unsigned int x_table = 2*rmax +1; | ||
40 | + | ||
41 | + unsigned int rmax_sq = rmax * rmax; | ||
42 | + int r = (int)rmax; | ||
43 | + int tx_rmax = threadIdx.x + rmax; | ||
44 | + int bxs = bxi - rmax; | ||
45 | + | ||
46 | + | ||
47 | + for(int yr = -r; yr <= r; yr++){ | ||
48 | + | ||
49 | + //copy the portion of the image necessary for this block to shared memory | ||
50 | + __syncthreads(); | ||
51 | + stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim); | ||
52 | + __syncthreads(); | ||
53 | + | ||
54 | + //if the current thread is outside of the image, it doesn't have to be computed | ||
55 | + if(xi < x && yi < y){ | ||
56 | + | ||
57 | + for(int xr = -r; xr <= r; xr++){ | ||
58 | + | ||
59 | + //find the location of this voter in the atan2 table | ||
60 | + unsigned int id_t = (yr + rmax) * x_table + xr + rmax; | ||
61 | + | ||
62 | + // calculate the angle between the pixel and the current voter in x and y directions | ||
63 | + float atan_angle = gpuTable[id_t]; | ||
64 | + | ||
65 | + | ||
66 | + // calculate the voting direction based on the grtadient direction | ||
67 | + int idx_share = xr + tx_rmax ; | ||
68 | + float2 g = s_grad[idx_share]; | ||
69 | + float theta = g.x; | ||
70 | + | ||
71 | + // check if the current voter is located in the voting area of this pixel. | ||
72 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | ||
73 | + sum += g.y; | ||
74 | + | ||
75 | + } | ||
76 | + | ||
77 | + } | ||
78 | + } | ||
79 | + } | ||
80 | + | ||
81 | + gpuVote[i] = sum; | ||
82 | + } | ||
83 | + | ||
84 | + template<typename T> | ||
85 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
86 | + | ||
87 | + //get the number of pixels in the image | ||
88 | + unsigned int pixels = x * y; | ||
89 | + unsigned int bytes = sizeof(T) * pixels; | ||
90 | + | ||
91 | + | ||
92 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
93 | + //unsigned int thread_dim = sqrt(max_threads); | ||
94 | + dim3 threads(max_threads, 1); | ||
95 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | ||
96 | + | ||
97 | + // Allocate CUDA array in device memory | ||
98 | + | ||
99 | + //define a channel descriptor for a single 32-bit channel | ||
100 | + cudaChannelFormatDesc channelDesc = | ||
101 | + cudaCreateChannelDesc(32, 32, 0, 0, | ||
102 | + cudaChannelFormatKindFloat); | ||
103 | + cudaArray* cuArray; //declare the cuda array | ||
104 | + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array | ||
105 | + | ||
106 | + // Copy the image data from global memory to the array | ||
107 | + cudaMemcpyToArray(cuArray, 0, 0, gpuGrad, bytes*2, | ||
108 | + cudaMemcpyDeviceToDevice); | ||
109 | + | ||
110 | + // Specify texture | ||
111 | + struct cudaResourceDesc resDesc; //create a resource descriptor | ||
112 | + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | ||
113 | + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | ||
114 | + resDesc.res.array.array = cuArray; //add a pointer to the cuda array | ||
115 | + | ||
116 | + // Specify texture object parameters | ||
117 | + struct cudaTextureDesc texDesc; //create a texture descriptor | ||
118 | + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | ||
119 | + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) | ||
120 | + texDesc.addressMode[1] = cudaAddressModeWrap; | ||
121 | + texDesc.filterMode = cudaFilterModePoint; //use linear filtering | ||
122 | + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | ||
123 | + texDesc.normalizedCoords = 0; //not using normalized coordinates | ||
124 | + | ||
125 | + // Create texture object | ||
126 | + cudaTextureObject_t texObj = 0; | ||
127 | + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | ||
128 | + | ||
129 | + // specify share memory | ||
130 | + unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4; | ||
131 | + | ||
132 | + | ||
133 | + //call the kernel to do the voting | ||
134 | + | ||
135 | + cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y); | ||
136 | + | ||
137 | + } | ||
138 | + | ||
139 | + | ||
140 | + template<typename T> | ||
141 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
142 | + | ||
143 | + //calculate the number of bytes in the array | ||
144 | + unsigned int bytes = x * y * sizeof(T); | ||
145 | + | ||
146 | + //calculate the number of bytes in the atan2 table | ||
147 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | ||
148 | + | ||
149 | + //allocate space on the GPU for the Vote Image | ||
150 | + T* gpuVote; | ||
151 | + cudaMalloc(&gpuVote, bytes); | ||
152 | + | ||
153 | + //allocate space on the GPU for the input Gradient image | ||
154 | + T* gpuGrad; | ||
155 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | ||
156 | + | ||
157 | + //copy the Gradient Magnitude data to the GPU | ||
158 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | ||
159 | + | ||
160 | + //allocate space on the GPU for the atan2 table | ||
161 | + T* gpuTable; | ||
162 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | ||
163 | + | ||
164 | + //copy the atan2 values to the GPU | ||
165 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | ||
166 | + | ||
167 | + //cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table ); | ||
168 | + | ||
169 | + | ||
170 | + //call the GPU version of the vote calculation function | ||
171 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
172 | + | ||
173 | + | ||
174 | + //copy the Vote Data back to the CPU | ||
175 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | ||
176 | + | ||
177 | + //free allocated memory | ||
178 | + cudaFree(gpuTable); | ||
179 | + cudaFree(gpuVote); | ||
180 | + cudaFree(gpuGrad); | ||
181 | + } | ||
182 | + | ||
183 | + } | ||
184 | +} | ||
185 | + | ||
186 | + | ||
187 | + | ||
188 | +#endif | ||
0 | \ No newline at end of file | 189 | \ No newline at end of file |
stim/parser/filename.h
@@ -19,8 +19,9 @@ | @@ -19,8 +19,9 @@ | ||
19 | #include <algorithm> | 19 | #include <algorithm> |
20 | 20 | ||
21 | #include "../parser/parser.h" | 21 | #include "../parser/parser.h" |
22 | - | 22 | +#ifdef BOOST_PRECOMPILED |
23 | #include <boost/filesystem.hpp> | 23 | #include <boost/filesystem.hpp> |
24 | +#endif | ||
24 | 25 | ||
25 | namespace stim{ | 26 | namespace stim{ |
26 | 27 | ||
@@ -154,6 +155,7 @@ public: | @@ -154,6 +155,7 @@ public: | ||
154 | 155 | ||
155 | return ss.str(); | 156 | return ss.str(); |
156 | } | 157 | } |
158 | +#ifdef BOOST_PRECOMPILED | ||
157 | 159 | ||
158 | //get a list of files matching the current template | 160 | //get a list of files matching the current template |
159 | std::vector<stim::filename> get_list(){ | 161 | std::vector<stim::filename> get_list(){ |
@@ -193,7 +195,7 @@ public: | @@ -193,7 +195,7 @@ public: | ||
193 | 195 | ||
194 | return file_list; | 196 | return file_list; |
195 | } | 197 | } |
196 | - | 198 | +#endif |
197 | //gets the current working directory | 199 | //gets the current working directory |
198 | static stim::filename cwd(){ | 200 | static stim::filename cwd(){ |
199 | 201 |