Commit 5cc0976c6028f692e7663810b4504bc34e9c8f10
1 parent
aa1bc80d
added separable convolution, organized the stim/cuda directory
Showing
18 changed files
with
433 additions
and
235 deletions
Show diff stats
stim/cuda/arraymath.cuh
1 | #ifndef STIM_CUDA_ARRAYMATH_H | 1 | #ifndef STIM_CUDA_ARRAYMATH_H |
2 | #define STIM_CUDA_ARRAYMATH_H | 2 | #define STIM_CUDA_ARRAYMATH_H |
3 | 3 | ||
4 | -#include <stim/cuda/array_add.cuh> | ||
5 | -#include <stim/cuda/array_multiply.cuh> | ||
6 | -#include <stim/cuda/array_abs.cuh> | ||
7 | -#include <stim/cuda/array_cart2polar.cuh> | ||
8 | -#include <stim/cuda/gaussian_blur.cuh> | ||
9 | -#include <stim/cuda/conv2.cuh> | 4 | +#include <stim/cuda/arraymath/array_add.cuh> |
5 | +#include <stim/cuda/arraymath/array_multiply.cuh> | ||
6 | +#include <stim/cuda/arraymath/array_abs.cuh> | ||
7 | +#include <stim/cuda/arraymath/array_cart2polar.cuh> | ||
10 | 8 | ||
11 | namespace stim{ | 9 | namespace stim{ |
12 | namespace cuda{ | 10 | namespace cuda{ |
stim/cuda/array_abs.cuh renamed to stim/cuda/arraymath/array_abs.cuh
stim/cuda/array_add.cuh renamed to stim/cuda/arraymath/array_add.cuh
stim/cuda/array_cart2polar.cuh renamed to stim/cuda/arraymath/array_cart2polar.cuh
stim/cuda/array_multiply.cuh renamed to stim/cuda/arraymath/array_multiply.cuh
stim/cuda/callable.h renamed to stim/cuda/cudatools/callable.h
stim/cuda/devices.h renamed to stim/cuda/cudatools/devices.h
stim/cuda/error.h renamed to stim/cuda/cudatools/error.h
stim/cuda/glbind.h renamed to stim/cuda/cudatools/glbind.h
stim/cuda/threads.h renamed to stim/cuda/cudatools/threads.h
1 | -#include "cuda_runtime.h" | 1 | +#include "cuda_runtime.h" |
2 | #include "device_launch_parameters.h" | 2 | #include "device_launch_parameters.h" |
3 | -#include "../cuda/callable.h" | 3 | +#include <stim/cuda/cudatools/callable.h> |
4 | 4 | ||
5 | #ifndef CUDA_THREADS_H | 5 | #ifndef CUDA_THREADS_H |
6 | #define CUDA_THREADS_H | 6 | #define CUDA_THREADS_H |
stim/cuda/timer.h renamed to stim/cuda/cudatools/timer.h
1 | -#ifndef STIM_CUDA_TIMER | ||
2 | -#define STIM_CUDA_TIMER | ||
3 | - | ||
4 | -static cudaEvent_t tStartEvent; | ||
5 | -static cudaEvent_t tStopEvent; | ||
6 | - | ||
7 | -namespace stim{ | ||
8 | - | ||
9 | -/// These functions calculate the time between GPU functions in milliseconds | ||
10 | -static void gpuStartTimer() | ||
11 | -{ | ||
12 | - //set up timing events | ||
13 | - cudaEventCreate(&tStartEvent); | ||
14 | - cudaEventCreate(&tStopEvent); | ||
15 | - cudaEventRecord(tStartEvent, 0); | ||
16 | -} | ||
17 | - | ||
18 | -static float gpuStopTimer() | ||
19 | -{ | ||
20 | - cudaEventRecord(tStopEvent, 0); | ||
21 | - cudaEventSynchronize(tStopEvent); | ||
22 | - float elapsedTime; | ||
23 | - cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent); | ||
24 | - cudaEventDestroy(tStartEvent); | ||
25 | - cudaEventDestroy(tStopEvent); | ||
26 | - return elapsedTime; | ||
27 | -} | ||
28 | - | ||
29 | -} //end namespace stim | ||
30 | - | 1 | +#ifndef STIM_CUDA_TIMER |
2 | +#define STIM_CUDA_TIMER | ||
3 | + | ||
4 | +static cudaEvent_t tStartEvent; | ||
5 | +static cudaEvent_t tStopEvent; | ||
6 | + | ||
7 | +namespace stim{ | ||
8 | + | ||
9 | +/// These functions calculate the time between GPU functions in milliseconds | ||
10 | +static void gpuStartTimer() | ||
11 | +{ | ||
12 | + //set up timing events | ||
13 | + cudaEventCreate(&tStartEvent); | ||
14 | + cudaEventCreate(&tStopEvent); | ||
15 | + cudaEventRecord(tStartEvent, 0); | ||
16 | +} | ||
17 | + | ||
18 | +static float gpuStopTimer() | ||
19 | +{ | ||
20 | + cudaEventRecord(tStopEvent, 0); | ||
21 | + cudaEventSynchronize(tStopEvent); | ||
22 | + float elapsedTime; | ||
23 | + cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent); | ||
24 | + cudaEventDestroy(tStartEvent); | ||
25 | + cudaEventDestroy(tStopEvent); | ||
26 | + return elapsedTime; | ||
27 | +} | ||
28 | + | ||
29 | +} //end namespace stim | ||
30 | + | ||
31 | #endif | 31 | #endif |
32 | \ No newline at end of file | 32 | \ No newline at end of file |
stim/cuda/down_sample.cuh
@@ -89,7 +89,7 @@ namespace stim{ | @@ -89,7 +89,7 @@ namespace stim{ | ||
89 | gpu_down_sample<T>(gpuI, gpuI0, resize, x, y); | 89 | gpu_down_sample<T>(gpuI, gpuI0, resize, x, y); |
90 | 90 | ||
91 | //copy the image data to the GPU | 91 | //copy the image data to the GPU |
92 | - cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice); | 92 | + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice); |
93 | 93 | ||
94 | cudaFree(gpuI0); | 94 | cudaFree(gpuI0); |
95 | cudeFree(gpuI); | 95 | cudeFree(gpuI); |
stim/cuda/gaussian_blur.cuh
@@ -3,9 +3,9 @@ | @@ -3,9 +3,9 @@ | ||
3 | 3 | ||
4 | #include <iostream> | 4 | #include <iostream> |
5 | #include <cuda.h> | 5 | #include <cuda.h> |
6 | -#include <stim/cuda/devices.h> | ||
7 | -#include <stim/cuda/timer.h> | 6 | +#include <stim/cuda/cudatools.h> |
8 | #include <stim/cuda/sharedmem.cuh> | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | +#include <stim/cuda/templates/conv2sep.cuh> //GPU-based separable convolution algorithm | ||
9 | 9 | ||
10 | #define pi 3.14159 | 10 | #define pi 3.14159 |
11 | 11 | ||
@@ -13,228 +13,74 @@ namespace stim{ | @@ -13,228 +13,74 @@ namespace stim{ | ||
13 | namespace cuda{ | 13 | namespace cuda{ |
14 | 14 | ||
15 | template<typename T> | 15 | template<typename T> |
16 | - __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ | 16 | + void gen_gaussian(T* out, T sigma, unsigned int width){ |
17 | 17 | ||
18 | - //generate a pointer to shared memory (size will be specified as a kernel parameter) | ||
19 | - extern __shared__ T s[]; | 18 | + //fill the kernel with a gaussian |
19 | + for(unsigned int xi = 0; xi < width; xi++){ | ||
20 | 20 | ||
21 | - int kr = sigma * 4; //calculate the kernel radius | ||
22 | - | ||
23 | - //get a pointer to the gaussian in memory | ||
24 | - T* g = (T*)&s[blockDim.x + 2 * kr]; | ||
25 | - | ||
26 | - //calculate the start point for this block | ||
27 | - int bxi = blockIdx.x * blockDim.x; | ||
28 | - int byi = blockIdx.y; | ||
29 | - | ||
30 | - //copy the portion of the image necessary for this block to shared memory | ||
31 | - stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); | ||
32 | - | ||
33 | - //calculate the thread index and block index | ||
34 | - int ti = threadIdx.x; | ||
35 | - | ||
36 | - //calculate the spatial coordinate for this thread | ||
37 | - int xi = bxi + ti; | ||
38 | - | ||
39 | - //pre-compute the gaussian values for each kernel point | ||
40 | - T a = 1.0 / (sigma * sqrt(2 * pi)); | ||
41 | - T c = - 1.0 / (2*sigma*sigma); | ||
42 | - int ki; | ||
43 | - | ||
44 | - //use the first 2kr+1 threads to evaluate a gaussian and store the result | ||
45 | - if(ti <= 2* kr+1){ | ||
46 | - ki = ti - kr; | ||
47 | - g[ti] = a * exp((ki*ki) * c); | 21 | + float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian |
22 | + float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) ); | ||
23 | + out[xi] = g; | ||
48 | } | 24 | } |
49 | 25 | ||
50 | - //make sure that all writing to shared memory is done before continuing | ||
51 | - __syncthreads(); | ||
52 | - | ||
53 | - //if the current pixel is outside of the image | ||
54 | - if(bxi + ti > x || byi > y) | ||
55 | - return; | ||
56 | - | ||
57 | - | ||
58 | - | ||
59 | - //calculate the coordinates of the current thread in shared memory | ||
60 | - int si = ti + kr; | ||
61 | - | ||
62 | - T sum = 0; //running weighted sum across the kernel | ||
63 | - | ||
64 | - | ||
65 | - //for each element of the kernel | ||
66 | - for(int ki = -kr; ki <= kr; ki++){ | ||
67 | - sum += g[ki + kr] * s[si + ki]; | ||
68 | - } | ||
69 | - | ||
70 | - //calculate the 1D image index for this thread | ||
71 | - unsigned int i = byi * x + xi; | ||
72 | - | ||
73 | - //output the result to global memory | ||
74 | - out[i] = sum; | ||
75 | } | 26 | } |
76 | 27 | ||
77 | template<typename T> | 28 | template<typename T> |
78 | - __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ | ||
79 | - | ||
80 | - //generate a pointer to shared memory (size will be specified as a kernel parameter) | ||
81 | - extern __shared__ T s[]; | ||
82 | - | ||
83 | - int kr = sigma * 4; //calculate the kernel radius | 29 | + void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){ |
84 | 30 | ||
85 | - //get a pointer to the gaussian in memory | ||
86 | - T* g = (T*)&s[blockDim.y + 2 * kr]; | 31 | + //allocate space for the kernel |
32 | + unsigned int kwidth = sigma * 8 + 1; | ||
33 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | ||
87 | 34 | ||
88 | - //calculate the start point for this block | ||
89 | - int bxi = blockIdx.x; | ||
90 | - int byi = blockIdx.y * blockDim.y; | 35 | + //fill the kernel with a gaussian |
36 | + gen_gaussian(kernel0, sigma, kwidth); | ||
91 | 37 | ||
92 | - //copy the portion of the image necessary for this block to shared memory | ||
93 | - stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); | 38 | + //copy the kernel to the GPU |
39 | + T* gpuKernel0; | ||
40 | + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); | ||
94 | 41 | ||
95 | - //calculate the thread index and block index | ||
96 | - int ti = threadIdx.y; | 42 | + //perform the gaussian blur as a separable convolution |
43 | + stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth); | ||
97 | 44 | ||
98 | - //calculate the spatial coordinate for this thread | ||
99 | - int yi = byi + ti; | 45 | + HANDLE_ERROR(cudaFree(gpuKernel0)); |
100 | 46 | ||
101 | - //pre-compute the gaussian values for each kernel point | ||
102 | - T a = 1.0 / (sigma * sqrt(2 * pi)); | ||
103 | - T c = - 1.0 / (2*sigma*sigma); | ||
104 | - int ki; | ||
105 | - | ||
106 | - //use the first 2kr+1 threads to evaluate a gaussian and store the result | ||
107 | - if(ti <= 2* kr+1){ | ||
108 | - ki = ti - kr; | ||
109 | - g[ti] = a * exp((ki*ki) * c); | ||
110 | - } | ||
111 | - | ||
112 | - //make sure that all writing to shared memory is done before continuing | ||
113 | - __syncthreads(); | ||
114 | - | ||
115 | - //if the current pixel is outside of the image | ||
116 | - if(bxi >= x || yi >= y) | ||
117 | - return; | ||
118 | - | ||
119 | - | ||
120 | - | ||
121 | - //calculate the coordinates of the current thread in shared memory | ||
122 | - int si = ti + kr; | ||
123 | - | ||
124 | - T sum = 0; //running weighted sum across the kernel | ||
125 | - | ||
126 | - | ||
127 | - //for each element of the kernel | ||
128 | - for(int ki = -kr; ki <= kr; ki++){ | ||
129 | - sum += g[ki + kr] * s[si + ki]; | ||
130 | - } | ||
131 | - | ||
132 | - //calculate the 1D image index for this thread | ||
133 | - unsigned int i = yi * x + bxi; | ||
134 | - | ||
135 | - //output the result to global memory | ||
136 | - out[i] = sum; | ||
137 | } | 47 | } |
138 | 48 | ||
139 | - /// Applies a Gaussian blur to a 2D image stored on the GPU | ||
140 | template<typename T> | 49 | template<typename T> |
141 | - void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ | 50 | + void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ |
142 | 51 | ||
143 | - //get the number of pixels in the image | ||
144 | - unsigned int pixels = x * y; | ||
145 | - unsigned int bytes = sizeof(T) * pixels; | 52 | + //allocate space for the kernel |
53 | + unsigned int kwidth = sigma * 8 + 1; | ||
54 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | ||
146 | 55 | ||
147 | - // Allocate CUDA array in device memory | ||
148 | - | ||
149 | - //define a channel descriptor for a single 32-bit channel | ||
150 | - cudaChannelFormatDesc channelDesc = | ||
151 | - cudaCreateChannelDesc(32, 0, 0, 0, | ||
152 | - cudaChannelFormatKindFloat); | ||
153 | - cudaArray* cuArray; //declare the cuda array | ||
154 | - cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array | ||
155 | - | ||
156 | - // Copy the image data from global memory to the array | ||
157 | - cudaMemcpyToArray(cuArray, 0, 0, image, bytes, | ||
158 | - cudaMemcpyDeviceToDevice); | ||
159 | - | ||
160 | - // Specify texture | ||
161 | - struct cudaResourceDesc resDesc; //create a resource descriptor | ||
162 | - memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | ||
163 | - resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | ||
164 | - resDesc.res.array.array = cuArray; //add a pointer to the cuda array | ||
165 | - | ||
166 | - // Specify texture object parameters | ||
167 | - struct cudaTextureDesc texDesc; //create a texture descriptor | ||
168 | - memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | ||
169 | - texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) | ||
170 | - texDesc.addressMode[1] = cudaAddressModeWrap; | ||
171 | - texDesc.filterMode = cudaFilterModePoint; //use linear filtering | ||
172 | - texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | ||
173 | - texDesc.normalizedCoords = 0; //not using normalized coordinates | ||
174 | - | ||
175 | - // Create texture object | ||
176 | - cudaTextureObject_t texObj = 0; | ||
177 | - cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | 56 | + //fill the kernel with a gaussian |
57 | + gen_gaussian(kernel0, sigma, kwidth); | ||
178 | 58 | ||
179 | - | ||
180 | - //get the maximum number of threads per block for the CUDA device | ||
181 | - int max_threads = stim::maxThreadsPerBlock(); | ||
182 | - dim3 threads(max_threads, 1); | ||
183 | - | ||
184 | - //calculate the number of blocks | ||
185 | - dim3 blocks(x / threads.x + 1, y); | ||
186 | - | ||
187 | - //calculate the shared memory used in the kernel | ||
188 | - unsigned int pixel_bytes = max_threads * 4; //bytes devoted to pixel data being processed | ||
189 | - unsigned int apron_bytes = sigma * 8 * 4; //bytes devoted to pixels outside the window | ||
190 | - unsigned int gaussian_bytes = (sigma * 8 + 1) * 4; //bytes devoted to memory used to store the pre-computed Gaussian window | ||
191 | - unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes shared memory used | ||
192 | - | ||
193 | - //blur the image along the x-axis | ||
194 | - gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); | ||
195 | - | ||
196 | - // Copy the x-blurred data from global memory to the texture | ||
197 | - cudaMemcpyToArray(cuArray, 0, 0, image, bytes, | ||
198 | - cudaMemcpyDeviceToDevice); | ||
199 | - | ||
200 | - //transpose the block and thread dimensions | ||
201 | - threads.x = 1; | ||
202 | - threads.y = max_threads; | ||
203 | - blocks.x = x; | ||
204 | - blocks.y = y / threads.y + 1; | ||
205 | - | ||
206 | - //blur the image along the y-axis | ||
207 | - gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); | 59 | + //copy the kernel to the GPU |
60 | + T* gpuKernel0; | ||
61 | + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); | ||
62 | + | ||
63 | + //perform the gaussian blur as a separable convolution | ||
64 | + stim::cuda::gpu_conv2sep<float>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth); | ||
208 | 65 | ||
209 | - //free allocated memory | ||
210 | - cudaFree(cuArray); | 66 | + HANDLE_ERROR(cudaFree(gpuKernel0)); |
211 | 67 | ||
212 | } | 68 | } |
213 | 69 | ||
214 | /// Applies a Gaussian blur to a 2D image stored on the CPU | 70 | /// Applies a Gaussian blur to a 2D image stored on the CPU |
215 | template<typename T> | 71 | template<typename T> |
216 | - void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ | 72 | + void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ |
217 | 73 | ||
218 | - //get the number of pixels in the image | ||
219 | - unsigned int pixels = x * y; | ||
220 | - unsigned int bytes = sizeof(T) * pixels; | 74 | + //allocate space for the kernel |
75 | + unsigned int kwidth = sigma * 8 + 1; | ||
76 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | ||
221 | 77 | ||
222 | - //allocate space on the GPU | ||
223 | - T* gpuI0; | ||
224 | - cudaMalloc(&gpuI0, bytes); | ||
225 | - | ||
226 | - | ||
227 | - //copy the image data to the GPU | ||
228 | - cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); | ||
229 | - | ||
230 | - //run the GPU-based version of the algorithm | ||
231 | - gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y); | 78 | + //fill the kernel with a gaussian |
79 | + gen_gaussian(kernel0, sigma, kwidth); | ||
232 | 80 | ||
233 | - //copy the image data from the device | ||
234 | - cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); | ||
235 | - | ||
236 | - //free allocated memory | ||
237 | - cudaFree(gpuI0); | 81 | + //perform the gaussian blur as a separable convolution |
82 | + stim::cuda::cpu_conv2sep<float>(image, x, y, kernel0, kwidth, kernel0, kwidth); | ||
83 | + | ||
238 | } | 84 | } |
239 | 85 | ||
240 | }; | 86 | }; |
stim/cuda/conv2.cuh renamed to stim/cuda/templates/conv2.cuh
1 | +#ifndef STIM_CUDA_CONV2SEP_H | ||
2 | +#define STIM_CUDA_CONV2SEP_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <stim/cuda/cudatools/devices.h> | ||
7 | +#include <stim/cuda/cudatools/timer.h> | ||
8 | +#include <stim/cuda/sharedmem.cuh> | ||
9 | +#include <stim/cuda/cudatools/error.h> | ||
10 | + | ||
11 | +#define pi 3.14159 | ||
12 | + | ||
13 | +namespace stim{ | ||
14 | + namespace cuda{ | ||
15 | + | ||
16 | + template<typename T> | ||
17 | + __global__ void conv2sep_0(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y, | ||
18 | + T* kernel0, unsigned int k0){ | ||
19 | + | ||
20 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | ||
21 | + extern __shared__ T s[]; | ||
22 | + | ||
23 | + int kr = k0/2; //calculate the kernel radius | ||
24 | + | ||
25 | + //get a pointer to the gaussian in memory | ||
26 | + T* g = (T*)&s[blockDim.x + 2 * kr]; | ||
27 | + | ||
28 | + //calculate the start point for this block | ||
29 | + int bxi = blockIdx.x * blockDim.x; | ||
30 | + int byi = blockIdx.y; | ||
31 | + | ||
32 | + //copy the portion of the image necessary for this block to shared memory | ||
33 | + stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); | ||
34 | + | ||
35 | + //calculate the thread index | ||
36 | + int ti = threadIdx.x; | ||
37 | + | ||
38 | + //calculate the spatial coordinate for this thread | ||
39 | + int xi = bxi + ti; | ||
40 | + int yi = byi; | ||
41 | + | ||
42 | + | ||
43 | + //use the first 2kr+1 threads to transfer the kernel to shared memory | ||
44 | + if(ti < k0){ | ||
45 | + g[ti] = kernel0[ti]; | ||
46 | + } | ||
47 | + | ||
48 | + //make sure that all writing to shared memory is done before continuing | ||
49 | + __syncthreads(); | ||
50 | + | ||
51 | + //if the current pixel is outside of the image | ||
52 | + if(xi > x || yi > y) | ||
53 | + return; | ||
54 | + | ||
55 | + | ||
56 | + //calculate the coordinates of the current thread in shared memory | ||
57 | + int si = ti + kr; | ||
58 | + | ||
59 | + T sum = 0; //running weighted sum across the kernel | ||
60 | + | ||
61 | + | ||
62 | + //for each element of the kernel | ||
63 | + for(int ki = -kr; ki <= kr; ki++){ | ||
64 | + sum += g[ki + kr] * s[si + ki]; | ||
65 | + } | ||
66 | + | ||
67 | + //calculate the 1D image index for this thread | ||
68 | + unsigned int i = byi * x + xi; | ||
69 | + | ||
70 | + //output the result to global memory | ||
71 | + out[i] = sum; | ||
72 | + } | ||
73 | + | ||
74 | + template<typename T> | ||
75 | + __global__ void conv2sep_1(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y, | ||
76 | + T* kernel0, unsigned int k0){ | ||
77 | + | ||
78 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | ||
79 | + extern __shared__ T s[]; | ||
80 | + | ||
81 | + int kr = k0/2; //calculate the kernel radius | ||
82 | + | ||
83 | + //get a pointer to the gaussian in memory | ||
84 | + T* g = (T*)&s[blockDim.y + 2 * kr]; | ||
85 | + | ||
86 | + //calculate the start point for this block | ||
87 | + int bxi = blockIdx.x; | ||
88 | + int byi = blockIdx.y * blockDim.y; | ||
89 | + | ||
90 | + //copy the portion of the image necessary for this block to shared memory | ||
91 | + stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); | ||
92 | + | ||
93 | + //calculate the thread index | ||
94 | + int ti = threadIdx.y; | ||
95 | + | ||
96 | + //calculate the spatial coordinate for this thread | ||
97 | + int xi = bxi; | ||
98 | + int yi = byi + ti; | ||
99 | + | ||
100 | + | ||
101 | + //use the first 2kr+1 threads to transfer the kernel to shared memory | ||
102 | + if(ti < k0){ | ||
103 | + g[ti] = kernel0[ti]; | ||
104 | + } | ||
105 | + | ||
106 | + //make sure that all writing to shared memory is done before continuing | ||
107 | + __syncthreads(); | ||
108 | + | ||
109 | + //if the current pixel is outside of the image | ||
110 | + if(xi > x || yi > y) | ||
111 | + return; | ||
112 | + | ||
113 | + | ||
114 | + //calculate the coordinates of the current thread in shared memory | ||
115 | + int si = ti + kr; | ||
116 | + | ||
117 | + T sum = 0; //running weighted sum across the kernel | ||
118 | + | ||
119 | + | ||
120 | + //for each element of the kernel | ||
121 | + for(int ki = -kr; ki <= kr; ki++){ | ||
122 | + sum += g[ki + kr] * s[si + ki]; | ||
123 | + } | ||
124 | + | ||
125 | + //calculate the 1D image index for this thread | ||
126 | + unsigned int i = yi * x + xi; | ||
127 | + | ||
128 | + //output the result to global memory | ||
129 | + out[i] = sum; | ||
130 | + } | ||
131 | + | ||
132 | + template<typename T> | ||
133 | + void tex_conv2sep(T* out, unsigned int x, unsigned int y, | ||
134 | + cudaTextureObject_t texObj, cudaArray* cuArray, | ||
135 | + T* kernel0, unsigned int k0, | ||
136 | + T* kernel1, unsigned int k1){ | ||
137 | + | ||
138 | + //get the maximum number of threads per block for the CUDA device | ||
139 | + int max_threads = stim::maxThreadsPerBlock(); | ||
140 | + dim3 threads(max_threads, 1); | ||
141 | + | ||
142 | + //calculate the number of blocks | ||
143 | + dim3 blocks(x / threads.x + 1, y); | ||
144 | + | ||
145 | + //calculate the shared memory used in the kernel | ||
146 | + unsigned int pixel_bytes = max_threads * sizeof(T); //bytes devoted to pixel data being processed | ||
147 | + unsigned int apron_bytes = k0/2 * sizeof(T); //bytes devoted to the apron on each side of the window | ||
148 | + unsigned int gaussian_bytes = k0 * sizeof(T); //bytes devoted to memory used to store the pre-computed Gaussian window | ||
149 | + unsigned int shared_bytes = pixel_bytes + 2 * apron_bytes + gaussian_bytes; //total number of bytes shared memory used | ||
150 | + | ||
151 | + //blur the image along the x-axis | ||
152 | + conv2sep_0<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel0, k0); | ||
153 | + | ||
154 | + // Copy the x-blurred data from global memory to the texture | ||
155 | + cudaMemcpyToArray(cuArray, 0, 0, out, x * y * sizeof(T), | ||
156 | + cudaMemcpyDeviceToDevice); | ||
157 | + | ||
158 | + //transpose the block and thread dimensions | ||
159 | + threads.x = 1; | ||
160 | + threads.y = max_threads; | ||
161 | + blocks.x = x; | ||
162 | + blocks.y = y / threads.y + 1; | ||
163 | + | ||
164 | + //blur the image along the y-axis | ||
165 | + conv2sep_1<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel1, k1); | ||
166 | + | ||
167 | + } | ||
168 | + | ||
169 | + template<typename T> | ||
170 | + void gpu_conv2sep(T* image, unsigned int x, unsigned int y, | ||
171 | + T* kernel0, unsigned int k0, | ||
172 | + T* kernel1, unsigned int k1){ | ||
173 | + | ||
174 | + //get the number of pixels in the image | ||
175 | + unsigned int pixels = x * y; | ||
176 | + unsigned int bytes = sizeof(T) * pixels; | ||
177 | + | ||
178 | + // Allocate CUDA array in device memory | ||
179 | + | ||
180 | + //define a channel descriptor for a single 32-bit channel | ||
181 | + cudaChannelFormatDesc channelDesc = | ||
182 | + cudaCreateChannelDesc(32, 0, 0, 0, | ||
183 | + cudaChannelFormatKindFloat); | ||
184 | + cudaArray* cuArray; //declare the cuda array | ||
185 | + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array | ||
186 | + | ||
187 | + // Copy the image data from global memory to the array | ||
188 | + cudaMemcpyToArray(cuArray, 0, 0, image, bytes, | ||
189 | + cudaMemcpyDeviceToDevice); | ||
190 | + | ||
191 | + // Specify texture | ||
192 | + struct cudaResourceDesc resDesc; //create a resource descriptor | ||
193 | + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | ||
194 | + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | ||
195 | + resDesc.res.array.array = cuArray; //add a pointer to the cuda array | ||
196 | + | ||
197 | + // Specify texture object parameters | ||
198 | + struct cudaTextureDesc texDesc; //create a texture descriptor | ||
199 | + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | ||
200 | + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) | ||
201 | + texDesc.addressMode[1] = cudaAddressModeWrap; | ||
202 | + texDesc.filterMode = cudaFilterModePoint; //use linear filtering | ||
203 | + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | ||
204 | + texDesc.normalizedCoords = 0; //not using normalized coordinates | ||
205 | + | ||
206 | + // Create texture object | ||
207 | + cudaTextureObject_t texObj = 0; | ||
208 | + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | ||
209 | + | ||
210 | + //call the texture version of the separable convolution function | ||
211 | + tex_conv2sep(image, x, y, texObj, cuArray, kernel0, k0, kernel1, k1); | ||
212 | + | ||
213 | + //free allocated memory | ||
214 | + cudaFree(cuArray); | ||
215 | + | ||
216 | + } | ||
217 | + | ||
218 | + /// Applies a Gaussian blur to a 2D image stored on the CPU | ||
219 | + template<typename T> | ||
220 | + void cpu_conv2sep(T* image, unsigned int x, unsigned int y, | ||
221 | + T* kernel0, unsigned int k0, | ||
222 | + T* kernel1, unsigned int k1){ | ||
223 | + | ||
224 | + //get the number of pixels in the image | ||
225 | + unsigned int pixels = x * y; | ||
226 | + unsigned int bytes = sizeof(T) * pixels; | ||
227 | + | ||
228 | + //---------Allocate Image--------- | ||
229 | + //allocate space on the GPU for the image | ||
230 | + T* gpuI0; | ||
231 | + HANDLE_ERROR(cudaMalloc(&gpuI0, bytes)); | ||
232 | + | ||
233 | + //copy the image data to the GPU | ||
234 | + HANDLE_ERROR(cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice)); | ||
235 | + | ||
236 | + //---------Allocate Kernel-------- | ||
237 | + //allocate and copy the 0 (x) kernel | ||
238 | + T* gpuK0; | ||
239 | + HANDLE_ERROR(cudaMalloc(&gpuK0, k0 * sizeof(T))); | ||
240 | + HANDLE_ERROR(cudaMemcpy(gpuK0, kernel0, k0 * sizeof(T), cudaMemcpyHostToDevice)); | ||
241 | + | ||
242 | + //allocate and copy the 1 (y) kernel | ||
243 | + T* gpuK1; | ||
244 | + HANDLE_ERROR(cudaMalloc(&gpuK1, k1 * sizeof(T))); | ||
245 | + HANDLE_ERROR(cudaMemcpy(gpuK1, kernel1, k1 * sizeof(T), cudaMemcpyHostToDevice)); | ||
246 | + | ||
247 | + //run the GPU-based version of the algorithm | ||
248 | + gpu_conv2sep<T>(gpuI0, x, y, gpuK0, k0, gpuK1, k1); | ||
249 | + | ||
250 | + //copy the image data from the device | ||
251 | + cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); | ||
252 | + | ||
253 | + //free allocated memory | ||
254 | + cudaFree(gpuI0); | ||
255 | + } | ||
256 | + | ||
257 | + }; | ||
258 | +}; | ||
259 | + | ||
260 | +#endif | ||
0 | \ No newline at end of file | 261 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_GAUSSIAN_BLUR_H | ||
2 | +#define STIM_CUDA_GAUSSIAN_BLUR_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <stim/cuda/cudatools.h> | ||
7 | +#include <stim/cuda/sharedmem.cuh> | ||
8 | +#include <stim/cuda/templates/conv2sep.cuh> //GPU-based separable convolution algorithm | ||
9 | + | ||
10 | +#define pi 3.14159 | ||
11 | + | ||
12 | +namespace stim{ | ||
13 | + namespace cuda{ | ||
14 | + | ||
15 | + template<typename T> | ||
16 | + void gen_gaussian(T* out, T sigma, unsigned int width){ | ||
17 | + | ||
18 | + //fill the kernel with a gaussian | ||
19 | + for(unsigned int xi = 0; xi < width; xi++){ | ||
20 | + | ||
21 | + float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian | ||
22 | + float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) ); | ||
23 | + out[xi] = g; | ||
24 | + } | ||
25 | + | ||
26 | + } | ||
27 | + | ||
28 | + template<typename T> | ||
29 | + void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){ | ||
30 | + | ||
31 | + //allocate space for the kernel | ||
32 | + unsigned int kwidth = sigma * 8 + 1; | ||
33 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | ||
34 | + | ||
35 | + //fill the kernel with a gaussian | ||
36 | + gen_gaussian(kernel0, sigma, kwidth); | ||
37 | + | ||
38 | + //copy the kernel to the GPU | ||
39 | + T* gpuKernel0; | ||
40 | + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); | ||
41 | + | ||
42 | + //perform the gaussian blur as a separable convolution | ||
43 | + stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth); | ||
44 | + | ||
45 | + HANDLE_ERROR(cudaFree(gpuKernel0)); | ||
46 | + | ||
47 | + } | ||
48 | + | ||
49 | + template<typename T> | ||
50 | + void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ | ||
51 | + | ||
52 | + //allocate space for the kernel | ||
53 | + unsigned int kwidth = sigma * 8 + 1; | ||
54 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | ||
55 | + | ||
56 | + //fill the kernel with a gaussian | ||
57 | + gen_gaussian(kernel0, sigma, kwidth); | ||
58 | + | ||
59 | + //copy the kernel to the GPU | ||
60 | + T* gpuKernel0; | ||
61 | + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); | ||
62 | + | ||
63 | + //perform the gaussian blur as a separable convolution | ||
64 | + stim::cuda::gpu_conv2sep<float>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth); | ||
65 | + | ||
66 | + HANDLE_ERROR(cudaFree(gpuKernel0)); | ||
67 | + | ||
68 | + } | ||
69 | + | ||
70 | + /// Applies a Gaussian blur to a 2D image stored on the CPU | ||
71 | + template<typename T> | ||
72 | + void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ | ||
73 | + | ||
74 | + //allocate space for the kernel | ||
75 | + unsigned int kwidth = sigma * 8 + 1; | ||
76 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | ||
77 | + | ||
78 | + //fill the kernel with a gaussian | ||
79 | + gen_gaussian(kernel0, sigma, kwidth); | ||
80 | + | ||
81 | + //perform the gaussian blur as a separable convolution | ||
82 | + stim::cuda::cpu_conv2sep<float>(image, x, y, kernel0, kwidth, kernel0, kwidth); | ||
83 | + | ||
84 | + } | ||
85 | + | ||
86 | + }; | ||
87 | +}; | ||
88 | + | ||
89 | +#endif | ||
0 | \ No newline at end of file | 90 | \ No newline at end of file |
stim/cuda/gradient.cuh renamed to stim/cuda/templates/gradient.cuh