Commit 5cc0976c6028f692e7663810b4504bc34e9c8f10
1 parent aa1bc80d
added separable convolution, organized the stim/cuda directory
Showing 18 changed files with 433 additions and 235 deletions
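For orientation, a minimal host-side sketch of how the Gaussian blur introduced by this commit might be called (the image size and sigma below are illustrative, not part of the commit; building requires nvcc and the stim headers):

#include <vector>
#include <stim/cuda/gaussian_blur.cuh>

int main(){
    unsigned int x = 512, y = 512;              //example image dimensions
    std::vector<float> img(x * y, 0.0f);        //row-major image stored on the CPU
    float sigma = 2.0f;                         //standard deviation of the Gaussian (example value)

    //blur the image in place: a 1D Gaussian kernel is generated and applied along x, then y
    stim::cuda::cpu_gaussian_blur2<float>(img.data(), sigma, x, y);

    return 0;
}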
stim/cuda/arraymath.cuh
1 | 1 | #ifndef STIM_CUDA_ARRAYMATH_H |
2 | 2 | #define STIM_CUDA_ARRAYMATH_H |
3 | 3 | |
4 | -#include <stim/cuda/array_add.cuh> | |
5 | -#include <stim/cuda/array_multiply.cuh> | |
6 | -#include <stim/cuda/array_abs.cuh> | |
7 | -#include <stim/cuda/array_cart2polar.cuh> | |
8 | -#include <stim/cuda/gaussian_blur.cuh> | |
9 | -#include <stim/cuda/conv2.cuh> | |
4 | +#include <stim/cuda/arraymath/array_add.cuh> | |
5 | +#include <stim/cuda/arraymath/array_multiply.cuh> | |
6 | +#include <stim/cuda/arraymath/array_abs.cuh> | |
7 | +#include <stim/cuda/arraymath/array_cart2polar.cuh> | |
10 | 8 | |
11 | 9 | namespace stim{ |
12 | 10 | namespace cuda{ | ... | ... |
stim/cuda/array_abs.cuh renamed to stim/cuda/arraymath/array_abs.cuh
stim/cuda/array_add.cuh renamed to stim/cuda/arraymath/array_add.cuh
stim/cuda/array_cart2polar.cuh renamed to stim/cuda/arraymath/array_cart2polar.cuh
stim/cuda/array_multiply.cuh renamed to stim/cuda/arraymath/array_multiply.cuh
stim/cuda/callable.h renamed to stim/cuda/cudatools/callable.h
stim/cuda/devices.h renamed to stim/cuda/cudatools/devices.h
stim/cuda/error.h renamed to stim/cuda/cudatools/error.h
stim/cuda/glbind.h renamed to stim/cuda/cudatools/glbind.h
stim/cuda/threads.h renamed to stim/cuda/cudatools/threads.h
stim/cuda/timer.h renamed to stim/cuda/cudatools/timer.h
1 | -#ifndef STIM_CUDA_TIMER | |
2 | -#define STIM_CUDA_TIMER | |
3 | - | |
4 | -static cudaEvent_t tStartEvent; | |
5 | -static cudaEvent_t tStopEvent; | |
6 | - | |
7 | -namespace stim{ | |
8 | - | |
9 | -/// These functions calculate the time between GPU functions in milliseconds | |
10 | -static void gpuStartTimer() | |
11 | -{ | |
12 | - //set up timing events | |
13 | - cudaEventCreate(&tStartEvent); | |
14 | - cudaEventCreate(&tStopEvent); | |
15 | - cudaEventRecord(tStartEvent, 0); | |
16 | -} | |
17 | - | |
18 | -static float gpuStopTimer() | |
19 | -{ | |
20 | - cudaEventRecord(tStopEvent, 0); | |
21 | - cudaEventSynchronize(tStopEvent); | |
22 | - float elapsedTime; | |
23 | - cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent); | |
24 | - cudaEventDestroy(tStartEvent); | |
25 | - cudaEventDestroy(tStopEvent); | |
26 | - return elapsedTime; | |
27 | -} | |
28 | - | |
29 | -} //end namespace stim | |
30 | - | |
1 | +#ifndef STIM_CUDA_TIMER | |
2 | +#define STIM_CUDA_TIMER | |
3 | + | |
4 | +static cudaEvent_t tStartEvent; | |
5 | +static cudaEvent_t tStopEvent; | |
6 | + | |
7 | +namespace stim{ | |
8 | + | |
9 | +/// These functions calculate the time between GPU functions in milliseconds | |
10 | +static void gpuStartTimer() | |
11 | +{ | |
12 | + //set up timing events | |
13 | + cudaEventCreate(&tStartEvent); | |
14 | + cudaEventCreate(&tStopEvent); | |
15 | + cudaEventRecord(tStartEvent, 0); | |
16 | +} | |
17 | + | |
18 | +static float gpuStopTimer() | |
19 | +{ | |
20 | + cudaEventRecord(tStopEvent, 0); | |
21 | + cudaEventSynchronize(tStopEvent); | |
22 | + float elapsedTime; | |
23 | + cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent); | |
24 | + cudaEventDestroy(tStartEvent); | |
25 | + cudaEventDestroy(tStopEvent); | |
26 | + return elapsedTime; | |
27 | +} | |
28 | + | |
29 | +} //end namespace stim | |
30 | + | |
31 | 31 | #endif |
32 | 32 | \ No newline at end of file | ... | ... |
stim/cuda/down_sample.cuh
... | ... | @@ -89,7 +89,7 @@ namespace stim{ |
89 | 89 | gpu_down_sample<T>(gpuI, gpuI0, resize, x, y); |
90 | 90 | |
91 | 91 | //copy the image data to the GPU |
92 | - cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice); | |
92 | + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice); | |
93 | 93 | |
94 | 94 | cudaFree(gpuI0); |
95 | 95 | cudeFree(gpuI); | ... | ... |
stim/cuda/gaussian_blur.cuh
... | ... | @@ -3,9 +3,9 @@ |
3 | 3 | |
4 | 4 | #include <iostream> |
5 | 5 | #include <cuda.h> |
6 | -#include <stim/cuda/devices.h> | |
7 | -#include <stim/cuda/timer.h> | |
6 | +#include <stim/cuda/cudatools.h> | |
8 | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | +#include <stim/cuda/templates/conv2sep.cuh> //GPU-based separable convolution algorithm | |
9 | 9 | |
10 | 10 | #define pi 3.14159 |
11 | 11 | |
... | ... | @@ -13,228 +13,74 @@ namespace stim{ |
13 | 13 | namespace cuda{ |
14 | 14 | |
15 | 15 | template<typename T> |
16 | - __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ | |
16 | + void gen_gaussian(T* out, T sigma, unsigned int width){ | |
17 | 17 | |
18 | - //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
19 | - extern __shared__ T s[]; | |
18 | + //fill the kernel with a gaussian | |
19 | + for(unsigned int xi = 0; xi < width; xi++){ | |
20 | 20 | |
21 | - int kr = sigma * 4; //calculate the kernel radius | |
22 | - | |
23 | - //get a pointer to the gaussian in memory | |
24 | - T* g = (T*)&s[blockDim.x + 2 * kr]; | |
25 | - | |
26 | - //calculate the start point for this block | |
27 | - int bxi = blockIdx.x * blockDim.x; | |
28 | - int byi = blockIdx.y; | |
29 | - | |
30 | - //copy the portion of the image necessary for this block to shared memory | |
31 | - stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); | |
32 | - | |
33 | - //calculate the thread index and block index | |
34 | - int ti = threadIdx.x; | |
35 | - | |
36 | - //calculate the spatial coordinate for this thread | |
37 | - int xi = bxi + ti; | |
38 | - | |
39 | - //pre-compute the gaussian values for each kernel point | |
40 | - T a = 1.0 / (sigma * sqrt(2 * pi)); | |
41 | - T c = - 1.0 / (2*sigma*sigma); | |
42 | - int ki; | |
43 | - | |
44 | - //use the first 2kr+1 threads to evaluate a gaussian and store the result | |
45 | - if(ti <= 2* kr+1){ | |
46 | - ki = ti - kr; | |
47 | - g[ti] = a * exp((ki*ki) * c); | |
21 | + float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian | |
22 | + float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) ); | |
23 | + out[xi] = g; | |
48 | 24 | } |
49 | 25 | |
50 | - //make sure that all writing to shared memory is done before continuing | |
51 | - __syncthreads(); | |
52 | - | |
53 | - //if the current pixel is outside of the image | |
54 | - if(bxi + ti > x || byi > y) | |
55 | - return; | |
56 | - | |
57 | - | |
58 | - | |
59 | - //calculate the coordinates of the current thread in shared memory | |
60 | - int si = ti + kr; | |
61 | - | |
62 | - T sum = 0; //running weighted sum across the kernel | |
63 | - | |
64 | - | |
65 | - //for each element of the kernel | |
66 | - for(int ki = -kr; ki <= kr; ki++){ | |
67 | - sum += g[ki + kr] * s[si + ki]; | |
68 | - } | |
69 | - | |
70 | - //calculate the 1D image index for this thread | |
71 | - unsigned int i = byi * x + xi; | |
72 | - | |
73 | - //output the result to global memory | |
74 | - out[i] = sum; | |
75 | 26 | } |
76 | 27 | |
77 | 28 | template<typename T> |
78 | - __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ | |
79 | - | |
80 | - //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
81 | - extern __shared__ T s[]; | |
82 | - | |
83 | - int kr = sigma * 4; //calculate the kernel radius | |
29 | + void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){ | |
84 | 30 | |
85 | - //get a pointer to the gaussian in memory | |
86 | - T* g = (T*)&s[blockDim.y + 2 * kr]; | |
31 | + //allocate space for the kernel | |
32 | + unsigned int kwidth = sigma * 8 + 1; | |
33 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | |
87 | 34 | |
88 | - //calculate the start point for this block | |
89 | - int bxi = blockIdx.x; | |
90 | - int byi = blockIdx.y * blockDim.y; | |
35 | + //fill the kernel with a gaussian | |
36 | + gen_gaussian(kernel0, sigma, kwidth); | |
91 | 37 | |
92 | - //copy the portion of the image necessary for this block to shared memory | |
93 | - stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); | |
38 | + //copy the kernel to the GPU | |
39 | + T* gpuKernel0; | |
40 | + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); | |
94 | 41 | |
95 | - //calculate the thread index and block index | |
96 | - int ti = threadIdx.y; | |
42 | + //perform the gaussian blur as a separable convolution | |
43 | + stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth); | |
97 | 44 | |
98 | - //calculate the spatial coordinate for this thread | |
99 | - int yi = byi + ti; | |
45 | + HANDLE_ERROR(cudaFree(gpuKernel0)); | |
100 | 46 | |
101 | - //pre-compute the gaussian values for each kernel point | |
102 | - T a = 1.0 / (sigma * sqrt(2 * pi)); | |
103 | - T c = - 1.0 / (2*sigma*sigma); | |
104 | - int ki; | |
105 | - | |
106 | - //use the first 2kr+1 threads to evaluate a gaussian and store the result | |
107 | - if(ti <= 2* kr+1){ | |
108 | - ki = ti - kr; | |
109 | - g[ti] = a * exp((ki*ki) * c); | |
110 | - } | |
111 | - | |
112 | - //make sure that all writing to shared memory is done before continuing | |
113 | - __syncthreads(); | |
114 | - | |
115 | - //if the current pixel is outside of the image | |
116 | - if(bxi >= x || yi >= y) | |
117 | - return; | |
118 | - | |
119 | - | |
120 | - | |
121 | - //calculate the coordinates of the current thread in shared memory | |
122 | - int si = ti + kr; | |
123 | - | |
124 | - T sum = 0; //running weighted sum across the kernel | |
125 | - | |
126 | - | |
127 | - //for each element of the kernel | |
128 | - for(int ki = -kr; ki <= kr; ki++){ | |
129 | - sum += g[ki + kr] * s[si + ki]; | |
130 | - } | |
131 | - | |
132 | - //calculate the 1D image index for this thread | |
133 | - unsigned int i = yi * x + bxi; | |
134 | - | |
135 | - //output the result to global memory | |
136 | - out[i] = sum; | |
137 | 47 | } |
138 | 48 | |
139 | - /// Applies a Gaussian blur to a 2D image stored on the GPU | |
140 | 49 | template<typename T> |
141 | - void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ | |
50 | + void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ | |
142 | 51 | |
143 | - //get the number of pixels in the image | |
144 | - unsigned int pixels = x * y; | |
145 | - unsigned int bytes = sizeof(T) * pixels; | |
52 | + //allocate space for the kernel | |
53 | + unsigned int kwidth = sigma * 8 + 1; | |
54 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | |
146 | 55 | |
147 | - // Allocate CUDA array in device memory | |
148 | - | |
149 | - //define a channel descriptor for a single 32-bit channel | |
150 | - cudaChannelFormatDesc channelDesc = | |
151 | - cudaCreateChannelDesc(32, 0, 0, 0, | |
152 | - cudaChannelFormatKindFloat); | |
153 | - cudaArray* cuArray; //declare the cuda array | |
154 | - cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array | |
155 | - | |
156 | - // Copy the image data from global memory to the array | |
157 | - cudaMemcpyToArray(cuArray, 0, 0, image, bytes, | |
158 | - cudaMemcpyDeviceToDevice); | |
159 | - | |
160 | - // Specify texture | |
161 | - struct cudaResourceDesc resDesc; //create a resource descriptor | |
162 | - memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | |
163 | - resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | |
164 | - resDesc.res.array.array = cuArray; //add a pointer to the cuda array | |
165 | - | |
166 | - // Specify texture object parameters | |
167 | - struct cudaTextureDesc texDesc; //create a texture descriptor | |
168 | - memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | |
169 | - texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) | |
170 | - texDesc.addressMode[1] = cudaAddressModeWrap; | |
171 | - texDesc.filterMode = cudaFilterModePoint; //use linear filtering | |
172 | - texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | |
173 | - texDesc.normalizedCoords = 0; //not using normalized coordinates | |
174 | - | |
175 | - // Create texture object | |
176 | - cudaTextureObject_t texObj = 0; | |
177 | - cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | |
56 | + //fill the kernel with a gaussian | |
57 | + gen_gaussian(kernel0, sigma, kwidth); | |
178 | 58 | |
179 | - | |
180 | - //get the maximum number of threads per block for the CUDA device | |
181 | - int max_threads = stim::maxThreadsPerBlock(); | |
182 | - dim3 threads(max_threads, 1); | |
183 | - | |
184 | - //calculate the number of blocks | |
185 | - dim3 blocks(x / threads.x + 1, y); | |
186 | - | |
187 | - //calculate the shared memory used in the kernel | |
188 | - unsigned int pixel_bytes = max_threads * 4; //bytes devoted to pixel data being processed | |
189 | - unsigned int apron_bytes = sigma * 8 * 4; //bytes devoted to pixels outside the window | |
190 | - unsigned int gaussian_bytes = (sigma * 8 + 1) * 4; //bytes devoted to memory used to store the pre-computed Gaussian window | |
191 | - unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes shared memory used | |
192 | - | |
193 | - //blur the image along the x-axis | |
194 | - gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); | |
195 | - | |
196 | - // Copy the x-blurred data from global memory to the texture | |
197 | - cudaMemcpyToArray(cuArray, 0, 0, image, bytes, | |
198 | - cudaMemcpyDeviceToDevice); | |
199 | - | |
200 | - //transpose the block and thread dimensions | |
201 | - threads.x = 1; | |
202 | - threads.y = max_threads; | |
203 | - blocks.x = x; | |
204 | - blocks.y = y / threads.y + 1; | |
205 | - | |
206 | - //blur the image along the y-axis | |
207 | - gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); | |
59 | + //copy the kernel to the GPU | |
60 | + T* gpuKernel0; | |
61 | + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); | |
62 | + | |
63 | + //perform the gaussian blur as a separable convolution | |
64 | + stim::cuda::gpu_conv2sep<float>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth); | |
208 | 65 | |
209 | - //free allocated memory | |
210 | - cudaFree(cuArray); | |
66 | + HANDLE_ERROR(cudaFree(gpuKernel0)); | |
211 | 67 | |
212 | 68 | } |
213 | 69 | |
214 | 70 | /// Applies a Gaussian blur to a 2D image stored on the CPU |
215 | 71 | template<typename T> |
216 | - void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ | |
72 | + void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ | |
217 | 73 | |
218 | - //get the number of pixels in the image | |
219 | - unsigned int pixels = x * y; | |
220 | - unsigned int bytes = sizeof(T) * pixels; | |
74 | + //allocate space for the kernel | |
75 | + unsigned int kwidth = sigma * 8 + 1; | |
76 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | |
221 | 77 | |
222 | - //allocate space on the GPU | |
223 | - T* gpuI0; | |
224 | - cudaMalloc(&gpuI0, bytes); | |
225 | - | |
226 | - | |
227 | - //copy the image data to the GPU | |
228 | - cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); | |
229 | - | |
230 | - //run the GPU-based version of the algorithm | |
231 | - gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y); | |
78 | + //fill the kernel with a gaussian | |
79 | + gen_gaussian(kernel0, sigma, kwidth); | |
232 | 80 | |
233 | - //copy the image data from the device | |
234 | - cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); | |
235 | - | |
236 | - //free allocated memory | |
237 | - cudaFree(gpuI0); | |
81 | + //perform the gaussian blur as a separable convolution | |
82 | + stim::cuda::cpu_conv2sep<float>(image, x, y, kernel0, kwidth, kernel0, kwidth); | |
83 | + | |
238 | 84 | } |
239 | 85 | |
240 | 86 | }; | ... | ... |
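The rewritten gaussian_blur.cuh builds the kernel with gen_gaussian() over a width of sigma * 8 + 1 samples (roughly +/- 4 sigma of support). A host-only sketch that reproduces those values for an example sigma of 2 (sigma is illustrative; the formula matches the code above):

#include <cstdio>
#include <cmath>

int main(){
    float sigma = 2.0f;                                     //example standard deviation
    unsigned int kwidth = (unsigned int)(sigma * 8 + 1);    //17 samples, covering roughly +/- 4 sigma
    for(unsigned int xi = 0; xi < kwidth; xi++){
        float x = (float)xi - (float)(kwidth / 2);          //center the kernel around zero
        float g = 1.0f / (sigma * sqrtf(2 * 3.14159f)) * expf( -(x*x) / (2*sigma*sigma) );
        printf("%u\t%f\n", xi, g);                          //print each kernel sample
    }
    return 0;
}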
stim/cuda/conv2.cuh renamed to stim/cuda/templates/conv2.cuh
stim/cuda/templates/conv2sep.cuh
1 | +#ifndef STIM_CUDA_CONV2SEP_H | |
2 | +#define STIM_CUDA_CONV2SEP_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/cudatools/devices.h> | |
7 | +#include <stim/cuda/cudatools/timer.h> | |
8 | +#include <stim/cuda/sharedmem.cuh> | |
9 | +#include <stim/cuda/cudatools/error.h> | |
10 | + | |
11 | +#define pi 3.14159 | |
12 | + | |
13 | +namespace stim{ | |
14 | + namespace cuda{ | |
15 | + | |
16 | + template<typename T> | |
17 | + __global__ void conv2sep_0(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y, | |
18 | + T* kernel0, unsigned int k0){ | |
19 | + | |
20 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
21 | + extern __shared__ T s[]; | |
22 | + | |
23 | + int kr = k0/2; //calculate the kernel radius | |
24 | + | |
25 | + //get a pointer to the gaussian in memory | |
26 | + T* g = (T*)&s[blockDim.x + 2 * kr]; | |
27 | + | |
28 | + //calculate the start point for this block | |
29 | + int bxi = blockIdx.x * blockDim.x; | |
30 | + int byi = blockIdx.y; | |
31 | + | |
32 | + //copy the portion of the image necessary for this block to shared memory | |
33 | + stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); | |
34 | + | |
35 | + //calculate the thread index | |
36 | + int ti = threadIdx.x; | |
37 | + | |
38 | + //calculate the spatial coordinate for this thread | |
39 | + int xi = bxi + ti; | |
40 | + int yi = byi; | |
41 | + | |
42 | + | |
43 | + //use the first 2kr+1 threads to transfer the kernel to shared memory | |
44 | + if(ti < k0){ | |
45 | + g[ti] = kernel0[ti]; | |
46 | + } | |
47 | + | |
48 | + //make sure that all writing to shared memory is done before continuing | |
49 | + __syncthreads(); | |
50 | + | |
51 | + //if the current pixel is outside of the image | |
52 | + if(xi > x || yi > y) | |
53 | + return; | |
54 | + | |
55 | + | |
56 | + //calculate the coordinates of the current thread in shared memory | |
57 | + int si = ti + kr; | |
58 | + | |
59 | + T sum = 0; //running weighted sum across the kernel | |
60 | + | |
61 | + | |
62 | + //for each element of the kernel | |
63 | + for(int ki = -kr; ki <= kr; ki++){ | |
64 | + sum += g[ki + kr] * s[si + ki]; | |
65 | + } | |
66 | + | |
67 | + //calculate the 1D image index for this thread | |
68 | + unsigned int i = byi * x + xi; | |
69 | + | |
70 | + //output the result to global memory | |
71 | + out[i] = sum; | |
72 | + } | |
73 | + | |
74 | + template<typename T> | |
75 | + __global__ void conv2sep_1(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y, | |
76 | + T* kernel0, unsigned int k0){ | |
77 | + | |
78 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
79 | + extern __shared__ T s[]; | |
80 | + | |
81 | + int kr = k0/2; //calculate the kernel radius | |
82 | + | |
83 | + //get a pointer to the gaussian in memory | |
84 | + T* g = (T*)&s[blockDim.y + 2 * kr]; | |
85 | + | |
86 | + //calculate the start point for this block | |
87 | + int bxi = blockIdx.x; | |
88 | + int byi = blockIdx.y * blockDim.y; | |
89 | + | |
90 | + //copy the portion of the image necessary for this block to shared memory | |
91 | + stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); | |
92 | + | |
93 | + //calculate the thread index | |
94 | + int ti = threadIdx.y; | |
95 | + | |
96 | + //calculate the spatial coordinate for this thread | |
97 | + int xi = bxi; | |
98 | + int yi = byi + ti; | |
99 | + | |
100 | + | |
101 | + //use the first 2kr+1 threads to transfer the kernel to shared memory | |
102 | + if(ti < k0){ | |
103 | + g[ti] = kernel0[ti]; | |
104 | + } | |
105 | + | |
106 | + //make sure that all writing to shared memory is done before continuing | |
107 | + __syncthreads(); | |
108 | + | |
109 | + //if the current pixel is outside of the image | |
110 | + if(xi > x || yi > y) | |
111 | + return; | |
112 | + | |
113 | + | |
114 | + //calculate the coordinates of the current thread in shared memory | |
115 | + int si = ti + kr; | |
116 | + | |
117 | + T sum = 0; //running weighted sum across the kernel | |
118 | + | |
119 | + | |
120 | + //for each element of the kernel | |
121 | + for(int ki = -kr; ki <= kr; ki++){ | |
122 | + sum += g[ki + kr] * s[si + ki]; | |
123 | + } | |
124 | + | |
125 | + //calculate the 1D image index for this thread | |
126 | + unsigned int i = yi * x + xi; | |
127 | + | |
128 | + //output the result to global memory | |
129 | + out[i] = sum; | |
130 | + } | |
131 | + | |
132 | + template<typename T> | |
133 | + void tex_conv2sep(T* out, unsigned int x, unsigned int y, | |
134 | + cudaTextureObject_t texObj, cudaArray* cuArray, | |
135 | + T* kernel0, unsigned int k0, | |
136 | + T* kernel1, unsigned int k1){ | |
137 | + | |
138 | + //get the maximum number of threads per block for the CUDA device | |
139 | + int max_threads = stim::maxThreadsPerBlock(); | |
140 | + dim3 threads(max_threads, 1); | |
141 | + | |
142 | + //calculate the number of blocks | |
143 | + dim3 blocks(x / threads.x + 1, y); | |
144 | + | |
145 | + //calculate the shared memory used in the kernel | |
146 | + unsigned int pixel_bytes = max_threads * sizeof(T); //bytes devoted to pixel data being processed | |
147 | + unsigned int apron_bytes = k0/2 * sizeof(T); //bytes devoted to the apron on each side of the window | |
148 | + unsigned int gaussian_bytes = k0 * sizeof(T); //bytes devoted to memory used to store the pre-computed Gaussian window | |
149 | + unsigned int shared_bytes = pixel_bytes + 2 * apron_bytes + gaussian_bytes; //total number of bytes shared memory used | |
150 | + | |
151 | + //blur the image along the x-axis | |
152 | + conv2sep_0<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel0, k0); | |
153 | + | |
154 | + // Copy the x-blurred data from global memory to the texture | |
155 | + cudaMemcpyToArray(cuArray, 0, 0, out, x * y * sizeof(T), | |
156 | + cudaMemcpyDeviceToDevice); | |
157 | + | |
158 | + //transpose the block and thread dimensions | |
159 | + threads.x = 1; | |
160 | + threads.y = max_threads; | |
161 | + blocks.x = x; | |
162 | + blocks.y = y / threads.y + 1; | |
163 | + | |
164 | + //blur the image along the y-axis | |
165 | + conv2sep_1<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel1, k1); | |
166 | + | |
167 | + } | |
168 | + | |
169 | + template<typename T> | |
170 | + void gpu_conv2sep(T* image, unsigned int x, unsigned int y, | |
171 | + T* kernel0, unsigned int k0, | |
172 | + T* kernel1, unsigned int k1){ | |
173 | + | |
174 | + //get the number of pixels in the image | |
175 | + unsigned int pixels = x * y; | |
176 | + unsigned int bytes = sizeof(T) * pixels; | |
177 | + | |
178 | + // Allocate CUDA array in device memory | |
179 | + | |
180 | + //define a channel descriptor for a single 32-bit channel | |
181 | + cudaChannelFormatDesc channelDesc = | |
182 | + cudaCreateChannelDesc(32, 0, 0, 0, | |
183 | + cudaChannelFormatKindFloat); | |
184 | + cudaArray* cuArray; //declare the cuda array | |
185 | + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array | |
186 | + | |
187 | + // Copy the image data from global memory to the array | |
188 | + cudaMemcpyToArray(cuArray, 0, 0, image, bytes, | |
189 | + cudaMemcpyDeviceToDevice); | |
190 | + | |
191 | + // Specify texture | |
192 | + struct cudaResourceDesc resDesc; //create a resource descriptor | |
193 | + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | |
194 | + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | |
195 | + resDesc.res.array.array = cuArray; //add a pointer to the cuda array | |
196 | + | |
197 | + // Specify texture object parameters | |
198 | + struct cudaTextureDesc texDesc; //create a texture descriptor | |
199 | + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | |
200 | + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) | |
201 | + texDesc.addressMode[1] = cudaAddressModeWrap; | |
202 | + texDesc.filterMode = cudaFilterModePoint; //use linear filtering | |
203 | + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | |
204 | + texDesc.normalizedCoords = 0; //not using normalized coordinates | |
205 | + | |
206 | + // Create texture object | |
207 | + cudaTextureObject_t texObj = 0; | |
208 | + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | |
209 | + | |
210 | + //call the texture version of the separable convolution function | |
211 | + tex_conv2sep(image, x, y, texObj, cuArray, kernel0, k0, kernel1, k1); | |
212 | + | |
213 | + //free allocated memory | |
214 | + cudaFree(cuArray); | |
215 | + | |
216 | + } | |
217 | + | |
218 | + /// Applies a Gaussian blur to a 2D image stored on the CPU | |
219 | + template<typename T> | |
220 | + void cpu_conv2sep(T* image, unsigned int x, unsigned int y, | |
221 | + T* kernel0, unsigned int k0, | |
222 | + T* kernel1, unsigned int k1){ | |
223 | + | |
224 | + //get the number of pixels in the image | |
225 | + unsigned int pixels = x * y; | |
226 | + unsigned int bytes = sizeof(T) * pixels; | |
227 | + | |
228 | + //---------Allocate Image--------- | |
229 | + //allocate space on the GPU for the image | |
230 | + T* gpuI0; | |
231 | + HANDLE_ERROR(cudaMalloc(&gpuI0, bytes)); | |
232 | + | |
233 | + //copy the image data to the GPU | |
234 | + HANDLE_ERROR(cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice)); | |
235 | + | |
236 | + //---------Allocate Kernel-------- | |
237 | + //allocate and copy the 0 (x) kernel | |
238 | + T* gpuK0; | |
239 | + HANDLE_ERROR(cudaMalloc(&gpuK0, k0 * sizeof(T))); | |
240 | + HANDLE_ERROR(cudaMemcpy(gpuK0, kernel0, k0 * sizeof(T), cudaMemcpyHostToDevice)); | |
241 | + | |
242 | + //allocate and copy the 1 (y) kernel | |
243 | + T* gpuK1; | |
244 | + HANDLE_ERROR(cudaMalloc(&gpuK1, k1 * sizeof(T))); | |
245 | + HANDLE_ERROR(cudaMemcpy(gpuK1, kernel1, k1 * sizeof(T), cudaMemcpyHostToDevice)); | |
246 | + | |
247 | + //run the GPU-based version of the algorithm | |
248 | + gpu_conv2sep<T>(gpuI0, x, y, gpuK0, k0, gpuK1, k1); | |
249 | + | |
250 | + //copy the image data from the device | |
251 | + cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); | |
252 | + | |
253 | + //free allocated memory | |
254 | + cudaFree(gpuI0); | |
255 | + } | |
256 | + | |
257 | + }; | |
258 | +}; | |
259 | + | |
260 | +#endif | |
0 | 261 | \ No newline at end of file | ... | ... |
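A minimal host-side sketch of calling the new separable convolution directly with two 1D kernels (the 3-tap averaging kernel and image size below are illustrative; any pair of odd-width kernels can be passed):

#include <vector>
#include <stim/cuda/templates/conv2sep.cuh>

int main(){
    unsigned int x = 256, y = 256;              //example image dimensions
    std::vector<float> img(x * y, 1.0f);        //row-major image stored on the CPU

    float k[3] = {1.0f/3, 1.0f/3, 1.0f/3};      //3-tap averaging kernel (odd width, symmetric radius)

    //convolve along x with k, then along y with k; the result overwrites img
    stim::cuda::cpu_conv2sep<float>(img.data(), x, y, k, 3, k, 3);

    return 0;
}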
1 | +#ifndef STIM_CUDA_GAUSSIAN_BLUR_H | |
2 | +#define STIM_CUDA_GAUSSIAN_BLUR_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/cudatools.h> | |
7 | +#include <stim/cuda/sharedmem.cuh> | |
8 | +#include <stim/cuda/templates/conv2sep.cuh> //GPU-based separable convolution algorithm | |
9 | + | |
10 | +#define pi 3.14159 | |
11 | + | |
12 | +namespace stim{ | |
13 | + namespace cuda{ | |
14 | + | |
15 | + template<typename T> | |
16 | + void gen_gaussian(T* out, T sigma, unsigned int width){ | |
17 | + | |
18 | + //fill the kernel with a gaussian | |
19 | + for(unsigned int xi = 0; xi < width; xi++){ | |
20 | + | |
21 | + float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian | |
22 | + float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) ); | |
23 | + out[xi] = g; | |
24 | + } | |
25 | + | |
26 | + } | |
27 | + | |
28 | + template<typename T> | |
29 | + void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){ | |
30 | + | |
31 | + //allocate space for the kernel | |
32 | + unsigned int kwidth = sigma * 8 + 1; | |
33 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | |
34 | + | |
35 | + //fill the kernel with a gaussian | |
36 | + gen_gaussian(kernel0, sigma, kwidth); | |
37 | + | |
38 | + //copy the kernel to the GPU | |
39 | + T* gpuKernel0; | |
40 | + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); | |
41 | + | |
42 | + //perform the gaussian blur as a separable convolution | |
43 | + stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth); | |
44 | + | |
45 | + HANDLE_ERROR(cudaFree(gpuKernel0)); | |
46 | + | |
47 | + } | |
48 | + | |
49 | + template<typename T> | |
50 | + void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ | |
51 | + | |
52 | + //allocate space for the kernel | |
53 | + unsigned int kwidth = sigma * 8 + 1; | |
54 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | |
55 | + | |
56 | + //fill the kernel with a gaussian | |
57 | + gen_gaussian(kernel0, sigma, kwidth); | |
58 | + | |
59 | + //copy the kernel to the GPU | |
60 | + T* gpuKernel0; | |
61 | + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); | |
62 | + | |
63 | + //perform the gaussian blur as a separable convolution | |
64 | + stim::cuda::gpu_conv2sep<float>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth); | |
65 | + | |
66 | + HANDLE_ERROR(cudaFree(gpuKernel0)); | |
67 | + | |
68 | + } | |
69 | + | |
70 | + /// Applies a Gaussian blur to a 2D image stored on the CPU | |
71 | + template<typename T> | |
72 | + void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ | |
73 | + | |
74 | + //allocate space for the kernel | |
75 | + unsigned int kwidth = sigma * 8 + 1; | |
76 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | |
77 | + | |
78 | + //fill the kernel with a gaussian | |
79 | + gen_gaussian(kernel0, sigma, kwidth); | |
80 | + | |
81 | + //perform the gaussian blur as a separable convolution | |
82 | + stim::cuda::cpu_conv2sep<float>(image, x, y, kernel0, kwidth, kernel0, kwidth); | |
83 | + | |
84 | + } | |
85 | + | |
86 | + }; | |
87 | +}; | |
88 | + | |
89 | +#endif | |
0 | 90 | \ No newline at end of file | ... | ... |
stim/cuda/gradient.cuh renamed to stim/cuda/templates/gradient.cuh