Commit 5cc0976c6028f692e7663810b4504bc34e9c8f10

Authored by David Mayerich
1 parent aa1bc80d

added separable convolution, organized the stim/cuda directory

stim/cuda/arraymath.cuh
 #ifndef STIM_CUDA_ARRAYMATH_H
 #define STIM_CUDA_ARRAYMATH_H

-#include <stim/cuda/array_add.cuh>
-#include <stim/cuda/array_multiply.cuh>
-#include <stim/cuda/array_abs.cuh>
-#include <stim/cuda/array_cart2polar.cuh>
-#include <stim/cuda/gaussian_blur.cuh>
-#include <stim/cuda/conv2.cuh>
+#include <stim/cuda/arraymath/array_add.cuh>
+#include <stim/cuda/arraymath/array_multiply.cuh>
+#include <stim/cuda/arraymath/array_abs.cuh>
+#include <stim/cuda/arraymath/array_cart2polar.cuh>

 namespace stim{
 namespace cuda{
stim/cuda/array_abs.cuh renamed to stim/cuda/arraymath/array_abs.cuh
stim/cuda/array_add.cuh renamed to stim/cuda/arraymath/array_add.cuh
stim/cuda/array_cart2polar.cuh renamed to stim/cuda/arraymath/array_cart2polar.cuh
stim/cuda/array_multiply.cuh renamed to stim/cuda/arraymath/array_multiply.cuh
stim/cuda/cudatools.h 0 → 100644
+#include <stim/cuda/cudatools/callable.h>
+#include <stim/cuda/cudatools/devices.h>
+#include <stim/cuda/cudatools/error.h>
+#include <stim/cuda/cudatools/threads.h>
+#include <stim/cuda/cudatools/timer.h>
\ No newline at end of file
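Usage sketch (illustrative, not part of this commit): the new cudatools.h umbrella header is all a caller needs to include to reach the relocated device, error, thread, and timer utilities. It assumes the stim headers are on the include path, nvcc as the compiler, and that stim::maxThreadsPerBlock() (the same query used by conv2sep.cuh below) is declared by one of the headers it pulls in.

#include <stim/cuda/cudatools.h>
#include <cstdio>

int main(){
	//query the current CUDA device through the consolidated cudatools headers
	int max_threads = stim::maxThreadsPerBlock();
	printf("max threads per block: %d\n", max_threads);
	return 0;
}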
stim/cuda/callable.h renamed to stim/cuda/cudatools/callable.h
stim/cuda/devices.h renamed to stim/cuda/cudatools/devices.h
stim/cuda/error.h renamed to stim/cuda/cudatools/error.h
stim/cuda/glbind.h renamed to stim/cuda/cudatools/glbind.h
stim/cuda/threads.h renamed to stim/cuda/cudatools/threads.h
1 -#include "cuda_runtime.h" 1 +#include "cuda_runtime.h"
2 #include "device_launch_parameters.h" 2 #include "device_launch_parameters.h"
3 -#include "../cuda/callable.h" 3 +#include <stim/cuda/cudatools/callable.h>
4 4
5 #ifndef CUDA_THREADS_H 5 #ifndef CUDA_THREADS_H
6 #define CUDA_THREADS_H 6 #define CUDA_THREADS_H
stim/cuda/timer.h renamed to stim/cuda/cudatools/timer.h
-#ifndef STIM_CUDA_TIMER
-#define STIM_CUDA_TIMER
-
-static cudaEvent_t tStartEvent;
-static cudaEvent_t tStopEvent;
-
-namespace stim{
-
-/// These functions calculate the time between GPU functions in milliseconds
-static void gpuStartTimer()
-{
- //set up timing events
- cudaEventCreate(&tStartEvent);
- cudaEventCreate(&tStopEvent);
- cudaEventRecord(tStartEvent, 0);
-}
-
-static float gpuStopTimer()
-{
- cudaEventRecord(tStopEvent, 0);
- cudaEventSynchronize(tStopEvent);
- float elapsedTime;
- cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);
- cudaEventDestroy(tStartEvent);
- cudaEventDestroy(tStopEvent);
- return elapsedTime;
-}
-
-} //end namespace stim
-
+#ifndef STIM_CUDA_TIMER
+#define STIM_CUDA_TIMER
+
+static cudaEvent_t tStartEvent;
+static cudaEvent_t tStopEvent;
+
+namespace stim{
+
+/// These functions calculate the time between GPU functions in milliseconds
+static void gpuStartTimer()
+{
+ //set up timing events
+ cudaEventCreate(&tStartEvent);
+ cudaEventCreate(&tStopEvent);
+ cudaEventRecord(tStartEvent, 0);
+}
+
+static float gpuStopTimer()
+{
+ cudaEventRecord(tStopEvent, 0);
+ cudaEventSynchronize(tStopEvent);
+ float elapsedTime;
+ cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);
+ cudaEventDestroy(tStartEvent);
+ cudaEventDestroy(tStopEvent);
+ return elapsedTime;
+}
+
+} //end namespace stim
+
 #endif
\ No newline at end of file
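Usage sketch (illustrative, not from this commit): the relocated timer can bracket any sequence of GPU calls; gpuStopTimer() synchronizes on the stop event and returns the elapsed time in milliseconds. Assumes nvcc and a valid CUDA device; the memset is just placeholder work.

#include <cuda_runtime.h>
#include <stim/cuda/cudatools/timer.h>
#include <cstdio>

int main(){
	float* gpuBuffer;
	cudaMalloc(&gpuBuffer, 1024 * 1024 * sizeof(float));	//scratch buffer on the device

	stim::gpuStartTimer();									//record the start event
	cudaMemset(gpuBuffer, 0, 1024 * 1024 * sizeof(float));	//GPU work to be timed
	float ms = stim::gpuStopTimer();						//synchronize and read back milliseconds

	printf("elapsed: %f ms\n", ms);
	cudaFree(gpuBuffer);
	return 0;
}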
stim/cuda/down_sample.cuh
@@ -89,7 +89,7 @@ namespace stim{
 gpu_down_sample<T>(gpuI, gpuI0, resize, x, y);

 //copy the image data to the GPU
-cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice);
+cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);

 cudaFree(gpuI0);
 cudeFree(gpuI);
stim/cuda/gaussian_blur.cuh
@@ -3,9 +3,9 @@

 #include <iostream>
 #include <cuda.h>
-#include <stim/cuda/devices.h>
-#include <stim/cuda/timer.h>
+#include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
+#include <stim/cuda/templates/conv2sep.cuh> //GPU-based separable convolution algorithm

 #define pi 3.14159

@@ -13,228 +13,74 @@ namespace stim{
 namespace cuda{

 template<typename T>
- __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
+ void gen_gaussian(T* out, T sigma, unsigned int width){

- //generate a pointer to shared memory (size will be specified as a kernel parameter)
- extern __shared__ T s[];
+ //fill the kernel with a gaussian
+ for(unsigned int xi = 0; xi < width; xi++){

- int kr = sigma * 4; //calculate the kernel radius
-
- //get a pointer to the gaussian in memory
- T* g = (T*)&s[blockDim.x + 2 * kr];
-
- //calculate the start point for this block
- int bxi = blockIdx.x * blockDim.x;
- int byi = blockIdx.y;
-
- //copy the portion of the image necessary for this block to shared memory
- stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
-
- //calculate the thread index and block index
- int ti = threadIdx.x;
-
- //calculate the spatial coordinate for this thread
- int xi = bxi + ti;
-
- //pre-compute the gaussian values for each kernel point
- T a = 1.0 / (sigma * sqrt(2 * pi));
- T c = - 1.0 / (2*sigma*sigma);
- int ki;
-
- //use the first 2kr+1 threads to evaluate a gaussian and store the result
- if(ti <= 2* kr+1){
- ki = ti - kr;
- g[ti] = a * exp((ki*ki) * c);
+ float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian
+ float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) );
+ out[xi] = g;
 }

- //make sure that all writing to shared memory is done before continuing
- __syncthreads();
-
- //if the current pixel is outside of the image
- if(bxi + ti > x || byi > y)
- return;
-
-
-
- //calculate the coordinates of the current thread in shared memory
- int si = ti + kr;
-
- T sum = 0; //running weighted sum across the kernel
-
-
- //for each element of the kernel
- for(int ki = -kr; ki <= kr; ki++){
- sum += g[ki + kr] * s[si + ki];
- }
-
- //calculate the 1D image index for this thread
- unsigned int i = byi * x + xi;
-
- //output the result to global memory
- out[i] = sum;
 }

 template<typename T>
- __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
-
- //generate a pointer to shared memory (size will be specified as a kernel parameter)
- extern __shared__ T s[];
-
- int kr = sigma * 4; //calculate the kernel radius
+ void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){

- //get a pointer to the gaussian in memory
- T* g = (T*)&s[blockDim.y + 2 * kr];
+ //allocate space for the kernel
+ unsigned int kwidth = sigma * 8 + 1;
+ float* kernel0 = (float*) malloc( kwidth * sizeof(float) );

- //calculate the start point for this block
- int bxi = blockIdx.x;
- int byi = blockIdx.y * blockDim.y;
+ //fill the kernel with a gaussian
+ gen_gaussian(kernel0, sigma, kwidth);

- //copy the portion of the image necessary for this block to shared memory
- stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
+ //copy the kernel to the GPU
+ T* gpuKernel0;
+ HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));

- //calculate the thread index and block index
- int ti = threadIdx.y;
+ //perform the gaussian blur as a separable convolution
+ stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth);

- //calculate the spatial coordinate for this thread
- int yi = byi + ti;
+ HANDLE_ERROR(cudaFree(gpuKernel0));

- //pre-compute the gaussian values for each kernel point
- T a = 1.0 / (sigma * sqrt(2 * pi));
- T c = - 1.0 / (2*sigma*sigma);
- int ki;
-
- //use the first 2kr+1 threads to evaluate a gaussian and store the result
- if(ti <= 2* kr+1){
- ki = ti - kr;
- g[ti] = a * exp((ki*ki) * c);
- }
-
- //make sure that all writing to shared memory is done before continuing
- __syncthreads();
-
- //if the current pixel is outside of the image
- if(bxi >= x || yi >= y)
- return;
-
-
-
- //calculate the coordinates of the current thread in shared memory
- int si = ti + kr;
-
- T sum = 0; //running weighted sum across the kernel
-
-
- //for each element of the kernel
- for(int ki = -kr; ki <= kr; ki++){
- sum += g[ki + kr] * s[si + ki];
- }
-
- //calculate the 1D image index for this thread
- unsigned int i = yi * x + bxi;
-
- //output the result to global memory
- out[i] = sum;
 }

- /// Applies a Gaussian blur to a 2D image stored on the GPU
 template<typename T>
- void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
+ void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){

- //get the number of pixels in the image
- unsigned int pixels = x * y;
- unsigned int bytes = sizeof(T) * pixels;
+ //allocate space for the kernel
+ unsigned int kwidth = sigma * 8 + 1;
+ float* kernel0 = (float*) malloc( kwidth * sizeof(float) );

- // Allocate CUDA array in device memory
-
- //define a channel descriptor for a single 32-bit channel
- cudaChannelFormatDesc channelDesc =
- cudaCreateChannelDesc(32, 0, 0, 0,
- cudaChannelFormatKindFloat);
- cudaArray* cuArray; //declare the cuda array
- cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array
-
- // Copy the image data from global memory to the array
- cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
- cudaMemcpyDeviceToDevice);
-
- // Specify texture
- struct cudaResourceDesc resDesc; //create a resource descriptor
- memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero
- resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type
- resDesc.res.array.array = cuArray; //add a pointer to the cuda array
-
- // Specify texture object parameters
- struct cudaTextureDesc texDesc; //create a texture descriptor
- memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero
- texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges)
- texDesc.addressMode[1] = cudaAddressModeWrap;
- texDesc.filterMode = cudaFilterModePoint; //use linear filtering
- texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats)
- texDesc.normalizedCoords = 0; //not using normalized coordinates
-
- // Create texture object
- cudaTextureObject_t texObj = 0;
- cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+ //fill the kernel with a gaussian
+ gen_gaussian(kernel0, sigma, kwidth);

-
- //get the maximum number of threads per block for the CUDA device
- int max_threads = stim::maxThreadsPerBlock();
- dim3 threads(max_threads, 1);
-
- //calculate the number of blocks
- dim3 blocks(x / threads.x + 1, y);
-
- //calculate the shared memory used in the kernel
- unsigned int pixel_bytes = max_threads * 4; //bytes devoted to pixel data being processed
- unsigned int apron_bytes = sigma * 8 * 4; //bytes devoted to pixels outside the window
- unsigned int gaussian_bytes = (sigma * 8 + 1) * 4; //bytes devoted to memory used to store the pre-computed Gaussian window
- unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes shared memory used
-
- //blur the image along the x-axis
- gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
-
- // Copy the x-blurred data from global memory to the texture
- cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
- cudaMemcpyDeviceToDevice);
-
- //transpose the block and thread dimensions
- threads.x = 1;
- threads.y = max_threads;
- blocks.x = x;
- blocks.y = y / threads.y + 1;
-
- //blur the image along the y-axis
- gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
+ //copy the kernel to the GPU
+ T* gpuKernel0;
+ HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
+
+ //perform the gaussian blur as a separable convolution
+ stim::cuda::gpu_conv2sep<float>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth);

- //free allocated memory
- cudaFree(cuArray);
+ HANDLE_ERROR(cudaFree(gpuKernel0));

 }

 /// Applies a Gaussian blur to a 2D image stored on the CPU
 template<typename T>
- void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
+ void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){

- //get the number of pixels in the image
- unsigned int pixels = x * y;
- unsigned int bytes = sizeof(T) * pixels;
+ //allocate space for the kernel
+ unsigned int kwidth = sigma * 8 + 1;
+ float* kernel0 = (float*) malloc( kwidth * sizeof(float) );

- //allocate space on the GPU
- T* gpuI0;
- cudaMalloc(&gpuI0, bytes);
-
-
- //copy the image data to the GPU
- cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
-
- //run the GPU-based version of the algorithm
- gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y);
+ //fill the kernel with a gaussian
+ gen_gaussian(kernel0, sigma, kwidth);

- //copy the image data from the device
- cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
-
- //free allocated memory
- cudaFree(gpuI0);
+ //perform the gaussian blur as a separable convolution
+ stim::cuda::cpu_conv2sep<float>(image, x, y, kernel0, kwidth, kernel0, kwidth);
+
 }

 };
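Usage sketch (illustrative, not from this commit): the renamed cpu_gaussian_blur2() entry point blurs a single-channel float image in place, forwarding the work to the separable convolution introduced here. Assumes the stim headers are on the include path and compilation with nvcc; the image contents and size are placeholders, real image I/O is omitted.

#include <stim/cuda/gaussian_blur.cuh>
#include <vector>

int main(){
	unsigned int w = 512, h = 512;				//image dimensions (placeholder)
	std::vector<float> img(w * h, 1.0f);		//single-channel image data (placeholder)

	float sigma = 3.0f;							//standard deviation of the blur, in pixels
	stim::cuda::cpu_gaussian_blur2<float>(img.data(), sigma, w, h);	//blur in place via conv2sep

	return 0;
}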
stim/cuda/conv2.cuh renamed to stim/cuda/templates/conv2.cuh
stim/cuda/templates/conv2sep.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_CONV2SEP_H
  2 +#define STIM_CUDA_CONV2SEP_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/cudatools/devices.h>
  7 +#include <stim/cuda/cudatools/timer.h>
  8 +#include <stim/cuda/sharedmem.cuh>
  9 +#include <stim/cuda/cudatools/error.h>
  10 +
  11 +#define pi 3.14159
  12 +
  13 +namespace stim{
  14 + namespace cuda{
  15 +
  16 + template<typename T>
  17 + __global__ void conv2sep_0(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
  18 + T* kernel0, unsigned int k0){
  19 +
  20 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  21 + extern __shared__ T s[];
  22 +
  23 + int kr = k0/2; //calculate the kernel radius
  24 +
  25 + //get a pointer to the gaussian in memory
  26 + T* g = (T*)&s[blockDim.x + 2 * kr];
  27 +
  28 + //calculate the start point for this block
  29 + int bxi = blockIdx.x * blockDim.x;
  30 + int byi = blockIdx.y;
  31 +
  32 + //copy the portion of the image necessary for this block to shared memory
  33 + stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
  34 +
  35 + //calculate the thread index
  36 + int ti = threadIdx.x;
  37 +
  38 + //calculate the spatial coordinate for this thread
  39 + int xi = bxi + ti;
  40 + int yi = byi;
  41 +
  42 +
  43 + //use the first 2kr+1 threads to transfer the kernel to shared memory
  44 + if(ti < k0){
  45 + g[ti] = kernel0[ti];
  46 + }
  47 +
  48 + //make sure that all writing to shared memory is done before continuing
  49 + __syncthreads();
  50 +
  51 + //if the current pixel is outside of the image
  52 + if(xi > x || yi > y)
  53 + return;
  54 +
  55 +
  56 + //calculate the coordinates of the current thread in shared memory
  57 + int si = ti + kr;
  58 +
  59 + T sum = 0; //running weighted sum across the kernel
  60 +
  61 +
  62 + //for each element of the kernel
  63 + for(int ki = -kr; ki <= kr; ki++){
  64 + sum += g[ki + kr] * s[si + ki];
  65 + }
  66 +
  67 + //calculate the 1D image index for this thread
  68 + unsigned int i = byi * x + xi;
  69 +
  70 + //output the result to global memory
  71 + out[i] = sum;
  72 + }
  73 +
  74 + template<typename T>
  75 + __global__ void conv2sep_1(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
  76 + T* kernel0, unsigned int k0){
  77 +
  78 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  79 + extern __shared__ T s[];
  80 +
  81 + int kr = k0/2; //calculate the kernel radius
  82 +
  83 + //get a pointer to the gaussian in memory
  84 + T* g = (T*)&s[blockDim.y + 2 * kr];
  85 +
  86 + //calculate the start point for this block
  87 + int bxi = blockIdx.x;
  88 + int byi = blockIdx.y * blockDim.y;
  89 +
  90 + //copy the portion of the image necessary for this block to shared memory
  91 + stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
  92 +
  93 + //calculate the thread index
  94 + int ti = threadIdx.y;
  95 +
  96 + //calculate the spatial coordinate for this thread
  97 + int xi = bxi;
  98 + int yi = byi + ti;
  99 +
  100 +
  101 + //use the first 2kr+1 threads to transfer the kernel to shared memory
  102 + if(ti < k0){
  103 + g[ti] = kernel0[ti];
  104 + }
  105 +
  106 + //make sure that all writing to shared memory is done before continuing
  107 + __syncthreads();
  108 +
  109 + //if the current pixel is outside of the image
  110 + if(xi > x || yi > y)
  111 + return;
  112 +
  113 +
  114 + //calculate the coordinates of the current thread in shared memory
  115 + int si = ti + kr;
  116 +
  117 + T sum = 0; //running weighted sum across the kernel
  118 +
  119 +
  120 + //for each element of the kernel
  121 + for(int ki = -kr; ki <= kr; ki++){
  122 + sum += g[ki + kr] * s[si + ki];
  123 + }
  124 +
  125 + //calculate the 1D image index for this thread
  126 + unsigned int i = yi * x + xi;
  127 +
  128 + //output the result to global memory
  129 + out[i] = sum;
  130 + }
  131 +
  132 + template<typename T>
  133 + void tex_conv2sep(T* out, unsigned int x, unsigned int y,
  134 + cudaTextureObject_t texObj, cudaArray* cuArray,
  135 + T* kernel0, unsigned int k0,
  136 + T* kernel1, unsigned int k1){
  137 +
  138 + //get the maximum number of threads per block for the CUDA device
  139 + int max_threads = stim::maxThreadsPerBlock();
  140 + dim3 threads(max_threads, 1);
  141 +
  142 + //calculate the number of blocks
  143 + dim3 blocks(x / threads.x + 1, y);
  144 +
  145 + //calculate the shared memory used in the kernel
  146 + unsigned int pixel_bytes = max_threads * sizeof(T); //bytes devoted to pixel data being processed
  147 + unsigned int apron_bytes = k0/2 * sizeof(T); //bytes devoted to the apron on each side of the window
  148 + unsigned int gaussian_bytes = k0 * sizeof(T); //bytes devoted to memory used to store the pre-computed Gaussian window
  149 + unsigned int shared_bytes = pixel_bytes + 2 * apron_bytes + gaussian_bytes; //total number of bytes shared memory used
  150 +
  151 + //blur the image along the x-axis
  152 + conv2sep_0<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel0, k0);
  153 +
  154 + // Copy the x-blurred data from global memory to the texture
  155 + cudaMemcpyToArray(cuArray, 0, 0, out, x * y * sizeof(T),
  156 + cudaMemcpyDeviceToDevice);
  157 +
  158 + //transpose the block and thread dimensions
  159 + threads.x = 1;
  160 + threads.y = max_threads;
  161 + blocks.x = x;
  162 + blocks.y = y / threads.y + 1;
  163 +
  164 + //blur the image along the y-axis
  165 + conv2sep_1<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel1, k1);
  166 +
  167 + }
  168 +
  169 + template<typename T>
  170 + void gpu_conv2sep(T* image, unsigned int x, unsigned int y,
  171 + T* kernel0, unsigned int k0,
  172 + T* kernel1, unsigned int k1){
  173 +
  174 + //get the number of pixels in the image
  175 + unsigned int pixels = x * y;
  176 + unsigned int bytes = sizeof(T) * pixels;
  177 +
  178 + // Allocate CUDA array in device memory
  179 +
  180 + //define a channel descriptor for a single 32-bit channel
  181 + cudaChannelFormatDesc channelDesc =
  182 + cudaCreateChannelDesc(32, 0, 0, 0,
  183 + cudaChannelFormatKindFloat);
  184 + cudaArray* cuArray; //declare the cuda array
  185 + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array
  186 +
  187 + // Copy the image data from global memory to the array
  188 + cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
  189 + cudaMemcpyDeviceToDevice);
  190 +
  191 + // Specify texture
  192 + struct cudaResourceDesc resDesc; //create a resource descriptor
  193 + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero
  194 + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type
  195 + resDesc.res.array.array = cuArray; //add a pointer to the cuda array
  196 +
  197 + // Specify texture object parameters
  198 + struct cudaTextureDesc texDesc; //create a texture descriptor
  199 + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero
  200 + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges)
  201 + texDesc.addressMode[1] = cudaAddressModeWrap;
  202 + texDesc.filterMode = cudaFilterModePoint; //use linear filtering
  203 + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats)
  204 + texDesc.normalizedCoords = 0; //not using normalized coordinates
  205 +
  206 + // Create texture object
  207 + cudaTextureObject_t texObj = 0;
  208 + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
  209 +
  210 + //call the texture version of the separable convolution function
  211 + tex_conv2sep(image, x, y, texObj, cuArray, kernel0, k0, kernel1, k1);
  212 +
  213 + //free allocated memory
  214 + cudaFree(cuArray);
  215 +
  216 + }
  217 +
  218 + /// Applies a Gaussian blur to a 2D image stored on the CPU
  219 + template<typename T>
  220 + void cpu_conv2sep(T* image, unsigned int x, unsigned int y,
  221 + T* kernel0, unsigned int k0,
  222 + T* kernel1, unsigned int k1){
  223 +
  224 + //get the number of pixels in the image
  225 + unsigned int pixels = x * y;
  226 + unsigned int bytes = sizeof(T) * pixels;
  227 +
  228 + //---------Allocate Image---------
  229 + //allocate space on the GPU for the image
  230 + T* gpuI0;
  231 + HANDLE_ERROR(cudaMalloc(&gpuI0, bytes));
  232 +
  233 + //copy the image data to the GPU
  234 + HANDLE_ERROR(cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice));
  235 +
  236 + //---------Allocate Kernel--------
  237 + //allocate and copy the 0 (x) kernel
  238 + T* gpuK0;
  239 + HANDLE_ERROR(cudaMalloc(&gpuK0, k0 * sizeof(T)));
  240 + HANDLE_ERROR(cudaMemcpy(gpuK0, kernel0, k0 * sizeof(T), cudaMemcpyHostToDevice));
  241 +
  242 + //allocate and copy the 1 (y) kernel
  243 + T* gpuK1;
  244 + HANDLE_ERROR(cudaMalloc(&gpuK1, k1 * sizeof(T)));
  245 + HANDLE_ERROR(cudaMemcpy(gpuK1, kernel1, k1 * sizeof(T), cudaMemcpyHostToDevice));
  246 +
  247 + //run the GPU-based version of the algorithm
  248 + gpu_conv2sep<T>(gpuI0, x, y, gpuK0, k0, gpuK1, k1);
  249 +
  250 + //copy the image data from the device
  251 + cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
  252 +
  253 + //free allocated memory
  254 + cudaFree(gpuI0);
  255 + }
  256 +
  257 + };
  258 +};
  259 +
  260 +#endif
\ No newline at end of file
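Usage sketch (illustrative, not from this commit): cpu_conv2sep() takes a row kernel and a column kernel, so any separable filter can be applied, not just a Gaussian; kernel0/k0 is applied along x and kernel1/k1 along y. The 5-tap box kernel and impulse image below are placeholders; the stim headers are assumed to be on the include path and the file compiled with nvcc.

#include <stim/cuda/templates/conv2sep.cuh>
#include <vector>

int main(){
	unsigned int w = 256, h = 256;
	std::vector<float> img(w * h, 0.0f);		//placeholder image
	img[(h/2) * w + (w/2)] = 1.0f;				//single impulse at the center

	std::vector<float> box(5, 1.0f / 5.0f);		//5-tap box kernel, applied along x and then y

	//convolve in place on the GPU: the result is the impulse spread into a 5x5 plateau
	stim::cuda::cpu_conv2sep<float>(img.data(), w, h, box.data(), 5, box.data(), 5);

	return 0;
}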
stim/cuda/templates/gaussian_blur.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
  2 +#define STIM_CUDA_GAUSSIAN_BLUR_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include <stim/cuda/templates/conv2sep.cuh> //GPU-based separable convolution algorithm
  9 +
  10 +#define pi 3.14159
  11 +
  12 +namespace stim{
  13 + namespace cuda{
  14 +
  15 + template<typename T>
  16 + void gen_gaussian(T* out, T sigma, unsigned int width){
  17 +
  18 + //fill the kernel with a gaussian
  19 + for(unsigned int xi = 0; xi < width; xi++){
  20 +
  21 + float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian
  22 + float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) );
  23 + out[xi] = g;
  24 + }
  25 +
  26 + }
  27 +
  28 + template<typename T>
  29 + void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){
  30 +
  31 + //allocate space for the kernel
  32 + unsigned int kwidth = sigma * 8 + 1;
  33 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
  34 +
  35 + //fill the kernel with a gaussian
  36 + gen_gaussian(kernel0, sigma, kwidth);
  37 +
  38 + //copy the kernel to the GPU
  39 + T* gpuKernel0;
  40 + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
  41 +
  42 + //perform the gaussian blur as a separable convolution
  43 + stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth);
  44 +
  45 + HANDLE_ERROR(cudaFree(gpuKernel0));
  46 +
  47 + }
  48 +
  49 + template<typename T>
  50 + void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
  51 +
  52 + //allocate space for the kernel
  53 + unsigned int kwidth = sigma * 8 + 1;
  54 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
  55 +
  56 + //fill the kernel with a gaussian
  57 + gen_gaussian(kernel0, sigma, kwidth);
  58 +
  59 + //copy the kernel to the GPU
  60 + T* gpuKernel0;
  61 + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
  62 +
  63 + //perform the gaussian blur as a separable convolution
  64 + stim::cuda::gpu_conv2sep<float>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth);
  65 +
  66 + HANDLE_ERROR(cudaFree(gpuKernel0));
  67 +
  68 + }
  69 +
  70 + /// Applies a Gaussian blur to a 2D image stored on the CPU
  71 + template<typename T>
  72 + void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
  73 +
  74 + //allocate space for the kernel
  75 + unsigned int kwidth = sigma * 8 + 1;
  76 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
  77 +
  78 + //fill the kernel with a gaussian
  79 + gen_gaussian(kernel0, sigma, kwidth);
  80 +
  81 + //perform the gaussian blur as a separable convolution
  82 + stim::cuda::cpu_conv2sep<float>(image, x, y, kernel0, kwidth, kernel0, kwidth);
  83 +
  84 + }
  85 +
  86 + };
  87 +};
  88 +
  89 +#endif
\ No newline at end of file
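Sanity-check sketch (illustrative, not from this commit): gen_gaussian() runs on the host, so the kernel it produces can be inspected directly. With kwidth = 8*sigma + 1 the samples cover roughly plus or minus four sigma and should sum to approximately 1; the sigma value below is arbitrary and nvcc is assumed.

#include <stim/cuda/templates/gaussian_blur.cuh>
#include <vector>
#include <cstdio>

int main(){
	float sigma = 2.0f;
	unsigned int kwidth = sigma * 8 + 1;				//17 taps for sigma = 2

	std::vector<float> k(kwidth);
	stim::cuda::gen_gaussian(k.data(), sigma, kwidth);	//fill the kernel on the host

	float sum = 0.0f;
	for(unsigned int i = 0; i < kwidth; i++)
		sum += k[i];
	printf("%u taps, sum = %f\n", kwidth, sum);			//expect a sum close to 1

	return 0;
}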
stim/cuda/gradient.cuh renamed to stim/cuda/templates/gradient.cuh