Commit 5cc0976c6028f692e7663810b4504bc34e9c8f10

Authored by David Mayerich
1 parent aa1bc80d

added separable convolution, organized the stim/cuda directory

stim/cuda/arraymath.cuh
1 1 #ifndef STIM_CUDA_ARRAYMATH_H
2 2 #define STIM_CUDA_ARRAYMATH_H
3 3  
4   -#include <stim/cuda/array_add.cuh>
5   -#include <stim/cuda/array_multiply.cuh>
6   -#include <stim/cuda/array_abs.cuh>
7   -#include <stim/cuda/array_cart2polar.cuh>
8   -#include <stim/cuda/gaussian_blur.cuh>
9   -#include <stim/cuda/conv2.cuh>
  4 +#include <stim/cuda/arraymath/array_add.cuh>
  5 +#include <stim/cuda/arraymath/array_multiply.cuh>
  6 +#include <stim/cuda/arraymath/array_abs.cuh>
  7 +#include <stim/cuda/arraymath/array_cart2polar.cuh>
10 8  
11 9 namespace stim{
12 10 namespace cuda{
... ...
stim/cuda/array_abs.cuh renamed to stim/cuda/arraymath/array_abs.cuh
stim/cuda/array_add.cuh renamed to stim/cuda/arraymath/array_add.cuh
stim/cuda/array_cart2polar.cuh renamed to stim/cuda/arraymath/array_cart2polar.cuh
stim/cuda/array_multiply.cuh renamed to stim/cuda/arraymath/array_multiply.cuh
stim/cuda/cudatools.h 0 → 100644
  1 +#include <stim/cuda/cudatools/callable.h>
  2 +#include <stim/cuda/cudatools/devices.h>
  3 +#include <stim/cuda/cudatools/error.h>
  4 +#include <stim/cuda/cudatools/threads.h>
  5 +#include <stim/cuda/cudatools/timer.h>
0 6 \ No newline at end of file
... ...
stim/cuda/callable.h renamed to stim/cuda/cudatools/callable.h
stim/cuda/devices.h renamed to stim/cuda/cudatools/devices.h
stim/cuda/error.h renamed to stim/cuda/cudatools/error.h
stim/cuda/glbind.h renamed to stim/cuda/cudatools/glbind.h
stim/cuda/threads.h renamed to stim/cuda/cudatools/threads.h
1   -#include "cuda_runtime.h"
  1 +#include "cuda_runtime.h"
2 2 #include "device_launch_parameters.h"
3   -#include "../cuda/callable.h"
  3 +#include <stim/cuda/cudatools/callable.h>
4 4  
5 5 #ifndef CUDA_THREADS_H
6 6 #define CUDA_THREADS_H
... ...
stim/cuda/timer.h renamed to stim/cuda/cudatools/timer.h
1   -#ifndef STIM_CUDA_TIMER
2   -#define STIM_CUDA_TIMER
3   -
4   -static cudaEvent_t tStartEvent;
5   -static cudaEvent_t tStopEvent;
6   -
7   -namespace stim{
8   -
9   -/// These functions calculate the time between GPU functions in milliseconds
10   -static void gpuStartTimer()
11   -{
12   - //set up timing events
13   - cudaEventCreate(&tStartEvent);
14   - cudaEventCreate(&tStopEvent);
15   - cudaEventRecord(tStartEvent, 0);
16   -}
17   -
18   -static float gpuStopTimer()
19   -{
20   - cudaEventRecord(tStopEvent, 0);
21   - cudaEventSynchronize(tStopEvent);
22   - float elapsedTime;
23   - cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);
24   - cudaEventDestroy(tStartEvent);
25   - cudaEventDestroy(tStopEvent);
26   - return elapsedTime;
27   -}
28   -
29   -} //end namespace stim
30   -
  1 +#ifndef STIM_CUDA_TIMER
  2 +#define STIM_CUDA_TIMER
  3 +
  4 +static cudaEvent_t tStartEvent;
  5 +static cudaEvent_t tStopEvent;
  6 +
  7 +namespace stim{
  8 +
  9 +/// These functions calculate the time between GPU functions in milliseconds
  10 +static void gpuStartTimer()
  11 +{
  12 + //set up timing events
  13 + cudaEventCreate(&tStartEvent);
  14 + cudaEventCreate(&tStopEvent);
  15 + cudaEventRecord(tStartEvent, 0);
  16 +}
  17 +
  18 +static float gpuStopTimer()
  19 +{
  20 + cudaEventRecord(tStopEvent, 0);
  21 + cudaEventSynchronize(tStopEvent);
  22 + float elapsedTime;
  23 + cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);
  24 + cudaEventDestroy(tStartEvent);
  25 + cudaEventDestroy(tStopEvent);
  26 + return elapsedTime;
  27 +}
  28 +
  29 +} //end namespace stim
  30 +
31 31 #endif
32 32 \ No newline at end of file
... ...
stim/cuda/down_sample.cuh
... ... @@ -89,7 +89,7 @@ namespace stim{
89 89 gpu_down_sample<T>(gpuI, gpuI0, resize, x, y);
90 90  
91 91 //copy the image data to the GPU
92   - cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice);
  92 + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
93 93  
94 94 cudaFree(gpuI0);
95 95 cudeFree(gpuI);
... ...
stim/cuda/gaussian_blur.cuh
... ... @@ -3,9 +3,9 @@
3 3  
4 4 #include <iostream>
5 5 #include <cuda.h>
6   -#include <stim/cuda/devices.h>
7   -#include <stim/cuda/timer.h>
  6 +#include <stim/cuda/cudatools.h>
8 7 #include <stim/cuda/sharedmem.cuh>
  8 +#include <stim/cuda/templates/conv2sep.cuh> //GPU-based separable convolution algorithm
9 9  
10 10 #define pi 3.14159
11 11  
... ... @@ -13,228 +13,74 @@ namespace stim{
13 13 namespace cuda{
14 14  
15 15 template<typename T>
16   - __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
  16 + void gen_gaussian(T* out, T sigma, unsigned int width){
17 17  
18   - //generate a pointer to shared memory (size will be specified as a kernel parameter)
19   - extern __shared__ T s[];
  18 + //fill the kernel with a gaussian
  19 + for(unsigned int xi = 0; xi < width; xi++){
20 20  
21   - int kr = sigma * 4; //calculate the kernel radius
22   -
23   - //get a pointer to the gaussian in memory
24   - T* g = (T*)&s[blockDim.x + 2 * kr];
25   -
26   - //calculate the start point for this block
27   - int bxi = blockIdx.x * blockDim.x;
28   - int byi = blockIdx.y;
29   -
30   - //copy the portion of the image necessary for this block to shared memory
31   - stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
32   -
33   - //calculate the thread index and block index
34   - int ti = threadIdx.x;
35   -
36   - //calculate the spatial coordinate for this thread
37   - int xi = bxi + ti;
38   -
39   - //pre-compute the gaussian values for each kernel point
40   - T a = 1.0 / (sigma * sqrt(2 * pi));
41   - T c = - 1.0 / (2*sigma*sigma);
42   - int ki;
43   -
44   - //use the first 2kr+1 threads to evaluate a gaussian and store the result
45   - if(ti <= 2* kr+1){
46   - ki = ti - kr;
47   - g[ti] = a * exp((ki*ki) * c);
  21 + float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian
  22 + float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) );
  23 + out[xi] = g;
48 24 }
49 25  
50   - //make sure that all writing to shared memory is done before continuing
51   - __syncthreads();
52   -
53   - //if the current pixel is outside of the image
54   - if(bxi + ti > x || byi > y)
55   - return;
56   -
57   -
58   -
59   - //calculate the coordinates of the current thread in shared memory
60   - int si = ti + kr;
61   -
62   - T sum = 0; //running weighted sum across the kernel
63   -
64   -
65   - //for each element of the kernel
66   - for(int ki = -kr; ki <= kr; ki++){
67   - sum += g[ki + kr] * s[si + ki];
68   - }
69   -
70   - //calculate the 1D image index for this thread
71   - unsigned int i = byi * x + xi;
72   -
73   - //output the result to global memory
74   - out[i] = sum;
75 26 }
76 27  
77 28 template<typename T>
78   - __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
79   -
80   - //generate a pointer to shared memory (size will be specified as a kernel parameter)
81   - extern __shared__ T s[];
82   -
83   - int kr = sigma * 4; //calculate the kernel radius
  29 + void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){
84 30  
85   - //get a pointer to the gaussian in memory
86   - T* g = (T*)&s[blockDim.y + 2 * kr];
  31 + //allocate space for the kernel
  32 + unsigned int kwidth = sigma * 8 + 1;
  33 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
87 34  
88   - //calculate the start point for this block
89   - int bxi = blockIdx.x;
90   - int byi = blockIdx.y * blockDim.y;
  35 + //fill the kernel with a gaussian
  36 + gen_gaussian(kernel0, sigma, kwidth);
91 37  
92   - //copy the portion of the image necessary for this block to shared memory
93   - stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
  38 + //copy the kernel to the GPU
  39 + T* gpuKernel0;
  40 + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
94 41  
95   - //calculate the thread index and block index
96   - int ti = threadIdx.y;
  42 + //perform the gaussian blur as a separable convolution
  43 + stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth);
97 44  
98   - //calculate the spatial coordinate for this thread
99   - int yi = byi + ti;
  45 + HANDLE_ERROR(cudaFree(gpuKernel0));
100 46  
101   - //pre-compute the gaussian values for each kernel point
102   - T a = 1.0 / (sigma * sqrt(2 * pi));
103   - T c = - 1.0 / (2*sigma*sigma);
104   - int ki;
105   -
106   - //use the first 2kr+1 threads to evaluate a gaussian and store the result
107   - if(ti <= 2* kr+1){
108   - ki = ti - kr;
109   - g[ti] = a * exp((ki*ki) * c);
110   - }
111   -
112   - //make sure that all writing to shared memory is done before continuing
113   - __syncthreads();
114   -
115   - //if the current pixel is outside of the image
116   - if(bxi >= x || yi >= y)
117   - return;
118   -
119   -
120   -
121   - //calculate the coordinates of the current thread in shared memory
122   - int si = ti + kr;
123   -
124   - T sum = 0; //running weighted sum across the kernel
125   -
126   -
127   - //for each element of the kernel
128   - for(int ki = -kr; ki <= kr; ki++){
129   - sum += g[ki + kr] * s[si + ki];
130   - }
131   -
132   - //calculate the 1D image index for this thread
133   - unsigned int i = yi * x + bxi;
134   -
135   - //output the result to global memory
136   - out[i] = sum;
137 47 }
138 48  
139   - /// Applies a Gaussian blur to a 2D image stored on the GPU
140 49 template<typename T>
141   - void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
  50 + void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
142 51  
143   - //get the number of pixels in the image
144   - unsigned int pixels = x * y;
145   - unsigned int bytes = sizeof(T) * pixels;
  52 + //allocate space for the kernel
  53 + unsigned int kwidth = sigma * 8 + 1;
  54 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
146 55  
147   - // Allocate CUDA array in device memory
148   -
149   - //define a channel descriptor for a single 32-bit channel
150   - cudaChannelFormatDesc channelDesc =
151   - cudaCreateChannelDesc(32, 0, 0, 0,
152   - cudaChannelFormatKindFloat);
153   - cudaArray* cuArray; //declare the cuda array
154   - cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array
155   -
156   - // Copy the image data from global memory to the array
157   - cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
158   - cudaMemcpyDeviceToDevice);
159   -
160   - // Specify texture
161   - struct cudaResourceDesc resDesc; //create a resource descriptor
162   - memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero
163   - resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type
164   - resDesc.res.array.array = cuArray; //add a pointer to the cuda array
165   -
166   - // Specify texture object parameters
167   - struct cudaTextureDesc texDesc; //create a texture descriptor
168   - memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero
169   - texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges)
170   - texDesc.addressMode[1] = cudaAddressModeWrap;
171   - texDesc.filterMode = cudaFilterModePoint; //use linear filtering
172   - texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats)
173   - texDesc.normalizedCoords = 0; //not using normalized coordinates
174   -
175   - // Create texture object
176   - cudaTextureObject_t texObj = 0;
177   - cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
  56 + //fill the kernel with a gaussian
  57 + gen_gaussian(kernel0, sigma, kwidth);
178 58  
179   -
180   - //get the maximum number of threads per block for the CUDA device
181   - int max_threads = stim::maxThreadsPerBlock();
182   - dim3 threads(max_threads, 1);
183   -
184   - //calculate the number of blocks
185   - dim3 blocks(x / threads.x + 1, y);
186   -
187   - //calculate the shared memory used in the kernel
188   - unsigned int pixel_bytes = max_threads * 4; //bytes devoted to pixel data being processed
189   - unsigned int apron_bytes = sigma * 8 * 4; //bytes devoted to pixels outside the window
190   - unsigned int gaussian_bytes = (sigma * 8 + 1) * 4; //bytes devoted to memory used to store the pre-computed Gaussian window
191   - unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes shared memory used
192   -
193   - //blur the image along the x-axis
194   - gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
195   -
196   - // Copy the x-blurred data from global memory to the texture
197   - cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
198   - cudaMemcpyDeviceToDevice);
199   -
200   - //transpose the block and thread dimensions
201   - threads.x = 1;
202   - threads.y = max_threads;
203   - blocks.x = x;
204   - blocks.y = y / threads.y + 1;
205   -
206   - //blur the image along the y-axis
207   - gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
  59 + //copy the kernel to the GPU
  60 + T* gpuKernel0;
  61 + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
  62 +
  63 + //perform the gaussian blur as a separable convolution
  64 + stim::cuda::gpu_conv2sep<float>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth);
208 65  
209   - //free allocated memory
210   - cudaFree(cuArray);
  66 + HANDLE_ERROR(cudaFree(gpuKernel0));
211 67  
212 68 }
213 69  
214 70 /// Applies a Gaussian blur to a 2D image stored on the CPU
215 71 template<typename T>
216   - void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
  72 + void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
217 73  
218   - //get the number of pixels in the image
219   - unsigned int pixels = x * y;
220   - unsigned int bytes = sizeof(T) * pixels;
  74 + //allocate space for the kernel
  75 + unsigned int kwidth = sigma * 8 + 1;
  76 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
221 77  
222   - //allocate space on the GPU
223   - T* gpuI0;
224   - cudaMalloc(&gpuI0, bytes);
225   -
226   -
227   - //copy the image data to the GPU
228   - cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
229   -
230   - //run the GPU-based version of the algorithm
231   - gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y);
  78 + //fill the kernel with a gaussian
  79 + gen_gaussian(kernel0, sigma, kwidth);
232 80  
233   - //copy the image data from the device
234   - cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
235   -
236   - //free allocated memory
237   - cudaFree(gpuI0);
  81 + //perform the gaussian blur as a separable convolution
  82 + stim::cuda::cpu_conv2sep<float>(image, x, y, kernel0, kwidth, kernel0, kwidth);
  83 +
238 84 }
239 85  
240 86 };
... ...
stim/cuda/conv2.cuh renamed to stim/cuda/templates/conv2.cuh
stim/cuda/templates/conv2sep.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_CONV2SEP_H
  2 +#define STIM_CUDA_CONV2SEP_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/cudatools/devices.h>
  7 +#include <stim/cuda/cudatools/timer.h>
  8 +#include <stim/cuda/sharedmem.cuh>
  9 +#include <stim/cuda/cudatools/error.h>
  10 +
  11 +#define pi 3.14159
  12 +
  13 +namespace stim{
  14 + namespace cuda{
  15 +
  16 + template<typename T>
  17 + __global__ void conv2sep_0(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
  18 + T* kernel0, unsigned int k0){
  19 +
  20 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  21 + extern __shared__ T s[];
  22 +
  23 + int kr = k0/2; //calculate the kernel radius
  24 +
  25 + //get a pointer to the gaussian in memory
  26 + T* g = (T*)&s[blockDim.x + 2 * kr];
  27 +
  28 + //calculate the start point for this block
  29 + int bxi = blockIdx.x * blockDim.x;
  30 + int byi = blockIdx.y;
  31 +
  32 + //copy the portion of the image necessary for this block to shared memory
  33 + stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
  34 +
  35 + //calculate the thread index
  36 + int ti = threadIdx.x;
  37 +
  38 + //calculate the spatial coordinate for this thread
  39 + int xi = bxi + ti;
  40 + int yi = byi;
  41 +
  42 +
  43 + //use the first 2kr+1 threads to transfer the kernel to shared memory
  44 + if(ti < k0){
  45 + g[ti] = kernel0[ti];
  46 + }
  47 +
  48 + //make sure that all writing to shared memory is done before continuing
  49 + __syncthreads();
  50 +
  51 + //if the current pixel is outside of the image
  52 + if(xi > x || yi > y)
  53 + return;
  54 +
  55 +
  56 + //calculate the coordinates of the current thread in shared memory
  57 + int si = ti + kr;
  58 +
  59 + T sum = 0; //running weighted sum across the kernel
  60 +
  61 +
  62 + //for each element of the kernel
  63 + for(int ki = -kr; ki <= kr; ki++){
  64 + sum += g[ki + kr] * s[si + ki];
  65 + }
  66 +
  67 + //calculate the 1D image index for this thread
  68 + unsigned int i = byi * x + xi;
  69 +
  70 + //output the result to global memory
  71 + out[i] = sum;
  72 + }
  73 +
  74 + template<typename T>
  75 + __global__ void conv2sep_1(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
  76 + T* kernel0, unsigned int k0){
  77 +
  78 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  79 + extern __shared__ T s[];
  80 +
  81 + int kr = k0/2; //calculate the kernel radius
  82 +
  83 + //get a pointer to the gaussian in memory
  84 + T* g = (T*)&s[blockDim.y + 2 * kr];
  85 +
  86 + //calculate the start point for this block
  87 + int bxi = blockIdx.x;
  88 + int byi = blockIdx.y * blockDim.y;
  89 +
  90 + //copy the portion of the image necessary for this block to shared memory
  91 + stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
  92 +
  93 + //calculate the thread index
  94 + int ti = threadIdx.y;
  95 +
  96 + //calculate the spatial coordinate for this thread
  97 + int xi = bxi;
  98 + int yi = byi + ti;
  99 +
  100 +
  101 + //use the first 2kr+1 threads to transfer the kernel to shared memory
  102 + if(ti < k0){
  103 + g[ti] = kernel0[ti];
  104 + }
  105 +
  106 + //make sure that all writing to shared memory is done before continuing
  107 + __syncthreads();
  108 +
  109 + //if the current pixel is outside of the image
  110 + if(xi > x || yi > y)
  111 + return;
  112 +
  113 +
  114 + //calculate the coordinates of the current thread in shared memory
  115 + int si = ti + kr;
  116 +
  117 + T sum = 0; //running weighted sum across the kernel
  118 +
  119 +
  120 + //for each element of the kernel
  121 + for(int ki = -kr; ki <= kr; ki++){
  122 + sum += g[ki + kr] * s[si + ki];
  123 + }
  124 +
  125 + //calculate the 1D image index for this thread
  126 + unsigned int i = yi * x + xi;
  127 +
  128 + //output the result to global memory
  129 + out[i] = sum;
  130 + }
  131 +
  132 + template<typename T>
  133 + void tex_conv2sep(T* out, unsigned int x, unsigned int y,
  134 + cudaTextureObject_t texObj, cudaArray* cuArray,
  135 + T* kernel0, unsigned int k0,
  136 + T* kernel1, unsigned int k1){
  137 +
  138 + //get the maximum number of threads per block for the CUDA device
  139 + int max_threads = stim::maxThreadsPerBlock();
  140 + dim3 threads(max_threads, 1);
  141 +
  142 + //calculate the number of blocks
  143 + dim3 blocks(x / threads.x + 1, y);
  144 +
  145 + //calculate the shared memory used in the kernel
  146 + unsigned int pixel_bytes = max_threads * sizeof(T); //bytes devoted to pixel data being processed
  147 + unsigned int apron_bytes = k0/2 * sizeof(T); //bytes devoted to the apron on each side of the window
  148 + unsigned int gaussian_bytes = k0 * sizeof(T); //bytes devoted to memory used to store the pre-computed Gaussian window
  149 + unsigned int shared_bytes = pixel_bytes + 2 * apron_bytes + gaussian_bytes; //total number of bytes shared memory used
  150 +
  151 + //blur the image along the x-axis
  152 + conv2sep_0<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel0, k0);
  153 +
  154 + // Copy the x-blurred data from global memory to the texture
  155 + cudaMemcpyToArray(cuArray, 0, 0, out, x * y * sizeof(T),
  156 + cudaMemcpyDeviceToDevice);
  157 +
  158 + //transpose the block and thread dimensions
  159 + threads.x = 1;
  160 + threads.y = max_threads;
  161 + blocks.x = x;
  162 + blocks.y = y / threads.y + 1;
  163 +
  164 + //blur the image along the y-axis
  165 + conv2sep_1<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel1, k1);
  166 +
  167 + }
  168 +
  169 + template<typename T>
  170 + void gpu_conv2sep(T* image, unsigned int x, unsigned int y,
  171 + T* kernel0, unsigned int k0,
  172 + T* kernel1, unsigned int k1){
  173 +
  174 + //get the number of pixels in the image
  175 + unsigned int pixels = x * y;
  176 + unsigned int bytes = sizeof(T) * pixels;
  177 +
  178 + // Allocate CUDA array in device memory
  179 +
  180 + //define a channel descriptor for a single 32-bit channel
  181 + cudaChannelFormatDesc channelDesc =
  182 + cudaCreateChannelDesc(32, 0, 0, 0,
  183 + cudaChannelFormatKindFloat);
  184 + cudaArray* cuArray; //declare the cuda array
  185 + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array
  186 +
  187 + // Copy the image data from global memory to the array
  188 + cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
  189 + cudaMemcpyDeviceToDevice);
  190 +
  191 + // Specify texture
  192 + struct cudaResourceDesc resDesc; //create a resource descriptor
  193 + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero
  194 + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type
  195 + resDesc.res.array.array = cuArray; //add a pointer to the cuda array
  196 +
  197 + // Specify texture object parameters
  198 + struct cudaTextureDesc texDesc; //create a texture descriptor
  199 + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero
  200 + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges)
  201 + texDesc.addressMode[1] = cudaAddressModeWrap;
  202 + texDesc.filterMode = cudaFilterModePoint; //use linear filtering
  203 + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats)
  204 + texDesc.normalizedCoords = 0; //not using normalized coordinates
  205 +
  206 + // Create texture object
  207 + cudaTextureObject_t texObj = 0;
  208 + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
  209 +
  210 + //call the texture version of the separable convolution function
  211 + tex_conv2sep(image, x, y, texObj, cuArray, kernel0, k0, kernel1, k1);
  212 +
  213 + //free allocated memory
  214 + cudaFree(cuArray);
  215 +
  216 + }
  217 +
  218 + /// Applies a Gaussian blur to a 2D image stored on the CPU
  219 + template<typename T>
  220 + void cpu_conv2sep(T* image, unsigned int x, unsigned int y,
  221 + T* kernel0, unsigned int k0,
  222 + T* kernel1, unsigned int k1){
  223 +
  224 + //get the number of pixels in the image
  225 + unsigned int pixels = x * y;
  226 + unsigned int bytes = sizeof(T) * pixels;
  227 +
  228 + //---------Allocate Image---------
  229 + //allocate space on the GPU for the image
  230 + T* gpuI0;
  231 + HANDLE_ERROR(cudaMalloc(&gpuI0, bytes));
  232 +
  233 + //copy the image data to the GPU
  234 + HANDLE_ERROR(cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice));
  235 +
  236 + //---------Allocate Kernel--------
  237 + //allocate and copy the 0 (x) kernel
  238 + T* gpuK0;
  239 + HANDLE_ERROR(cudaMalloc(&gpuK0, k0 * sizeof(T)));
  240 + HANDLE_ERROR(cudaMemcpy(gpuK0, kernel0, k0 * sizeof(T), cudaMemcpyHostToDevice));
  241 +
  242 + //allocate and copy the 1 (y) kernel
  243 + T* gpuK1;
  244 + HANDLE_ERROR(cudaMalloc(&gpuK1, k1 * sizeof(T)));
  245 + HANDLE_ERROR(cudaMemcpy(gpuK1, kernel1, k1 * sizeof(T), cudaMemcpyHostToDevice));
  246 +
  247 + //run the GPU-based version of the algorithm
  248 + gpu_conv2sep<T>(gpuI0, x, y, gpuK0, k0, gpuK1, k1);
  249 +
  250 + //copy the image data from the device
  251 + cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
  252 +
  253 + //free allocated memory
  254 + cudaFree(gpuI0);
  255 + }
  256 +
  257 + };
  258 +};
  259 +
  260 +#endif
0 261 \ No newline at end of file
... ...
stim/cuda/templates/gaussian_blur.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
  2 +#define STIM_CUDA_GAUSSIAN_BLUR_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include <stim/cuda/templates/conv2sep.cuh> //GPU-based separable convolution algorithm
  9 +
  10 +#define pi 3.14159
  11 +
  12 +namespace stim{
  13 + namespace cuda{
  14 +
  15 + template<typename T>
  16 + void gen_gaussian(T* out, T sigma, unsigned int width){
  17 +
  18 + //fill the kernel with a gaussian
  19 + for(unsigned int xi = 0; xi < width; xi++){
  20 +
  21 + float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian
  22 + float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) );
  23 + out[xi] = g;
  24 + }
  25 +
  26 + }
  27 +
  28 + template<typename T>
  29 + void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){
  30 +
  31 + //allocate space for the kernel
  32 + unsigned int kwidth = sigma * 8 + 1;
  33 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
  34 +
  35 + //fill the kernel with a gaussian
  36 + gen_gaussian(kernel0, sigma, kwidth);
  37 +
  38 + //copy the kernel to the GPU
  39 + T* gpuKernel0;
  40 + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
  41 +
  42 + //perform the gaussian blur as a separable convolution
  43 + stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth);
  44 +
  45 + HANDLE_ERROR(cudaFree(gpuKernel0));
  46 +
  47 + }
  48 +
  49 + template<typename T>
  50 + void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
  51 +
  52 + //allocate space for the kernel
  53 + unsigned int kwidth = sigma * 8 + 1;
  54 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
  55 +
  56 + //fill the kernel with a gaussian
  57 + gen_gaussian(kernel0, sigma, kwidth);
  58 +
  59 + //copy the kernel to the GPU
  60 + T* gpuKernel0;
  61 + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
  62 +
  63 + //perform the gaussian blur as a separable convolution
  64 + stim::cuda::gpu_conv2sep<float>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth);
  65 +
  66 + HANDLE_ERROR(cudaFree(gpuKernel0));
  67 +
  68 + }
  69 +
  70 + /// Applies a Gaussian blur to a 2D image stored on the CPU
  71 + template<typename T>
  72 + void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
  73 +
  74 + //allocate space for the kernel
  75 + unsigned int kwidth = sigma * 8 + 1;
  76 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
  77 +
  78 + //fill the kernel with a gaussian
  79 + gen_gaussian(kernel0, sigma, kwidth);
  80 +
  81 + //perform the gaussian blur as a separable convolution
  82 + stim::cuda::cpu_conv2sep<float>(image, x, y, kernel0, kwidth, kernel0, kwidth);
  83 +
  84 + }
  85 +
  86 + };
  87 +};
  88 +
  89 +#endif
0 90 \ No newline at end of file
... ...
stim/cuda/gradient.cuh renamed to stim/cuda/templates/gradient.cuh