added separable convolution, organized the stim/cuda directory

David Mayerich
1 parent aa1bc80d
Showing 18 changed files with 433 additions and 235 deletions Show diff stats
stim/cuda/arraymath.cuh
stim/cuda/array_abs.cuh → stim/cuda/arraymath/array_abs.cuh
stim/cuda/array_add.cuh → stim/cuda/arraymath/array_add.cuh
stim/cuda/array_cart2polar.cuh → stim/cuda/arraymath/array_cart2polar.cuh
stim/cuda/array_multiply.cuh → stim/cuda/arraymath/array_multiply.cuh
stim/cuda/cudatools.h
stim/cuda/callable.h → stim/cuda/cudatools/callable.h
stim/cuda/devices.h → stim/cuda/cudatools/devices.h
stim/cuda/error.h → stim/cuda/cudatools/error.h
stim/cuda/glbind.h → stim/cuda/cudatools/glbind.h
stim/cuda/threads.h → stim/cuda/cudatools/threads.h
stim/cuda/timer.h → stim/cuda/cudatools/timer.h
stim/cuda/down_sample.cuh
stim/cuda/gaussian_blur.cuh
stim/cuda/conv2.cuh → stim/cuda/templates/conv2.cuh
stim/cuda/templates/conv2sep.cuh
stim/cuda/templates/gaussian_blur.cuh
stim/cuda/gradient.cuh → stim/cuda/templates/gradient.cuh
 #ifndef STIM_CUDA_ARRAYMATH_H
 #define STIM_CUDA_ARRAYMATH_H
-#include <stim/cuda/array_add.cuh>
-#include <stim/cuda/array_multiply.cuh>
-#include <stim/cuda/array_abs.cuh>
-#include <stim/cuda/array_cart2polar.cuh>
-#include <stim/cuda/gaussian_blur.cuh>
-#include <stim/cuda/conv2.cuh>
+#include <stim/cuda/arraymath/array_add.cuh>
+#include <stim/cuda/arraymath/array_multiply.cuh>
+#include <stim/cuda/arraymath/array_abs.cuh>
+#include <stim/cuda/arraymath/array_cart2polar.cuh>
 namespace stim{
 	namespace cuda{
+#include <stim/cuda/cudatools/callable.h>
+#include <stim/cuda/cudatools/devices.h>
+#include <stim/cuda/cudatools/error.h>
+#include <stim/cuda/cudatools/threads.h>
+#include <stim/cuda/cudatools/timer.h>
 \ No newline at end of file
-#include "cuda_runtime.h"
+#include "cuda_runtime.h"
 #include "device_launch_parameters.h"
-#include "../cuda/callable.h"
+#include <stim/cuda/cudatools/callable.h>
 #ifndef CUDA_THREADS_H
 #define CUDA_THREADS_H
-#ifndef STIM_CUDA_TIMER
-#define STIM_CUDA_TIMER
-
-static cudaEvent_t tStartEvent;
-static cudaEvent_t tStopEvent;
-
-namespace stim{
-
-/// These functions calculate the time between GPU functions in milliseconds
-static void gpuStartTimer()
-{
-	//set up timing events
-	cudaEventCreate(&tStartEvent);
-	cudaEventCreate(&tStopEvent);
-	cudaEventRecord(tStartEvent, 0);
-}
-
-static float gpuStopTimer()
-{
-	cudaEventRecord(tStopEvent, 0);
-	cudaEventSynchronize(tStopEvent);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);
-	cudaEventDestroy(tStartEvent);
-	cudaEventDestroy(tStopEvent);
-	return elapsedTime;
-}
-
-}	//end namespace stim
-
+#ifndef STIM_CUDA_TIMER
+#define STIM_CUDA_TIMER
+
+static cudaEvent_t tStartEvent;
+static cudaEvent_t tStopEvent;
+
+namespace stim{
+
+/// These functions calculate the time between GPU functions in milliseconds
+/// Starts the GPU timer. Pair every call with one call to gpuStopTimer().
+static void gpuStartTimer()
+{
+	// Create the start/stop event pair stored in the file-level statics.
+	cudaEventCreate(&tStartEvent);
+	cudaEventCreate(&tStopEvent);
+
+	// Record the start event on stream 0 to mark the beginning of the timed region.
+	const cudaStream_t stream = 0;
+	cudaEventRecord(tStartEvent, stream);
+}
+
+/// Stops the GPU timer started by gpuStartTimer().
+///
+/// Records the stop event on stream 0, waits for it to complete, and destroys both events.
+/// @return	elapsed time between the start and stop events in milliseconds
+///			(0 if the elapsed time could not be queried)
+static float gpuStopTimer()
+{
+	cudaEventRecord(tStopEvent, 0);
+
+	// Block the host until the stop event has actually been reached on the device.
+	cudaEventSynchronize(tStopEvent);
+
+	// Initialized so that a failed query returns a defined value instead of garbage.
+	float elapsedTime = 0.0f;
+	cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);
+
+	// The events are re-created by the next gpuStartTimer() call.
+	cudaEventDestroy(tStartEvent);
+	cudaEventDestroy(tStopEvent);
+	return elapsedTime;
+}
+
+}	//end namespace stim
+
 #endif
 \ No newline at end of file
@@ -89,7 +89,7 @@ namespace stim{
 			gpu_down_sample<T>(gpuI, gpuI0, resize, x, y);
 			//copy the image data to the GPU
-			cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice);
+			cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
 			cudaFree(gpuI0);
 			cudeFree(gpuI);
@@ -3,9 +3,9 @@
 #include <iostream>
 #include <cuda.h>
-#include <stim/cuda/devices.h>
-#include <stim/cuda/timer.h>
+#include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
+#include <stim/cuda/templates/conv2sep.cuh>		//GPU-based separable convolution algorithm
 #define pi	3.14159
@@ -13,228 +13,74 @@ namespace stim{
 	namespace cuda{
 		template<typename T>
-		__global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
+		void gen_gaussian(T* out, T sigma, unsigned int width){
-			//generate a pointer to shared memory (size will be specified as a kernel parameter)
-			extern __shared__ T s[];
+			//fill the kernel with a gaussian
+			for(unsigned int xi = 0; xi < width; xi++){
-			int kr = sigma * 4;				//calculate the kernel radius
-
-			//get a pointer to the gaussian in memory
-			T* g = (T*)&s[blockDim.x + 2 * kr];
-
-			//calculate the start point for this block
-			int bxi = blockIdx.x * blockDim.x;
-			int byi = blockIdx.y;
-
-			//copy the portion of the image necessary for this block to shared memory
-			stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
-
-			//calculate the thread index and block index
-			int ti = threadIdx.x;
-
-			//calculate the spatial coordinate for this thread
-			int xi = bxi + ti;
-
-			//pre-compute the gaussian values for each kernel point
-			T a = 1.0 / (sigma * sqrt(2 * pi));
-			T c = - 1.0 / (2*sigma*sigma);
-			int ki;
-
-			//use the first 2kr+1 threads to evaluate a gaussian and store the result
-			if(ti <= 2* kr+1){
-				ki = ti - kr;
-				g[ti] = a * exp((ki*ki) * c);
+				float x = (float)xi - (float)(width/2);	//calculate the x position of the gaussian
+				float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) );
+				out[xi] = g;
 			}
-			//make sure that all writing to shared memory is done before continuing
-			__syncthreads();
-			
-			//if the current pixel is outside of the image
-			if(bxi + ti > x || byi > y)
-				return;
-
-			
-
-			//calculate the coordinates of the current thread in shared memory
-			int si = ti + kr;
-
-			T sum = 0;		//running weighted sum across the kernel
-
-			
-			//for each element of the kernel
-			for(int ki = -kr; ki <= kr; ki++){
-				sum += g[ki + kr] * s[si + ki];
-			}
-
-			//calculate the 1D image index for this thread
-			unsigned int i = byi * x + xi;
-
-			//output the result to global memory
-			out[i] = sum;
 		}
 		template<typename T>
-		__global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
-
-			//generate a pointer to shared memory (size will be specified as a kernel parameter)
-			extern __shared__ T s[];
-
-			int kr = sigma * 4;				//calculate the kernel radius
+		void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){
-			//get a pointer to the gaussian in memory
-			T* g = (T*)&s[blockDim.y + 2 * kr];
+			//allocate space for the kernel
+			unsigned int kwidth = sigma * 8 + 1;
+			float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
-			//calculate the start point for this block
-			int bxi = blockIdx.x;
-			int byi = blockIdx.y * blockDim.y;
+			//fill the kernel with a gaussian
+			gen_gaussian(kernel0, sigma, kwidth);
-			//copy the portion of the image necessary for this block to shared memory
-			stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
+			//copy the kernel to the GPU
+			T* gpuKernel0;
+			HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
-			//calculate the thread index and block index
-			int ti = threadIdx.y;
+			//perform the gaussian blur as a separable convolution
+			stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth);
-			//calculate the spatial coordinate for this thread
-			int yi = byi + ti;
+			HANDLE_ERROR(cudaFree(gpuKernel0));
-			//pre-compute the gaussian values for each kernel point
-			T a = 1.0 / (sigma * sqrt(2 * pi));
-			T c = - 1.0 / (2*sigma*sigma);
-			int ki;
-
-			//use the first 2kr+1 threads to evaluate a gaussian and store the result
-			if(ti <= 2* kr+1){
-				ki = ti - kr;
-				g[ti] = a * exp((ki*ki) * c);
-			}
-
-			//make sure that all writing to shared memory is done before continuing
-			__syncthreads();
-			
-			//if the current pixel is outside of the image
-			if(bxi >= x || yi >= y)
-				return;
-
-			
-
-			//calculate the coordinates of the current thread in shared memory
-			int si = ti + kr;
-
-			T sum = 0;		//running weighted sum across the kernel
-
-			
-			//for each element of the kernel
-			for(int ki = -kr; ki <= kr; ki++){
-				sum += g[ki + kr] * s[si + ki];
-			}
-
-			//calculate the 1D image index for this thread
-			unsigned int i = yi * x + bxi;
-
-			//output the result to global memory
-			out[i] = sum;
 		}
-		/// Applies a Gaussian blur to a 2D image stored on the GPU
 		template<typename T>
-		void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
+		void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
-			//get the number of pixels in the image
-			unsigned int pixels = x * y;
-			unsigned int bytes = sizeof(T) * pixels;
+			//allocate space for the kernel
+			unsigned int kwidth = sigma * 8 + 1;
+			float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
-			// Allocate CUDA array in device memory
-			
-			//define a channel descriptor for a single 32-bit channel
-			cudaChannelFormatDesc channelDesc =
-					   cudaCreateChannelDesc(32, 0, 0, 0,
-											 cudaChannelFormatKindFloat);
-			cudaArray* cuArray;												//declare the cuda array
-			cudaMallocArray(&cuArray, &channelDesc, x, y);			//allocate the cuda array
-
-			// Copy the image data from global memory to the array
-			cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
-							  cudaMemcpyDeviceToDevice);
-
-			// Specify texture
-			struct cudaResourceDesc resDesc;				//create a resource descriptor
-			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
-			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
-			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
-
-			// Specify texture object parameters
-			struct cudaTextureDesc texDesc;							//create a texture descriptor
-			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
-			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
-			texDesc.addressMode[1]   = cudaAddressModeWrap;
-			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
-			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
-			texDesc.normalizedCoords = 0;							//not using normalized coordinates
-
-			// Create texture object
-			cudaTextureObject_t texObj = 0;
-			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+			//fill the kernel with a gaussian
+			gen_gaussian(kernel0, sigma, kwidth);
-			
-			//get the maximum number of threads per block for the CUDA device
-			int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-
-			//calculate the number of blocks
-			dim3 blocks(x / threads.x + 1, y);
-
-			//calculate the shared memory used in the kernel
-			unsigned int pixel_bytes = max_threads * 4;									//bytes devoted to pixel data being processed
-			unsigned int apron_bytes = sigma * 8 * 4;									//bytes devoted to pixels outside the window
-			unsigned int gaussian_bytes = (sigma * 8 + 1) * 4;							//bytes devoted to memory used to store the pre-computed Gaussian window
-			unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes;		//total number of bytes shared memory used
-
-			//blur the image along the x-axis
-			gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
-
-			// Copy the x-blurred data from global memory to the texture
-			cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
-							  cudaMemcpyDeviceToDevice);
-
-			//transpose the block and thread dimensions
-			threads.x = 1;
-			threads.y = max_threads;
-			blocks.x = x;
-			blocks.y = y / threads.y + 1;
-			
-			//blur the image along the y-axis
-			gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
+			//copy the kernel to the GPU
+			T* gpuKernel0;
+			HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
+
+			//perform the gaussian blur as a separable convolution
+			stim::cuda::gpu_conv2sep<float>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth);
-			//free allocated memory
-			cudaFree(cuArray);
+			HANDLE_ERROR(cudaFree(gpuKernel0));
 		}
 		/// Applies a Gaussian blur to a 2D image stored on the CPU
 		template<typename T>
-		void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
+		void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
-			//get the number of pixels in the image
-			unsigned int pixels = x * y;
-			unsigned int bytes = sizeof(T) * pixels;
+			//allocate space for the kernel
+			unsigned int kwidth = sigma * 8 + 1;
+			float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
-			//allocate space on the GPU
-			T* gpuI0;
-			cudaMalloc(&gpuI0, bytes);
-			
-			
-			//copy the image data to the GPU
-			cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
-
-			//run the GPU-based version of the algorithm
-			gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y);
+			//fill the kernel with a gaussian
+			gen_gaussian(kernel0, sigma, kwidth);
-			//copy the image data from the device
-			cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
-
-			//free allocated memory
-			cudaFree(gpuI0);
+			//perform the gaussian blur as a separable convolution
+			stim::cuda::cpu_conv2sep<float>(image, x, y, kernel0, kwidth, kernel0, kwidth);
+			
 		}
 	};
+#ifndef STIM_CUDA_CONV2SEP_H
+#define STIM_CUDA_CONV2SEP_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools/devices.h>
+#include <stim/cuda/cudatools/timer.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <stim/cuda/cudatools/error.h>
+
+#define pi	3.14159
+
+namespace stim{
+	namespace cuda{
+
+		/// Convolves every row of a 2D image with a 1D kernel (x pass of a separable convolution).
+		///
+		/// Launch layout (as used by tex_conv2sep): blockDim = (N, 1) and gridDim = (x / N + 1, y),
+		/// so each block processes N consecutive pixels of a single row.
+		/// Dynamic shared memory required: (blockDim.x + 2 * (k0 / 2) + k0) * sizeof(T) bytes.
+		///
+		/// @param out		output image in global memory (x * y elements, indexed as row * x + column)
+		/// @param in		texture object the input image is sampled from
+		/// @param x		image width in pixels
+		/// @param y		image height in pixels
+		/// @param kernel0	kernel coefficients (must be readable from device code)
+		/// @param k0		number of coefficients in kernel0
+		template<typename T>
+		__global__ void conv2sep_0(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
+										   T* kernel0, unsigned int k0){
+
+			//dynamically-sized shared memory: image samples first, kernel coefficients after them
+			extern __shared__ T s[];
+
+			int kr = k0/2;				//kernel radius = width of the apron on each side of the block
+
+			//the kernel coefficients are stored right after the (blockDim.x + 2*kr) image samples
+			T* g = (T*)&s[blockDim.x + 2 * kr];
+
+			//coordinates of the first pixel processed by this block
+			int bxi = blockIdx.x * blockDim.x;
+			int byi = blockIdx.y;
+
+			//copy the portion of the image necessary for this block (including both aprons) to shared memory
+			stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
+
+			//thread index within the block and the pixel it is responsible for
+			int ti = threadIdx.x;
+			int xi = bxi + ti;
+			int yi = byi;
+
+			//transfer the kernel to shared memory; the strided loop also handles kernels
+			//	that have more coefficients than the block has threads
+			for(unsigned int k = ti; k < k0; k += blockDim.x){
+				g[k] = kernel0[k];
+			}
+
+			//make sure that all writing to shared memory is done before continuing
+			__syncthreads();
+
+			//valid pixels are 0..x-1 and 0..y-1, so threads at or beyond the edge have nothing to write
+			if((unsigned int)xi >= x || (unsigned int)yi >= y)
+				return;
+
+			T sum = 0;		//running weighted sum across the kernel
+
+			//s[ti] is the left-most sample under this thread's kernel window, so the window is
+			//	s[ti] .. s[ti + k0 - 1]; iterating over exactly k0 taps never reads past the kernel
+			for(unsigned int k = 0; k < k0; k++){
+				sum += g[k] * s[ti + k];
+			}
+
+			//calculate the 1D image index for this thread
+			unsigned int i = byi * x + xi;
+
+			//output the result to global memory
+			out[i] = sum;
+		}
+
+		/// Convolves every column of a 2D image with a 1D kernel (y pass of a separable convolution).
+		///
+		/// Launch layout (as used by tex_conv2sep): blockDim = (1, N) and gridDim = (x, y / N + 1),
+		/// so each block processes N consecutive pixels of a single column.
+		/// Dynamic shared memory required: (blockDim.y + 2 * (k0 / 2) + k0) * sizeof(T) bytes.
+		///
+		/// @param out		output image in global memory (x * y elements, indexed as row * x + column)
+		/// @param in		texture object the input image is sampled from
+		/// @param x		image width in pixels
+		/// @param y		image height in pixels
+		/// @param kernel0	kernel coefficients (must be readable from device code)
+		/// @param k0		number of coefficients in kernel0
+		template<typename T>
+		__global__ void conv2sep_1(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
+										   T* kernel0, unsigned int k0){
+
+			//dynamically-sized shared memory: image samples first, kernel coefficients after them
+			extern __shared__ T s[];
+
+			int kr = k0/2;				//kernel radius = height of the apron above and below the block
+
+			//the kernel coefficients are stored right after the (blockDim.y + 2*kr) image samples
+			T* g = (T*)&s[blockDim.y + 2 * kr];
+
+			//coordinates of the first pixel processed by this block
+			int bxi = blockIdx.x;
+			int byi = blockIdx.y * blockDim.y;
+
+			//copy the portion of the image necessary for this block (including both aprons) to shared memory
+			stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
+
+			//thread index within the block and the pixel it is responsible for
+			int ti = threadIdx.y;
+			int xi = bxi;
+			int yi = byi + ti;
+
+			//transfer the kernel to shared memory; the strided loop also handles kernels
+			//	that have more coefficients than the block has threads
+			for(unsigned int k = ti; k < k0; k += blockDim.y){
+				g[k] = kernel0[k];
+			}
+
+			//make sure that all writing to shared memory is done before continuing
+			__syncthreads();
+
+			//valid pixels are 0..x-1 and 0..y-1, so threads at or beyond the edge have nothing to write
+			if((unsigned int)xi >= x || (unsigned int)yi >= y)
+				return;
+
+			T sum = 0;		//running weighted sum across the kernel
+
+			//s[ti] is the top-most sample under this thread's kernel window, so the window is
+			//	s[ti] .. s[ti + k0 - 1]; iterating over exactly k0 taps never reads past the kernel
+			for(unsigned int k = 0; k < k0; k++){
+				sum += g[k] * s[ti + k];
+			}
+
+			//calculate the 1D image index for this thread
+			unsigned int i = yi * x + xi;
+
+			//output the result to global memory
+			out[i] = sum;
+		}
+
+		/// Performs a separable 2D convolution of an image that is read through a texture object.
+		///
+		/// The x pass (conv2sep_0) reads the texture and writes to out; the result is then copied
+		/// back into cuArray so that the y pass (conv2sep_1) reads the x-filtered image.
+		/// The contents of cuArray are therefore overwritten.
+		///
+		/// @param out		device buffer of x * y elements that receives the result
+		/// @param x		image width in pixels
+		/// @param y		image height in pixels
+		/// @param texObj	texture object used to read the input (see gpu_conv2sep for how it is
+		///					created from cuArray)
+		/// @param cuArray	CUDA array that receives the intermediate x-pass result
+		/// @param kernel0	device pointer to the kernel applied along x
+		/// @param k0		number of coefficients in kernel0
+		/// @param kernel1	device pointer to the kernel applied along y
+		/// @param k1		number of coefficients in kernel1
+		template<typename T>
+		void tex_conv2sep(T* out, unsigned int x, unsigned int y,
+						  cudaTextureObject_t texObj, cudaArray* cuArray,
+						  T* kernel0, unsigned int k0,
+						  T* kernel1, unsigned int k1){
+
+			//get the maximum number of threads per block for the CUDA device
+			int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+
+			//calculate the number of blocks
+			dim3 blocks(x / threads.x + 1, y);
+
+			//shared memory for the x pass: block pixels + an apron of k0/2 on each side + the kernel itself
+			size_t shared_bytes0 = (max_threads + 2 * (k0/2) + k0) * sizeof(T);
+
+			//filter the image along the x-axis
+			conv2sep_0<T> <<< blocks, threads, shared_bytes0 >>>(out, texObj, x, y, kernel0, k0);
+			HANDLE_ERROR(cudaGetLastError());			//catch launch-configuration errors
+
+			// Copy the x-filtered data from global memory to the array that is read by the y pass
+			HANDLE_ERROR(cudaMemcpyToArray(cuArray, 0, 0, out, x * y * sizeof(T),
+							  cudaMemcpyDeviceToDevice));
+
+			//transpose the block and thread dimensions
+			threads.x = 1;
+			threads.y = max_threads;
+			blocks.x = x;
+			blocks.y = y / threads.y + 1;
+
+			//the y pass uses kernel1, so its shared memory must be sized from k1 (not k0)
+			size_t shared_bytes1 = (max_threads + 2 * (k1/2) + k1) * sizeof(T);
+
+			//filter the image along the y-axis
+			conv2sep_1<T> <<< blocks, threads, shared_bytes1 >>>(out, texObj, x, y, kernel1, k1);
+			HANDLE_ERROR(cudaGetLastError());			//catch launch-configuration errors
+
+		}
+
+		/// Performs an in-place separable 2D convolution of an image stored in device memory.
+		///
+		/// The image is copied into a temporary CUDA array and read through a texture object
+		/// with wrap-around addressing; both are released before returning.
+		/// NOTE(review): the channel descriptor is fixed to one 32-bit float channel, so this
+		/// presumably only works for T = float - confirm before using other types.
+		///
+		/// @param image	device pointer to x * y elements; overwritten with the result
+		/// @param x		image width in pixels
+		/// @param y		image height in pixels
+		/// @param kernel0	device pointer to the kernel applied along x
+		/// @param k0		number of coefficients in kernel0
+		/// @param kernel1	device pointer to the kernel applied along y
+		/// @param k1		number of coefficients in kernel1
+		template<typename T>
+		void gpu_conv2sep(T* image, unsigned int x, unsigned int y,
+						  T* kernel0, unsigned int k0,
+						  T* kernel1, unsigned int k1){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			unsigned int bytes = sizeof(T) * pixels;
+
+			//define a channel descriptor for a single 32-bit channel
+			cudaChannelFormatDesc channelDesc =
+					   cudaCreateChannelDesc(32, 0, 0, 0,
+											 cudaChannelFormatKindFloat);
+			cudaArray* cuArray;												//declare the cuda array
+			HANDLE_ERROR(cudaMallocArray(&cuArray, &channelDesc, x, y));	//allocate the cuda array
+
+			// Copy the image data from global memory to the array
+			HANDLE_ERROR(cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
+							  cudaMemcpyDeviceToDevice));
+
+			// Specify texture
+			struct cudaResourceDesc resDesc;				//create a resource descriptor
+			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
+			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
+			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
+
+			// Specify texture object parameters
+			struct cudaTextureDesc texDesc;							//create a texture descriptor
+			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
+			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
+			texDesc.addressMode[1]   = cudaAddressModeWrap;
+			texDesc.filterMode       = cudaFilterModePoint;			//use point sampling (no interpolation)
+			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
+			texDesc.normalizedCoords = 0;							//not using normalized coordinates
+
+			// Create texture object
+			cudaTextureObject_t texObj = 0;
+			HANDLE_ERROR(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));
+
+			//call the texture version of the separable convolution function
+			tex_conv2sep(image, x, y, texObj, cuArray, kernel0, k0, kernel1, k1);
+
+			//wait for the kernels to finish (and surface any execution error) before the
+			//	texture and array they read from are released
+			HANDLE_ERROR(cudaDeviceSynchronize());
+
+			//free allocated memory: the texture object first, then the array backing it
+			//	(a cudaArray must be released with cudaFreeArray, not cudaFree)
+			HANDLE_ERROR(cudaDestroyTextureObject(texObj));
+			HANDLE_ERROR(cudaFreeArray(cuArray));
+
+		}
+
+		/// Performs an in-place separable 2D convolution of an image stored in host memory.
+		///
+		/// The image and both kernels are uploaded to the GPU, gpu_conv2sep is run on the
+		/// device copies, and the result is downloaded back into image.
+		///
+		/// @param image	host pointer to x * y elements; overwritten with the result
+		/// @param x		image width in pixels
+		/// @param y		image height in pixels
+		/// @param kernel0	host pointer to the kernel applied along x
+		/// @param k0		number of coefficients in kernel0
+		/// @param kernel1	host pointer to the kernel applied along y
+		/// @param k1		number of coefficients in kernel1
+		template<typename T>
+		void cpu_conv2sep(T* image, unsigned int x, unsigned int y, 
+						  T* kernel0, unsigned int k0,
+						  T* kernel1, unsigned int k1){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			unsigned int bytes = sizeof(T) * pixels;
+
+			//---------Allocate Image---------
+			//allocate space on the GPU for the image
+			T* gpuI0;
+			HANDLE_ERROR(cudaMalloc(&gpuI0, bytes));
+
+			//copy the image data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice));
+
+			//---------Allocate Kernel--------
+			//allocate and copy the 0 (x) kernel
+			T* gpuK0;
+			HANDLE_ERROR(cudaMalloc(&gpuK0, k0 * sizeof(T)));
+			HANDLE_ERROR(cudaMemcpy(gpuK0, kernel0, k0 * sizeof(T), cudaMemcpyHostToDevice));
+
+			//allocate and copy the 1 (y) kernel
+			T* gpuK1;
+			HANDLE_ERROR(cudaMalloc(&gpuK1, k1 * sizeof(T)));
+			HANDLE_ERROR(cudaMemcpy(gpuK1, kernel1, k1 * sizeof(T), cudaMemcpyHostToDevice));
+
+			//run the GPU-based version of the algorithm
+			gpu_conv2sep<T>(gpuI0, x, y, gpuK0, k0, gpuK1, k1);
+
+			//copy the image data from the device
+			HANDLE_ERROR(cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory (the image and both device-side kernels)
+			HANDLE_ERROR(cudaFree(gpuI0));
+			HANDLE_ERROR(cudaFree(gpuK0));
+			HANDLE_ERROR(cudaFree(gpuK1));
+		}
+		
+	};
+};
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
+#define STIM_CUDA_GAUSSIAN_BLUR_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <stim/cuda/templates/conv2sep.cuh>		//GPU-based separable convolution algorithm
+
+#define pi	3.14159
+
+namespace stim{
+	namespace cuda{
+
+		/// Fills an array with samples of a 1D Gaussian centered on element width/2.
+		///
+		/// Element i receives 1 / (sigma * sqrt(2 * 3.14159)) * exp(-d*d / (2 * sigma * sigma)),
+		/// where d = i - width/2 (integer division).
+		///
+		/// @param out		host array with room for width elements
+		/// @param sigma	standard deviation of the Gaussian, in samples
+		/// @param width	number of samples to generate
+		template<typename T>
+		void gen_gaussian(T* out, T sigma, unsigned int width){
+
+			//terms that are identical for every sample
+			const double scale = 1.0 / (sigma * sqrt(2 * 3.14159));		//amplitude of the Gaussian
+			const T two_variance = 2*sigma*sigma;						//denominator of the exponent
+			const float center = (float)(width/2);						//index of the peak
+
+			unsigned int idx = 0;
+			while(idx < width){
+
+				float offset = (float)idx - center;		//signed distance from the peak
+				float value = scale * exp( - (offset*offset) / two_variance );
+				out[idx] = value;
+
+				idx++;
+			}
+
+		}
+
+		/// Applies a 2D Gaussian blur to an image that is read through a texture object.
+		///
+		/// A 1D Gaussian kernel is generated on the host, uploaded to the GPU and applied
+		/// along both axes with tex_conv2sep.
+		///
+		/// @param out		device buffer of x * y elements that receives the blurred image
+		/// @param sigma	standard deviation of the Gaussian, in pixels
+		/// @param x		image width in pixels
+		/// @param y		image height in pixels
+		/// @param texObj	texture object the input image is read from
+		/// @param cuArray	CUDA array passed through to tex_conv2sep (overwritten there)
+		template<typename T>
+		void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){
+
+			//the kernel spans 4 sigma on each side of the center; computing it as 2*r+1
+			//	guarantees an odd width, which the convolution needs for a symmetric window
+			unsigned int kwidth = 2 * (unsigned int)(sigma * 4) + 1;
+			size_t kbytes = kwidth * sizeof(T);
+
+			//allocate space for the kernel (typed as T so that it matches the device copy)
+			T* kernel0 = (T*) malloc( kbytes );
+			if(kernel0 == NULL){
+				std::cerr<<"tex_gaussian_blur2: unable to allocate the Gaussian kernel"<<std::endl;
+				return;
+			}
+
+			//fill the kernel with a gaussian
+			gen_gaussian(kernel0, sigma, kwidth);
+
+			//allocate device memory for the kernel and copy it to the GPU
+			T* gpuKernel0;
+			HANDLE_ERROR(cudaMalloc(&gpuKernel0, kbytes));
+			HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kbytes, cudaMemcpyHostToDevice));
+
+			//the host copy is no longer needed once it has been uploaded
+			free(kernel0);
+
+			//perform the gaussian blur as a separable convolution
+			stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth);
+
+			HANDLE_ERROR(cudaFree(gpuKernel0));
+
+		}
+
+		/// Applies an in-place 2D Gaussian blur to an image stored in device memory.
+		///
+		/// A 1D Gaussian kernel is generated on the host, uploaded to the GPU and applied
+		/// along both axes with gpu_conv2sep.
+		///
+		/// @param image	device pointer to x * y elements; overwritten with the blurred image
+		/// @param sigma	standard deviation of the Gaussian, in pixels
+		/// @param x		image width in pixels
+		/// @param y		image height in pixels
+		template<typename T>
+		void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
+
+			//the kernel spans 4 sigma on each side of the center; computing it as 2*r+1
+			//	guarantees an odd width, which the convolution needs for a symmetric window
+			unsigned int kwidth = 2 * (unsigned int)(sigma * 4) + 1;
+			size_t kbytes = kwidth * sizeof(T);
+
+			//allocate space for the kernel (typed as T so that it matches the device copy)
+			T* kernel0 = (T*) malloc( kbytes );
+			if(kernel0 == NULL){
+				std::cerr<<"gpu_gaussian_blur2: unable to allocate the Gaussian kernel"<<std::endl;
+				return;
+			}
+
+			//fill the kernel with a gaussian
+			gen_gaussian(kernel0, sigma, kwidth);
+
+			//allocate device memory for the kernel and copy it to the GPU
+			T* gpuKernel0;
+			HANDLE_ERROR(cudaMalloc(&gpuKernel0, kbytes));
+			HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kbytes, cudaMemcpyHostToDevice));
+
+			//the host copy is no longer needed once it has been uploaded
+			free(kernel0);
+
+			//perform the gaussian blur as a separable convolution (same element type as the image)
+			stim::cuda::gpu_conv2sep<T>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth);
+
+			HANDLE_ERROR(cudaFree(gpuKernel0));
+
+		}
+
+		/// Applies an in-place 2D Gaussian blur to an image stored in host memory.
+		///
+		/// A 1D Gaussian kernel is generated on the host and applied along both axes with
+		/// cpu_conv2sep, which handles all transfers to and from the GPU.
+		///
+		/// @param image	host pointer to x * y elements; overwritten with the blurred image
+		/// @param sigma	standard deviation of the Gaussian, in pixels
+		/// @param x		image width in pixels
+		/// @param y		image height in pixels
+		template<typename T>
+		void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
+
+			//the kernel spans 4 sigma on each side of the center; computing it as 2*r+1
+			//	guarantees an odd width, which the convolution needs for a symmetric window
+			unsigned int kwidth = 2 * (unsigned int)(sigma * 4) + 1;
+
+			//allocate space for the kernel (typed as T so that it matches the image)
+			T* kernel0 = (T*) malloc( kwidth * sizeof(T) );
+			if(kernel0 == NULL){
+				std::cerr<<"cpu_gaussian_blur2: unable to allocate the Gaussian kernel"<<std::endl;
+				return;
+			}
+
+			//fill the kernel with a gaussian
+			gen_gaussian(kernel0, sigma, kwidth);
+
+			//perform the gaussian blur as a separable convolution (same element type as the image)
+			stim::cuda::cpu_conv2sep<T>(image, x, y, kernel0, kwidth, kernel0, kwidth);
+
+			//release the host kernel
+			free(kernel0);
+
+		}
+		
+	};
+};
+
+#endif
 \ No newline at end of file