From 5cc0976c6028f692e7663810b4504bc34e9c8f10 Mon Sep 17 00:00:00 2001
From: David
Date: Mon, 14 Sep 2015 00:10:29 -0500
Subject: [PATCH] added separable convolution, organized the stim/cuda
 directory

---
 stim/cuda/array_abs.cuh                   |  59 ------------
 stim/cuda/array_add.cuh                   |  71 --------------
 stim/cuda/array_cart2polar.cuh            |  66 -------------
 stim/cuda/array_multiply.cuh              |  64 -------------
 stim/cuda/arraymath.cuh                   |  10 +-
 stim/cuda/arraymath/array_abs.cuh         |  59 ++++++++++++
 stim/cuda/arraymath/array_add.cuh         |  71 ++++++++++++++
 stim/cuda/arraymath/array_cart2polar.cuh  |  66 +++++++++++++
 stim/cuda/arraymath/array_multiply.cuh    |  64 ++++++++++++
 stim/cuda/callable.h                      |  10 --
 stim/cuda/conv2.cuh                       | 147 -----------------------------
 stim/cuda/cudatools.h                     |   5 +
 stim/cuda/cudatools/callable.h            |  10 ++
 stim/cuda/cudatools/devices.h             |  18 ++++
 stim/cuda/cudatools/error.h               |  48 ++++++++++
 stim/cuda/cudatools/glbind.h              |  79 ++++++++++++++++
 stim/cuda/cudatools/threads.h             |  34 +++++++
 stim/cuda/cudatools/timer.h               |  31 ++++++
 stim/cuda/devices.h                       |  18 ----
 stim/cuda/down_sample.cuh                 |   2 +-
 stim/cuda/error.h                         |  48 ----------
 stim/cuda/gaussian_blur.cuh               | 238 ++++++++---------------------------------------
 stim/cuda/glbind.h                        |  79 ----------------
 stim/cuda/gradient.cuh                    | 115 -----------------------
 stim/cuda/templates/conv2.cuh             | 147 +++++++++++++++++++++++++++++
 stim/cuda/templates/conv2sep.cuh          | 260 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stim/cuda/templates/gaussian_blur.cuh     |  89 ++++++++++++++++++
 stim/cuda/templates/gradient.cuh          | 115 +++++++++++++++++++++++
 stim/cuda/threads.h                       |  34 -------
 stim/cuda/timer.h                         |  31 ------
 30 files changed, 1143 insertions(+), 945 deletions(-)
 delete mode 100644 stim/cuda/array_abs.cuh
 delete mode 100644 stim/cuda/array_add.cuh
 delete mode 100644 stim/cuda/array_cart2polar.cuh
 delete mode 100644 stim/cuda/array_multiply.cuh
 create mode
100644 stim/cuda/arraymath/array_abs.cuh create mode 100644 stim/cuda/arraymath/array_add.cuh create mode 100644 stim/cuda/arraymath/array_cart2polar.cuh create mode 100644 stim/cuda/arraymath/array_multiply.cuh delete mode 100644 stim/cuda/callable.h delete mode 100644 stim/cuda/conv2.cuh create mode 100644 stim/cuda/cudatools.h create mode 100644 stim/cuda/cudatools/callable.h create mode 100644 stim/cuda/cudatools/devices.h create mode 100644 stim/cuda/cudatools/error.h create mode 100644 stim/cuda/cudatools/glbind.h create mode 100644 stim/cuda/cudatools/threads.h create mode 100644 stim/cuda/cudatools/timer.h delete mode 100644 stim/cuda/devices.h delete mode 100644 stim/cuda/error.h delete mode 100644 stim/cuda/glbind.h delete mode 100644 stim/cuda/gradient.cuh create mode 100644 stim/cuda/templates/conv2.cuh create mode 100644 stim/cuda/templates/conv2sep.cuh create mode 100644 stim/cuda/templates/gaussian_blur.cuh create mode 100644 stim/cuda/templates/gradient.cuh delete mode 100644 stim/cuda/threads.h delete mode 100644 stim/cuda/timer.h diff --git a/stim/cuda/array_abs.cuh b/stim/cuda/array_abs.cuh deleted file mode 100644 index fc4d176..0000000 --- a/stim/cuda/array_abs.cuh +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef STIM_CUDA_ARRAY_ABS_H -#define STIM_CUDA_ARRAY_ABS_H - -namespace stim{ - namespace cuda{ - template - __global__ void cuda_abs(T* a, unsigned int N){ - - //calculate the 1D index for this thread - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if(i < N) - a[i] = abs(a[i]); - } - - - template - void gpu_abs(T* a, unsigned int N){ - - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); - - //calculate the number of blocks - int blocks = N / threads + (N%threads == 0 ? 0:1); - - //call the kernel to do the multiplication - cuda_abs <<< blocks, threads >>>(a, N); - - } - - - template - void cpu_abs(T* a, unsigned int N){ - - //calculate the number of bytes in the array - unsigned int bytes = N * sizeof(T); - - //allocate memory on the GPU for the array - T* gpuA; - HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); - - //copy the array to the GPU - HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); - - //call the GPU version of this function - gpu_abs(gpuA, N); - - //copy the array back to the CPU - HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); - - //free allocated memory - cudaFree(gpuA); - - } - - } //end namespace cuda -} //end namespace stim - -#endif \ No newline at end of file diff --git a/stim/cuda/array_add.cuh b/stim/cuda/array_add.cuh deleted file mode 100644 index a05353c..0000000 --- a/stim/cuda/array_add.cuh +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef STIM_CUDA_ARRAY_ADD_H -#define STIM_CUDA_ARRAY_ADD_H - -#include -#include -#include -#include - -namespace stim{ - namespace cuda{ - - template - __global__ void cuda_add(T* ptr1, T* ptr2, T* sum, unsigned int N){ - - //calculate the 1D index for this thread - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - if(idx < N){ - sum[idx] = ptr1[idx] + ptr2[idx]; - } - - } - - template - void gpu_add(T* ptr1, T* ptr2, T* sum, unsigned int N){ - - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); - - //calculate the number of blocks - int blocks = N / threads + (N%threads == 0 ? 
0:1); - - //call the kernel to do the multiplication - cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N); - - } - - template - void cpu_add(T* ptr1, T* ptr2, T* cpu_sum, unsigned int N){ - - //allocate memory on the GPU for the array - T* gpu_ptr1; - T* gpu_ptr2; - T* gpu_sum; - HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) ); - HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) ); - HANDLE_ERROR( cudaMalloc( &gpu_sum, N * sizeof(T) ) ); - - //copy the array to the GPU - HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) ); - HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) ); - - //call the GPU version of this function - gpu_add(gpu_ptr1, gpu_ptr2 ,gpu_sum, N); - - //copy the array back to the CPU - HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) ); - - //free allocated memory - cudaFree(gpu_ptr1); - cudaFree(gpu_ptr2); - cudaFree(gpu_sum); - - } - - } -} - - - -#endif \ No newline at end of file diff --git a/stim/cuda/array_cart2polar.cuh b/stim/cuda/array_cart2polar.cuh deleted file mode 100644 index 8889b03..0000000 --- a/stim/cuda/array_cart2polar.cuh +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef STIM_CUDA_ARRAY_CART2POLAR_H -#define STIM_CUDA_ARRAY_CART2POLAR_H - -namespace stim{ - namespace cuda{ - template - __global__ void cuda_cart2polar(T* a, unsigned int N){ - - - //calculate the 1D index for this thread - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if(i < N){ - float x = a[i * 2 + 0]; - float y = a[i * 2 + 1]; - float theta = atan2( y, x ) ; - float r = sqrt(x * x + y * y); - a[i * 2 + 0] = theta; - a[i * 2 + 1] = r; - } - } - - - template - void gpu_cart2polar(T* gpuGrad, unsigned int N){ - - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); - - //calculate the number of blocks - int blocks = N / threads + (N % threads == 0 ? 0:1); - - //call the kernel to do the multiplication - cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N); - - } - - - template - void cpu_cart2polar(T* a, unsigned int N){ - - //calculate the number of bytes in the array - unsigned int bytes = N * sizeof(T) * 2; - - //allocate memory on the GPU for the array - T* gpuA; - HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); - - //copy the array to the GPU - HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); - - //call the GPU version of this function - gpu_cart2polar(gpuA, N); - - //copy the array back to the CPU - HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); - - //free allocated memory - cudaFree(gpuA); - - } - - } -} - -#endif \ No newline at end of file diff --git a/stim/cuda/array_multiply.cuh b/stim/cuda/array_multiply.cuh deleted file mode 100644 index add19ba..0000000 --- a/stim/cuda/array_multiply.cuh +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef STIM_CUDA_ARRAY_MULTIPLY_H -#define STIM_CUDA_ARRAY_MULTIPLY_H - -#include -#include -#include -#include - -namespace stim{ - namespace cuda{ - - template - __global__ void cuda_multiply(T* lhs, T rhs, unsigned int N){ - - //calculate the 1D index for this thread - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if(i < N) - lhs[i] *= rhs; - } - - template - void gpu_multiply(T* lhs, T rhs, unsigned int N){ - - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); - - //calculate the number of blocks - int blocks = N / threads + (N%threads == 0 ? 
0:1); - - //call the kernel to do the multiplication - cuda_multiply <<< blocks, threads >>>(lhs, rhs, N); - - } - - template - void cpu_multiply(T* lhs, T rhs, unsigned int N){ - - //calculate the number of bytes in the array - unsigned int bytes = N * sizeof(T); - - //allocate memory on the GPU for the array - T* gpuLHS; - HANDLE_ERROR( cudaMalloc(&gpuLHS, bytes) ); - - //copy the array to the GPU - HANDLE_ERROR( cudaMemcpy(gpuLHS, lhs, bytes, cudaMemcpyHostToDevice) ); - - //call the GPU version of this function - gpu_multiply(gpuLHS, rhs, N); - - //copy the array back to the CPU - HANDLE_ERROR( cudaMemcpy(lhs, gpuLHS, bytes, cudaMemcpyDeviceToHost) ); - - //free allocated memory - cudaFree(gpuLHS); - } - - } -} - - - -#endif \ No newline at end of file diff --git a/stim/cuda/arraymath.cuh b/stim/cuda/arraymath.cuh index 16a36b8..43d0154 100644 --- a/stim/cuda/arraymath.cuh +++ b/stim/cuda/arraymath.cuh @@ -1,12 +1,10 @@ #ifndef STIM_CUDA_ARRAYMATH_H #define STIM_CUDA_ARRAYMATH_H -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include namespace stim{ namespace cuda{ diff --git a/stim/cuda/arraymath/array_abs.cuh b/stim/cuda/arraymath/array_abs.cuh new file mode 100644 index 0000000..fc4d176 --- /dev/null +++ b/stim/cuda/arraymath/array_abs.cuh @@ -0,0 +1,59 @@ +#ifndef STIM_CUDA_ARRAY_ABS_H +#define STIM_CUDA_ARRAY_ABS_H + +namespace stim{ + namespace cuda{ + template + __global__ void cuda_abs(T* a, unsigned int N){ + + //calculate the 1D index for this thread + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if(i < N) + a[i] = abs(a[i]); + } + + + template + void gpu_abs(T* a, unsigned int N){ + + //get the maximum number of threads per block for the CUDA device + int threads = stim::maxThreadsPerBlock(); + + //calculate the number of blocks + int blocks = N / threads + (N%threads == 0 ? 0:1); + + //call the kernel to do the multiplication + cuda_abs <<< blocks, threads >>>(a, N); + + } + + + template + void cpu_abs(T* a, unsigned int N){ + + //calculate the number of bytes in the array + unsigned int bytes = N * sizeof(T); + + //allocate memory on the GPU for the array + T* gpuA; + HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); + + //copy the array to the GPU + HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); + + //call the GPU version of this function + gpu_abs(gpuA, N); + + //copy the array back to the CPU + HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); + + //free allocated memory + cudaFree(gpuA); + + } + + } //end namespace cuda +} //end namespace stim + +#endif \ No newline at end of file diff --git a/stim/cuda/arraymath/array_add.cuh b/stim/cuda/arraymath/array_add.cuh new file mode 100644 index 0000000..a05353c --- /dev/null +++ b/stim/cuda/arraymath/array_add.cuh @@ -0,0 +1,71 @@ +#ifndef STIM_CUDA_ARRAY_ADD_H +#define STIM_CUDA_ARRAY_ADD_H + +#include +#include +#include +#include + +namespace stim{ + namespace cuda{ + + template + __global__ void cuda_add(T* ptr1, T* ptr2, T* sum, unsigned int N){ + + //calculate the 1D index for this thread + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(idx < N){ + sum[idx] = ptr1[idx] + ptr2[idx]; + } + + } + + template + void gpu_add(T* ptr1, T* ptr2, T* sum, unsigned int N){ + + //get the maximum number of threads per block for the CUDA device + int threads = stim::maxThreadsPerBlock(); + + //calculate the number of blocks + int blocks = N / threads + (N%threads == 0 ? 
0:1); + + //call the kernel to do the multiplication + cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N); + + } + + template + void cpu_add(T* ptr1, T* ptr2, T* cpu_sum, unsigned int N){ + + //allocate memory on the GPU for the array + T* gpu_ptr1; + T* gpu_ptr2; + T* gpu_sum; + HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) ); + HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) ); + HANDLE_ERROR( cudaMalloc( &gpu_sum, N * sizeof(T) ) ); + + //copy the array to the GPU + HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) ); + HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) ); + + //call the GPU version of this function + gpu_add(gpu_ptr1, gpu_ptr2 ,gpu_sum, N); + + //copy the array back to the CPU + HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) ); + + //free allocated memory + cudaFree(gpu_ptr1); + cudaFree(gpu_ptr2); + cudaFree(gpu_sum); + + } + + } +} + + + +#endif \ No newline at end of file diff --git a/stim/cuda/arraymath/array_cart2polar.cuh b/stim/cuda/arraymath/array_cart2polar.cuh new file mode 100644 index 0000000..8889b03 --- /dev/null +++ b/stim/cuda/arraymath/array_cart2polar.cuh @@ -0,0 +1,66 @@ +#ifndef STIM_CUDA_ARRAY_CART2POLAR_H +#define STIM_CUDA_ARRAY_CART2POLAR_H + +namespace stim{ + namespace cuda{ + template + __global__ void cuda_cart2polar(T* a, unsigned int N){ + + + //calculate the 1D index for this thread + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if(i < N){ + float x = a[i * 2 + 0]; + float y = a[i * 2 + 1]; + float theta = atan2( y, x ) ; + float r = sqrt(x * x + y * y); + a[i * 2 + 0] = theta; + a[i * 2 + 1] = r; + } + } + + + template + void gpu_cart2polar(T* gpuGrad, unsigned int N){ + + //get the maximum number of threads per block for the CUDA device + int threads = stim::maxThreadsPerBlock(); + + //calculate the number of blocks + int blocks = N / threads + (N % threads == 0 ? 0:1); + + //call the kernel to do the multiplication + cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N); + + } + + + template + void cpu_cart2polar(T* a, unsigned int N){ + + //calculate the number of bytes in the array + unsigned int bytes = N * sizeof(T) * 2; + + //allocate memory on the GPU for the array + T* gpuA; + HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); + + //copy the array to the GPU + HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); + + //call the GPU version of this function + gpu_cart2polar(gpuA, N); + + //copy the array back to the CPU + HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); + + //free allocated memory + cudaFree(gpuA); + + } + + } +} + +#endif \ No newline at end of file diff --git a/stim/cuda/arraymath/array_multiply.cuh b/stim/cuda/arraymath/array_multiply.cuh new file mode 100644 index 0000000..add19ba --- /dev/null +++ b/stim/cuda/arraymath/array_multiply.cuh @@ -0,0 +1,64 @@ +#ifndef STIM_CUDA_ARRAY_MULTIPLY_H +#define STIM_CUDA_ARRAY_MULTIPLY_H + +#include +#include +#include +#include + +namespace stim{ + namespace cuda{ + + template + __global__ void cuda_multiply(T* lhs, T rhs, unsigned int N){ + + //calculate the 1D index for this thread + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if(i < N) + lhs[i] *= rhs; + } + + template + void gpu_multiply(T* lhs, T rhs, unsigned int N){ + + //get the maximum number of threads per block for the CUDA device + int threads = stim::maxThreadsPerBlock(); + + //calculate the number of blocks + int blocks = N / threads + (N%threads == 0 ? 
0:1); + + //call the kernel to do the multiplication + cuda_multiply <<< blocks, threads >>>(lhs, rhs, N); + + } + + template + void cpu_multiply(T* lhs, T rhs, unsigned int N){ + + //calculate the number of bytes in the array + unsigned int bytes = N * sizeof(T); + + //allocate memory on the GPU for the array + T* gpuLHS; + HANDLE_ERROR( cudaMalloc(&gpuLHS, bytes) ); + + //copy the array to the GPU + HANDLE_ERROR( cudaMemcpy(gpuLHS, lhs, bytes, cudaMemcpyHostToDevice) ); + + //call the GPU version of this function + gpu_multiply(gpuLHS, rhs, N); + + //copy the array back to the CPU + HANDLE_ERROR( cudaMemcpy(lhs, gpuLHS, bytes, cudaMemcpyDeviceToHost) ); + + //free allocated memory + cudaFree(gpuLHS); + } + + } +} + + + +#endif \ No newline at end of file diff --git a/stim/cuda/callable.h b/stim/cuda/callable.h deleted file mode 100644 index eefc437..0000000 --- a/stim/cuda/callable.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef CUDA_CALLABLE - -//define the CUDA_CALLABLE macro (will prefix all members) -#ifdef __CUDACC__ -#define CUDA_CALLABLE __host__ __device__ -#else -#define CUDA_CALLABLE -#endif - -#endif diff --git a/stim/cuda/conv2.cuh b/stim/cuda/conv2.cuh deleted file mode 100644 index a40fdd0..0000000 --- a/stim/cuda/conv2.cuh +++ /dev/null @@ -1,147 +0,0 @@ -#ifndef STIM_CUDA_CONV2_H -#define STIM_CUDA_CONV2_H - -#include -#include -#include -#include -#include -#include - -namespace stim{ - namespace cuda{ - - template - //__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){ - __global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){ - - - //the radius of mask - int r = (M - 1)/2; - - - //calculate the 1D index for this thread - //int idx = blockIdx.x * blockDim.x + threadIdx.x; - - //change 1D index to 2D cordinates - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y; - - int idx = j * w + i; - //unsigned long N = w * h; - - if(i < w && j < h){ - - //copy[idx] = tex2D(texObj, i+100, j+100); - //return; - - //tex2D(texObj, i, j); - - //allocate memory for result - T sum = 0; - - //for (unsigned int y = max(j - r, 0); y <= min(j + r, h - 1); y++){ - - //for (unsigned int x = max(i - r, 0); x <= min(i + r, w - 1); x++){ - - for (int y = j - r; y <= j + r; y++){ - - for (int x = i - r; x <= i + r; x++){ - - //idx to mask cordinates(xx, yy) - int xx = x - (i - r); - int yy = y - (j - r); - - //T temp = img[y * w + x] * mask[yy * M + xx]; - //sum += img[y * w + x] * mask[yy * M + xx]; - sum += tex2D(texObj, x, y) * 1.0;//mask[yy * M + xx]; - } - } - copy[idx] = sum; - } - } - - - template - void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){ - - unsigned long N = w * h; - - // Allocate CUDA array in device memory - - //define a channel descriptor for a single 32-bit channel - cudaChannelFormatDesc channelDesc = - cudaCreateChannelDesc(32, 0, 0, 0, - cudaChannelFormatKindFloat); - cudaArray* cuArray; //declare the cuda array - cudaMallocArray(&cuArray, &channelDesc, w, h); //allocate the cuda array - - // Copy the image data from global memory to the array - cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), - cudaMemcpyDeviceToDevice); - - // Specify texture - struct cudaResourceDesc resDesc; //create a resource descriptor - memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero - resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type - resDesc.res.array.array = 
cuArray; //add a pointer to the cuda array - - // Specify texture object parameters - struct cudaTextureDesc texDesc; //create a texture descriptor - memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero - texDesc.addressMode[0] = cudaAddressModeMirror; //use wrapping (around the edges) - texDesc.addressMode[1] = cudaAddressModeMirror; - texDesc.filterMode = cudaFilterModePoint; //use linear filtering - texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) - texDesc.normalizedCoords = 0; //not using normalized coordinates - - // Create texture object - cudaTextureObject_t texObj = 0; - cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); - - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); - - //calculate the number of blocks - dim3 blocks(w / threads + 1, h); - - //call the kernel to do the multiplication - //cuda_conv2 <<< blocks, threads >>>(img, mask, copy, w, h, M); - cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M); - - } - - template - void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){ - - unsigned long N = w * h; - //allocate memory on the GPU for the array - T* gpu_img; - T* gpu_mask; - T* gpu_copy; - HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) ); - HANDLE_ERROR( cudaMalloc( &gpu_mask, pow(M, 2) * sizeof(T) ) ); - HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) ); - - //copy the array to the GPU - HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) ); - HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, pow(M, 2) * sizeof(T), cudaMemcpyHostToDevice) ); - - //call the GPU version of this function - gpu_conv2(gpu_img, gpu_mask ,gpu_copy, w, h, M); - - //copy the array back to the CPU - HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) ); - - //free allocated memory - cudaFree(gpu_img); - cudaFree(gpu_mask); - cudaFree(gpu_copy); - - } - - } -} - - -#endif \ No newline at end of file diff --git a/stim/cuda/cudatools.h b/stim/cuda/cudatools.h new file mode 100644 index 0000000..1f1a9e6 --- /dev/null +++ b/stim/cuda/cudatools.h @@ -0,0 +1,5 @@ +#include +#include +#include +#include +#include \ No newline at end of file diff --git a/stim/cuda/cudatools/callable.h b/stim/cuda/cudatools/callable.h new file mode 100644 index 0000000..eefc437 --- /dev/null +++ b/stim/cuda/cudatools/callable.h @@ -0,0 +1,10 @@ +#ifndef CUDA_CALLABLE + +//define the CUDA_CALLABLE macro (will prefix all members) +#ifdef __CUDACC__ +#define CUDA_CALLABLE __host__ __device__ +#else +#define CUDA_CALLABLE +#endif + +#endif diff --git a/stim/cuda/cudatools/devices.h b/stim/cuda/cudatools/devices.h new file mode 100644 index 0000000..4dce378 --- /dev/null +++ b/stim/cuda/cudatools/devices.h @@ -0,0 +1,18 @@ +#ifndef RTS_CUDA_DEVICES +#define RTS_CUDA_DEVICES + +#include + +namespace stim{ + +int maxThreadsPerBlock() +{ + int device; + cudaGetDevice(&device); //get the id of the current device + cudaDeviceProp props; //device property structure + cudaGetDeviceProperties(&props, device); + return props.maxThreadsPerBlock; +} +} //end namespace rts + +#endif diff --git a/stim/cuda/cudatools/error.h b/stim/cuda/cudatools/error.h new file mode 100644 index 0000000..b49cf4b --- /dev/null +++ b/stim/cuda/cudatools/error.h @@ -0,0 +1,48 @@ +#include +#include +using namespace std; +#include "cuda_runtime.h" +#include "device_launch_parameters.h" 
+#include "cufft.h"
+
+#ifndef CUDA_HANDLE_ERROR_H
+#define CUDA_HANDLE_ERROR_H
+
+//handle error macro
+static void HandleError( cudaError_t err, const char *file, int line ) {
+	if (err != cudaSuccess) {
+		//FILE* outfile = fopen("cudaErrorLog.txt", "w");
+		//fprintf(outfile, "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
+		//fclose(outfile);
+		printf("%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
+		//exit( EXIT_FAILURE );
+
+	}
+}
+#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
+
+static void CufftError( cufftResult err )
+{
+	if (err != CUFFT_SUCCESS)
+	{
+		if(err == CUFFT_INVALID_PLAN)
+			cout<<"The plan parameter is not a valid handle."<<endl;
+		else if(err == CUFFT_ALLOC_FAILED)
+			cout<<"CUFFT failed to allocate GPU or CPU memory."<<endl;
+		else if(err == CUFFT_INVALID_VALUE)
+			cout<<"User specified an invalid pointer or parameter."<<endl;
+		else if(err == CUFFT_INTERNAL_ERROR)
+			cout<<"Driver or internal CUFFT library error."<<endl;
+		else if(err == CUFFT_EXEC_FAILED)
+			cout<<"CUFFT failed to execute an FFT on the GPU."<<endl;
+		else if(err == CUFFT_SETUP_FAILED)
+			cout<<"The CUFFT library failed to initialize."<<endl;
+		else if(err == CUFFT_INVALID_SIZE)
+			cout<<"User specified an invalid transform size."<<endl;
+		else
+			cout<<"Unknown CUFFT error."<<endl;
+	}
+}
+
+#endif
diff --git a/stim/cuda/cudatools/glbind.h b/stim/cuda/cudatools/glbind.h
new file mode 100644
index 0000000..bd841cc
--- /dev/null
+++ b/stim/cuda/cudatools/glbind.h
@@ -0,0 +1,79 @@
+#ifndef RTS_GL_BIND_H
+#define RTS_GL_BIND_H
+
+#include <GL/glew.h>
+#include <GL/gl.h>
+
+#include <stdio.h>
+#include <string.h>
+
+//#include
+#include "cuda_gl_interop.h"
+#include "../../gl/error.h"
+
+namespace stim
+{
+
+static void InitGLEW()
+{
+	//Initialize the GLEW toolkit
+
+	GLenum err = glewInit();
+	if(GLEW_OK != err)
+	{
+		printf("Error starting GLEW.");
+	}
+	fprintf(stdout, "Status: Using GLEW %s\n", glewGetString(GLEW_VERSION));
+}
+
+static void cudaSetDevice(int major = 1, int minor = 3)
+{
+	cudaDeviceProp prop;
+	int dev;
+
+	//find a CUDA device that can handle an offscreen buffer
+	int num_gpu;
+	HANDLE_ERROR(cudaGetDeviceCount(&num_gpu));
+	printf("Number of CUDA devices detected: %d\n", num_gpu);
+	memset(&prop, 0, sizeof(cudaDeviceProp));
+	prop.major=major;
+	prop.minor=minor;
+	HANDLE_ERROR(cudaChooseDevice(&dev, &prop));
+	HANDLE_ERROR(cudaGetDeviceProperties(&prop, dev));
+	HANDLE_ERROR(cudaGLSetGLDevice(dev));
+}
+
+static void* cudaMapResource(cudaGraphicsResource* cudaBufferResource)
+{
+	//this function takes a predefined CUDA resource and maps it to a pointer
+	void* buffer;
+	HANDLE_ERROR(cudaGraphicsMapResources(1, &cudaBufferResource, NULL));
+	size_t size;
+	HANDLE_ERROR(cudaGraphicsResourceGetMappedPointer( (void**)&buffer, &size, cudaBufferResource));
+	return buffer;
+}
+static void cudaUnmapResource(cudaGraphicsResource* resource)
+{
+	//this function unmaps the CUDA resource so it can be used by OpenGL
+	HANDLE_ERROR(cudaGraphicsUnmapResources(1, &resource, NULL));
+}
+
+static void cudaCreateRenderBuffer(GLuint &glBufferName, cudaGraphicsResource* &cudaBufferResource, int resX, int resY)
+{
+	//delete the previous buffer name and resource
+	if(cudaBufferResource != 0)
+		HANDLE_ERROR(cudaGraphicsUnregisterResource(cudaBufferResource));
+	if(glBufferName != 0)
+		glDeleteBuffers(1, &glBufferName);
+
+	//generate an OpenGL offscreen buffer
+	glGenBuffers(1, &glBufferName);
+
+	//bind the buffer - directs all calls to this buffer
+	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, glBufferName);
+	glBufferData(GL_PIXEL_UNPACK_BUFFER, resX * resY * sizeof(uchar3), NULL, GL_DYNAMIC_DRAW_ARB);
+	CHECK_OPENGL_ERROR
+	HANDLE_ERROR(cudaGraphicsGLRegisterBuffer(&cudaBufferResource, glBufferName, cudaGraphicsMapFlagsNone));
+}
+}
+#endif
diff --git a/stim/cuda/cudatools/threads.h b/stim/cuda/cudatools/threads.h
new file mode 100644
index 0000000..0315082
--- /dev/null
+++ b/stim/cuda/cudatools/threads.h
@@ -0,0 +1,34 @@
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include <stim/cuda/cudatools/callable.h>
+
+#ifndef CUDA_THREADS_H
+#define CUDA_THREADS_H
+
+#define MAX_GRID 65535
+
+__device__ unsigned int ThreadIndex1D()
+{
+	return blockIdx.y * gridDim.x * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+}
+
+dim3 GenGrid1D(unsigned int N, unsigned int blocksize = 128)
+{
+	dim3 dimgrid;
+
+	dimgrid.x = (N + blocksize - 1)/blocksize;
+	dimgrid.y = 1;
+	dimgrid.z = 1;
+
+
if(dimgrid.x > MAX_GRID) + { + dimgrid.y = (dimgrid.x + MAX_GRID - 1) / MAX_GRID; + dimgrid.x = MAX_GRID; + } + + return dimgrid; + +} + + +#endif diff --git a/stim/cuda/cudatools/timer.h b/stim/cuda/cudatools/timer.h new file mode 100644 index 0000000..27d6352 --- /dev/null +++ b/stim/cuda/cudatools/timer.h @@ -0,0 +1,31 @@ +#ifndef STIM_CUDA_TIMER +#define STIM_CUDA_TIMER + +static cudaEvent_t tStartEvent; +static cudaEvent_t tStopEvent; + +namespace stim{ + +/// These functions calculate the time between GPU functions in milliseconds +static void gpuStartTimer() +{ + //set up timing events + cudaEventCreate(&tStartEvent); + cudaEventCreate(&tStopEvent); + cudaEventRecord(tStartEvent, 0); +} + +static float gpuStopTimer() +{ + cudaEventRecord(tStopEvent, 0); + cudaEventSynchronize(tStopEvent); + float elapsedTime; + cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent); + cudaEventDestroy(tStartEvent); + cudaEventDestroy(tStopEvent); + return elapsedTime; +} + +} //end namespace stim + +#endif \ No newline at end of file diff --git a/stim/cuda/devices.h b/stim/cuda/devices.h deleted file mode 100644 index 4dce378..0000000 --- a/stim/cuda/devices.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef RTS_CUDA_DEVICES -#define RTS_CUDA_DEVICES - -#include - -namespace stim{ - -int maxThreadsPerBlock() -{ - int device; - cudaGetDevice(&device); //get the id of the current device - cudaDeviceProp props; //device property structure - cudaGetDeviceProperties(&props, device); - return props.maxThreadsPerBlock; -} -} //end namespace rts - -#endif diff --git a/stim/cuda/down_sample.cuh b/stim/cuda/down_sample.cuh index 73081e3..ecd19c9 100644 --- a/stim/cuda/down_sample.cuh +++ b/stim/cuda/down_sample.cuh @@ -89,7 +89,7 @@ namespace stim{ gpu_down_sample(gpuI, gpuI0, resize, x, y); //copy the image data to the GPU - cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice); + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice); cudaFree(gpuI0); cudeFree(gpuI); diff --git a/stim/cuda/error.h b/stim/cuda/error.h deleted file mode 100644 index b49cf4b..0000000 --- a/stim/cuda/error.h +++ /dev/null @@ -1,48 +0,0 @@ -#include -#include -using namespace std; -#include "cuda_runtime.h" -#include "device_launch_parameters.h" -#include "cufft.h" - -#ifndef CUDA_HANDLE_ERROR_H -#define CUDA_HANDLE_ERROR_H - -//handle error macro -static void HandleError( cudaError_t err, const char *file, int line ) { - if (err != cudaSuccess) { - //FILE* outfile = fopen("cudaErrorLog.txt", "w"); - //fprintf(outfile, "%s in %s at line %d\n", cudaGetErrorString( err ), file, line ); - //fclose(outfile); - printf("%s in %s at line %d\n", cudaGetErrorString( err ), file, line ); - //exit( EXIT_FAILURE ); - - } -} -#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) - -static void CufftError( cufftResult err ) -{ - if (err != CUFFT_SUCCESS) - { - if(err == CUFFT_INVALID_PLAN) - cout<<"The plan parameter is not a valid handle."< #include -#include -#include +#include #include +#include //GPU-based separable convolution algorithm #define pi 3.14159 @@ -13,228 +13,74 @@ namespace stim{ namespace cuda{ template - __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ + void gen_gaussian(T* out, T sigma, unsigned int width){ - //generate a pointer to shared memory (size will be specified as a kernel parameter) - extern __shared__ T s[]; + //fill the kernel with a gaussian + for(unsigned int xi = 0; xi < width; xi++){ - int kr = sigma * 4; //calculate the kernel 
radius - - //get a pointer to the gaussian in memory - T* g = (T*)&s[blockDim.x + 2 * kr]; - - //calculate the start point for this block - int bxi = blockIdx.x * blockDim.x; - int byi = blockIdx.y; - - //copy the portion of the image necessary for this block to shared memory - stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); - - //calculate the thread index and block index - int ti = threadIdx.x; - - //calculate the spatial coordinate for this thread - int xi = bxi + ti; - - //pre-compute the gaussian values for each kernel point - T a = 1.0 / (sigma * sqrt(2 * pi)); - T c = - 1.0 / (2*sigma*sigma); - int ki; - - //use the first 2kr+1 threads to evaluate a gaussian and store the result - if(ti <= 2* kr+1){ - ki = ti - kr; - g[ti] = a * exp((ki*ki) * c); + float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian + float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) ); + out[xi] = g; } - //make sure that all writing to shared memory is done before continuing - __syncthreads(); - - //if the current pixel is outside of the image - if(bxi + ti > x || byi > y) - return; - - - - //calculate the coordinates of the current thread in shared memory - int si = ti + kr; - - T sum = 0; //running weighted sum across the kernel - - - //for each element of the kernel - for(int ki = -kr; ki <= kr; ki++){ - sum += g[ki + kr] * s[si + ki]; - } - - //calculate the 1D image index for this thread - unsigned int i = byi * x + xi; - - //output the result to global memory - out[i] = sum; } template - __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ - - //generate a pointer to shared memory (size will be specified as a kernel parameter) - extern __shared__ T s[]; - - int kr = sigma * 4; //calculate the kernel radius + void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){ - //get a pointer to the gaussian in memory - T* g = (T*)&s[blockDim.y + 2 * kr]; + //allocate space for the kernel + unsigned int kwidth = sigma * 8 + 1; + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); - //calculate the start point for this block - int bxi = blockIdx.x; - int byi = blockIdx.y * blockDim.y; + //fill the kernel with a gaussian + gen_gaussian(kernel0, sigma, kwidth); - //copy the portion of the image necessary for this block to shared memory - stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); + //copy the kernel to the GPU + T* gpuKernel0; + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); - //calculate the thread index and block index - int ti = threadIdx.y; + //perform the gaussian blur as a separable convolution + stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth); - //calculate the spatial coordinate for this thread - int yi = byi + ti; + HANDLE_ERROR(cudaFree(gpuKernel0)); - //pre-compute the gaussian values for each kernel point - T a = 1.0 / (sigma * sqrt(2 * pi)); - T c = - 1.0 / (2*sigma*sigma); - int ki; - - //use the first 2kr+1 threads to evaluate a gaussian and store the result - if(ti <= 2* kr+1){ - ki = ti - kr; - g[ti] = a * exp((ki*ki) * c); - } - - //make sure that all writing to shared memory is done before continuing - __syncthreads(); - - //if the current pixel is outside of the image - if(bxi >= x || yi >= y) - return; - - - - //calculate the 
coordinates of the current thread in shared memory - int si = ti + kr; - - T sum = 0; //running weighted sum across the kernel - - - //for each element of the kernel - for(int ki = -kr; ki <= kr; ki++){ - sum += g[ki + kr] * s[si + ki]; - } - - //calculate the 1D image index for this thread - unsigned int i = yi * x + bxi; - - //output the result to global memory - out[i] = sum; } - /// Applies a Gaussian blur to a 2D image stored on the GPU template - void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ + void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ - //get the number of pixels in the image - unsigned int pixels = x * y; - unsigned int bytes = sizeof(T) * pixels; + //allocate space for the kernel + unsigned int kwidth = sigma * 8 + 1; + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); - // Allocate CUDA array in device memory - - //define a channel descriptor for a single 32-bit channel - cudaChannelFormatDesc channelDesc = - cudaCreateChannelDesc(32, 0, 0, 0, - cudaChannelFormatKindFloat); - cudaArray* cuArray; //declare the cuda array - cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array - - // Copy the image data from global memory to the array - cudaMemcpyToArray(cuArray, 0, 0, image, bytes, - cudaMemcpyDeviceToDevice); - - // Specify texture - struct cudaResourceDesc resDesc; //create a resource descriptor - memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero - resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type - resDesc.res.array.array = cuArray; //add a pointer to the cuda array - - // Specify texture object parameters - struct cudaTextureDesc texDesc; //create a texture descriptor - memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero - texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) - texDesc.addressMode[1] = cudaAddressModeWrap; - texDesc.filterMode = cudaFilterModePoint; //use linear filtering - texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) - texDesc.normalizedCoords = 0; //not using normalized coordinates - - // Create texture object - cudaTextureObject_t texObj = 0; - cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); + //fill the kernel with a gaussian + gen_gaussian(kernel0, sigma, kwidth); - - //get the maximum number of threads per block for the CUDA device - int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(max_threads, 1); - - //calculate the number of blocks - dim3 blocks(x / threads.x + 1, y); - - //calculate the shared memory used in the kernel - unsigned int pixel_bytes = max_threads * 4; //bytes devoted to pixel data being processed - unsigned int apron_bytes = sigma * 8 * 4; //bytes devoted to pixels outside the window - unsigned int gaussian_bytes = (sigma * 8 + 1) * 4; //bytes devoted to memory used to store the pre-computed Gaussian window - unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes shared memory used - - //blur the image along the x-axis - gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); - - // Copy the x-blurred data from global memory to the texture - cudaMemcpyToArray(cuArray, 0, 0, image, bytes, - cudaMemcpyDeviceToDevice); - - //transpose the block and thread dimensions - threads.x = 1; - threads.y = max_threads; - blocks.x = x; - blocks.y = y / threads.y + 1; - - //blur the image along the y-axis - 
gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); + //copy the kernel to the GPU + T* gpuKernel0; + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); + + //perform the gaussian blur as a separable convolution + stim::cuda::gpu_conv2sep(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth); - //free allocated memory - cudaFree(cuArray); + HANDLE_ERROR(cudaFree(gpuKernel0)); } /// Applies a Gaussian blur to a 2D image stored on the CPU template - void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ + void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ - //get the number of pixels in the image - unsigned int pixels = x * y; - unsigned int bytes = sizeof(T) * pixels; + //allocate space for the kernel + unsigned int kwidth = sigma * 8 + 1; + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); - //allocate space on the GPU - T* gpuI0; - cudaMalloc(&gpuI0, bytes); - - - //copy the image data to the GPU - cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); - - //run the GPU-based version of the algorithm - gpu_gaussian_blur_2d(gpuI0, sigma, x, y); + //fill the kernel with a gaussian + gen_gaussian(kernel0, sigma, kwidth); - //copy the image data from the device - cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); - - //free allocated memory - cudaFree(gpuI0); + //perform the gaussian blur as a separable convolution + stim::cuda::cpu_conv2sep(image, x, y, kernel0, kwidth, kernel0, kwidth); + } }; diff --git a/stim/cuda/glbind.h b/stim/cuda/glbind.h deleted file mode 100644 index bd841cc..0000000 --- a/stim/cuda/glbind.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef RTS_GL_BIND_H -#define RTS_GL_BIND_H - -#include -#include - -#include -#include - -//#include -#include "cuda_gl_interop.h" -#include "../gl/error.h" - -namespace stim -{ - -static void InitGLEW() -{ - //Initialize the GLEW toolkit - - GLenum err = glewInit(); - if(GLEW_OK != err) - { - printf("Error starting GLEW."); - } - fprintf(stdout, "Status: Using GLEW %s\n", glewGetString(GLEW_VERSION)); -} - -static void cudaSetDevice(int major = 1, int minor = 3) -{ - cudaDeviceProp prop; - int dev; - - //find a CUDA device that can handle an offscreen buffer - int num_gpu; - HANDLE_ERROR(cudaGetDeviceCount(&num_gpu)); - printf("Number of CUDA devices detected: %d\n", num_gpu); - memset(&prop, 0, sizeof(cudaDeviceProp)); - prop.major=major; - prop.minor=minor; - HANDLE_ERROR(cudaChooseDevice(&dev, &prop)); - HANDLE_ERROR(cudaGetDeviceProperties(&prop, dev)); - HANDLE_ERROR(cudaGLSetGLDevice(dev)); -} - -static void* cudaMapResource(cudaGraphicsResource* cudaBufferResource) -{ - //this function takes a predefined CUDA resource and maps it to a pointer - void* buffer; - HANDLE_ERROR(cudaGraphicsMapResources(1, &cudaBufferResource, NULL)); - size_t size; - HANDLE_ERROR(cudaGraphicsResourceGetMappedPointer( (void**)&buffer, &size, cudaBufferResource)); - return buffer; -} -static void cudaUnmapResource(cudaGraphicsResource* resource) -{ - //this function unmaps the CUDA resource so it can be used by OpenGL - HANDLE_ERROR(cudaGraphicsUnmapResources(1, &resource, NULL)); -} - -static void cudaCreateRenderBuffer(GLuint &glBufferName, cudaGraphicsResource* &cudaBufferResource, int resX, int resY) -{ - //delete the previous buffer name and resource - if(cudaBufferResource != 0) - HANDLE_ERROR(cudaGraphicsUnregisterResource(cudaBufferResource)); - if(glBufferName != 0) - glDeleteBuffers(1, &glBufferName); - - //generate 
an OpenGL offscreen buffer - glGenBuffers(1, &glBufferName); - - //bind the buffer - directs all calls to this buffer - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, glBufferName); - glBufferData(GL_PIXEL_UNPACK_BUFFER, resX * resY * sizeof(uchar3), NULL, GL_DYNAMIC_DRAW_ARB); - CHECK_OPENGL_ERROR - HANDLE_ERROR(cudaGraphicsGLRegisterBuffer(&cudaBufferResource, glBufferName, cudaGraphicsMapFlagsNone)); -} -} -#endif diff --git a/stim/cuda/gradient.cuh b/stim/cuda/gradient.cuh deleted file mode 100644 index 95931dc..0000000 --- a/stim/cuda/gradient.cuh +++ /dev/null @@ -1,115 +0,0 @@ -#ifndef STIM_CUDA_GRADIENT_H -#define STIM_CUDA_GRADIENT_H - -#include -#include -#include -#include - -namespace stim{ - namespace cuda{ - - template - __global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){ - - //calculate the 1D image index for this thread - int i = blockIdx.x * blockDim.x + threadIdx.x; - - //convert this to 2D pixel coordinates - int yi = i / x; - int xi = i - (yi * x); - - //return if the pixel is outside of the image - if(xi >= x || yi >= y) return; - - //calculate indices for the forward difference - int i_xp = yi * x + (xi + 1); - int i_yp = (yi + 1) * x + xi; - - //use forward differences if a coordinate is zero - if(xi == 0) - out[i * 2 + 0] = in[i_xp] - in[i]; - if(yi == 0) - out[i * 2 + 1] = in[i_yp] - in[i]; - - //calculate indices for the backward difference - int i_xn = yi * x + (xi - 1); - int i_yn = (yi - 1) * x + xi; - - //use backward differences if the coordinate is at the maximum edge - if(xi == x-1) - out[i * 2 + 0] = in[i] - in[i_xn]; - if(yi == y-1) - out[i * 2 + 1] = in[i] - in[i_yn]; - - //otherwise use central differences - if(xi > 0 && xi < x-1) - out[i * 2 + 0] = (in[i_xp] - in[i_xn]) / 2; - - if(yi > 0 && yi < y-1) - out[i * 2 + 1] = (in[i_yp] - in[i_yn]) / 2; - - } - - template - //void gpu_gradient_2d(T* gpuOut, T* gpuIn, unsigned int x, unsigned int y){ - void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ - - //get the number of pixels in the image - unsigned int pixels = x * y; - - //allocate space on the GPU for the input image - //T* gpuI; - //HANDLE_ERROR(cudaMalloc(&gpuI, bytes)); - - //cudaMemcpy(gpuI, gpuI0, bytes, cudaMemcpyDeviceToDevice); - - - //allocate space on the GPU for the output gradient image - //T* gpuGrad; - //cudaMalloc(&gpuGrad, bytes * 2); //the output image will have two channels (x, y) - - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); - - //calculate the number of blocks - int blocks = pixels / threads + (pixels%threads == 0 ? 
0:1); - - //call the GPU kernel to determine the gradient - gradient_2d <<< blocks, threads >>>(gpuGrad, gpuI, x, y); - - } - - template - void cpu_gradient_2d(T* out, T* in, unsigned int x, unsigned int y){ - - //get the number of pixels in the image - unsigned int pixels = x * y; - unsigned int bytes = pixels * sizeof(T); - - //allocate space on the GPU for the input image - T* gpuIn; - HANDLE_ERROR(cudaMalloc(&gpuIn, bytes)); - - //copy the image data to the GPU - HANDLE_ERROR(cudaMemcpy(gpuIn, in, bytes, cudaMemcpyHostToDevice)); - - //allocate space on the GPU for the output gradient image - T* gpuOut; - cudaMalloc(&gpuOut, bytes * 2); //the output image will have two channels (x, y) - - //call the GPU version of this function - gpu_gradient_2d(gpuOut, gpuIn, x, y); - - //copy the results to the CPU - cudaMemcpy(out, gpuOut, bytes * 2, cudaMemcpyDeviceToHost); - - //free allocated memory - cudaFree(gpuOut); - } - - } -} - - -#endif \ No newline at end of file diff --git a/stim/cuda/templates/conv2.cuh b/stim/cuda/templates/conv2.cuh new file mode 100644 index 0000000..a40fdd0 --- /dev/null +++ b/stim/cuda/templates/conv2.cuh @@ -0,0 +1,147 @@ +#ifndef STIM_CUDA_CONV2_H +#define STIM_CUDA_CONV2_H + +#include +#include +#include +#include +#include +#include + +namespace stim{ + namespace cuda{ + + template + //__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){ + __global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){ + + + //the radius of mask + int r = (M - 1)/2; + + + //calculate the 1D index for this thread + //int idx = blockIdx.x * blockDim.x + threadIdx.x; + + //change 1D index to 2D cordinates + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y; + + int idx = j * w + i; + //unsigned long N = w * h; + + if(i < w && j < h){ + + //copy[idx] = tex2D(texObj, i+100, j+100); + //return; + + //tex2D(texObj, i, j); + + //allocate memory for result + T sum = 0; + + //for (unsigned int y = max(j - r, 0); y <= min(j + r, h - 1); y++){ + + //for (unsigned int x = max(i - r, 0); x <= min(i + r, w - 1); x++){ + + for (int y = j - r; y <= j + r; y++){ + + for (int x = i - r; x <= i + r; x++){ + + //idx to mask cordinates(xx, yy) + int xx = x - (i - r); + int yy = y - (j - r); + + //T temp = img[y * w + x] * mask[yy * M + xx]; + //sum += img[y * w + x] * mask[yy * M + xx]; + sum += tex2D(texObj, x, y) * 1.0;//mask[yy * M + xx]; + } + } + copy[idx] = sum; + } + } + + + template + void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){ + + unsigned long N = w * h; + + // Allocate CUDA array in device memory + + //define a channel descriptor for a single 32-bit channel + cudaChannelFormatDesc channelDesc = + cudaCreateChannelDesc(32, 0, 0, 0, + cudaChannelFormatKindFloat); + cudaArray* cuArray; //declare the cuda array + cudaMallocArray(&cuArray, &channelDesc, w, h); //allocate the cuda array + + // Copy the image data from global memory to the array + cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), + cudaMemcpyDeviceToDevice); + + // Specify texture + struct cudaResourceDesc resDesc; //create a resource descriptor + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type + resDesc.res.array.array = cuArray; //add a pointer to the cuda array + + // Specify texture object parameters + struct cudaTextureDesc texDesc; 
//create a texture descriptor + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero + texDesc.addressMode[0] = cudaAddressModeMirror; //use wrapping (around the edges) + texDesc.addressMode[1] = cudaAddressModeMirror; + texDesc.filterMode = cudaFilterModePoint; //use linear filtering + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) + texDesc.normalizedCoords = 0; //not using normalized coordinates + + // Create texture object + cudaTextureObject_t texObj = 0; + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); + + //get the maximum number of threads per block for the CUDA device + int threads = stim::maxThreadsPerBlock(); + + //calculate the number of blocks + dim3 blocks(w / threads + 1, h); + + //call the kernel to do the multiplication + //cuda_conv2 <<< blocks, threads >>>(img, mask, copy, w, h, M); + cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M); + + } + + template + void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){ + + unsigned long N = w * h; + //allocate memory on the GPU for the array + T* gpu_img; + T* gpu_mask; + T* gpu_copy; + HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) ); + HANDLE_ERROR( cudaMalloc( &gpu_mask, pow(M, 2) * sizeof(T) ) ); + HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) ); + + //copy the array to the GPU + HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) ); + HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, pow(M, 2) * sizeof(T), cudaMemcpyHostToDevice) ); + + //call the GPU version of this function + gpu_conv2(gpu_img, gpu_mask ,gpu_copy, w, h, M); + + //copy the array back to the CPU + HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) ); + + //free allocated memory + cudaFree(gpu_img); + cudaFree(gpu_mask); + cudaFree(gpu_copy); + + } + + } +} + + +#endif \ No newline at end of file diff --git a/stim/cuda/templates/conv2sep.cuh b/stim/cuda/templates/conv2sep.cuh new file mode 100644 index 0000000..d63d4e0 --- /dev/null +++ b/stim/cuda/templates/conv2sep.cuh @@ -0,0 +1,260 @@ +#ifndef STIM_CUDA_CONV2SEP_H +#define STIM_CUDA_CONV2SEP_H + +#include +#include +#include +#include +#include +#include + +#define pi 3.14159 + +namespace stim{ + namespace cuda{ + + template + __global__ void conv2sep_0(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y, + T* kernel0, unsigned int k0){ + + //generate a pointer to shared memory (size will be specified as a kernel parameter) + extern __shared__ T s[]; + + int kr = k0/2; //calculate the kernel radius + + //get a pointer to the gaussian in memory + T* g = (T*)&s[blockDim.x + 2 * kr]; + + //calculate the start point for this block + int bxi = blockIdx.x * blockDim.x; + int byi = blockIdx.y; + + //copy the portion of the image necessary for this block to shared memory + stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); + + //calculate the thread index + int ti = threadIdx.x; + + //calculate the spatial coordinate for this thread + int xi = bxi + ti; + int yi = byi; + + + //use the first 2kr+1 threads to transfer the kernel to shared memory + if(ti < k0){ + g[ti] = kernel0[ti]; + } + + //make sure that all writing to shared memory is done before continuing + __syncthreads(); + + //if the current pixel is outside of the image + if(xi > x || yi > y) + return; + + + //calculate the coordinates of the current thread in shared memory + int si 
= ti + kr; + + T sum = 0; //running weighted sum across the kernel + + + //for each element of the kernel + for(int ki = -kr; ki <= kr; ki++){ + sum += g[ki + kr] * s[si + ki]; + } + + //calculate the 1D image index for this thread + unsigned int i = byi * x + xi; + + //output the result to global memory + out[i] = sum; + } + + template + __global__ void conv2sep_1(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y, + T* kernel0, unsigned int k0){ + + //generate a pointer to shared memory (size will be specified as a kernel parameter) + extern __shared__ T s[]; + + int kr = k0/2; //calculate the kernel radius + + //get a pointer to the gaussian in memory + T* g = (T*)&s[blockDim.y + 2 * kr]; + + //calculate the start point for this block + int bxi = blockIdx.x; + int byi = blockIdx.y * blockDim.y; + + //copy the portion of the image necessary for this block to shared memory + stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); + + //calculate the thread index + int ti = threadIdx.y; + + //calculate the spatial coordinate for this thread + int xi = bxi; + int yi = byi + ti; + + + //use the first 2kr+1 threads to transfer the kernel to shared memory + if(ti < k0){ + g[ti] = kernel0[ti]; + } + + //make sure that all writing to shared memory is done before continuing + __syncthreads(); + + //if the current pixel is outside of the image + if(xi > x || yi > y) + return; + + + //calculate the coordinates of the current thread in shared memory + int si = ti + kr; + + T sum = 0; //running weighted sum across the kernel + + + //for each element of the kernel + for(int ki = -kr; ki <= kr; ki++){ + sum += g[ki + kr] * s[si + ki]; + } + + //calculate the 1D image index for this thread + unsigned int i = yi * x + xi; + + //output the result to global memory + out[i] = sum; + } + + template + void tex_conv2sep(T* out, unsigned int x, unsigned int y, + cudaTextureObject_t texObj, cudaArray* cuArray, + T* kernel0, unsigned int k0, + T* kernel1, unsigned int k1){ + + //get the maximum number of threads per block for the CUDA device + int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(max_threads, 1); + + //calculate the number of blocks + dim3 blocks(x / threads.x + 1, y); + + //calculate the shared memory used in the kernel + unsigned int pixel_bytes = max_threads * sizeof(T); //bytes devoted to pixel data being processed + unsigned int apron_bytes = k0/2 * sizeof(T); //bytes devoted to the apron on each side of the window + unsigned int gaussian_bytes = k0 * sizeof(T); //bytes devoted to memory used to store the pre-computed Gaussian window + unsigned int shared_bytes = pixel_bytes + 2 * apron_bytes + gaussian_bytes; //total number of bytes shared memory used + + //blur the image along the x-axis + conv2sep_0 <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel0, k0); + + // Copy the x-blurred data from global memory to the texture + cudaMemcpyToArray(cuArray, 0, 0, out, x * y * sizeof(T), + cudaMemcpyDeviceToDevice); + + //transpose the block and thread dimensions + threads.x = 1; + threads.y = max_threads; + blocks.x = x; + blocks.y = y / threads.y + 1; + + //blur the image along the y-axis + conv2sep_1 <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel1, k1); + + } + + template + void gpu_conv2sep(T* image, unsigned int x, unsigned int y, + T* kernel0, unsigned int k0, + T* kernel1, unsigned int k1){ + + //get the number of pixels in the image + unsigned int pixels = x * y; + unsigned int bytes = sizeof(T) * 
pixels; + + // Allocate CUDA array in device memory + + //define a channel descriptor for a single 32-bit channel + cudaChannelFormatDesc channelDesc = + cudaCreateChannelDesc(32, 0, 0, 0, + cudaChannelFormatKindFloat); + cudaArray* cuArray; //declare the cuda array + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array + + // Copy the image data from global memory to the array + cudaMemcpyToArray(cuArray, 0, 0, image, bytes, + cudaMemcpyDeviceToDevice); + + // Specify texture + struct cudaResourceDesc resDesc; //create a resource descriptor + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type + resDesc.res.array.array = cuArray; //add a pointer to the cuda array + + // Specify texture object parameters + struct cudaTextureDesc texDesc; //create a texture descriptor + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) + texDesc.addressMode[1] = cudaAddressModeWrap; + texDesc.filterMode = cudaFilterModePoint; //use linear filtering + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) + texDesc.normalizedCoords = 0; //not using normalized coordinates + + // Create texture object + cudaTextureObject_t texObj = 0; + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); + + //call the texture version of the separable convolution function + tex_conv2sep(image, x, y, texObj, cuArray, kernel0, k0, kernel1, k1); + + //free allocated memory + cudaFree(cuArray); + + } + + /// Applies a Gaussian blur to a 2D image stored on the CPU + template + void cpu_conv2sep(T* image, unsigned int x, unsigned int y, + T* kernel0, unsigned int k0, + T* kernel1, unsigned int k1){ + + //get the number of pixels in the image + unsigned int pixels = x * y; + unsigned int bytes = sizeof(T) * pixels; + + //---------Allocate Image--------- + //allocate space on the GPU for the image + T* gpuI0; + HANDLE_ERROR(cudaMalloc(&gpuI0, bytes)); + + //copy the image data to the GPU + HANDLE_ERROR(cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice)); + + //---------Allocate Kernel-------- + //allocate and copy the 0 (x) kernel + T* gpuK0; + HANDLE_ERROR(cudaMalloc(&gpuK0, k0 * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(gpuK0, kernel0, k0 * sizeof(T), cudaMemcpyHostToDevice)); + + //allocate and copy the 1 (y) kernel + T* gpuK1; + HANDLE_ERROR(cudaMalloc(&gpuK1, k1 * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(gpuK1, kernel1, k1 * sizeof(T), cudaMemcpyHostToDevice)); + + //run the GPU-based version of the algorithm + gpu_conv2sep(gpuI0, x, y, gpuK0, k0, gpuK1, k1); + + //copy the image data from the device + cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); + + //free allocated memory + cudaFree(gpuI0); + } + + }; +}; + +#endif \ No newline at end of file diff --git a/stim/cuda/templates/gaussian_blur.cuh b/stim/cuda/templates/gaussian_blur.cuh new file mode 100644 index 0000000..0b060f0 --- /dev/null +++ b/stim/cuda/templates/gaussian_blur.cuh @@ -0,0 +1,89 @@ +#ifndef STIM_CUDA_GAUSSIAN_BLUR_H +#define STIM_CUDA_GAUSSIAN_BLUR_H + +#include +#include +#include +#include +#include //GPU-based separable convolution algorithm + +#define pi 3.14159 + +namespace stim{ + namespace cuda{ + + template + void gen_gaussian(T* out, T sigma, unsigned int width){ + + //fill the kernel with a gaussian + for(unsigned int xi = 0; xi < width; xi++){ + + float x = 
diff --git a/stim/cuda/templates/gaussian_blur.cuh b/stim/cuda/templates/gaussian_blur.cuh
new file mode 100644
index 0000000..0b060f0
--- /dev/null
+++ b/stim/cuda/templates/gaussian_blur.cuh
@@ -0,0 +1,89 @@
+#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
+#define STIM_CUDA_GAUSSIAN_BLUR_H
+
+#include
+#include
+#include
+#include
+#include <stim/cuda/templates/conv2sep.cuh>		//GPU-based separable convolution algorithm
+
+#define pi	3.14159
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		void gen_gaussian(T* out, T sigma, unsigned int width){
+
+			//fill the kernel with a gaussian
+			for(unsigned int xi = 0; xi < width; xi++){
+
+				float x = (float)xi - (float)(width/2);		//calculate the x position of the gaussian
+				float g = 1.0 / (sigma * sqrt(2 * pi)) * exp( - (x*x) / (2*sigma*sigma) );
+				out[xi] = g;
+			}
+
+		}
+
+		template<typename T>
+		void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){
+
+			//allocate space for the kernel
+			unsigned int kwidth = sigma * 8 + 1;
+			float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
+
+			//fill the kernel with a gaussian
+			gen_gaussian(kernel0, sigma, kwidth);
+
+			//allocate space for the kernel on the GPU and copy it there
+			T* gpuKernel0;
+			HANDLE_ERROR(cudaMalloc(&gpuKernel0, kwidth * sizeof(T)));
+			HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
+			free(kernel0);
+
+			//perform the gaussian blur as a separable convolution
+			stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth);
+
+			HANDLE_ERROR(cudaFree(gpuKernel0));
+
+		}
+
+		template<typename T>
+		void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
+
+			//allocate space for the kernel
+			unsigned int kwidth = sigma * 8 + 1;
+			float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
+
+			//fill the kernel with a gaussian
+			gen_gaussian(kernel0, sigma, kwidth);
+
+			//allocate space for the kernel on the GPU and copy it there
+			T* gpuKernel0;
+			HANDLE_ERROR(cudaMalloc(&gpuKernel0, kwidth * sizeof(T)));
+			HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
+			free(kernel0);
+
+			//perform the gaussian blur as a separable convolution
+			stim::cuda::gpu_conv2sep(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth);
+
+			HANDLE_ERROR(cudaFree(gpuKernel0));
+
+		}
+
+		/// Applies a Gaussian blur to a 2D image stored on the CPU
+		template<typename T>
+		void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
+
+			//allocate space for the kernel
+			unsigned int kwidth = sigma * 8 + 1;
+			float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
+
+			//fill the kernel with a gaussian
+			gen_gaussian(kernel0, sigma, kwidth);
+
+			//perform the gaussian blur as a separable convolution
+			stim::cuda::cpu_conv2sep(image, x, y, kernel0, kwidth, kernel0, kwidth);
+
+			free(kernel0);
+		}
+
+	}	//end namespace cuda
+}	//end namespace stim
+
+#endif
\ No newline at end of file
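A similar sketch for cpu_gaussian_blur2, blurring an impulse image in place (not part of the patch; the values are hypothetical, and note that the kernel width is computed internally as sigma * 8 + 1 taps):

#include <vector>
#include <stim/cuda/templates/gaussian_blur.cuh>

int main(){
	unsigned int w = 512, h = 512;					//hypothetical image dimensions
	std::vector<float> image(w * h, 0.0f);
	image[(h/2) * w + (w/2)] = 1.0f;				//unit impulse at the image center

	float sigma = 2.0f;								//kernel width = 2 * 8 + 1 = 17 taps
	stim::cuda::cpu_gaussian_blur2(image.data(), sigma, w, h);	//blur in place
	return 0;
}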
diff --git a/stim/cuda/templates/gradient.cuh b/stim/cuda/templates/gradient.cuh
new file mode 100644
index 0000000..95931dc
--- /dev/null
+++ b/stim/cuda/templates/gradient.cuh
@@ -0,0 +1,115 @@
+#ifndef STIM_CUDA_GRADIENT_H
+#define STIM_CUDA_GRADIENT_H
+
+#include
+#include
+#include
+#include
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){
+
+			//calculate the 1D image index for this thread
+			int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+			//convert this to 2D pixel coordinates
+			int yi = i / x;
+			int xi = i - (yi * x);
+
+			//return if the pixel is outside of the image
+			if(xi >= x || yi >= y) return;
+
+			//calculate indices for the forward difference
+			int i_xp = yi * x + (xi + 1);
+			int i_yp = (yi + 1) * x + xi;
+
+			//use forward differences if a coordinate is at the minimum (zero) edge
+			if(xi == 0)
+				out[i * 2 + 0] = in[i_xp] - in[i];
+			if(yi == 0)
+				out[i * 2 + 1] = in[i_yp] - in[i];
+
+			//calculate indices for the backward difference
+			int i_xn = yi * x + (xi - 1);
+			int i_yn = (yi - 1) * x + xi;
+
+			//use backward differences if a coordinate is at the maximum edge
+			if(xi == x-1)
+				out[i * 2 + 0] = in[i] - in[i_xn];
+			if(yi == y-1)
+				out[i * 2 + 1] = in[i] - in[i_yn];
+
+			//otherwise use central differences
+			if(xi > 0 && xi < x-1)
+				out[i * 2 + 0] = (in[i_xp] - in[i_xn]) / 2;
+
+			if(yi > 0 && yi < y-1)
+				out[i * 2 + 1] = (in[i_yp] - in[i_yn]) / 2;
+
+		}
+
+		template<typename T>
+		void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks
+			int blocks = pixels / threads + (pixels%threads == 0 ? 0:1);
+
+			//call the GPU kernel to determine the gradient
+			gradient_2d <<< blocks, threads >>>(gpuGrad, gpuI, x, y);
+
+		}
+
+		template<typename T>
+		void cpu_gradient_2d(T* out, T* in, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			unsigned int bytes = pixels * sizeof(T);
+
+			//allocate space on the GPU for the input image
+			T* gpuIn;
+			HANDLE_ERROR(cudaMalloc(&gpuIn, bytes));
+
+			//copy the image data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuIn, in, bytes, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the output gradient image
+			T* gpuOut;
+			HANDLE_ERROR(cudaMalloc(&gpuOut, bytes * 2));	//the output image will have two channels (x, y)
+
+			//call the GPU version of this function
+			gpu_gradient_2d(gpuOut, gpuIn, x, y);
+
+			//copy the results to the CPU
+			HANDLE_ERROR(cudaMemcpy(out, gpuOut, bytes * 2, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			cudaFree(gpuIn);
+			cudaFree(gpuOut);
+		}
+
+	}	//end namespace cuda
+}	//end namespace stim
+
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/threads.h b/stim/cuda/threads.h
deleted file mode 100644
index 87c443e..0000000
--- a/stim/cuda/threads.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-#include "../cuda/callable.h"
-
-#ifndef CUDA_THREADS_H
-#define CUDA_THREADS_H
-
-#define MAX_GRID 65535
-
-__device__ unsigned int ThreadIndex1D()
-{
-	return blockIdx.y * gridDim.x * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
-}
-
-dim3 GenGrid1D(unsigned int N, unsigned int blocksize = 128)
-{
-	dim3 dimgrid;
-
-	dimgrid.x = (N + blocksize - 1)/blocksize;
-	dimgrid.y = 1;
-	dimgrid.z = 1;
-
-	if(dimgrid.x > MAX_GRID)
-	{
-		dimgrid.y = (dimgrid.x + MAX_GRID - 1) / MAX_GRID;
-		dimgrid.x = MAX_GRID;
-	}
-
-	return dimgrid;
-
-}
-
-
-#endif
diff --git a/stim/cuda/timer.h b/stim/cuda/timer.h
deleted file mode 100644
index f7ce5e1..0000000
--- a/stim/cuda/timer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef STIM_CUDA_TIMER
-#define STIM_CUDA_TIMER
-
-static cudaEvent_t tStartEvent;
-static cudaEvent_t tStopEvent;
-
-namespace stim{
-
-/// These functions calculate the time between GPU functions in milliseconds
-static void gpuStartTimer()
-{
-	//set up timing events
-	cudaEventCreate(&tStartEvent);
-	cudaEventCreate(&tStopEvent);
-	cudaEventRecord(tStartEvent, 0);
-}
-
-static float gpuStopTimer()
-{
-	cudaEventRecord(tStopEvent, 0);
-	cudaEventSynchronize(tStopEvent);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);
-	cudaEventDestroy(tStartEvent);
-	cudaEventDestroy(tStopEvent);
-	return elapsedTime;
-}
-
-} //end namespace stim
-
-#endif
\ No newline at end of file
-- 
libgit2 0.21.4
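Finally, a usage sketch for the new gradient template (not part of the patch; values are hypothetical). The output buffer holds two interleaved channels per pixel, dI/dx followed by dI/dy, so it must be twice the size of the input image:

#include <vector>
#include <stim/cuda/templates/gradient.cuh>

int main(){
	unsigned int w = 256, h = 256;
	std::vector<float> image(w * h);
	for(unsigned int i = 0; i < w * h; i++)
		image[i] = (float)(i % w);					//horizontal ramp: dI/dx = 1, dI/dy = 0

	std::vector<float> grad(w * h * 2);				//interleaved output: [dx0, dy0, dx1, dy1, ...]
	stim::cuda::cpu_gradient_2d(grad.data(), image.data(), w, h);
	return 0;
}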