diff --git a/stim/math/filters/conv2.cuh b/stim/math/filters/conv2.cuh
new file mode 100644
index 0000000..48ea7f7
--- /dev/null
+++ b/stim/math/filters/conv2.cuh
@@ -0,0 +1,119 @@
+#ifndef STIM_CUDA_CONV2_H
+#define STIM_CUDA_CONV2_H
+//#define __CUDACC__
+
+#include
+#include
+
+namespace stim {
+	//Kernel function that performs the 2D convolution.
+	template<typename T, typename K>
+	__global__ void kernel_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		extern __shared__ T s[];	//declare a shared memory array
+		size_t xi = blockIdx.x * blockDim.x + threadIdx.x;	//threads correspond to indices into the output image
+		size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
+		size_t tid = threadIdx.y * blockDim.x + threadIdx.x;
+		size_t nt = blockDim.x * blockDim.y;
+
+		size_t cx = blockIdx.x * blockDim.x;	//find the upper left corner of the input region
+		size_t cy = blockIdx.y * blockDim.y;
+
+		size_t X = sx - kx + 1;	//calculate the size of the output image
+		size_t Y = sy - ky + 1;
+
+		if (cx >= X || cy >= Y) return;	//return if the entire block is outside the image
+		size_t smx = min(blockDim.x + kx - 1, sx - cx);	//size of the shared copy of the input image
+		size_t smy = min(blockDim.y + ky - 1, sy - cy);	//	min function is used to deal with boundary blocks
+		stim::cuda::threadedMemcpy2D(s, smx, smy, in, cx, cy, sx, sy, tid, nt);	//copy the input region to shared memory
+		__syncthreads();
+
+		if (xi >= X || yi >= Y) return;	//returns if the thread is outside of the output image
+
+		//loop through the kernel
+		size_t kxi, kyi;
+		K v = 0;
+		for (kyi = 0; kyi < ky; kyi++) {
+			for (kxi = 0; kxi < kx; kxi++) {
+				v += s[(threadIdx.y + kyi) * smx + threadIdx.x + kxi] * kernel[kyi * kx + kxi];
+				//v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
+			}
+		}
+		out[yi * X + xi] = (T)v;	//write the result to global memory
+
+	}
+
+	//Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
+	//@param out is a pointer to the output image, which is of size (sx - kx + 1) x (sy - ky + 1)
+	//@param in is a pointer to the input image
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void gpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		cudaDeviceProp p;
+		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
+		size_t tmax = p.maxThreadsPerBlock;
+		dim3 nt(sqrt(tmax), sqrt(tmax));	//calculate the block dimensions
+		size_t X = sx - kx + 1;	//calculate the size of the output image
+		size_t Y = sy - ky + 1;
+		dim3 nb(X / nt.x + 1, Y / nt.y + 1);	//calculate the grid dimensions
+		size_t sm = (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T);	//shared memory bytes required to store block data
+		if (sm > p.sharedMemPerBlock) {
+			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<<nb, nt, sm>>> (out, in, kernel, sx, sy, kx, ky);	//launch the kernel
+	}
+//#endif
+	//Performs a convolution of a 2D image. Only valid pixels based on the kernel are returned.
+	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void cpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		size_t X = sx - kx + 1;	//x size of the output image
+		size_t Y = sy - ky + 1;	//y size of the output image
+
+		//allocate memory and copy everything to the GPU
+		T* gpu_in;
+		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
+		K* gpu_kernel;
+		HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(K), cudaMemcpyHostToDevice));
+		T* gpu_out;
+		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
+		gpu_conv2(gpu_out, gpu_in, gpu_kernel, sx, sy, kx, ky);	//execute the GPU kernel
+		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
+		HANDLE_ERROR(cudaFree(gpu_in));
+		HANDLE_ERROR(cudaFree(gpu_kernel));
+		HANDLE_ERROR(cudaFree(gpu_out));
+/*	CPU CODE
+		K v;	//register stores the integral of the current pixel value
+		size_t yi, xi, kyi, kxi, yi_kyi_sx;
+		for (yi = 0; yi < Y; yi++) {	//for each pixel in the output image
+			for (xi = 0; xi < X; xi++) {
+				v = 0;
+				for (kyi = 0; kyi < ky; kyi++) {	//for each pixel in the kernel
+					yi_kyi_sx = (yi + kyi) * sx;
+					for (kxi = 0; kxi < kx; kxi++) {
+						v += in[yi_kyi_sx + xi + kxi] * kernel[kyi * kx + kxi];
+					}
+				}
+				out[yi * X + xi] = v;	//save the result to the output array
+			}
+		}
+
+	*/
+	}
+
+
+}
+
+
+#endif
\ No newline at end of file
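
cpu_conv2 above uses the valid-pixel convention, so the caller allocates the output at (sx - kx + 1) x (sy - ky + 1). The following host-side sketch is not part of the patch: the image size and the 3x3 box kernel are illustrative, and it assumes the translation unit is compiled with nvcc, since the new cpu_conv2 always routes through gpu_conv2.

#include <vector>
#include "stim/math/filters/conv2.cuh"	//header added by this patch; install path assumed

int main() {
	size_t sx = 512, sy = 512;	//input image size (example values)
	size_t kx = 3, ky = 3;	//kernel size
	size_t X = sx - kx + 1, Y = sy - ky + 1;	//valid-region output size
	std::vector<float> in(sx * sy, 1.0f);	//placeholder input image
	std::vector<float> k(kx * ky, 1.0f / 9.0f);	//3x3 box (averaging) kernel
	std::vector<float> out(X * Y);
	stim::cpu_conv2(out.data(), in.data(), k.data(), sx, sy, kx, ky);	//convolve on the GPU
	return 0;
}
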
diff --git a/stim/math/filters/conv2.h b/stim/math/filters/conv2.h
deleted file mode 100644
index 94e9f02..0000000
--- a/stim/math/filters/conv2.h
+++ /dev/null
@@ -1,123 +0,0 @@
-#ifndef STIM_CUDA_CONV2_H
-#define STIM_CUDA_CONV2_H
-//#define __CUDACC__
-
-#ifdef __CUDACC__
-#include
-#include
-#endif
-
-namespace stim {
-#ifdef __CUDACC__
-	//Kernel function that performs the 2D convolution.
-	template<typename T, typename K>
-	__global__ void kernel_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
-		extern __shared__ T s[];	//declare a shared memory array
-		size_t xi = blockIdx.x * blockDim.x + threadIdx.x;	//threads correspond to indices into the output image
-		size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
-		size_t tid = threadIdx.y * blockDim.x + threadIdx.x;
-		size_t nt = blockDim.x * blockDim.y;
-
-		size_t cx = blockIdx.x * blockDim.x;	//find the upper left corner of the input region
-		size_t cy = blockIdx.y * blockDim.y;
-
-		size_t X = sx - kx + 1;	//calculate the size of the output image
-		size_t Y = sy - ky + 1;
-
-		if (cx >= X || cy >= Y) return;	//return if the entire block is outside the image
-		size_t smx = min(blockDim.x + kx - 1, sx - cx);	//size of the shared copy of the input image
-		size_t smy = min(blockDim.y + ky - 1, sy - cy);	//	min function is used to deal with boundary blocks
-		stim::cuda::threadedMemcpy2D(s, smx, smy, in, cx, cy, sx, sy, tid, nt);	//copy the input region to shared memory
-		__syncthreads();
-
-		if (xi >= X || yi >= Y) return;	//returns if the thread is outside of the output image
-
-		//loop through the kernel
-		size_t kxi, kyi;
-		K v = 0;
-		for (kyi = 0; kyi < ky; kyi++) {
-			for (kxi = 0; kxi < kx; kxi++) {
-				v += s[(threadIdx.y + kyi) * smx + threadIdx.x + kxi] * kernel[kyi * kx + kxi];
-				//v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
-			}
-		}
-		out[yi * X + xi] = (T)v;	//write the result to global memory
-
-	}
-
-	//Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
-	//@param out is a pointer to the output image, which is of size (sx - kx + 1) x (sy - ky + 1)
-	//@param in is a pointer to the input image
-	//@param sx is the size of the input image along X
-	//@param sy is the size of the input image along Y
-	//@param kx is the size of the kernel along X
-	//@param ky is the size of the kernel along Y
-	template<typename T, typename K>
-	void gpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
-		cudaDeviceProp p;
-		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
-		size_t tmax = p.maxThreadsPerBlock;
-		dim3 nt(sqrt(tmax), sqrt(tmax));	//calculate the block dimensions
-		size_t X = sx - kx + 1;	//calculate the size of the output image
-		size_t Y = sy - ky + 1;
-		dim3 nb(X / nt.x + 1, Y / nt.y + 1);	//calculate the grid dimensions
-		size_t sm = (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T);	//shared memory bytes required to store block data
-		if (sm > p.sharedMemPerBlock) {
-			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
-			exit(1);
-		}
-		kernel_conv2 <<<nb, nt, sm>>> (out, in, kernel, sx, sy, kx, ky);	//launch the kernel
-	}
-#endif
-	//Performs a convolution of a 2D image. Only valid pixels based on the kernel are returned.
-	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
-	//@param out is a pointer to the output image
-	//@param in is a pointer to the input image
-	//@param sx is the size of the input image along X
-	//@param sy is the size of the input image along Y
-	//@param kx is the size of the kernel along X
-	//@param ky is the size of the kernel along Y
-	template<typename T, typename K>
-	void cpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
-		size_t X = sx - kx + 1;	//x size of the output image
-		size_t Y = sy - ky + 1;	//y size of the output image
-
-#ifdef __CUDACC__
-		//allocate memory and copy everything to the GPU
-		T* gpu_in;
-		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
-		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
-		K* gpu_kernel;
-		HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(K)));
-		HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(K), cudaMemcpyHostToDevice));
-		T* gpu_out;
-		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
-		gpu_conv2(gpu_out, gpu_in, gpu_kernel, sx, sy, kx, ky);	//execute the GPU kernel
-		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
-		HANDLE_ERROR(cudaFree(gpu_in));
-		HANDLE_ERROR(cudaFree(gpu_kernel));
-		HANDLE_ERROR(cudaFree(gpu_out));
-#else
-		K v;	//register stores the integral of the current pixel value
-		size_t yi, xi, kyi, kxi, yi_kyi_sx;
-		for (yi = 0; yi < Y; yi++) {	//for each pixel in the output image
-			for (xi = 0; xi < X; xi++) {
-				v = 0;
-				for (kyi = 0; kyi < ky; kyi++) {	//for each pixel in the kernel
-					yi_kyi_sx = (yi + kyi) * sx;
-					for (kxi = 0; kxi < kx; kxi++) {
-						v += in[yi_kyi_sx + xi + kxi] * kernel[kyi * kx + kxi];
-					}
-				}
-				out[yi * X + xi] = v;	//save the result to the output array
-			}
-		}
-
-#endif
-	}
-
-
-}
-
-
-#endif
\ No newline at end of file
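
gpu_conv2 sizes its shared-memory tile at (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T) bytes and calls exit(1) when that exceeds sharedMemPerBlock. One possible alternative is to shrink the block until the tile fits; the helper below is a sketch under that assumption (conv2_block_size is hypothetical and not part of the patch).

#include <cmath>
#include <cuda_runtime.h>

//Sketch: choose a square block whose conv2 shared-memory tile fits on the current device.
template<typename T>
dim3 conv2_block_size(size_t kx, size_t ky) {
	cudaDeviceProp p;
	cudaGetDeviceProperties(&p, 0);
	size_t b = (size_t)sqrt((double)p.maxThreadsPerBlock);	//start from the largest square block
	while (b > 1 && (b + kx - 1) * (b + ky - 1) * sizeof(T) > p.sharedMemPerBlock)
		b /= 2;	//halve the block until the (b+kx-1) x (b+ky-1) tile fits in shared memory
	return dim3((unsigned)b, (unsigned)b);
}
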
diff --git a/stim/math/filters/gauss2.cuh b/stim/math/filters/gauss2.cuh
new file mode 100644
index 0000000..b27cd07
--- /dev/null
+++ b/stim/math/filters/gauss2.cuh
@@ -0,0 +1,47 @@
+#ifndef STIM_CUDA_GAUSS2_H
+#define STIM_CUDA_GAUSS2_H
+
+#include
+#include
+#include
+
+namespace stim {
+
+	template<typename T>
+	T gauss1d(T x, T mu, T std) {
+		return ((T)1 / (T)sqrt(stim::TAU * std * std) * exp(-(x - mu)*(x - mu) / (2 * std*std)));
+	}
+	///Perform a 2D gaussian convolution on an input image.
+	///@param in is a pointer to the input image
+	///@param stdx is the standard deviation (in pixels) along the x axis
+	///@param stdy is the standard deviation (in pixels) along the y axis
+	///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel
+	template<typename T, typename K>
+	stim::image<T> cpu_gauss2(const stim::image<T>& in, K stdx, K stdy, size_t nstds = 3) {
+		size_t kx = stdx * nstds * 2;	//calculate the kernel sizes
+		size_t ky = stdy * nstds * 2;
+		size_t X = in.width() - kx + 1;	//calculate the size of the output image
+		size_t Y = in.height() - ky + 1;
+		stim::image<T> r(X, Y, in.channels());	//create an output image
+
+		K* gx = (K*)malloc(kx * sizeof(K));	//allocate space for the gaussian kernels
+		K* gy = (K*)malloc(ky * sizeof(K));
+		K mux = (K)kx / (K)2;	//calculate the mean of the gaussian (center of the kernel)
+		K muy = (K)ky / (K)2;
+		for (size_t xi = 0; xi < kx; xi++)	//evaluate the kernels
+			gx[xi] = gauss1d((K)xi, mux, stdx);
+		for (size_t yi = 0; yi < ky; yi++)
+			gy[yi] = gauss1d((K)yi, muy, stdy);
+
+		std::vector< stim::image<T> > IN = in.split();	//split the input image into channels
+		std::vector< stim::image<T> > R = r.split();	//split the output image into channels
+		for (size_t c = 0; c < in.channels(); c++)	//for each channel
+			cpu_sepconv2(R[c].data(), IN[c].data(), gx, gy, IN[c].width(), IN[c].height(), kx, ky);
+
+		r.merge(R);	//merge the blurred channels into the final image
+		return r;
+	}
+}
+
+
+#endif
\ No newline at end of file
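
Because gx and gy are point samples of the continuous Gaussian evaluated by gauss1d, their taps sum only approximately to 1 (the error grows as stdx or stdy shrinks), so the blur can slightly rescale image intensity. A possible normalization pass is sketched below as a hypothetical helper; it is not part of the patch.

//Sketch: rescale a sampled 1D kernel so its taps sum to 1, preserving overall brightness.
template<typename K>
void normalize_kernel(K* k, size_t n) {
	K sum = 0;
	for (size_t i = 0; i < n; i++) sum += k[i];	//total weight of the sampled taps
	if (sum != 0)
		for (size_t i = 0; i < n; i++) k[i] /= sum;	//rescale so the taps sum to 1
}

//possible usage inside cpu_gauss2, after the kernels are evaluated:
//	normalize_kernel(gx, kx);
//	normalize_kernel(gy, ky);
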
diff --git a/stim/math/filters/gauss2.h b/stim/math/filters/gauss2.h
deleted file mode 100644
index 44cae82..0000000
--- a/stim/math/filters/gauss2.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifndef STIM_CUDA_GAUSS2_H
-#define STIM_CUDA_GAUSS2_H
-
-#include
-#include
-#include
-
-namespace stim {
-
-	template<typename T>
-	T gauss1d(T x, T mu, T std) {
-		return ((T)1 / (T)sqrt(stim::TAU * std * std) * exp(-(x - mu)*(x - mu) / (2 * std*std)));
-	}
-	///Perform a 2D gaussian convolution on an input image.
-	///@param in is a pointer to the input image
-	///@param stdx is the standard deviation (in pixels) along the x axis
-	///@param stdy is the standard deviation (in pixels) along the y axis
-	///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel
-	template<typename T, typename K>
-	stim::image<T> cpu_gauss2(const stim::image<T>& in, K stdx, K stdy, size_t nstds = 3) {
-		size_t kx = stdx * nstds * 2;	//calculate the kernel sizes
-		size_t ky = stdy * nstds * 2;
-		size_t X = in.width() - kx + 1;	//calculate the size of the output image
-		size_t Y = in.height() - ky + 1;
-		stim::image<T> r(X, Y, in.channels());	//create an output image
-
-		K* gx = (K*)malloc(kx * sizeof(K));	//allocate space for the gaussian kernels
-		K* gy = (K*)malloc(ky * sizeof(K));
-		K mux = (K)kx / (K)2;	//calculate the mean of the gaussian (center of the kernel)
-		K muy = (K)ky / (K)2;
-		for (size_t xi = 0; xi < kx; xi++)	//evaluate the kernels
-			gx[xi] = gauss1d((K)xi, mux, stdx);
-		for (size_t yi = 0; yi < ky; yi++)
-			gy[yi] = gauss1d((K)yi, muy, stdy);
-
-		std::vector< stim::image<T> > IN = in.split();	//split the input image into channels
-		std::vector< stim::image<T> > R = r.split();	//split the output image into channels
-		for (size_t c = 0; c < in.channels(); c++)	//for each channel
-			cpu_sepconv2(R[c].data(), IN[c].data(), gx, gy, IN[c].width(), IN[c].height(), kx, ky);
-
-		r.merge(R);	//merge the blurred channels into the final image
-		return r;
-	}
-}
-
-
-#endif
\ No newline at end of file
diff --git a/stim/math/filters/gauss3.cuh b/stim/math/filters/gauss3.cuh
new file mode 100644
index 0000000..4b72dd9
--- /dev/null
+++ b/stim/math/filters/gauss3.cuh
@@ -0,0 +1,55 @@
+#ifndef STIM_CUDA_GAUSS3_H
+#define STIM_CUDA_GAUSS3_H
+#include
+#include
+#include
+
+namespace stim
+{
+	///Perform a 3D gaussian convolution on an input image.
+	///@param in is a pointer to the input data.
+	///@param dimx is the size of in* in the x direction.
+	///@param dimy is the size of in* in the y direction.
+	///@param dimz is the size of in* in the z direction.
+	///@param stdx is the standard deviation (in pixels) along the x axis.
+	///@param stdy is the standard deviation (in pixels) along the y axis.
+	///@param stdz is the standard deviation (in pixels) along the z axis.
+	///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel.
+	template<typename T, typename K>
+	void cpu_gauss3(T* in, K dimx, K dimy, K dimz, K stdx, K stdy, K stdz, size_t nstds = 3)
+	{
+		//Set up the sizes of the gaussian kernels.
+		size_t kx = stdx * nstds * 2;
+		size_t ky = stdy * nstds * 2;
+		size_t kz = stdz * nstds * 2;
+
+		//Set up the sizes of the new output, which will be kx, ky, kz smaller than the input.
+		size_t X = dimx - kx + 1;
+		size_t Y = dimy - ky + 1;
+		size_t Z = dimz - kz + 1;
+		T* out = (T*) malloc(X*Y*Z* sizeof(T));
+
+		///Set up the memory that will store the gaussians
+		K* gaussx = (K*)malloc(kx * sizeof(K));
+		K* gaussy = (K*)malloc(ky * sizeof(K));
+		K* gaussz = (K*)malloc(kz * sizeof(K));
+
+		///Set up the midpoints of the gaussians.
+		K midgaussx = (K) kx / (K)2;
+		K midgaussy = (K) ky / (K)2;
+		K midgaussz = (K) kz / (K)2;
+
+		///Evaluate the kernels in each cardinal direction.
+		for(size_t i = 0; i < kx; i++)
+			gaussx[i] = gauss1d((K) i, midgaussx, stdx);
+
+		for(size_t i = 0; i < ky; i++)
+			gaussy[i] = gauss1d((K) i, midgaussy, stdy);
+
+		for(size_t i = 0; i < kz; i++)
+			gaussz[i] = gauss1d((K) i, midgaussz, stdz);
+
+		cpu_sepconv3(out, in, gaussx, gaussy, gaussz, dimx, dimy, dimz, kx, ky, kz);
+
+	}
+}
+#endif
diff --git a/stim/math/filters/gauss3.h b/stim/math/filters/gauss3.h
deleted file mode 100644
index f6a5588..0000000
--- a/stim/math/filters/gauss3.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef STIM_CUDA_GAUSS3_H
-#define STIM_CUDA_GAUSS3_H
-#include
-#include
-#include
-
-namespace stim
-{
-	///Perform a 3D gaussian convolution on an input image.
-	///@param in is a pointer to the input data.
-	///@param dimx is the size of in* in the x direction.
-	///@param dimx is the size of in* in the y direction.
-	///@param dimx is the size of in* in the z direction.
-	///@param stdx is the standard deviation (in pixels) along the x axis.
-	///@param stdy is the standard deviation (in pixels) along the y axis.
-	///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel.
-	template<typename T, typename K>
-	void cpu_gauss3(T* in, K dimx, K dimy, K dimz, K stdx, K stdy, K stdz, size_t nstds = 3)
-	{
-		//Set up the sizes of the gaussian Kernels.
-		size_t kx = stdx * nstds * 2;
-		size_t ky = stdy * nstds * 2;
-		size_t kz = stdz * nstds * 2;
-
-		//Set up the sizes of the new output, which will be kx, ky, kz, smaller than the input.
-		size_t X = dimx - kx + 1;
-		size_t Y = dimy - ky + 1;
-		size_t Z = dimz - kz + 1;
-		T* out = (T*) malloc(X*Y*Z* sizeof(T));
-
-		///Set up the memory that will store the gaussians
-		K* gaussx = (K*)malloc(kx * sizeof(K));
-		K* gaussy = (K*)malloc(ky * sizeof(K));
-		K* gaussz = (K*)malloc(kz * sizeof(K));
-
-		///Set up the midpoints of the gaussians.
-		K midgaussx = (K) kx / (K)2;
-		K midgaussy = (K) ky / (K)2;
-		K midgaussz = (K) kz / (K)2;
-
-		///Evaluate the kernels in each cardinal direction.
-		for(size_t i = 0; i < kx; i++)
-			gaussx[i] = gauss1d((K) i, midgaussx, stdx);
-
-		for(size_t i = 0; i < kx; i++)
-			gaussy[i] = gauss1d((K) i, midgaussy, stdy);
-
-		for(size_t i = 0; i < kx; i++)
-			gaussz[i] = gauss1d((K) i, midgaussz, stdz);
-
-		cpu_sepconv3(out, in, gaussx, gaussy, gaussz, dimx, dimy, dimz, kx, ky, kz);
-
-	}
-}
-#endif
diff --git a/stim/math/filters/resample2.cuh b/stim/math/filters/resample2.cuh
new file mode 100644
index 0000000..a3263ae
--- /dev/null
+++ b/stim/math/filters/resample2.cuh
@@ -0,0 +1,16 @@
+#ifndef STIM_CUDA_RESAMPLE2_H
+#define STIM_CUDA_RESAMPLE2_H
+
+#include
+#include
+
+
+///Downsamples a 2D image by a factor f using a box filter. Any pixels outside of the valid region
+///	(for example if X%f != 0) are chopped.
+template<typename T>
+void cpu_resample2(T* out, T* in, size_t f, size_t sx, size_t sy) {
+
+}
+
+
+#endif
\ No newline at end of file
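
cpu_resample2 is committed here as an empty stub. Based only on the comment above it (box filter of factor f, with any partial blocks chopped), one plausible implementation could look like the sketch below; the name cpu_resample2_sketch and the assumption of a floating-point T are illustrative, not the author's code.

//Sketch: average each complete f x f block of the input into one output pixel.
template<typename T>
void cpu_resample2_sketch(T* out, T* in, size_t f, size_t sx, size_t sy) {
	size_t X = sx / f;	//output size; pixels that don't fill a complete f x f block are chopped
	size_t Y = sy / f;
	for (size_t yi = 0; yi < Y; yi++) {
		for (size_t xi = 0; xi < X; xi++) {
			T sum = 0;
			for (size_t fy = 0; fy < f; fy++)	//sum the f x f block
				for (size_t fx = 0; fx < f; fx++)
					sum += in[(yi * f + fy) * sx + (xi * f + fx)];
			out[yi * X + xi] = sum / (T)(f * f);	//store the box average (assumes floating-point T)
		}
	}
}
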
diff --git a/stim/math/filters/sepconv2.cuh b/stim/math/filters/sepconv2.cuh
new file mode 100644
index 0000000..5946c6a
--- /dev/null
+++ b/stim/math/filters/sepconv2.cuh
@@ -0,0 +1,89 @@
+#ifndef STIM_CUDA_SEPCONV2_H
+#define STIM_CUDA_SEPCONV2_H
+#include
+#ifdef __CUDACC__
+#include
+#include
+#endif
+
+namespace stim {
+#ifdef __CUDACC__
+	//Performs a separable convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void gpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
+		cudaDeviceProp p;
+		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
+		size_t tmax = p.maxThreadsPerBlock;
+		dim3 nt(sqrt(tmax), sqrt(tmax));	//calculate the block dimensions
+		size_t X = sx - kx + 1;	//calculate the x size of the output image
+		T* temp;	//declare a temporary variable to store the intermediate image
+		HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T)));	//allocate memory for the intermediate image
+
+		dim3 nb(X / nt.x + 1, sy / nt.y + 1);	//calculate the grid dimensions
+		size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T);	//shared memory bytes required to store block data
+		if (sm > p.sharedMemPerBlock) {
+			std::cout << "Error in stim::gpu_sepconv2() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1);	//launch the kernel to compute the intermediate image
+
+		size_t Y = sy - ky + 1;	//calculate the y size of the output image
+		nb.y = Y / nt.y + 1;	//update the grid dimensions to reflect the Y-axis size of the output image
+		sm = nt.x * (nt.y + ky - 1) * sizeof(T);	//calculate the amount of shared memory needed for the second pass
+		if (sm > p.sharedMemPerBlock) {
+			std::cout << "Error in stim::gpu_sepconv2() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<<nb, nt, sm>>> (out, temp, k1, X, sy, 1, ky);	//launch the kernel to compute the final image
+		HANDLE_ERROR(cudaFree(temp));	//free memory allocated for the intermediate image
+	}
+#endif
+	//Performs a separable convolution of a 2D image. Only valid pixels based on the kernel are returned.
+	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param k0 is the x-axis convolution filter
+	//@param k1 is the y-axis convolution filter
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void cpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
+		size_t X = sx - kx + 1;	//x size of the output image
+		size_t Y = sy - ky + 1;
+#ifdef __CUDACC__
+		//allocate memory and copy everything to the GPU
+		T* gpu_in;
+		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
+		K* gpu_k0;
+		HANDLE_ERROR(cudaMalloc(&gpu_k0, kx * sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_k0, k0, kx * sizeof(K), cudaMemcpyHostToDevice));
+		K* gpu_k1;
+		HANDLE_ERROR(cudaMalloc(&gpu_k1, ky * sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_k1, k1, ky * sizeof(K), cudaMemcpyHostToDevice));
+		T* gpu_out;
+		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
+		gpu_sepconv2(gpu_out, gpu_in, gpu_k0, gpu_k1, sx, sy, kx, ky);	//execute the GPU kernel
+		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
+		HANDLE_ERROR(cudaFree(gpu_in));
+		HANDLE_ERROR(cudaFree(gpu_k0));
+		HANDLE_ERROR(cudaFree(gpu_k1));
+		HANDLE_ERROR(cudaFree(gpu_out));
+#else
+		T* temp = (T*)malloc(X * sy * sizeof(T));	//allocate space for the intermediate image
+		cpu_conv2(temp, in, k0, sx, sy, kx, 1);	//evaluate the intermediate image
+		cpu_conv2(out, temp, k1, X, sy, 1, ky);	//evaluate the final image
+		free(temp);	//free the memory for the intermediate image
+#endif
+	}
+}
+
+#endif
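
The value of the separable path is cost: a kx x ky kernel applied directly needs kx * ky multiply-adds per output pixel, while the two 1D passes need only kx + ky (for a 25 x 25 Gaussian that is 50 instead of 625, about 12.5x fewer). The two paths agree whenever the 2D kernel is the outer product of the 1D filters; the helper below (illustrative only, not part of the patch) builds that equivalent 2D kernel so results can be compared against cpu_conv2 in conv2.cuh.

#include <vector>

//Sketch: a separable kernel is the outer product k[y*kx + x] = k1[y] * k0[x], so
//cpu_sepconv2(out, in, k0, k1, ...) should match cpu_conv2 with this full kx x ky kernel.
template<typename K>
std::vector<K> outer_product(const K* k0, size_t kx, const K* k1, size_t ky) {
	std::vector<K> k(kx * ky);
	for (size_t y = 0; y < ky; y++)
		for (size_t x = 0; x < kx; x++)
			k[y * kx + x] = k1[y] * k0[x];	//full 2D kernel equivalent to the separable pair
	return k;
}
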
diff --git a/stim/math/filters/sepconv2.h b/stim/math/filters/sepconv2.h
deleted file mode 100644
index d05b697..0000000
--- a/stim/math/filters/sepconv2.h
+++ /dev/null
@@ -1,89 +0,0 @@
-#ifndef STIM_CUDA_SEPCONV2_H
-#define STIM_CUDA_SEPCONV2_H
-#include
-#ifdef __CUDACC__
-#include
-#include
-#endif
-
-namespace stim {
-#ifdef __CUDACC__
-	//Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
-	//@param out is a pointer to the output image
-	//@param in is a pointer to the input image
-	//@param sx is the size of the input image along X
-	//@param sy is the size of the input image along Y
-	//@param kx is the size of the kernel along X
-	//@param ky is the size of the kernel along Y
-	template<typename T, typename K>
-	void gpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
-		cudaDeviceProp p;
-		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
-		size_t tmax = p.maxThreadsPerBlock;
-		dim3 nt(sqrt(tmax), sqrt(tmax));	//calculate the block dimensions
-		size_t X = sx - kx + 1;	//calculate the x size of the output image
-		T* temp;	//declare a temporary variable to store the intermediate image
-		HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T)));	//allocate memory for the intermediate image
-
-		dim3 nb(X / nt.x + 1, sy / nt.y + 1);	//calculate the grid dimensions
-		size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T);	//shared memory bytes required to store block data
-		if (sm > p.sharedMemPerBlock) {
-			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
-			exit(1);
-		}
-		kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1);	//launch the kernel to compute the intermediate image
-
-		size_t Y = sy - ky + 1;	//calculate the y size of the output image
-		nb.y = Y / nt.y + 1;	//update the grid dimensions to reflect the Y-axis size of the output image
-		sm = nt.x * (nt.y + ky - 1) * sizeof(T);	//calculate the amount of shared memory needed for the second pass
-		if (sm > p.sharedMemPerBlock) {
-			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
-			exit(1);
-		}
-		kernel_conv2 <<<nb, nt, sm>>> (out, temp, k1, X, sy, 1, ky);	//launch the kernel to compute the final image
-		HANDLE_ERROR(cudaFree(temp));	//free memory allocated for the intermediate image
-	}
-#endif
-	//Performs a separable convolution of a 2D image. Only valid pixels based on the kernel are returned.
-	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
-	//@param out is a pointer to the output image
-	//@param in is a pointer to the input image
-	//@param k0 is the x-axis convolution filter
-	//@param k1 is the y-axis convolution filter
-	//@param sx is the size of the input image along X
-	//@param sy is the size of the input image along Y
-	//@param kx is the size of the kernel along X
-	//@param ky is the size of the kernel along Y
-	template<typename T, typename K>
-	void cpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
-		size_t X = sx - kx + 1;	//x size of the output image
-		size_t Y = sy - ky + 1;
-#ifdef __CUDACC__
-		//allocate memory and copy everything to the GPU
-		T* gpu_in;
-		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
-		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
-		K* gpu_k0;
-		HANDLE_ERROR(cudaMalloc(&gpu_k0, kx * sizeof(K)));
-		HANDLE_ERROR(cudaMemcpy(gpu_k0, k0, kx * sizeof(K), cudaMemcpyHostToDevice));
-		K* gpu_k1;
-		HANDLE_ERROR(cudaMalloc(&gpu_k1, ky * sizeof(K)));
-		HANDLE_ERROR(cudaMemcpy(gpu_k1, k1, ky * sizeof(K), cudaMemcpyHostToDevice));
-		T* gpu_out;
-		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
-		gpu_sepconv2(gpu_out, gpu_in, gpu_k0, gpu_k1, sx, sy, kx, ky);	//execute the GPU kernel
-		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
-		HANDLE_ERROR(cudaFree(gpu_in));
-		HANDLE_ERROR(cudaFree(gpu_k0));
-		HANDLE_ERROR(cudaFree(gpu_k1));
-		HANDLE_ERROR(cudaFree(gpu_out));
-#else
-		T* temp = (T*)malloc(X * sy * sizeof(T));	//allocate space for the intermediate image
-		cpu_conv2(temp, in, k0, sx, sy, kx, 1);	//evaluate the intermediate image
-		cpu_conv2(out, temp, k1, X, sy, 1, ky);	//evaluate the final image
-		free(temp);	//free the memory for the intermediate image
-#endif
-	}
-}
-
-#endif
diff --git a/stim/math/filters/sepconv3.cuh b/stim/math/filters/sepconv3.cuh
new file mode 100644
index 0000000..7250d26
--- /dev/null
+++ b/stim/math/filters/sepconv3.cuh
@@ -0,0 +1,123 @@
+#ifndef STIM_CUDA_SEPCONV3_H
+#define STIM_CUDA_SEPCONV3_H
+
+#include
+#include
+#ifdef __CUDACC__
+	#include
+	#include
+#endif
+
+namespace stim
+{
+#ifdef __CUDACC__
+	template<typename T, typename K>
+	void gpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
+	{
+		size_t X = dimx - kx + 1;
+		size_t Y = dimy - ky + 1;
+		size_t Z = dimz - kz + 1;
+
+		T* temp_out;
+		int idx_IN;
+		int idx_OUT;
+		HANDLE_ERROR(cudaMalloc(&temp_out, X*Y*dimz*sizeof(T)));
+
+		for(int i = 0; i < dimz; i++)
+		{
+			idx_IN = (dimx*dimy)*i-i;
+			idx_OUT = (X*Y)*i-i;
+			gpu_sepconv2(&temp_out[idx_OUT], &in[idx_IN], k0, k1, dimx, dimy, kx, ky);
+		}
+
+		cudaDeviceProp p;
+		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
+		size_t tmax = p.maxThreadsPerBlock;
+
+		dim3 numThreads(sqrt(tmax), sqrt(tmax));
+		dim3 numBlocks(X*Y/numThreads.x + 1, dimz/numThreads.y + 1);
+		size_t sharedMem = (numThreads.x + kz - 1) * numThreads.y * sizeof(T);
+		if(sharedMem > p.sharedMemPerBlock)
+		{
+			std::cout << "Error in stim::gpu_sepconv3() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<< numBlocks, numThreads, sharedMem >>> (out, temp_out, k2, X*Y, dimz, 1, kz);
+		HANDLE_ERROR(cudaFree(temp_out));
+	}
+#endif
+
+	//Performs a separable convolution of a 3D image. Only valid pixels based on the kernel are returned.
+	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1, kz-1)
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param k0 is the x-axis convolution filter
+	//@param k1 is the y-axis convolution filter
+	//@param k2 is the z-axis convolution filter
+	//@param dimx is the size of the input image along X
+	//@param dimy is the size of the input image along Y
+	//@param dimz is the size of the input image along Z
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	//@param kz is the size of the kernel along Z
+	template<typename T, typename K>
+	void cpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
+	{
+		//Set up the sizes of the new output, which will be kx, ky, kz smaller than the input.
+		size_t X = dimx - kx + 1;
+		size_t Y = dimy - ky + 1;
+		size_t Z = dimz - kz + 1;
+
+#ifdef __CUDACC__
+		///Set up all of the memory on the GPU
+		T* gpu_in;
+		HANDLE_ERROR(cudaMalloc(&gpu_in, dimx*dimy*dimz*sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(gpu_in, in, dimx*dimy*dimz*sizeof(T), cudaMemcpyHostToDevice));
+		K* gpu_kx;
+		HANDLE_ERROR(cudaMalloc(&gpu_kx, kx*sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_kx, k0, kx*sizeof(K), cudaMemcpyHostToDevice));
+		K* gpu_ky;
+		HANDLE_ERROR(cudaMalloc(&gpu_ky, ky*sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_ky, k1, ky*sizeof(K), cudaMemcpyHostToDevice));
+		K* gpu_kz;
+		HANDLE_ERROR(cudaMalloc(&gpu_kz, kz*sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_kz, k2, kz*sizeof(K), cudaMemcpyHostToDevice));
+		T* gpu_out;
+		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * Z * sizeof(T)));
+
+		///run the kernel
+		gpu_sepconv3(gpu_out, gpu_in, gpu_kx, gpu_ky, gpu_kz, dimx, dimy, dimz, kx, ky, kz);
+
+		///Copy the output
+		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X*Y*Z*sizeof(T), cudaMemcpyDeviceToHost));
+
+		///Free all the memory used.
+		HANDLE_ERROR(cudaFree(gpu_in));
+		HANDLE_ERROR(cudaFree(gpu_kx));
+		HANDLE_ERROR(cudaFree(gpu_ky));
+		HANDLE_ERROR(cudaFree(gpu_kz));
+		HANDLE_ERROR(cudaFree(gpu_out));
+#else
+		size_t idx_IN, idx_OUT;	//slice offsets into the input and intermediate volumes
+		T* temp = (T*) malloc(X * dimy * sizeof(T));	//row-convolved slice
+		T* temp3 = (T*) malloc(X * Y * dimz * sizeof(T));	//XY-convolved volume
+		for(int i = 0; i < dimz; i++)
+		{
+			idx_IN = (dimx*dimy)*i-i;
+			idx_OUT = (X*Y)*i-i;
+			cpu_conv2(temp, &in[idx_IN], k0, dimx, dimy, kx, 1);
+			cpu_conv2(&temp3[idx_OUT], temp, k1, X, dimy, 1, ky);
+		}
+		cpu_conv2(out, temp3, k2, X*Y, dimz, 1, kz);	//convolve along Z, treating the volume as an (X*Y) x dimz image
+		free(temp);
+		free(temp3);
+
+#endif
+	}
+}
+
+
+#endif
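
A minimal host-side usage sketch for cpu_sepconv3 (not part of the patch): it builds three 1D Gaussian kernels with gauss1d from gauss2.cuh and blurs a small float volume. All sizes, standard deviations, and include paths are illustrative assumptions, and it presumes the stim headers are on the include path and the file is built with nvcc.

#include <vector>
#include "stim/math/filters/gauss2.cuh"	//for stim::gauss1d (path from this patch)
#include "stim/math/filters/sepconv3.cuh"

int main() {
	size_t dimx = 64, dimy = 64, dimz = 32;	//input volume size (example values)
	size_t kx = 7, ky = 7, kz = 5;	//kernel sizes
	size_t X = dimx - kx + 1, Y = dimy - ky + 1, Z = dimz - kz + 1;	//valid-region output size
	std::vector<float> in(dimx * dimy * dimz, 1.0f);	//placeholder volume
	std::vector<float> k0(kx), k1(ky), k2(kz);	//1D Gaussian kernels
	for (size_t i = 0; i < kx; i++) k0[i] = stim::gauss1d((float)i, kx / 2.0f, 1.5f);
	for (size_t i = 0; i < ky; i++) k1[i] = stim::gauss1d((float)i, ky / 2.0f, 1.5f);
	for (size_t i = 0; i < kz; i++) k2[i] = stim::gauss1d((float)i, kz / 2.0f, 1.0f);
	std::vector<float> out(X * Y * Z);
	stim::cpu_sepconv3(out.data(), in.data(), k0.data(), k1.data(), k2.data(),
		dimx, dimy, dimz, kx, ky, kz);	//separable 3D blur
	return 0;
}
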
diff --git a/stim/math/filters/sepconv3.h b/stim/math/filters/sepconv3.h
deleted file mode 100644
index 11f1b0d..0000000
--- a/stim/math/filters/sepconv3.h
+++ /dev/null
@@ -1,123 +0,0 @@
-#ifndef STIM_CUDA_SEPCONV3_H
-#define STIM_CUDA_SEPCONV3_H
-
-#include
-#include
-#ifdef __CUDACC__
-	#include
-	#include
-#endif
-
-namespace stim
-{
-#ifdef __CUDACC__
-	template<typename T, typename K>
-	void gpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
-	{
-		size_t X = dimx - kx + 1;
-		size_t Y = dimy - ky + 1;
-		size_t Z = dimz - kz + 1;
-
-		T* temp_out;
-		int idx_IN;
-		int idx_OUT;
-		HANDLE_ERROR(cudaMalloc(&temp_out, X*Y*dimz*sizeof(T)));
-
-		for(int i = 0; i < dimz; i++)
-		{
-			idx_IN = (dimx*dimy)*i-i;
-			idx_OUT = (X*Y)*i-i;
-			gpu_sepconv2(&temp_out[idx_OUT], &in[idx_IN], k0, k1, dimx, dimy, kx, ky);
-		}
-
-		cudaDeviceProp p;
-		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
-		size_t tmax = p.maxThreadsPerBlock;
-
-		dim3 numThreads(sqrt(tmax), sqrt(tmax));
-		dim3 numBlocks(X*Y/numThreads.x + 1, dimz/numThreads.y + 1);
-		size_t sharedMem = (numThreads.x + kz - 1) * numThreads.y * sizeof(T);
-		if(sharedMem > p.sharedMemPerBlock)
-		{
-			std::cout << "Error in stim::gpu_sepconv3() - insufficient shared memory for this kernel." << std::endl;
-			exit(1);
-		}
-		kernel_conv2 <<< numBlocks, numThreads, sharedMem >>> (out, temp_out, k2, X*Y, dimz, 1, kz);
-		HANDLE_ERROR(cudaFree(temp_out));
-	}
-#endif
-
-	//Performs a separable convolution of a 3D image. Only valid pixels based on the kernel are returned.
-	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1, kz-1)
-	//@param out is a pointer to the output image
-	//@param in is a pointer to the input image
-	//@param k0 is the x-axis convolution filter
-	//@param k1 is the y-axis convolution filter
-	//@param k2 is the z-axis convolution filter
-	//@param dimx is the size of the input image along X
-	//@param dimy is the size of the input image along Y
-	//@param dimz is the size of the input image along Z
-	//@param kx is the size of the kernel along X
-	//@param ky is the size of the kernel along Y
-	//@param kz is the size of the kernel along Z
-	template<typename T, typename K>
-	void cpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
-	{
-		//Set up the sizes of the new output, which will be kx, ky, kz smaller than the input.
-		size_t X = dimx - kx + 1;
-		size_t Y = dimy - ky + 1;
-		size_t Z = dimz - kz + 1;
-
-#ifdef __CUDACC__
-		///Set up all of the memory on the GPU
-		T* gpu_in;
-		HANDLE_ERROR(cudaMalloc(&gpu_in, dimx*dimy*dimz*sizeof(T)));
-		HANDLE_ERROR(cudaMemcpy(gpu_in, in, dimx*dimy*dimz*sizeof(T), cudaMemcpyHostToDevice));
-		K* gpu_kx;
-		HANDLE_ERROR(cudaMalloc(&gpu_kx, kx*sizeof(K)));
-		HANDLE_ERROR(cudaMemcpy(gpu_kx, k0, kx*sizeof(K), cudaMemcpyHostToDevice));
-		K* gpu_ky;
-		HANDLE_ERROR(cudaMalloc(&gpu_ky, ky*sizeof(K)));
-		HANDLE_ERROR(cudaMemcpy(gpu_ky, k1, ky*sizeof(K), cudaMemcpyHostToDevice));
-		K* gpu_kz;
-		HANDLE_ERROR(cudaMalloc(&gpu_kz, kz*sizeof(K)));
-		HANDLE_ERROR(cudaMemcpy(gpu_kz, k2, kz*sizeof(K), cudaMemcpyHostToDevice));
-		T* gpu_out;
-		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * Z*sizeof(T)));
-
-		///run the kernel
-		gpu_sepconv3(gpu_out, gpu_in, gpu_kx, gpu_ky, gpu_kz, dimx, dimy, dimz, kx, ky, kz);
-
-		///Copy the output
-		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X*Y*Z*sizeof(T), cudaMemcpyDeviceToHost));
-
-		///Free all the memory used.
-		HANDLE_ERROR(cudaFree(gpu_in));
-		HANDLE_ERROR(cudaFree(gpu_kx));
-		HANDLE_ERROR(cudaFree(gpu_ky));
-		HANDLE_ERROR(cudaFree(gpu_kz));
-		HANDLE_ERROR(cudaFree(gpu_out));
-#else
-		T* temp = (T*) malloc(X * dimy * sizeof(T));
-		T* temp3 = (T*) malloc(X * Y * dimz * sizeof(T));
-		for(int i = 0; i < dimz; i++)
-		{
-			idx_IN = (dimx*dimy)*i-i;
-			idx_OUT = (X*Y)*i-i;
-			cpu_conv2(temp, &in[idx_IN], k0, dimx, dimy, kx, 1)
-			cpu_conv2(&temp3[idx_OUT], temp, k1, X, dimy, 1, ky);
-		}
-		cpu_conv2(out, temp, k2, X*Y, dimz, 1, kz);
-		free(temp);
-		free(temp3);
-
-#endif
-	}
-}
-
-
-#endif
--
libgit2 0.21.4