diff --git a/stim/math/filters/gauss3.h b/stim/math/filters/gauss3.h
new file mode 100644
index 0000000..15e0ae2
--- /dev/null
+++ b/stim/math/filters/gauss3.h
@@ -0,0 +1,55 @@
+#ifndef STIM_CUDA_GAUSS3_H
+#define STIM_CUDA_GAUSS3_H
+#include <stim/math/filters/sepconv3.h>
+#include <stim/math/filters/gauss2.h>
+#include <stim/math/constants.h>
+
+namespace stim
+{
+	///Perform a 3D gaussian convolution on an input image.
+        ///@param in is a pointer to the input data.
+	///@param dimx is the size of in* in the x direction.
+	///@param dimx is the size of in* in the y direction.
+	///@param dimx is the size of in* in the z direction.
+        ///@param stdx is the standard deviation (in pixels) along the x axis.
+        ///@param stdy is the standard deviation (in pixels) along the y axis.
+        ///@param nstds specifies the number of standard deviations of the Gaussian that will be k    ept in the kernel.
+	template<typename T, typename K>
+	void cpu_gauss3(T* in, K dimx, K dimy, K dimz, K stdx, K stdy, K stdz, size_t nstds = 3)
+	{
+		//Set up the sizes of the gaussian Kernels.
+		size_t kx = stdx * nstds * 2;
+		size_t ky = stdy * nstds * 2;
+		size_t kz = stdz * nstds * 2;
+	
+		//Set up the sizes of the new output, which will be kx, ky, kz, smaller than the input.
+		size_t X = dimx - kx +1; 
+		size_t Y = dimy - ky +1; 
+		size_t Z = dimz - kz +1; 
+		T* out = (T*) malloc(X*Y*Z* sizeof(T));
+
+		///Set up the memory that will store the gaussians
+		K* gaussx = (K*)malloc(kx *sizeof(K));
+		K* gaussy = (K*)malloc(ky *sizeof(K));
+		K* gaussz = (K*)malloc(kz *sizeof(K));
+
+		///Set up the midpoints of the gaussians.
+		K midgaussx = (K) kx/ (K)2;
+		K midgaussy = (K) ky/ (K)2;
+		K midgaussz = (K) kz/ (K)2;
+
+		///Evaluate the kernels in each cardinal direction.
+		for(size_t i = 0; i < kx; i++)
+			gaussx[i] = gauss1d((K) i, midgaussx, stdx);
+
+		for(size_t i = 0; i < kx; i++)
+			gaussy[i] = gauss1d((K) i, midgaussy, stdy);
+
+		for(size_t i = 0; i < kx; i++)
+			gaussz[i] = gauss1d((K) i, midgaussz, stdz);
+
+		cpu_sepconv3(out, in, gaussx, gaussy, gaussz, dimx, dimy, dimz, kx, ky, kz);
+
+	}
+}
+#endif
diff --git a/stim/math/filters/sepconv2.h b/stim/math/filters/sepconv2.h
index 5809595..d05b697 100644
--- a/stim/math/filters/sepconv2.h
+++ b/stim/math/filters/sepconv2.h
@@ -86,4 +86,4 @@ namespace stim {
 	}
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/stim/math/filters/sepconv3.h b/stim/math/filters/sepconv3.h
new file mode 100644
index 0000000..11f1b0d
--- /dev/null
+++ b/stim/math/filters/sepconv3.h
@@ -0,0 +1,123 @@
+#ifndef STIM_CUDA_SEPCONV3_H
+#define STIM_CUDA_SEPCONV3_H
+
+#include <stim/math/filters/conv2.h>
+#include <stim/math/filters/sepconv2.h>
+#ifdef __CUDACC__
+	#include <stim/cuda/cudatools.h>
+	#include <stim/cuda/sharedmem.cuh>
+#endif
+
+namespace stim
+{
+#ifdef __CUDACC__
+	template<typename T, typename K>
+	void gpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
+{
+
+	size_t X = dimx - kx + 1; 
+	size_t Y = dimy - ky + 1; 
+	size_t Z = dimz - kz + 1;
+	
+	T* temp_out;
+	int idx_IN;
+	int idx_OUT;
+	HANDLE_ERROR(cudaMalloc(&temp_out, X*Y*dimz*sizeof(T)));
+
+	for(int i = 0; i < dimz; i++)
+	{
+		idx_IN 	= (dimx*dimy)*i-i;
+		idx_OUT = (X*Y)*i-i;
+		gpu_sepconv2(&temp_out[idx_OUT], &in[idx_IN], k0, k1, dimx, dimy, kx, ky);
+	}
+
+	cudaDeviceProp p;
+	HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
+	size_t tmax = p.maxThreadsPerBlock;
+
+	dim3 numThreads(sqrt(tmax), sqrt(tmax));
+	dim3 numBlocks(X*Y/numThreads.x +1, dimz/numThreads.y + 1);
+	size_t sharedMem = (numThreads.x + kz - 1) * numThreads.y * sizeof(T);
+	if(sharedMem > p.sharedMemPerBlock)
+	{
+		std::cout << "Error in stim::gpu_sepconv3() - insufficient shared memory for this kernel." << std::endl;
+		exit(1);
+	}
+	kernel_conv2 <<< numBlocks, numThreads, sharedMem >>> (out, temp_out, k2, X*Y, dimz, 1, kz);
+	HANDLE_ERROR(cudaFree(temp_out));
+
+
+}
+#endif
+
+	//Performs a separable convolution of a 3D image. Only valid pixels based on the kernel ar    e returned.
+	//      As a result, the output image will be smaller than the input image by (kx-1, ky-1 , kz-1)
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param kx is the x-axis convolution filter
+	//@param ky is the y-axis convolution filter
+	//@param kz is the z-axis convolution filter
+	//@param dimx is the size of the input image along X
+	//@param dimy is the size of the input image along Y
+	//@param dimz is the size of the input image along Z
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	//@param kz is the size of the kernel along Z
+
+	template <typename T, typename K>
+	void cpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
+	{
+		//Set up the sizes of the new output, which will be kx, ky, kz, smaller than the i    nput.
+		size_t X = dimx - kx + 1; 
+		size_t Y = dimy - ky + 1; 
+		size_t Z = dimz - kz + 1;
+
+#ifdef __CUDACC__
+	///Set up all of the memory on the GPU
+	T* gpu_in;
+	HANDLE_ERROR(cudaMalloc(&gpu_in, dimx*dimy*dimz*sizeof(T)));
+	HANDLE_ERROR(cudaMemcpy(gpu_in, in, dimx*dimy*dimz*sizeof(T),cudaMemcpyHostToDevice));
+	K* gpu_kx;
+	HANDLE_ERROR(cudaMalloc(&gpu_kx, kx*sizeof(K)));
+	HANDLE_ERROR(cudaMemcpy(gpu_kx, k0, kx*sizeof(K),cudaMemcpyHostToDevice));
+	K* gpu_ky;
+	HANDLE_ERROR(cudaMalloc(&gpu_ky, ky*sizeof(K)));
+	HANDLE_ERROR(cudaMemcpy(gpu_ky, k1, ky*sizeof(K),cudaMemcpyHostToDevice));
+	K* gpu_kz;
+	HANDLE_ERROR(cudaMalloc(&gpu_kz, kz*sizeof(K)));
+	HANDLE_ERROR(cudaMemcpy(gpu_kz, k2, kz*sizeof(K),cudaMemcpyHostToDevice));
+	T* gpu_out;
+	HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * Z*sizeof(T)));
+
+	///run the kernel
+	gpu_sepconv3(gpu_out, gpu_in, gpu_kx, gpu_ky, gpu_kz, dimx, dimy, dimz, kx, ky, kz);
+
+	///Copy the output
+	HANDLE_ERROR(cudaMemcpy(out, gpu_out, X*Y*Z*sizeof(T), cudaMemcpyDeviceToHost));
+
+	///Free all the memory used.
+	HANDLE_ERROR(cudaFree(gpu_in));
+	HANDLE_ERROR(cudaFree(gpu_kx));
+	HANDLE_ERROR(cudaFree(gpu_ky));
+	HANDLE_ERROR(cudaFree(gpu_kz));
+	HANDLE_ERROR(cudaFree(gpu_out));
+#else
+	T* temp = (T*) malloc(X * dimy * sizeof(T));
+	T* temp3 = (T*) malloc(X * Y * dimz * sizeof(T));
+	for(int i = 0; i < dimz; i++)
+	{
+		idx_IN 	= (dimx*dimy)*i-i;
+		idx_OUT = (X*Y)*i-i;
+		cpu_conv2(temp, &in[idx_IN], k0, dimx, dimy, kx, 1)
+		cpu_conv2(&temp3[idx_OUT], temp, k1, X, dimy, 1, ky);
+	}
+	cpu_conv2(out, temp, k2, X*Y, dimz, 1, kz);
+	free(temp);
+	free(temp3);
+
+#endif
+	}
+}
+
+
+#endif
--
libgit2 0.21.4