diff --git a/stim/math/filters/median2.cuh b/stim/math/filters/median2.cuh
new file mode 100644
index 0000000..78499ac
--- /dev/null
+++ b/stim/math/filters/median2.cuh
@@ -0,0 +1,165 @@
+#ifndef STIM_CUDA_MEDIAN2_H
+#define STIM_CUDA_MEDIAN2_H
+
+#include <iostream>
+#include <cuda.h>
+#include <cmath>
+#include <algorithm>
+
+#ifdef USING_CUDA
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include <stim/cuda/cudatools.h>
+#endif
+
+//#include <thrust/sort.h>
+//#include <thrust/execution_policy.h>
+//#include <thrust/binary_search.h>
+//#include <thrust/device_ptr.h>
+
+template <typename T> 
+__device__ void cuswap ( T& a, T& b ){
+	T c(a);
+	a=b;
+	b=c;
+}
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){
+
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;			
+
+			size_t Xout = X - K + 1;
+			size_t Yout = Y - K + 1;
+			int i = yi * Xout + xi;
+
+			//return if (i,j) is outside the matrix
+			if(xi >= Xout || yi >= Yout)   return;		
+
+			//scan the image, copy data from input to 2D window
+			size_t kxi, kyi;
+			for(kyi = 0; kyi < K; kyi++){
+				for(kxi = 0; kxi < K; kxi++){
+					kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi];				      
+				}
+			}					
+					
+			//calculate kernel radius 4
+            int r = (K*K)/2;
+			
+			//sort the smallest half pixel values inside the window, calculate the middle one
+			size_t Ki = i * K * K;
+		
+			//sort the smallest half pixel values inside the window, calculate the middle one
+            for (int  p = 0; p < r+1; p++){                             
+			    for(int kk = p + 1; kk < K*K; kk++){
+                       if (kernel[Ki + kk] < kernel[Ki + p]){
+						   cuswap<T>(kernel[Ki + kk], kernel[Ki + p]);
+					   }
+                  }
+             }
+            
+			//copy the middle pixel value inside the window to output
+            out[i] = kernel[Ki + r]; 
+				     
+        }
+		 
+		
+		template<typename T>
+		void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){
+               
+			//get the maximum number of threads per block for the CUDA device
+            int threads_total = stim::maxThreadsPerBlock();
+
+			//set threads in each block
+			dim3 threads(sqrt(threads_total), sqrt(threads_total));
+
+			//calculate the number of blocks
+			dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1);
+
+			//call the kernel to perform median filter function
+		    cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K);
+		
+		}
+
+		template<typename T>
+		void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){
+
+		#ifndef USING_CUDA
+			
+			//output image width and height
+			 size_t X_out = X + K -1;
+			 size_t Y_out = Y + K -1;
+
+			 float* window = (float*)malloc(K * k *sizeof(float));
+
+			 for(int i = 0; i< Y; i++){
+ 		       for(int j =0; j< X; j++){
+
+				//Pick up window elements		
+				int k = 0; 			
+				for (int m=0; m< kernel_width; m++){
+					for(int n = 0; n < kernel_width; n++){
+						window[k] = in_image.at<float>(m+i, n+j);
+						k++;					
+					 }
+				}
+
+				//calculate kernel radius 4
+				r_ker = K * K/2;
+			    //Order elements (only half of them)
+				for(int p = 0; p<r_ker+1; p++){			
+				  //Find position of minimum element			       
+			       for (int l = p + 1; l < size_kernel; l++){
+     
+					 if(window[l] < window[p]){
+					     float t  = window[p];
+					    window[p] = window[l];
+					    window[l] = t;          }		
+		     	}
+					}
+					
+			  //Get result - the middle element	
+				cpu_out[i * X_out + j] =  window[r_ker];	
+			   }
+			 }
+		#else
+			//calculate input and out image pixels & calculate kernel size
+			size_t N_in = X * Y;									//number of pixels in the input image
+			size_t N_out = (X - K + 1) * (Y - K + 1);								//number of pixels in the output image
+            //size_t kernel_size = kernel_width*kernel_width;				//total number of pixels in the kernel
+            
+			//allocate memory on the GPU for the array
+			T* gpu_in;																	//allocate device memory for the input image
+			HANDLE_ERROR( cudaMalloc( &gpu_in, N_in * sizeof(T) ) );
+			T* gpu_kernel; 															//allocate device memory for the kernel
+			HANDLE_ERROR( cudaMalloc( &gpu_kernel, K * K * N_out * sizeof(T) ) );
+			T* gpu_out;																	//allocate device memory for the output image
+			HANDLE_ERROR( cudaMalloc( &gpu_out, N_out * sizeof(T) ) );
+
+			//copy the array to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_in, cpu_in, N_in * sizeof(T), cudaMemcpyHostToDevice) );
+			
+			//call the GPU version of this function
+			gpu_median2<T>(gpu_in, gpu_out, gpu_kernel, X, Y, K);
+
+			//copy the array back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) );
+
+			//free allocated memory
+			cudaFree(gpu_in);
+			cudaFree(gpu_kernel);
+			cudaFree(gpu_out);
+
+		}
+		
+	}
+	#endif
+}
+
+
+#endif
--
libgit2 0.21.4