stim/math/filters/median2.cuh

#ifndef STIM_CUDA_MEDIAN2_H
#define STIM_CUDA_MEDIAN2_H
#include <iostream>
#include <cuda.h>
#include <cmath>
#include <algorithm>
#ifdef USING_CUDA
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stim/cuda/cudatools.h>
#endif
//#include <thrust/sort.h>
//#include <thrust/execution_policy.h>
//#include <thrust/binary_search.h>
//#include <thrust/device_ptr.h>
template <typename T> 
__device__ void cuswap ( T& a, T& b ){
	T c(a);
	a=b;
	b=c;
}
namespace stim{
	namespace cuda{
		template<typename T>
		__global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			int yi = blockIdx.y * blockDim.y + threadIdx.y;			
			size_t Xout = X - K + 1;
			size_t Yout = Y - K + 1;
			int i = yi * Xout + xi;
			//return if (i,j) is outside the matrix
			if(xi >= Xout || yi >= Yout)   return;		
			//scan the image, copy data from input to 2D window
			size_t kxi, kyi;
			for(kyi = 0; kyi < K; kyi++){
				for(kxi = 0; kxi < K; kxi++){
					kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi];				      
				}
			}					
					
			//calculate kernel radius 4
            int r = (K*K)/2;
			
			//sort the smallest half pixel values inside the window, calculate the middle one
			size_t Ki = i * K * K;
		
			//sort the smallest half pixel values inside the window, calculate the middle one
            for (int  p = 0; p < r+1; p++){                             
			    for(int kk = p + 1; kk < K*K; kk++){
                       if (kernel[Ki + kk] < kernel[Ki + p]){
						   cuswap<T>(kernel[Ki + kk], kernel[Ki + p]);
					   }
                  }
             }
            
			//copy the middle pixel value inside the window to output
            out[i] = kernel[Ki + r]; 
				     
        }
		 
		
		template<typename T>
		void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){
               
			//get the maximum number of threads per block for the CUDA device
            int threads_total = stim::maxThreadsPerBlock();
			//set threads in each block
			dim3 threads(sqrt(threads_total), sqrt(threads_total));
			//calculate the number of blocks
			dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1);
			//call the kernel to perform median filter function
		    cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K);
		
		}
		template<typename T>
		void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){
		#ifndef USING_CUDA
			
			//output image width and height
			 size_t X_out = X + K -1;
			 size_t Y_out = Y + K -1;
			 float* window = (float*)malloc(K * k *sizeof(float));
			 for(int i = 0; i< Y; i++){
 		       for(int j =0; j< X; j++){
				//Pick up window elements		
				int k = 0; 			
				for (int m=0; m< kernel_width; m++){
					for(int n = 0; n < kernel_width; n++){
						window[k] = in_image.at<float>(m+i, n+j);
						k++;					
					 }
				}
				//calculate kernel radius 4
				r_ker = K * K/2;
			    //Order elements (only half of them)
				for(int p = 0; p<r_ker+1; p++){			
				  //Find position of minimum element			       
			       for (int l = p + 1; l < size_kernel; l++){
     
					 if(window[l] < window[p]){
					     float t  = window[p];
					    window[p] = window[l];
					    window[l] = t;          }		
		     	}
					}
					
			  //Get result - the middle element	
				cpu_out[i * X_out + j] =  window[r_ker];	
			   }
			 }
		#else
			//calculate input and out image pixels & calculate kernel size
			size_t N_in = X * Y;									//number of pixels in the input image
			size_t N_out = (X - K + 1) * (Y - K + 1);								//number of pixels in the output image
            //size_t kernel_size = kernel_width*kernel_width;				//total number of pixels in the kernel
            
			//allocate memory on the GPU for the array
			T* gpu_in;																	//allocate device memory for the input image
			HANDLE_ERROR( cudaMalloc( &gpu_in, N_in * sizeof(T) ) );
			T* gpu_kernel; 															//allocate device memory for the kernel
			HANDLE_ERROR( cudaMalloc( &gpu_kernel, K * K * N_out * sizeof(T) ) );
			T* gpu_out;																	//allocate device memory for the output image
			HANDLE_ERROR( cudaMalloc( &gpu_out, N_out * sizeof(T) ) );
			//copy the array to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_in, cpu_in, N_in * sizeof(T), cudaMemcpyHostToDevice) );
			
			//call the GPU version of this function
			gpu_median2<T>(gpu_in, gpu_out, gpu_kernel, X, Y, K);
			//copy the array back to the CPU
			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) );
			//free allocated memory
			cudaFree(gpu_in);
			cudaFree(gpu_kernel);
			cudaFree(gpu_out);
		}
		
	}
	#endif
}
#endif