codebase / stimlib

Browse Code »

Commit 58fdd08b6e25b0d7f1fc07457b4a9a22ed918711

Authored by Jiabing Li 2016-11-09 14:52:45 -0600

1 parent d4ba0a6e

added median filter

Showing 1 changed file with 165 additions and 0 deletions Show diff stats

Inline Side-by-side

stim/math/filters/median2.cuh 0 → 100644

Wrap text Show/Hide comments View file @58fdd08

	1	+#ifndef STIM_CUDA_MEDIAN2_H
	2	+#define STIM_CUDA_MEDIAN2_H
	3	+
	4	+#include <iostream>
	5	+#include <cuda.h>
	6	+#include <cmath>
	7	+#include <algorithm>
	8	+
	9	+#ifdef USING_CUDA
	10	+#include "cuda_runtime.h"
	11	+#include "device_launch_parameters.h"
	12	+#include <stim/cuda/cudatools.h>
	13	+#endif
	14	+
	15	+//#include <thrust/sort.h>
	16	+//#include <thrust/execution_policy.h>
	17	+//#include <thrust/binary_search.h>
	18	+//#include <thrust/device_ptr.h>
	19	+
	20	+template <typename T>
	21	+__device__ void cuswap ( T& a, T& b ){
	22	+ T c(a);
	23	+ a=b;
	24	+ b=c;
	25	+}
	26	+
	27	+namespace stim{
	28	+ namespace cuda{
	29	+
	30	+ template<typename T>
	31	+ __global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){
	32	+
	33	+ int xi = blockIdx.x * blockDim.x + threadIdx.x;
	34	+ int yi = blockIdx.y * blockDim.y + threadIdx.y;
	35	+
	36	+ size_t Xout = X - K + 1;
	37	+ size_t Yout = Y - K + 1;
	38	+ int i = yi * Xout + xi;
	39	+
	40	+ //return if (i,j) is outside the matrix
	41	+ if(xi >= Xout \|\| yi >= Yout) return;
	42	+
	43	+ //scan the image, copy data from input to 2D window
	44	+ size_t kxi, kyi;
	45	+ for(kyi = 0; kyi < K; kyi++){
	46	+ for(kxi = 0; kxi < K; kxi++){
	47	+ kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi];
	48	+ }
	49	+ }
	50	+
	51	+ //calculate kernel radius 4
	52	+ int r = (K*K)/2;
	53	+
	54	+ //sort the smallest half pixel values inside the window, calculate the middle one
	55	+ size_t Ki = i * K * K;
	56	+
	57	+ //sort the smallest half pixel values inside the window, calculate the middle one
	58	+ for (int p = 0; p < r+1; p++){
	59	+ for(int kk = p + 1; kk < K*K; kk++){
	60	+ if (kernel[Ki + kk] < kernel[Ki + p]){
	61	+ cuswap<T>(kernel[Ki + kk], kernel[Ki + p]);
	62	+ }
	63	+ }
	64	+ }
	65	+
	66	+ //copy the middle pixel value inside the window to output
	67	+ out[i] = kernel[Ki + r];
	68	+
	69	+ }
	70	+
	71	+
	72	+ template<typename T>
	73	+ void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){
	74	+
	75	+ //get the maximum number of threads per block for the CUDA device
	76	+ int threads_total = stim::maxThreadsPerBlock();
	77	+
	78	+ //set threads in each block
	79	+ dim3 threads(sqrt(threads_total), sqrt(threads_total));
	80	+
	81	+ //calculate the number of blocks
	82	+ dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1);
	83	+
	84	+ //call the kernel to perform median filter function
	85	+ cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K);
	86	+
	87	+ }
	88	+
	89	+ template<typename T>
	90	+ void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){
	91	+
	92	+ #ifndef USING_CUDA
	93	+
	94	+ //output image width and height
	95	+ size_t X_out = X + K -1;
	96	+ size_t Y_out = Y + K -1;
	97	+
	98	+ float* window = (float)malloc(K k *sizeof(float));
	99	+
	100	+ for(int i = 0; i< Y; i++){
	101	+ for(int j =0; j< X; j++){
	102	+
	103	+ //Pick up window elements
	104	+ int k = 0;
	105	+ for (int m=0; m< kernel_width; m++){
	106	+ for(int n = 0; n < kernel_width; n++){
	107	+ window[k] = in_image.at<float>(m+i, n+j);
	108	+ k++;
	109	+ }
	110	+ }
	111	+
	112	+ //calculate kernel radius 4
	113	+ r_ker = K * K/2;
	114	+ //Order elements (only half of them)
	115	+ for(int p = 0; p<r_ker+1; p++){
	116	+ //Find position of minimum element
	117	+ for (int l = p + 1; l < size_kernel; l++){
	118	+
	119	+ if(window[l] < window[p]){
	120	+ float t = window[p];
	121	+ window[p] = window[l];
	122	+ window[l] = t; }
	123	+ }
	124	+ }
	125	+
	126	+ //Get result - the middle element
	127	+ cpu_out[i * X_out + j] = window[r_ker];
	128	+ }
	129	+ }
	130	+ #else
	131	+ //calculate input and out image pixels & calculate kernel size
	132	+ size_t N_in = X * Y; //number of pixels in the input image
	133	+ size_t N_out = (X - K + 1) * (Y - K + 1); //number of pixels in the output image
	134	+ //size_t kernel_size = kernel_width*kernel_width; //total number of pixels in the kernel
	135	+
	136	+ //allocate memory on the GPU for the array
	137	+ T* gpu_in; //allocate device memory for the input image
	138	+ HANDLE_ERROR( cudaMalloc( &gpu_in, N_in * sizeof(T) ) );
	139	+ T* gpu_kernel; //allocate device memory for the kernel
	140	+ HANDLE_ERROR( cudaMalloc( &gpu_kernel, K * K * N_out * sizeof(T) ) );
	141	+ T* gpu_out; //allocate device memory for the output image
	142	+ HANDLE_ERROR( cudaMalloc( &gpu_out, N_out * sizeof(T) ) );
	143	+
	144	+ //copy the array to the GPU
	145	+ HANDLE_ERROR( cudaMemcpy( gpu_in, cpu_in, N_in * sizeof(T), cudaMemcpyHostToDevice) );
	146	+
	147	+ //call the GPU version of this function
	148	+ gpu_median2<T>(gpu_in, gpu_out, gpu_kernel, X, Y, K);
	149	+
	150	+ //copy the array back to the CPU
	151	+ HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) );
	152	+
	153	+ //free allocated memory
	154	+ cudaFree(gpu_in);
	155	+ cudaFree(gpu_kernel);
	156	+ cudaFree(gpu_out);
	157	+
	158	+ }
	159	+
	160	+ }
	161	+ #endif
	162	+}
	163	+
	164	+
	165	+#endif
...	...