codebase / stimlib

Browse Code »

Commit 58fdd08b6e25b0d7f1fc07457b4a9a22ed918711

Authored by Jiabing Li 2016-11-09 14:52:45 -0600

1 parent d4ba0a6e

added median filter

Showing 1 changed file with 165 additions and 0 deletions Show diff stats

Inline Side-by-side

stim/math/filters/median2.cuh 0 → 100644

Show/Hide comments View file @58fdd08

		1	+#ifndef STIM_CUDA_MEDIAN2_H
		2	+#define STIM_CUDA_MEDIAN2_H
		3	+
		4	+#include <iostream>
		5	+#include <cuda.h>
		6	+#include <cmath>
		7	+#include <algorithm>
		8	+
		9	+#ifdef USING_CUDA
		10	+#include "cuda_runtime.h"
		11	+#include "device_launch_parameters.h"
		12	+#include <stim/cuda/cudatools.h>
		13	+#endif
		14	+
		15	+//#include <thrust/sort.h>
		16	+//#include <thrust/execution_policy.h>
		17	+//#include <thrust/binary_search.h>
		18	+//#include <thrust/device_ptr.h>
		19	+
		20	+template <typename T>
		21	+__device__ void cuswap ( T& a, T& b ){
		22	+ T c(a);
		23	+ a=b;
		24	+ b=c;
		25	+}
		26	+
		27	+namespace stim{
		28	+ namespace cuda{
		29	+
		30	+ template<typename T>
		31	+ __global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){
		32	+
		33	+ int xi = blockIdx.x * blockDim.x + threadIdx.x;
		34	+ int yi = blockIdx.y * blockDim.y + threadIdx.y;
		35	+
		36	+ size_t Xout = X - K + 1;
		37	+ size_t Yout = Y - K + 1;
		38	+ int i = yi * Xout + xi;
		39	+
		40	+ //return if (i,j) is outside the matrix
		41	+ if(xi >= Xout \|\| yi >= Yout) return;
		42	+
		43	+ //scan the image, copy data from input to 2D window
		44	+ size_t kxi, kyi;
		45	+ for(kyi = 0; kyi < K; kyi++){
		46	+ for(kxi = 0; kxi < K; kxi++){
		47	+ kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi];
		48	+ }
		49	+ }
		50	+
		51	+ //calculate kernel radius 4
		52	+ int r = (K*K)/2;
		53	+
		54	+ //sort the smallest half pixel values inside the window, calculate the middle one
		55	+ size_t Ki = i * K * K;
		56	+
		57	+ //sort the smallest half pixel values inside the window, calculate the middle one
		58	+ for (int p = 0; p < r+1; p++){
		59	+ for(int kk = p + 1; kk < K*K; kk++){
		60	+ if (kernel[Ki + kk] < kernel[Ki + p]){
		61	+ cuswap<T>(kernel[Ki + kk], kernel[Ki + p]);
		62	+ }
		63	+ }
		64	+ }
		65	+
		66	+ //copy the middle pixel value inside the window to output
		67	+ out[i] = kernel[Ki + r];
		68	+
		69	+ }
		70	+
		71	+
		72	+ template<typename T>
		73	+ void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){
		74	+
		75	+ //get the maximum number of threads per block for the CUDA device
		76	+ int threads_total = stim::maxThreadsPerBlock();
		77	+
		78	+ //set threads in each block
		79	+ dim3 threads(sqrt(threads_total), sqrt(threads_total));
		80	+
		81	+ //calculate the number of blocks
		82	+ dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1);
		83	+
		84	+ //call the kernel to perform median filter function
		85	+ cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K);
		86	+
		87	+ }
		88	+
		89	+ template<typename T>
		90	+ void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){
		91	+
		92	+ #ifndef USING_CUDA
		93	+
		94	+ //output image width and height
		95	+ size_t X_out = X + K -1;
		96	+ size_t Y_out = Y + K -1;
		97	+
		98	+ float* window = (float)malloc(K k *sizeof(float));
		99	+
		100	+ for(int i = 0; i< Y; i++){
		101	+ for(int j =0; j< X; j++){
		102	+
		103	+ //Pick up window elements
		104	+ int k = 0;
		105	+ for (int m=0; m< kernel_width; m++){
		106	+ for(int n = 0; n < kernel_width; n++){
		107	+ window[k] = in_image.at<float>(m+i, n+j);
		108	+ k++;
		109	+ }
		110	+ }
		111	+
		112	+ //calculate kernel radius 4
		113	+ r_ker = K * K/2;
		114	+ //Order elements (only half of them)
		115	+ for(int p = 0; p<r_ker+1; p++){
		116	+ //Find position of minimum element
		117	+ for (int l = p + 1; l < size_kernel; l++){
		118	+
		119	+ if(window[l] < window[p]){
		120	+ float t = window[p];
		121	+ window[p] = window[l];
		122	+ window[l] = t; }
		123	+ }
		124	+ }
		125	+
		126	+ //Get result - the middle element
		127	+ cpu_out[i * X_out + j] = window[r_ker];
		128	+ }
		129	+ }
		130	+ #else
		131	+ //calculate input and out image pixels & calculate kernel size
		132	+ size_t N_in = X * Y; //number of pixels in the input image
		133	+ size_t N_out = (X - K + 1) * (Y - K + 1); //number of pixels in the output image
		134	+ //size_t kernel_size = kernel_width*kernel_width; //total number of pixels in the kernel
		135	+
		136	+ //allocate memory on the GPU for the array
		137	+ T* gpu_in; //allocate device memory for the input image
		138	+ HANDLE_ERROR( cudaMalloc( &gpu_in, N_in * sizeof(T) ) );
		139	+ T* gpu_kernel; //allocate device memory for the kernel
		140	+ HANDLE_ERROR( cudaMalloc( &gpu_kernel, K * K * N_out * sizeof(T) ) );
		141	+ T* gpu_out; //allocate device memory for the output image
		142	+ HANDLE_ERROR( cudaMalloc( &gpu_out, N_out * sizeof(T) ) );
		143	+
		144	+ //copy the array to the GPU
		145	+ HANDLE_ERROR( cudaMemcpy( gpu_in, cpu_in, N_in * sizeof(T), cudaMemcpyHostToDevice) );
		146	+
		147	+ //call the GPU version of this function
		148	+ gpu_median2<T>(gpu_in, gpu_out, gpu_kernel, X, Y, K);
		149	+
		150	+ //copy the array back to the CPU
		151	+ HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) );
		152	+
		153	+ //free allocated memory
		154	+ cudaFree(gpu_in);
		155	+ cudaFree(gpu_kernel);
		156	+ cudaFree(gpu_out);
		157	+
		158	+ }
		159	+
		160	+ }
		161	+ #endif
		162	+}
		163	+
		164	+
		165	+#endif