codebase / stimlib

Browse Code »

Commit ac134a0711b58259e5859f5836a6a749170fa47e

Authored by Jiabing Li 2018-01-19 15:45:01 -0600

2 parents 041ab420 b87014ef

Merge branch 'jiabing' of git.stim.ee.uh.edu:codebase/stimlib into jiabing

Showing 1 changed file with 166 additions and 0 deletions Show diff stats

Inline Side-by-side

stim/math/filters/median2.cuh 0 → 100644

Wrap text Show/Hide comments View file @ac134a0

	1	+#ifndef STIM_CUDA_MEDIAN2_H
	2	+#define STIM_CUDA_MEDIAN2_H
	3	+
	4	+#include <iostream>
	5	+#include <cuda.h>
	6	+#include <cmath>
	7	+#include <algorithm>
	8	+
	9	+#ifdef USING_CUDA
	10	+#include "cuda_runtime.h"
	11	+#include "device_launch_parameters.h"
	12	+#include <stim/cuda/cudatools.h>
	13	+#endif
	14	+
	15	+
	16	+//#include <thrust/sort.h>
	17	+//#include <thrust/execution_policy.h>
	18	+//#include <thrust/binary_search.h>
	19	+//#include <thrust/device_ptr.h>
	20	+
	21	+template <typename T>
	22	+__device__ void cuswap ( T& a, T& b ){
	23	+ T c(a);
	24	+ a=b;
	25	+ b=c;
	26	+}
	27	+
	28	+namespace stim{
	29	+ namespace cuda{
	30	+
	31	+ template<typename T>
	32	+ __global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){
	33	+
	34	+ int xi = blockIdx.x * blockDim.x + threadIdx.x;
	35	+ int yi = blockIdx.y * blockDim.y + threadIdx.y;
	36	+
	37	+ size_t Xout = X - K + 1;
	38	+ size_t Yout = Y - K + 1;
	39	+ int i = yi * Xout + xi;
	40	+
	41	+ //return if (i,j) is outside the matrix
	42	+ if(xi >= Xout \|\| yi >= Yout) return;
	43	+
	44	+ //scan the image, copy data from input to 2D window
	45	+ size_t kxi, kyi;
	46	+ for(kyi = 0; kyi < K; kyi++){
	47	+ for(kxi = 0; kxi < K; kxi++){
	48	+ kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi];
	49	+ }
	50	+ }
	51	+
	52	+ //calculate kernel radius 4
	53	+ int r = (K*K)/2;
	54	+
	55	+ //sort the smallest half pixel values inside the window, calculate the middle one
	56	+ size_t Ki = i * K * K;
	57	+
	58	+ //sort the smallest half pixel values inside the window, calculate the middle one
	59	+ for (int p = 0; p < r+1; p++){
	60	+ for(int kk = p + 1; kk < K*K; kk++){
	61	+ if (kernel[Ki + kk] < kernel[Ki + p]){
	62	+ cuswap<T>(kernel[Ki + kk], kernel[Ki + p]);
	63	+ }
	64	+ }
	65	+ }
	66	+
	67	+ //copy the middle pixel value inside the window to output
	68	+ out[i] = kernel[Ki + r];
	69	+
	70	+ }
	71	+
	72	+
	73	+ template<typename T>
	74	+ void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){
	75	+
	76	+ //get the maximum number of threads per block for the CUDA device
	77	+ int threads_total = stim::maxThreadsPerBlock();
	78	+
	79	+ //set threads in each block
	80	+ dim3 threads(sqrt(threads_total), sqrt(threads_total));
	81	+
	82	+ //calculate the number of blocks
	83	+ dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1);
	84	+
	85	+ //call the kernel to perform median filter function
	86	+ cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K);
	87	+
	88	+ }
	89	+
	90	+ template<typename T>
	91	+ void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){
	92	+
	93	+ #ifndef USING_CUDA
	94	+
	95	+ //output image width and height
	96	+ size_t X_out = X + K -1;
	97	+ size_t Y_out = Y + K -1;
	98	+
	99	+ float* window = (float)malloc(K k *sizeof(float));
	100	+
	101	+ for(int i = 0; i< Y; i++){
	102	+ for(int j =0; j< X; j++){
	103	+
	104	+ //Pick up window elements
	105	+ int k = 0;
	106	+ for (int m=0; m< kernel_width; m++){
	107	+ for(int n = 0; n < kernel_width; n++){
	108	+ window[k] = in_image.at<float>(m+i, n+j);
	109	+ k++;
	110	+ }
	111	+ }
	112	+
	113	+ //calculate kernel radius 4
	114	+ r_ker = K * K/2;
	115	+ //Order elements (only half of them)
	116	+ for(int p = 0; p<r_ker+1; p++){
	117	+ //Find position of minimum element
	118	+ for (int l = p + 1; l < size_kernel; l++){
	119	+
	120	+ if(window[l] < window[p]){
	121	+ float t = window[p];
	122	+ window[p] = window[l];
	123	+ window[l] = t; }
	124	+ }
	125	+ }
	126	+
	127	+ //Get result - the middle element
	128	+ cpu_out[i * X_out + j] = window[r_ker];
	129	+ }
	130	+ }
	131	+ #else
	132	+ //calculate input and out image pixels & calculate kernel size
	133	+ size_t N_in = X * Y; //number of pixels in the input image
	134	+ size_t N_out = (X - K + 1) * (Y - K + 1); //number of pixels in the output image
	135	+ //size_t kernel_size = kernel_width*kernel_width; //total number of pixels in the kernel
	136	+
	137	+ //allocate memory on the GPU for the array
	138	+ T* gpu_in; //allocate device memory for the input image
	139	+ HANDLE_ERROR( cudaMalloc( &gpu_in, N_in * sizeof(T) ) );
	140	+ T* gpu_kernel; //allocate device memory for the kernel
	141	+ HANDLE_ERROR( cudaMalloc( &gpu_kernel, K * K * N_out * sizeof(T) ) );
	142	+ T* gpu_out; //allocate device memory for the output image
	143	+ HANDLE_ERROR( cudaMalloc( &gpu_out, N_out * sizeof(T) ) );
	144	+
	145	+ //copy the array to the GPU
	146	+ HANDLE_ERROR( cudaMemcpy( gpu_in, cpu_in, N_in * sizeof(T), cudaMemcpyHostToDevice) );
	147	+
	148	+ //call the GPU version of this function
	149	+ gpu_median2<T>(gpu_in, gpu_out, gpu_kernel, X, Y, K);
	150	+
	151	+ //copy the array back to the CPU
	152	+ HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) );
	153	+
	154	+ //free allocated memory
	155	+ cudaFree(gpu_in);
	156	+ cudaFree(gpu_kernel);
	157	+ cudaFree(gpu_out);
	158	+
	159	+ }
	160	+
	161	+ }
	162	+ #endif
	163	+}
	164	+
	165	+
	166	+#endif
...	...