Commit 58fdd08b6e25b0d7f1fc07457b4a9a22ed918711
1 parent
d4ba0a6e
added median filter
Showing
1 changed file
with
165 additions
and
0 deletions
Show diff stats
1 | +#ifndef STIM_CUDA_MEDIAN2_H | ||
2 | +#define STIM_CUDA_MEDIAN2_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <cmath> | ||
7 | +#include <algorithm> | ||
8 | + | ||
9 | +#ifdef USING_CUDA | ||
10 | +#include "cuda_runtime.h" | ||
11 | +#include "device_launch_parameters.h" | ||
12 | +#include <stim/cuda/cudatools.h> | ||
13 | +#endif | ||
14 | + | ||
15 | +//#include <thrust/sort.h> | ||
16 | +//#include <thrust/execution_policy.h> | ||
17 | +//#include <thrust/binary_search.h> | ||
18 | +//#include <thrust/device_ptr.h> | ||
19 | + | ||
20 | +template <typename T> | ||
21 | +__device__ void cuswap ( T& a, T& b ){ | ||
22 | + T c(a); | ||
23 | + a=b; | ||
24 | + b=c; | ||
25 | +} | ||
26 | + | ||
27 | +namespace stim{ | ||
28 | + namespace cuda{ | ||
29 | + | ||
30 | + template<typename T> | ||
31 | + __global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){ | ||
32 | + | ||
33 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
34 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
35 | + | ||
36 | + size_t Xout = X - K + 1; | ||
37 | + size_t Yout = Y - K + 1; | ||
38 | + int i = yi * Xout + xi; | ||
39 | + | ||
40 | + //return if (i,j) is outside the matrix | ||
41 | + if(xi >= Xout || yi >= Yout) return; | ||
42 | + | ||
43 | + //scan the image, copy data from input to 2D window | ||
44 | + size_t kxi, kyi; | ||
45 | + for(kyi = 0; kyi < K; kyi++){ | ||
46 | + for(kxi = 0; kxi < K; kxi++){ | ||
47 | + kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi]; | ||
48 | + } | ||
49 | + } | ||
50 | + | ||
51 | + //calculate kernel radius 4 | ||
52 | + int r = (K*K)/2; | ||
53 | + | ||
54 | + //sort the smallest half pixel values inside the window, calculate the middle one | ||
55 | + size_t Ki = i * K * K; | ||
56 | + | ||
57 | + //sort the smallest half pixel values inside the window, calculate the middle one | ||
58 | + for (int p = 0; p < r+1; p++){ | ||
59 | + for(int kk = p + 1; kk < K*K; kk++){ | ||
60 | + if (kernel[Ki + kk] < kernel[Ki + p]){ | ||
61 | + cuswap<T>(kernel[Ki + kk], kernel[Ki + p]); | ||
62 | + } | ||
63 | + } | ||
64 | + } | ||
65 | + | ||
66 | + //copy the middle pixel value inside the window to output | ||
67 | + out[i] = kernel[Ki + r]; | ||
68 | + | ||
69 | + } | ||
70 | + | ||
71 | + | ||
72 | + template<typename T> | ||
73 | + void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){ | ||
74 | + | ||
75 | + //get the maximum number of threads per block for the CUDA device | ||
76 | + int threads_total = stim::maxThreadsPerBlock(); | ||
77 | + | ||
78 | + //set threads in each block | ||
79 | + dim3 threads(sqrt(threads_total), sqrt(threads_total)); | ||
80 | + | ||
81 | + //calculate the number of blocks | ||
82 | + dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1); | ||
83 | + | ||
84 | + //call the kernel to perform median filter function | ||
85 | + cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K); | ||
86 | + | ||
87 | + } | ||
88 | + | ||
89 | + template<typename T> | ||
90 | + void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){ | ||
91 | + | ||
92 | + #ifndef USING_CUDA | ||
93 | + | ||
94 | + //output image width and height | ||
95 | + size_t X_out = X + K -1; | ||
96 | + size_t Y_out = Y + K -1; | ||
97 | + | ||
98 | + float* window = (float*)malloc(K * k *sizeof(float)); | ||
99 | + | ||
100 | + for(int i = 0; i< Y; i++){ | ||
101 | + for(int j =0; j< X; j++){ | ||
102 | + | ||
103 | + //Pick up window elements | ||
104 | + int k = 0; | ||
105 | + for (int m=0; m< kernel_width; m++){ | ||
106 | + for(int n = 0; n < kernel_width; n++){ | ||
107 | + window[k] = in_image.at<float>(m+i, n+j); | ||
108 | + k++; | ||
109 | + } | ||
110 | + } | ||
111 | + | ||
112 | + //calculate kernel radius 4 | ||
113 | + r_ker = K * K/2; | ||
114 | + //Order elements (only half of them) | ||
115 | + for(int p = 0; p<r_ker+1; p++){ | ||
116 | + //Find position of minimum element | ||
117 | + for (int l = p + 1; l < size_kernel; l++){ | ||
118 | + | ||
119 | + if(window[l] < window[p]){ | ||
120 | + float t = window[p]; | ||
121 | + window[p] = window[l]; | ||
122 | + window[l] = t; } | ||
123 | + } | ||
124 | + } | ||
125 | + | ||
126 | + //Get result - the middle element | ||
127 | + cpu_out[i * X_out + j] = window[r_ker]; | ||
128 | + } | ||
129 | + } | ||
130 | + #else | ||
131 | + //calculate input and out image pixels & calculate kernel size | ||
132 | + size_t N_in = X * Y; //number of pixels in the input image | ||
133 | + size_t N_out = (X - K + 1) * (Y - K + 1); //number of pixels in the output image | ||
134 | + //size_t kernel_size = kernel_width*kernel_width; //total number of pixels in the kernel | ||
135 | + | ||
136 | + //allocate memory on the GPU for the array | ||
137 | + T* gpu_in; //allocate device memory for the input image | ||
138 | + HANDLE_ERROR( cudaMalloc( &gpu_in, N_in * sizeof(T) ) ); | ||
139 | + T* gpu_kernel; //allocate device memory for the kernel | ||
140 | + HANDLE_ERROR( cudaMalloc( &gpu_kernel, K * K * N_out * sizeof(T) ) ); | ||
141 | + T* gpu_out; //allocate device memory for the output image | ||
142 | + HANDLE_ERROR( cudaMalloc( &gpu_out, N_out * sizeof(T) ) ); | ||
143 | + | ||
144 | + //copy the array to the GPU | ||
145 | + HANDLE_ERROR( cudaMemcpy( gpu_in, cpu_in, N_in * sizeof(T), cudaMemcpyHostToDevice) ); | ||
146 | + | ||
147 | + //call the GPU version of this function | ||
148 | + gpu_median2<T>(gpu_in, gpu_out, gpu_kernel, X, Y, K); | ||
149 | + | ||
150 | + //copy the array back to the CPU | ||
151 | + HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) ); | ||
152 | + | ||
153 | + //free allocated memory | ||
154 | + cudaFree(gpu_in); | ||
155 | + cudaFree(gpu_kernel); | ||
156 | + cudaFree(gpu_out); | ||
157 | + | ||
158 | + } | ||
159 | + | ||
160 | + } | ||
161 | + #endif | ||
162 | +} | ||
163 | + | ||
164 | + | ||
165 | +#endif |