Commit ac134a0711b58259e5859f5836a6a749170fa47e
Merge branch 'jiabing' of git.stim.ee.uh.edu:codebase/stimlib into jiabing
Showing
1 changed file
with
166 additions
and
0 deletions
Show diff stats
1 | +#ifndef STIM_CUDA_MEDIAN2_H | |
2 | +#define STIM_CUDA_MEDIAN2_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <cmath> | |
7 | +#include <algorithm> | |
8 | + | |
9 | +#ifdef USING_CUDA | |
10 | +#include "cuda_runtime.h" | |
11 | +#include "device_launch_parameters.h" | |
12 | +#include <stim/cuda/cudatools.h> | |
13 | +#endif | |
14 | + | |
15 | + | |
16 | +//#include <thrust/sort.h> | |
17 | +//#include <thrust/execution_policy.h> | |
18 | +//#include <thrust/binary_search.h> | |
19 | +//#include <thrust/device_ptr.h> | |
20 | + | |
21 | +template <typename T> | |
22 | +__device__ void cuswap ( T& a, T& b ){ | |
23 | + T c(a); | |
24 | + a=b; | |
25 | + b=c; | |
26 | +} | |
27 | + | |
28 | +namespace stim{ | |
29 | + namespace cuda{ | |
30 | + | |
31 | + template<typename T> | |
32 | + __global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){ | |
33 | + | |
34 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
35 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
36 | + | |
37 | + size_t Xout = X - K + 1; | |
38 | + size_t Yout = Y - K + 1; | |
39 | + int i = yi * Xout + xi; | |
40 | + | |
41 | + //return if (i,j) is outside the matrix | |
42 | + if(xi >= Xout || yi >= Yout) return; | |
43 | + | |
44 | + //scan the image, copy data from input to 2D window | |
45 | + size_t kxi, kyi; | |
46 | + for(kyi = 0; kyi < K; kyi++){ | |
47 | + for(kxi = 0; kxi < K; kxi++){ | |
48 | + kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi]; | |
49 | + } | |
50 | + } | |
51 | + | |
52 | + //calculate kernel radius 4 | |
53 | + int r = (K*K)/2; | |
54 | + | |
55 | + //sort the smallest half pixel values inside the window, calculate the middle one | |
56 | + size_t Ki = i * K * K; | |
57 | + | |
58 | + //sort the smallest half pixel values inside the window, calculate the middle one | |
59 | + for (int p = 0; p < r+1; p++){ | |
60 | + for(int kk = p + 1; kk < K*K; kk++){ | |
61 | + if (kernel[Ki + kk] < kernel[Ki + p]){ | |
62 | + cuswap<T>(kernel[Ki + kk], kernel[Ki + p]); | |
63 | + } | |
64 | + } | |
65 | + } | |
66 | + | |
67 | + //copy the middle pixel value inside the window to output | |
68 | + out[i] = kernel[Ki + r]; | |
69 | + | |
70 | + } | |
71 | + | |
72 | + | |
73 | + template<typename T> | |
74 | + void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){ | |
75 | + | |
76 | + //get the maximum number of threads per block for the CUDA device | |
77 | + int threads_total = stim::maxThreadsPerBlock(); | |
78 | + | |
79 | + //set threads in each block | |
80 | + dim3 threads(sqrt(threads_total), sqrt(threads_total)); | |
81 | + | |
82 | + //calculate the number of blocks | |
83 | + dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1); | |
84 | + | |
85 | + //call the kernel to perform median filter function | |
86 | + cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K); | |
87 | + | |
88 | + } | |
89 | + | |
90 | + template<typename T> | |
91 | + void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){ | |
92 | + | |
93 | + #ifndef USING_CUDA | |
94 | + | |
95 | + //output image width and height | |
96 | + size_t X_out = X + K -1; | |
97 | + size_t Y_out = Y + K -1; | |
98 | + | |
99 | + float* window = (float*)malloc(K * k *sizeof(float)); | |
100 | + | |
101 | + for(int i = 0; i< Y; i++){ | |
102 | + for(int j =0; j< X; j++){ | |
103 | + | |
104 | + //Pick up window elements | |
105 | + int k = 0; | |
106 | + for (int m=0; m< kernel_width; m++){ | |
107 | + for(int n = 0; n < kernel_width; n++){ | |
108 | + window[k] = in_image.at<float>(m+i, n+j); | |
109 | + k++; | |
110 | + } | |
111 | + } | |
112 | + | |
113 | + //calculate kernel radius 4 | |
114 | + r_ker = K * K/2; | |
115 | + //Order elements (only half of them) | |
116 | + for(int p = 0; p<r_ker+1; p++){ | |
117 | + //Find position of minimum element | |
118 | + for (int l = p + 1; l < size_kernel; l++){ | |
119 | + | |
120 | + if(window[l] < window[p]){ | |
121 | + float t = window[p]; | |
122 | + window[p] = window[l]; | |
123 | + window[l] = t; } | |
124 | + } | |
125 | + } | |
126 | + | |
127 | + //Get result - the middle element | |
128 | + cpu_out[i * X_out + j] = window[r_ker]; | |
129 | + } | |
130 | + } | |
131 | + #else | |
132 | + //calculate input and out image pixels & calculate kernel size | |
133 | + size_t N_in = X * Y; //number of pixels in the input image | |
134 | + size_t N_out = (X - K + 1) * (Y - K + 1); //number of pixels in the output image | |
135 | + //size_t kernel_size = kernel_width*kernel_width; //total number of pixels in the kernel | |
136 | + | |
137 | + //allocate memory on the GPU for the array | |
138 | + T* gpu_in; //allocate device memory for the input image | |
139 | + HANDLE_ERROR( cudaMalloc( &gpu_in, N_in * sizeof(T) ) ); | |
140 | + T* gpu_kernel; //allocate device memory for the kernel | |
141 | + HANDLE_ERROR( cudaMalloc( &gpu_kernel, K * K * N_out * sizeof(T) ) ); | |
142 | + T* gpu_out; //allocate device memory for the output image | |
143 | + HANDLE_ERROR( cudaMalloc( &gpu_out, N_out * sizeof(T) ) ); | |
144 | + | |
145 | + //copy the array to the GPU | |
146 | + HANDLE_ERROR( cudaMemcpy( gpu_in, cpu_in, N_in * sizeof(T), cudaMemcpyHostToDevice) ); | |
147 | + | |
148 | + //call the GPU version of this function | |
149 | + gpu_median2<T>(gpu_in, gpu_out, gpu_kernel, X, Y, K); | |
150 | + | |
151 | + //copy the array back to the CPU | |
152 | + HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) ); | |
153 | + | |
154 | + //free allocated memory | |
155 | + cudaFree(gpu_in); | |
156 | + cudaFree(gpu_kernel); | |
157 | + cudaFree(gpu_out); | |
158 | + | |
159 | + } | |
160 | + | |
161 | + } | |
162 | + #endif | |
163 | +} | |
164 | + | |
165 | + | |
166 | +#endif | ... | ... |