Commit 58fdd08b6e25b0d7f1fc07457b4a9a22ed918711

Authored by Jiabing Li
1 parent d4ba0a6e

added median filter

Showing 1 changed file with 165 additions and 0 deletions   Show diff stats
stim/math/filters/median2.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_MEDIAN2_H
  2 +#define STIM_CUDA_MEDIAN2_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <cmath>
  7 +#include <algorithm>
  8 +
  9 +#ifdef USING_CUDA
  10 +#include "cuda_runtime.h"
  11 +#include "device_launch_parameters.h"
  12 +#include <stim/cuda/cudatools.h>
  13 +#endif
  14 +
  15 +//#include <thrust/sort.h>
  16 +//#include <thrust/execution_policy.h>
  17 +//#include <thrust/binary_search.h>
  18 +//#include <thrust/device_ptr.h>
  19 +
  20 +template <typename T>
  21 +__device__ void cuswap ( T& a, T& b ){
  22 + T c(a);
  23 + a=b;
  24 + b=c;
  25 +}
  26 +
  27 +namespace stim{
  28 + namespace cuda{
  29 +
  30 + template<typename T>
  31 + __global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){
  32 +
  33 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  34 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  35 +
  36 + size_t Xout = X - K + 1;
  37 + size_t Yout = Y - K + 1;
  38 + int i = yi * Xout + xi;
  39 +
  40 + //return if (i,j) is outside the matrix
  41 + if(xi >= Xout || yi >= Yout) return;
  42 +
  43 + //scan the image, copy data from input to 2D window
  44 + size_t kxi, kyi;
  45 + for(kyi = 0; kyi < K; kyi++){
  46 + for(kxi = 0; kxi < K; kxi++){
  47 + kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi];
  48 + }
  49 + }
  50 +
  51 + //calculate kernel radius 4
  52 + int r = (K*K)/2;
  53 +
  54 + //sort the smallest half pixel values inside the window, calculate the middle one
  55 + size_t Ki = i * K * K;
  56 +
  57 + //sort the smallest half pixel values inside the window, calculate the middle one
  58 + for (int p = 0; p < r+1; p++){
  59 + for(int kk = p + 1; kk < K*K; kk++){
  60 + if (kernel[Ki + kk] < kernel[Ki + p]){
  61 + cuswap<T>(kernel[Ki + kk], kernel[Ki + p]);
  62 + }
  63 + }
  64 + }
  65 +
  66 + //copy the middle pixel value inside the window to output
  67 + out[i] = kernel[Ki + r];
  68 +
  69 + }
  70 +
  71 +
  72 + template<typename T>
  73 + void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){
  74 +
  75 + //get the maximum number of threads per block for the CUDA device
  76 + int threads_total = stim::maxThreadsPerBlock();
  77 +
  78 + //set threads in each block
  79 + dim3 threads(sqrt(threads_total), sqrt(threads_total));
  80 +
  81 + //calculate the number of blocks
  82 + dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1);
  83 +
  84 + //call the kernel to perform median filter function
  85 + cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K);
  86 +
  87 + }
  88 +
  89 + template<typename T>
  90 + void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){
  91 +
  92 + #ifndef USING_CUDA
  93 +
  94 + //output image width and height
  95 + size_t X_out = X + K -1;
  96 + size_t Y_out = Y + K -1;
  97 +
  98 + float* window = (float*)malloc(K * k *sizeof(float));
  99 +
  100 + for(int i = 0; i< Y; i++){
  101 + for(int j =0; j< X; j++){
  102 +
  103 + //Pick up window elements
  104 + int k = 0;
  105 + for (int m=0; m< kernel_width; m++){
  106 + for(int n = 0; n < kernel_width; n++){
  107 + window[k] = in_image.at<float>(m+i, n+j);
  108 + k++;
  109 + }
  110 + }
  111 +
  112 + //calculate kernel radius 4
  113 + r_ker = K * K/2;
  114 + //Order elements (only half of them)
  115 + for(int p = 0; p<r_ker+1; p++){
  116 + //Find position of minimum element
  117 + for (int l = p + 1; l < size_kernel; l++){
  118 +
  119 + if(window[l] < window[p]){
  120 + float t = window[p];
  121 + window[p] = window[l];
  122 + window[l] = t; }
  123 + }
  124 + }
  125 +
  126 + //Get result - the middle element
  127 + cpu_out[i * X_out + j] = window[r_ker];
  128 + }
  129 + }
  130 + #else
  131 + //calculate input and out image pixels & calculate kernel size
  132 + size_t N_in = X * Y; //number of pixels in the input image
  133 + size_t N_out = (X - K + 1) * (Y - K + 1); //number of pixels in the output image
  134 + //size_t kernel_size = kernel_width*kernel_width; //total number of pixels in the kernel
  135 +
  136 + //allocate memory on the GPU for the array
  137 + T* gpu_in; //allocate device memory for the input image
  138 + HANDLE_ERROR( cudaMalloc( &gpu_in, N_in * sizeof(T) ) );
  139 + T* gpu_kernel; //allocate device memory for the kernel
  140 + HANDLE_ERROR( cudaMalloc( &gpu_kernel, K * K * N_out * sizeof(T) ) );
  141 + T* gpu_out; //allocate device memory for the output image
  142 + HANDLE_ERROR( cudaMalloc( &gpu_out, N_out * sizeof(T) ) );
  143 +
  144 + //copy the array to the GPU
  145 + HANDLE_ERROR( cudaMemcpy( gpu_in, cpu_in, N_in * sizeof(T), cudaMemcpyHostToDevice) );
  146 +
  147 + //call the GPU version of this function
  148 + gpu_median2<T>(gpu_in, gpu_out, gpu_kernel, X, Y, K);
  149 +
  150 + //copy the array back to the CPU
  151 + HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) );
  152 +
  153 + //free allocated memory
  154 + cudaFree(gpu_in);
  155 + cudaFree(gpu_kernel);
  156 + cudaFree(gpu_out);
  157 +
  158 + }
  159 +
  160 + }
  161 + #endif
  162 +}
  163 +
  164 +
  165 +#endif
... ...