Commit ac134a0711b58259e5859f5836a6a749170fa47e

Authored by Jiabing Li
2 parents 041ab420 b87014ef

Merge branch 'jiabing' of git.stim.ee.uh.edu:codebase/stimlib into jiabing

Showing 1 changed file with 166 additions and 0 deletions   Show diff stats
stim/math/filters/median2.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_MEDIAN2_H
  2 +#define STIM_CUDA_MEDIAN2_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <cmath>
  7 +#include <algorithm>
  8 +
  9 +#ifdef USING_CUDA
  10 +#include "cuda_runtime.h"
  11 +#include "device_launch_parameters.h"
  12 +#include <stim/cuda/cudatools.h>
  13 +#endif
  14 +
  15 +
  16 +//#include <thrust/sort.h>
  17 +//#include <thrust/execution_policy.h>
  18 +//#include <thrust/binary_search.h>
  19 +//#include <thrust/device_ptr.h>
  20 +
  21 +template <typename T>
  22 +__device__ void cuswap ( T& a, T& b ){
  23 + T c(a);
  24 + a=b;
  25 + b=c;
  26 +}
  27 +
  28 +namespace stim{
  29 + namespace cuda{
  30 +
  31 + template<typename T>
  32 + __global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){
  33 +
  34 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  35 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  36 +
  37 + size_t Xout = X - K + 1;
  38 + size_t Yout = Y - K + 1;
  39 + int i = yi * Xout + xi;
  40 +
  41 + //return if (i,j) is outside the matrix
  42 + if(xi >= Xout || yi >= Yout) return;
  43 +
  44 + //scan the image, copy data from input to 2D window
  45 + size_t kxi, kyi;
  46 + for(kyi = 0; kyi < K; kyi++){
  47 + for(kxi = 0; kxi < K; kxi++){
  48 + kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi];
  49 + }
  50 + }
  51 +
  52 + //calculate kernel radius 4
  53 + int r = (K*K)/2;
  54 +
  55 + //sort the smallest half pixel values inside the window, calculate the middle one
  56 + size_t Ki = i * K * K;
  57 +
  58 + //sort the smallest half pixel values inside the window, calculate the middle one
  59 + for (int p = 0; p < r+1; p++){
  60 + for(int kk = p + 1; kk < K*K; kk++){
  61 + if (kernel[Ki + kk] < kernel[Ki + p]){
  62 + cuswap<T>(kernel[Ki + kk], kernel[Ki + p]);
  63 + }
  64 + }
  65 + }
  66 +
  67 + //copy the middle pixel value inside the window to output
  68 + out[i] = kernel[Ki + r];
  69 +
  70 + }
  71 +
  72 +
  73 + template<typename T>
  74 + void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){
  75 +
  76 + //get the maximum number of threads per block for the CUDA device
  77 + int threads_total = stim::maxThreadsPerBlock();
  78 +
  79 + //set threads in each block
  80 + dim3 threads(sqrt(threads_total), sqrt(threads_total));
  81 +
  82 + //calculate the number of blocks
  83 + dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1);
  84 +
  85 + //call the kernel to perform median filter function
  86 + cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K);
  87 +
  88 + }
  89 +
  90 + template<typename T>
  91 + void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){
  92 +
  93 + #ifndef USING_CUDA
  94 +
  95 + //output image width and height
  96 + size_t X_out = X + K -1;
  97 + size_t Y_out = Y + K -1;
  98 +
  99 + float* window = (float*)malloc(K * k *sizeof(float));
  100 +
  101 + for(int i = 0; i< Y; i++){
  102 + for(int j =0; j< X; j++){
  103 +
  104 + //Pick up window elements
  105 + int k = 0;
  106 + for (int m=0; m< kernel_width; m++){
  107 + for(int n = 0; n < kernel_width; n++){
  108 + window[k] = in_image.at<float>(m+i, n+j);
  109 + k++;
  110 + }
  111 + }
  112 +
  113 + //calculate kernel radius 4
  114 + r_ker = K * K/2;
  115 + //Order elements (only half of them)
  116 + for(int p = 0; p<r_ker+1; p++){
  117 + //Find position of minimum element
  118 + for (int l = p + 1; l < size_kernel; l++){
  119 +
  120 + if(window[l] < window[p]){
  121 + float t = window[p];
  122 + window[p] = window[l];
  123 + window[l] = t; }
  124 + }
  125 + }
  126 +
  127 + //Get result - the middle element
  128 + cpu_out[i * X_out + j] = window[r_ker];
  129 + }
  130 + }
  131 + #else
  132 + //calculate input and out image pixels & calculate kernel size
  133 + size_t N_in = X * Y; //number of pixels in the input image
  134 + size_t N_out = (X - K + 1) * (Y - K + 1); //number of pixels in the output image
  135 + //size_t kernel_size = kernel_width*kernel_width; //total number of pixels in the kernel
  136 +
  137 + //allocate memory on the GPU for the array
  138 + T* gpu_in; //allocate device memory for the input image
  139 + HANDLE_ERROR( cudaMalloc( &gpu_in, N_in * sizeof(T) ) );
  140 + T* gpu_kernel; //allocate device memory for the kernel
  141 + HANDLE_ERROR( cudaMalloc( &gpu_kernel, K * K * N_out * sizeof(T) ) );
  142 + T* gpu_out; //allocate device memory for the output image
  143 + HANDLE_ERROR( cudaMalloc( &gpu_out, N_out * sizeof(T) ) );
  144 +
  145 + //copy the array to the GPU
  146 + HANDLE_ERROR( cudaMemcpy( gpu_in, cpu_in, N_in * sizeof(T), cudaMemcpyHostToDevice) );
  147 +
  148 + //call the GPU version of this function
  149 + gpu_median2<T>(gpu_in, gpu_out, gpu_kernel, X, Y, K);
  150 +
  151 + //copy the array back to the CPU
  152 + HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) );
  153 +
  154 + //free allocated memory
  155 + cudaFree(gpu_in);
  156 + cudaFree(gpu_kernel);
  157 + cudaFree(gpu_out);
  158 +
  159 + }
  160 +
  161 + }
  162 + #endif
  163 +}
  164 +
  165 +
  166 +#endif
... ...