diff --git a/stim/math/filters/median2.cuh b/stim/math/filters/median2.cuh new file mode 100644 index 0000000..78499ac --- /dev/null +++ b/stim/math/filters/median2.cuh @@ -0,0 +1,165 @@ +#ifndef STIM_CUDA_MEDIAN2_H +#define STIM_CUDA_MEDIAN2_H + +#include +#include +#include +#include + +#ifdef USING_CUDA +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include +#endif + +//#include +//#include +//#include +//#include + +template +__device__ void cuswap ( T& a, T& b ){ + T c(a); + a=b; + b=c; +} + +namespace stim{ + namespace cuda{ + + template + __global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){ + + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + + size_t Xout = X - K + 1; + size_t Yout = Y - K + 1; + int i = yi * Xout + xi; + + //return if (i,j) is outside the matrix + if(xi >= Xout || yi >= Yout) return; + + //scan the image, copy data from input to 2D window + size_t kxi, kyi; + for(kyi = 0; kyi < K; kyi++){ + for(kxi = 0; kxi < K; kxi++){ + kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi]; + } + } + + //calculate kernel radius 4 + int r = (K*K)/2; + + //sort the smallest half pixel values inside the window, calculate the middle one + size_t Ki = i * K * K; + + //sort the smallest half pixel values inside the window, calculate the middle one + for (int p = 0; p < r+1; p++){ + for(int kk = p + 1; kk < K*K; kk++){ + if (kernel[Ki + kk] < kernel[Ki + p]){ + cuswap(kernel[Ki + kk], kernel[Ki + p]); + } + } + } + + //copy the middle pixel value inside the window to output + out[i] = kernel[Ki + r]; + + } + + + template + void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){ + + //get the maximum number of threads per block for the CUDA device + int threads_total = stim::maxThreadsPerBlock(); + + //set threads in each block + dim3 threads(sqrt(threads_total), sqrt(threads_total)); + + //calculate the number of blocks + dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1); + + //call the kernel to perform median filter function + cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K); + + } + + template + void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){ + + #ifndef USING_CUDA + + //output image width and height + size_t X_out = X + K -1; + size_t Y_out = Y + K -1; + + float* window = (float*)malloc(K * k *sizeof(float)); + + for(int i = 0; i< Y; i++){ + for(int j =0; j< X; j++){ + + //Pick up window elements + int k = 0; + for (int m=0; m< kernel_width; m++){ + for(int n = 0; n < kernel_width; n++){ + window[k] = in_image.at(m+i, n+j); + k++; + } + } + + //calculate kernel radius 4 + r_ker = K * K/2; + //Order elements (only half of them) + for(int p = 0; p(gpu_in, gpu_out, gpu_kernel, X, Y, K); + + //copy the array back to the CPU + HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) ); + + //free allocated memory + cudaFree(gpu_in); + cudaFree(gpu_kernel); + cudaFree(gpu_out); + + } + + } + #endif +} + + +#endif -- libgit2 0.21.4