#ifndef STIM_CUDA_MEDIAN2_H #define STIM_CUDA_MEDIAN2_H #include #include #include #include #ifdef USING_CUDA #include "cuda_runtime.h" #include "device_launch_parameters.h" #include #endif //#include //#include //#include //#include template __device__ void cuswap ( T& a, T& b ){ T c(a); a=b; b=c; } namespace stim{ namespace cuda{ template __global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){ int xi = blockIdx.x * blockDim.x + threadIdx.x; int yi = blockIdx.y * blockDim.y + threadIdx.y; size_t Xout = X - K + 1; size_t Yout = Y - K + 1; int i = yi * Xout + xi; //return if (i,j) is outside the matrix if(xi >= Xout || yi >= Yout) return; //scan the image, copy data from input to 2D window size_t kxi, kyi; for(kyi = 0; kyi < K; kyi++){ for(kxi = 0; kxi < K; kxi++){ kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi]; } } //calculate kernel radius 4 int r = (K*K)/2; //sort the smallest half pixel values inside the window, calculate the middle one size_t Ki = i * K * K; //sort the smallest half pixel values inside the window, calculate the middle one for (int p = 0; p < r+1; p++){ for(int kk = p + 1; kk < K*K; kk++){ if (kernel[Ki + kk] < kernel[Ki + p]){ cuswap(kernel[Ki + kk], kernel[Ki + p]); } } } //copy the middle pixel value inside the window to output out[i] = kernel[Ki + r]; } template void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){ //get the maximum number of threads per block for the CUDA device int threads_total = stim::maxThreadsPerBlock(); //set threads in each block dim3 threads(sqrt(threads_total), sqrt(threads_total)); //calculate the number of blocks dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1); //call the kernel to perform median filter function cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K); } template void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){ #ifndef USING_CUDA //output image width and height size_t X_out = X + K -1; size_t Y_out = Y + K -1; float* window = (float*)malloc(K * k *sizeof(float)); for(int i = 0; i< Y; i++){ for(int j =0; j< X; j++){ //Pick up window elements int k = 0; for (int m=0; m< kernel_width; m++){ for(int n = 0; n < kernel_width; n++){ window[k] = in_image.at(m+i, n+j); k++; } } //calculate kernel radius 4 r_ker = K * K/2; //Order elements (only half of them) for(int p = 0; p(gpu_in, gpu_out, gpu_kernel, X, Y, K); //copy the array back to the CPU HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) ); //free allocated memory cudaFree(gpu_in); cudaFree(gpu_kernel); cudaFree(gpu_out); } } #endif } #endif