#ifndef STIM_CUDA_GAUSSIAN_BLUR3_H #define STIM_CUDA_GAUSSIAN_BLUR3_H #include #include #include #include #define pi 3.14159 template __global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){ //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; //find the grid size along y int grid_y = y / blockDim.y; int blockidx_y = blockIdx.y % grid_y; int yi = blockidx_y * blockDim.y + threadIdx.y; int zi = blockIdx.y / grid_y; int i = zi * x * y + yi * x + xi; // calculate the kernel size T k_size = sigma * 4; //if the current pixel is outside of the image if(xi >= x || yi >= y || zi>=z) return; int gx, gi; T G; T sum = 0; //running weighted sum across the kernel out[i] = 0; T sigma_sq = 2.0 * sigma * sigma; T a = 1.0 / (sigma * sqrt(2.0 * pi)); //handle the boundary in x direction int kl = -(int)k_size; //if (xi < k_size) kl = -xi; int kh = (int)k_size; //if (xi >= x - (int)k_size) kh = x - xi - 1; //for each element of the kernel for(int ki = kl; ki <= kh; ki++){ //calculate the gaussian value G = a * exp(-(T)(ki*ki) / (sigma_sq)); //calculate the global coordinates for this point in the kernel gx = (xi + ki) % x; gi = zi * x * y + yi * x + gx; sum += G * in[gi]; } //output the result to global memory out[i] = sum; } template __global__ void blur_y(T* out, T* in, T sigma, int x, int y, int z){ //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; //find the grid size along y int grid_y = y / blockDim.y; int blockidx_y = blockIdx.y % grid_y; int yi = blockidx_y * blockDim.y + threadIdx.y; int zi = blockIdx.y / grid_y; int i = zi * x * y + yi * x + xi; // calculate the kernel size T k_size = sigma * 4; //if the current pixel is outside of the image if(xi >= x || yi >= y || zi>=z) return; int gy, gi; T G; T sum = 0; //running weighted sum across the kernel out[i] = 0; T sigma_sq = 2.0 * sigma * sigma; T a = 1.0 / (sigma * sqrt(2.0 * pi)); //handle the boundary in y direction int kl = -(int)k_size; //if (yi < k_size) kl = -yi; int kh = (int)k_size; //if (yi >= y - (int)k_size) kh = y - yi - 1; //for each element of the kernel for(int ki = kl; ki <= kh; ki++){ //calculate the gaussian value G = a * exp(-(T)(ki*ki) / sigma_sq); //calculate the global coordinates for this point in the kernel gy = (yi + ki ) % y; gi = zi * x * y + gy * x + xi; sum += G * in[gi]; } //output the result to global memory out[i] = sum; } template __global__ void blur_z(T* out, T* in, T sigma, int x, int y, int z){ //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; //find the grid size along y int grid_y = y / blockDim.y; int blockidx_y = blockIdx.y % grid_y; int yi = blockidx_y * blockDim.y + threadIdx.y; int zi = blockIdx.y / grid_y; int i = zi * x * y + yi * x + xi; // calculate the kernel size T k_size = sigma * 4; //if the current pixel is outside of the image if(xi >= x || yi >= y || zi>=z) return; int gz, gi; T G; T sum = 0; //running weighted sum across the kernel out[i] = 0; T sigma_sq = 2.0 * sigma * sigma; T a = 1.0 / (sigma * sqrt(2.0 * pi)); //handle the boundary in z direction int kl = -(int)k_size; //if (zi < k_size) kl = -zi; int kh = (int)k_size; //if (zi >= z - (int)k_size) kh = z - zi - 1; //for each element of the kernel for(int ki = kl; ki <= kh; ki++){ //calculate the gaussian value G = a * exp(-(T)(ki*ki) / sigma_sq); //calculate the global coordinates for this point in the kernel gz = (zi + ki) % z; gi = gz * x * y + yi * x + xi; sum += G * in[gi]; } //output the result to global memory out[i] = sum; } template void gpu_gaussian_blur3(T* image, T sigma[], size_t x, size_t y, size_t z){ //get the number of pixels in the image size_t pixels = x * y * z; size_t bytes = sizeof(T) * pixels; unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads)); dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); //allocate temporary space on the GPU T* gpuIb_x; cudaMalloc(&gpuIb_x, bytes); //allocate temporary space on the GPU T* gpuIb_y; cudaMalloc(&gpuIb_y, bytes); // blur the original image along the x direction blur_x <<< blocks, threads >>>(gpuIb_x, image, sigma[0], (int)x, (int)y, (int)z); // blur the x-blurred image along the y direction blur_y <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], (int)x, (int)y, (int)z); // blur the xy-blurred image along the z direction blur_z <<< blocks, threads >>>(image, gpuIb_y, sigma[2], (int)x, (int)y, (int)z); //cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice); cudaFree(gpuIb_x); cudaFree(gpuIb_y); } /// Applies a Gaussian blur to a 2D image stored on the CPU template void cpu_gaussian_blur3(T* blur, T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){ //get the number of pixels in the image unsigned int pixels = x * y *z; unsigned int bytes = sizeof(T) * pixels; //---------Allocate Image--------- //allocate space on the GPU for the image T* gpuI0; cudaMalloc(&gpuI0, bytes); //copy the image data to the GPU cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); //run the GPU-based version of the algorithm gpu_gaussian_blur3(gpuI0, sigma, x, y, z); //copy the image data from the device cudaMemcpy(blur, gpuI0, bytes, cudaMemcpyDeviceToHost); //free allocated memory cudaFree(gpuI0); } #endif