cpp/gaussian_blur3.cuh

#ifndef STIM_CUDA_GAUSSIAN_BLUR3_H
#define STIM_CUDA_GAUSSIAN_BLUR3_H
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/cudatools/error.h>
#define pi	3.14159
				
		template<typename T>
		__global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){
			//calculate x,y,z coordinates for this thread
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			//find the grid size along y
			int grid_y = y / blockDim.y;
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;
			int i = zi * x * y + yi * x + xi;
			
			// calculate the kernel size			
			T k_size = sigma * 4;
			
			//if the current pixel is outside of the image
			if(xi >= x || yi >= y || zi>=z)
				return;
			
			int gx, gi;
			T G;
			T sum = 0;		//running weighted sum across the kernel
			out[i] = 0;
			T sigma_sq = 2.0 * sigma * sigma;	
			T a = 1.0 / (sigma * sqrt(2.0 * pi));
			//handle the boundary in x direction
			int kl = -(int)k_size;
			//if (xi < k_size) kl = -xi;
			int kh = (int)k_size;
			//if (xi >= x - (int)k_size) kh = x - xi - 1;
			//for each element of the kernel
			for(int ki = kl; ki <= kh; ki++){
				//calculate the gaussian value
				G = a * exp(-(T)(ki*ki) / (sigma_sq));
				//calculate the global coordinates for this point in the kernel
				gx = (xi + ki) % x;					
				gi = zi * x * y + yi * x + gx;
				sum += G * in[gi];				
			}
			
			//output the result to global memory
			out[i] = sum;
		}
	
		template<typename T>
		__global__ void blur_y(T* out, T* in, T sigma, int x, int y, int z){
			
			//calculate x,y,z coordinates for this thread
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			//find the grid size along y
			int grid_y = y / blockDim.y;
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;
			int i = zi * x * y + yi * x + xi;
			// calculate the kernel size			
			T k_size = sigma * 4;
			
			//if the current pixel is outside of the image
			if(xi >= x || yi >= y || zi>=z)
				return;
			
			int gy, gi;
			T G;
			T sum = 0;		//running weighted sum across the kernel
			out[i] = 0;
			T sigma_sq = 2.0 * sigma * sigma;	
			T a = 1.0 / (sigma * sqrt(2.0 * pi));	
			//handle the boundary in y direction
			int kl = -(int)k_size;
			//if (yi < k_size) kl = -yi;
			int kh = (int)k_size;
			//if (yi >= y - (int)k_size) kh = y - yi - 1;
			//for each element of the kernel
			for(int ki = kl; ki <= kh; ki++){
				//calculate the gaussian value
				G = a * exp(-(T)(ki*ki) / sigma_sq);
				//calculate the global coordinates for this point in the kernel
				gy = (yi + ki ) % y;				
				gi = zi * x * y + gy * x + xi;
				sum += G * in[gi];						
			}
			
			
			//output the result to global memory
			out[i] = sum;
		}
		
		template<typename T>
		__global__ void blur_z(T* out, T* in, T sigma, int x, int y, int z){
			
			//calculate x,y,z coordinates for this thread
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			//find the grid size along y
			int grid_y = y / blockDim.y;
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;
			int i = zi * x * y + yi * x + xi;
			// calculate the kernel size			
			T k_size = sigma * 4;
			
			//if the current pixel is outside of the image
			if(xi >= x || yi >= y || zi>=z)
				return;
			
			int gz, gi;
			T G;
			T sum = 0;		//running weighted sum across the kernel
			out[i] = 0;
			T sigma_sq = 2.0 * sigma * sigma;	
			T a = 1.0 / (sigma * sqrt(2.0 * pi));	
			//handle the boundary in z direction
			int kl = -(int)k_size;
			//if (zi < k_size) kl = -zi;
			int kh = (int)k_size;
			//if (zi >= z - (int)k_size) kh = z - zi - 1;
			//for each element of the kernel
			for(int ki = kl; ki <= kh; ki++){
				//calculate the gaussian value
				G = a * exp(-(T)(ki*ki) / sigma_sq);
				//calculate the global coordinates for this point in the kernel
				gz = (zi + ki) % z;				
				gi = gz * x * y + yi * x + xi;					
				sum += G * in[gi];
			}
			
			//output the result to global memory
			out[i] = sum;
		}
		
		template<typename T>
		void gpu_gaussian_blur3(T* image, T sigma[], size_t x, size_t y, size_t z){
			//get the number of pixels in the image
			size_t pixels = x * y * z;
			size_t bytes = sizeof(T) * pixels;
			unsigned int max_threads = stim::maxThreadsPerBlock();
			dim3 threads(sqrt (max_threads),sqrt (max_threads));
			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
			//allocate temporary space on the GPU
			T* gpuIb_x;
			cudaMalloc(&gpuIb_x, bytes);
			//allocate temporary space on the GPU
			T* gpuIb_y;
			cudaMalloc(&gpuIb_y, bytes);
			// blur the original image along the x direction
			blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z);
			
			// blur the x-blurred image along the y direction
			blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z);
			
			// blur the xy-blurred image along the z direction
			blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z);
			//cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice);
			cudaFree(gpuIb_x);
			cudaFree(gpuIb_y);
		}
		/// Applies a Gaussian blur to a 2D image stored on the CPU
		template<typename T>
		void cpu_gaussian_blur3(T* blur, T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){
			
			//get the number of pixels in the image
			unsigned int pixels = x * y *z;
			unsigned int bytes = sizeof(T) * pixels;
			//---------Allocate Image---------
			//allocate space on the GPU for the image
			T* gpuI0;
			cudaMalloc(&gpuI0, bytes);			
			
			//copy the image data to the GPU
			cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
			//run the GPU-based version of the algorithm
			gpu_gaussian_blur3<T>(gpuI0, sigma, x, y, z);
			//copy the image data from the device
			cudaMemcpy(blur, gpuI0, bytes, cudaMemcpyDeviceToHost);
			//free allocated memory
			cudaFree(gpuI0);
			
		}
#endif