gaussian_blur.cuh 3.85 KB
#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
#define STIM_CUDA_GAUSSIAN_BLUR_H

#include <iostream>
#include <cuda.h>
#include "gaussian_blur.cuh"
#include <stim/cuda/devices.h>

#define pi	3.14159

// This CUDA kernel applies a Gaussian blur along the x axis
template<typename T>
__global__ void gaussian_blur_x(T* out, T* in, T sigma, unsigned int x, unsigned int y){

	//calculate the 1D image index for this thread
	int i = blockIdx.x * blockDim.x + threadIdx.x;

	//convert this to 2D pixel coordinates
	int yi = i / x;
	int xi = i - (yi * x);

	int k_size = sigma * 4;				//calculate the kernel size

	int gx, gy, gi;								//variables to store global coordinates
	T sum = 0;								//variable stores the integral of the kernel times the image
	T G;									//stores the value of the gaussian kernel

	out[i] = 0;
	//for each element of the kernel
	for(int ki = -k_size; ki <= k_size; ki++){

		//calculate the gaussian value
		G = 1.0 / (sigma * sqrt(2 * pi)) * exp(-(ki*ki) / (2*sigma*sigma));

		//calculate the global coordinates for this point in the kernel
		gx = xi + ki;
		gy = yi;

		//make sure we are inside the bounds of the image
		if(gx >= 0 && gy >= 0 && gx < x && gy < y){
			gi = gy * x + gx;

			//perform the integration
			sum += G * in[gi];
		}
	}

	//set the output value to the integral of the function times the kernel
	out[i] = sum;
}

// This CUDA kernel applies a Gaussian blur along the y axis.
template<typename T>
__global__ void gaussian_blur_y(T* out, T* in, T sigma, unsigned int x, unsigned int y){

	//calculate the 1D image index for this thread
	int i = blockIdx.x * blockDim.x + threadIdx.x;

	//convert this to 2D pixel coordinates
	int yi = i / x;
	int xi = i - (yi * x);

	int k_size = sigma * 4;				//calculate the kernel size

	int gx, gy, gi;								//variables to store global coordinates
	T sum = 0;								//variable stores the integral of the kernel times the image
	T G;									//stores the value of the gaussian kernel

	out[i] = 0;
	//for each element of the kernel
	for(int ki = -k_size; ki <= k_size; ki++){

		//calculate the gaussian value
		G = 1.0 / (sigma * sqrt(2 * pi)) * exp(-(ki*ki) / (2*sigma*sigma));

		//calculate the global coordinates for this point in the kernel
		gx = xi;
		gy = yi + ki;

		//make sure we are inside the bounds of the image
		if(gx >= 0 && gy >= 0 && gx < x && gy < y){
			gi = gy * x + gx;

			//perform the integration
			sum += G * in[gi];
		}
	}

	//set the output value to the integral of the function times the kernel
	out[i] = sum;
}

/// Applies a Gaussian blur to a 2D image stored on the GPU
template<typename T>
void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){

	//get the number of pixels in the image
	unsigned int pixels = x * y;
	unsigned int bytes = sizeof(T) * pixels;

	//allocate temporary space on the GPU
	T* gpuI1;
	cudaMalloc(&gpuI1, bytes);

	//get the maximum number of threads per block for the CUDA device
	int threads = stim::maxThreadsPerBlock();

	//calculate the number of blocks
	int blocks = pixels / threads + (pixels%threads == 0 ? 0:1);

	//run the kernel for the x dimension
	gaussian_blur_x <<< blocks, threads >>>(gpuI1, image, sigma, x, y);

	//run the kernel for the y dimension
	gaussian_blur_y <<< blocks, threads >>>(image, gpuI1, sigma, x, y);

}

/// Applies a Gaussian blur to a 2D image stored on the CPU
template<typename T>
void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){

	//get the number of pixels in the image
	unsigned int pixels = x * y;
	unsigned int bytes = sizeof(T) * pixels;

	//allocate space on the GPU
	T* gpuI0;
	cudaMalloc(&gpuI0, bytes);

	//copy the image data to the GPU
	cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);

	//run the GPU-based version of the algorithm
	gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y);

	//copy the image data from the device
	cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
}

#endif