stim/cuda/gaussian_blur.cuh

#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
#define STIM_CUDA_GAUSSIAN_BLUR_H
#include <iostream>
#include <cuda.h>
#include <stim/cuda/devices.h>
#include <stim/cuda/timer.h>
#include <stim/cuda/sharedmem.cuh>
#define pi	3.14159
namespace stim{
	namespace cuda{
		template<typename T>
		__global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
			//generate a pointer to shared memory (size will be specified as a kernel parameter)
			extern __shared__ T s[];
			int kr = sigma * 4;				//calculate the kernel radius
			//get a pointer to the gaussian in memory
			T* g = (T*)&s[blockDim.x + 2 * kr];
			//calculate the start point for this block
			int bxi = blockIdx.x * blockDim.x;
			int byi = blockIdx.y;
			//copy the portion of the image necessary for this block to shared memory
			stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
			//calculate the thread index and block index
			int ti = threadIdx.x;
			//calculate the spatial coordinate for this thread
			int xi = bxi + ti;
			//pre-compute the gaussian values for each kernel point
			T a = 1.0 / (sigma * sqrt(2 * pi));
			T c = - 1.0 / (2*sigma*sigma);
			int ki;
			//use the first 2kr+1 threads to evaluate a gaussian and store the result
			if(ti <= 2* kr+1){
				ki = ti - kr;
				g[ti] = a * exp((ki*ki) * c);
			}
			//make sure that all writing to shared memory is done before continuing
			__syncthreads();
			
			//if the current pixel is outside of the image
			if(bxi + ti > x || byi > y)
				return;
			
			//calculate the coordinates of the current thread in shared memory
			int si = ti + kr;
			T sum = 0;		//running weighted sum across the kernel
			
			//for each element of the kernel
			for(int ki = -kr; ki <= kr; ki++){
				sum += g[ki + kr] * s[si + ki];
			}
			//calculate the 1D image index for this thread
			unsigned int i = byi * x + xi;
			//output the result to global memory
			out[i] = sum;
		}
		template<typename T>
		__global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
			//generate a pointer to shared memory (size will be specified as a kernel parameter)
			extern __shared__ T s[];
			int kr = sigma * 4;				//calculate the kernel radius
			//get a pointer to the gaussian in memory
			T* g = (T*)&s[blockDim.y + 2 * kr];
			//calculate the start point for this block
			int bxi = blockIdx.x;
			int byi = blockIdx.y * blockDim.y;
			//copy the portion of the image necessary for this block to shared memory
			stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
			//calculate the thread index and block index
			int ti = threadIdx.y;
			//calculate the spatial coordinate for this thread
			int yi = byi + ti;
			//pre-compute the gaussian values for each kernel point
			T a = 1.0 / (sigma * sqrt(2 * pi));
			T c = - 1.0 / (2*sigma*sigma);
			int ki;
			//use the first 2kr+1 threads to evaluate a gaussian and store the result
			if(ti <= 2* kr+1){
				ki = ti - kr;
				g[ti] = a * exp((ki*ki) * c);
			}
			//make sure that all writing to shared memory is done before continuing
			__syncthreads();
			
			//if the current pixel is outside of the image
			if(bxi >= x || yi >= y)
				return;
			
			//calculate the coordinates of the current thread in shared memory
			int si = ti + kr;
			T sum = 0;		//running weighted sum across the kernel
			
			//for each element of the kernel
			for(int ki = -kr; ki <= kr; ki++){
				sum += g[ki + kr] * s[si + ki];
			}
			//calculate the 1D image index for this thread
			unsigned int i = yi * x + bxi;
			//output the result to global memory
			out[i] = sum;
		}
		/// Applies a Gaussian blur to a 2D image stored on the GPU
		template<typename T>
		void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
			//get the number of pixels in the image
			unsigned int pixels = x * y;
			unsigned int bytes = sizeof(T) * pixels;
			// Allocate CUDA array in device memory
			
			//define a channel descriptor for a single 32-bit channel
			cudaChannelFormatDesc channelDesc =
					   cudaCreateChannelDesc(32, 0, 0, 0,
											 cudaChannelFormatKindFloat);
			cudaArray* cuArray;												//declare the cuda array
			cudaMallocArray(&cuArray, &channelDesc, x, y);			//allocate the cuda array
			// Copy the image data from global memory to the array
			cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
							  cudaMemcpyDeviceToDevice);
			// Specify texture
			struct cudaResourceDesc resDesc;				//create a resource descriptor
			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
			// Specify texture object parameters
			struct cudaTextureDesc texDesc;							//create a texture descriptor
			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
			texDesc.addressMode[1]   = cudaAddressModeWrap;
			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
			texDesc.normalizedCoords = 0;							//not using normalized coordinates
			// Create texture object
			cudaTextureObject_t texObj = 0;
			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
			
			//get the maximum number of threads per block for the CUDA device
			int max_threads = stim::maxThreadsPerBlock();
			dim3 threads(max_threads, 1);
			//calculate the number of blocks
			dim3 blocks(x / threads.x + 1, y);
			//calculate the shared memory used in the kernel
			unsigned int pixel_bytes = max_threads * 4;									//bytes devoted to pixel data being processed
			unsigned int apron_bytes = sigma * 8 * 4;									//bytes devoted to pixels outside the window
			unsigned int gaussian_bytes = (sigma * 8 + 1) * 4;							//bytes devoted to memory used to store the pre-computed Gaussian window
			unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes;		//total number of bytes shared memory used
			//blur the image along the x-axis
			gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
			// Copy the x-blurred data from global memory to the texture
			cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
							  cudaMemcpyDeviceToDevice);
			//transpose the block and thread dimensions
			threads.x = 1;
			threads.y = max_threads;
			blocks.x = x;
			blocks.y = y / threads.y + 1;
			
			//blur the image along the y-axis
			gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
			//free allocated memory
			cudaFree(cuArray);
		}
		/// Applies a Gaussian blur to a 2D image stored on the CPU
		template<typename T>
		void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
			//get the number of pixels in the image
			unsigned int pixels = x * y;
			unsigned int bytes = sizeof(T) * pixels;
			//allocate space on the GPU
			T* gpuI0;
			cudaMalloc(&gpuI0, bytes);
			
			
			//copy the image data to the GPU
			cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
			//run the GPU-based version of the algorithm
			gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y);
			//copy the image data from the device
			cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
			//free allocated memory
			cudaFree(gpuI0);
		}
		
	};
};
#endif