stim/cuda/templates/conv2sep.cuh

#ifndef STIM_CUDA_CONV2SEP_H
#define STIM_CUDA_CONV2SEP_H
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools/devices.h>
#include <stim/cuda/cudatools/timer.h>
#include <stim/cuda/sharedmem.cuh>
#include <stim/cuda/cudatools/error.h>
#define pi	3.14159
namespace stim{
	namespace cuda{
		template<typename T>
		__global__ void conv2sep_0(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
										   T* kernel0, unsigned int k0){
			//generate a pointer to shared memory (size will be specified as a kernel parameter)
			extern __shared__ T s[];
			int kr = k0/2;				//calculate the kernel radius
			//get a pointer to the gaussian in memory
			T* g = (T*)&s[blockDim.x + 2 * kr];
			//calculate the start point for this block
			int bxi = blockIdx.x * blockDim.x;
			int byi = blockIdx.y;
			//copy the portion of the image necessary for this block to shared memory
			stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
			//calculate the thread index
			int ti = threadIdx.x;
			//calculate the spatial coordinate for this thread
			int xi = bxi + ti;
			int yi = byi;
			
			//use the first 2kr+1 threads to transfer the kernel to shared memory
			if(ti < k0){
				g[ti] = kernel0[ti];
			}
			//make sure that all writing to shared memory is done before continuing
			__syncthreads();
			
			//if the current pixel is outside of the image
			if(xi > x || yi > y)
				return;
			
			//calculate the coordinates of the current thread in shared memory
			int si = ti + kr;
			T sum = 0;		//running weighted sum across the kernel
			
			//for each element of the kernel
			for(int ki = -kr; ki <= kr; ki++){
				sum += g[ki + kr] * s[si + ki];
			}
			
			//calculate the 1D image index for this thread
			unsigned int i = byi * x + xi;
			//output the result to global memory
			out[i] = sum;
		}
		template<typename T>
		__global__ void conv2sep_1(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
										   T* kernel0, unsigned int k0){
			//generate a pointer to shared memory (size will be specified as a kernel parameter)
			extern __shared__ T s[];
			int kr = k0/2;				//calculate the kernel radius
			//get a pointer to the gaussian in memory
			T* g = (T*)&s[blockDim.y + 2 * kr];
			//calculate the start point for this block
			int bxi = blockIdx.x;
			int byi = blockIdx.y * blockDim.y;
			//copy the portion of the image necessary for this block to shared memory
			stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
			//calculate the thread index
			int ti = threadIdx.y;
			//calculate the spatial coordinate for this thread
			int xi = bxi;
			int yi = byi + ti;
			
			//use the first 2kr+1 threads to transfer the kernel to shared memory
			if(ti < k0){
				g[ti] = kernel0[ti];
			}
			//make sure that all writing to shared memory is done before continuing
			__syncthreads();
			
			//if the current pixel is outside of the image
			if(xi > x || yi > y)
				return;
			
			//calculate the coordinates of the current thread in shared memory
			int si = ti + kr;
			T sum = 0;		//running weighted sum across the kernel
			
			//for each element of the kernel
			for(int ki = -kr; ki <= kr; ki++){
				sum += g[ki + kr] * s[si + ki];
			}
			
			//calculate the 1D image index for this thread
			unsigned int i = yi * x + xi;
			//output the result to global memory
			out[i] = sum;
		}
		template<typename T>
		void tex_conv2sep(T* out, unsigned int x, unsigned int y,
						  cudaTextureObject_t texObj, cudaArray* cuArray,
						  T* kernel0, unsigned int k0,
						  T* kernel1, unsigned int k1){
			//get the maximum number of threads per block for the CUDA device
			int max_threads = stim::maxThreadsPerBlock();
			dim3 threads(max_threads, 1);
			//calculate the number of blocks
			dim3 blocks(x / threads.x + 1, y);
			//calculate the shared memory used in the kernel
			unsigned int pixel_bytes = max_threads * sizeof(T);							//bytes devoted to pixel data being processed
			unsigned int apron_bytes = k0/2 * sizeof(T);								//bytes devoted to the apron on each side of the window
			unsigned int gaussian_bytes = k0 * sizeof(T);								//bytes devoted to memory used to store the pre-computed Gaussian window
			unsigned int shared_bytes = pixel_bytes + 2 * apron_bytes + gaussian_bytes;		//total number of bytes shared memory used
			//blur the image along the x-axis
			conv2sep_0<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel0, k0);
			// Copy the x-blurred data from global memory to the texture
			cudaMemcpyToArray(cuArray, 0, 0, out, x * y * sizeof(T),
							  cudaMemcpyDeviceToDevice);
			
			//transpose the block and thread dimensions
			threads.x = 1;
			threads.y = max_threads;
			blocks.x = x;
			blocks.y = y / threads.y + 1;
			
			//blur the image along the y-axis
			conv2sep_1<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel1, k1);
		}
		template<typename T>
		void gpu_conv2sep(T* image, unsigned int x, unsigned int y,
						  T* kernel0, unsigned int k0,
						  T* kernel1, unsigned int k1){
			//get the number of pixels in the image
			unsigned int pixels = x * y;
			unsigned int bytes = sizeof(T) * pixels;
			// Allocate CUDA array in device memory
			
			//define a channel descriptor for a single 32-bit channel
			cudaChannelFormatDesc channelDesc =
					   cudaCreateChannelDesc(32, 0, 0, 0,
											 cudaChannelFormatKindFloat);
			cudaArray* cuArray;												//declare the cuda array
			cudaMallocArray(&cuArray, &channelDesc, x, y);			//allocate the cuda array
			// Copy the image data from global memory to the array
			cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
							  cudaMemcpyDeviceToDevice);
			// Specify texture
			struct cudaResourceDesc resDesc;				//create a resource descriptor
			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
			// Specify texture object parameters
			struct cudaTextureDesc texDesc;							//create a texture descriptor
			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
			texDesc.addressMode[1]   = cudaAddressModeWrap;
			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
			texDesc.normalizedCoords = 0;							//not using normalized coordinates
			// Create texture object
			cudaTextureObject_t texObj = 0;
			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
			//call the texture version of the separable convolution function
			tex_conv2sep(image, x, y, texObj, cuArray, kernel0, k0, kernel1, k1);			
			
			//free allocated memory
			cudaFree(cuArray);
		}
		/// Applies a Gaussian blur to a 2D image stored on the CPU
		template<typename T>
		void cpu_conv2sep(T* image, unsigned int x, unsigned int y, 
						  T* kernel0, unsigned int k0,
						  T* kernel1, unsigned int k1){
			//get the number of pixels in the image
			unsigned int pixels = x * y;
			unsigned int bytes = sizeof(T) * pixels;
			//---------Allocate Image---------
			//allocate space on the GPU for the image
			T* gpuI0;
			HANDLE_ERROR(cudaMalloc(&gpuI0, bytes));			
			
			//copy the image data to the GPU
			HANDLE_ERROR(cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice));
			//---------Allocate Kernel--------
			//allocate and copy the 0 (x) kernel
			T* gpuK0;
			HANDLE_ERROR(cudaMalloc(&gpuK0, k0 * sizeof(T)));
			HANDLE_ERROR(cudaMemcpy(gpuK0, kernel0, k0 * sizeof(T), cudaMemcpyHostToDevice));
			//allocate and copy the 1 (y) kernel
			T* gpuK1;
			HANDLE_ERROR(cudaMalloc(&gpuK1, k1 * sizeof(T)));
			HANDLE_ERROR(cudaMemcpy(gpuK1, kernel1, k1 * sizeof(T), cudaMemcpyHostToDevice));
			//run the GPU-based version of the algorithm
			gpu_conv2sep<T>(gpuI0, x, y, gpuK0, k0, gpuK1, k1);
			//copy the image data from the device
			cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
			//free allocated memory
			cudaFree(gpuI0);
		}
		
	};
};
#endif