stim/cuda/templates/conv2.cuh

#ifndef STIM_CUDA_CONV2_H
#define STIM_CUDA_CONV2_H
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <cmath>
#include <algorithm>
namespace stim{
	namespace cuda{
		template<typename T>
		__global__ void cuda_conv2(T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){
			//the radius of mask
			int r = (M - 1)/2;
			//calculate the 1D index for this thread
			//int idx = blockIdx.x * blockDim.x + threadIdx.x;
			//change 1D index to 2D cordinates
			int i = blockIdx.x * blockDim.x + threadIdx.x;
			int j = blockIdx.y;
			int idx = j * w + i;
			//unsigned long N = w * h;
			if(i < w && j < h){
				//copy[idx] = tex2D<float>(texObj, i+100, j+100);
				//return;
				tex2D<float>(texObj, (float)i/w, (float)j/h);
				//allocate memory for result
				T sum = 0;
				//for (unsigned int y = max(j - r, 0); y <= min(j + r, h - 1); y++){
					//for (unsigned int x = max(i - r, 0); x <= min(i + r, w - 1); x++){
				for (int y = j - r; y <= j + r; y++){
					for (int x = i - r; x <= i + r; x++){
						//idx to mask cordinates(xx, yy)
						int xx = x - (i - r);
						int yy = y - (j - r);
						sum += tex2D<T>(texObj, (float)x/w, (float)y/h) * mask[yy * M + xx];
					}		
				}
				copy[idx] = sum;
			 }
		}
		
		template<typename T>
		void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){
			unsigned long N = w * h;
			// Allocate CUDA array in device memory
			
			//define a channel descriptor for a single 32-bit channel
			cudaChannelFormatDesc channelDesc =
					   cudaCreateChannelDesc(32, 0, 0, 0,
											 cudaChannelFormatKindFloat);
			cudaArray* cuArray;												//declare the cuda array
			cudaMallocArray(&cuArray, &channelDesc, w, h);			//allocate the cuda array
			// Copy the image data from global memory to the array
			cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T),
							  cudaMemcpyDeviceToDevice);
			// Specify texture
			struct cudaResourceDesc resDesc;				//create a resource descriptor
			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
			// Specify texture object parameters
			struct cudaTextureDesc texDesc;							//create a texture descriptor
			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
			texDesc.addressMode[0]   = cudaAddressModeClamp;			//use wrapping (around the edges)
			texDesc.addressMode[1]   = cudaAddressModeClamp;
			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
			texDesc.normalizedCoords = 1;							//using normalized coordinates
			// Create texture object
			cudaTextureObject_t texObj = 0;
			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();
			//calculate the number of blocks
			dim3 blocks(w / threads + 1, h);
			//call the kernel to do the multiplication
			//cuda_conv2 <<< blocks, threads >>>(img, mask, copy, w, h, M);
			cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M);
			cudaDestroyTextureObject(texObj);
			cudaFreeArray(cuArray);
		}
		template<typename T>
		void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){
			unsigned long N = w * h;
			//allocate memory on the GPU for the array
			T* gpu_img; 
			T* gpu_mask; 
			T* gpu_copy;
			HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_mask, pow(M, 2) * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );
			//copy the array to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
			HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, pow(M, 2) * sizeof(T), cudaMemcpyHostToDevice) );
			//call the GPU version of this function
			gpu_conv2<T>(gpu_img, gpu_mask ,gpu_copy, w, h, M);
			//copy the array back to the CPU
			HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );
			//free allocated memory
			cudaFree(gpu_img);
			cudaFree(gpu_mask);
			cudaFree(gpu_copy);
		}
		
	}
}
#endif