From 348f8ab9945f81ca275a3d51e5586ba618e0b7db Mon Sep 17 00:00:00 2001
From: Tianshu Cheng <tianshucheng@foxmail.com>
Date: Fri, 4 Sep 2015 15:33:10 -0500
Subject: [PATCH] inseparable 2D convolution

---
 stim/cuda/conv2.cuh | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+), 0 deletions(-)
 create mode 100644 stim/cuda/conv2.cuh
diff --git a/stim/cuda/conv2.cuh b/stim/cuda/conv2.cuh
new file mode 100644
index 0000000..b1d154e
--- /dev/null
+++ b/stim/cuda/conv2.cuh
@@ -0,0 +1,147 @@
+#ifndef STIM_CUDA_CONV2_H
+#define STIM_CUDA_CONV2_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+#include <cmath>
+#include <algorithm>
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		//__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){
+		__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){
+
+
+			//the radius of mask
+			unsigned r = (M - 1)/2;
+
+
+			//calculate the 1D index for this thread
+			//int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+			//change 1D index to 2D cordinates
+			int i = blockIdx.x * blockDim.x + threadIdx.x;
+			int j = blockIdx.y;
+
+			int idx = j * w + i;
+			//unsigned long N = w * h;
+
+			if(i < w && j < h){
+
+				copy[idx] = tex2D<float>(texObj, i+100, j+100);
+				return;
+
+				//tex2D<float>(texObj, i, j);
+
+				//allocate memory for result
+				T sum = 0;
+
+				//for (unsigned int y = max(j - r, 0); y <= min(j + r, h - 1); y++){
+
+					//for (unsigned int x = max(i - r, 0); x <= min(i + r, w - 1); x++){
+
+				for (int y = j - r; y <= j + r; y++){
+
+					for (int x = i - r; x <= i + r; x++){
+
+						//idx to mask cordinates(xx, yy)
+						unsigned int xx = x - (i - r);
+						unsigned int yy = y - (j - r);
+
+						//T temp = img[y * w + x] * mask[yy * M + xx];
+						//sum += img[y * w + x] * mask[yy * M + xx];
+						sum += tex2D<float>(texObj, x, y);// * mask[yy * M + xx];
+					}		
+				}
+				copy[idx] = sum;
+			 }
+		}
+		
+
+		template<typename T>
+		void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){
+
+			unsigned long N = w * h;
+
+			// Allocate CUDA array in device memory
+			
+			//define a channel descriptor for a single 32-bit channel
+			cudaChannelFormatDesc channelDesc =
+					   cudaCreateChannelDesc(32, 0, 0, 0,
+											 cudaChannelFormatKindFloat);
+			cudaArray* cuArray;												//declare the cuda array
+			cudaMallocArray(&cuArray, &channelDesc, w, h);			//allocate the cuda array
+
+			// Copy the image data from global memory to the array
+			cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T),
+							  cudaMemcpyDeviceToDevice);
+
+			// Specify texture
+			struct cudaResourceDesc resDesc;				//create a resource descriptor
+			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
+			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
+			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
+
+			// Specify texture object parameters
+			struct cudaTextureDesc texDesc;							//create a texture descriptor
+			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
+			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
+			texDesc.addressMode[1]   = cudaAddressModeWrap;
+			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
+			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
+			texDesc.normalizedCoords = 0;							//not using normalized coordinates
+
+			// Create texture object
+			cudaTextureObject_t texObj = 0;
+			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks
+			dim3 blocks(w / threads + 1, h);
+
+			//call the kernel to do the multiplication
+			//cuda_conv2 <<< blocks, threads >>>(img, mask, copy, w, h, M);
+			cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M);
+
+		}
+
+		template<typename T>
+		void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){
+
+			unsigned long N = w * h;
+			//allocate memory on the GPU for the array
+			T* gpu_img; 
+			T* gpu_mask; 
+			T* gpu_copy;
+			HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_mask, pow(M, 2) * sizeof(T) ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );
+
+			//copy the array to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
+			HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, pow(M, 2) * sizeof(T), cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_conv2<T>(gpu_img, gpu_mask ,gpu_copy, w, h, M);
+
+			//copy the array back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+			//free allocated memory
+			cudaFree(gpu_img);
+			cudaFree(gpu_mask);
+			cudaFree(gpu_copy);
+
+		}
+		
+	}
+}
+
+
+#endif
\ No newline at end of file
--
libgit2 0.21.4