From dbeb83f2a83d7c1624fee2f0e2e5c860abd3c5b6 Mon Sep 17 00:00:00 2001
From: David
Date: Thu, 6 Oct 2016 00:36:51 -0500
Subject: [PATCH] added separable convolution CPU and GPU

---
 stim/cuda/sharedmem.cuh      |  4 ++++
 stim/image/image.h           | 49 +++++++++++++++++++++++--------------------------
 stim/math/filters/conv2.h    | 57 +++++++++++++++++++++++++++++++++++++--------------------
 stim/math/filters/sepconv2.h | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 153 insertions(+), 46 deletions(-)
 create mode 100644 stim/math/filters/sepconv2.h

diff --git a/stim/cuda/sharedmem.cuh b/stim/cuda/sharedmem.cuh
index a5f05cc..60db5e0 100644
--- a/stim/cuda/sharedmem.cuh
+++ b/stim/cuda/sharedmem.cuh
@@ -48,7 +48,11 @@ namespace stim{
 
 		/// Threaded copying of 2D data on a CUDA device
 		/// @param dest is a linear destination array of size nx * ny
+		/// @param nx is the size of the region to be copied along the X dimension
+		/// @param ny is the size of the region to be copied along the Y dimension
 		/// @param src is a 2D image stored as a linear array with a pitch of X
+		/// @param x is the x position in the source image where the copy is started
+		/// @param y is the y position in the source image where the copy is started
 		/// @param X is the number of bytes in a row of src
 		/// @param tid is a 1D id for the current thread
 		/// @param nt is the number of threads in the block
diff --git a/stim/image/image.h b/stim/image/image.h
index 7a461b9..fa7b5eb 100644
--- a/stim/image/image.h
+++ b/stim/image/image.h
@@ -23,9 +23,9 @@ class image{
 	T* img;										//pointer to the image data (interleaved RGB for color)
 	size_t R[3];
 
-	size_t X() const { return R[1]; }
-	size_t Y() const { return R[2]; }
-	size_t C() const { return R[0]; }
+	inline size_t X() const { return R[1]; }
+	inline size_t Y() const { return R[2]; }
+	inline size_t C() const { return R[0]; }
 
 	void init(){								//initializes all variables, assumes no memory is allocated
 		memset(R, 0, sizeof(size_t) * 3);		//set the resolution and number of channels to zero
@@ -54,8 +54,8 @@ class image{
 
 	size_t bytes(){ return size() * sizeof(T); }
 
-	size_t idx(size_t x, size_t y, size_t c = 0){
-		return y * C() * X() + x * C() + c;
+	inline size_t idx(size_t x, size_t y, size_t c = 0){
+		return y * R[0] * R[1] + x * R[0] + c;
 	}
 
 #ifdef USING_OPENCV
@@ -338,20 +338,27 @@ public:
 		}
 	}
 
-
-	image channel(size_t c){
-
-		//create a new image
-		image r(X(), Y(), 1);
-
+	/// Return an image representing a specified channel
+	/// @param c is the channel to be returned
+	image channel(size_t c){
+		image r(X(), Y(), 1);					//create a new image
 		for(size_t x = 0; x < X(); x++){
 			for(size_t y = 0; y < Y(); y++){
 				r.img[r.idx(x, y, 0)] = img[idx(x, y, c)];
 			}
 		}
-
 		return r;
+	}
+
+	/// Returns an std::vector containing each channel as a separate image
+	std::vector<image<T>> split() {
+		std::vector<image<T>> r;				//create an image array
+		r.resize(C());							//create images for each channel
+		for (size_t c = 0; c < C(); c++) {		//for each channel
+			r[c] = channel(c);					//copy the channel image to the array
+		}
+		return r;
 	}
 
 
 	T& operator()(size_t x, size_t y, size_t c = 0){
@@ -505,22 +512,12 @@ public:
 		exit(1);
 	}
 
+	/// Casting operator, casts every value in an image to a different data type V
 	template<typename V>
 	operator image<V>() {
-		//create a new image
-		//image<V> r(X(), Y(), C());
-		V* dst = (V*)malloc(size() * sizeof(V));
-
-		for (size_t x = 0; x < X(); x++) {
-			for (size_t y = 0; y < Y(); y++) {
-				for (size_t c = 0; c < C(); c++) {
-					dst[idx(x, y, c)] = (V)img[idx(x, y, c)];
-				}
-			}
-		}
-
-		image<V> r(dst, X(), Y(), C());
-		return r;
+		image<V> r(X(), Y(), C());					//create a new image
+		std::copy(img, img + size(), r.data());		//copy and cast the data
+		return r;									//return the new image
 	}
 };
 
diff --git a/stim/math/filters/conv2.h b/stim/math/filters/conv2.h
index 760dea4..5c149ea 100644
--- a/stim/math/filters/conv2.h
+++ b/stim/math/filters/conv2.h
@@ -4,30 +4,44 @@
 #ifdef __CUDACC__
 #include 
+#include 
 #endif
 
 namespace stim {
-
+#ifdef __CUDACC__
 	//Kernel function that performs the 2D convolution.
-	template<typename T>
-	__global__ void kernel_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+	template<typename T, typename K>
+	__global__ void kernel_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		extern __shared__ T s[];								//declare a shared memory array
 		size_t xi = blockIdx.x * blockDim.x + threadIdx.x;		//threads correspond to indices into the output image
 		size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
+		size_t tid = threadIdx.y * blockDim.x + threadIdx.x;
+		size_t nt = blockDim.x * blockDim.y;
+
+		size_t cx = blockIdx.x * blockDim.x;					//find the upper left corner of the input region
+		size_t cy = blockIdx.y * blockDim.y;
 
 		size_t X = sx - kx + 1;									//calculate the size of the output image
 		size_t Y = sy - ky + 1;
 
+		if (cx >= X || cy >= Y) return;							//return if the entire block is outside the image
+		size_t smx = min(blockDim.x + kx - 1, sx - cx);			//size of the shared copy of the input image
+		size_t smy = min(blockDim.y + ky - 1, sy - cy);			//	min function is used to deal with boundary blocks
+		stim::cuda::threadedMemcpy2D(s, smx, smy, in, cx, cy, sx, sy, tid, nt);	//copy the input region to shared memory
+		__syncthreads();
+
 		if (xi >= X || yi >= Y) return;							//returns if the thread is outside of the output image
 
 		//loop through the kernel
 		size_t kxi, kyi;
-		T v = 0;
+		K v = 0;
 		for (kyi = 0; kyi < ky; kyi++) {
 			for (kxi = 0; kxi < kx; kxi++) {
-				v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
+				v += s[(threadIdx.y + kyi) * smx + threadIdx.x + kxi] * kernel[kyi * kx + kxi];
+				//v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
 			}
 		}
 
-		out[yi * X + xi] = v;									//write the result to global memory
+		out[yi * X + xi] = (T)v;								//write the result to global memory
 	}
 
 
@@ -38,18 +52,23 @@ namespace stim {
 	//@param sy is the size of the input image along Y
 	//@param kx is the size of the kernel along X
 	//@param ky is the size of the kernel along Y
-	template<typename T>
-	void gpu_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+	template<typename T, typename K>
+	void gpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
 		cudaDeviceProp p;
 		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
 		size_t tmax = p.maxThreadsPerBlock;
-		dim3 tn(sqrt(tmax), sqrt(tmax));						//calculate the block dimensions
+		dim3 nt(sqrt(tmax), sqrt(tmax));						//calculate the block dimensions
 		size_t X = sx - kx + 1;									//calculate the size of the output image
 		size_t Y = sy - ky + 1;
-		dim3 bn(X / tn.x + 1, Y / tn.y + 1);					//calculate the grid dimensions
-		kernel_conv2 <<<bn, tn>>> (out, in, kernel, sx, sy, kx, ky);	//launch the kernel
+		dim3 nb(X / nt.x + 1, Y / nt.y + 1);					//calculate the grid dimensions
+		size_t sm = (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T);		//shared memory bytes required to store block data
+		if (sm > p.sharedMemPerBlock) {
+			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<<nb, nt, sm>>> (out, in, kernel, sx, sy, kx, ky);	//launch the kernel
 	}
-
+#endif
 	//Performs a convolution of a 2D image. Only valid pixels based on the kernel are returned.
 	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
 	//@param out is a pointer to the output image
@@ -58,8 +77,8 @@ namespace stim {
 	//@param sy is the size of the input image along Y
 	//@param kx is the size of the kernel along X
 	//@param ky is the size of the kernel along Y
-	template<typename T>
-	void cpu_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+	template<typename T, typename K>
+	void cpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
 		size_t X = sx - kx + 1;						//x size of the output image
 		size_t Y = sy - ky + 1;						//y size of the output image
 
@@ -68,9 +87,9 @@ namespace stim {
 		T* gpu_in;
 		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
 		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
-		T* gpu_kernel;
-		HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(T)));
-		HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(T), cudaMemcpyHostToDevice));
+		K* gpu_kernel;
+		HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(K), cudaMemcpyHostToDevice));
 		T* gpu_out;
 		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
 		gpu_conv2(gpu_out, gpu_in, gpu_kernel, sx, sy, kx, ky);				//execute the GPU kernel
@@ -79,9 +98,7 @@ namespace stim {
 		HANDLE_ERROR(cudaFree(gpu_kernel));
 		HANDLE_ERROR(cudaFree(gpu_out));
 #else
-
-
-		T v;										//register stores the integral of the current pixel value
+		K v;										//register stores the integral of the current pixel value
 		size_t yi, xi, kyi, kxi, yi_kyi_sx;
 		for (yi = 0; yi < Y; yi++) {				//for each pixel in the output image
 			for (xi = 0; xi < X; xi++) {
diff --git a/stim/math/filters/sepconv2.h b/stim/math/filters/sepconv2.h
new file mode 100644
index 0000000..90503aa
--- /dev/null
+++ b/stim/math/filters/sepconv2.h
@@ -0,0 +1,89 @@
+#ifndef STIM_CUDA_SEPCONV2_H
+#define STIM_CUDA_SEPCONV2_H
+#include 
+#ifdef __CUDACC__
+#include 
+#include 
+#endif
+
+namespace stim {
+#ifdef __CUDACC__
+	//Performs a separable convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void gpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
+		cudaDeviceProp p;
+		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
+		size_t tmax = p.maxThreadsPerBlock;
+		dim3 nt(sqrt(tmax), sqrt(tmax));						//calculate the block dimensions
+		size_t X = sx - kx + 1;									//calculate the x size of the output image
+		T* temp;												//declare a temporary variable to store the intermediate image
+		HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T)));	//allocate memory for the intermediate image
+
+		dim3 nb(X / nt.x + 1, sy / nt.y + 1);					//calculate the grid dimensions
+		size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T);			//shared memory bytes required to store block data
+		if (sm > p.sharedMemPerBlock) {
+			std::cout << "Error in stim::gpu_sepconv2() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1);	//launch the kernel to compute the intermediate image
+
+		size_t Y = sy - ky + 1;									//calculate the y size of the output image
+		nb.y = Y / nt.y + 1;									//update the grid dimensions to reflect the Y-axis size of the output image
+		sm = nt.x * (nt.y + ky - 1) * sizeof(T);				//calculate the amount of shared memory needed for the second pass
+		if (sm > p.sharedMemPerBlock) {
+			std::cout << "Error in stim::gpu_sepconv2() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<<nb, nt, sm>>> (out, temp, k1, X, sy, 1, ky);	//launch the kernel to compute the final image
+		HANDLE_ERROR(cudaFree(temp));							//free memory allocated for the intermediate image
+	}
+#endif
+	//Performs a separable convolution of a 2D image. Only valid pixels based on the kernel are returned.
+	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param k0 is the x-axis convolution filter
+	//@param k1 is the y-axis convolution filter
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void cpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
+		size_t X = sx - kx + 1;						//x size of the output image
+		size_t Y = sy - ky + 1;
+#ifdef __CUDACC__
+		//allocate memory and copy everything to the GPU
+		T* gpu_in;
+		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
+		K* gpu_k0;
+		HANDLE_ERROR(cudaMalloc(&gpu_k0, kx * sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_k0, k0, kx * sizeof(K), cudaMemcpyHostToDevice));
+		K* gpu_k1;
+		HANDLE_ERROR(cudaMalloc(&gpu_k1, ky * sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_k1, k1, ky * sizeof(K), cudaMemcpyHostToDevice));
+		T* gpu_out;
+		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
+		gpu_sepconv2(gpu_out, gpu_in, gpu_k0, gpu_k1, sx, sy, kx, ky);		//execute the GPU kernel
+		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
+		HANDLE_ERROR(cudaFree(gpu_in));
+		HANDLE_ERROR(cudaFree(gpu_k0));
+		HANDLE_ERROR(cudaFree(gpu_k1));
+		HANDLE_ERROR(cudaFree(gpu_out));
+#else
+		T* temp = (T*)malloc(X * sy * sizeof(T));		//allocate space for the intermediate image
+		cpu_conv2(temp, in, k0, sx, sy, kx, 1);			//evaluate the intermediate image
+		cpu_conv2(out, temp, k1, X, sy, 1, ky);			//evaluate the final image
+		free(temp);										//free the memory for the intermediate image
+#endif
+	}
+}
+
+#endif
\ No newline at end of file
--
libgit2 0.21.4
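
Usage note (not part of the patch): the sketch below shows one way the new separable convolution could be called from host code. It is a minimal example under stated assumptions: the image size, filter values, and main() scaffolding are hypothetical, the image is a single-channel float buffer in row-major order, and the include paths assume the repository root is on the compiler's include path. The call dispatches exactly as the patch describes: compiled with nvcc, cpu_sepconv2() copies the buffers to the device and runs gpu_sepconv2(); compiled without CUDA, it falls back to two cpu_conv2() passes.

#include <cstddef>
#include <cstdlib>
#include <stim/math/filters/conv2.h>		// cpu_conv2 / gpu_conv2 (shared-memory kernel added by this patch)
#include <stim/math/filters/sepconv2.h>		// cpu_sepconv2 / gpu_sepconv2 (new in this patch)

int main() {
	const size_t sx = 512, sy = 512;					// input image size (assumed for this example)
	const size_t kx = 5, ky = 5;						// sizes of the two 1D filters
	float k0[kx] = { 1, 4, 6, 4, 1 };					// x-axis filter (binomial approximation of a Gaussian)
	float k1[ky] = { 1, 4, 6, 4, 1 };					// y-axis filter
	for (size_t i = 0; i < kx; i++) k0[i] /= 16.0f;		// normalize each 1D filter to sum to 1
	for (size_t i = 0; i < ky; i++) k1[i] /= 16.0f;

	float* in = (float*)malloc(sx * sy * sizeof(float));	// input image
	// ... fill "in" with pixel data here ...
	size_t X = sx - kx + 1;								// only valid pixels are returned, so the output
	size_t Y = sy - ky + 1;								//   is smaller than the input by (kx-1, ky-1)
	float* out = (float*)malloc(X * Y * sizeof(float));

	stim::cpu_sepconv2(out, in, k0, k1, sx, sy, kx, ky);	// x-axis pass with k0, then y-axis pass with k1

	free(in);
	free(out);
	return 0;
}

The separable form is what motivates the new header: the two 1D passes cost on the order of kx + ky multiply-adds per output pixel, compared with kx * ky for a single 2D kernel passed to cpu_conv2()/gpu_conv2().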