added 2D convolution algorithm

David Mayerich
1 parent cfcf8619
Showing 2 changed files with 181 additions and 51 deletions Show diff stats
stim/image/image.h
stim/math/filters/conv2.h
 #ifndef STIM_IMAGE_H
 #define STIM_IMAGE_H
-#include <opencv2/core/core.hpp>
-#include <opencv2/highgui/highgui.hpp>
+#ifdef USING_OPENCV
+	#include <opencv2/core/core.hpp>
+	#include <opencv2/highgui/highgui.hpp>
+#endif
 #include <vector>
 #include <iostream>
 #include <limits>
@@ -18,7 +20,6 @@ namespace stim{
 template <class T>
 class image{
-	//cimg_library::CImg<T> img;
 	T* img;										//pointer to the image data (interleaved RGB for color)
 	size_t R[3];
@@ -57,18 +58,8 @@ class image{
 		return y * C() * X() + x * C() + c;
 	}
-
+#ifdef USING_OPENCV
 	int cv_type(){
-		// The following is C++ 11 code, but causes problems on some compilers (ex. nvcc). Below is my best approximation to a solution
-
-		//if(std::is_same<T, unsigned char>::value)	return CV_MAKETYPE(CV_8U, (int)C());
-		//if(std::is_same<T, char>::value)			return CV_MAKETYPE(CV_8S, (int)C());
-		//if(std::is_same<T, unsigned short>::value)	return CV_MAKETYPE(CV_16U, (int)C());
-		//if(std::is_same<T, short>::value)			return CV_MAKETYPE(CV_16S, (int)C());
-		//if(std::is_same<T, int>::value)				return CV_MAKETYPE(CV_32S, (int)C());
-		//if(std::is_same<T, float>::value)			return CV_MAKETYPE(CV_32F, (int)C());
-		//if(std::is_same<T, double>::value)			return CV_MAKETYPE(CV_64F, (int)C());
-
 		if(typeid(T) == typeid(unsigned char))		return CV_MAKETYPE(CV_8U, (int)C());
 		if(typeid(T) == typeid(char))				return CV_MAKETYPE(CV_8S, (int)C());
 		if(typeid(T) == typeid(unsigned short))		return CV_MAKETYPE(CV_16U, (int)C());
@@ -80,18 +71,9 @@ class image{
 		std::cout<<"ERROR in stim::image::cv_type - no valid data type found"<<std::endl;
 		exit(1);
 	}
-
+#endif
 	/// Returns the value for "white" based on the dynamic range (assumes white is 1.0 for floating point images)
 	T white(){
-		// The following is C++ 11 code, but causes problems on some compilers (ex. nvcc). Below is my best approximation to a solution
-
-		//if(std::is_same<T, unsigned char>::value)		return UCHAR_MAX;
-		//if(std::is_same<T, unsigned short>::value)		return SHRT_MAX;
-		//if(std::is_same<T, unsigned>::value)			return UINT_MAX;
-		//if(std::is_same<T, unsigned long>::value)		return ULONG_MAX;
-		//if(std::is_same<T, unsigned long long>::value)	return ULLONG_MAX;
-		//if(std::is_same<T, float>::value)				return 1.0f;
-		//if(std::is_same<T, double>::value)				return 1.0;
 		if(typeid(T) == typeid(unsigned char))		return UCHAR_MAX;
 		if(typeid(T) == typeid(unsigned short))		return SHRT_MAX;
@@ -164,23 +146,31 @@ public:
 			exit(1);
 		}
-		size_t c;													//allocate space for the number of channels
-		unsigned char format[2];									//allocate space to hold the image format tag
+		size_t nc;													//allocate space for the number of channels
+		char format[2];												//allocate space to hold the image format tag
 		infile.read(format, 2);										//read the image format tag
-		infile.seekg(1);											//skip the newline character
+		infile.seekg(1, std::ios::cur);								//skip the newline character
 		if (format[0] != 'P') {
 			std::cout << "Error in image::load_netpbm() - file format tag is invalid: " << format[0] << format[1] << std::endl;
 			exit(1);
 		}
-		if (format[1] == '5') c = 1;								//get the number of channels from the format flag
-		else if (format[1] == '6') c = 3;
+		if (format[1] == '5') nc = 1;								//get the number of channels from the format flag
+		else if (format[1] == '6') nc = 3;
 		else {
 			std::cout << "Error in image::load_netpbm() - file format tag is invalid: " << format[0] << format[1] << std::endl;
 			exit(1);
 		}
+
+		unsigned char c;								//stores a character
+		while (infile.peek() == '#') {					//if the next character indicates the start of a comment
+			while (true) {
+				c = infile.get();
+				if (c == 0x0A) break;
+			}
+		}
+
 		std::string sw;									//create a string to store the width of the image
-		unsigned char c;
 		while(true){
 			c = infile.get();							//get a single character
 			if (c == ' ') break;						//exit if we've encountered a space
@@ -194,18 +184,38 @@ public:
 			if (c == 0x0A) break;
 			sh.push_back(c);
 		}
-		size_t h = atoi(ah.c_str());					//convert the string into an integer
-		allocate(w, h, c);								//allocate space for the image
-		infile.read(img, size());						//copy the binary data from the file to the image
+		while (true) {									//skip the maximum value
+			c = infile.get();
+			if (c == 0x0A) break;
+		}
+		size_t h = atoi(sh.c_str());					//convert the string into an integer
+
+		allocate(w, h, nc);								//allocate space for the image
+		infile.read((char*)img, size());						//copy the binary data from the file to the image
 		infile.close();
 	}
-
+#ifdef USING_OPENCV
+	void from_opencv(unsigned char* buffer, size_t width, size_t height) {
+		allocate(width, height, 3);
+		T value;
+		size_t i;
+		for (size_t c = 0; c < C(); c++) {								//copy directly
+			for (size_t y = 0; y < Y(); y++) {
+				for (size_t x = 0; x < X(); x++) {
+					i = y * X() * C() + x * C() + (2 - c);
+					value = buffer[i];
+					img[idx(x, y, c)] = value;
+				}
+			}
+		}
+	}
+#endif
 	/// Load an image from a file
 	void load(std::string filename){
-
+#ifdef USING_OPENCV
 		cv::Mat cvImage = cv::imread(filename, CV_LOAD_IMAGE_UNCHANGED);	//use OpenCV to open the image file
 		if(!cvImage.data){
 			std::cout<<"ERROR stim::image::load() - unable to find image "<<filename<<std::endl;
@@ -220,22 +230,12 @@ public:
 			memcpy(img, cv_ptr, bytes());
 		if(C() == 3)														//if this is a 3-color image, OpenCV uses BGR interleaving
 			from_opencv(cv_ptr, X(), Y());
+#else
+		load_netpbm(filename);
+#endif
 	}
-	void from_opencv(unsigned char* buffer, size_t width, size_t height){
-		allocate(width, height, 3);
-		T value;
-		size_t i;
-		for(size_t c = 0; c < C(); c++){								//copy directly
-			for(size_t y = 0; y < Y(); y++){
-				for(size_t x = 0; x < X(); x++){
-					i = y * X() * C() + x * C() + (2-c);
-					value = buffer[i];
-					img[idx(x, y, c)] = value;
-				}
-			}
-		}
-	}
+
 	//save a Netpbm file
 	void save_netpbm(std::string filename) {
@@ -255,7 +255,9 @@ public:
 			std::cout << "Error in image::save_netpbm() - data must be grayscale or RGB." << std::endl;
 			exit(1);
 		}
-		outfile << width() << " " << height() << (char)0x0A;			//save the width and height
+		size_t w = width();
+		size_t h = height();
+		outfile << w << " " << h << (char)0x0A;			//save the width and height
 		outfile << "255" << (char)0x0A;								//output the maximum value
 		outfile.write((const char*)img, size());			//write the binary data
 		outfile.close();
@@ -263,6 +265,7 @@ public:
 	//save a file
 	void save(std::string filename){
+#ifdef USING_OPENCV
 		//OpenCV uses an interleaved format, so convert first and then output
 		T* buffer = (T*) malloc(bytes());
@@ -273,6 +276,9 @@ public:
 		cv::Mat cvImage((int)Y(), (int)X(), cv_type(), buffer);
 		cv::imwrite(filename, cvImage);
 		free(buffer);
+#else
+		save_netpbm(filename);
+#endif
 	}
 	void set_interleaved(T* buffer, size_t width, size_t height, size_t channels){
@@ -376,7 +382,7 @@ public:
 		size_t x, y;
 		for(y = 0; y < Y(); y++){
 			for(x = 0; x < X(); x++){
-				img[idx(x, y, c)] = buffer[c];
+				img[idx(x, y, c)] = buffer[y * X() + x];
 			}
 		}
 	}
@@ -499,6 +505,24 @@ public:
 		exit(1);
 	}
+	template<typename V>
+	operator image<V>() {
+		//create a new image
+		//image<V> r(X(), Y(), C());
+		V* dst = (V*)malloc(size() * sizeof(V));
+
+		for (size_t x = 0; x < X(); x++) {
+			for (size_t y = 0; y < Y(); y++) {
+				for (size_t c = 0; c < C(); c++) {
+					dst[idx(x, y, c)] = (V)img[idx(x, y, c)];
+				}
+			}
+		}
+
+		image<V> r(dst, X(), Y(), C());
+		return r;
+	}
+
 };
 };		//end namespace stim
+#ifndef STIM_CUDA_CONV2_H
+#define STIM_CUDA_CONV2_H
+//#define __CUDACC__
+
+#ifdef __CUDACC__
+#include <stim/cuda/cudatools.h>
+#endif
+
+namespace stim {
+
+	//Kernel function that performs the 2D convolution.
+	template<typename T>
+	__global__ void kernel_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		size_t xi = blockIdx.x * blockDim.x + threadIdx.x;	//threads correspond to indices into the output image
+		size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+		size_t X = sx - kx + 1;								//calculate the size of the output image
+		size_t Y = sy - ky + 1;
+
+		if (xi >= X || yi >= Y) return;						//returns if the thread is outside of the output image
+		
+		//loop through the kernel
+		size_t kxi, kyi;
+		T v = 0;
+		for (kyi = 0; kyi < ky; kyi++) {
+			for (kxi = 0; kxi < kx; kxi++) {
+				v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
+			}
+		}
+		out[yi * X + xi] = v;								//write the result to global memory
+
+	}
+
+	//Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T>
+	void gpu_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		cudaDeviceProp p;
+		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
+		size_t tmax = p.maxThreadsPerBlock;
+		dim3 tn(sqrt(tmax), sqrt(tmax));					//calculate the block dimensions
+		size_t X = sx - kx + 1;								//calculate the size of the output image
+		size_t Y = sy - ky + 1;
+		dim3 bn(X / tn.x + 1, Y / tn.y + 1);				//calculate the grid dimensions
+		kernel_conv2 <<<bn, tn >>> (out, in, kernel, sx, sy, kx, ky);	//launch the kernel
+	}
+
+	//Performs a convolution of a 2D image. Only valid pixels based on the kernel are returned.
+	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T>
+	void cpu_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		size_t X = sx - kx + 1;					//x size of the output image
+		size_t Y = sy - ky + 1;					//y size of the output image
+
+#ifdef __CUDACC__
+		//allocate memory and copy everything to the GPU
+		T* gpu_in;
+		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
+		T* gpu_kernel;
+		HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(T), cudaMemcpyHostToDevice));
+		T* gpu_out;
+		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
+		gpu_conv2(gpu_out, gpu_in, gpu_kernel, sx, sy, kx, ky);								//execute the GPU kernel
+		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
+		HANDLE_ERROR(cudaFree(gpu_in));
+		HANDLE_ERROR(cudaFree(gpu_kernel));
+		HANDLE_ERROR(cudaFree(gpu_out));
+#else
+		
+
+		T v;												//register stores the integral of the current pixel value
+		size_t yi, xi, kyi, kxi, yi_kyi_sx;
+		for (yi = 0; yi < Y; yi++) {					//for each pixel in the output image
+			for (xi = 0; xi < X; xi++) {
+				v = 0;
+				for (kyi = 0; kyi < ky; kyi++) {		//for each pixel in the kernel
+					yi_kyi_sx = (yi + kyi) * sx;
+					for (kxi = 0; kxi < kx; kxi++) {
+						v += in[yi_kyi_sx + xi + kxi] * kernel[kyi * kx + kxi];
+					}
+				}
+				out[yi * X + xi] = v;						//save the result to the output array
+			}
+		}
+		
+#endif
+	}
+
+
+}
+
+
+#endif
 \ No newline at end of file