From f186dbdaad1858d7e18d4b35a1e099ef81ff1f42 Mon Sep 17 00:00:00 2001
From: Tianshu Cheng
Date: Wed, 7 Oct 2015 13:45:25 -0500
Subject: [PATCH] header file for bsds500

---
 stim/cuda/arraymath.cuh                 |   5 +++++
 stim/cuda/arraymath/array_add.cuh       |   1 +
 stim/cuda/arraymath/array_atan.cuh      |  67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stim/cuda/arraymath/array_cos.cuh       |  67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stim/cuda/arraymath/array_divide.cuh    |  70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stim/cuda/arraymath/array_multiply2.cuh |  70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stim/cuda/arraymath/array_sin.cuh       |  67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stim/cuda/cudatools/devices.h           |   9 +++++++++
 stim/cuda/templates/chi_gradient.cuh    | 221 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stim/cuda/templates/conv2.cuh           |   8 ++------
 stim/image/image.h                      |  35 ++++++++++++++++++++++++++++++++++-
 stim/image/image_contour_detection.h    |  18 +++++++++++++++---
 12 files changed, 628 insertions(+), 10 deletions(-)
 create mode 100644 stim/cuda/arraymath/array_atan.cuh
 create mode 100644 stim/cuda/arraymath/array_cos.cuh
 create mode 100644 stim/cuda/arraymath/array_divide.cuh
 create mode 100644 stim/cuda/arraymath/array_multiply2.cuh
 create mode 100644 stim/cuda/arraymath/array_sin.cuh
 create mode 100644 stim/cuda/templates/chi_gradient.cuh

diff --git a/stim/cuda/arraymath.cuh b/stim/cuda/arraymath.cuh
index 43d0154..adbf66a 100644
--- a/stim/cuda/arraymath.cuh
+++ b/stim/cuda/arraymath.cuh
@@ -3,6 +3,11 @@
 #include
 #include
+#include
+#include
+#include
+#include
+#include
 #include
 #include
 
diff --git a/stim/cuda/arraymath/array_add.cuh b/stim/cuda/arraymath/array_add.cuh
index 32ac40e..82be364 100644
--- a/stim/cuda/arraymath/array_add.cuh
+++ b/stim/cuda/arraymath/array_add.cuh
@@ -3,6 +3,7 @@
 #include
 #include
+//#include
 #include
 
 namespace stim{
diff --git a/stim/cuda/arraymath/array_atan.cuh b/stim/cuda/arraymath/array_atan.cuh
new file mode 100644
index 0000000..65fc458
--- /dev/null
+++ b/stim/cuda/arraymath/array_atan.cuh
@@ -0,0 +1,67 @@
+#ifndef STIM_CUDA_ARRAY_ATAN_H
+#define STIM_CUDA_ARRAY_ATAN_H
+
+#include
+#include
+#include
+#include
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void cuda_atan(T* ptr1, T* out, unsigned int N){
+
+			//calculate the 1D index for this thread
+			int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+			if(idx < N){
+				out[idx] = atan(ptr1[idx]);
+			}
+
+		}
+
+		template<typename T>
+		void gpu_atan(T* ptr1, T* out, unsigned int N){
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks
+			int blocks = N / threads + 1;
+
+			//call the kernel to compute the element-wise arctangent
+			cuda_atan <<< blocks, threads >>>(ptr1, out, N);
+
+		}
+
+		template<typename T>
+		void cpu_atan(T* ptr1, T* cpu_out, unsigned int N){
+
+			//allocate memory on the GPU for the arrays
+			T* gpu_ptr1;
+			T* gpu_out;
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );
+
+			//copy the array to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_atan(gpu_ptr1, gpu_out, N);
+
+			//copy the array back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+			//free allocated memory
+			cudaFree(gpu_ptr1);
+			cudaFree(gpu_out);
+
+		}
+
+	}
+}
+
+
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/arraymath/array_cos.cuh b/stim/cuda/arraymath/array_cos.cuh
new file mode 100644
index 0000000..6880c6b
--- /dev/null
+++ b/stim/cuda/arraymath/array_cos.cuh
@@ -0,0 +1,67 @@
+#ifndef STIM_CUDA_ARRAY_COS_H
+#define STIM_CUDA_ARRAY_COS_H
+
+#include
+#include
+#include
+#include
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void cuda_cos(T* ptr1, T* out, unsigned int N){
+
+			//calculate the 1D index for this thread
+			int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+			if(idx < N){
+				out[idx] = cos(ptr1[idx]);
+			}
+
+		}
+
+		template<typename T>
+		void gpu_cos(T* ptr1, T* out, unsigned int N){
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks
+			int blocks = N / threads + 1;
+
+			//call the kernel to compute the element-wise cosine
+			cuda_cos <<< blocks, threads >>>(ptr1, out, N);
+
+		}
+
+		template<typename T>
+		void cpu_cos(T* ptr1, T* cpu_out, unsigned int N){
+
+			//allocate memory on the GPU for the arrays
+			T* gpu_ptr1;
+			T* gpu_out;
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );
+
+			//copy the array to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_cos(gpu_ptr1, gpu_out, N);
+
+			//copy the array back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+			//free allocated memory
+			cudaFree(gpu_ptr1);
+			cudaFree(gpu_out);
+
+		}
+
+	}
+}
+
+
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/arraymath/array_divide.cuh b/stim/cuda/arraymath/array_divide.cuh
new file mode 100644
index 0000000..c92c9df
--- /dev/null
+++ b/stim/cuda/arraymath/array_divide.cuh
@@ -0,0 +1,70 @@
+#ifndef STIM_CUDA_ARRAY_DIVIDE_H
+#define STIM_CUDA_ARRAY_DIVIDE_H
+
+#include
+#include
+#include
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void cuda_divide(T* ptr1, T* ptr2, T* quotient, unsigned int N){
+
+			//calculate the 1D index for this thread
+			int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+			if(idx < N){
+				quotient[idx] = ptr1[idx] / ptr2[idx];
+			}
+
+		}
+
+		template<typename T>
+		void gpu_divide(T* ptr1, T* ptr2, T* quotient, unsigned int N){
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks
+			int blocks = N / threads + 1;
+
+			//call the kernel to perform the element-wise division
+			cuda_divide <<< blocks, threads >>>(ptr1, ptr2, quotient, N);
+
+		}
+
+		template<typename T>
+		void cpu_divide(T* ptr1, T* ptr2, T* cpu_quotient, unsigned int N){
+
+			//allocate memory on the GPU for the arrays
+			T* gpu_ptr1;
+			T* gpu_ptr2;
+			T* gpu_quotient;
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_quotient, N * sizeof(T) ) );
+
+			//copy the arrays to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_divide(gpu_ptr1, gpu_ptr2, gpu_quotient, N);
+
+			//copy the array back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_quotient, gpu_quotient, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+			//free allocated memory
+			cudaFree(gpu_ptr1);
+			cudaFree(gpu_ptr2);
+			cudaFree(gpu_quotient);
+
+		}
+
+	}
+}
+
+
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/arraymath/array_multiply2.cuh b/stim/cuda/arraymath/array_multiply2.cuh
new file mode 100644
index 0000000..6296ed0
--- /dev/null
+++ b/stim/cuda/arraymath/array_multiply2.cuh
@@ -0,0 +1,70 @@
+#ifndef STIM_CUDA_ARRAY_MULTIPLY_H
+#define STIM_CUDA_ARRAY_MULTIPLY_H
+
+#include
+#include
+#include
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void cuda_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){
+
+			//calculate the 1D index for this thread
+			int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+			if(idx < N){
+				product[idx] = ptr1[idx] * ptr2[idx];
+			}
+
+		}
+
+		template<typename T>
+		void gpu_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks
+			int blocks = N / threads + 1;
+
+			//call the kernel to do the multiplication
+			cuda_multiply <<< blocks, threads >>>(ptr1, ptr2, product, N);
+
+		}
+
+		template<typename T>
+		void cpu_multiply(T* ptr1, T* ptr2, T* cpu_product, unsigned int N){
+
+			//allocate memory on the GPU for the arrays
+			T* gpu_ptr1;
+			T* gpu_ptr2;
+			T* gpu_product;
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_product, N * sizeof(T) ) );
+
+			//copy the arrays to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_multiply(gpu_ptr1, gpu_ptr2, gpu_product, N);
+
+			//copy the array back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_product, gpu_product, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+			//free allocated memory
+			cudaFree(gpu_ptr1);
+			cudaFree(gpu_ptr2);
+			cudaFree(gpu_product);
+
+		}
+
+	}
+}
+
+
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/arraymath/array_sin.cuh b/stim/cuda/arraymath/array_sin.cuh
new file mode 100644
index 0000000..1c150d7
--- /dev/null
+++ b/stim/cuda/arraymath/array_sin.cuh
@@ -0,0 +1,67 @@
+#ifndef STIM_CUDA_ARRAY_SIN_H
+#define STIM_CUDA_ARRAY_SIN_H
+
+#include
+#include
+#include
+#include
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void cuda_sin(T* ptr1, T* out, unsigned int N){
+
+			//calculate the 1D index for this thread
+			int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+			if(idx < N){
+				out[idx] = sin(ptr1[idx]);
+			}
+
+		}
+
+		template<typename T>
+		void gpu_sin(T* ptr1, T* out, unsigned int N){
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks
+			int blocks = N / threads + 1;
+
+			//call the kernel to compute the element-wise sine
+			cuda_sin <<< blocks, threads >>>(ptr1, out, N);
+
+		}
+
+		template<typename T>
+		void cpu_sin(T* ptr1, T* cpu_out, unsigned int N){
+
+			//allocate memory on the GPU for the arrays
+			T* gpu_ptr1;
+			T* gpu_out;
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );
+
+			//copy the array to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_sin(gpu_ptr1, gpu_out, N);
+
+			//copy the array back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+			//free allocated memory
+			cudaFree(gpu_ptr1);
+			cudaFree(gpu_out);
+
+		}
+
+	}
+}
+
+
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/cudatools/devices.h b/stim/cuda/cudatools/devices.h
index 4dce378..231a26b 100644
--- a/stim/cuda/cudatools/devices.h
+++ b/stim/cuda/cudatools/devices.h
@@ -13,6 +13,15 @@ int maxThreadsPerBlock()
 	cudaGetDeviceProperties(&props, device);
 	return props.maxThreadsPerBlock;
 }
+
+int sharedMemPerBlock()
+{
+	int device;
+	cudaGetDevice(&device);			//get the id of the current device
+	cudaDeviceProp props;			//device property structure
+	cudaGetDeviceProperties(&props, device);
+	return props.sharedMemPerBlock;
+}
 }	//end namespace rts
 
 #endif
diff --git a/stim/cuda/templates/chi_gradient.cuh b/stim/cuda/templates/chi_gradient.cuh
new file mode 100644
index 0000000..cb4b225
--- /dev/null
+++ b/stim/cuda/templates/chi_gradient.cuh
@@ -0,0 +1,221 @@
+#ifndef STIM_CUDA_CHI_GRAD_H
+#define STIM_CUDA_CHI_GRAD_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define PI 3.14159265358979
+
+namespace stim{
+	namespace cuda{
+
+		/// template parameter @param T is the data type
+		template<typename T>
+		__global__ void cuda_chi_grad(T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
+
+			double theta_r = ((theta) * PI)/180;		//convert the angle from degrees to radians
+			float sum = 0;
+			unsigned int N = w * h;
+
+			//convert the 1D index to 2D coordinates
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yj = blockIdx.y;
+			int idx = yj * w + xi;
+			int shareidx = threadIdx.x;
+
+			extern __shared__ unsigned short bin[];
+
+			if(xi < w && yj < h){
+
+				int gidx;
+				int hidx;
+
+				//initialize the histogram bins to zero
+				for(int i = 0; i < bin_n; i++){
+
+					bin[shareidx * bin_n + i] = 0;
+					__syncthreads();
+
+				}
+
+				//build the histogram of the first half of the disc and store it in bin
+				for (int y = yj - r; y <= yj + r; y++){
+					for (int x = xi - r; x <= xi + r; x++){
+
+						if ((y - yj)*cos(theta_r) + (x - xi)*sin(theta_r) > 0){
+
+							gidx = (int) tex2D(texObj, (float)x/w, (float)y/h)/bin_size;
+							__syncthreads();
+
+							bin[shareidx * bin_n + gidx]++;
+							__syncthreads();
+
+						}
+
+						else{}
+					}
+				}
+
+				//initialize gbin
+				unsigned short* gbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short));
+				memset (gbin, 0, bin_n*sizeof(unsigned short));
+
+				//copy the histogram to gbin
+				for (unsigned int gi = 0; gi < bin_n; gi++){
+
+					gbin[gi] = bin[shareidx * bin_n + gi];
+
+				}
+
+				//reset the histogram bins to zero
+				for(int j = 0; j < bin_n; j++){
+
+					bin[shareidx * bin_n + j] = 0;
+					__syncthreads();
+				}
+
+				//build the histogram of the second half of the disc and store it in bin
+				for (int y = yj - r; y <= yj + r; y++){
+					for (int x = xi - r; x <= xi + r; x++){
+
+						if ((y - yj)*cos(theta_r) + (x - xi)*sin(theta_r) < 0){
+
+							hidx = (int) tex2D(texObj, (float)x/w, (float)y/h)/bin_size;
+							__syncthreads();
+
+							bin[shareidx * bin_n + hidx]++;
+							__syncthreads();
+
+						}
+						else{}
+					}
+				}
+
+				//initialize hbin
+				unsigned short* hbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short));
+				memset (hbin, 0, bin_n*sizeof(unsigned short));
+
+				//copy the histogram to hbin
+				for (unsigned int hi = 0; hi < bin_n; hi++){
+
+					hbin[hi] = bin[shareidx * bin_n + hi];
+
+				}
+
+				//compare gbin and hbin and calculate the chi-squared distance
+				for (int k = 0; k < bin_n; k++){
+
+					float flag;		//set a flag to avoid a zero denominator
+
+					if ((gbin[k] + hbin[k]) == 0){
+						flag = 1;
+					}
+					else {
+						flag = (gbin[k] + hbin[k]);
+						__syncthreads();
+					}
+
+					sum += (gbin[k] - hbin[k])*(gbin[k] - hbin[k])/flag;
+					__syncthreads();
+
+				}
+
+				//return the chi-distance for each pixel
+				copy[idx] = sum;
+
+				free(gbin);
+				free(hbin);
+			}
+		}
+
+
+		template<typename T>
+		void gpu_chi_grad(T* img, T* copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
+
+			unsigned long N = w * h;
+
+			// Allocate a CUDA array in device memory
+
+			//define a channel descriptor for a single 32-bit channel
+			cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+			cudaArray* cuArray;								//declare the cuda array
+			cudaMallocArray(&cuArray, &channelDesc, w, h);	//allocate the cuda array
+
+			// Copy the image data from global memory to the array
+			cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), cudaMemcpyDeviceToDevice);
+
+			// Specify the texture resource
+			struct cudaResourceDesc resDesc;				//create a resource descriptor
+			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
+			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
+			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
+
+			// Specify texture object parameters
+			struct cudaTextureDesc texDesc;					//create a texture descriptor
+			memset(&texDesc, 0, sizeof(texDesc));			//set all values in the texture descriptor to zero
+			texDesc.addressMode[0] = cudaAddressModeMirror;	//mirror the image at the boundaries
+			texDesc.addressMode[1] = cudaAddressModeMirror;
+			texDesc.filterMode = cudaFilterModePoint;		//use point (nearest-neighbor) sampling
+			texDesc.readMode = cudaReadModeElementType;		//read data based on the element type (32-bit floats)
+			texDesc.normalizedCoords = 1;					//use normalized coordinates
+
+			// Create the texture object
+			cudaTextureObject_t texObj = 0;
+			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+			int sharemax = stim::sharedMemPerBlock();		//get the size of shared memory available per block in bytes
+			unsigned int shared_bytes = threads * bin_n * sizeof(unsigned short);
+
+			if(shared_bytes > sharemax){
+
+				std::cout << "Error: shared_bytes exceeds the shared memory available per block." << '\n';
+				exit(1);
+
+			}
+
+			//calculate the number of blocks
+			dim3 blocks(w / threads + 1, h);
+
+			//call the kernel to compute the chi-squared gradient
+			cuda_chi_grad <<< blocks, threads, shared_bytes >>>(copy, texObj, w, h, r, bin_n, bin_size, theta);
+
+		}
+
+		template<typename T>
+		void cpu_chi_grad(T* img, T* cpu_copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
+
+			unsigned long N = w * h;
+
+			//allocate memory on the GPU for the arrays
+			T* gpu_img;
+			T* gpu_copy;
+			HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );
+
+			//copy the array to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_chi_grad(gpu_img, gpu_copy, w, h, r, bin_n, bin_size, theta);
+
+			//copy the array back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+			//free allocated memory
+			cudaFree(gpu_img);
+			cudaFree(gpu_copy);
+
+		}
+
+	}
+}
+
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/templates/conv2.cuh b/stim/cuda/templates/conv2.cuh
index c9d6673..ca9918d 100644
--- a/stim/cuda/templates/conv2.cuh
+++ b/stim/cuda/templates/conv2.cuh
@@ -11,8 +11,7 @@ namespace stim{
 	namespace cuda{
 
 		template<typename T>
-		//__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){
-		__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){
+		__global__ void cuda_conv2(T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){
 
 			//the radius of mask
 
@@ -51,8 +50,6 @@
 					int xx = x - (i - r);
 					int yy = y - (j - r);
 
-					//T temp = img[y * w + x] * mask[yy * M + xx];
-					//sum += img[y * w + x] * mask[yy * M + xx];
 					sum += tex2D(texObj, (float)x/w, (float)y/h) * mask[yy * M + xx];
 				}
 			}
@@ -105,8 +102,7 @@
 			dim3 blocks(w / threads + 1, h);
 
 			//call the kernel to do the multiplication
-			//cuda_conv2 <<< blocks, threads >>>(img, mask, copy, w, h, M);
-			cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M);
+			cuda_conv2 <<< blocks, threads >>>(mask, copy, texObj, w, h, M);
 
 		}
 
diff --git a/stim/image/image.h b/stim/image/image.h
index 8bbb507..e7d7fd1 100644
--- a/stim/image/image.h
+++ b/stim/image/image.h
@@ -31,8 +31,12 @@ public:
 	}
 
 	/// Constructor initializes an image to a given size
-	image(unsigned int x, unsigned int y = 1, unsigned int z = 1){
+	/*image(unsigned int x, unsigned int y = 1, unsigned int z = 1){
 		img = cimg_library::CImg(x, y, z);
+	}*/
+
+	image(unsigned int x, unsigned int y = 1, unsigned int z = 1, unsigned int c = 1){
+		img = cimg_library::CImg(x, y, z, c);
 	}
 
 	//Load an image from a file
@@ -90,6 +94,23 @@
 
 	}
 
+	/// Copy the given data to the specified channel
+
+	/// @param c is the channel number that the data will be copied to
+	/// @param buffer is a pointer to the image to be copied to channel c
+
+	void set_channel(unsigned int c, T* buffer){
+
+		//calculate the number of pixels in a channel
+		unsigned int channel_size = width() * height();
+
+		//retrieve a pointer to the raw image data
+		T* ptr = img.data() + channel_size * c;
+
+		//copy the buffer to the specified channel
+		memcpy(ptr, buffer, sizeof(T) * channel_size);
+	}
+
 	image getslice(unsigned int c){
 
 		//create a new image
@@ -228,6 +249,18 @@
 
 	}
 
+	image rotate(float angle, float cx, float cy){
+
+		image result;
+		float zoom = 1;
+		unsigned int interpolation = 1;
+		unsigned int boundary = 1;
+		result.img = img.get_rotate(angle, cx, cy, zoom, interpolation, boundary);
+		//result.save("data_output/test_rotate_neum.bmp");
+
+		return result;
+	}
+
 };
 
 };	//end namespace stim
diff --git a/stim/image/image_contour_detection.h b/stim/image/image_contour_detection.h
index 2db9174..055d06a 100644
--- a/stim/image/image_contour_detection.h
+++ b/stim/image/image_contour_detection.h
@@ -1,4 +1,16 @@
-stim::image gaussian_derivative_filter_odd(stim::image image, int r, unsigned int sigma_n, float theta);
-stim::image func_mPb_theta(stim::image img, float theta, int* r, float* alpha, int s);
-stim::image func_mPb(stim::image img, unsigned int theta_n, int* r, float* alpha, int s);
\ No newline at end of file
+//stim::image gaussian_derivative_filter_odd(stim::image image, int r, unsigned int sigma_n, float theta);
+//stim::image func_mPb_theta(stim::image img, float theta, int* r, float* alpha, int s);
+//stim::image func_mPb(stim::image img, unsigned int theta_n, int* r, float* alpha, int s);
+
+stim::image Gd1(stim::image image, int r, unsigned int sigma_n);
+stim::image Gd2(stim::image image, int r, unsigned int sigma_n);
+stim::image Gd_odd(stim::image image, int r, unsigned int sigma_n, float theta);
+stim::image Gd_even(stim::image image, int r, unsigned int sigma_n, float theta);
+stim::image Gd_center(stim::image image, int r, unsigned int sigma_n);
+
+stim::image textons(stim::image image, unsigned int theta_n);
+stim::image kmeans(stim::image textons, unsigned int K);
+stim::image Pb(stim::image image, int r, unsigned int sigma_n);
+stim::image cPb(stim::image img, int* r, float* alpha, int s);
+stim::image tPb(stim::image img, int* r, float* alpha, unsigned int theta_n, unsigned int bin_n, int s, unsigned int K);
\ No newline at end of file
-- 
libgit2 0.21.4
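
Usage note (not part of the patch): a minimal host-side sketch of how the new cpu_* wrappers introduced above might be driven. It assumes the stim headers are on the include path and pull in their own CUDA/error-handling dependencies; the image size, disc radius, bin count, bin size, and orientation below are illustrative values, not taken from the patch.

// chi_grad_example.cu -- illustrative sketch only; build with nvcc
#include <vector>
#include <stim/cuda/arraymath.cuh>              // cpu_atan, cpu_sin, cpu_cos, cpu_divide, cpu_multiply
#include <stim/cuda/templates/chi_gradient.cuh> // cpu_chi_grad

int main(){
	unsigned int w = 481, h = 321;              // assumed image size (typical BSDS500 frame)
	unsigned int N = w * h;

	std::vector<float> img(N, 0.5f);            // placeholder intensity data
	std::vector<float> angle(N), grad(N);

	// element-wise math: each cpu_* wrapper allocates GPU buffers,
	// copies the input to the device, launches the kernel, and copies the result back
	stim::cuda::cpu_atan(img.data(), angle.data(), N);
	stim::cuda::cpu_sin(angle.data(), grad.data(), N);

	// chi-squared gradient at one orientation; r, bin_n, bin_size, and theta are assumptions
	int r = 5;                                  // half-disc radius in pixels
	unsigned int bin_n = 16, bin_size = 16;     // number of histogram bins and intensity range per bin
	float theta = 45.0f;                        // orientation of the dividing diameter, in degrees
	stim::cuda::cpu_chi_grad(img.data(), grad.data(), w, h, r, bin_n, bin_size, theta);

	return 0;
}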