CHECKPOINT: before the swap of globj for glnetwork in the use of segmentation.

Pavel Govyadinov
2 parents 1306fd96 91d8912e
Showing 24 changed files with 1371 additions and 53 deletions Show diff stats
stim/cuda/arraymath.cuh
stim/cuda/arraymath/array_add.cuh
stim/cuda/arraymath/array_atan.cuh
stim/cuda/arraymath/array_cos.cuh
stim/cuda/arraymath/array_divide.cuh
stim/cuda/arraymath/array_multiply2.cuh
stim/cuda/arraymath/array_sin.cuh
stim/cuda/bsds500/cPb.cpp
stim/cuda/bsds500/dG1_conv2.cpp
stim/cuda/bsds500/dG1_theta_conv2.cpp
stim/cuda/bsds500/dG2_conv2.cpp
stim/cuda/bsds500/dG_d2x_theta_conv2.cpp
stim/cuda/bsds500/kmeans.cpp
stim/cuda/bsds500/laplacian_conv2.cpp
stim/cuda/bsds500/tPb.cpp
stim/cuda/bsds500/textons.cpp
stim/cuda/cudatools/devices.h
stim/cuda/ivote/update_dir.cuh
stim/cuda/templates/chi_gradient.cuh
stim/cuda/templates/conv2.cuh
@@ -3,6 +3,11 @@
  
 #include <stim/cuda/arraymath/array_add.cuh>
 #include <stim/cuda/arraymath/array_multiply.cuh>
+#include <stim/cuda/arraymath/array_multiply2.cuh>
+#include <stim/cuda/arraymath/array_divide.cuh>
+#include <stim/cuda/arraymath/array_cos.cuh>
+#include <stim/cuda/arraymath/array_sin.cuh>
+#include <stim/cuda/arraymath/array_atan.cuh>
 #include <stim/cuda/arraymath/array_abs.cuh>
 #include <stim/cuda/arraymath/array_cart2polar.cuh>
  
@@ -3,6 +3,7 @@
  
 #include <iostream>
 #include <cuda.h>
+//#include <cmath>
 #include <stim/cuda/cudatools.h>
  
 namespace stim{
@@ -27,7 +28,7 @@ namespace stim{
 			int threads = stim::maxThreadsPerBlock();
  
 			//calculate the number of blocks
-			int blocks = N / threads + (N%threads == 0 ? 0:1);
+			int blocks = N / threads + 1;
  
 			//call the kernel to do the multiplication
 			cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N);
+#ifndef STIM_CUDA_ARRAY_ATAN_H
+#define STIM_CUDA_ARRAY_ATAN_H
+
+#include <iostream>
+#include <cuda.h>
+#include <cmath>
+#include <stim/cuda/cudatools.h>
+
+namespace stim{
+	namespace cuda{
+
+		/// Kernel that writes the element-wise arctangent of ptr1 into out.
+		/// Each thread handles at most one element; threads whose index is >= N do nothing.
+
+		/// @param ptr1 is the input array (dereferenced on the device)
+		/// @param out is the output array (dereferenced on the device)
+		/// @param N is the number of elements in each array
+		template<typename T>
+		__global__ void cuda_atan(T* ptr1, T* out, unsigned int N){
+
+			//calculate the 1D index for this thread (unsigned, to match the type of N)
+			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+			if(idx < N){
+				out[idx] = atan(ptr1[idx]);
+			}
+
+		}
+
+		/// Launches cuda_atan on arrays that already reside on the GPU.
+
+		/// @param ptr1 is the input array on the GPU
+		/// @param out is the output array on the GPU
+		/// @param N is the number of elements in each array
+		template<typename T>
+		void gpu_atan(T* ptr1, T* out, unsigned int N){
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks (one extra block is launched when N is an exact
+			//multiple of threads; the bounds check in the kernel makes that block a no-op)
+			int blocks = N / threads + 1;
+
+			//call the kernel to compute the arctangent
+			cuda_atan <<< blocks, threads >>>(ptr1, out, N);
+
+			//a kernel launch returns no status, so ask the runtime whether the launch was accepted
+			HANDLE_ERROR( cudaGetLastError() );
+
+		}
+
+		/// Computes the element-wise arctangent of a host array on the GPU.
+		/// Temporary device buffers are allocated, used and released inside this call.
+
+		/// @param ptr1 is the input array in host memory
+		/// @param cpu_out is the host array that receives the N results
+		/// @param N is the number of elements in each array
+		template<typename T>
+		void cpu_atan(T* ptr1, T* cpu_out, unsigned int N){
+
+			//number of bytes occupied by each array
+			size_t bytes = N * sizeof(T);
+
+			//allocate memory on the GPU for the input and output arrays
+			T* gpu_ptr1; 
+			T* gpu_out;
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, bytes ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_out, bytes ) );
+
+			//copy the input array to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, bytes, cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_atan<T>(gpu_ptr1 ,gpu_out, N);
+
+			//copy the result back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, bytes, cudaMemcpyDeviceToHost) );
+
+			//free allocated memory (checked, so that a failure is not silently carried into later calls)
+			HANDLE_ERROR( cudaFree(gpu_ptr1) );
+			HANDLE_ERROR( cudaFree(gpu_out) );
+
+		}
+		
+	}
+}
+
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_ARRAY_COS_H
+#define STIM_CUDA_ARRAY_COS_H
+
+#include <iostream>
+#include <cuda.h>
+#include <cmath>
+#include <stim/cuda/cudatools.h>
+
+namespace stim{
+	namespace cuda{
+
+		/// Kernel that writes the element-wise cosine of ptr1 into out.
+		/// Each thread handles at most one element; threads whose index is >= N do nothing.
+
+		/// @param ptr1 is the input array (dereferenced on the device)
+		/// @param out is the output array (dereferenced on the device)
+		/// @param N is the number of elements in each array
+		template<typename T>
+		__global__ void cuda_cos(T* ptr1, T* out, unsigned int N){
+
+			//calculate the 1D index for this thread (unsigned, to match the type of N)
+			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+			if(idx < N){
+				out[idx] = cos(ptr1[idx]);
+			}
+
+		}
+
+		/// Launches cuda_cos on arrays that already reside on the GPU.
+
+		/// @param ptr1 is the input array on the GPU
+		/// @param out is the output array on the GPU
+		/// @param N is the number of elements in each array
+		template<typename T>
+		void gpu_cos(T* ptr1, T* out, unsigned int N){
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks (one extra block is launched when N is an exact
+			//multiple of threads; the bounds check in the kernel makes that block a no-op)
+			int blocks = N / threads + 1;
+
+			//call the kernel to compute the cosine
+			cuda_cos <<< blocks, threads >>>(ptr1, out, N);
+
+			//a kernel launch returns no status, so ask the runtime whether the launch was accepted
+			HANDLE_ERROR( cudaGetLastError() );
+
+		}
+
+		/// Computes the element-wise cosine of a host array on the GPU.
+		/// Temporary device buffers are allocated, used and released inside this call.
+
+		/// @param ptr1 is the input array in host memory
+		/// @param cpu_out is the host array that receives the N results
+		/// @param N is the number of elements in each array
+		template<typename T>
+		void cpu_cos(T* ptr1, T* cpu_out, unsigned int N){
+
+			//number of bytes occupied by each array
+			size_t bytes = N * sizeof(T);
+
+			//allocate memory on the GPU for the input and output arrays
+			T* gpu_ptr1; 
+			T* gpu_out;
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, bytes ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_out, bytes ) );
+
+			//copy the input array to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, bytes, cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_cos<T>(gpu_ptr1 ,gpu_out, N);
+
+			//copy the result back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, bytes, cudaMemcpyDeviceToHost) );
+
+			//free allocated memory (checked, so that a failure is not silently carried into later calls)
+			HANDLE_ERROR( cudaFree(gpu_ptr1) );
+			HANDLE_ERROR( cudaFree(gpu_out) );
+
+		}
+		
+	}
+}
+
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_ARRAY_DIVIDE_H
+#define STIM_CUDA_ARRAY_DIVIDE_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void cuda_divide(T* ptr1, T* ptr2, T* quotient, unsigned int N){
+
+			//calculate the 1D index for this thread
+			int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+			if(idx < N){
+				quotient[idx] = ptr1[idx] / ptr2[idx];
+			}
+
+		}
+
+		template<typename T>
+		void gpu_divide(T* ptr1, T* ptr2, T* quotient, unsigned int N){
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks
+			int blocks = N / threads + 1;
+
+			//call the kernel to do the multiplication
+			cuda_divide <<< blocks, threads >>>(ptr1, ptr2, quotient, N);
+
+		}
+
+		template<typename T>
+		void cpu_divide(T* ptr1, T* ptr2, T* cpu_quotient, unsigned int N){
+
+			//allocate memory on the GPU for the array
+			T* gpu_ptr1; 
+			T* gpu_ptr2; 
+			T* gpu_quotient;
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_quotient, N * sizeof(T) ) );
+
+			//copy the array to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_divide<T>(gpu_ptr1, gpu_ptr2 ,gpu_quotient, N);
+
+			//copy the array back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_quotient, gpu_quotient, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+			//free allocated memory
+			cudaFree(gpu_ptr1);
+			cudaFree(gpu_ptr2);
+			cudaFree(gpu_quotient);
+
+		}
+		
+	}
+}
+
+
+
+#endif
 \ No newline at end of file
+// The include guard must be unique to this header: if it were shared with another
+// header (such as the scalar array_multiply header), whichever of the two is
+// included second would be skipped entirely by the preprocessor.
+#ifndef STIM_CUDA_ARRAY_MULTIPLY2_H
+#define STIM_CUDA_ARRAY_MULTIPLY2_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+
+namespace stim{
+	namespace cuda{
+
+		/// Kernel that writes the element-wise product ptr1[i] * ptr2[i] into product[i].
+		/// Each thread handles at most one element; threads whose index is >= N do nothing.
+
+		/// @param ptr1 is the first factor array (dereferenced on the device)
+		/// @param ptr2 is the second factor array (dereferenced on the device)
+		/// @param product is the output array (dereferenced on the device)
+		/// @param N is the number of elements in each array
+		template<typename T>
+		__global__ void cuda_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){
+
+			//calculate the 1D index for this thread (unsigned, to match the type of N)
+			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+			if(idx < N){
+				product[idx] = ptr1[idx] * ptr2[idx];
+			}
+
+		}
+
+		/// Launches the element-wise multiplication kernel on arrays that already reside on the GPU.
+
+		/// @param ptr1 is the first factor array on the GPU
+		/// @param ptr2 is the second factor array on the GPU
+		/// @param product is the output array on the GPU
+		/// @param N is the number of elements in each array
+		template<typename T>
+		void gpu_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks (one extra block is launched when N is an exact
+			//multiple of threads; the bounds check in the kernel makes that block a no-op)
+			int blocks = N / threads + 1;
+
+			//call the kernel to do the multiplication
+			cuda_multiply <<< blocks, threads >>>(ptr1, ptr2, product, N);
+
+			//a kernel launch returns no status, so ask the runtime whether the launch was accepted
+			HANDLE_ERROR( cudaGetLastError() );
+
+		}
+
+		/// Multiplies two host arrays element by element on the GPU.
+		/// Temporary device buffers are allocated, used and released inside this call.
+
+		/// @param ptr1 is the first factor array in host memory
+		/// @param ptr2 is the second factor array in host memory
+		/// @param cpu_product is the host array that receives the N products
+		/// @param N is the number of elements in each array
+		template<typename T>
+		void cpu_multiply(T* ptr1, T* ptr2, T* cpu_product, unsigned int N){
+
+			//number of bytes occupied by each array
+			size_t bytes = N * sizeof(T);
+
+			//allocate memory on the GPU for the arrays
+			T* gpu_ptr1; 
+			T* gpu_ptr2; 
+			T* gpu_product;
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, bytes ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, bytes ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_product, bytes ) );
+
+			//copy the input arrays to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, bytes, cudaMemcpyHostToDevice) );
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, bytes, cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_multiply<T>(gpu_ptr1, gpu_ptr2 ,gpu_product, N);
+
+			//copy the result back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_product, gpu_product, bytes, cudaMemcpyDeviceToHost) );
+
+			//free allocated memory (checked, so that a failure is not silently carried into later calls)
+			HANDLE_ERROR( cudaFree(gpu_ptr1) );
+			HANDLE_ERROR( cudaFree(gpu_ptr2) );
+			HANDLE_ERROR( cudaFree(gpu_product) );
+
+		}
+		
+	}
+}
+
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_ARRAY_SIN_H
+#define STIM_CUDA_ARRAY_SIN_H
+
+#include <iostream>
+#include <cuda.h>
+#include <cmath>
+#include <stim/cuda/cudatools.h>
+
+namespace stim{
+	namespace cuda{
+
+		/// Kernel that writes the element-wise sine of ptr1 into out.
+		/// Each thread handles at most one element; threads whose index is >= N do nothing.
+
+		/// @param ptr1 is the input array (dereferenced on the device)
+		/// @param out is the output array (dereferenced on the device)
+		/// @param N is the number of elements in each array
+		template<typename T>
+		__global__ void cuda_sin(T* ptr1, T* out, unsigned int N){
+
+			//calculate the 1D index for this thread (unsigned, to match the type of N)
+			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+			if(idx < N){
+				out[idx] = sin(ptr1[idx]);
+			}
+
+		}
+
+		/// Launches cuda_sin on arrays that already reside on the GPU.
+
+		/// @param ptr1 is the input array on the GPU
+		/// @param out is the output array on the GPU
+		/// @param N is the number of elements in each array
+		template<typename T>
+		void gpu_sin(T* ptr1, T* out, unsigned int N){
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks (one extra block is launched when N is an exact
+			//multiple of threads; the bounds check in the kernel makes that block a no-op)
+			int blocks = N / threads + 1;
+
+			//call the kernel to compute the sine
+			cuda_sin <<< blocks, threads >>>(ptr1, out, N);
+
+			//a kernel launch returns no status, so ask the runtime whether the launch was accepted
+			HANDLE_ERROR( cudaGetLastError() );
+
+		}
+
+		/// Computes the element-wise sine of a host array on the GPU.
+		/// Temporary device buffers are allocated, used and released inside this call.
+
+		/// @param ptr1 is the input array in host memory
+		/// @param cpu_out is the host array that receives the N results
+		/// @param N is the number of elements in each array
+		template<typename T>
+		void cpu_sin(T* ptr1, T* cpu_out, unsigned int N){
+
+			//number of bytes occupied by each array
+			size_t bytes = N * sizeof(T);
+
+			//allocate memory on the GPU for the input and output arrays
+			T* gpu_ptr1; 
+			T* gpu_out;
+			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, bytes ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_out, bytes ) );
+
+			//copy the input array to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, bytes, cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_sin<T>(gpu_ptr1 ,gpu_out, N);
+
+			//copy the result back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, bytes, cudaMemcpyDeviceToHost) );
+
+			//free allocated memory (checked, so that a failure is not silently carried into later calls)
+			HANDLE_ERROR( cudaFree(gpu_ptr1) );
+			HANDLE_ERROR( cudaFree(gpu_out) );
+
+		}
+		
+	}
+}
+
+
+
+#endif
 \ No newline at end of file
+#include <stim/image/image.h>
+#include <cmath>
+#include <stim/visualization/colormap.h>
+#include <stim/image/image_contour_detection.h>
+#include <sstream>
+
+
+void array_multiply(float* lhs, float rhs, unsigned int N);
+void array_add(float* ptr1, float* ptr2, float* sum, unsigned int N);
+
+/// This function evaluates the cPb given a multi-channel image.
+/// For every (channel, scale) pair the result of Pb() is weighted and accumulated,
+/// and the sum is divided by its maximum value.
+
+/// @param img is the multi-channel image
+/// @param r is an array of radii for different scaled discs(filters), read as r[channel * s + scale]
+/// @param alpha is an array of weights for different scaled discs(filters), read as alpha[channel * s + scale]
+/// @param s is the number of scales
+
+stim::image<float> cPb(stim::image<float> img, int* r, float* alpha, int s){
+
+	unsigned int w = img.width();              // get the width of picture
+	unsigned int h = img.height();             // get the height of picture
+	unsigned int c = img.channels();		   // get the channels of picture
+
+
+	stim::image<float> cPb(w, h, 1);               // allocate space for cPb
+	unsigned size = cPb.size();                    // get the size of cPb
+	memset ( cPb.data(), 0, size * sizeof(float)); // initialize all the pixels of cPb to 0
+
+
+	unsigned int N = w * h;						// get the number of pixels
+	int sigma_n = 3; 							// set the number of standard deviations used to define the sigma
+
+	stim::image<float> temp;                    // temporary image holding the gradient of one slice
+
+	for (unsigned int i = 0; i < c; i++){
+		for (int j = 0; j < s; j++){
+
+			unsigned int k = i*s + j;            // position of this (channel, scale) pair in r and alpha
+		    
+			// get the gaussian gradient by convolving each image slice with the mask
+			temp = Pb(img.channel(i), r[k], sigma_n);
+
+			// multiply each gaussian gradient with its weight
+			array_multiply(temp.data(), alpha[k], N);
+
+			// add up all the weighted gaussian gradients
+			array_add(cPb.data(), temp.data(), cPb.data(), N);
+
+		}	
+	}
+
+	float max = cPb.maxv();						// get the maximum of cPb used for normalization
+
+	// normalize the cPb; a maximum of 0 is skipped, because dividing by it
+	// would turn every pixel into NaN instead of leaving the image unchanged
+	if (max != 0){
+		array_multiply(cPb.data(), 1/max, N);
+	}
+
+	// output the test result of cPb (optional) 
+	//stim::cpu2image(cPb.data(),  "data_output/cPb_0916.bmp", w, h, stim::cmBrewer);
+
+	return cPb;
+}
+#include <stim/image/image.h>
+//#include <cmath>
+#include <stim/visualization/colormap.h>
+#include <stim/image/image_contour_detection.h>
+
+/// This function builds the separable first-order gaussian derivative filters gx and gy,
+/// passes a copy of the image through conv2_sep with each of them,
+/// and returns a two-channel image whose channel(0) is Ix and channel(1) is Iy
+
+/// @param image is the one-channel image
+/// @param r is the radius of the disc(filter); each 1D filter has 2 * r + 1 samples
+/// @param sigma_n is the number of standard deviations used to define the sigma (sigma = r / sigma_n)
+
+void conv2_sep(float* img, unsigned int x, unsigned int y, float* kernel0, unsigned int k0, float* kernel1, unsigned int k1);
+//void array_abs(float* img, unsigned int N);
+
+stim::image<float> Gd1(stim::image<float> image, int r, unsigned int sigma_n){
+
+	// picture dimensions
+	unsigned int w = image.width();
+	unsigned int h = image.height();
+
+	// filter length and the sigma used in the gaussian function
+	int winsize = 2 * r + 1;
+	float sigma  = float(r)/float(sigma_n);
+
+	stim::image<float> I(w, h, 1, 2);      // two-channel image that is returned
+	stim::image<float> Ix(w, h);
+	stim::image<float> Iy(w, h);
+	Ix = image;  // each buffer starts as a copy of the input and is handed to conv2_sep below
+	Iy = image;
+
+	// 1D factors of the two separable filters
+	float* gx_along_x = new float[winsize];  // derivative factor of gx
+	float* gx_along_y = new float[winsize];  // smoothing factor of gx
+	float* gy_along_x = new float[winsize];  // smoothing factor of gy
+	float* gy_along_y = new float[winsize];  // derivative factor of gy
+
+	for (int i = 0; i < winsize; i++){
+
+		int d = i - r;          // signed offset from the centre of the window
+
+		// gaussian envelope at this offset
+		double gauss = exp((-1)*(pow(d, 2))/(2*pow(sigma, 2)));
+
+		gx_along_x[i] = (-1) * d * gauss;
+		gx_along_y[i] = gauss;
+		gy_along_x[i] = gauss;
+		gy_along_y[i] = (-1) * d * gauss;
+	}
+
+	// get Ix from the gx filter pair
+	conv2_sep(Ix.data(), w, h, gx_along_x, winsize, gx_along_y, winsize);
+
+	// get Iy from the gy filter pair
+	conv2_sep(Iy.data(), w, h, gy_along_x, winsize, gy_along_y, winsize);
+
+	// the filter arrays are no longer needed
+	delete [] gx_along_x;
+	delete [] gx_along_y;
+	delete [] gy_along_x;
+	delete [] gy_along_y;
+
+	// pack both results into the returned image
+	I.set_channel(0, Ix.data());
+	I.set_channel(1, Iy.data());
+
+	return I;
+
+}
 \ No newline at end of file
+#include <stim/image/image.h>
+#include <cmath>
+#include <stim/visualization/colormap.h>
+#include <stim/image/image_contour_detection.h>
+
+#define PI 3.1415926
+
+void array_multiply(float* lhs, float rhs, unsigned int N);
+void array_add(float* ptr1, float* ptr2, float* sum, unsigned int N);
+void array_abs(float* img, unsigned int N);
+
+/// This function evaluates the theta-dependent odd-symmetric gaussian derivative gradient of a one-channel image:
+/// the absolute value of Ix * cos(theta) + Iy * sin(theta), where Ix and Iy come from Gd1
+
+/// @param image is the one-channel image
+/// @param r is the radius of the disc(filter), forwarded to Gd1
+/// @param sigma_n is the number of standard deviations used to define the sigma, forwarded to Gd1
+/// @param theta is the angle used for computing the gradient, in degrees
+
+stim::image<float> Gd_odd(stim::image<float> image, int r, unsigned int sigma_n, float theta){
+
+	// the trigonometric functions below expect radians
+	float theta_r = (theta * PI)/180;
+
+	unsigned int w = image.width();
+	unsigned int h = image.height();
+	unsigned N = w * h;                 // number of pixels of picture
+
+	stim::image<float> I(w, h, 1, 2);              // receives the two-channel result of Gd1
+	stim::image<float> Ix(w, h);
+	stim::image<float> Iy(w, h);
+	stim::image<float> Gd_odd_theta(w, h);         // the image that is returned
+
+	// calculate the Ix, Iy
+	I = Gd1(image, r, sigma_n);
+	Ix = I.channel(0);
+	Iy = I.channel(1);
+
+	// weights of the two components along the requested direction
+	float weight_x = cos(theta_r);
+	float weight_y = sin(theta_r);
+
+	array_multiply(Ix.data(), weight_x, N);                     // Ix = Ix*cos(theta_r)
+	array_multiply(Iy.data(), weight_y, N);                     // Iy = Iy*sin(theta_r)
+	array_add(Ix.data(), Iy.data(), Gd_odd_theta.data(), N);    // Gd_odd_theta = Ix + Iy
+	array_abs(Gd_odd_theta.data(), N);
+
+	return Gd_odd_theta;
+
+}
+#include <stim/image/image.h>
+//#include <cmath>
+#include <stim/visualization/colormap.h>
+#include <stim/image/image_contour_detection.h>
+
+/// This function generates the separable second-order gaussian derivative filters gxx and gyy,
+/// passes a copy of the image through conv2_sep with each of them,
+/// and returns a two-channel image whose channel(0) is Ixx and channel(1) is Iyy
+
+/// The derivative factor is (-1) * (1 - x^2/sigma^2) * exp(-x^2 / (2*sigma^2)), which is
+/// proportional to the second derivative of the gaussian for any sigma. Without the
+/// division by sigma^2 the factor would only have that shape when sigma is 1.
+
+/// @param image is the one-channel image
+/// @param r is the radius of the disc(filter); each 1D filter has 2 * r + 1 samples
+/// @param sigma_n is the number of standard deviations used to define the sigma (sigma = r / sigma_n)
+
+void conv2_sep(float* img, unsigned int x, unsigned int y, float* kernel0, unsigned int k0, float* kernel1, unsigned int k1);
+//void array_abs(float* img, unsigned int N);
+
+stim::image<float> Gd2(stim::image<float> image, int r, unsigned int sigma_n){
+
+	unsigned int w = image.width();    // get the width of picture
+	unsigned int h = image.height();   // get the height of picture
+	int winsize = 2 * r + 1;           // set the window size of disc(filter)
+	float sigma  = float(r)/float(sigma_n); // calculate the sigma used in gaussian function
+
+	stim::image<float> I(w, h, 1, 2);      // allocate space for return image class
+	stim::image<float> Ixx(w, h);      // allocate space for Ixx
+	stim::image<float> Iyy(w, h);      // allocate space for Iyy
+	Ixx = image;  // initialize Ixx
+	Iyy = image;  // initialize Iyy
+
+	float* array_x1 = new float[winsize];  //1D x-oriented gaussian derivative filter for gxx
+	float* array_y1 = new float[winsize];  //1D y-oriented gaussian filter for gxx
+	float* array_x2 = new float[winsize];  //1D x-oriented gaussian filter for gyy
+	float* array_y2 = new float[winsize];  //1D y-oriented gaussian derivative filter for gyy
+
+
+	for (int i = 0; i < winsize; i++){	
+
+		int d = i - r;          //signed offset from the centre of the window (same range for x and y)
+
+		// gaussian envelope at this offset
+		double gauss = exp((-1)*(pow(d, 2))/(2*pow(sigma, 2)));
+
+		// second-derivative factor, with the offset measured in units of sigma
+		double deriv = (-1) * (1 - pow(d, 2)/pow(sigma, 2)) * gauss;
+
+		array_x1[i] = deriv;
+		array_y1[i] = gauss;
+		array_x2[i] = gauss;
+		array_y2[i] = deriv;
+	}
+
+	//stim::cpu2image(array_x1, "data_output/array_x1_0915.bmp", winsize, 1, stim::cmBrewer); // (optional) show the mask result
+	//stim::cpu2image(array_y1, "data_output/array_y1_0915.bmp", winsize, 1, stim::cmBrewer); // (optional) show the mask result
+	//stim::cpu2image(array_x2, "data_output/array_x2_0915.bmp", winsize, 1, stim::cmBrewer); // (optional) show the mask result
+	//stim::cpu2image(array_y2, "data_output/array_y2_0915.bmp", winsize, 1, stim::cmBrewer); // (optional) show the mask result
+
+	// get Ixx by convolving the image with gxx
+	conv2_sep(Ixx.data(), w, h, array_x1, winsize, array_y1, winsize);
+	
+	//stim::cpu2image(Ixx.data(), "data_output/Ixx_0915.bmp", w, h, stim::cmBrewer); 
+	// get Iyy by convolving the image with gyy
+	conv2_sep(Iyy.data(), w, h, array_x2, winsize, array_y2, winsize);
+	
+	//stim::cpu2image(Iyy.data(), "data_output/Iyy_0915.bmp", w, h, stim::cmBrewer); 
+
+	delete [] array_x1;            //free the memory of array_x1
+	delete [] array_y1;            //free the memory of array_y1
+	delete [] array_x2;            //free the memory of array_x2
+	delete [] array_y2;            //free the memory of array_y2
+
+	I.set_channel(0, Ixx.data());
+	I.set_channel(1, Iyy.data());
+
+	return I;
+
+}
 \ No newline at end of file
+#include <stim/image/image.h>
+#include <cmath>
+#include <stim/visualization/colormap.h>
+#include <stim/image/image_contour_detection.h>
+
+/// This function evaluates the theta-dependent even-symmetric gaussian derivative gradient of a one-channel image.
+/// A 2D x-oriented second-derivative mask is built, rotated by theta, convolved with the image,
+/// and the absolute value of the response is returned.
+
+/// The mask is (-1) * (1 - x^2/sigma^2) * exp(-x^2 / (2*sigma^2)) * exp(-y^2 / (2*sigma^2)), which is
+/// proportional to the second x-derivative of the 2D gaussian for any sigma. Without the
+/// division by sigma^2 the mask would only have that shape when sigma is 1.
+
+/// @param image is the one-channel image
+/// @param r is the radius of the disc(filter); the mask is (2 * r + 1) x (2 * r + 1)
+/// @param sigma_n is the number of standard deviations used to define the sigma (sigma = r / sigma_n)
+/// @param theta is the angle handed to the mask rotation (presumably degrees, as in the commented-out rotate(45, ...) call -- TODO confirm)
+
+void conv2(float* img, float* mask, float* cpu_copy, unsigned int w, unsigned int h, unsigned int M);
+void array_abs(float* img, unsigned int N);
+
+stim::image<float> Gd_even(stim::image<float> image, int r, unsigned int sigma_n, float theta){
+
+	unsigned int w = image.width();    // get the width of picture
+	unsigned int h = image.height();   // get the height of picture
+	unsigned N = w * h;				   // get the number of pixels of picture
+	int winsize = 2 * r + 1;           // set the window size of disc(filter)
+	float sigma  = float(r)/float(sigma_n); // calculate the sigma used in gaussian function
+
+	stim::image<float> Gd_even_theta(w, h);      // allocate space for Gd_even_theta
+	stim::image<float> mask_x(winsize, winsize);    // allocate space for x-axis-oriented filter
+	stim::image<float> mask_r(winsize, winsize);    // allocate space for theta-oriented filter
+
+	float* mask = mask_x.data();       // row-major view of the x-oriented mask
+
+	for (int j = 0; j < winsize; j++){
+
+		int y = j - r;          //range of y
+
+		// gaussian envelope along y (constant along a row)
+		double gauss_y = exp((-1)*(pow(y, 2))/(2*pow(sigma, 2)));
+
+		for (int i = 0; i< winsize; i++){
+
+			int x = i - r;          //range of x
+
+			// second-derivative factor along x, with the offset measured in units of sigma
+			double deriv_x = (-1) * (1 - pow(x, 2)/pow(sigma, 2)) * exp((-1)*(pow(x, 2))/(2*pow(sigma, 2)));
+
+			// create the x-oriented gaussian derivative filter mask_x
+			mask[j*winsize + i] = deriv_x * gauss_y;
+		
+		}
+	}
+
+	mask_r = mask_x.rotate(theta, r, r);
+	//mask_r = mask_x.rotate(45, r, r);
+	//stim::cpu2image(mask_r.data(), "data_output/mask_r_0919.bmp", winsize, winsize, stim::cmBrewer);  
+
+	// do the 2D convolution with image and mask
+	conv2(image.data(), mask_r.data(), Gd_even_theta.data(), w, h, winsize);
+	array_abs(Gd_even_theta.data(), N);
+
+	//stim::cpu2image(Gd_even_theta.data(), "data_output/Gd_even_theta_0919.bmp", w, h, stim::cmGrayscale); 
+	
+	return Gd_even_theta;
+}
 \ No newline at end of file
+#include <stim/image/image.h>
+//#include <cmath>
+#include <stim/visualization/colormap.h>
+#include <stim/image/image_contour_detection.h>
+#include <opencv2/opencv.hpp>
+#include <iostream>
+
+/// This function uses cv::kmeans to cluster the given textons.
+/// Every pixel is one sample whose features are its values in all channels;
+/// the returned single-channel image holds the cluster label of each pixel as a float.
+
+/// @param textons is a multi-channel image
+/// @param K is the number of clusters
+
+stim::image<float> kmeans(stim::image<float> textons, unsigned int K){
+
+	unsigned int w = textons.width();              // get the width of picture
+	unsigned int h = textons.height();             // get the height of picture
+	unsigned int feature_n = textons.channels();   // get the spectrum of picture
+	unsigned int N = w * h;						   // get the number of pixels
+
+	float* sample1 = (float*) malloc(sizeof(float) * N * feature_n);  //allocate the space for textons
+
+	//rearrange the multi-channel texton image so that the features of one pixel are contiguous
+	for(unsigned int c = 0; c < feature_n; c++){
+
+		stim::image<float> temp;
+		temp = textons.channel(c);
+		float* src = temp.data();
+
+		for(unsigned int j = 0; j < N; j++){
+
+			sample1[c + j * feature_n] = src[j];
+		}
+	}
+	
+	
+	cv::Mat sample2(N, feature_n, CV_32F, sample1); //wrap the buffer in a cv::Mat header (no copy is made, sample1 stays owned here)
+
+	//(optional) show the test result
+	//imshow("sample2", sample2);
+
+
+	cv::TermCriteria criteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 10, 0.1);	// set stop-criteria for kmeans iteration
+	cv::Mat labels(N, 1, CV_32S, cvScalarAll(0));							// allocate space for kmeans output (one int label per sample, read below with at<int>)
+	cv::Mat centers;														// allocate space for kmeans output
+
+	unsigned int test_times = 2;  // set the number of times of trying kmeans, it will return the best result
+
+	cv::kmeans(sample2, K, labels, criteria, test_times, cv::KMEANS_PP_CENTERS, centers); // kmeans clustering
+
+	//the samples are not read after clustering; release the buffer, which sample2 does not own
+	free(sample1);
+
+	//(optional) show the test result
+	//imwrite( "data_output/labels_1D.bmp", labels);
+
+	stim::image<float> texture(w, h, 1, 1);						// allocate space for texture
+	float* dst = texture.data();
+	 
+	for(unsigned int i = 0; i < N; i++){                        // reshape the labels from 1D array to image
+
+		dst[i] = labels.at<int>(i);
+	
+	}
+
+	//texture.save("data_output/kmeans_test0924_2.bmp");
+
+	//(optional) show the test result
+	//stim::cpu2image(texture.data(), "data_output/kmeans_test.bmp", w, h, stim::cmBrewer);  
+
+	return texture;
+
+}
 \ No newline at end of file
+#include <stim/image/image.h>
+#include <cmath>
+#include <stim/visualization/colormap.h>
+#include <stim/image/image_contour_detection.h>
+
+#define PI 3.1415926
+
+void array_multiply(float* lhs, float rhs, unsigned int N);
+void array_add(float* ptr1, float* ptr2, float* sum, unsigned int N);
+void array_abs(float* img, unsigned int N);
+
+/// This function evaluates the center-surround (Laplacian of Gaussian) gaussian derivative gradient of a one-channel image:
+/// the absolute value of Ixx + Iyy, where Ixx and Iyy come from Gd2
+
+/// @param image is the one-channel image
+/// @param r is the radius of the disc(filter), forwarded to Gd2
+/// @param sigma_n is the number of standard deviations used to define the sigma, forwarded to Gd2
+
+stim::image<float> Gd_center(stim::image<float> image, int r, unsigned int sigma_n){
+
+	unsigned int w = image.width();
+	unsigned int h = image.height();
+	unsigned N = w * h;                 // number of pixels of picture
+
+	stim::image<float> I(w, h, 1, 2);          // receives the two-channel result of Gd2
+	stim::image<float> Ixx(w, h);
+	stim::image<float> Iyy(w, h);
+	stim::image<float> Gd_center(w, h);        // the image that is returned
+
+	// calculate the Ixx, Iyy
+	I = Gd2(image, r, sigma_n);
+	Ixx = I.channel(0);
+	Iyy = I.channel(1);
+
+	float* out = Gd_center.data();
+
+	array_add(Ixx.data(), Iyy.data(), out, N);     // Gd_center = Ixx + Iyy
+	array_abs(out, N);
+
+	return Gd_center;
+
+}
+#include <stim/image/image.h>
+#include <cmath>
+#include <stim/visualization/colormap.h>
+#include <stim/image/image_contour_detection.h>
+#include <sstream>
+
+
+void array_multiply(float* lhs, float rhs, unsigned int N);
+void array_add(float* ptr1, float* ptr2, float* sum, unsigned int N);
+void chi_grad(float* img, float* cpu_copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta);
+
+/// This function evaluates the tPb given a grayscale image.
+/// The image is turned into textons, the textons are clustered with kmeans, and for each of
+/// theta_n orientations the weighted, normalized chi-gradients of all scales are summed.
+/// The result keeps, for every pixel, the largest sum over all orientations, divided by the overall maximum.
+
+/// @param img is the grayscale image
+/// @param r is an array of radii for different scaled discs(filters), read as r[0] .. r[s-1]
+/// @param alpha is an array of weights for different scaled discs(filters), read as alpha[0] .. alpha[s-1]
+/// @param theta_n is the number of angles used for computing oriented chi-gradient
+/// @param bin_n is the number of bins handed to chi_grad; it is used as a divisor and must not be 0
+/// @param s is the number of scales
+/// @param K is the number of clusters
+
+stim::image<float> tPb(stim::image<float> img, int* r, float* alpha, unsigned int theta_n, unsigned int bin_n, int s, unsigned K){
+
+	unsigned int w = img.width();              // get the width of picture
+	unsigned int h = img.height();             // get the height of picture
+	unsigned int N = w * h;						// get the number of pixels
+
+	stim::image<float> img_textons(w, h, 1, theta_n*2+1);               // allocate space for img_textons
+	stim::image<float> img_texture(w, h, 1, 1);               // allocate space for img_texture
+	stim::image<float> tPb_theta(w, h, 1, 1);               // allocate space for tPb_theta
+	stim::image<float> tPb(w, h, 1, 1);						// allocate space for tPb
+	unsigned size = tPb_theta.size();                    // get the size of tPb_theta (tPb has the same dimensions)
+	memset (tPb.data(), 0, size * sizeof(float)); // initialize all the pixels of tPb to 0
+	stim::image<float> temp(w, h, 1, 1);                    // set the temporary image to store the chi-gradient of one scale
+
+	
+	img_textons = textons(img, theta_n);
+
+	img_texture = kmeans(img_textons, K);                // changing kmeans result into float type is required
+
+	// NOTE(review): this writes an image file on every call and looks like leftover debug output -- confirm it is wanted
+	stim::cpu2image(img_texture.data(),  "data_output/texture_0925.bmp", w, h, stim::cmBrewer);
+	
+
+	unsigned int max1 = img_texture.maxv();						// get the largest cluster label
+	unsigned int bin_size = (max1 + 1)/bin_n;					// (whether"+1" or not depends on kmeans result)
+
+	for (unsigned int i = 0; i < theta_n; i++){
+
+		float theta = 180 * ((float)i/theta_n);              // calculate the evenly split angle for each tPb_theta
+
+		memset (tPb_theta.data(), 0, size * sizeof(float)); // initialize all the pixels of tPb_theta to 0
+
+		for (int j = 0; j < s; j++){
+		    
+			// get the chi-gradient of the texture image at this scale and angle
+			chi_grad(img_texture.data(), temp.data(), w, h, r[j], bin_n, bin_size, theta);
+
+			float max2 = temp.maxv();						// get the maximum of this chi-gradient used for normalization
+
+			// normalize the chi-gradient; a maximum of 0 is skipped, because dividing
+			// by it would fill the slice (and the running sum) with NaN
+			if (max2 != 0){
+				array_multiply(temp.data(), 1/max2, N);
+			}
+
+			//output the test result of each slice (optional) 
+			//stim::cpu2image(temp.data(), "data_output/tPb_slice0924_2.bmp", w, h, stim::cmBrewer);
+			
+			// multiply each chi-gradient with its weight
+			array_multiply(temp.data(), alpha[j], N);
+			
+			// add up all the weighted chi-gradients
+			array_add(tPb_theta.data(), temp.data(), tPb_theta.data(), N);
+
+		}	
+
+		// keep, for every pixel, the maximum value among all tPb_theta seen so far
+		float* best = tPb.data();
+		float* current = tPb_theta.data();
+
+		for(unsigned long ti = 0; ti < N; ti++){
+
+			if(current[ti] > best[ti]){
+				best[ti] = current[ti];
+			}
+		}
+	}
+	
+	float max3 = tPb.maxv();						// get the maximum of tPb used for normalization
+
+	// normalize the tPb; a maximum of 0 is skipped so that the image is returned unchanged instead of as NaN
+	if (max3 != 0){
+		array_multiply(tPb.data(), 1/max3, N);
+	}
+
+	//output the test result of tPb (optional) 
+	//stim::cpu2image(tPb.data(),  "data_output/tPb_0922.bmp", w, h, stim::cmBrewer);
+
+	return tPb;
+}
+#include <stim/image/image.h>
+//#include <cmath>
+#include <stim/visualization/colormap.h>
+#include <stim/image/image_contour_detection.h>
+#include <sstream>
+
+/// Filters a grayscale image with a bank of oriented Gaussian derivative
+/// filters and returns the responses as a texton image with (theta_n*2+1)
+/// channels: channels [0, theta_n) hold the Gd_odd responses, channels
+/// [theta_n, 2*theta_n) the Gd_even responses, and channel 2*theta_n the
+/// Gd_center response.
+
+/// @param image is a one-channel grayscale image
+/// @param theta_n is the number of evenly spaced angles (0 to 180 degrees) used for the oriented filters
+
+stim::image<float> textons(stim::image<float> image, unsigned int theta_n){
+
+	const unsigned int w = image.width();     // picture width
+	const unsigned int h = image.height();    // picture height
+
+	// disc radius and sigma_n for each of the three filter families
+	const unsigned int r_odd = 3,    sigma_n_odd = 3;      // odd-symmetric filter
+	const unsigned int r_even = 3,   sigma_n_even = 3;     // even-symmetric filter
+	const unsigned int r_center = 3, sigma_n_center = 3;   // center-surround filter
+
+	stim::image<float> bank(w, h, 1, theta_n*2+1);   // one channel per filter response
+	stim::image<float> response(w, h);               // response of the filter currently applied
+
+	for (unsigned int k = 0; k != theta_n; ++k){
+
+		// orientation of the k-th filter pair, in degrees
+		const float theta = 180 * ((float)k/theta_n);
+
+		// odd-symmetric response goes to channel k
+		response = Gd_odd(image, r_odd, sigma_n_odd, theta);
+		bank.set_channel(k, response.data());
+
+		// even-symmetric response goes to channel k + theta_n
+		response = Gd_even(image, r_even, sigma_n_even, theta);
+		bank.set_channel(k + theta_n, response.data());
+	}
+
+	// the single center-surround response fills the last channel
+	response = Gd_center(image, r_center, sigma_n_center);
+	bank.set_channel(theta_n*2, response.data());
+
+	return bank;
+
+}
+
+	
 \ No newline at end of file
@@ -4,7 +4,7 @@
 #include <cuda.h>
  
 namespace stim{
-
+extern "C"
 int maxThreadsPerBlock()
 {
 	int device;
@@ -13,6 +13,16 @@ int maxThreadsPerBlock()
 	cudaGetDeviceProperties(&props, device);
 	return props.maxThreadsPerBlock;
 }
+
+/// Returns the sharedMemPerBlock property (a size in bytes) of the CUDA
+/// device that is currently selected, converted to int.
+extern "C"
+int sharedMemPerBlock()
+{
+	int current;					//id of the current device
+	cudaGetDevice(&current);
+
+	cudaDeviceProp info;				//device property structure
+	cudaGetDeviceProperties(&info, current);
+
+	return info.sharedMemPerBlock;
+}
 }	//end namespace rts
  
 #endif
@@ -165,6 +165,7 @@ namespace stim{
 			cudaFree(gpuDir);
  
 			cudaDestroyTextureObject(texObj);
+			cudaFreeArray(cuArray);
  
 		}
  
+#ifndef STIM_CUDA_CHI_GRAD_H
+#define STIM_CUDA_CHI_GRAD_H
+
+#include <iostream>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <cmath>
+#include <algorithm>
+
+#define PI 3.14159265358979
+
+namespace stim{
+	namespace cuda{
+
+		/// Fills hist[0..bin_n) with the histogram of the pixels that lie on one
+		/// side of the line through (xi, yj) inside the (2r+1) x (2r+1) window
+		/// centered on that pixel.
+
+		/// @param hist is the calling thread's private histogram (bin_n entries); it is cleared first
+		/// @param texObj is the image texture, sampled with normalized coordinates (x/w, y/h)
+		/// @param cos_t, sin_t are the cosine and sine of the orientation angle
+		/// @param positive selects the side: (y-yj)*cos_t + (x-xi)*sin_t > 0 when true, < 0 when false
+		/// @param bin_size is the non-zero width of one histogram bin
+		template<typename T>
+		__device__ void chi_half_histogram(unsigned short* hist, cudaTextureObject_t texObj, unsigned int w, unsigned int h,
+										   int xi, int yj, int r, unsigned int bin_n, unsigned int bin_size,
+										   double cos_t, double sin_t, bool positive){
+
+			for (unsigned int i = 0; i < bin_n; i++)
+				hist[i] = 0;
+
+			for (int y = yj - r; y <= yj + r; y++){
+				for (int x = xi - r; x <= xi + r; x++){
+
+					double side = (y - yj)*cos_t + (x - xi)*sin_t;
+					bool selected = positive ? (side > 0) : (side < 0);
+
+					if (selected){
+						unsigned int b = (unsigned int)((int) tex2D<T>(texObj, (float)x/w, (float)y/h)) / bin_size;
+
+						//an index past the last bin would land in another thread's histogram
+						if (b >= bin_n) b = bin_n - 1;
+
+						hist[b]++;
+					}
+				}
+			}
+		}
+
+		/// Computes, for every pixel, the chi-squared distance between the
+		/// histograms of the two halves of a window split by a line at angle theta.
+
+		/// Launch layout: 1D blocks along x, one grid row per image row (blockIdx.y).
+		/// Dynamic shared memory: blockDim.x * bin_n * sizeof(unsigned short) bytes;
+		/// each thread uses only its own bin_n-entry slice, so no block barrier is
+		/// needed (and none may be placed inside the bounds guard, which diverges).
+
+		/// template parameter @param T is the data type
+		/// @param copy is the output image (w * h values)
+		/// @param texObj is the input image as a texture using normalized coordinates
+		/// @param r is the half-width of the window
+		/// @param bin_n is the number of histogram bins
+		/// @param bin_size is the width of one bin (0 is treated as 1)
+		/// @param theta is the orientation of the dividing line in degrees
+		template<typename T>
+		__global__ void cuda_chi_grad(T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
+
+			extern __shared__ unsigned short bin[];
+
+			//change 1D index to 2D coordinates
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yj = blockIdx.y;
+
+			//threads past the image edge have nothing to do
+			if(!(xi < w && yj < h)) return;
+
+			int idx = yj * w + xi;
+
+			//with no bins both histograms are empty and the distance is zero
+			if(bin_n == 0){
+				copy[idx] = 0;
+				return;
+			}
+
+			//the angle is the same for every pixel of the window, so evaluate it once
+			double theta_r = ((theta) * PI)/180; //change angle unit from degree to rad
+			double cos_t = cos(theta_r);
+			double sin_t = sin(theta_r);
+
+			unsigned int step = (bin_size == 0) ? 1 : bin_size;	//avoid dividing by zero
+
+			//this thread's private slice of the shared histogram storage
+			unsigned short* hbin = &bin[threadIdx.x * bin_n];
+
+			//scratch copy of the first half's histogram
+			unsigned short* gbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short));
+			if(!gbin) __trap();			//device heap exhausted: abort the kernel rather than write through a null pointer
+
+			//histogram of the first half of the disc, saved to gbin
+			chi_half_histogram<T>(hbin, texObj, w, h, xi, yj, r, bin_n, step, cos_t, sin_t, true);
+			for (unsigned int gi = 0; gi < bin_n; gi++)
+				gbin[gi] = hbin[gi];
+
+			//histogram of the second half of the disc, left in hbin
+			chi_half_histogram<T>(hbin, texObj, w, h, xi, yj, r, bin_n, step, cos_t, sin_t, false);
+
+			//compare gbin, hbin and calculate the chi distance
+			float sum = 0;
+			for (unsigned int k = 0; k < bin_n; k++){
+
+				int total = gbin[k] + hbin[k];
+				int diff = gbin[k] - hbin[k];
+				float flag = (total == 0) ? 1 : total;      // avoid a zero denominator
+
+				sum += diff*diff/flag;
+			}
+
+			// return chi-distance for each pixel
+			copy[idx] = sum;
+
+			free(gbin);
+		}
+		
+
+		/// Runs cuda_chi_grad on an image that already resides in device memory.
+
+		/// The image is copied into a CUDA array and read through a texture
+		/// object; both are released before returning. The block size starts at
+		/// the device maximum and is reduced until the per-thread histograms
+		/// (bin_n unsigned shorts each) fit in the shared memory of one block.
+
+		/// @param img is the device pointer to the input image (w * h values)
+		/// @param copy is the device pointer that receives the result (w * h values)
+		/// @param w, h are the image width and height
+		/// @param r, bin_n, bin_size, theta are forwarded to the kernel unchanged
+		template<typename T>
+		void gpu_chi_grad(T* img, T* copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
+
+			//an empty image has no pixels to process, and a zero-sized grid cannot be launched
+			if(w == 0 || h == 0) return;
+
+			unsigned long N = (unsigned long)w * h;
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+			int sharemax = stim::sharedMemPerBlock();                   //get the size of Shared memory available per block in bytes
+			size_t thread_bytes = bin_n * sizeof(unsigned short);      //shared memory needed by one thread
+
+			//compare BYTES against the byte limit and shrink the block until it fits
+			if(thread_bytes > 0 && (size_t)threads * thread_bytes > (size_t)sharemax){
+				threads = (int)((size_t)sharemax / thread_bytes);
+				if(threads > 32) threads -= threads % 32;              //keep whole warps when possible
+			}
+
+			//not even a single thread's histogram fits
+			if(threads < 1){
+
+				std::cout <<"Error: shared_bytes exceeds the max value."<<'\n';
+				exit(1);
+
+			}
+			unsigned int shared_bytes = threads * thread_bytes;
+
+			// Allocate CUDA array in device memory
+
+			//define a channel descriptor for a single 32-bit channel
+			cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+			cudaArray* cuArray;												//declare the cuda array
+			HANDLE_ERROR( cudaMallocArray(&cuArray, &channelDesc, w, h) );	//allocate the cuda array
+
+			// Copy the image data from global memory to the array
+			HANDLE_ERROR( cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), cudaMemcpyDeviceToDevice) );
+
+			// Specify texture
+			struct cudaResourceDesc resDesc;				//create a resource descriptor
+			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
+			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
+			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
+
+			// Specify texture object parameters
+			struct cudaTextureDesc texDesc;							//create a texture descriptor
+			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
+			texDesc.addressMode[0]   = cudaAddressModeMirror;		//mirror the image at the edges
+			texDesc.addressMode[1]   = cudaAddressModeMirror;
+			texDesc.filterMode       = cudaFilterModePoint;			//point sampling (no interpolation)
+			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
+			texDesc.normalizedCoords = 1;							//using normalized coordinates
+
+			// Create texture object
+			cudaTextureObject_t texObj = 0;
+			HANDLE_ERROR( cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL) );
+
+			//one grid row per image row, enough blocks along x to cover the width
+			dim3 blocks((w + threads - 1) / threads, h);
+
+			//call the kernel to compute the chi-gradient
+			cuda_chi_grad <<< blocks, threads, shared_bytes >>>(copy, texObj, w, h, r, bin_n, bin_size, theta);
+			HANDLE_ERROR( cudaGetLastError() );			//launch configuration errors
+			HANDLE_ERROR( cudaDeviceSynchronize() );	//errors raised while the kernel runs
+
+			//release the texture and its backing array (previously leaked on every call)
+			HANDLE_ERROR( cudaDestroyTextureObject(texObj) );
+			HANDLE_ERROR( cudaFreeArray(cuArray) );
+
+		}
+
+		/// Host-side entry point: uploads the image, runs gpu_chi_grad on it and
+		/// downloads the result.
+
+		/// @param img is the host pointer to the input image (w * h values)
+		/// @param cpu_copy is the host pointer that receives the result (w * h values)
+		/// @param w, h, r, bin_n, bin_size, theta are forwarded to gpu_chi_grad unchanged
+		template<typename T>
+		void cpu_chi_grad(T* img, T* cpu_copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
+
+			const unsigned long N = w * h;
+			const size_t bytes = N * sizeof(T);			//size of one image in bytes
+
+			//device buffers for the input image and for the result
+			T* gpu_img;
+			T* gpu_copy;
+			HANDLE_ERROR( cudaMalloc( &gpu_img, bytes ) );
+			HANDLE_ERROR( cudaMalloc( &gpu_copy, bytes ) );
+
+			//upload the input
+			HANDLE_ERROR( cudaMemcpy( gpu_img, img, bytes, cudaMemcpyHostToDevice) );
+
+			//do the work on the device
+			gpu_chi_grad<T>(gpu_img, gpu_copy, w, h, r, bin_n, bin_size, theta);
+
+			//download the result
+			HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, bytes, cudaMemcpyDeviceToHost) );
+
+			//release the device buffers
+			cudaFree(gpu_img);
+			cudaFree(gpu_copy);
+
+		}
+		
+	}
+}
+
+
+#endif
 \ No newline at end of file
@@ -11,8 +11,7 @@ namespace stim{
 	namespace cuda{
  
 		template<typename T>
-		//__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){
-		__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){
+		__global__ void cuda_conv2(T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){
  
  
 			//the radius of mask
@@ -34,7 +33,7 @@ namespace stim{
 				//copy[idx] = tex2D<float>(texObj, i+100, j+100);
 				//return;
  
-				//tex2D<float>(texObj, i, j);
+				tex2D<float>(texObj, (float)i/w, (float)j/h);
  
 				//allocate memory for result
 				T sum = 0;
@@ -51,9 +50,7 @@ namespace stim{
 						int xx = x - (i - r);
 						int yy = y - (j - r);
  
-						//T temp = img[y * w + x] * mask[yy * M + xx];
-						//sum += img[y * w + x] * mask[yy * M + xx];
-						sum += tex2D<T>(texObj, x, y) * 1.0;//mask[yy * M + xx];
+						sum += tex2D<T>(texObj, (float)x/w, (float)y/h) * mask[yy * M + xx];
 					}		
 				}
 				copy[idx] = sum;
@@ -88,11 +85,11 @@ namespace stim{
 			// Specify texture object parameters
 			struct cudaTextureDesc texDesc;							//create a texture descriptor
 			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
-			texDesc.addressMode[0]   = cudaAddressModeMirror;			//use wrapping (around the edges)
-			texDesc.addressMode[1]   = cudaAddressModeMirror;
+			texDesc.addressMode[0]   = cudaAddressModeClamp;			//use wrapping (around the edges)
+			texDesc.addressMode[1]   = cudaAddressModeClamp;
 			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
 			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
-			texDesc.normalizedCoords = 0;							//not using normalized coordinates
+			texDesc.normalizedCoords = 1;							//using normalized coordinates
  
 			// Create texture object
 			cudaTextureObject_t texObj = 0;
@@ -109,7 +106,6 @@ namespace stim{
 			cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M);
 			cudaDestroyTextureObject(texObj);
 			cudaFreeArray(cuArray);
-
 		}
  
 		template<typename T>
@@ -25,6 +25,17 @@
 	{
 			cudaFree(print);         ///temporary
 	}  
+
+        __device__
+        float templ(int x)
+        {
+                //band edges, evaluated with integer division exactly as written
+                const int first   = 16/6;
+                const int mid_beg = 16*2/6;
+                const int mid_end = 16*4/6;
+                const int last    = 16*5/6;
+
+                //1.0 below the first edge, above the last edge, or strictly inside the middle band
+                const bool outer  = (x < first) || (x > last);
+                const bool middle = (x > mid_beg) && (x < mid_end);
+
+                return (outer || middle) ? 1.0 : 0.0;
+        }
  
 	///Find the difference of the given set of samples and the template
 	///using cuda acceleration.
@@ -40,8 +51,9 @@
 		int idx = y*16+x;
  
 		float valIn             = tex2D<unsigned char>(texIn, x, y);
-
-		print[idx]              = abs(valIn);             ///temporary
+		float templa		= templ(x);
+		//print[idx]              = abs(valIn);             ///temporary
+		print[idx]              = abs(templa);             ///temporary
  
 	}
  
@@ -52,7 +64,6 @@
 	///@param GLenum texType	--either GL_TEXTURE_1D, GL_TEXTURE_2D or GL_TEXTURE_3D
 	///				  may work with other gl texture types, but untested.
 	///@param DIM_Y, the number of samples in the template.
-	extern "C"
 	void test(GLint texbufferID, GLenum texType)
 	{
  
@@ -81,7 +92,7 @@
 		cudaDeviceSynchronize();
 		stringstream name;      //for debugging
 		name << "FromTex.bmp";
-		stim::gpu2image<float>(print, name.str(),16,1089*8,0,255);
+		stim::gpu2image<float>(print, name.str(),16,1089*8,0,1.0);
  
 		tx.UnmapCudaTexture();
 		cleanUP();
@@ -21,6 +21,7 @@
 #include <stim/visualization/glObj.h>
 #include <vector>
 #include <stim/cuda/branch_detection.cuh>
+#include "../../../volume-spider/fiber.h"
 //#include <stim/cuda/testKernel.cuh>
  
 //#include <stim/cuda/testKernel.cuh>
@@ -157,16 +158,13 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		{
 			setMatrix();
 			glCallList(dList+3);
-			std::cerr << 1 << std::endl;
 			std::vector< stim::vec<float> > result = find_branch(
 					btexbufferID, GL_TEXTURE_2D, 16, 216);
 			stim::vec<float> size(S[0]*R[0], S[1]*R[1], S[2]*R[2]);
-			std::cerr << 2 << std::endl;
 			if(!result.empty())
 			{
 				for(int i = 1; i < result.size(); i++)
 				{
-					std::cerr << 2 << " " << i << std::endl;
 					stim::vec<float> cylp(
 						0.5 * cos(2*M_PI*(result[i][1])),
 						0.5 * sin(2*M_PI*(result[i][1])),
@@ -183,12 +181,12 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 						   -p[2] + cylp[2]*S[2]*R[2]);
 						seeddir = seeddir.norm();
 					float seedm = m[0]/2.0;
-/* Uncomment for global run 
-					stim::vec<float> lSeed = getLastSeed();
+// Uncomment for global run 
+/*					stim::vec<float> lSeed = getLastSeed();
 					if(sqrt(pow((lSeed[0] - vec[0]),2)
 					 + pow((lSeed[1] - vec[1]),2) + 
 					 pow((lSeed[2] - vec[2]),2)) > m[0]/4.0
-					 && */ 
+					 &&  */
 					if(
 					 !(vec[0] > size[0] || vec[1] > size[1]
 					 || vec[2] > size[2] || vec[0] < 0
@@ -196,9 +194,8 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 					{
 						setSeed(vec);
 						setSeedVec(seeddir);
-					//	setSeedMag(seedm);
+						setSeedMag(seedm);
 					}
-					std::cerr << 2 << " " << i << " end" <<   std::endl;
 				}
 			}    
  
@@ -1001,7 +998,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 				start = std::clock();
 			#endif
 			findOptimalDirection();
-			test(texbufferID, GL_TEXTURE_2D); 
+			//test(texbufferID, GL_TEXTURE_2D); 
 			findOptimalPosition();
 			findOptimalScale();
 			Unbind();
@@ -1024,6 +1021,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 				start = std::clock();
 			#endif
 			findOptimalDirection();
+			//test(texbufferID, GL_TEXTURE_2D); 
 			findOptimalPosition();
 			findOptimalScale();
 			Unbind();
@@ -1144,7 +1142,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		{
 			stim::vec<float> pos;
 			stim::vec<float> mag;
-			bool h;
+			int h;
 			bool started = false;
 			bool running = true;
 			stim::vec<float> size(S[0]*R[0], S[1]*R[1], S[2]*R[2]);
@@ -1184,11 +1182,13 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 					{
 						h = selectObject(pos, getDirection(), m[0]);
 						//Have we hit something previously traced?
-						if(h){
-						running = false;
-						break;
+						if(h != -1){
+							std::cout << "I hit a line" << h << std::endl;
+							running = false;
+							break;
 						}
 						else {          
+							cL.push_back(stim::vec<float>(p[0], p[1],p[2]));
 							sk.TexCoord(m[0]);
 							sk.Vertex(p[0], p[1], p[2]);
 							Bind(btexbufferID, bfboId, 27);
@@ -1204,7 +1204,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		}	
  
  
-		bool
+		int
 		selectObject(stim::vec<float> loc, stim::vec<float> dir, float mag) 
 		{
 		//Define the varibles and turn on Selection Mode
@@ -1257,36 +1257,133 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
  
 		//	glEnable(GL_CULL_FACE);
 			hits = glRenderMode(GL_RENDER);
-			bool found_hits = processHits(hits, selectBuf);
+			int found_hits = processHits(hits, selectBuf);
 			return found_hits;
 		}
  
 		//Given a size of the array (hits) and the memory holding it (buffer)
 		//returns whether a hit tool place or not.
-		bool
+		int
 		processHits(GLint hits, GLuint buffer[])
 		{
 			GLuint names, *ptr;
 			//printf("hits = %u\n", hits);
 			ptr = (GLuint *) buffer;
-			for (int i = 0; i < hits; i++) { /*  for each hit  */
-				names = *ptr;
-			//	printf (" number of names for hit = %u\n", names);
-				ptr++;
-				ptr++; //Skip the minimum depth value.  
-				ptr++; //Skip the maximum depth value.
-			//	printf ("   the name is ");                                                        
-			//	for (int j = 0; j < names; j++) {     /*  for each name */
-			//		printf ("%u ", *ptr); ptr++;
-			//	}
-			//	printf ("\n");
-			}
+		//	for (int i = 0; i < hits; i++) { /*  for each hit  */
+			names = *ptr;
+		//		printf (" number of names for hit = %u\n", names);
+			ptr++;
+			ptr++; //Skip the minimum depth value.  
+			ptr++; //Skip the maximum depth value.
+		//		printf ("   the name is ");                                                        
+		//		for (int j = 0; j < names; j++) {     /*  for each name */
+		//			printf ("%u ", *ptr); ptr++;
+		//		}
+		//		printf ("\n");
+	//		}
+
+			
 			if(hits == 0)
-				return 0;
+			{
+				return -1;
+			}
 			else
-				return 1;
+			{
+				printf ("%u ", *ptr);
+				return *ptr;
+			}
+		}
+
+		void
+		clearCurrent()		// empties cL, the list traceLine appends positions to
+		{
+			cL.clear();
+		}	
+	
+		/// Steps the spider until a stop condition is met, recording each accepted
+		/// position in cL and each magnitude in a local list.
+
+		/// @param pos is recorded as the first point of the line
+		/// @param mag is recorded as the first magnitude of the line
+		/// @param min_cost is the threshold: tracing stops when Step() returns more than this
+		/// @return the fiber built from the recorded points and magnitudes, paired
+		///	    with the value returned by selectObject when the trace stopped on a
+		///	    previously traced object, or -1 for every other stop condition
+		std::pair<stim::fiber<float>, int >
+		traceLine(stim::vec<float> pos, stim::vec<float> mag, int min_cost)
+		{
+			Bind();
+			sk.Begin(stim::OBJ_LINE);
+			sk.createFromSelf(GL_SELECT);
+			std::vector<stim::vec<float> > cM;
+			cL.push_back(pos);
+			cM.push_back(mag);
+
+//			setPosition(pos);
+//			setMagnitude(mag);
+			int hit = -1;			//what the trace ran into, -1 when nothing
+			bool started = false;
+			stim::vec<float> size(S[0]*R[0], S[1]*R[1], S[2]*R[2]);
+
+			//every stop condition breaks out of this loop; the single exit
+			//below closes the line and builds the result
+			while(true)
+			{
+				int cost = Step();
+				if(cost > min_cost)
+					break;
+
+				//Have we found the edge of the map?
+				pos = getPosition();
+				if(pos[0] > size[0] || pos[1] > size[1]
+				 || pos[2] > size[2] || pos[0] < 0
+				 || pos[1] < 0 || pos[2] < 0)
+					break;
+
+				//If this is the first step in the trace,
+				// save the direction
+				//(to be used later to trace the fiber in the opposite direction)
+				if(started == false){
+					rev = -getDirection();
+					started = true;
+				}
+
+				//Has the template size gotten unreasonable?
+				mag = getMagnitude();
+				if(mag[0] > 75 || mag[0] < 1)
+					break;
+
+				//Have we hit something previously traced?
+				int h = selectObject(p, getDirection(), m[0]);
+				if(h != -1){
+					std::cout << "I hit a line" << h << std::endl;
+					hit = h;
+					break;
+				}
+
+				//accept this step: record it and look for branches
+				cL.push_back(stim::vec<float>(p[0], p[1],p[2]));
+				cM.push_back(m[0]);
+				sk.TexCoord(m[0]);
+				sk.Vertex(p[0], p[1], p[2]);
+				Bind(btexbufferID, bfboId, 27);
+				CHECK_OPENGL_ERROR
+				branchDetection();
+				CHECK_OPENGL_ERROR
+				Unbind();
+				CHECK_OPENGL_ERROR
+			}
+
+			sk.End();
+			return std::pair<stim::fiber<float>, int>(stim::fiber<float> (cL, cM), hit);
+		}
  
+			
  
 };
 }
@@ -31,8 +31,12 @@ public:
 	}
  
 	/// Constructor initializes an image to a given size
-	image(unsigned int x, unsigned int y = 1, unsigned int z = 1){
+	/*image(unsigned int x, unsigned int y = 1, unsigned int z = 1){
 		img = cimg_library::CImg<T>(x, y, z);
+	}*/
+
+	image(unsigned int x, unsigned int y = 1, unsigned int z = 1, unsigned int c = 1){	// y, z and c default to 1
+		img = cimg_library::CImg<T>(x, y, z, c);	// the four sizes are forwarded to the CImg constructor in this order
+	}
  
 	//Load an image from a file
@@ -90,6 +94,23 @@ public:
  
 	}
  
+	/// Overwrite one channel of the image with the contents of a raw buffer
+
+	/// @param c is the index of the channel that is overwritten
+	/// @param buffer points to the source values; width() * height() elements are read from it
+
+	void set_channel(unsigned int c, T* buffer){
+
+		//number of elements copied for one channel
+		const unsigned int plane = width() * height();
+
+		//destination: the raw image data advanced by c channels of that size
+		T* dst = img.data() + plane * c;
+
+		//overwrite the channel with the caller's buffer
+		memcpy(dst, buffer, sizeof(T) * plane);
+	}
+
 	image<T> getslice(unsigned int c){
  
 		//create a new image
@@ -228,6 +249,18 @@ public:
 	}
  
  
+	/// Returns a rotated copy of the image; this image is not modified.
+
+	/// @param angle, cx, cy are forwarded to CImg's get_rotate (presumably the
+	///	   rotation angle and the center of rotation; confirm against the CImg version in use)
+	image<T> rotate(float angle, float cx, float cy){
+
+		//fixed settings handed to get_rotate
+		const float zoom = 1;
+		const unsigned int interpolation = 1;
+		const unsigned int boundary = 1;
+
+		image<T> rotated;
+		rotated.img = img.get_rotate (angle, cx, cy, zoom, interpolation, boundary);
+		//rotated.save("data_output/test_rotate_neum.bmp");
+
+		return rotated;
+	}
+	
 };
  
 };		//end namespace stim
-//#include <stim/image/image.h>
-//#include <cmath>
-//#include <stim/visualization/colormap.h>
  
-stim::image<float> gaussian_derivative_filter_odd(stim::image<float> image, float sigma, unsigned int sigma_n, unsigned int winsize, float theta, unsigned int w, unsigned int h);
-stim::image<float> func_mPb_theta(stim::image<float> lab, float theta, unsigned int w, unsigned int h);
-stim::image<float> func_mPb(stim::image<float> lab, unsigned int theta_n, unsigned int w, unsigned int h);
 \ No newline at end of file
+//stim::image<float> gaussian_derivative_filter_odd(stim::image<float> image, int r, unsigned int sigma_n, float theta);
+//stim::image<float> func_mPb_theta(stim::image<float> img, float theta, int* r, float* alpha, int s);
+//stim::image<float> func_mPb(stim::image<float> img, unsigned int theta_n, int* r, float* alpha, int s);
+
+stim::image<float> Gd1(stim::image<float> image, int r, unsigned int sigma_n);
+stim::image<float> Gd2(stim::image<float> image, int r, unsigned int sigma_n);
+stim::image<float> Gd_odd(stim::image<float> image, int r, unsigned int sigma_n, float theta);
+stim::image<float> Gd_even(stim::image<float> image, int r, unsigned int sigma_n, float theta);
+stim::image<float> Gd_center(stim::image<float> image, int r, unsigned int sigma_n);
+
+stim::image<float> textons(stim::image<float> image, unsigned int theta_n);
+stim::image<float> kmeans(stim::image<float> textons, unsigned int K);
+stim::image<float> Pb(stim::image<float> image, int r, unsigned int sigma_n);
+stim::image<float> cPb(stim::image<float> img, int* r, float* alpha, int s);
+stim::image<float> tPb(stim::image<float> img, int* r, float* alpha, unsigned int theta_n, unsigned int bin_n, int s, unsigned int K);
 \ No newline at end of file