From 96f9b10f641f32443600d9789e6d244a285b564f Mon Sep 17 00:00:00 2001
From: Laila Saadatifard <saadatileila@gmail.com>
Date: Mon, 14 Sep 2015 13:43:49 -0500
Subject: [PATCH] change the header files to be compatible with the new organized stim/cuda directory , and add the stim/cude/ivote subdirectory that includes the ivote related functions

---
 stim/cuda/arraymath/array_add.cuh      |   3 +--
 stim/cuda/arraymath/array_multiply.cuh |   3 +--
 stim/cuda/down_sample.cuh              | 101 -----------------------------------------------------------------------------------------------------
 stim/cuda/gaussian_blur.cuh            |  89 -----------------------------------------------------------------------------------------
 stim/cuda/ivote.cuh                    |  17 +++++++++++++++++
 stim/cuda/ivote/down_sample.cuh        | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stim/cuda/ivote/local_max.cuh          | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stim/cuda/ivote/update_dir.cuh         | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stim/cuda/ivote/vote.cuh               | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stim/cuda/local_max.cuh                | 151 -------------------------------------------------------------------------------------------------------------------------------------------------------
 stim/cuda/templates/conv2.cuh          |   3 +--
 stim/cuda/templates/gaussian_blur.cuh  |   1 +
 stim/cuda/templates/gradient.cuh       |   3 +--
 stim/cuda/update_dir.cuh               | 232 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 stim/cuda/vote.cuh                     | 188 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 stim/image/image.h                     |   4 ++--
 stim/math/vector.h                     |   2 +-
 17 files changed, 693 insertions(+), 772 deletions(-)
 delete mode 100644 stim/cuda/down_sample.cuh
 delete mode 100644 stim/cuda/gaussian_blur.cuh
 create mode 100644 stim/cuda/ivote.cuh
 create mode 100644 stim/cuda/ivote/down_sample.cuh
 create mode 100644 stim/cuda/ivote/local_max.cuh
 create mode 100644 stim/cuda/ivote/update_dir.cuh
 create mode 100644 stim/cuda/ivote/vote.cuh
 delete mode 100644 stim/cuda/local_max.cuh
 delete mode 100644 stim/cuda/update_dir.cuh
 delete mode 100644 stim/cuda/vote.cuh
diff --git a/stim/cuda/arraymath/array_add.cuh b/stim/cuda/arraymath/array_add.cuh
index a05353c..cfc35b4 100644
--- a/stim/cuda/arraymath/array_add.cuh
+++ b/stim/cuda/arraymath/array_add.cuh
@@ -3,8 +3,7 @@
 
 #include <iostream>
 #include <cuda.h>
-#include <stim/cuda/devices.h>
-#include <stim/cuda/error.h>
+#include <stim/cuda/cudatools.h>
 
 namespace stim{
 	namespace cuda{
diff --git a/stim/cuda/arraymath/array_multiply.cuh b/stim/cuda/arraymath/array_multiply.cuh
index add19ba..4385e77 100644
--- a/stim/cuda/arraymath/array_multiply.cuh
+++ b/stim/cuda/arraymath/array_multiply.cuh
@@ -3,8 +3,7 @@
 
 #include <iostream>
 #include <cuda.h>
-#include <stim/cuda/devices.h>
-#include <stim/cuda/error.h>
+#include <stim/cuda/cudatools.h>
 
 namespace stim{
 	namespace cuda{
diff --git a/stim/cuda/down_sample.cuh b/stim/cuda/down_sample.cuh
deleted file mode 100644
index ecd19c9..0000000
--- a/stim/cuda/down_sample.cuh
+++ /dev/null
@@ -1,101 +0,0 @@
-#ifndef STIM_CUDA_DOWN_SAMPLE_H
-#define STIM_CUDA_DOWN_SAMPLE_H
-
-#include <iostream>
-#include <cuda.h>
-#include <stim/cuda/devices.h>
-#include <stim/cuda/timer.h>
-#include <stim/cuda/gaussian_blur.cuh>
-
-namespace stim{
-	namespace cuda{
-
-		template<typename T>
-		__global__ void down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
-
-			unsigned int sigma_ds = 1/resize;
-			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
-			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
-			
-			
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y;
-			// convert 2D coordinates to 1D
-			int i = yi * x_ds + xi;
-			
-			if(xi< x_ds && yi< y_ds){
-
-				int x_org = xi * sigma_ds;
-				int y_org = yi * sigma_ds;
-				int i_org = y_org * x + x_org;
-				gpuI[i] = gpuI0[i_org];
-			}
-
-		}
-
-
-		/// Applies a Gaussian blur to a 2D image stored on the GPU
-		template<typename T>
-		void gpu_down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
-
-			
-			unsigned int sigma_ds = 1/resize;
-			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
-			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
-			
-			//get the number of pixels in the image
-			unsigned int pixels_ds = x_ds * y_ds;
-			
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds);
-			
-			stim::cuda::gpu_gaussian_blur_2d<float>(gpuI0, sigma_ds,x ,y);
-			
-			//resample the image
-			down_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
-
-		}
-
-		/// Applies a Gaussian blur to a 2D image stored on the CPU
-		template<typename T>
-		void cpu_down_sample(T* re_img, T* image, T resize, unsigned int x, unsigned int y){
-
-			//get the number of pixels in the image
-			unsigned int pixels = x * y;
-			unsigned int bytes = sizeof(T) * pixels;
-			
-			unsigned int sigma_ds = 1/resize;
-			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
-			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
-			unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
-			
-
-
-			//allocate space on the GPU for the original image
-			T* gpuI0;
-			cudaMalloc(&gpuI0, bytes);
-			
-			
-			//copy the image data to the GPU
-			cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
-
-			//allocate space on the GPU for the down sampled image
-			T* gpuI;
-			cudaMalloc(&gpuI, bytes_ds);
-
-			//run the GPU-based version of the algorithm
-			gpu_down_sample<T>(gpuI, gpuI0, resize, x, y);
-
-			//copy the image data to the GPU
-			cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
-
-			cudaFree(gpuI0);
-			cudeFree(gpuI);
-		}
-	
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/gaussian_blur.cuh b/stim/cuda/gaussian_blur.cuh
deleted file mode 100644
index 0b060f0..0000000
--- a/stim/cuda/gaussian_blur.cuh
+++ /dev/null
@@ -1,89 +0,0 @@
-#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
-#define STIM_CUDA_GAUSSIAN_BLUR_H
-
-#include <iostream>
-#include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include <stim/cuda/templates/conv2sep.cuh>		//GPU-based separable convolution algorithm
-
-#define pi	3.14159
-
-namespace stim{
-	namespace cuda{
-
-		template<typename T>
-		void gen_gaussian(T* out, T sigma, unsigned int width){
-
-			//fill the kernel with a gaussian
-			for(unsigned int xi = 0; xi < width; xi++){
-
-				float x = (float)xi - (float)(width/2);	//calculate the x position of the gaussian
-				float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) );
-				out[xi] = g;
-			}
-
-		}
-
-		template<typename T>
-		void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){
-
-			//allocate space for the kernel
-			unsigned int kwidth = sigma * 8 + 1;
-			float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
-
-			//fill the kernel with a gaussian
-			gen_gaussian(kernel0, sigma, kwidth);
-
-			//copy the kernel to the GPU
-			T* gpuKernel0;
-			HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
-
-			//perform the gaussian blur as a separable convolution
-			stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth);
-
-			HANDLE_ERROR(cudaFree(gpuKernel0));
-
-		}
-
-		template<typename T>
-		void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
-
-			//allocate space for the kernel
-			unsigned int kwidth = sigma * 8 + 1;
-			float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
-
-			//fill the kernel with a gaussian
-			gen_gaussian(kernel0, sigma, kwidth);
-
-			//copy the kernel to the GPU
-			T* gpuKernel0;
-			HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
-
-			//perform the gaussian blur as a separable convolution
-			stim::cuda::gpu_conv2sep<float>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth);
-
-			HANDLE_ERROR(cudaFree(gpuKernel0));
-
-		}
-
-		/// Applies a Gaussian blur to a 2D image stored on the CPU
-		template<typename T>
-		void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
-
-			//allocate space for the kernel
-			unsigned int kwidth = sigma * 8 + 1;
-			float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
-
-			//fill the kernel with a gaussian
-			gen_gaussian(kernel0, sigma, kwidth);
-
-			//perform the gaussian blur as a separable convolution
-			stim::cuda::cpu_conv2sep<float>(image, x, y, kernel0, kwidth, kernel0, kwidth);
-			
-		}
-		
-	};
-};
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote.cuh b/stim/cuda/ivote.cuh
new file mode 100644
index 0000000..cc07d1d
--- /dev/null
+++ b/stim/cuda/ivote.cuh
@@ -0,0 +1,17 @@
+#ifndef STIM_CUDA_IVOTE_H
+#define STIM_CUDA_IVOTE_H
+
+#include <stim/cuda/ivote/down_sample.cuh>
+#include <stim/cuda/ivote/local_max.cuh>
+#include <stim/cuda/ivote/update_dir.cuh>
+#include <stim/cuda/ivote/vote.cuh>
+
+namespace stim{
+	namespace cuda{
+	
+	}
+}
+
+
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/down_sample.cuh b/stim/cuda/ivote/down_sample.cuh
new file mode 100644
index 0000000..598d095
--- /dev/null
+++ b/stim/cuda/ivote/down_sample.cuh
@@ -0,0 +1,100 @@
+#ifndef STIM_CUDA_DOWN_SAMPLE_H
+#define STIM_CUDA_DOWN_SAMPLE_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/templates/gaussian_blur.cuh>
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
+
+			unsigned int sigma_ds = 1/resize;
+			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
+			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
+			
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x_ds + xi;
+			
+			if(xi< x_ds && yi< y_ds){
+
+				int x_org = xi * sigma_ds;
+				int y_org = yi * sigma_ds;
+				int i_org = y_org * x + x_org;
+				gpuI[i] = gpuI0[i_org];
+			}
+
+		}
+
+
+		/// Applies a Gaussian blur to a 2D image stored on the GPU
+		template<typename T>
+		void gpu_down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
+
+			
+			unsigned int sigma_ds = 1/resize;
+			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
+			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
+			
+			//get the number of pixels in the image
+			unsigned int pixels_ds = x_ds * y_ds;
+			
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds);
+			
+			stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y);
+			
+			//resample the image
+			down_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
+
+		}
+
+		/// Applies a Gaussian blur to a 2D image stored on the CPU
+		template<typename T>
+		void cpu_down_sample(T* re_img, T* image, T resize, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			unsigned int bytes = sizeof(T) * pixels;
+			
+			unsigned int sigma_ds = 1/resize;
+			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
+			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
+			unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
+			
+
+
+			//allocate space on the GPU for the original image
+			T* gpuI0;
+			cudaMalloc(&gpuI0, bytes);
+			
+			
+			//copy the image data to the GPU
+			cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
+
+			//allocate space on the GPU for the down sampled image
+			T* gpuI;
+			cudaMalloc(&gpuI, bytes_ds);
+
+			//run the GPU-based version of the algorithm
+			gpu_down_sample<T>(gpuI, gpuI0, resize, x, y);
+
+			//copy the image data to the GPU
+			cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
+
+			cudaFree(gpuI0);
+			cudeFree(gpuI);
+		}
+	
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/local_max.cuh b/stim/cuda/ivote/local_max.cuh
new file mode 100644
index 0000000..9519735
--- /dev/null
+++ b/stim/cuda/ivote/local_max.cuh
@@ -0,0 +1,150 @@
+#ifndef STIM_CUDA_LOCAL_MAX_H
+#define STIM_CUDA_LOCAL_MAX_H
+
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+
+namespace stim{
+	namespace cuda{
+
+
+		// this kernel calculates the local maximum for finding the cell centers
+		template<typename T>
+		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+
+
+			//calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
+			int xl = xi - conn;
+			int yl = yi - conn;
+			
+			// use zero for the lowest limits if the xi or yi is less than conn.
+			if (xi <= conn)
+				xl = 0;
+			if (yi <= conn)
+				yl = 0;
+
+			//calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
+			int xh = xi + conn;
+			int yh = yi + conn;
+
+			// use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn.
+			if (xi >= x - conn)
+				xh = x;
+			if (yi>= y - conn)
+				yh = y;
+
+			// calculate the limits for finding the local maximum location in the connected neighbors for the current pixel
+			int n_l = yl * x + xl;
+			int n_h = yh * x + xh;
+			
+			//initial the centers image to zero
+			gpuCenters[i] = 0;
+
+
+			int n = n_l;
+
+			float l_value = 0;
+
+			if (i < x * y)
+
+				// check if the vote value for this pixel is greater than threshold, so this pixel may be a local max.
+				if (gpuVote[i]>final_t){
+
+					// compare the vote value for this pixel with the vote values with its neighbors.
+					while (n<=n_h){
+
+						// check if this vote value is a local max in its neighborhood.
+							if (gpuVote[i] < gpuVote[n]){
+								l_value  = 0;
+								n =n_h+1;
+							}
+							else if (n == n_h){
+								l_value = 1;
+								n = n+1;
+							}
+							// check if the current neighbor is the last one at the current row
+							else if ((n - n_l - 2*conn)% x ==0){
+								n = n + x - 2*conn -1;
+								n ++;						
+							}
+							else
+								n ++;
+				}
+					// set the center value for this pixel to high if it's a local max ,and to low if not.
+					gpuCenters[i] = l_value ;
+				}
+
+		}
+
+
+
+		template<typename T>
+		void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
+
+			
+			
+			
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			
+			
+			
+			//call the kernel to find the local maximum.
+			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
+
+
+		}
+
+
+
+		template<typename T>
+		void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
+		
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			// allocate space on the GPU for the detected cell centes
+			T* gpuCenters;
+			cudaMalloc(&gpuCenters, bytes);		
+
+
+			//allocate space on the GPU for the input Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);		
+
+			
+			//copy the Vote image data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
+
+			
+			//call the GPU version of the local max function
+			gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y);
+
+				
+			//copy the cell centers data to the CPU
+			cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ;
+
+			
+			//free allocated memory
+			cudaFree(gpuCenters);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/update_dir.cuh b/stim/cuda/ivote/update_dir.cuh
new file mode 100644
index 0000000..5ec9f0a
--- /dev/null
+++ b/stim/cuda/ivote/update_dir.cuh
@@ -0,0 +1,231 @@
+#ifndef STIM_CUDA_UPDATE_DIR_H
+#define STIM_CUDA_UPDATE_DIR_H
+
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+
+namespace stim{
+	namespace cuda{
+	
+		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ float s_vote[];
+
+			//calculate the start point for this block
+			int bxi = blockIdx.x * blockDim.x;
+			
+			//calculate the width of the shared memory block
+			int swidth = 2 * rmax + blockDim.x;
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			int yi = blockIdx.y;
+
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// calculate the voting direction based on the grtadient direction
+			float theta = gpuGrad[2*i];
+
+			//initialize the vote direction to zero
+			gpuDir[i] = 0;
+
+			// define a local variable to maximum value of the vote image in the voting area for this voter
+			float max = 0;
+
+			// define two local variables for the x and y coordinations where the maximum happened
+			int id_x = 0;
+			int id_y = 0;
+
+			// compute the size of window which will be checked for finding the voting area for this voter
+			unsigned int x_table = 2*rmax +1;
+			unsigned int rmax_sq = rmax * rmax;
+			int r = (int)rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			int bxs = bxi - rmax;
+			
+			for(int yr = -r; yr <= r; yr++){
+
+				//copy the portion of the image necessary for this block to shared memory
+				__syncthreads();
+				stim::cuda::sharedMemcpy_tex2D<float>(s_vote, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
+				__syncthreads();
+				
+				//if the current thread is outside of the image, it doesn't have to be computed
+				if(xi < x && yi < y){
+
+					for(int xr = -r; xr <= r; xr++){
+
+						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+						// calculate the angle between the voter and the current pixel in x and y directions
+						float atan_angle = gpuTable[ind_t];
+
+
+						// calculate the voting direction based on the grtadient direction
+						int idx_share_update = xr + tx_rmax ;
+						float share_vote = s_vote[idx_share_update];
+						
+						// check if the current pixel is located in the voting area of this voter.
+						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+
+						// compare the vote value of this pixel with the max value to find the maxima and its index.
+							if  (share_vote>max) {
+
+								max = share_vote;
+								id_x =  xr;
+								id_y =  yr;
+							}
+						}
+					}
+				}
+			}
+			
+				
+		//float new_angle = atan2(dy, dx);
+		unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+
+		float new_angle = gpuTable[ind_m];
+
+		gpuDir[i] = new_angle;
+
+		}
+
+		// this kernel updates the gradient direction by the calculated voting direction.
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){
+
+			//************ when the number of threads are (1024,1) *************
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+
+			//update the gradient image with the vote direction
+			gpuGrad[2*i] = gpuDir[i];
+		}
+
+
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			unsigned int bytes = sizeof(T) * pixels;
+
+			
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			
+			// Allocate CUDA array in device memory
+			
+			//define a channel descriptor for a single 32-bit channel
+			cudaChannelFormatDesc channelDesc =
+					   cudaCreateChannelDesc(32, 0, 0, 0,
+											 cudaChannelFormatKindFloat);
+			cudaArray* cuArray;												//declare the cuda array
+			cudaMallocArray(&cuArray, &channelDesc, x, y);			//allocate the cuda array
+
+			// Copy the image data from global memory to the array
+			cudaMemcpyToArray(cuArray, 0, 0, gpuVote, bytes,
+							  cudaMemcpyDeviceToDevice);
+
+			// Specify texture
+			struct cudaResourceDesc resDesc;				//create a resource descriptor
+			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
+			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
+			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
+
+			// Specify texture object parameters
+			struct cudaTextureDesc texDesc;							//create a texture descriptor
+			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
+			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
+			texDesc.addressMode[1]   = cudaAddressModeWrap;
+			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
+			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
+			texDesc.normalizedCoords = 0;							//not using normalized coordinates
+
+			// Create texture object
+			cudaTextureObject_t texObj = 0;
+			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+			// specify  share memory
+			unsigned int share_bytes = (2*rmax + threads.x)*(1)*4;
+			
+			// allocate space on the GPU for the updated vote direction
+			T* gpuDir;
+			cudaMalloc(&gpuDir, bytes);	
+
+			//call the kernel to calculate the new voting direction
+			cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, texObj, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//call the kernel to update the gradient direction
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+
+
+			//free allocated memory
+			cudaFree(gpuDir);
+
+		}
+
+
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);
+
+			//copy the input vote image to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			
+			//call the GPU version of the update direction function
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+				
+			//copy the new gradient image back to the CPU
+			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/vote.cuh b/stim/cuda/ivote/vote.cuh
new file mode 100644
index 0000000..7e98894
--- /dev/null
+++ b/stim/cuda/ivote/vote.cuh
@@ -0,0 +1,187 @@
+#ifndef STIM_CUDA_VOTE_H
+#define STIM_CUDA_VOTE_H
+
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+
+
+namespace stim{
+	namespace cuda{
+
+		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ float2 s_grad[];
+
+			//calculate the start point for this block
+			int bxi = blockIdx.x * blockDim.x;
+			
+			//calculate the width of the shared memory block
+			int swidth = 2 * rmax + blockDim.x;
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			int yi = blockIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			
+			// define a local variable to sum the votes from the voters
+			float sum = 0;
+
+			// compute the size of window which will be checked for finding the proper voters for this pixel
+			unsigned int x_table = 2*rmax +1;
+
+			unsigned int rmax_sq = rmax * rmax;
+			int r = (int)rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			int bxs = bxi - rmax;
+			
+			
+				for(int yr = -r; yr <= r; yr++){
+
+				//copy the portion of the image necessary for this block to shared memory
+				__syncthreads();
+				stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
+				__syncthreads();
+				
+				//if the current thread is outside of the image, it doesn't have to be computed
+				if(xi < x && yi < y){
+					
+					for(int xr = -r; xr <= r; xr++){
+					
+							//find the location of this voter in the atan2 table
+							unsigned int id_t = (yr + rmax) * x_table + xr + rmax;
+
+							// calculate the angle between the pixel and the current voter in x and y directions
+							float atan_angle = gpuTable[id_t];
+							
+					
+							// calculate the voting direction based on the grtadient direction
+							int idx_share = xr + tx_rmax ;
+							float2 g = s_grad[idx_share];
+							float theta = g.x;
+
+							// check if the current voter is located in the voting area of this pixel.
+							if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+								sum += g.y;		
+
+							}
+				
+					}
+				}
+			}
+
+			gpuVote[i] = sum;
+		}
+
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			unsigned int bytes = sizeof(T) * pixels;
+
+			
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			//unsigned int thread_dim = sqrt(max_threads);
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			
+			// Allocate CUDA array in device memory
+			
+			//define a channel descriptor for a single 32-bit channel
+			cudaChannelFormatDesc channelDesc =
+					   cudaCreateChannelDesc(32, 32, 0, 0,
+											 cudaChannelFormatKindFloat);
+			cudaArray* cuArray;												//declare the cuda array
+			cudaMallocArray(&cuArray, &channelDesc, x, y);			//allocate the cuda array
+
+			// Copy the image data from global memory to the array
+			cudaMemcpyToArray(cuArray, 0, 0, gpuGrad, bytes*2,
+							  cudaMemcpyDeviceToDevice);
+
+			// Specify texture
+			struct cudaResourceDesc resDesc;				//create a resource descriptor
+			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
+			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
+			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
+
+			// Specify texture object parameters
+			struct cudaTextureDesc texDesc;							//create a texture descriptor
+			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
+			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
+			texDesc.addressMode[1]   = cudaAddressModeWrap;
+			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
+			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
+			texDesc.normalizedCoords = 0;							//not using normalized coordinates
+
+			// Create texture object
+			cudaTextureObject_t texObj = 0;
+			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+			// specify  share memory
+			unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
+			
+
+			//call the kernel to do the voting
+			
+			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);		
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table );
+
+			
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+				
+			//copy the Vote Data back to the CPU
+			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/local_max.cuh b/stim/cuda/local_max.cuh
deleted file mode 100644
index 5083952..0000000
--- a/stim/cuda/local_max.cuh
+++ /dev/null
@@ -1,151 +0,0 @@
-#ifndef STIM_CUDA_LOCAL_MAX_H
-#define STIM_CUDA_LOCAL_MAX_H
-
-
-# include <iostream>
-# include <cuda.h>
-# include <stim/cuda/devices.h>
-# include <stim/cuda/error.h>
-
-namespace stim{
-	namespace cuda{
-
-
-		// this kernel calculates the local maximum for finding the cell centers
-		template<typename T>
-		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
-
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y;
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-
-
-
-			//calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
-			int xl = xi - conn;
-			int yl = yi - conn;
-			
-			// use zero for the lowest limits if the xi or yi is less than conn.
-			if (xi <= conn)
-				xl = 0;
-			if (yi <= conn)
-				yl = 0;
-
-			//calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
-			int xh = xi + conn;
-			int yh = yi + conn;
-
-			// use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn.
-			if (xi >= x - conn)
-				xh = x;
-			if (yi>= y - conn)
-				yh = y;
-
-			// calculate the limits for finding the local maximum location in the connected neighbors for the current pixel
-			int n_l = yl * x + xl;
-			int n_h = yh * x + xh;
-			
-			//initial the centers image to zero
-			gpuCenters[i] = 0;
-
-
-			int n = n_l;
-
-			float l_value = 0;
-
-			if (i < x * y)
-
-				// check if the vote value for this pixel is greater than threshold, so this pixel may be a local max.
-				if (gpuVote[i]>final_t){
-
-					// compare the vote value for this pixel with the vote values with its neighbors.
-					while (n<=n_h){
-
-						// check if this vote value is a local max in its neighborhood.
-							if (gpuVote[i] < gpuVote[n]){
-								l_value  = 0;
-								n =n_h+1;
-							}
-							else if (n == n_h){
-								l_value = 1;
-								n = n+1;
-							}
-							// check if the current neighbor is the last one at the current row
-							else if ((n - n_l - 2*conn)% x ==0){
-								n = n + x - 2*conn -1;
-								n ++;						
-							}
-							else
-								n ++;
-				}
-					// set the center value for this pixel to high if it's a local max ,and to low if not.
-					gpuCenters[i] = l_value ;
-				}
-
-		}
-
-
-
-		template<typename T>
-		void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
-
-			
-			
-			
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
-			
-			
-			
-			//call the kernel to find the local maximum.
-			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
-
-
-		}
-
-
-
-		template<typename T>
-		void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
-		
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			// allocate space on the GPU for the detected cell centes
-			T* gpuCenters;
-			cudaMalloc(&gpuCenters, bytes);		
-
-
-			//allocate space on the GPU for the input Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);		
-
-			
-			//copy the Vote image data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
-
-			
-			//call the GPU version of the local max function
-			gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y);
-
-				
-			//copy the cell centers data to the CPU
-			cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ;
-
-			
-			//free allocated memory
-			cudaFree(gpuCenters);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/templates/conv2.cuh b/stim/cuda/templates/conv2.cuh
index a40fdd0..045e06d 100644
--- a/stim/cuda/templates/conv2.cuh
+++ b/stim/cuda/templates/conv2.cuh
@@ -3,8 +3,7 @@
 
 #include <iostream>
 #include <cuda.h>
-#include <stim/cuda/devices.h>
-#include <stim/cuda/error.h>
+#include <stim/cuda/cudatools.h>
 #include <cmath>
 #include <algorithm>
 
diff --git a/stim/cuda/templates/gaussian_blur.cuh b/stim/cuda/templates/gaussian_blur.cuh
index 0b060f0..f4485ae 100644
--- a/stim/cuda/templates/gaussian_blur.cuh
+++ b/stim/cuda/templates/gaussian_blur.cuh
@@ -58,6 +58,7 @@ namespace stim{
 
 			//copy the kernel to the GPU
 			T* gpuKernel0;
+			HANDLE_ERROR(cudaMalloc(&gpuKernel0, kwidth * sizeof(T)));
 			HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
 
 			//perform the gaussian blur as a separable convolution
diff --git a/stim/cuda/templates/gradient.cuh b/stim/cuda/templates/gradient.cuh
index 95931dc..c55ec91 100644
--- a/stim/cuda/templates/gradient.cuh
+++ b/stim/cuda/templates/gradient.cuh
@@ -3,8 +3,7 @@
 
 #include <iostream>
 #include <cuda.h>
-#include <stim/cuda/devices.h>
-#include <stim/cuda/error.h>
+#include <stim/cuda/cudatools.h>
 
 namespace stim{
 	namespace cuda{
diff --git a/stim/cuda/update_dir.cuh b/stim/cuda/update_dir.cuh
deleted file mode 100644
index 8522f2f..0000000
--- a/stim/cuda/update_dir.cuh
+++ /dev/null
@@ -1,232 +0,0 @@
-#ifndef STIM_CUDA_UPDATE_DIR_H
-#define STIM_CUDA_UPDATE_DIR_H
-
-
-# include <iostream>
-# include <cuda.h>
-# include <stim/cuda/devices.h>
-# include <stim/cuda/error.h>
-#include <stim/cuda/sharedmem.cuh>
-
-namespace stim{
-	namespace cuda{
-	
-		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
-		template<typename T>
-		__global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//generate a pointer to shared memory (size will be specified as a kernel parameter)
-			extern __shared__ float s_vote[];
-
-			//calculate the start point for this block
-			int bxi = blockIdx.x * blockDim.x;
-			
-			//calculate the width of the shared memory block
-			int swidth = 2 * rmax + blockDim.x;
-
-			// calculate the 2D coordinates for this current thread.
-			int xi = bxi + threadIdx.x;
-			int yi = blockIdx.y;
-
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-
-			// calculate the voting direction based on the grtadient direction
-			float theta = gpuGrad[2*i];
-
-			//initialize the vote direction to zero
-			gpuDir[i] = 0;
-
-			// define a local variable to maximum value of the vote image in the voting area for this voter
-			float max = 0;
-
-			// define two local variables for the x and y coordinations where the maximum happened
-			int id_x = 0;
-			int id_y = 0;
-
-			// compute the size of window which will be checked for finding the voting area for this voter
-			unsigned int x_table = 2*rmax +1;
-			unsigned int rmax_sq = rmax * rmax;
-			int r = (int)rmax;
-			int tx_rmax = threadIdx.x + rmax;
-			int bxs = bxi - rmax;
-			
-			for(int yr = -r; yr <= r; yr++){
-
-				//copy the portion of the image necessary for this block to shared memory
-				__syncthreads();
-				stim::cuda::sharedMemcpy_tex2D<float>(s_vote, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
-				__syncthreads();
-				
-				//if the current thread is outside of the image, it doesn't have to be computed
-				if(xi < x && yi < y){
-
-					for(int xr = -r; xr <= r; xr++){
-
-						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
-
-						// calculate the angle between the voter and the current pixel in x and y directions
-						float atan_angle = gpuTable[ind_t];
-
-
-						// calculate the voting direction based on the grtadient direction
-						int idx_share_update = xr + tx_rmax ;
-						float share_vote = s_vote[idx_share_update];
-						
-						// check if the current pixel is located in the voting area of this voter.
-						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-
-						// compare the vote value of this pixel with the max value to find the maxima and its index.
-							if  (share_vote>max) {
-
-								max = share_vote;
-								id_x =  xr;
-								id_y =  yr;
-							}
-						}
-					}
-				}
-			}
-			
-				
-		//float new_angle = atan2(dy, dx);
-		unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
-
-		float new_angle = gpuTable[ind_m];
-
-		gpuDir[i] = new_angle;
-
-		}
-
-		// this kernel updates the gradient direction by the calculated voting direction.
-		template<typename T>
-		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){
-
-			//************ when the number of threads are (1024,1) *************
-			
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y;
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-
-
-			//update the gradient image with the vote direction
-			gpuGrad[2*i] = gpuDir[i];
-		}
-
-
-		template<typename T>
-		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//get the number of pixels in the image
-			unsigned int pixels = x * y;
-			unsigned int bytes = sizeof(T) * pixels;
-
-			
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
-			
-			// Allocate CUDA array in device memory
-			
-			//define a channel descriptor for a single 32-bit channel
-			cudaChannelFormatDesc channelDesc =
-					   cudaCreateChannelDesc(32, 0, 0, 0,
-											 cudaChannelFormatKindFloat);
-			cudaArray* cuArray;												//declare the cuda array
-			cudaMallocArray(&cuArray, &channelDesc, x, y);			//allocate the cuda array
-
-			// Copy the image data from global memory to the array
-			cudaMemcpyToArray(cuArray, 0, 0, gpuVote, bytes,
-							  cudaMemcpyDeviceToDevice);
-
-			// Specify texture
-			struct cudaResourceDesc resDesc;				//create a resource descriptor
-			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
-			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
-			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
-
-			// Specify texture object parameters
-			struct cudaTextureDesc texDesc;							//create a texture descriptor
-			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
-			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
-			texDesc.addressMode[1]   = cudaAddressModeWrap;
-			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
-			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
-			texDesc.normalizedCoords = 0;							//not using normalized coordinates
-
-			// Create texture object
-			cudaTextureObject_t texObj = 0;
-			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
-
-			// specify  share memory
-			unsigned int share_bytes = (2*rmax + threads.x)*(1)*4;
-			
-			// allocate space on the GPU for the updated vote direction
-			T* gpuDir;
-			cudaMalloc(&gpuDir, bytes);	
-
-			//call the kernel to calculate the new voting direction
-			cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, texObj, gpuGrad, gpuTable, phi, rmax, x , y);
-
-			//call the kernel to update the gradient direction
-			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
-
-
-			//free allocated memory
-			cudaFree(gpuDir);
-
-		}
-
-
-		template<typename T>
-		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);
-
-			//copy the input vote image to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-
-			
-			//call the GPU version of the update direction function
-			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-
-				
-			//copy the new gradient image back to the CPU
-			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/vote.cuh b/stim/cuda/vote.cuh
deleted file mode 100644
index 9abcb04..0000000
--- a/stim/cuda/vote.cuh
+++ /dev/null
@@ -1,188 +0,0 @@
-#ifndef STIM_CUDA_VOTE_H
-#define STIM_CUDA_VOTE_H
-
-
-# include <iostream>
-# include <cuda.h>
-# include <stim/cuda/devices.h>
-# include <stim/cuda/error.h>
-#include <stim/cuda/sharedmem.cuh>
-
-
-namespace stim{
-	namespace cuda{
-
-		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
-		template<typename T>
-		__global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//generate a pointer to shared memory (size will be specified as a kernel parameter)
-			extern __shared__ float2 s_grad[];
-
-			//calculate the start point for this block
-			int bxi = blockIdx.x * blockDim.x;
-			
-			//calculate the width of the shared memory block
-			int swidth = 2 * rmax + blockDim.x;
-
-			// calculate the 2D coordinates for this current thread.
-			int xi = bxi + threadIdx.x;
-			int yi = blockIdx.y;
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-
-			
-			// define a local variable to sum the votes from the voters
-			float sum = 0;
-
-			// compute the size of window which will be checked for finding the proper voters for this pixel
-			unsigned int x_table = 2*rmax +1;
-
-			unsigned int rmax_sq = rmax * rmax;
-			int r = (int)rmax;
-			int tx_rmax = threadIdx.x + rmax;
-			int bxs = bxi - rmax;
-			
-			
-				for(int yr = -r; yr <= r; yr++){
-
-				//copy the portion of the image necessary for this block to shared memory
-				__syncthreads();
-				stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
-				__syncthreads();
-				
-				//if the current thread is outside of the image, it doesn't have to be computed
-				if(xi < x && yi < y){
-					
-					for(int xr = -r; xr <= r; xr++){
-					
-							//find the location of this voter in the atan2 table
-							unsigned int id_t = (yr + rmax) * x_table + xr + rmax;
-
-							// calculate the angle between the pixel and the current voter in x and y directions
-							float atan_angle = gpuTable[id_t];
-							
-					
-							// calculate the voting direction based on the grtadient direction
-							int idx_share = xr + tx_rmax ;
-							float2 g = s_grad[idx_share];
-							float theta = g.x;
-
-							// check if the current voter is located in the voting area of this pixel.
-							if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-								sum += g.y;		
-
-							}
-				
-					}
-				}
-			}
-
-			gpuVote[i] = sum;
-		}
-
-		template<typename T>
-		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//get the number of pixels in the image
-			unsigned int pixels = x * y;
-			unsigned int bytes = sizeof(T) * pixels;
-
-			
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			//unsigned int thread_dim = sqrt(max_threads);
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
-			
-			// Allocate CUDA array in device memory
-			
-			//define a channel descriptor for a single 32-bit channel
-			cudaChannelFormatDesc channelDesc =
-					   cudaCreateChannelDesc(32, 32, 0, 0,
-											 cudaChannelFormatKindFloat);
-			cudaArray* cuArray;												//declare the cuda array
-			cudaMallocArray(&cuArray, &channelDesc, x, y);			//allocate the cuda array
-
-			// Copy the image data from global memory to the array
-			cudaMemcpyToArray(cuArray, 0, 0, gpuGrad, bytes*2,
-							  cudaMemcpyDeviceToDevice);
-
-			// Specify texture
-			struct cudaResourceDesc resDesc;				//create a resource descriptor
-			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
-			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
-			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
-
-			// Specify texture object parameters
-			struct cudaTextureDesc texDesc;							//create a texture descriptor
-			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
-			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
-			texDesc.addressMode[1]   = cudaAddressModeWrap;
-			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
-			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
-			texDesc.normalizedCoords = 0;							//not using normalized coordinates
-
-			// Create texture object
-			cudaTextureObject_t texObj = 0;
-			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
-
-			// specify  share memory
-			unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
-			
-
-			//call the kernel to do the voting
-			
-			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y);
-
-		}
-
-
-		template<typename T>
-		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);		
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient Magnitude data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-
-			//cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table );
-
-			
-			//call the GPU version of the vote calculation function
-			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-
-				
-			//copy the Vote Data back to the CPU
-			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-
-
-#endif
\ No newline at end of file
diff --git a/stim/image/image.h b/stim/image/image.h
index 0db4d91..8bbb507 100644
--- a/stim/image/image.h
+++ b/stim/image/image.h
@@ -174,7 +174,7 @@ public:
 
 	
 	/// Returns the maximum pixel value in the image
-	T max(){
+	T maxv(){
 		float max = 0;
 		unsigned long N = width() * height();		//get the number of pixels
 
@@ -190,7 +190,7 @@ public:
 	}
 
 	/// Returns the minimum pixel value in the image
-	T min(){
+	T minv(){
 		float min = 0;
 		unsigned long N = width() * height();		//get the number of pixels
 
diff --git a/stim/math/vector.h b/stim/math/vector.h
index b2a7db7..7833f82 100644
--- a/stim/math/vector.h
+++ b/stim/math/vector.h
@@ -5,7 +5,7 @@
 #include <cmath>
 #include <sstream>
 #include <vector>
-#include "../cuda/callable.h"
+#include "../cuda/cudatools/callable.h"
 
 namespace stim
 {
--
libgit2 0.21.4