diff --git a/stim/cuda/ivote.cuh b/stim/cuda/ivote.cuh
deleted file mode 100644
index cc07d1d..0000000
--- a/stim/cuda/ivote.cuh
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef STIM_CUDA_IVOTE_H
-#define STIM_CUDA_IVOTE_H
-
-#include <stim/cuda/ivote/down_sample.cuh>
-#include <stim/cuda/ivote/local_max.cuh>
-#include <stim/cuda/ivote/update_dir.cuh>
-#include <stim/cuda/ivote/vote.cuh>
-
-namespace stim{
-	namespace cuda{
-	
-	}
-}
-
-
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/down_sample.cuh b/stim/cuda/ivote/down_sample.cuh
deleted file mode 100644
index be14528..0000000
--- a/stim/cuda/ivote/down_sample.cuh
+++ /dev/null
@@ -1,100 +0,0 @@
-#ifndef STIM_CUDA_DOWN_SAMPLE_H
-#define STIM_CUDA_DOWN_SAMPLE_H
-
-#include <iostream>
-#include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/templates/gaussian_blur.cuh>
-
-namespace stim{
-	namespace cuda{
-
-		template<typename T>
-		__global__ void down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
-
-			unsigned int sigma_ds = 1/resize;
-			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
-			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
-			
-			
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y;
-			// convert 2D coordinates to 1D
-			int i = yi * x_ds + xi;
-			
-			if(xi< x_ds && yi< y_ds){
-
-				int x_org = xi * sigma_ds ;
-				int y_org = yi * sigma_ds ;
-				int i_org = y_org * x + x_org;
-				gpuI[i] = gpuI0[i_org];
-			}
-
-		}
-
-
-		/// Applies a Gaussian blur to a 2D image stored on the GPU
-		template<typename T>
-		void gpu_down_sample(T* gpuI, T* gpuI0, T resize, size_t x, size_t y){
-
-			
-			unsigned int sigma_ds = (unsigned int)(1.0f/resize);
-			size_t x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
-			size_t y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
-			
-			//get the number of pixels in the image
-//			unsigned int pixels_ds = x_ds * y_ds;
-			
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds);
-			
-			stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y);
-			
-			//resample the image
-			down_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
-
-		}
-
-		/// Applies a Gaussian blur to a 2D image stored on the CPU
-		template<typename T>
-		void cpu_down_sample(T* re_img, T* image, T resize, unsigned int x, unsigned int y){
-
-			//get the number of pixels in the image
-			unsigned int pixels = x * y;
-			unsigned int bytes = sizeof(T) * pixels;
-			
-			unsigned int sigma_ds = 1/resize;
-			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
-			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
-			unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
-			
-
-
-			//allocate space on the GPU for the original image
-			T* gpuI0;
-			cudaMalloc(&gpuI0, bytes);
-			
-			
-			//copy the image data to the GPU
-			cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
-
-			//allocate space on the GPU for the down sampled image
-			T* gpuI;
-			cudaMalloc(&gpuI, bytes_ds);
-
-			//run the GPU-based version of the algorithm
-			gpu_down_sample<T>(gpuI, gpuI0, resize, x, y);
-
-			//copy the image data to the GPU
-			cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
-
-			cudaFree(gpuI0);
-			cudeFree(gpuI);
-		}
-	
-	}
-}
-
-#endif
diff --git a/stim/cuda/ivote/local_max.cuh b/stim/cuda/ivote/local_max.cuh
deleted file mode 100644
index 38b7096..0000000
--- a/stim/cuda/ivote/local_max.cuh
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef STIM_CUDA_LOCAL_MAX_H
-#define STIM_CUDA_LOCAL_MAX_H
-
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-
-namespace stim{
-	namespace cuda{
-
-		// this kernel calculates the local maximum for finding the cell centers
-		template<typename T>
-		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, size_t x, size_t y){
-
-			// calculate the 2D coordinates for this current thread.
-			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
-			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
-			
-			if(xi >= x || yi >= y)
-				return;
-						
-			// convert 2D coordinates to 1D
-			size_t i = yi * x + xi;
-			
-			gpuCenters[i] = 0;		//initialize the value at this location to zero
-			
-			T val = gpuVote[i];
-			
-			//compare to the threshold
-			//if(val < final_t) return;
-			
-			//define an array to store indices with same vote value
-			/*int * IdxEq;
-			IdxEq = new int  [2*conn];
-			int n = 0;*/
-			
-			for(int xl = xi - conn; xl < xi + conn; xl++){
-				for(int yl = yi - conn; yl < yi + conn; yl++){
-					if(xl >= 0 && xl < x && yl >= 0 && yl < y){
-						int il = yl * x + xl;
-						if(gpuVote[il] > val){							
-							return;
-							}
-						if (gpuVote[il] == val){
-							/*IdxEq[n] = il;
-							n = n+1;*/
-							 if( il > i){
-								 return;
-							}
-						}
-					}							
-				}
-			}
-			/*if (n!=0){
-				if(IdxEq[n/2] !=i){
-					return;
-				}
-			}	*/	
-			//gpuCenters[i] = 1;
-			gpuCenters[i] = gpuVote[i];
-		}
-		
-		template<typename T>
-		void gpu_local_max(T* gpuCenters, T* gpuVote,  unsigned int conn, size_t x, size_t y){
-
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			/*dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/
-			dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
-			dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
-			
-			//call the kernel to find the local maximum.
-			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, conn, x, y);
-		}
-
-		template<typename T>
-		void cpu_local_max(T* cpuCenters, T* cpuVote, unsigned int conn, unsigned int x, unsigned int y){
-		
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			// allocate space on the GPU for the detected cell centes
-			T* gpuCenters;
-			cudaMalloc(&gpuCenters, bytes);		
-
-			//allocate space on the GPU for the input Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);		
-
-			//copy the Vote image data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the local max function
-			gpu_local_max<T>(gpuCenters, gpuVote, conn, x, y);
-							
-			//copy the cell centers data to the CPU
-			cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ;
-						
-			//free allocated memory
-			cudaFree(gpuCenters);
-			cudaFree(gpuVote);
-		}
-		
-	}
-}
-
-
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/re_sample.cuh b/stim/cuda/ivote/re_sample.cuh
deleted file mode 100644
index 06bfeb7..0000000
--- a/stim/cuda/ivote/re_sample.cuh
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef STIM_CUDA_RE_SAMPLE_H
-#define STIM_CUDA_RE_SAMPLE_H
-
-#include <iostream>
-#include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/templates/gaussian_blur.cuh>
-
-namespace stim{
-	namespace cuda{
-
-		template<typename T>
-		__global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
-
-			unsigned int sigma_ds = 1/resize;
-			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
-			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
-			
-			
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y;
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-			
-			if(xi< x && yi< y){
-				if(xi%sigma_ds==0){
-					if(yi%sigma_ds==0){
-						gpuI[i] = gpuI0[(yi/sigma_ds)*x_ds + xi/sigma_ds];
-					}
-				}
-				else gpuI[i] = 0;
-
-				//int x_org = xi * sigma_ds ;
-				//int y_org = yi * sigma_ds ;
-				//int i_org = y_org * x + x_org;
-				//gpuI[i] = gpuI0[i_org];
-			}
-
-		}
-
-
-		/// Applies a Gaussian blur to a 2D image stored on the GPU
-		template<typename T>
-		void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
-
-			
-			//unsigned int sigma_ds = 1/resize;
-			//unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
-			//unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
-			
-			//get the number of pixels in the image
-			//unsigned int pixels_ds = x_ds * y_ds;
-			
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
-			
-			//stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y);
-			
-			//resample the image
-			cuda_re_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
-
-		}
-
-		/// Applies a Gaussian blur to a 2D image stored on the CPU
-		template<typename T>
-		void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){
-
-			//get the number of pixels in the image
-			unsigned int pixels = x*y;
-			unsigned int bytes = sizeof(T) * pixels;
-			
-			unsigned int sigma_ds = 1/resize;
-			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
-			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
-			unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
-			
-
-
-			//allocate space on the GPU for the original image
-			T* gpuI0;
-			cudaMalloc(&gpuI0, bytes_ds);
-			
-			
-			//copy the image data to the GPU
-			cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice);
-
-			//allocate space on the GPU for the down sampled image
-			T* gpuI;
-			cudaMalloc(&gpuI, bytes);
-
-			//run the GPU-based version of the algorithm
-			gpu_re_sample<T>(gpuI, gpuI0, resize, x, y);
-
-			//copy the image data to the GPU
-			cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
-
-			cudaFree(gpuI0);
-			cudeFree(gpuI);
-		}
-	
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/update_dir.cuh b/stim/cuda/ivote/update_dir.cuh
deleted file mode 100644
index 3052bf4..0000000
--- a/stim/cuda/ivote/update_dir.cuh
+++ /dev/null
@@ -1,217 +0,0 @@
-#ifndef STIM_CUDA_UPDATE_DIR_H
-#define STIM_CUDA_UPDATE_DIR_H
-
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-
-namespace stim{
-	namespace cuda{
-	
-		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
-		template<typename T>
-		__global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
-
-			//generate a pointer to shared memory (size will be specified as a kernel parameter)
-			extern __shared__ float s_vote[];
-
-			//calculate the start point for this block
-			int bxi = blockIdx.x * blockDim.x;
-			
-			// calculate the 2D coordinates for this current thread.
-			int xi = bxi + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-
-			// calculate the voting direction based on the grtadient direction
-			float theta = gpuGrad[2*i];
-
-			//initialize the vote direction to zero
-			gpuDir[i] = 0;
-
-			// define a local variable to maximum value of the vote image in the voting area for this voter
-			float max = 0;
-
-			// define two local variables for the x and y coordinations where the maximum happened
-			int id_x = 0;
-			int id_y = 0;
-
-			//calculate the width of the shared memory block
-			int swidth = 2 * rmax + blockDim.x;
-
-			// compute the size of window which will be checked for finding the voting area for this voter
-			int x_table = 2*rmax +1;
-			int rmax_sq = rmax * rmax;
-			int tx_rmax = threadIdx.x + rmax;
-			int bxs = bxi - rmax;
-						
-			for(int yr = -rmax; yr <= rmax; yr++){
-
-				//copy the portion of the image necessary for this block to shared memory
-				__syncthreads();
-				stim::cuda::sharedMemcpy_tex2D<float>(s_vote, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
-				__syncthreads();
-				
-				//if the current thread is outside of the image, it doesn't have to be computed
-				if(xi < x && yi < y){
-
-					for(int xr = -rmax; xr <= rmax; xr++){
-
-						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
-
-						// calculate the angle between the voter and the current pixel in x and y directions
-						float atan_angle = gpuTable[ind_t];
-						
-						// calculate the voting direction based on the grtadient direction
-						int idx_share_update = xr + tx_rmax ;
-						float share_vote = s_vote[idx_share_update];
-						
-						// check if the current pixel is located in the voting area of this voter.
-						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-
-						// compare the vote value of this pixel with the max value to find the maxima and its index.
-							if  (share_vote>max) {
-
-								max = share_vote;
-								id_x =  xr;
-								id_y =  yr;
-							}
-						}
-					}
-				}
-			}
-							
-		unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
-		float new_angle = gpuTable[ind_m];
-
-		if(xi < x && yi < y)
-			gpuDir[i] = new_angle;
-
-		}
-
-		// this kernel updates the gradient direction by the calculated voting direction.
-		template<typename T>
-		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
-
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-		
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-			
-			//update the gradient image with the vote direction
-			gpuGrad[2*i] = gpuDir[i];
-		}
-		
-		template<typename T>
-		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//get the number of pixels in the image
-			unsigned int pixels = x * y;
-			unsigned int bytes = sizeof(T) * pixels;
-						
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
-			
-			//define a channel descriptor for a single 32-bit channel
-			cudaChannelFormatDesc channelDesc =
-					   cudaCreateChannelDesc(32, 0, 0, 0,
-											 cudaChannelFormatKindFloat);
-			cudaArray* cuArray;												//declare the cuda array
-			cudaMallocArray(&cuArray, &channelDesc, x, y);			//allocate the cuda array
-
-			// Copy the image data from global memory to the array
-			cudaMemcpyToArray(cuArray, 0, 0, gpuVote, bytes,
-							  cudaMemcpyDeviceToDevice);
-
-			// Specify texture
-			struct cudaResourceDesc resDesc;				//create a resource descriptor
-			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
-			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
-			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
-
-			// Specify texture object parameters
-			struct cudaTextureDesc texDesc;							//create a texture descriptor
-			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
-			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
-			texDesc.addressMode[1]   = cudaAddressModeWrap;
-			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
-			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
-			texDesc.normalizedCoords = 0;							//not using normalized coordinates
-
-			// Create texture object
-			cudaTextureObject_t texObj = 0;
-			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
-
-			// specify  share memory
-			unsigned int share_bytes = (2*rmax + threads.x)*(1)*4;
-			
-			// allocate space on the GPU for the updated vote direction
-			T* gpuDir;
-			cudaMalloc(&gpuDir, bytes);	
-
-			//call the kernel to calculate the new voting direction
-			cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, texObj, gpuGrad, gpuTable, phi, rmax, x , y);
-
-			//call the kernel to update the gradient direction
-			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
-			
-			//free allocated memory
-			cudaFree(gpuDir);
-
-			cudaDestroyTextureObject(texObj);
-			cudaFreeArray(cuArray);
-
-		}
-		
-		template<typename T>
-		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);
-
-			//copy the input vote image to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the update direction function
-			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the new gradient image back to the CPU
-			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
diff --git a/stim/cuda/ivote/update_dir_bb.cuh b/stim/cuda/ivote/update_dir_bb.cuh
deleted file mode 100644
index 43869ec..0000000
--- a/stim/cuda/ivote/update_dir_bb.cuh
+++ /dev/null
@@ -1,181 +0,0 @@
-#ifndef STIM_CUDA_UPDATE_DIR_BB_H
-#define STIM_CUDA_UPDATE_DIR_BB_H
-
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include <stim/visualization/aabb2.h>
-#include <stim/visualization/colormap.h>
-#include <math.h> 
-
-//#define RMAX_TEST	8
-
-namespace stim{
-	namespace cuda{
-
-		template<typename T>
-		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
-			extern __shared__ T S[];
-			T* shared_atan = S;
-			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
-			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
-
-			//T* shared_vote = &S[n_table];
-			//size_t template_size_x = (blockDim.x + 2 * rmax);
-			//size_t template_size_y = (blockDim.y + 2 * rmax);
-			//stim::cuda::threadedMemcpy2D((char*)shared_vote, (char*)gpuVote, template_size_x, template_size_y, x,  threadIdx.y * blockDim.x + threadIdx.x, blockDim.x * blockDim.y);
-			
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;				//calculate the 2D coordinates for this current thread.
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-
-			if(xi >= x || yi >= y) return;								//if the index is outside of the image, terminate the kernel
-
-			int i = yi * x + xi;										//convert 2D coordinates to 1D
-			float theta = gpuGrad[2*i];									//calculate the voting direction based on the grtadient direction - global memory fetch
-			
-			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
-			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));
-			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
-			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
-
-			int x_table = 2*rmax +1;
-			T rmax_sq = rmax * rmax;
-
-			int lut_i;
-			T dx_sq, dy_sq;
-
-			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
-			bb.trim_high(x-1, y-1);
-
-			int by, bx;
-			int dx, dy;													//coordinate relative to (xi, yi)
-			
-			T v;
-			T max_v = 0;												//initialize the maximum vote value to zero
-			T alpha;
-			int max_dx = bb.low[0] - xi;
-			int max_dy = bb.low[1] - yi;
-			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
-				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
-				dy_sq = dy * dy;
-				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
-					dx = bx - xi;
-					dx_sq = dx * dx;
-					lut_i = (rmax - dy) * x_table + rmax - dx;
-					alpha = shared_atan[lut_i];
-					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
-						v = gpuVote[by * x + bx];				// find the vote value for the current counter
-						if(v > max_v){
-							max_v = v;
-							max_dx = dx;
-							max_dy = dy;
-						}
-					}
-				}
-			}			
-			gpuDir[i] = atan2((T)max_dy, (T)max_dx);
-		}
-	
-		
-
-		// this kernel updates the gradient direction by the calculated voting direction.
-		template<typename T>
-		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, size_t x, size_t y){
-
-			// calculate the 2D coordinates for this current thread.
-			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
-			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
-
-			if(xi >= x || yi >= y) return;
-		
-			// convert 2D coordinates to 1D
-			size_t i = yi * x + xi;
-			
-			//update the gradient image with the vote direction
-			gpuGrad[2*i] = gpuDir[i];
-		}
-		
-		template<typename T>
-		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y){
-
-			//calculate the number of bytes in the array
-			size_t bytes = x * y * sizeof(T);
-			
-			// allocate space on the GPU for the updated vote direction
-			T* gpuDir;
-			HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );	
-
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			
-			dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
-			dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
-
-			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
-			//size_t curtain = 2 * rmax;
-			//size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain);
-			size_t shared_mem_req = table_bytes;// + template_bytes;
-			if (DEBUG) std::cout << "Shared Memory required: " << shared_mem_req << std::endl;
-
-			size_t shared_mem = stim::sharedMemPerBlock();
-			if(shared_mem_req > shared_mem){
-				std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
-				exit(1);
-			}
-
-			//call the kernel to calculate the new voting direction
-			cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, (int)x , (int)y);
-
-			//call the kernel to update the gradient direction
-			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, (int)x , (int)y);
-			//free allocated memory
-			HANDLE_ERROR( cudaFree(gpuDir) );
-
-		}
-		
-		template<typename T>
-		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);
-
-			//copy the input vote image to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the update direction function
-			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the new gradient image back to the CPU
-			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/update_dir_shared.cuh b/stim/cuda/ivote/update_dir_shared.cuh
deleted file mode 100644
index 91aa717..0000000
--- a/stim/cuda/ivote/update_dir_shared.cuh
+++ /dev/null
@@ -1,184 +0,0 @@
-#ifndef STIM_CUDA_UPDATE_DIR_SHARED_H
-#define STIM_CUDA_UPDATE_DIR_SHARED_H
-
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include "cpyToshare.cuh"
-
-namespace stim{
-	namespace cuda{
-	
-		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
-		template<typename T>
-		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
-
-			//generate a pointer to shared memory (size will be specified as a kernel parameter)
-			extern __shared__ float s_vote[];
-
-			//calculate the start point for this block
-			int bxi = blockIdx.x * blockDim.x;
-			
-			// calculate the 2D coordinates for this current thread.
-			int xi = bxi + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-
-			// calculate the voting direction based on the grtadient direction
-			float theta = gpuGrad[2*i];
-
-			//initialize the vote direction to zero
-			gpuDir[i] = 0;
-
-			// define a local variable to maximum value of the vote image in the voting area for this voter
-			float max = 0;
-
-			// define two local variables for the x and y coordinations where the maximum happened
-			int id_x = 0;
-			int id_y = 0;
-
-			//calculate the width of the shared memory block
-			int swidth = 2 * rmax + blockDim.x;
-
-			// compute the size of window which will be checked for finding the voting area for this voter
-			int x_table = 2*rmax +1;
-			int rmax_sq = rmax * rmax;
-			int tx_rmax = threadIdx.x + rmax;
-			int bxs = bxi - rmax;
-						
-			for(int yr = -rmax; yr <= rmax; yr++){
-				//if (yi+yr >= 0 && yi + yr < y){
-					//copy the portion of the image necessary for this block to shared memory
-					__syncthreads();
-					cpyG2S1D<float>(s_vote, gpuVote, bxs, yi + yr , swidth, 1, threadIdx, blockDim, x, y);
-					__syncthreads();
-				
-					//if the current thread is outside of the image, it doesn't have to be computed
-					if(xi < x && yi < y){
-
-						for(int xr = -rmax; xr <= rmax; xr++){
-
-							unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
-
-							// calculate the angle between the voter and the current pixel in x and y directions
-							float atan_angle = gpuTable[ind_t];
-						
-							// calculate the voting direction based on the grtadient direction
-							int idx_share_update = xr + tx_rmax ;
-							float share_vote = s_vote[idx_share_update];
-						
-							// check if the current pixel is located in the voting area of this voter.
-							if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-
-							// compare the vote value of this pixel with the max value to find the maxima and its index.
-								if  (share_vote>max) {
-
-									max = share_vote;
-									id_x =  xr;
-									id_y =  yr;
-								}
-							}
-						}
-					}
-				//}
-			}
-							
-		unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
-		float new_angle = gpuTable[ind_m];
-
-		if(xi < x && yi < y)
-			gpuDir[i] = new_angle;
-
-		}
-
-		// this kernel updates the gradient direction by the calculated voting direction.
-		template<typename T>
-		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
-
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-		
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-			
-			//update the gradient image with the vote direction
-			gpuGrad[2*i] = gpuDir[i];
-		}
-		
-		template<typename T>
-		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
-			
-			// specify  share memory
-			unsigned int share_bytes = (2*rmax + threads.x)*(1)*4;
-			
-			// allocate space on the GPU for the updated vote direction
-			T* gpuDir;
-			cudaMalloc(&gpuDir, bytes);	
-
-			//call the kernel to calculate the new voting direction
-			cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-
-			//call the kernel to update the gradient direction
-			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
-			
-			//free allocated memory
-			cudaFree(gpuDir);
-
-		}
-		
-		template<typename T>
-		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);
-
-			//copy the input vote image to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the update direction function
-			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the new gradient image back to the CPU
-			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/update_dir_threshold_global.cuh b/stim/cuda/ivote/update_dir_threshold_global.cuh
deleted file mode 100644
index f73efa6..0000000
--- a/stim/cuda/ivote/update_dir_threshold_global.cuh
+++ /dev/null
@@ -1,159 +0,0 @@
-#ifndef STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBALD_H
-#define STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H
-
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include "cpyToshare.cuh"   
-
-namespace stim{
-	namespace cuda{
-	
-		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
-		template<typename T>
-		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x,  int y){
-
-			
-			
-			// calculate the coordinate for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			// calculate the voting direction based on the grtadient direction
-			float theta = gpuTh[3*xi];
-			
-			//calculate the position and x, y coordinations of this voter in the original image
-			unsigned int i_v = gpuTh[3*xi+2];
-			unsigned int y_v = i_v/x;
-			unsigned int x_v = i_v - (y_v*x);
-			
-			//initialize the vote direction to zero
-			gpuDir[xi] = 0;
-
-			// define a local variable to maximum value of the vote image in the voting area for this voter
-			float max = 0;
-
-			// define two local variables for the x and y coordinations where the maximum happened
-			int id_x = 0;
-			int id_y = 0;
-
-			// compute the size of window which will be checked for finding the voting area for this voter
-			int x_table = 2*rmax +1;
-			int rmax_sq = rmax * rmax;
-			int tx_rmax = threadIdx.x + rmax;
-			if(xi < th_size){
-				
-				for(int yr = -rmax; yr <= rmax; yr++){
-					
-					for(int xr = -rmax; xr <= rmax; xr++){
-
-						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
-
-						// find the angle between the voter and the current pixel in x and y directions
-						float atan_angle = gpuTable[ind_t];
-										
-						// check if the current pixel is located in the voting area of this voter.
-						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-							// find the vote value for the current counter
-							float vote_c = gpuVote[(y_v+yr)*x + (x_v+xr)];
-							// compare the vote value of this pixel with the max value to find the maxima and its index.
-							if  (vote_c>max) {
-
-								max = vote_c;
-								id_x =  xr;
-								id_y =  yr;
-							}
-						}
-					}
-				}
-			
-							
-				unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
-				float new_angle = gpuTable[ind_m];
-				gpuDir[xi] = new_angle;
-			}
-
-		}
-
-		// this kernel updates the gradient direction by the calculated voting direction.
-		template<typename T>
-		__global__ void cuda_update_grad(T* gpuTh, T* gpuDir, int th_size, int x, int y){
-
-			// calculate the coordinate for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			
-		
-			//update the gradient image with the vote direction
-			gpuTh[3*xi] = gpuDir[xi];
-		}
-		
-		template<typename T>
-		void gpu_update_dir(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes_th = th_size* sizeof(T);
-
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads);
-			dim3 blocks(th_size/threads.x+1);
-			
-			// allocate space on the GPU for the updated vote direction
-			T* gpuDir;
-			cudaMalloc(&gpuDir, bytes_th);	
-
-			//call the kernel to calculate the new voting direction
-			cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
-
-			//call the kernel to update the gradient direction
-			cuda_update_grad <<< blocks, threads >>>(gpuTh, gpuDir, th_size, x , y);
-			
-			//free allocated memory
-			cudaFree(gpuDir);
-
-		}
-		
-		template<typename T>
-		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);
-
-			//copy the input vote image to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the update direction function
-			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the new gradient image back to the CPU
-			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/vote.cuh b/stim/cuda/ivote/vote.cuh
deleted file mode 100644
index 94250eb..0000000
--- a/stim/cuda/ivote/vote.cuh
+++ /dev/null
@@ -1,175 +0,0 @@
-#ifndef STIM_CUDA_VOTE_H
-#define STIM_CUDA_VOTE_H
-
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-
-namespace stim{
-	namespace cuda{
-
-		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
-		template<typename T>
-		__global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, int rmax, int x, int y){
-
-			//generate a pointer to shared memory (size will be specified as a kernel parameter)
-			extern __shared__ float2 s_grad[];
-
-			//calculate the start point for this block
-			int bxi = blockIdx.x * blockDim.x;
-			
-			// calculate the 2D coordinates for this current thread.
-			int xi = bxi + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-						
-			// define a local variable to sum the votes from the voters
-			float sum = 0;
-			
-			//calculate the width of the shared memory block
-			int swidth = 2 * rmax + blockDim.x;
-			
-			// compute the size of window which will be checked for finding the proper voters for this pixel
-			int x_table = 2*rmax +1;
-			int rmax_sq = rmax * rmax;
-			int tx_rmax = threadIdx.x + rmax;
-			int bxs = bxi - rmax;
-			
-			//for every line (along y)
-			for(int yr = -rmax; yr <= rmax; yr++){
-
-				//copy the portion of the image necessary for this block to shared memory
-				__syncthreads();
-				stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
-				__syncthreads();
-				
-				if(xi < x && yi < y){
-
-					for(int xr = -rmax; xr <= rmax; xr++){
-					
-							//find the location of this voter in the atan2 table
-							int id_t = (yr + rmax) * x_table + xr + rmax;
-
-							// calculate the angle between the pixel and the current voter in x and y directions
-							float atan_angle = gpuTable[id_t];
-												
-							// calculate the voting direction based on the grtadient direction
-							int idx_share = xr + tx_rmax ;
-							float2 g = s_grad[idx_share];
-							float theta = g.x;
-
-							// check if the current voter is located in the voting area of this pixel.
-							if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-								sum += g.y;		
-
-							}
-					}
-				
-				}
-			}
-			if(xi < x && yi < y)
-				gpuVote[i] = sum;
-			
-		}
-
-		template<typename T>
-		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//get the number of pixels in the image
-			unsigned int pixels = x * y;
-			unsigned int bytes = sizeof(T) * pixels;
-						
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
-			
-			// Allocate CUDA array in device memory
-			
-			//define a channel descriptor for a single 32-bit channel
-			cudaChannelFormatDesc channelDesc =
-					   cudaCreateChannelDesc(32, 32, 0, 0,
-											 cudaChannelFormatKindFloat);
-			cudaArray* cuArray;												//declare the cuda array
-			cudaMallocArray(&cuArray, &channelDesc, x, y);			//allocate the cuda array
-
-			// Copy the image data from global memory to the array
-			cudaMemcpyToArray(cuArray, 0, 0, gpuGrad, bytes*2,
-							  cudaMemcpyDeviceToDevice);
-
-			// Specify texture
-			struct cudaResourceDesc resDesc;				//create a resource descriptor
-			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
-			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
-			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
-
-			// Specify texture object parameters
-			struct cudaTextureDesc texDesc;							//create a texture descriptor
-			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
-			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
-			texDesc.addressMode[1]   = cudaAddressModeWrap;
-			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
-			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
-			texDesc.normalizedCoords = 0;							//not using normalized coordinates
-
-			// Create texture object
-			cudaTextureObject_t texObj = 0;
-			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
-
-			// specify  share memory
-			unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
-			
-			//call the kernel to do the voting
-			
-			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y);
-
-			cudaDestroyTextureObject(texObj);
-			cudaFreeArray(cuArray);
-
-		}
-
-
-		template<typename T>
-		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);		
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient Magnitude data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the vote calculation function
-			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the Vote Data back to the CPU
-			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
diff --git a/stim/cuda/ivote/vote_atomic.cuh b/stim/cuda/ivote/vote_atomic.cuh
deleted file mode 100644
index fc0ce47..0000000
--- a/stim/cuda/ivote/vote_atomic.cuh
+++ /dev/null
@@ -1,116 +0,0 @@
-#ifndef STIM_CUDA_VOTE_ATOMIC_H
-#define STIM_CUDA_VOTE_ATOMIC_H
-
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include "cpyToshare.cuh"
-
-namespace stim{
-	namespace cuda{
-
-		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
-		template<typename T>
-		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
-
-			
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-
-			// calculate the voting direction based on the grtadient direction
-			float theta = gpuGrad[2*i];
-			//calculate the amount of vote for the voter
-			float mag = gpuGrad[2*i + 1];
-						
-			// compute the size of window which will be checked for finding the proper voters for this pixel
-			int x_table = 2*rmax +1;
-			int rmax_sq = rmax * rmax;
-			if(xi < x && yi < y){
-			//for every line (along y)
-				for(int yr = -rmax; yr <= rmax; yr++){					
-					for(int xr = -rmax; xr <= rmax; xr++){
-						if ((yi+yr)>=0 && (yi+yr)<y && (xi+xr)>=0 && (xi+xr)<x){
-					
-							//find the location of the current pixel in the atan2 table
-							unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
-
-							// calculate the angle between the voter and the current pixel in x and y directions
-							float atan_angle = gpuTable[ind_t];
-						
-							// check if the current pixel is located in the voting area of this voter.
-							if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-								// calculate the 1D index for the current pixel in global memory
-								unsigned int ind_g = (yi+yr)*x + (xi+xr);
-								atomicAdd(&gpuVote[ind_g], mag);
-							
-								}
-						}
-					}
-				}	
-			}
-		}
-
-		template<typename T>
-		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-							
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
-					
-			// specify  share memory
-			//unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
-			
-			//call the kernel to do the voting
-			cuda_vote <<< blocks, threads>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-
-		}
-
-
-		template<typename T>
-		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);		
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient Magnitude data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the vote calculation function
-			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the Vote Data back to the CPU
-			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/vote_atomic_bb.cuh b/stim/cuda/ivote/vote_atomic_bb.cuh
deleted file mode 100644
index 5a05001..0000000
--- a/stim/cuda/ivote/vote_atomic_bb.cuh
+++ /dev/null
@@ -1,151 +0,0 @@
-#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
-#define STIM_CUDA_VOTE_ATOMIC_BB_H
-
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include <stim/visualization/aabb2.h>
-#include <stim/visualization/colormap.h>
-#include <math.h>
-
-
-
-namespace stim{
-	namespace cuda{
-
-		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
-		template<typename T>
-		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, size_t x, size_t y, bool gradmag = true){
-
-			extern __shared__ T S[];
-			T* shared_atan = S;
-			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
-			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
-			
-			// calculate the 2D coordinates for this current thread.
-			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
-			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
-			
-			if(xi >= x || yi >= y) return;			
-			// convert 2D coordinates to 1D
-			size_t i = yi * x + xi;
-
-			// calculate the voting direction based on the grtadient direction
-			float theta = gpuGrad[2*i];
-			//calculate the amount of vote for the voter
-			float mag = gpuGrad[2*i + 1];
-			
-
-			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
-			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));
-			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
-			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
-			
-			// compute the size of window which will be checked for finding the proper voters for this pixel
-			int x_table = 2*rmax +1;
-			int rmax_sq = rmax * rmax;
-			
-			int lut_i;
-			T dx_sq, dy_sq;
-
-			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
-			bb.trim_high(x-1, y-1);
-
-			size_t by, bx;
-			int dx, dy;					
-			
-			unsigned int ind_g;											//initialize the maximum vote value to zero
-			T alpha;
-			
-			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
-				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
-				dy_sq = dy * dy;
-				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
-					dx = bx - xi;
-					dx_sq = dx * dx;
-					lut_i = (rmax - dy) * x_table + rmax - dx;
-					alpha = shared_atan[lut_i];
-					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
-						ind_g = (by)*x + (bx);
-						if(gradmag) atomicAdd(&gpuVote[ind_g], mag);			//add the gradient magnitude (if the gradmag flag is enabled)
-						else		atomicAdd(&gpuVote[ind_g], 1.0f);			//otherwise just add 1
-					
-					}
-				}
-			}			
-			
-		}
-	
-
-		/// Iterative voting for an image
-		/// @param gpuVote is the resulting vote image
-		/// @param gpuGrad is the gradient of the input image
-		/// @param gpuTable is the pre-computed atan2() table
-		/// @param phi is the angle of the vote region
-		/// @param rmax is the estimated radius of the blob (defines the "width" of the vote region)
-		/// @param x and y are the spatial dimensions of the gradient image
-		/// @param gradmag defines whether or not the gradient magnitude is taken into account during the vote
-		template<typename T>
-		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y, bool gradmag = true){
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
-			dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
-			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
-			size_t shared_mem_req = table_bytes;// + template_bytes;
-			if (DEBUG) std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
-			size_t shared_mem = stim::sharedMemPerBlock();
-			if(shared_mem_req > shared_mem){
-				std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
-				exit(1);
-			}
-
-			//call the kernel to do the voting
-			cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y, gradmag);
-
-		}
-
-
-		template<typename T>
-		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);		
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient Magnitude data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the vote calculation function
-			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the Vote Data back to the CPU
-			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/vote_atomic_shared.cuh b/stim/cuda/ivote/vote_atomic_shared.cuh
deleted file mode 100644
index 102bd20..0000000
--- a/stim/cuda/ivote/vote_atomic_shared.cuh
+++ /dev/null
@@ -1,166 +0,0 @@
-#ifndef STIM_CUDA_VOTE_ATOMIC_SHARED_H
-#define STIM_CUDA_VOTE_ATOMIC_SHARED_H
-
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-
-//#include "writebackshared.cuh"
-namespace stim{
-	namespace cuda{
-
-		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
-		template<typename T>
-		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
-
-			//generate a pointer to the shared memory
-			extern __shared__ float s_vote[];
-			// calculate the 2D coordinates for this current thread.
-			int bxi = blockIdx.x * blockDim.x;
-			int byi = blockIdx.y * blockDim.y;
-			int xi = bxi + threadIdx.x;
-			int yi = byi + threadIdx.y;
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-
-			// calculate the voting direction based on the gradient direction
-			float theta = gpuGrad[2*i];
-			//calculate the amount of vote for the voter
-			float mag = gpuGrad[2*i + 1];
-			
-			//find the starting points and size of window, wich will be copied to the shared memory
-			int bxs = bxi - rmax;
-			int bys = byi - rmax;
-			int xwidth = 2*rmax + blockDim.x;
-			int ywidth = 2*rmax + blockDim.y;
-			//compute the coordinations of this pixel in the 2D-shared memory.
-			int sx_rx = threadIdx.x + rmax;
-			int sy_ry = threadIdx.y + rmax;
-			// compute the size of window which will be checked for finding the counters for this voter
-			int x_table = 2*rmax +1;
-			int rmax_sq = rmax * rmax;
-			//calculate some parameters for indexing shared memory
-				//calculate the total number of threads available
-				unsigned int tThreads = blockDim.x * blockDim.y;
-				//calculate the current 1D thread ID
-				unsigned int ti =  threadIdx.y * (blockDim.x) + threadIdx.x;
-				//calculate the number of iteration required
-				unsigned int In = xwidth*ywidth/tThreads + 1;
-			if(xi < x && yi < y){
-				__syncthreads();
-				//initialize the shared memory to zero				
-				for (unsigned int i = 0; i < In; i++){								
-					unsigned int sIdx0 = i * tThreads + ti;
-					if (sIdx0< xwidth*ywidth) {
-						s_vote[sIdx0] = 0;
-					}
-				}
-				__syncthreads();
-				//for every line (along y)
-				for(int yr = -rmax; yr <= rmax; yr++){	
-					//compute the position of the current voter in the shared memory along the y axis.
-					unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
-					for(int xr = -rmax; xr <= rmax; xr++){												
-						
-						//find the location of the current pixel in the atan2 table
-						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
-
-						// calculate the angle between the voter and the current pixel in x and y directions
-						float atan_angle = gpuTable[ind_t];
-							
-						// check if the current pixel is located in the voting area of this voter.
-						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-							//compute the position of the current voter in the 2D-shared memory along the x axis.
-							unsigned int sIdx_x = (sx_rx + xr);
-							//find the 1D index of this voter in the 2D-shared memory.
-							unsigned int s_Idx = (sIdx_y1d  + sIdx_x);
-								
-							atomicAdd(&s_vote[s_Idx], mag);								
-							}
-					}
-				}	
-				//write shared memory back to global memory
-				
-				__syncthreads();						
-				for (unsigned int i = 0; i < In; i++){
-				
-					unsigned int sIdx = i * tThreads + ti;
-					if (sIdx>= xwidth*ywidth) return;
-
-					unsigned int sy = sIdx/xwidth;
-					unsigned int sx = sIdx - (sy * xwidth);
-					
-					unsigned int gx = bxs + sx;
-					unsigned int gy = bys + sy;
-					if (gx<x&& gy<y){
-						unsigned int gIdx = gy * x + gx;
-						//write shared to global memory
-						atomicAdd(&gpuVote[gIdx], s_vote[sIdx]);
-						
-					}						
-				}
-				
-			}
-		}
-
-		template<typename T>
-		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-							
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(sqrt(max_threads), sqrt(max_threads));
-			dim3 blocks(x/threads.x + 1 , y/threads.y+1);
-					
-			// specify  share memory
-			unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*sizeof(T);
-			
-			//call the kernel to do the voting
-			cuda_vote <<< blocks, threads, share_bytes>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-
-		}
-
-
-		template<typename T>
-		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);		
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient Magnitude data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the vote calculation function
-			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the Vote Data back to the CPU
-			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/vote_shared.cuh b/stim/cuda/ivote/vote_shared.cuh
deleted file mode 100644
index f53fe5d..0000000
--- a/stim/cuda/ivote/vote_shared.cuh
+++ /dev/null
@@ -1,139 +0,0 @@
-#ifndef STIM_CUDA_VOTE_SHARED_H
-#define STIM_CUDA_VOTE_SHARED
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include "cpyToshare.cuh"
-
-namespace stim{
-	namespace cuda{
-
-		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
-		template<typename T>
-		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
-
-			//generate a pointer to shared memory (size will be specified as a kernel parameter)
-			extern __shared__ float s_grad[];
-
-			//calculate the start point for this block
-			int bxi = blockIdx.x * blockDim.x;
-			
-			// calculate the 2D coordinates for this current thread.
-			int xi = bxi + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-						
-			// define a local variable to sum the votes from the voters
-			float sum = 0;
-			
-			//calculate the width of the shared memory block
-			int swidth = 2 * rmax + blockDim.x;
-			
-			// compute the size of window which will be checked for finding the proper voters for this pixel
-			int x_table = 2*rmax +1;
-			int rmax_sq = rmax * rmax;
-			int tx_rmax = threadIdx.x + rmax;
-			int bxs = bxi - rmax;
-			
-			//for every line (along y)
-			for(int yr = -rmax; yr <= rmax; yr++){
-				if (yi+yr<y && yi+yr>=0){
-					//copy the portion of the image necessary for this block to shared memory
-					__syncthreads();
-					cpyG2S1D2ch<float>(s_grad, gpuGrad, bxs, yi + yr , 2*swidth, 1, threadIdx, blockDim, x, y);
-					__syncthreads();
-				
-					if(xi < x && yi < y){
-
-						for(int xr = -rmax; xr <= rmax; xr++){
-					
-								//find the location of this voter in the atan2 table
-								int id_t = (yr + rmax) * x_table + xr + rmax;
-
-								// calculate the angle between the pixel and the current voter in x and y directions
-								float atan_angle = gpuTable[id_t];
-												
-								// calculate the voting direction based on the grtadient direction
-								int idx_share = xr + tx_rmax ;
-								float theta = s_grad[idx_share*2];
-								float mag = s_grad[idx_share*2 + 1];
-							
-
-								// check if the current voter is located in the voting area of this pixel.
-								if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-									sum += mag;		
-
-								}
-						}
-				
-					}
-				}
-			}
-			if(xi < x && yi < y)
-				gpuVote[i] = sum;
-			
-		}
-
-		template<typename T>
-		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-							
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
-			
-					
-			// specify  share memory
-			unsigned int share_bytes = (2*rmax + threads.x)*1*2*sizeof(T);
-			
-			//call the kernel to do the voting
-			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-
-		}
-
-
-		template<typename T>
-		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);		
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient Magnitude data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the vote calculation function
-			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the Vote Data back to the CPU
-			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/vote_shared_32-32.cuh b/stim/cuda/ivote/vote_shared_32-32.cuh
deleted file mode 100644
index 23c9481..0000000
--- a/stim/cuda/ivote/vote_shared_32-32.cuh
+++ /dev/null
@@ -1,150 +0,0 @@
-#ifndef STIM_CUDA_VOTE_SHARED_H
-#define STIM_CUDA_VOTE_SHARED
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include "cpyToshare.cuh"
-
-namespace stim{
-	namespace cuda{
-
-		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
-		template<typename T>
-		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
-
-			//generate a pointer to shared memory (size will be specified as a kernel parameter)
-			extern __shared__ float s_grad[];
-
-			//calculate the start point for this block
-			int bxi = blockIdx.x * blockDim.x;
-			int byi = blockIdx.y * blockDim.y;
-			// calculate the 2D coordinates for this current thread.
-			int xi = bxi + threadIdx.x;
-			int yi = byi + threadIdx.y;
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-						
-			// define a local variable to sum the votes from the voters
-			float sum = 0;
-			
-			//calculate the width of the shared memory block
-			int xwidth = 2 * rmax + blockDim.x;
-			int ywidth = 2 * rmax + blockDim.y;
-			// compute the size of window which will be checked for finding the proper voters for this pixel
-			int x_table = 2*rmax +1;
-			int rmax_sq = rmax * rmax;
-			int tx_rmax = threadIdx.x + rmax;
-			int bxs = bxi - rmax;			
-			int bys = byi - rmax;	
-			//compute the coordinations of this pixel in the 2D-shared memory.
-			int sx_rx = threadIdx.x + rmax;
-			int sy_ry = threadIdx.y + rmax;
-			//copy the portion of the image necessary for this block to shared memory
-			__syncthreads();
-			cpyG2S2D2ch<float>(s_grad, gpuGrad, bxs, bys, 2*xwidth, ywidth, threadIdx, blockDim, x, y);
-			__syncthreads();
-			
-			for(int yr = -rmax; yr <= rmax; yr++){
-				int yi_v = (yi + yr) ;
-				//compute the position of the current voter in the shared memory along the y axis.
-				unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
-				//if (yi+yr<y && yi+yr>=0){
-					if(xi < x && yi < y){
-
-						for(int xr = -rmax; xr <= rmax; xr++){
-					
-								//compute the position of the current voter in the 2D-shared memory along the x axis.
-								unsigned int sIdx_x = (sx_rx + xr);
-								//find the 1D index of this voter in the 2D-shared memory.
-								unsigned int s_Idx = (sIdx_y1d  + sIdx_x);
-								unsigned int s_Idx2 = s_Idx * 2;
-								
-								//find the location of this voter in the atan2 table
-								int id_t = (yr + rmax) * x_table + xr + rmax;
-
-								// calculate the angle between the pixel and the current voter in x and y directions
-								float atan_angle = gpuTable[id_t];
-												
-								// calculate the voting direction based on the grtadient direction
-								//int idx_share = xr + tx_rmax ;
-								float theta = s_grad[s_Idx2];
-								float mag = s_grad[s_Idx2 + 1];
-							
-
-								// check if the current voter is located in the voting area of this pixel.
-								if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-									sum += mag;		
-
-								}
-						}
-				
-					}
-				//}
-			}
-			if(xi < x && yi < y)
-				gpuVote[i] = sum;
-			
-		}
-
-		template<typename T>
-		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-							
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(sqrt(max_threads), sqrt(max_threads));
-			dim3 blocks(x/threads.x + 1 , y/threads.y+1);
-			
-					
-			// specify  share memory
-			unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*2*sizeof(T);
-			
-			//call the kernel to do the voting
-			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-
-		}
-
-
-		template<typename T>
-		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);		
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient Magnitude data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the vote calculation function
-			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the Vote Data back to the CPU
-			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/vote_threshold_global.cuh b/stim/cuda/ivote/vote_threshold_global.cuh
deleted file mode 100644
index 7a944f1..0000000
--- a/stim/cuda/ivote/vote_threshold_global.cuh
+++ /dev/null
@@ -1,112 +0,0 @@
-#ifndef STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
-#define STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include "cpyToshare.cuh"
-
-namespace stim{
-	namespace cuda{
-
-		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
-		template<typename T>
-		__global__ void cuda_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){
-
-						
-			// calculate the x coordinate for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			
-			// calculate the voting direction based on the grtadient direction
-			float theta = gpuTh[3*xi];
-			//find the gradient magnitude for the current voter
-			float mag = gpuTh[3*xi + 1];
-			//calculate the position and x, y coordinations of this voter in the original image
-			unsigned int i_v = gpuTh[3*xi+2];
-			unsigned int y_v = i_v/x;
-			unsigned int x_v = i_v - (y_v*x);
-			
-			// compute the size of window which will be checked for finding the proper voters for this pixel
-			int x_table = 2*rmax +1;
-			int rmax_sq = rmax * rmax;
-			if(xi < th_size){
-				for(int yr = -rmax; yr <= rmax; yr++){					
-					for(int xr = -rmax; xr <= rmax; xr++){
-						if ((y_v+yr)>=0 && (y_v+yr)<y && (x_v+xr)>=0 && (x_v+xr)<x){
-					
-							//find the location of the current pixel in the atan2 table
-							unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
-
-							// calculate the angle between the voter and the current pixel in x and y directions
-							float atan_angle = gpuTable[ind_t];
-						
-							// check if the current pixel is located in the voting area of this voter.
-							if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-								// calculate the 1D index for the current pixel in global memory
-								unsigned int ind_g = (y_v+yr)*x + (x_v+xr);
-								atomicAdd(&gpuVote[ind_g], mag);
-							
-								}
-						}
-					}
-				}	
-			}
-		}
-
-		template<typename T>
-		void gpu_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
-
-							
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads);
-			dim3 blocks(th_size/threads.x + 1);
-			
-			//call the kernel to do the voting
-			cuda_vote <<< blocks, threads>>>(gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
-
-		}
-
-
-		template<typename T>
-		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);		
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient Magnitude data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the vote calculation function
-			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the Vote Data back to the CPU
-			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote_atomic_bb.cuh b/stim/cuda/ivote_atomic_bb.cuh
deleted file mode 100644
index 07f3224..0000000
--- a/stim/cuda/ivote_atomic_bb.cuh
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H
-#define STIM_CUDA_IVOTE_ATOMIC_BB_H
-
-extern bool DEBUG;
-#include <stim/cuda/ivote/down_sample.cuh>
-#include <stim/cuda/ivote/local_max.cuh>
-#include <stim/cuda/ivote/update_dir_bb.cuh>
-#include <stim/cuda/ivote/vote_atomic_bb.cuh>
-
-namespace stim{
-	namespace cuda{
-	
-	}
-}
-
-
-
-#endif
\ No newline at end of file
diff --git a/stim/iVote/ivote2.cuh b/stim/iVote/ivote2.cuh
new file mode 100644
index 0000000..e88dab8
--- /dev/null
+++ b/stim/iVote/ivote2.cuh
@@ -0,0 +1,173 @@
+#ifndef STIM_IVOTE2_CUH
+#define STIM_IVOTE2_CUH
+
+#include <iostream>
+#include <fstream>
+#include <stim/cuda/cudatools/error.h>
+#include <stim/cuda/templates/gradient.cuh>
+#include <stim/cuda/arraymath.cuh>
+#include <stim/iVote/ivote2/ivote2.cuh>
+#include <stim/math/constants.h>
+#include <stim/math/vector.h>
+#include <stim/visualization/colormap.h>
+
+namespace stim {
+
+	/// Fills cpuTable with atan2 values for a square window of side (2*rmax + 1).
+	/// The entry for window cell (xt, yt) is stored at index yt * (2*rmax + 1) + xt and holds
+	/// atan2(rmax - yt, rmax - xt), i.e. the angle of the offset from that cell to the window centre.
+	/// @param cpuTable	output buffer with room for (2*rmax + 1)^2 elements
+	/// @param rmax		half-width of the window
+	template<typename T>
+	void atan_2(T* cpuTable, unsigned int rmax) {
+		const int center = rmax;						//the centre cell has the same index along both axes
+		const int side = 2 * rmax + 1;					//the window is square
+		for (int col = 0; col < side; col++) {
+			const T dx = (T)(center - col);				//horizontal offset from this column to the centre
+			for (int row = 0; row < side; row++) {
+				const T dy = (T)(center - row);			//vertical offset from this row to the centre
+				cpuTable[row * side + col] = atan2(dy, dx);
+			}
+		}
+	}
+
+	/// Kernel: replaces every value v of an x-by-y image with 255 - v, in place.
+	/// Each thread handles the pixel at its global 2D thread coordinate; threads whose
+	/// coordinate falls outside the image do nothing.
+	/// @param gpuI	device pointer to the image, indexed as row * x + column
+	/// @param x	image width
+	/// @param y	image height
+	template<typename T>
+	__global__ void cuda_invert(T* gpuI, size_t x, size_t y) {
+		const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
+		const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (col < x && row < y) {
+			T* pixel = gpuI + (row * x + col);		//address of this thread's pixel
+			*pixel = 255 - *pixel;
+		}
+	}
+
+
+
+	/// Selects a threshold for the values in pts using Otsu's criterion.
+	/// th_num candidate thresholds, evenly spaced upwards from the minimum value, are tried; the
+	/// first one that maximises n_below * n_over * (mean_below - mean_over)^2 is returned.
+	/// @param pts		values to be thresholded
+	/// @param pixels	number of values in pts
+	/// @param th_num	number of candidate thresholds to evaluate
+	/// @return the selected threshold; the minimum value when all values are equal or th_num is 0;
+	///         0 when pixels is 0
+	template<typename T>
+	T th_otsu(T* pts, size_t pixels, unsigned int th_num = 20) {
+		if (pixels == 0) return (T)0;				//nothing to threshold (pts[0] must not be read)
+
+		T Imax = pts[0];							//find the value range in a single pass
+		T Imin = pts[0];
+		for (size_t n = 1; n < pixels; n++) {
+			if (pts[n] > Imax) Imax = pts[n];
+			if (pts[n] < Imin) Imin = pts[n];
+		}
+		if (th_num == 0) return Imin;				//no candidates requested (avoids a division by zero)
+
+		T th_step = ((Imax - Imin) / th_num);
+		double best_var = 0;						//largest between-class variance seen so far
+		unsigned int best_idx = 0;					//candidate index at which it was seen
+		for (unsigned int t0 = 0; t0 < th_num; t0++) {
+			T th = t0 * th_step + Imin;
+			size_t n_b = 0, n_o = 0;				//number of elements below-or-equal and over the threshold
+			T m_b(0), m_o(0);						//running sums, turned into means below
+			for (size_t idx = 0; idx < pixels; idx++) {
+				if (pts[idx] <= th) {
+					m_b += pts[idx];
+					n_b += 1;
+				}
+				else {
+					m_o += pts[idx];
+					n_o += 1;
+				}
+			}
+
+			double var = 0;							//an empty class contributes no between-class variance
+			if (n_b != 0 && n_o != 0) {
+				m_b = m_b / n_b;					//mean of the below-threshold cluster
+				m_o = m_o / n_o;					//mean of the over-threshold cluster
+				double diff = (double)(m_b - m_o);
+				//multiply in floating point: n_b * n_o overflows 32-bit integers for ordinary image sizes
+				var = (double)n_b * (double)n_o * diff * diff;
+			}
+
+			if (t0 == 0 || var > best_var) {		//strict comparison keeps the first maximum
+				best_var = var;
+				best_idx = t0;
+			}
+		}
+
+		return Imin + (T)(best_idx * th_step);
+	}
+
+	/// Performs 2D iterative voting on an image stored on the GPU and writes out the detected seeds.
+	/// The image is optionally inverted, its gradient is computed and converted to polar form, and
+	/// `iter` rounds of voting followed by a direction update are run while phi is reduced by
+	/// phi/iter after each round. Local maxima of the final vote image are thresholded; every
+	/// surviving pixel is listed in outname_txt, and the resulting 0/1 mask is copied back into
+	/// gpuI and saved to outname_img.
+	/// @param gpuI			device pointer to the x*y input image; overwritten with the 0/1 mask
+	/// @param rmax			radius used for the atan2 table, the voting and the direction update
+	/// @param x			image width in pixels
+	/// @param y			image height in pixels
+	/// @param invert		when true the image is replaced by 255 - I before processing
+	/// @param t			threshold applied to the local maxima; 0 selects stim::th_otsu
+	/// @param outname_img	file name of the output mask image
+	/// @param outname_txt	file name of the output seed list
+	/// @param iter			number of voting iterations
+	/// @param phi			initial angular range handed to the voting step
+	/// @param conn			window size handed to gpu_local_max
+	/// NOTE(review): the gradient and vote buffers are typed float while their sizes are computed
+	/// with sizeof(T), so this appears to assume T is float - confirm.
+	template<typename T>
+	void gpu_ivote2(T* gpuI, unsigned int rmax, size_t x, size_t y, bool invert, T t = 0, std::string outname_img = "out.bmp", std::string outname_txt = "out.txt",
+					int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8) {
+
+		size_t pixels = x * y;				//compute the size of input image
+
+		if (invert) {						//if inversion is required call the kernel to invert the image
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads));
+			dim3 blocks((unsigned int)x / threads.x + 1, (unsigned int)y / threads.y + 1);
+			cuda_invert <<< blocks, threads >>> (gpuI, x, y);
+		}
+
+		size_t table_bytes = (size_t)(pow(2 * rmax + 1, 2) * sizeof(T));				//size of the atan2 table
+		T* cpuTable = (T*)malloc(table_bytes);											//assign memory on the cpu for atan2 table
+		atan_2<T>(cpuTable, rmax);														//call the function to precompute the atan2 table
+		T* gpuTable;  HANDLE_ERROR(cudaMalloc(&gpuTable, table_bytes));
+		HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, table_bytes, cudaMemcpyHostToDevice));	//copy atan2 table to the gpu
+		free(cpuTable);																	//the host copy is not needed once uploaded
+
+		size_t bytes = pixels* sizeof(T);													//calculate the bytes of the input
+		float dphi = phi / iter;															//change in phi for each iteration
+
+		float* gpuGrad; HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes * 2));									//allocate space to store the 2D gradient
+		float* gpuVote; HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));										//allocate space to store the vote image
+
+		stim::cuda::gpu_gradient_2d<float>(gpuGrad, gpuI, x, y);			//calculate the 2D gradient
+		stim::cuda::gpu_cart2polar<float>(gpuGrad, x, y);					//convert cartesian coordinate of gradient to the polar
+
+		for (int i = 0; i < iter; i++) {														//for each iteration
+			HANDLE_ERROR(cudaMemset(gpuVote, 0, bytes));										//reset the vote image to 0
+			stim::cuda::gpu_vote<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);		//perform voting
+			stim::cuda::gpu_update_dir<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);	//update the voter directions
+			phi = phi - dphi;																//decrement phi
+		}
+		stim::cuda::gpu_local_max<float>(gpuI, gpuVote, conn, x, y);				//calculate the local maxima
+
+		T* pts = (T*)malloc(bytes);													//allocate memory on the cpu to store the output of iterative voting
+		HANDLE_ERROR(cudaMemcpy(pts, gpuI, bytes, cudaMemcpyDeviceToHost));			//copy the output from gpu to the cpu memory
+
+		//the intermediate device buffers are no longer needed (they were previously leaked on every call)
+		HANDLE_ERROR(cudaFree(gpuTable));
+		HANDLE_ERROR(cudaFree(gpuGrad));
+		HANDLE_ERROR(cudaFree(gpuVote));
+
+		T threshold;
+		if (t == 0) threshold = stim::th_otsu<T>(pts, pixels);	//if threshold value is not set call the function to compute the threshold
+		else threshold = t;
+
+		std::ofstream output;		//save the thresholded detected seeds in a text file
+		output.open(outname_txt);
+		output << "X" << " " << "Y" << " " << "threshold" << "\n";
+		size_t ind;
+		for (size_t ix = 0; ix < x; ix++) {
+			for (size_t iy = 0; iy < y; iy++) {
+				ind = iy * x + ix;
+				if (pts[ind] > threshold) {
+					output << ix << " " << iy << " " << pts[ind] << "\n";	//the third column is the value found at the seed
+					pts[ind] = 1;
+				}
+				else pts[ind] = 0;
+			}
+		}
+		output.close();
+
+		HANDLE_ERROR(cudaMemcpy(gpuI, pts, bytes, cudaMemcpyHostToDevice));		//copy the points to the gpu
+		stim::cpu2image(pts, outname_img, x, y); //output the image
+		free(pts);																//release the host copy of the mask
+	}
+
+
+	/// Host-side entry point for 2D iterative voting.
+	/// Uploads the x*y image in cpuI to the GPU, runs stim::gpu_ivote2 on it with the same
+	/// arguments, and copies the result back into cpuI.
+	/// @param cpuI			host pointer to the x*y image; overwritten with the result of gpu_ivote2
+	/// @param rmax			forwarded to gpu_ivote2
+	/// @param x			image width in pixels
+	/// @param y			image height in pixels
+	/// @param invert		forwarded to gpu_ivote2
+	/// @param t			forwarded to gpu_ivote2
+	/// @param outname_img	forwarded to gpu_ivote2
+	/// @param outname_txt	forwarded to gpu_ivote2
+	/// @param iter			forwarded to gpu_ivote2
+	/// @param phi			forwarded to gpu_ivote2
+	/// @param conn			forwarded to gpu_ivote2
+	template<typename T>
+	void cpu_ivote2(T* cpuI, unsigned int rmax, size_t x, size_t y, bool invert, T t = 0, std::string outname_img = "out.bmp", std::string outname_txt = "out.txt",
+					int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8) {
+		size_t bytes = x*y * sizeof(T);
+		T* gpuI;						//allocate space on the gpu to save the input image
+		HANDLE_ERROR(cudaMalloc(&gpuI, bytes));
+		HANDLE_ERROR(cudaMemcpy(gpuI, cpuI, bytes, cudaMemcpyHostToDevice));		//copy the image to the gpu
+		stim::gpu_ivote2<T>(gpuI, rmax, x, y, invert, t, outname_img, outname_txt, iter, phi, conn);				//call the gpu version of the ivote
+		HANDLE_ERROR(cudaMemcpy(cpuI, gpuI, bytes, cudaMemcpyDeviceToHost));		//copy the output to the cpu
+		HANDLE_ERROR(cudaFree(gpuI));												//release the device copy (previously leaked)
+	}
+}
+#endif
\ No newline at end of file
diff --git a/stim/iVote/ivote2/iter_vote2.cuh b/stim/iVote/ivote2/iter_vote2.cuh
new file mode 100644
index 0000000..423d916
--- /dev/null
+++ b/stim/iVote/ivote2/iter_vote2.cuh
@@ -0,0 +1,18 @@
+#ifndef STIM_CUDA_ITER_VOTE2_H
+#define STIM_CUDA_ITER_VOTE2_H
+
+extern bool DEBUG;
+
+#include "local_max.cuh"
+#include "update_dir_bb.cuh"
+#include "vote_atomic_bb.cuh"
+
+namespace stim{
+	namespace cuda{
+	
+	}
+}
+
+
+
+#endif
\ No newline at end of file
diff --git a/stim/iVote/ivote2/local_max.cuh b/stim/iVote/ivote2/local_max.cuh
new file mode 100644
index 0000000..38b7096
--- /dev/null
+++ b/stim/iVote/ivote2/local_max.cuh
@@ -0,0 +1,109 @@
+#ifndef STIM_CUDA_LOCAL_MAX_H
+#define STIM_CUDA_LOCAL_MAX_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+
+namespace stim{
+	namespace cuda{
+
+		/// Kernel: marks the local maxima of the vote image.
+		/// For each pixel (xi, yi), gpuCenters receives the pixel's vote value if, inside the window
+		/// [xi - conn, xi + conn) x [yi - conn, yi + conn) clipped to the image, no pixel has a larger
+		/// vote and no pixel with an equal vote has a larger linear index; otherwise it receives 0.
+		/// NOTE(review): the window is half-open (the +conn row/column is excluded), as in the
+		/// original loops - confirm whether a symmetric window was intended.
+		/// @param gpuCenters	output image, x*y elements
+		/// @param gpuVote		input vote image, x*y elements
+		/// @param conn			half-size of the search window
+		/// @param x			image width
+		/// @param y			image height
+		template<typename T>
+		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, size_t x, size_t y){
+
+			// calculate the 2D coordinates for this current thread.
+			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
+			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			if(xi >= x || yi >= y)
+				return;
+
+			// convert 2D coordinates to 1D
+			size_t i = yi * x + xi;
+
+			gpuCenters[i] = 0;		//initialize the value at this location to zero
+
+			T val = gpuVote[i];
+
+			// Clip the window in signed arithmetic. Comparing a negative int against a size_t
+			// converts it to a huge unsigned value, which used to skip the whole search for
+			// pixels closer than conn to the left or top edge.
+			long long x_begin = (long long)xi - conn;
+			long long x_end   = (long long)xi + conn;
+			long long y_begin = (long long)yi - conn;
+			long long y_end   = (long long)yi + conn;
+			if(x_begin < 0) x_begin = 0;
+			if(y_begin < 0) y_begin = 0;
+			if(x_end > (long long)x) x_end = (long long)x;
+			if(y_end > (long long)y) y_end = (long long)y;
+
+			for(long long yl = y_begin; yl < y_end; yl++){
+				for(long long xl = x_begin; xl < x_end; xl++){
+					size_t il = (size_t)yl * x + (size_t)xl;
+					T other = gpuVote[il];
+					if(other > val)					//a neighbour has a larger vote
+						return;
+					if(other == val && il > i)		//ties are resolved in favour of the larger index
+						return;
+				}
+			}
+
+			gpuCenters[i] = val;		//this pixel is a local maximum: store its vote value
+		}
+		
+		/// Launches cuda_local_max over an x-by-y vote image held in device memory.
+		/// Thread blocks are square, with a side of sqrt(stim::maxThreadsPerBlock()) truncated to an
+		/// integer; the grid has x/side + 1 by y/side + 1 blocks.
+		/// @param gpuCenters	device pointer receiving the local maxima
+		/// @param gpuVote		device pointer to the vote image
+		/// @param conn			window size forwarded to the kernel
+		/// @param x			image width
+		/// @param y			image height
+		template<typename T>
+		void gpu_local_max(T* gpuCenters, T* gpuVote,  unsigned int conn, size_t x, size_t y){
+
+			const unsigned int block_limit = stim::maxThreadsPerBlock();
+			const unsigned int side = (unsigned int)sqrt(block_limit);
+
+			dim3 threads(side, side);
+			dim3 blocks((unsigned int)x / side + 1, (unsigned int)y / side + 1);
+
+			//call the kernel to find the local maximum.
+			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, conn, x, y);
+		}
+
+		/// Host-side wrapper around gpu_local_max.
+		/// Uploads the x*y vote image, runs the local-maximum search on the GPU and copies the
+		/// result back to the host.
+		/// @param cpuCenters	host buffer receiving the x*y result
+		/// @param cpuVote		host buffer holding the x*y vote image
+		/// @param conn			window size forwarded to gpu_local_max
+		/// @param x			image width
+		/// @param y			image height
+		template<typename T>
+		void cpu_local_max(T* cpuCenters, T* cpuVote, unsigned int conn, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array (in size_t so that x*y cannot wrap at 32 bits)
+			size_t bytes = (size_t)x * (size_t)y * sizeof(T);
+
+			// allocate space on the GPU for the detected cell centers
+			T* gpuCenters;
+			HANDLE_ERROR(cudaMalloc(&gpuCenters, bytes));
+
+			//allocate space on the GPU for the input Vote Image
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+			//copy the Vote image data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the local max function
+			gpu_local_max<T>(gpuCenters, gpuVote, conn, x, y);
+
+			//copy the cell centers data to the CPU
+			HANDLE_ERROR(cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuCenters));
+			HANDLE_ERROR(cudaFree(gpuVote));
+		}
+		
+	}
+}
+
+
+
+#endif
\ No newline at end of file
diff --git a/stim/iVote/ivote2/update_dir.cuh b/stim/iVote/ivote2/update_dir.cuh
new file mode 100644
index 0000000..3052bf4
--- /dev/null
+++ b/stim/iVote/ivote2/update_dir.cuh
@@ -0,0 +1,217 @@
+#ifndef STIM_CUDA_UPDATE_DIR_H
+#define STIM_CUDA_UPDATE_DIR_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+
+namespace stim{
+	namespace cuda{
+	
+		/// Kernel: computes the new voting direction of every pixel.
+		/// For each pixel (the voter), the offsets (xr, yr) with xr*xr + yr*yr < rmax*rmax whose
+		/// table angle lies within phi of the voter's current direction are searched for the largest
+		/// vote value; gpuDir receives the table angle of the offset at which that maximum was found
+		/// (the table's centre entry if no vote larger than 0 was found).
+		/// Launch: needs dynamic shared memory for (2*rmax + blockDim.x) floats. One image row is
+		/// staged per step, so blockDim.y is presumably expected to be 1, as in gpu_update_dir.
+		/// @param gpuDir	output, one direction per pixel (x*y elements)
+		/// @param in		texture object holding the vote image
+		/// @param gpuGrad	two values per pixel; element 2*i is read as the current direction
+		/// @param gpuTable	precomputed atan2 table with (2*rmax+1)^2 entries
+		/// @param phi		angular range of the voting area
+		/// @param rmax		radius of the voting area
+		/// @param x		image width
+		/// @param y		image height
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
+
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ float s_vote[];
+
+			//calculate the start point for this block
+			int bxi = blockIdx.x * blockDim.x;
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			// Threads past the image edge must still reach the shared-memory copies and barriers
+			// below, so they are flagged instead of returning early. They must not touch gpuGrad
+			// or gpuDir: index i would land in another row or beyond the end of the buffers.
+			const bool inside = (xi < x && yi < y);
+
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// current voting direction of this voter (only read for pixels inside the image)
+			float theta = inside ? gpuGrad[2*i] : 0.0f;
+
+			// maximum value of the vote image found so far in the voting area of this voter
+			float max = 0;
+
+			// offset (relative to the voter) at which the maximum was found
+			int id_x = 0;
+			int id_y = 0;
+
+			//calculate the width of the shared memory block
+			int swidth = 2 * rmax + blockDim.x;
+
+			// compute the size of window which will be checked for finding the voting area for this voter
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			int bxs = bxi - rmax;
+
+			for(int yr = -rmax; yr <= rmax; yr++){
+
+				//copy the portion of the vote image necessary for this row to shared memory
+				__syncthreads();
+				stim::cuda::sharedMemcpy_tex2D<float>(s_vote, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
+				__syncthreads();
+
+				//if the current thread is outside of the image, it doesn't have to be computed
+				if(inside){
+
+					for(int xr = -rmax; xr <= rmax; xr++){
+
+						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+						// look up the angle between the voter and the current pixel
+						float atan_angle = gpuTable[ind_t];
+
+						// fetch the vote value of the current pixel from shared memory
+						int idx_share_update = xr + tx_rmax ;
+						float share_vote = s_vote[idx_share_update];
+
+						// check if the current pixel is located in the voting area of this voter.
+						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+
+							// compare the vote value of this pixel with the max value to find the maxima and its index.
+							if  (share_vote>max) {
+
+								max = share_vote;
+								id_x =  xr;
+								id_y =  yr;
+							}
+						}
+					}
+				}
+			}
+
+			if(inside){
+				// the new direction is the table angle of the offset where the maximum was found
+				unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+				gpuDir[i] = gpuTable[ind_m];
+			}
+
+		}
+
+		/// Kernel: copies the newly computed voting direction of every pixel into the direction
+		/// slot (element 2*i) of the two-value-per-pixel gradient image.
+		/// @param gpuGrad	gradient image, 2*x*y elements; element 2*i is overwritten
+		/// @param gpuDir	new directions, x*y elements
+		/// @param x		image width
+		/// @param y		image height
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			// the grid is rounded up to whole blocks, so surplus threads must not touch memory
+			if(xi >= x || yi >= y) return;
+
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			//update the gradient image with the vote direction
+			gpuGrad[2*i] = gpuDir[i];
+		}
+		
+		/// Updates the voting direction stored in gpuGrad from the current vote image.
+		/// The vote image is copied into a CUDA array and bound to a texture object, cuda_update_dir
+		/// computes the new direction of every pixel into a temporary buffer, and cuda_update_grad
+		/// writes those directions back into gpuGrad. All pointers are device pointers.
+		/// NOTE(review): the texture channel is declared as one 32-bit float, so T is presumably
+		/// float - confirm.
+		/// @param gpuVote	vote image, x*y elements
+		/// @param gpuGrad	gradient image, 2*x*y elements; the direction entries are updated
+		/// @param gpuTable	precomputed atan2 table with (2*rmax+1)^2 entries
+		/// @param phi		angular range of the voting area
+		/// @param rmax		radius of the voting area
+		/// @param x		image width
+		/// @param y		image height
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			if(x == 0 || y == 0) return;							//an empty image would produce a zero-sized launch
+
+			//get the number of bytes in one row and in the whole image
+			size_t row_bytes = sizeof(T) * (size_t)x;
+			size_t bytes = row_bytes * (size_t)y;
+
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+
+			//define a channel descriptor for a single 32-bit channel
+			cudaChannelFormatDesc channelDesc =
+					   cudaCreateChannelDesc(32, 0, 0, 0,
+											 cudaChannelFormatKindFloat);
+			cudaArray* cuArray;														//declare the cuda array
+			HANDLE_ERROR(cudaMallocArray(&cuArray, &channelDesc, x, y));			//allocate the cuda array
+
+			// Copy the image data from global memory to the array
+			// (cudaMemcpy2DToArray replaces the deprecated cudaMemcpyToArray; the source rows are tightly packed)
+			HANDLE_ERROR(cudaMemcpy2DToArray(cuArray, 0, 0, gpuVote, row_bytes, row_bytes, y,
+											 cudaMemcpyDeviceToDevice));
+
+			// Specify texture
+			struct cudaResourceDesc resDesc;				//create a resource descriptor
+			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
+			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
+			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
+
+			// Specify texture object parameters
+			struct cudaTextureDesc texDesc;							//create a texture descriptor
+			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
+			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
+			texDesc.addressMode[1]   = cudaAddressModeWrap;
+			texDesc.filterMode       = cudaFilterModePoint;			//use point sampling (no interpolation)
+			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
+			texDesc.normalizedCoords = 0;							//not using normalized coordinates
+
+			// Create texture object
+			cudaTextureObject_t texObj = 0;
+			HANDLE_ERROR(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));
+
+			// specify shared memory: one row of (2*rmax + threads.x) floats, matching s_vote in the kernel
+			unsigned int share_bytes = (2*rmax + threads.x) * (unsigned int)sizeof(float);
+
+			// allocate space on the GPU for the updated vote direction
+			T* gpuDir;
+			HANDLE_ERROR(cudaMalloc(&gpuDir, bytes));
+
+			//call the kernel to calculate the new voting direction
+			cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, texObj, gpuGrad, gpuTable, phi, rmax, x , y);
+			HANDLE_ERROR(cudaGetLastError());						//report an invalid launch configuration
+
+			//call the kernel to update the gradient direction
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			HANDLE_ERROR(cudaGetLastError());
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuDir));
+
+			HANDLE_ERROR(cudaDestroyTextureObject(texObj));
+			HANDLE_ERROR(cudaFreeArray(cuArray));
+
+		}
+		
+		/// Host-side wrapper around gpu_update_dir.
+		/// Uploads the vote image, the gradient image and the atan2 table, runs the direction
+		/// update on the GPU and copies the updated gradient image back into cpuGrad.
+		/// @param cpuVote	host buffer holding the x*y vote image
+		/// @param cpuGrad	host buffer holding the 2*x*y gradient image; overwritten with the result
+		/// @param cpuTable	host buffer holding the (2*rmax+1)^2 atan2 table
+		/// @param phi		forwarded to gpu_update_dir
+		/// @param rmax		forwarded to gpu_update_dir; also determines the table size
+		/// @param x		image width
+		/// @param y		image height
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array (in size_t so that x*y cannot wrap at 32 bits)
+			size_t bytes = (size_t)x * (size_t)y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			size_t table_side = 2 * (size_t)rmax + 1;
+			size_t bytes_table = table_side * table_side * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+			//copy the input vote image to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the update direction function
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//copy the new gradient image back to the CPU
+			HANDLE_ERROR(cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuGrad));
+		}
+		
+	}
+}
+
+#endif
diff --git a/stim/iVote/ivote2/update_dir_bb.cuh b/stim/iVote/ivote2/update_dir_bb.cuh
new file mode 100644
index 0000000..43869ec
--- /dev/null
+++ b/stim/iVote/ivote2/update_dir_bb.cuh
@@ -0,0 +1,181 @@
+#ifndef STIM_CUDA_UPDATE_DIR_BB_H
+#define STIM_CUDA_UPDATE_DIR_BB_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <stim/visualization/aabb2.h>
+#include <stim/visualization/colormap.h>
+#include <math.h> 
+
+//#define RMAX_TEST	8
+
+namespace stim{
+	namespace cuda{
+
+		/// For every pixel, finds the largest vote inside the pixel's voting area and stores the direction pointing towards it.
+		/// Launch with a 2D grid of 2D blocks covering the x-by-y image and sizeof(T)*(2*rmax+1)^2 bytes of dynamic shared memory (atan2 table).
+		/// gpuDir: output angle per pixel. gpuVote: vote image. gpuGrad: two values per pixel, element 2*i is read as the voting direction.
+		/// gpuTable: atan2 look-up table, (2*rmax+1)^2 entries. phi: angular half-width of the voting area. rmax: radius of the voting area.
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
+			extern __shared__ T S[];
+			T* shared_atan = S;
+			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
+			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
+			__syncthreads();	//the copy is presumably split across threads (helper not visible here): wait for all of them before the table is read
+
+			//T* shared_vote = &S[n_table];
+			//size_t template_size_x = (blockDim.x + 2 * rmax);
+			//size_t template_size_y = (blockDim.y + 2 * rmax);
+			//stim::cuda::threadedMemcpy2D((char*)shared_vote, (char*)gpuVote, template_size_x, template_size_y, x,  threadIdx.y * blockDim.x + threadIdx.x, blockDim.x * blockDim.y);
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;				//calculate the 2D coordinates for this current thread.
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			if(xi >= x || yi >= y) return;								//outside of the image: terminate (safe, the only barrier is above)
+
+			int i = yi * x + xi;										//convert 2D coordinates to 1D
+			float theta = gpuGrad[2*i];									//current voting direction - global memory fetch
+
+			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
+			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));
+			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
+			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
+			bb.trim_low(0, 0);											//make sure the bounding box doesn't go outside the image
+			bb.trim_high(x-1, y-1);
+
+			int x_table = 2*rmax +1;									//width of one row of the atan2 table
+			T rmax_sq = rmax * rmax;
+			int lut_i;
+			T dx_sq, dy_sq;
+			int by, bx;
+			int dx, dy;													//coordinate relative to (xi, yi)
+
+			T v;
+			T max_v = 0;												//initialize the maximum vote value to zero
+			T alpha;
+			int max_dx = bb.low[0] - xi;								//if no vote exceeds zero, the direction points to the low corner of the box
+			int max_dy = bb.low[1] - yi;
+			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
+				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
+				dy_sq = dy * dy;
+				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
+					dx = bx - xi;
+					dx_sq = dx * dx;
+					lut_i = (rmax - dy) * x_table + rmax - dx;
+					alpha = shared_atan[lut_i];
+					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
+						v = gpuVote[by * x + bx];				// find the vote value for the current counter
+						if(v > max_v){
+							max_v = v;
+							max_dx = dx;
+							max_dy = dy;
+						}
+					}
+				}
+			}
+			gpuDir[i] = atan2((T)max_dy, (T)max_dx);
+		}
+	
+		
+
+		// Copies the freshly computed voting direction into the even-indexed element of the two-value-per-pixel gradient image.
+		// Expects a 2D launch; threads that fall outside of the x-by-y image do nothing.
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, size_t x, size_t y){
+
+			// column and row handled by this thread
+			const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
+			const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
+
+			// only threads inside the image perform the copy
+			if(col < x && row < y){
+				// flattened (row-major) pixel index
+				const size_t pixel = row * x + col;
+				gpuGrad[2*pixel] = gpuDir[pixel];
+			}
+		}
+		
+		/// Updates the voting direction stored in gpuGrad (device memory, two values per pixel) from the vote image gpuVote.
+		/// gpuTable is the atan2 look-up table with (2*rmax+1)^2 entries; it must fit into the shared memory of one block, otherwise the program exits.
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y){
+			//calculate the number of bytes in the array
+			size_t bytes = x * y * sizeof(T);
+
+			//use a square thread block and enough blocks to cover the whole image
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
+			dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
+
+			//the kernel keeps a copy of the atan2 table in dynamic shared memory
+			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
+			//size_t curtain = 2 * rmax;
+			//size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain);
+			size_t shared_mem_req = table_bytes;// + template_bytes;
+			if (DEBUG) std::cout << "Shared Memory required: " << shared_mem_req << std::endl;
+
+			size_t shared_mem = stim::sharedMemPerBlock();
+			if(shared_mem_req > shared_mem){
+				std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
+				exit(1);
+			}
+
+			// allocate space on the GPU for the updated vote direction (after the check above, so nothing is allocated when we exit)
+			T* gpuDir;
+			HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );
+
+			//calculate the new voting direction, then copy it into the gradient image; launch failures are reported immediately
+			cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, (int)x , (int)y);
+			HANDLE_ERROR( cudaGetLastError() );
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			HANDLE_ERROR( cudaGetLastError() );
+			HANDLE_ERROR( cudaFree(gpuDir) );							//free allocated memory
+		}
+		
+		/// Host wrapper: uploads the vote image, the gradient image and the atan2 table, calls gpu_update_dir() and downloads the updated gradient image.
+		/// @param cpuVote  host vote image, x * y elements (input)
+		/// @param cpuGrad  host gradient image, 2 * x * y elements; overwritten with the data copied back from the device
+		/// @param cpuTable host atan2 table, (2*rmax+1) * (2*rmax+1) elements (input)
+		/// @param phi, rmax  forwarded unchanged to gpu_update_dir()
+		/// @param x, y     image dimensions in pixels
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//number of bytes in one single-channel image; computed in size_t so that large images cannot overflow 32 bits
+			size_t bytes = (size_t)x * y * sizeof(T);
+
+			//number of bytes in the atan2 table
+			size_t bytes_table = (size_t)(2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the vote image and upload it
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the two-value-per-pixel gradient image and upload it
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table and upload it
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the update direction function
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//copy the new gradient image back to the CPU; this blocking copy is also where asynchronous kernel errors surface
+			HANDLE_ERROR(cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuGrad));
+		}
+		
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/iVote/ivote2/update_dir_shared.cuh b/stim/iVote/ivote2/update_dir_shared.cuh
new file mode 100644
index 0000000..91aa717
--- /dev/null
+++ b/stim/iVote/ivote2/update_dir_shared.cuh
@@ -0,0 +1,184 @@
+#ifndef STIM_CUDA_UPDATE_DIR_SHARED_H
+#define STIM_CUDA_UPDATE_DIR_SHARED_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+	
+		// This kernel calculates the voting direction for the next iteration: the angle between each voter and the maximum vote value in its voting area.
+		// Launch: gpu_update_dir() uses blockDim.y == 1 (one image row per block); dynamic shared memory must hold (2*rmax + blockDim.x) floats.
+		// gpuDir: output angle per pixel. gpuVote: vote image. gpuGrad: two values per pixel, element 2*i is read as the current voting direction.
+		// gpuTable: atan2 look-up table with (2*rmax+1)^2 entries. phi: angular half-width and rmax: radius of the voting area. x, y: image size.
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ float s_vote[];
+
+			//calculate the start point for this block
+			int bxi = blockIdx.x * blockDim.x;
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// Threads outside of the image must not touch gpuGrad/gpuDir (index i would belong to another pixel or lie past the end of the arrays),
+			// but they cannot return early because every thread of the block has to reach the barriers below.
+			bool in_image = (xi < x && yi < y);
+
+			// calculate the voting direction based on the gradient direction
+			float theta = in_image ? (float)gpuGrad[2*i] : 0.0f;
+
+			// define a local variable to maximum value of the vote image in the voting area for this voter
+			float max = 0;
+
+			// define two local variables for the x and y offsets where the maximum happened
+			int id_x = 0;
+			int id_y = 0;
+
+			//calculate the width of the shared memory block
+			int swidth = 2 * rmax + blockDim.x;
+
+			// compute the size of window which will be checked for finding the voting area for this voter
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			int bxs = bxi - rmax;
+
+			for(int yr = -rmax; yr <= rmax; yr++){
+				//copy the portion of the image row necessary for this block to shared memory
+				__syncthreads();
+				cpyG2S1D<float>(s_vote, gpuVote, bxs, yi + yr , swidth, 1, threadIdx, blockDim, x, y);
+				__syncthreads();
+
+				//if the current thread is outside of the image, it doesn't have to be computed
+				if(in_image){
+
+					for(int xr = -rmax; xr <= rmax; xr++){
+
+						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+						// look up the angle between the voter and the current pixel
+						float atan_angle = gpuTable[ind_t];
+
+						// fetch the vote value of the current pixel from shared memory
+						int idx_share_update = xr + tx_rmax ;
+						float share_vote = s_vote[idx_share_update];
+
+						// check if the current pixel is located in the voting area of this voter.
+						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+
+							// compare the vote value of this pixel with the max value to find the maxima and its index.
+							if  (share_vote>max) {
+
+								max = share_vote;
+								id_x =  xr;
+								id_y =  yr;
+							}
+						}
+					}
+				}
+			}
+
+			// look up the angle towards the maximum and store it as the new voting direction (table centre if no vote exceeded zero)
+			if(in_image){
+				unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+				gpuDir[i] = gpuTable[ind_m];
+			}
+		}
+
+		// this kernel updates the gradient direction by the calculated voting direction.
+		// gpuGrad holds two values per pixel; only element 2*i is overwritten, with gpuDir[i]. x, y: image dimensions in pixels.
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			// the grid is rounded up to whole blocks: threads past the image edge must not access the arrays
+			if(xi >= x || yi >= y) return;
+
+			int i = yi * x + xi;					// convert 2D coordinates to 1D
+			gpuGrad[2*i] = gpuDir[i];				//update the gradient image with the vote direction
+		}
+		
+		/// Updates the voting direction stored in gpuGrad (device memory, two values per pixel) from the vote image gpuVote.
+		/// gpuTable is the atan2 look-up table with (2*rmax+1)^2 entries; x and y are the image dimensions in pixels.
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array (in size_t, so that large images cannot overflow 32 bits)
+			size_t bytes = (size_t)x * y * sizeof(T);
+
+			//one block covers up to max_threads pixels of a single image row
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+
+			//shared memory holds one row segment of float votes: the block width plus rmax on both sides
+			size_t share_bytes = (2*rmax + threads.x) * sizeof(float);
+
+			// allocate space on the GPU for the updated vote direction
+			T* gpuDir;
+			HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );
+
+			//calculate the new voting direction, then copy it into the gradient image; launch failures are reported immediately
+			cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+			HANDLE_ERROR( cudaGetLastError() );
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			HANDLE_ERROR( cudaGetLastError() );
+			HANDLE_ERROR( cudaFree(gpuDir) );		//free allocated memory
+		}
+		
+		/// Host wrapper: uploads the vote image, the gradient image and the atan2 table, calls gpu_update_dir() and downloads the updated gradient image.
+		/// @param cpuVote  host vote image, x * y elements (input)
+		/// @param cpuGrad  host gradient image, 2 * x * y elements; overwritten with the data copied back from the device
+		/// @param cpuTable host atan2 table, (2*rmax+1) * (2*rmax+1) elements (input)
+		/// @param phi, rmax  forwarded unchanged to gpu_update_dir()
+		/// @param x, y     image dimensions in pixels
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//number of bytes in one single-channel image; computed in size_t so that large images cannot overflow 32 bits
+			size_t bytes = (size_t)x * y * sizeof(T);
+
+			//number of bytes in the atan2 table
+			size_t bytes_table = (size_t)(2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the vote image and upload it
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the two-value-per-pixel gradient image and upload it
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table and upload it
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the update direction function
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//copy the new gradient image back to the CPU; this blocking copy is also where asynchronous kernel errors surface
+			HANDLE_ERROR(cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuGrad));
+		}
+		
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/iVote/ivote2/update_dir_threshold_global.cuh b/stim/iVote/ivote2/update_dir_threshold_global.cuh
new file mode 100644
index 0000000..f73efa6
--- /dev/null
+++ b/stim/iVote/ivote2/update_dir_threshold_global.cuh
@@ -0,0 +1,159 @@
+#ifndef STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H
+#define STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"   
+
+namespace stim{
+	namespace cuda{
+	
+		// This kernel calculates the voting direction for the next iteration: the angle between each voter and the maximum vote value in its voting area.
+		// Launch: 1D grid of 1D blocks with one thread per voter; threads with an index >= th_size do nothing.
+		// gpuDir: output, one angle per voter. gpuVote: vote image (x * y elements).
+		// gpuTh: three values per voter; element 3*v is the voting direction and element 3*v+2 the 1D pixel index of the voter in the image.
+		// gpuTable: atan2 look-up table with (2*rmax+1)^2 entries. phi: angular half-width and rmax: radius of the voting area.
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x,  int y){
+			// calculate the coordinate for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			// the grid is rounded up to whole blocks: surplus threads must not read gpuTh or write gpuDir
+			if(xi >= th_size) return;
+
+			// calculate the voting direction based on the gradient direction
+			float theta = gpuTh[3*xi];
+
+			//calculate the position and x, y coordinates of this voter in the original image
+			unsigned int i_v = gpuTh[3*xi+2];
+			int y_v = i_v/x;
+			int x_v = i_v - (y_v*x);
+
+			// define a local variable to maximum value of the vote image in the voting area for this voter
+			float max = 0;
+
+			// define two local variables for the x and y offsets where the maximum happened
+			int id_x = 0;
+			int id_y = 0;
+
+			// compute the size of window which will be checked for finding the voting area for this voter
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+
+			for(int yr = -rmax; yr <= rmax; yr++){
+				// skip window rows that fall outside of the image
+				int py = y_v + yr;
+				if(py < 0 || py >= y) continue;
+
+				for(int xr = -rmax; xr <= rmax; xr++){
+					// skip window columns that fall outside of the image (they would otherwise alias pixels of a neighbouring row)
+					int px = x_v + xr;
+					if(px < 0 || px >= x) continue;
+
+					unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+					// find the angle between the voter and the current pixel in x and y directions
+					float atan_angle = gpuTable[ind_t];
+
+					// check if the current pixel is located in the voting area of this voter.
+					if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+						// find the vote value for the current counter
+						float vote_c = gpuVote[py*x + px];
+						// compare the vote value of this pixel with the max value to find the maxima and its index.
+						if  (vote_c>max) {
+							max = vote_c;
+							id_x =  xr;
+							id_y =  yr;
+						}
+					}
+				}
+			}
+
+			// look up the angle towards the maximum and store it as the new voting direction (table centre if no vote exceeded zero)
+			unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+			gpuDir[xi] = gpuTable[ind_m];
+		}
+
+		// this kernel updates the voting direction of every voter (element 3*v of gpuTh) by the calculated voting direction gpuDir[v].
+		// Launch: 1D grid of 1D blocks, one thread per voter. x and y are not used.
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuTh, T* gpuDir, int th_size, int x, int y){
+
+			// calculate the coordinate for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+
+			// the grid is rounded up to whole blocks: only the first th_size threads own a voter
+			if(xi < th_size) gpuTh[3*xi] = gpuDir[xi];
+		}
+		
+		/// Updates the voting direction of th_size voters stored in gpuTh (device memory, three values per voter) from the vote image gpuVote.
+		/// gpuTable is the atan2 look-up table with (2*rmax+1)^2 entries; x and y are the image dimensions in pixels.
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the direction array (one value per voter)
+			size_t bytes_th = (size_t)th_size * sizeof(T);
+
+			//1D launch with one thread per voter (rounded up to whole blocks)
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads);
+			dim3 blocks(th_size/threads.x+1);
+
+			// allocate space on the GPU for the updated vote direction
+			T* gpuDir;
+			HANDLE_ERROR( cudaMalloc(&gpuDir, bytes_th) );
+
+			//calculate the new voting direction, then copy it into the voter list; launch failures are reported immediately
+			cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
+			HANDLE_ERROR( cudaGetLastError() );
+			cuda_update_grad <<< blocks, threads >>>(gpuTh, gpuDir, th_size, x , y);
+			HANDLE_ERROR( cudaGetLastError() );
+			HANDLE_ERROR( cudaFree(gpuDir) );		//free allocated memory
+		}
+		
+		/// Host wrapper: uploads the vote image, the gradient image and the atan2 table, calls gpu_update_dir() and downloads the updated gradient image.
+		/// @param cpuVote  host vote image, x * y elements (input)
+		/// @param cpuGrad  host gradient image, 2 * x * y elements; overwritten with the data copied back from the device
+		/// @param cpuTable host atan2 table, (2*rmax+1) * (2*rmax+1) elements (input)
+		/// @param phi, rmax  forwarded unchanged to gpu_update_dir()
+		/// @param x, y     image dimensions in pixels
+		/// NOTE(review): the gpu_update_dir() in this header takes an extra th_size argument and a voter list, so the 7-argument call below does not match it - confirm the intended callee before instantiating.
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+			//number of bytes in one single-channel image; computed in size_t so that large images cannot overflow 32 bits
+			size_t bytes = (size_t)x * y * sizeof(T);
+
+			//number of bytes in the atan2 table
+			size_t bytes_table = (size_t)(2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the vote image and upload it
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the two-value-per-pixel gradient image and upload it
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table and upload it
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the update direction function
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//copy the new gradient image back to the CPU; this blocking copy is also where asynchronous kernel errors surface
+			HANDLE_ERROR(cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuGrad));
+		}
+		
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/iVote/ivote2/vote.cuh b/stim/iVote/ivote2/vote.cuh
new file mode 100644
index 0000000..94250eb
--- /dev/null
+++ b/stim/iVote/ivote2/vote.cuh
@@ -0,0 +1,175 @@
+#ifndef STIM_CUDA_VOTE_H
+#define STIM_CUDA_VOTE_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+
+namespace stim{
+	namespace cuda{
+
+		// Vote kernel: each pixel sums the gradient magnitude of every voter whose voting area contains that pixel.
+		// The gradient image is fetched through the texture object one row segment at a time into dynamic shared
+		// memory, which has to provide room for (2*rmax + blockDim.x) float2 values.
+		// NOTE(review): a single shared row is used per block, which looks like it assumes blockDim.y == 1 - confirm against the launch code.
+		// gpuVote:  output vote image (x * y elements)
+		// in:       texture holding the gradient image; the x component is used as voting direction, the y component as magnitude
+		// gpuTable: atan2 look-up table with (2*rmax+1)^2 entries
+		// phi:      angular half-width of a voting area, rmax: its radius
+		// x, y:     image dimensions in pixels
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, int rmax, int x, int y){
+
+			//dynamic shared memory buffer for one row segment of (direction, magnitude) pairs
+			extern __shared__ float2 s_grad[];
+
+			//first image column covered by this block and the pixel handled by this thread
+			const int block_x0 = blockIdx.x * blockDim.x;
+			const int px = block_x0 + threadIdx.x;
+			const int py = blockIdx.y * blockDim.y + threadIdx.y;
+
+			//threads outside of the image still take part in filling shared memory, but neither accumulate nor store
+			const bool inside = (px < x) && (py < y);
+
+			//geometry of the shared row segment and of the atan2 table
+			const int seg_width = 2 * rmax + blockDim.x;
+			const int seg_x0 = block_x0 - rmax;
+			const int center = threadIdx.x + rmax;
+			const int table_w = 2 * rmax + 1;
+			const int r_sq = rmax * rmax;
+
+			//running total of the votes received by this pixel
+			float total = 0;
+
+			//walk over all rows of the window (along y)
+			for(int oy = -rmax; oy <= rmax; oy++){
+
+				//load the row segment needed by this block; the barriers separate loading from reading
+				__syncthreads();
+				stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, seg_x0, py + oy, seg_width, 1, threadIdx, blockDim);
+				__syncthreads();
+
+				//only threads that own a pixel accumulate votes
+				if(!inside) continue;
+
+				for(int ox = -rmax; ox <= rmax; ox++){
+
+					//angle stored in the atan2 table for this offset
+					const int table_idx = (oy + rmax) * table_w + ox + rmax;
+					const float atan_angle = gpuTable[table_idx];
+
+					//direction (x) and magnitude (y) of the voter at this offset
+					const float2 voter = s_grad[ox + center];
+
+					//the voter counts only if it is closer than rmax and its direction is within phi of the table angle
+					if((ox * ox + oy * oy) < r_sq && abs(atan_angle - voter.x) < phi)
+						total += voter.y;
+				}
+			}
+
+			//store the accumulated vote
+			if(inside){
+				gpuVote[py * x + px] = total;
+			}
+		}
+
+		/// Computes the vote image on the GPU.
+		/// gpuVote: output (x * y elements). gpuGrad: gradient image with two values per pixel (2 * x * y elements). gpuTable: atan2 table, (2*rmax+1)^2 entries.
+		/// The gradient image is copied into a CUDA array and read by the kernel through a texture object; both are released before returning.
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//number of bytes in one single-channel image (in size_t, so that large images cannot overflow 32 bits)
+			size_t bytes = sizeof(T) * x * y;
+
+			//one block covers up to max_threads pixels of a single image row
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+
+			//define a channel descriptor for two 32-bit floating point channels
+			cudaChannelFormatDesc channelDesc =
+					   cudaCreateChannelDesc(32, 32, 0, 0,
+											 cudaChannelFormatKindFloat);
+			cudaArray* cuArray;												//declare the cuda array
+			HANDLE_ERROR( cudaMallocArray(&cuArray, &channelDesc, x, y) );	//allocate the cuda array
+
+			// Copy the image data from global memory to the array
+			HANDLE_ERROR( cudaMemcpyToArray(cuArray, 0, 0, gpuGrad, bytes*2,
+							  cudaMemcpyDeviceToDevice) );
+
+			// Specify texture
+			struct cudaResourceDesc resDesc;				//create a resource descriptor
+			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
+			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
+			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
+
+			// Specify texture object parameters
+			struct cudaTextureDesc texDesc;							//create a texture descriptor
+			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
+			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
+			texDesc.addressMode[1]   = cudaAddressModeWrap;
+			texDesc.filterMode       = cudaFilterModePoint;			//use point sampling (no interpolation)
+			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
+			texDesc.normalizedCoords = 0;							//not using normalized coordinates
+
+			// Create texture object
+			cudaTextureObject_t texObj = 0;
+			HANDLE_ERROR( cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL) );
+
+			// shared memory holds one row segment of float2 values: the block width plus rmax on both sides
+			size_t share_bytes = (2*rmax + threads.x) * sizeof(float2);
+
+			//call the kernel to do the voting and report launch failures immediately
+			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y);
+			HANDLE_ERROR( cudaGetLastError() );
+
+			HANDLE_ERROR( cudaDestroyTextureObject(texObj) );
+			HANDLE_ERROR( cudaFreeArray(cuArray) );
+		}
+
+
+		/// Host wrapper: uploads the gradient image and the atan2 table, calls gpu_vote() and downloads the resulting vote image.
+		/// @param cpuVote  host vote image, x * y elements (output)
+		/// @param cpuGrad  host gradient image, 2 * x * y elements (input)
+		/// @param cpuTable host atan2 table, (2*rmax+1) * (2*rmax+1) elements (input)
+		/// @param phi, rmax  forwarded unchanged to gpu_vote();  x, y: image dimensions in pixels
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+			//number of bytes in one single-channel image; computed in size_t so that large images cannot overflow 32 bits
+			size_t bytes = (size_t)x * y * sizeof(T);
+
+			//number of bytes in the atan2 table
+			size_t bytes_table = (size_t)(2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+			//allocate space on the GPU for the two-value-per-pixel gradient image and upload it
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table and upload it
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//copy the Vote Data back to the CPU; this blocking copy is also where asynchronous kernel errors surface
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuGrad));
+		}
+		
+	}
+}
+
+#endif
diff --git a/stim/iVote/ivote2/vote_atomic.cuh b/stim/iVote/ivote2/vote_atomic.cuh
new file mode 100644
index 0000000..fc0ce47
--- /dev/null
+++ b/stim/iVote/ivote2/vote_atomic.cuh
@@ -0,0 +1,116 @@
+#ifndef STIM_CUDA_VOTE_ATOMIC_H
+#define STIM_CUDA_VOTE_ATOMIC_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		// Vote kernel (atomic version): every voter adds its gradient magnitude to all pixels inside its voting area.
+		// gpuVote is only accumulated into with atomicAdd(), so it has to be zero-initialized by the caller before the launch.
+		// gpuGrad: two values per pixel, element 2*i is the voting direction and element 2*i+1 the vote magnitude.
+		// gpuTable: atan2 look-up table with (2*rmax+1)^2 entries. phi: angular half-width and rmax: radius of the voting area. x, y: image size.
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			// the grid is rounded up to whole blocks: threads past the image edge must not read gpuGrad (their index is wrong or out of bounds)
+			if(xi >= x || yi >= y) return;
+
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// calculate the voting direction based on the gradient direction
+			float theta = gpuGrad[2*i];
+			//calculate the amount of vote for the voter
+			float mag = gpuGrad[2*i + 1];
+
+			// compute the size of window which will be checked for finding the pixels this voter votes for
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+
+			//for every line (along y)
+			for(int yr = -rmax; yr <= rmax; yr++){
+				for(int xr = -rmax; xr <= rmax; xr++){
+					if ((yi+yr)>=0 && (yi+yr)<y && (xi+xr)>=0 && (xi+xr)<x){
+						//find the location of the current pixel in the atan2 table
+						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+						// look up the angle between the voter and the current pixel
+						float atan_angle = gpuTable[ind_t];
+						// check if the current pixel is located in the voting area of this voter.
+						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+							// calculate the 1D index for the current pixel in global memory
+							unsigned int ind_g = (yi+yr)*x + (xi+xr);
+							atomicAdd(&gpuVote[ind_g], mag);
+						}
+					}
+				}
+			}
+		}
+
+		/// Launches the atomic vote kernel. The kernel only adds to gpuVote (x * y elements), so the caller must zero it beforehand.
+		/// gpuGrad: gradient image with two values per pixel (2 * x * y elements). gpuTable: atan2 table with (2*rmax+1)^2 entries.
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//one block covers up to max_threads pixels of a single image row
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+
+			//call the kernel to do the voting and report launch failures immediately
+			cuda_vote <<< blocks, threads>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+			HANDLE_ERROR( cudaGetLastError() );
+
+		}
+
+
+		/// Host wrapper: uploads the gradient image and the atan2 table, calls gpu_vote() and downloads the resulting vote image.
+		/// @param cpuVote  host vote image, x * y elements (output)
+		/// @param cpuGrad  host gradient image, 2 * x * y elements (input)
+		/// @param cpuTable host atan2 table, (2*rmax+1) * (2*rmax+1) elements (input)
+		/// @param phi, rmax  forwarded unchanged to gpu_vote();  x, y: image dimensions in pixels
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+			//number of bytes in one single-channel image; computed in size_t so that large images cannot overflow 32 bits
+			size_t bytes = (size_t)x * y * sizeof(T);
+			//number of bytes in the atan2 table
+			size_t bytes_table = (size_t)(2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate the vote image on the GPU and clear it: the kernel only accumulates into it with atomicAdd()
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+			HANDLE_ERROR(cudaMemset(gpuVote, 0, bytes));
+
+			//allocate space on the GPU for the two-value-per-pixel gradient image and upload it
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table and upload it
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//copy the Vote Data back to the CPU; this blocking copy is also where asynchronous kernel errors surface
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuGrad));
+		}
+		
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/iVote/ivote2/vote_atomic_bb.cuh b/stim/iVote/ivote2/vote_atomic_bb.cuh
new file mode 100644
index 0000000..5a05001
--- /dev/null
+++ b/stim/iVote/ivote2/vote_atomic_bb.cuh
@@ -0,0 +1,151 @@
+#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
+#define STIM_CUDA_VOTE_ATOMIC_BB_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <stim/visualization/aabb2.h>
+#include <stim/visualization/colormap.h>
+#include <math.h>
+
+
+
+namespace stim{
+	namespace cuda{
+
+		// Voter-centric voting kernel. Each thread takes one pixel as a voter and atomically adds that voter's
+		// contribution to every pixel inside its voting region: closer than rmax and with a table angle
+		// within phi of the voter's gradient direction.
+		//	gpuVote  - x*y vote image; the kernel only accumulates into it, so it has to be zeroed beforehand
+		//	gpuGrad  - two values per pixel: [2*i] is the direction (theta), [2*i+1] the magnitude
+		//	gpuTable - (2*rmax+1)*(2*rmax+1) table of angles, staged into dynamic shared memory
+		//	gradmag  - true: each vote is worth the voter's magnitude, false: each vote is worth 1
+		// Requires sizeof(T)*(2*rmax+1)*(2*rmax+1) bytes of dynamic shared memory at launch.
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, size_t x, size_t y, bool gradmag = true){
+
+			//stage the angle table in shared memory
+			extern __shared__ T S[];
+			T* shared_atan = S;
+			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
+			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
+			//a thread reads table entries that other threads copied, so the block must wait for the copy to finish;
+			//the barrier is placed before the early return below so that every thread of the block reaches it
+			__syncthreads();
+
+			// calculate the 2D coordinates for this current thread.
+			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
+			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
+			if(xi >= x || yi >= y) return;							//threads past the image edge are not voters
+
+			// convert 2D coordinates to 1D
+			size_t i = yi * x + xi;
+			// the voting direction is the gradient direction
+			float theta = gpuGrad[2*i];
+			// the amount of vote cast by this voter
+			float mag = gpuGrad[2*i + 1];
+
+			//bounding box of the voting region: the voter, the tip of the voting direction and both cone corners
+			stim::aabb2<int> bb(xi, yi);
+			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));
+			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));
+			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));
+			//make sure the bounding box doesn't go outside the image
+			bb.trim_low(0, 0);
+			bb.trim_high(x-1, y-1);
+
+			int x_table = 2*rmax +1;									//number of entries in one table row
+			int rmax_sq = rmax * rmax;
+
+			for(size_t by = bb.low[1]; by <= bb.high[1]; by++){		//for each element in the bounding box
+				int dy = by - yi;										//y offset of the current point relative to the voter
+				T dy_sq = dy * dy;
+				for(size_t bx = bb.low[0]; bx <= bb.high[0]; bx++){
+					int dx = bx - xi;									//x offset of the current point relative to the voter
+					T dx_sq = dx * dx;
+					int lut_i = (rmax - dy) * x_table + rmax - dx;		//table entry for the offset (dx, dy)
+					T alpha = shared_atan[lut_i];
+					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
+						//size_t keeps the flat pixel index from being truncated to 32 bits
+						size_t ind_g = by * x + bx;
+						if(gradmag) atomicAdd(&gpuVote[ind_g], mag);	//add the gradient magnitude (if the gradmag flag is enabled)
+						else		atomicAdd(&gpuVote[ind_g], 1.0f);	//otherwise just add 1
+					}
+				}
+			}
+
+		}
+	
+
+		/// Iterative voting for an image
+		/// @param gpuVote is the resulting vote image (votes are accumulated into it by the kernel)
+		/// @param gpuGrad is the gradient of the input image
+		/// @param gpuTable is the pre-computed atan2() table
+		/// @param phi is the angle of the vote region
+		/// @param rmax is the estimated radius of the blob (defines the "width" of the vote region)
+		/// @param x and y are the spatial dimensions of the gradient image
+		/// @param gradmag defines whether or not the gradient magnitude is taken into account during the vote
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y, bool gradmag = true){
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			unsigned int side = (unsigned int)sqrt((double)max_threads);		//square block, side*side <= max_threads
+			dim3 threads(side, side);
+			dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
+			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
+			size_t shared_mem_req = table_bytes;								//the kernel stages the whole atan2 table in shared memory
+			if (DEBUG) std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
+			size_t shared_mem = stim::sharedMemPerBlock();
+			if(shared_mem_req > shared_mem){
+				std::cout<<"Error: insufficient shared memory for this implementation of cuda_vote()."<<std::endl;
+				exit(1);
+			}
+			//call the kernel to do the voting and report a rejected launch instead of ignoring it
+			cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y, gradmag);
+			HANDLE_ERROR(cudaGetLastError());
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+			// Host-side convenience wrapper around gpu_vote(): uploads cpuGrad (2*x*y values) and
+			// cpuTable ((2*rmax+1)^2 values), runs the vote and downloads x*y values into cpuVote.
+			// Every CUDA call is checked with HANDLE_ERROR.
+			//number of bytes in one image channel (size_t so that large images cannot overflow 32 bits)
+			size_t bytes = (size_t)x * y * sizeof(T);
+
+			//number of bytes in the atan2 table
+			size_t table_dim = 2 * (size_t)rmax + 1;
+			size_t bytes_table = table_dim * table_dim * sizeof(T);
+
+			//allocate the vote image and clear it: the kernel only ever adds to it with atomicAdd
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+			HANDLE_ERROR(cudaMemset(gpuVote, 0, bytes));
+
+			//allocate space on the GPU for the input gradient image and copy it over
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table and copy it over
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//copy the vote data back to the CPU
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuGrad));
+		}
+		
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/iVote/ivote2/vote_atomic_shared.cuh b/stim/iVote/ivote2/vote_atomic_shared.cuh
new file mode 100644
index 0000000..102bd20
--- /dev/null
+++ b/stim/iVote/ivote2/vote_atomic_shared.cuh
@@ -0,0 +1,166 @@
+#ifndef STIM_CUDA_VOTE_ATOMIC_SHARED_H
+#define STIM_CUDA_VOTE_ATOMIC_SHARED_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+
+//#include "writebackshared.cuh"
+namespace stim{
+	namespace cuda{
+
+		// Voter-centric voting kernel that gathers the votes of one thread block in shared memory before
+		// adding them to the global vote image.
+		//	gpuVote  - x*y vote image; the kernel only accumulates into it, so it has to be zeroed beforehand
+		//	gpuGrad  - two values per pixel: [2*i] is the direction (theta), [2*i+1] the magnitude
+		//	gpuTable - (2*rmax+1)*(2*rmax+1) table of angles, indexed by the offset between voter and pixel
+		// Requires (2*rmax+blockDim.x)*(2*rmax+blockDim.y) floats of dynamic shared memory at launch.
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			//generate a pointer to the shared memory
+			extern __shared__ float s_vote[];
+
+			// calculate the 2D coordinates for this current thread.
+			int bxi = blockIdx.x * blockDim.x;
+			int byi = blockIdx.y * blockDim.y;
+			int xi = bxi + threadIdx.x;
+			int yi = byi + threadIdx.y;
+			//only threads that sit on an image pixel are voters; the others still help with the shared window
+			bool is_voter = (xi < x && yi < y);
+
+			//find the starting point and size of the window that is kept in shared memory:
+			//the pixels of this block plus a border of rmax pixels on every side
+			int bxs = bxi - rmax;
+			int bys = byi - rmax;
+			int xwidth = 2*rmax + blockDim.x;
+			int ywidth = 2*rmax + blockDim.y;
+			unsigned int n_window = xwidth * ywidth;
+
+			//compute the coordinates of this pixel in the 2D shared window
+			int sx_rx = threadIdx.x + rmax;
+			int sy_ry = threadIdx.y + rmax;
+
+			//width of one row of the atan2 table and the squared voting radius
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+
+			//total number of threads in the block and the 1D ID of this thread
+			unsigned int tThreads = blockDim.x * blockDim.y;
+			unsigned int ti =  threadIdx.y * (blockDim.x) + threadIdx.x;
+
+			//initialize the shared window to zero; every thread of the block takes part so that no
+			//element is skipped in blocks that hang over the image edge
+			for (unsigned int s = ti; s < n_window; s += tThreads){
+				s_vote[s] = 0;
+			}
+			//barriers are kept outside of any thread-dependent branch so the whole block reaches them
+			__syncthreads();
+
+			if(is_voter){
+				// convert 2D coordinates to 1D (only now, so that gpuGrad is never read out of bounds)
+				int i = yi * x + xi;
+				// the voting direction is the gradient direction
+				float theta = gpuGrad[2*i];
+				// the amount of vote cast by this voter
+				float mag = gpuGrad[2*i + 1];
+
+				//for every line (along y)
+				for(int yr = -rmax; yr <= rmax; yr++){
+					//position of the current row in the shared window
+					unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
+					for(int xr = -rmax; xr <= rmax; xr++){
+
+						//find the location of the current pixel in the atan2 table
+						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+						//look up the angle between the voter and the current pixel
+						float atan_angle = gpuTable[ind_t];
+
+						// check if the current pixel is located in the voting area of this voter.
+						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+							//1D index of the current pixel in the shared window
+							unsigned int s_Idx = sIdx_y1d + (sx_rx + xr);
+							atomicAdd(&s_vote[s_Idx], mag);
+						}
+					}
+				}
+			}
+
+			//all votes of the block must be in shared memory before any of it is written back
+			__syncthreads();
+
+			//write shared memory back to global memory
+			for (unsigned int s = ti; s < n_window; s += tThreads){
+				//image coordinates of this window element (negative when it lies left of / above the image)
+				int gx = bxs + (int)(s % xwidth);
+				int gy = bys + (int)(s / xwidth);
+				if (gx >= 0 && gx < x && gy >= 0 && gy < y){
+					//the windows of neighbouring blocks overlap on the border pixels, hence the atomic add
+					atomicAdd(&gpuVote[gy * x + gx], s_vote[s]);
+				}
+			}
+
+		}
+
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+			// Launches cuda_vote over an x-by-y image with square thread blocks. The pointers are handed to
+			// the kernel untouched (they are expected to be device pointers), as are phi and rmax.
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			unsigned int side = (unsigned int)sqrt((double)max_threads);
+			dim3 threads(side, side);
+			dim3 blocks(x/threads.x + 1 , y/threads.y+1);
+
+			//shared memory for the vote window: the block plus a border of rmax pixels on every side
+			unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*sizeof(T);
+			//call the kernel to do the voting, then surface a rejected launch (e.g. too much shared memory)
+			cuda_vote <<< blocks, threads, share_bytes>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+			HANDLE_ERROR(cudaGetLastError());
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+			// Host-side convenience wrapper around gpu_vote(): uploads cpuGrad (2*x*y values) and
+			// cpuTable ((2*rmax+1)^2 values), runs the vote and downloads x*y values into cpuVote.
+			// Every CUDA call is checked with HANDLE_ERROR.
+			//number of bytes in one image channel (size_t so that large images cannot overflow 32 bits)
+			size_t bytes = (size_t)x * y * sizeof(T);
+
+			//number of bytes in the atan2 table
+			size_t table_dim = 2 * (size_t)rmax + 1;
+			size_t bytes_table = table_dim * table_dim * sizeof(T);
+
+			//allocate the vote image and clear it: the kernel only ever adds to it with atomicAdd
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+			HANDLE_ERROR(cudaMemset(gpuVote, 0, bytes));
+
+			//allocate space on the GPU for the input gradient image and copy it over
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table and copy it over
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//copy the vote data back to the CPU
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuGrad));
+		}
+		
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/iVote/ivote2/vote_shared.cuh b/stim/iVote/ivote2/vote_shared.cuh
new file mode 100644
index 0000000..f53fe5d
--- /dev/null
+++ b/stim/iVote/ivote2/vote_shared.cuh
@@ -0,0 +1,139 @@
+#ifndef STIM_CUDA_VOTE_SHARED_H
+#define STIM_CUDA_VOTE_SHARED_H
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		// Pixel-centric voting kernel: every thread owns one pixel and sums the magnitudes of all voters
+		// whose voting region contains that pixel, then stores the sum in gpuVote.
+		// The gradient rows are staged one at a time in dynamic shared memory (s_grad).
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			//dynamic shared memory; the number of bytes is chosen by the caller at launch time
+			extern __shared__ float s_grad[];
+
+			//first image column covered by this block
+			const int block_x0 = blockIdx.x * blockDim.x;
+
+			//image coordinates of the pixel owned by this thread
+			const int px = block_x0 + threadIdx.x;
+			const int py = blockIdx.y * blockDim.y + threadIdx.y;
+			//position of that pixel in the 1D vote image
+			const int out_idx = py * x + px;
+			//threads hanging over the image edge still reach the copies and barriers but do not vote
+			const bool on_image = (px < x) && (py < y);
+
+			//running total of the votes received by this pixel
+			float votes = 0;
+
+			//a staged row holds the block's columns plus rmax extra columns on each side
+			const int row_pixels = 2 * rmax + blockDim.x;
+			const int row_x0 = block_x0 - rmax;
+			//column of this thread's own pixel inside the staged row
+			const int own_col = threadIdx.x + rmax;
+
+			//row length of the atan2 table and the squared search radius
+			const int table_w = 2*rmax + 1;
+			const int r_sq = rmax * rmax;
+
+			//walk over the rows of the search window
+			for(int yr = -rmax; yr <= rmax; yr++){
+				//image row of the voters examined in this iteration; rows outside the image are skipped
+				const int vy = py + yr;
+				if (vy >= y || vy < 0) continue;
+
+				//cpyG2S1D2ch is expected to stage that gradient row in shared memory (two floats per pixel)
+				__syncthreads();
+				cpyG2S1D2ch<float>(s_grad, gpuGrad, row_x0, vy, 2*row_pixels, 1, threadIdx, blockDim, x, y);
+				__syncthreads();
+
+				if (!on_image) continue;
+
+				for(int xr = -rmax; xr <= rmax; xr++){
+
+					//table entry for the voter at offset (xr, yr) from this pixel
+					const int id_t = (yr + rmax) * table_w + xr + rmax;
+					const float atan_angle = gpuTable[id_t];
+
+					//direction and magnitude of that voter, read from the staged row
+					const int col = xr + own_col;
+					const float theta = s_grad[col*2];
+					const float mag = s_grad[col*2 + 1];
+
+					//the voter counts when it is closer than rmax and the table angle is within phi of its direction
+					if (((xr * xr + yr *yr)< r_sq) && (abs(atan_angle - theta) <phi))
+						votes += mag;
+				}
+			}
+
+			if (on_image)
+				gpuVote[out_idx] = votes;
+		}
+
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+			// Launches cuda_vote over an x-by-y image. The pointers are handed to the kernel untouched
+			// (they are expected to be device pointers), as are phi and rmax.
+			//one row of threads per block, as wide as the device allows
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			//enough blocks along x to cover a row (rounded up) and one block row per image row
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+
+			//shared memory for one staged gradient row: block width plus 2*rmax pixels, two values each
+			unsigned int share_bytes = (2*rmax + threads.x)*1*2*sizeof(T);
+			//call the kernel to do the voting, then surface a rejected launch instead of ignoring it
+			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+			HANDLE_ERROR(cudaGetLastError());
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+			// Host-side convenience wrapper around gpu_vote(): uploads cpuGrad (2*x*y values) and
+			// cpuTable ((2*rmax+1)^2 values), runs the vote and downloads x*y values into cpuVote.
+			// Every CUDA call is checked with HANDLE_ERROR.
+
+			//number of bytes in one image channel (size_t so that large images cannot overflow 32 bits)
+			size_t bytes = (size_t)x * y * sizeof(T);
+
+			//number of bytes in the atan2 table
+			size_t table_dim = 2 * (size_t)rmax + 1;
+			size_t bytes_table = table_dim * table_dim * sizeof(T);
+
+			//allocate space on the GPU for the vote image (the kernel writes every pixel of it)
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+			//allocate space on the GPU for the input gradient image and copy it over
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table and copy it over
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//copy the vote data back to the CPU
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuGrad));
+		}
+		
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/iVote/ivote2/vote_shared_32-32.cuh b/stim/iVote/ivote2/vote_shared_32-32.cuh
new file mode 100644
index 0000000..23c9481
--- /dev/null
+++ b/stim/iVote/ivote2/vote_shared_32-32.cuh
@@ -0,0 +1,150 @@
+#ifndef STIM_CUDA_VOTE_SHARED_32_32_H
+#define STIM_CUDA_VOTE_SHARED_32_32_H
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		// Pixel-centric voting kernel: every thread owns one pixel and sums the magnitudes of all voters
+		// whose voting region contains that pixel, then stores the sum in gpuVote.
+		// The gradient window needed by the whole block is staged once in dynamic shared memory, which has
+		// to hold (2*rmax+blockDim.x)*(2*rmax+blockDim.y) pixels of two floats each.
+		//	gpuVote  - x*y vote image, overwritten with the result
+		//	gpuGrad  - gradient image, copied into shared memory by cpyG2S2D2ch
+		//	gpuTable - (2*rmax+1)*(2*rmax+1) atan2 table, indexed by the voter's offset from the pixel
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ float s_grad[];
+
+			//calculate the start point for this block
+			int bxi = blockIdx.x * blockDim.x;
+			int byi = blockIdx.y * blockDim.y;
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			int yi = byi + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			//calculate the width and height of the shared memory window (in pixels)
+			int xwidth = 2 * rmax + blockDim.x;
+			int ywidth = 2 * rmax + blockDim.y;
+			//image coordinates of the top-left corner of that window
+			int bxs = bxi - rmax;
+			int bys = byi - rmax;
+
+			//copy the portion of the image necessary for this block to shared memory;
+			//every thread of the block reaches both barriers, including threads past the image edge
+			__syncthreads();
+			cpyG2S2D2ch<float>(s_grad, gpuGrad, bxs, bys, 2*xwidth, ywidth, threadIdx, blockDim, x, y);
+			__syncthreads();
+
+			//threads past the image edge own no pixel, so they are done once the copy is complete
+			if(xi >= x || yi >= y) return;
+
+			// compute the size of window which will be checked for finding the proper voters for this pixel
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			//compute the coordinates of this pixel in the 2D shared window
+			int sx_rx = threadIdx.x + rmax;
+			int sy_ry = threadIdx.y + rmax;
+
+			// define a local variable to sum the votes from the voters
+			float sum = 0;
+
+			for(int yr = -rmax; yr <= rmax; yr++){
+				//compute the position of the current voter in the shared memory along the y axis.
+				unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
+
+				for(int xr = -rmax; xr <= rmax; xr++){
+
+					//find the 1D index of this voter in the 2D shared window (two floats per pixel)
+					unsigned int s_Idx2 = (sIdx_y1d + (sx_rx + xr)) * 2;
+
+					//find the location of this voter in the atan2 table
+					int id_t = (yr + rmax) * x_table + xr + rmax;
+
+					// look up the angle between the current voter and this pixel
+					float atan_angle = gpuTable[id_t];
+
+					// direction and magnitude of the voter's gradient
+					float theta = s_grad[s_Idx2];
+					float mag = s_grad[s_Idx2 + 1];
+
+					// check if this pixel is located in the voting area of the current voter.
+					if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+						sum += mag;
+					}
+				}
+			}
+
+			gpuVote[i] = sum;
+
+		}
+
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+			// Launches cuda_vote over an x-by-y image with square thread blocks. The pointers are handed to
+			// the kernel untouched (they are expected to be device pointers), as are phi and rmax.
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			unsigned int side = (unsigned int)sqrt((double)max_threads);
+			dim3 threads(side, side);
+			dim3 blocks(x/threads.x + 1 , y/threads.y+1);
+
+			//shared memory for the gradient window: the block plus rmax pixels on every side, two values per pixel
+			unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*2*sizeof(T);
+
+			//call the kernel to do the voting, then surface a rejected launch (e.g. too much shared memory)
+			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+			HANDLE_ERROR(cudaGetLastError());
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+			// Host-side convenience wrapper around gpu_vote(): uploads cpuGrad (2*x*y values) and
+			// cpuTable ((2*rmax+1)^2 values), runs the vote and downloads x*y values into cpuVote.
+			// Every CUDA call is checked with HANDLE_ERROR.
+
+			//number of bytes in one image channel (size_t so that large images cannot overflow 32 bits)
+			size_t bytes = (size_t)x * y * sizeof(T);
+
+			//number of bytes in the atan2 table
+			size_t table_dim = 2 * (size_t)rmax + 1;
+			size_t bytes_table = table_dim * table_dim * sizeof(T);
+
+			//allocate space on the GPU for the vote image (the kernel writes every pixel of it)
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+			//allocate space on the GPU for the input gradient image and copy it over
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table and copy it over
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//copy the vote data back to the CPU
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuGrad));
+		}
+		
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/iVote/ivote2/vote_threshold_global.cuh b/stim/iVote/ivote2/vote_threshold_global.cuh
new file mode 100644
index 0000000..7a944f1
--- /dev/null
+++ b/stim/iVote/ivote2/vote_threshold_global.cuh
@@ -0,0 +1,112 @@
+#ifndef STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
+#define STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		// Voter-list voting kernel: thread k handles voter k of a list of th_size voters and atomically adds
+		// that voter's magnitude to every image pixel inside its voting region.
+		//	gpuVote  - x*y vote image; the kernel only accumulates into it, so it has to be zeroed beforehand
+		//	gpuTh    - three values per voter: [3k] direction (theta), [3k+1] magnitude, [3k+2] 1D pixel index
+		//	gpuTable - (2*rmax+1)*(2*rmax+1) atan2 table, indexed by the offset between voter and pixel
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){
+
+			// calculate the index of the voter handled by this thread
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			// threads past the end of the list must leave before gpuTh is read
+			if(xi >= th_size) return;
+
+			float theta = gpuTh[3*xi];									//voting direction (the gradient direction)
+			float mag = gpuTh[3*xi + 1];								//gradient magnitude of the current voter
+			//recover the x, y coordinates of this voter in the original image from its 1D index;
+			//they are signed so that adding a negative offset near the border gives a negative coordinate
+			unsigned int i_v = gpuTh[3*xi+2];
+			int y_v = i_v/x;
+			int x_v = i_v - (y_v*x);
+
+			int x_table = 2*rmax +1;									//row length of the atan2 table
+			int rmax_sq = rmax * rmax;
+
+			for(int yr = -rmax; yr <= rmax; yr++){
+				int py = y_v + yr;
+				if(py < 0 || py >= y) continue;							//row outside the image
+				for(int xr = -rmax; xr <= rmax; xr++){
+					int px = x_v + xr;
+					if(px < 0 || px >= x) continue;						//column outside the image
+
+					//find the location of the current pixel in the atan2 table
+					unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+					float atan_angle = gpuTable[ind_t];					//angle between the voter and the current pixel
+
+					// check if the current pixel is located in the voting area of this voter.
+					if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+						unsigned int ind_g = py*x + px;					//1D index of the current pixel in global memory
+						atomicAdd(&gpuVote[ind_g], mag);
+					}
+				}
+			}
+		}
+
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
+			// Launches cuda_vote with at least one thread for each of the th_size voters stored in gpuTh.
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads);
+			//1D grid; the +1 covers a partial final block
+			dim3 blocks(th_size/threads.x + 1);
+
+			//call the kernel to do the voting, then surface a rejected launch instead of ignoring it
+			cuda_vote <<< blocks, threads>>>(gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
+			HANDLE_ERROR(cudaGetLastError());
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+			// Host-side convenience wrapper around gpu_vote(): turns the gradient image into the voter list
+			// this implementation works on, runs the vote and downloads x*y values into cpuVote.
+			// NOTE(review): assumes cpuGrad stores (direction, magnitude) pairs per pixel, which is the layout
+			// the other vote kernels read - confirm against the caller.
+			size_t pixels = (size_t)x * y;
+			size_t bytes = pixels * sizeof(T);								//one image channel
+			size_t table_dim = 2 * (size_t)rmax + 1;
+			size_t bytes_table = table_dim * table_dim * sizeof(T);			//the atan2 table
+			//gpu_vote() expects th_size voters of three values each (direction, magnitude, 1D pixel index),
+			//so list every pixel as a voter; the index is stored as T (exact for float up to 2^24 pixels)
+			T* cpuTh = new T[3 * pixels];
+			for(size_t p = 0; p < pixels; p++){
+				cpuTh[3*p] = cpuGrad[2*p];
+				cpuTh[3*p + 1] = cpuGrad[2*p + 1];
+				cpuTh[3*p + 2] = (T)p;
+			}
+			//allocate the vote image and clear it: the kernel only ever adds to it with atomicAdd
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+			HANDLE_ERROR(cudaMemset(gpuVote, 0, bytes));
+			//upload the voter list and the atan2 table
+			T* gpuTh;
+			HANDLE_ERROR(cudaMalloc(&gpuTh, bytes*3));
+			HANDLE_ERROR(cudaMemcpy(gpuTh, cpuTh, bytes*3, cudaMemcpyHostToDevice));
+			delete[] cpuTh;
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+			//vote with every pixel as a voter, then copy the vote data back to the CPU
+			gpu_vote<T>(gpuVote, gpuTh, gpuTable, phi, rmax, (unsigned int)pixels, x , y);
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuTh));
+		}
+		
+	}
+}
+
+#endif
\ No newline at end of file
--
libgit2 0.21.4