diff --git a/stim/cuda/ivote/david_update_dir_global.cuh b/stim/cuda/ivote/david_update_dir_global.cuh
deleted file mode 100644
index 3ae624c..0000000
--- a/stim/cuda/ivote/david_update_dir_global.cuh
+++ /dev/null
@@ -1,171 +0,0 @@
-#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
-#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
-
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include <math.h>
-#include "cpyToshare.cuh" 
-
-#define RMAX_TEST	8
-
-namespace stim{
-	namespace cuda{
-	
-		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
-		template<typename T>
-		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
-			extern __shared__ T atan2_table[];
-			
-			//calculate the start point for this block
-			//int bxi = blockIdx.x * blockDim.x;
-
-			stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x);
-
-			__syncthreads();
-			
-			// calculate the 2D coordinates for this current thread.
-			//int xi = bxi + threadIdx.x;
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-			if(xi >= x || yi >= y) return;													//if the index is outside of the image, terminate the kernel
-
-			int i = yi * x + xi;												// convert 2D coordinates to 1D
-			
-			float theta = gpuGrad[2*i];											// calculate the voting direction based on the grtadient direction - global memory fetch			
-			gpuDir[i] = 0;														//initialize the vote direction to zero			
-			float max = 0;														// define a local variable to maximum value of the vote image in the voting area for this voter
-			int id_x = 0;														// define two local variables for the x and y position of the maximum
-			int id_y = 0;
-			
-			int x_table = 2*rmax +1;											// compute the size of window which will be checked for finding the voting area for this voter
-			int rmax_sq = rmax * rmax;
-			int tx_rmax = threadIdx.x + rmax;
-			float atan_angle;
-			float vote_c;
-			unsigned int ind_t;
-			for(int yr = -rmax; yr <= rmax; yr++){					//for each counter in the y direction
-				if (yi+yr >= 0 && yi + yr < y){									//if the counter exists (we aren't looking outside of the image)
-					for(int xr = -rmax; xr <= rmax; xr++){					//for each counter in the x direction
-						if((xr * xr + yr *yr)< rmax_sq){								//if the counter is within range of the voter
-
-							ind_t = (rmax - yr) * x_table + rmax - xr;		//calculate the index to the atan2 table							
-							atan_angle = atan2_table[ind_t];								//retrieve the direction vector from the table						
-
-							//atan_angle = atan2((float)yr, (float)xr);
-							
-							if (abs(atan_angle - theta) <phi){							// check if the current pixel is located in the voting angle of this voter.				
-								vote_c = gpuVote[(yi+yr)*x + (xi+xr)];			// find the vote value for the current counter						
-								if(vote_c>max) {								// compare the vote value of this pixel with the max value to find the maxima and its index.
-									max = vote_c;
-									id_x =  xr;
-									id_y =  yr;
-								}
-							}
-						}
-					}
-				}
-			}
-							
-			unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
-			float new_angle = gpuTable[ind_m];
-
-			if(xi < x && yi < y)
-				gpuDir[i] = new_angle;
-		}										//end kernel
-
-		// this kernel updates the gradient direction by the calculated voting direction.
-		template<typename T>
-		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
-
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-		
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-			
-			//update the gradient image with the vote direction
-			gpuGrad[2*i] = gpuDir[i];
-		}
-		
-		template<typename T>
-		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-
-			dim3 threads(sqrt(max_threads), sqrt(max_threads));
-			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
-
-			
-			
-			// allocate space on the GPU for the updated vote direction
-			T* gpuDir;
-			cudaMalloc(&gpuDir, bytes);	
-
-			size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2);
-			std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl;
-
-			//call the kernel to calculate the new voting direction
-			cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-
-			//call the kernel to update the gradient direction
-			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
-			
-			//free allocated memory
-			cudaFree(gpuDir);
-
-		}
-		
-		template<typename T>
-		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);
-
-			//copy the input vote image to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the update direction function
-			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the new gradient image back to the CPU
-			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/re_sample.cuh b/stim/cuda/ivote/re_sample.cuh
new file mode 100644
index 0000000..06bfeb7
--- /dev/null
+++ b/stim/cuda/ivote/re_sample.cuh
@@ -0,0 +1,106 @@
+#ifndef STIM_CUDA_RE_SAMPLE_H
+#define STIM_CUDA_RE_SAMPLE_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/templates/gaussian_blur.cuh>
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
+
+			unsigned int sigma_ds = 1/resize;
+			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
+			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
+			
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+			
+			if(xi< x && yi< y){
+				if(xi%sigma_ds==0){
+					if(yi%sigma_ds==0){
+						gpuI[i] = gpuI0[(yi/sigma_ds)*x_ds + xi/sigma_ds];
+					}
+				}
+				else gpuI[i] = 0;
+
+				//int x_org = xi * sigma_ds ;
+				//int y_org = yi * sigma_ds ;
+				//int i_org = y_org * x + x_org;
+				//gpuI[i] = gpuI0[i_org];
+			}
+
+		}
+
+
+		/// Applies a Gaussian blur to a 2D image stored on the GPU
+		template<typename T>
+		void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
+
+			
+			//unsigned int sigma_ds = 1/resize;
+			//unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
+			//unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
+			
+			//get the number of pixels in the image
+			//unsigned int pixels_ds = x_ds * y_ds;
+			
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			
+			//stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y);
+			
+			//resample the image
+			cuda_re_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
+
+		}
+
+		/// Applies a Gaussian blur to a 2D image stored on the CPU
+		template<typename T>
+		void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x*y;
+			unsigned int bytes = sizeof(T) * pixels;
+			
+			unsigned int sigma_ds = 1/resize;
+			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
+			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
+			unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
+			
+
+
+			//allocate space on the GPU for the original image
+			T* gpuI0;
+			cudaMalloc(&gpuI0, bytes_ds);
+			
+			
+			//copy the image data to the GPU
+			cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice);
+
+			//allocate space on the GPU for the down sampled image
+			T* gpuI;
+			cudaMalloc(&gpuI, bytes);
+
+			//run the GPU-based version of the algorithm
+			gpu_re_sample<T>(gpuI, gpuI0, resize, x, y);
+
+			//copy the image data to the GPU
+			cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
+
+			cudaFree(gpuI0);
+			cudeFree(gpuI);
+		}
+	
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/update_dir_bb.cuh b/stim/cuda/ivote/update_dir_bb.cuh
new file mode 100644
index 0000000..bb04f23
--- /dev/null
+++ b/stim/cuda/ivote/update_dir_bb.cuh
@@ -0,0 +1,185 @@
+#ifndef STIM_CUDA_UPDATE_DIR_BB_H
+#define STIM_CUDA_UPDATE_DIR_BB_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <stim/visualization/aabb2.h>
+#include <stim/visualization/colormap.h>
+#include <math.h> 
+
+//#define RMAX_TEST	8
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
+			extern __shared__ T S[];
+			T* shared_atan = S;
+			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
+			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
+
+			//T* shared_vote = &S[n_table];
+			//size_t template_size_x = (blockDim.x + 2 * rmax);
+			//size_t template_size_y = (blockDim.y + 2 * rmax);
+			//stim::cuda::threadedMemcpy2D((char*)shared_vote, (char*)gpuVote, template_size_x, template_size_y, x,  threadIdx.y * blockDim.x + threadIdx.x, blockDim.x * blockDim.y);
+			
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;				//calculate the 2D coordinates for this current thread.
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			if(xi >= x || yi >= y) return;								//if the index is outside of the image, terminate the kernel
+
+			int i = yi * x + xi;										//convert 2D coordinates to 1D
+			float theta = gpuGrad[2*i];									//calculate the voting direction based on the grtadient direction - global memory fetch
+			
+			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
+			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));
+			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
+			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
+
+			int x_table = 2*rmax +1;
+			int lut_i;
+			T rmax_sq = rmax * rmax;
+			T dx_sq, dy_sq;
+
+			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
+			bb.trim_high(x-1, y-1);
+
+			int by, bx;
+			int dx, dy;													//coordinate relative to (xi, yi)
+			T v;
+			T max_v = 0;												//initialize the maximum vote value to zero
+			T alpha;
+			int max_dx = bb.low[0];
+			int max_dy = bb.low[1];
+			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
+				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
+				dy_sq = dy * dy;
+				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
+					dx = bx - xi;
+					dx_sq = dx * dx;
+					lut_i = (rmax - dy) * x_table + rmax - dx;
+					alpha = shared_atan[lut_i];
+					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
+						v = gpuVote[by * x + bx];				// find the vote value for the current counter
+						if(v > max_v){
+							max_v = v;
+							max_dx = dx;
+							max_dy = dy;
+						}
+					}
+				}
+			}			
+			gpuDir[i] = atan2((T)max_dy, (T)max_dx);
+		}
+	
+		
+
+		// this kernel updates the gradient direction by the calculated voting direction.
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			if(xi >= x || yi >= y) return;
+		
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+			
+			//update the gradient image with the vote direction
+			gpuGrad[2*i] = gpuDir[i];
+		}
+		
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+			
+			// allocate space on the GPU for the updated vote direction
+			T* gpuDir;
+			HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );	
+
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			
+			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
+			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+
+			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
+			//size_t curtain = 2 * rmax;
+			//size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain);
+			size_t shared_mem_req = table_bytes;// + template_bytes;
+			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
+
+			size_t shared_mem = stim::sharedMemPerBlock();
+			if(shared_mem_req > shared_mem){
+				std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
+				exit(1);
+			}
+
+			//call the kernel to calculate the new voting direction
+			cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+			//stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
+
+			//exit(0);
+
+			//threads = dim3( sqrt(max_threads), sqrt(max_threads) );
+			//blocks = dim3(x/threads.x + 1, y/threads.y + 1);
+
+			//call the kernel to update the gradient direction
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			//free allocated memory
+			HANDLE_ERROR( cudaFree(gpuDir) );
+
+		}
+		
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);
+
+			//copy the input vote image to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the update direction function
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the new gradient image back to the CPU
+			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/update_dir_global.cuh b/stim/cuda/ivote/update_dir_global.cuh
deleted file mode 100644
index b575619..0000000
--- a/stim/cuda/ivote/update_dir_global.cuh
+++ /dev/null
@@ -1,185 +0,0 @@
-#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
-#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
-
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include <stim/visualization/aabb2.h>
-#include <stim/visualization/colormap.h>
-#include <math.h> 
-
-//#define RMAX_TEST	8
-
-namespace stim{
-	namespace cuda{
-
-		template<typename T>
-		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
-			extern __shared__ T S[];
-			T* shared_atan = S;
-			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
-			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
-
-			//T* shared_vote = &S[n_table];
-			//size_t template_size_x = (blockDim.x + 2 * rmax);
-			//size_t template_size_y = (blockDim.y + 2 * rmax);
-			//stim::cuda::threadedMemcpy2D((char*)shared_vote, (char*)gpuVote, template_size_x, template_size_y, x,  threadIdx.y * blockDim.x + threadIdx.x, blockDim.x * blockDim.y);
-			
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;				//calculate the 2D coordinates for this current thread.
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-
-			if(xi >= x || yi >= y) return;								//if the index is outside of the image, terminate the kernel
-
-			int i = yi * x + xi;										//convert 2D coordinates to 1D
-			float theta = gpuGrad[2*i];									//calculate the voting direction based on the grtadient direction - global memory fetch
-			
-			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
-			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));
-			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
-			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
-
-			int x_table = 2*rmax +1;
-			int lut_i;
-			T rmax_sq = rmax * rmax;
-			T dx_sq, dy_sq;
-
-			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
-			bb.trim_high(x-1, y-1);
-
-			int by, bx;
-			int dx, dy;													//coordinate relative to (xi, yi)
-			T v;
-			T max_v = 0;												//initialize the maximum vote value to zero
-			T alpha;
-			int max_dx = bb.low[0];
-			int max_dy = bb.low[1];
-			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
-				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
-				dy_sq = dy * dy;
-				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
-					dx = bx - xi;
-					dx_sq = dx * dx;
-					lut_i = (rmax - dy) * x_table + rmax - dx;
-					alpha = shared_atan[lut_i];
-					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
-						v = gpuVote[by * x + bx];				// find the vote value for the current counter
-						if(v > max_v){
-							max_v = v;
-							max_dx = dx;
-							max_dy = dy;
-						}
-					}
-				}
-			}			
-			gpuDir[i] = atan2((T)max_dy, (T)max_dx);
-		}
-	
-		
-
-		// this kernel updates the gradient direction by the calculated voting direction.
-		template<typename T>
-		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
-
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-
-			if(xi >= x || yi >= y) return;
-		
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-			
-			//update the gradient image with the vote direction
-			gpuGrad[2*i] = gpuDir[i];
-		}
-		
-		template<typename T>
-		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-			
-			// allocate space on the GPU for the updated vote direction
-			T* gpuDir;
-			HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );	
-
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			
-			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
-			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
-
-			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
-			//size_t curtain = 2 * rmax;
-			//size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain);
-			size_t shared_mem_req = table_bytes;// + template_bytes;
-			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
-
-			size_t shared_mem = stim::sharedMemPerBlock();
-			if(shared_mem_req > shared_mem){
-				std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
-				exit(1);
-			}
-
-			//call the kernel to calculate the new voting direction
-			cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-			//stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
-
-			//exit(0);
-
-			//threads = dim3( sqrt(max_threads), sqrt(max_threads) );
-			//blocks = dim3(x/threads.x + 1, y/threads.y + 1);
-
-			//call the kernel to update the gradient direction
-			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
-			//free allocated memory
-			HANDLE_ERROR( cudaFree(gpuDir) );
-
-		}
-		
-		template<typename T>
-		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);
-
-			//copy the input vote image to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the update direction function
-			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the new gradient image back to the CPU
-			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/vote_atomic_bb.cuh b/stim/cuda/ivote/vote_atomic_bb.cuh
new file mode 100644
index 0000000..32de21f
--- /dev/null
+++ b/stim/cuda/ivote/vote_atomic_bb.cuh
@@ -0,0 +1,142 @@
+#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
+#define STIM_CUDA_VOTE_ATOMIC_BB_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <stim/visualization/aabb2.h>
+#include <stim/visualization/colormap.h>
+#include <math.h>
+
+namespace stim{
+	namespace cuda{
+
+		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			extern __shared__ T S[];
+			T* shared_atan = S;
+			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
+			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			
+			if(xi >= x || yi >= y) return;			
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// calculate the voting direction based on the grtadient direction
+			float theta = gpuGrad[2*i];
+			//calculate the amount of vote for the voter
+			float mag = gpuGrad[2*i + 1];
+			
+
+			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
+			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));
+			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
+			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
+			
+			// compute the size of window which will be checked for finding the proper voters for this pixel
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			
+			int lut_i;
+			T dx_sq, dy_sq;
+
+			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
+			bb.trim_high(x-1, y-1);
+
+			int by, bx;
+			int dx, dy;					
+			
+			unsigned int ind_g;											//initialize the maximum vote value to zero
+			T alpha;
+			
+			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
+				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
+				dy_sq = dy * dy;
+				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
+					dx = bx - xi;
+					dx_sq = dx * dx;
+					lut_i = (rmax - dy) * x_table + rmax - dx;
+					alpha = shared_atan[lut_i];
+					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
+						ind_g = (by)*x + (bx);
+						atomicAdd(&gpuVote[ind_g], mag);
+					
+					}
+				}
+			}			
+			
+		}
+	
+
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+							
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
+			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
+			size_t shared_mem_req = table_bytes;// + template_bytes;
+			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;		
+			size_t shared_mem = stim::sharedMemPerBlock();
+			if(shared_mem_req > shared_mem){
+				std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
+				exit(1);
+			}
+
+			//call the kernel to do the voting
+			cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);		
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the Vote Data back to the CPU
+			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/vote_atomic_global.cuh b/stim/cuda/ivote/vote_atomic_global.cuh
deleted file mode 100644
index 33e1da8..0000000
--- a/stim/cuda/ivote/vote_atomic_global.cuh
+++ /dev/null
@@ -1,142 +0,0 @@
-#ifndef STIM_CUDA_VOTE_ATOMIC_GLOBAL_H
-#define STIM_CUDA_VOTE_ATOMIC_GLOBAL_H
-
-# include <iostream>
-# include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include <stim/visualization/aabb2.h>
-#include <stim/visualization/colormap.h>
-#include <math.h>
-
-namespace stim{
-	namespace cuda{
-
-		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
-		template<typename T>
-		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
-
-			extern __shared__ T S[];
-			T* shared_atan = S;
-			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
-			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
-			
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-			
-			if(xi >= x || yi >= y) return;			
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-
-			// calculate the voting direction based on the grtadient direction
-			float theta = gpuGrad[2*i];
-			//calculate the amount of vote for the voter
-			float mag = gpuGrad[2*i + 1];
-			
-
-			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
-			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));
-			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
-			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
-			
-			// compute the size of window which will be checked for finding the proper voters for this pixel
-			int x_table = 2*rmax +1;
-			int rmax_sq = rmax * rmax;
-			
-			int lut_i;
-			T dx_sq, dy_sq;
-
-			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
-			bb.trim_high(x-1, y-1);
-
-			int by, bx;
-			int dx, dy;					
-			
-			unsigned int ind_g;											//initialize the maximum vote value to zero
-			T alpha;
-			
-			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
-				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
-				dy_sq = dy * dy;
-				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
-					dx = bx - xi;
-					dx_sq = dx * dx;
-					lut_i = (rmax - dy) * x_table + rmax - dx;
-					alpha = shared_atan[lut_i];
-					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
-						ind_g = (by)*x + (bx);
-						atomicAdd(&gpuVote[ind_g], mag);
-					
-					}
-				}
-			}			
-			
-		}
-	
-
-		template<typename T>
-		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-							
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
-			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
-			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
-			size_t shared_mem_req = table_bytes;// + template_bytes;
-			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;		
-			size_t shared_mem = stim::sharedMemPerBlock();
-			if(shared_mem_req > shared_mem){
-				std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
-				exit(1);
-			}
-
-			//call the kernel to do the voting
-			cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-
-		}
-
-
-		template<typename T>
-		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
-
-			//calculate the number of bytes in the atan2 table
-			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
-
-			//allocate space on the GPU for the Vote Image
-			T* gpuVote;
-			cudaMalloc(&gpuVote, bytes);		
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpuGrad;
-			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
-
-			//copy the Gradient Magnitude data to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
-
-			//allocate space on the GPU for the atan2 table
-			T* gpuTable;
-			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
-
-			//copy the atan2 values to the GPU
-			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-						
-			//call the GPU version of the vote calculation function
-			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-							
-			//copy the Vote Data back to the CPU
-			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpuTable);
-			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
-		}
-		
-	}
-}
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote_atomic.cuh b/stim/cuda/ivote_atomic.cuh
deleted file mode 100644
index f265c60..0000000
--- a/stim/cuda/ivote_atomic.cuh
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef STIM_CUDA_IVOTE_ATOMIC_H
-#define STIM_CUDA_IVOTE_ATOMIC_H
-
-#include <stim/cuda/ivote/down_sample.cuh>
-#include <stim/cuda/ivote/local_max.cuh>
-#include <stim/cuda/ivote/update_dir_global.cuh>
-//#include <stim/cuda/ivote/vote_shared_32-32.cuh>
-#include <stim/cuda/ivote/vote_atomic_global.cuh>
-//#include <stim/cuda/ivote/re_sample.cuh>
-namespace stim{
-	namespace cuda{
-	
-	}
-}
-
-
-
-#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote_atomic_bb.cuh b/stim/cuda/ivote_atomic_bb.cuh
new file mode 100644
index 0000000..14e002d
--- /dev/null
+++ b/stim/cuda/ivote_atomic_bb.cuh
@@ -0,0 +1,17 @@
+#ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H
+#define STIM_CUDA_IVOTE_ATOMIC_BB_H
+
+#include <stim/cuda/ivote/down_sample.cuh>
+#include <stim/cuda/ivote/local_max.cuh>
+#include <stim/cuda/ivote/update_dir_bb.cuh>
+#include <stim/cuda/ivote/vote_atomic_bb.cuh>
+
+namespace stim{
+	namespace cuda{
+	
+	}
+}
+
+
+
+#endif
\ No newline at end of file
--
libgit2 0.21.4