diff --git a/stim/cuda/ivote/down_sample.cuh b/stim/cuda/ivote/down_sample.cuh
index 598d095..3e26b50 100644
--- a/stim/cuda/ivote/down_sample.cuh
+++ b/stim/cuda/ivote/down_sample.cuh
@@ -25,8 +25,8 @@ namespace stim{
 			
 			if(xi< x_ds && yi< y_ds){
 
-				int x_org = xi * sigma_ds;
-				int y_org = yi * sigma_ds;
+				int x_org = xi * sigma_ds ;
+				int y_org = yi * sigma_ds ;
 				int i_org = y_org * x + x_org;
 				gpuI[i] = gpuI0[i_org];
 			}
diff --git a/stim/cuda/ivote/local_max.cuh b/stim/cuda/ivote/local_max.cuh
index 9519735..057f544 100644
--- a/stim/cuda/ivote/local_max.cuh
+++ b/stim/cuda/ivote/local_max.cuh
@@ -12,20 +12,48 @@ namespace stim{
 
 		// this kernel calculates the local maximum for finding the cell centers
 		template<typename T>
-		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
+		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){
 
 			// calculate the 2D coordinates for this current thread.
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
 			int yi = blockIdx.y;
+			
+			if(xi >= x || yi >= y)
+				return;
+			
+			
 			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
-
-
-
+			int i = yi * x + xi;			
+			
+			// START DAVID
+			
+			gpuCenters[i] = 0;		//initialize the value at this location to zero
+			
+			T val = gpuVote[i];
+			
+			//compare to the threshold
+			if(val < final_t) return;
+			
+			for(int xl = xi - conn; xl < xi + conn; xl++){
+				for(int yl = yi - conn; yl < yi + conn; yl++){
+					if(xl >= 0 && xl < x && yl >= 0 && yl < y){
+						int il = yl * x + xl;
+						if(gpuVote[il] > val){							
+							return;
+						}							
+					}
+				}
+			}
+			
+			gpuCenters[i] = 1;
+			
+			// END DAVID
+			/*
 			//calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
 			int xl = xi - conn;
 			int yl = yi - conn;
 			
+			
 			// use zero for the lowest limits if the xi or yi is less than conn.
 			if (xi <= conn)
 				xl = 0;
@@ -82,6 +110,7 @@ namespace stim{
 					// set the center value for this pixel to high if it's a local max ,and to low if not.
 					gpuCenters[i] = l_value ;
 				}
+				*/
 
 		}
 
diff --git a/stim/cuda/ivote/update_dir.cuh b/stim/cuda/ivote/update_dir.cuh
index 5ec9f0a..a4e5d62 100644
--- a/stim/cuda/ivote/update_dir.cuh
+++ b/stim/cuda/ivote/update_dir.cuh
@@ -12,7 +12,7 @@ namespace stim{
 	
 		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
 		template<typename T>
-		__global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+		__global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
 
 			//generate a pointer to shared memory (size will be specified as a kernel parameter)
 			extern __shared__ float s_vote[];
@@ -20,12 +20,9 @@ namespace stim{
 			//calculate the start point for this block
 			int bxi = blockIdx.x * blockDim.x;
 			
-			//calculate the width of the shared memory block
-			int swidth = 2 * rmax + blockDim.x;
-
 			// calculate the 2D coordinates for this current thread.
 			int xi = bxi + threadIdx.x;
-			int yi = blockIdx.y;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
 
 			// convert 2D coordinates to 1D
 			int i = yi * x + xi;
@@ -43,14 +40,17 @@ namespace stim{
 			int id_x = 0;
 			int id_y = 0;
 
+			//calculate the width of the shared memory block
+			int swidth = 2 * rmax + blockDim.x;
+
 			// compute the size of window which will be checked for finding the voting area for this voter
-			unsigned int x_table = 2*rmax +1;
-			unsigned int rmax_sq = rmax * rmax;
-			int r = (int)rmax;
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
 			int tx_rmax = threadIdx.x + rmax;
 			int bxs = bxi - rmax;
 			
-			for(int yr = -r; yr <= r; yr++){
+			
+			for(int yr = -rmax; yr <= rmax; yr++){
 
 				//copy the portion of the image necessary for this block to shared memory
 				__syncthreads();
@@ -60,7 +60,7 @@ namespace stim{
 				//if the current thread is outside of the image, it doesn't have to be computed
 				if(xi < x && yi < y){
 
-					for(int xr = -r; xr <= r; xr++){
+					for(int xr = -rmax; xr <= rmax; xr++){
 
 						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
 
@@ -93,19 +93,21 @@ namespace stim{
 
 		float new_angle = gpuTable[ind_m];
 
-		gpuDir[i] = new_angle;
+		if(xi < x && yi < y)
+			gpuDir[i] = new_angle;
 
 		}
 
 		// this kernel updates the gradient direction by the calculated voting direction.
 		template<typename T>
-		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
 
 			//************ when the number of threads are (1024,1) *************
 			
 			// calculate the 2D coordinates for this current thread.
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+		
 			// convert 2D coordinates to 1D
 			int i = yi * x + xi;
 
@@ -126,7 +128,8 @@ namespace stim{
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(max_threads, 1);
 			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
-			
+			//dim3 threads(1, 1);
+			//dim3 blocks(x, y);
 			// Allocate CUDA array in device memory
 			
 			//define a channel descriptor for a single 32-bit channel
diff --git a/stim/cuda/ivote/vote.cuh b/stim/cuda/ivote/vote.cuh
index 7e98894..54daf54 100644
--- a/stim/cuda/ivote/vote.cuh
+++ b/stim/cuda/ivote/vote.cuh
@@ -13,7 +13,7 @@ namespace stim{
 
 		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
 		template<typename T>
-		__global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+		__global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, int rmax, int x, int y){
 
 			//generate a pointer to shared memory (size will be specified as a kernel parameter)
 			extern __shared__ float2 s_grad[];
@@ -21,42 +21,41 @@ namespace stim{
 			//calculate the start point for this block
 			int bxi = blockIdx.x * blockDim.x;
 			
-			//calculate the width of the shared memory block
-			int swidth = 2 * rmax + blockDim.x;
-
 			// calculate the 2D coordinates for this current thread.
 			int xi = bxi + threadIdx.x;
-			int yi = blockIdx.y;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
 			// convert 2D coordinates to 1D
 			int i = yi * x + xi;
-
+			
+			
 			
 			// define a local variable to sum the votes from the voters
 			float sum = 0;
+			
 
+			//calculate the width of the shared memory block
+			int swidth = 2 * rmax + blockDim.x;
+			
 			// compute the size of window which will be checked for finding the proper voters for this pixel
-			unsigned int x_table = 2*rmax +1;
-
-			unsigned int rmax_sq = rmax * rmax;
-			int r = (int)rmax;
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
 			int tx_rmax = threadIdx.x + rmax;
 			int bxs = bxi - rmax;
 			
-			
-				for(int yr = -r; yr <= r; yr++){
+			//for every line (along y)
+			for(int yr = -rmax; yr <= rmax; yr++){
 
 				//copy the portion of the image necessary for this block to shared memory
 				__syncthreads();
 				stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
 				__syncthreads();
 				
-				//if the current thread is outside of the image, it doesn't have to be computed
 				if(xi < x && yi < y){
-					
-					for(int xr = -r; xr <= r; xr++){
+
+					for(int xr = -rmax; xr <= rmax; xr++){
 					
 							//find the location of this voter in the atan2 table
-							unsigned int id_t = (yr + rmax) * x_table + xr + rmax;
+							int id_t = (yr + rmax) * x_table + xr + rmax;
 
 							// calculate the angle between the pixel and the current voter in x and y directions
 							float atan_angle = gpuTable[id_t];
@@ -74,10 +73,12 @@ namespace stim{
 							}
 				
 					}
+				
 				}
 			}
-
-			gpuVote[i] = sum;
+			if(xi < x && yi < y)
+				gpuVote[i] = sum;
+			
 		}
 
 		template<typename T>
@@ -92,7 +93,9 @@ namespace stim{
 			//unsigned int thread_dim = sqrt(max_threads);
 			dim3 threads(max_threads, 1);
 			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
-			
+			//dim3 threads(1,1);
+			//dim3 blocks(x, y);
+
 			// Allocate CUDA array in device memory
 			
 			//define a channel descriptor for a single 32-bit channel
diff --git a/stim/cuda/templates/gradient.cuh b/stim/cuda/templates/gradient.cuh
index c55ec91..25e225e 100644
--- a/stim/cuda/templates/gradient.cuh
+++ b/stim/cuda/templates/gradient.cuh
@@ -9,7 +9,7 @@ namespace stim{
 	namespace cuda{
 
 		template<typename T>
-		__global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){
+		__global__ void gradient_2d(T* out, T* in, int x, int y){
 
 			//calculate the 1D image index for this thread
 			int i = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/stim/image/image.h b/stim/image/image.h
index 8bbb507..439cbf6 100644
--- a/stim/image/image.h
+++ b/stim/image/image.h
@@ -48,7 +48,7 @@ public:
 	//create an image from an interleaved buffer
 	void set_interleaved(T* buffer, unsigned int width, unsigned int height, unsigned int channels = 1){
 
-		unsigned char* non_interleaved = (unsigned char*)malloc(width * height * 3);
+		T* non_interleaved = (T*)malloc(width * height * 3 * sizeof(T));
 		unsigned int S = width * height;
 
 		for(unsigned int i = 0; i < S; i++){
@@ -57,7 +57,7 @@ public:
 			}
 		}
 
-		img = cimg_library::CImg<unsigned char>(non_interleaved, width, height, 1, channels);
+		img = cimg_library::CImg<T>(non_interleaved, width, height, 1, channels);
 	}
 
 	//fills an allocated region of memory with non-interleaved data
--
libgit2 0.21.4