diff --git a/stim/cuda/arraymath/array_cart2polar.cuh b/stim/cuda/arraymath/array_cart2polar.cuh
index 8889b03..37e44db 100644
--- a/stim/cuda/arraymath/array_cart2polar.cuh
+++ b/stim/cuda/arraymath/array_cart2polar.cuh
@@ -4,42 +4,47 @@
 namespace stim{
 	namespace cuda{
 		template<typename T>
-		__global__ void cuda_cart2polar(T* a, unsigned int N){
+		__global__ void cuda_cart2polar(T* a, int x, int y){
 
 			
-			//calculate the 1D index for this thread
-			int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-			if(i < N){
-				float x = a[i * 2 + 0];
-				float y = a[i * 2 + 1];
-				float theta = atan2( y, x ) ;
-				float r = sqrt(x * x + y * y);
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+			
+
+			if(xi >= x|| yi >= y) return;
+
+				float xl = a[i * 2 + 0];
+				float yl = a[i * 2 + 1];
+				float theta = atan2( yl, xl ) ;
+				float r = sqrt(xl * xl + yl * yl);
 				a[i * 2 + 0] = theta;
 				a[i * 2 + 1] = r;
-			}
+			
 		}
 
 
 		template<typename T>
-		void gpu_cart2polar(T* gpuGrad, unsigned int N){
-
-			//get the maximum number of threads per block for the CUDA device
-			int threads = stim::maxThreadsPerBlock();
+		void gpu_cart2polar(T* gpuGrad, unsigned int x, unsigned int y){
 
-			//calculate the number of blocks
-			int blocks = N / threads + (N % threads == 0 ? 0:1);
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			
 
 			//call the kernel to do the multiplication
-			cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N);
+			cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y);
 
 		}
 
 
 		template<typename T>
-		void cpu_cart2polar(T* a, unsigned int N){
+		void cpu_cart2polar(T* a, unsigned int x, unsigned int y){
 
 			//calculate the number of bytes in the array
+			unsigned int N = x *y;
 			unsigned int bytes = N * sizeof(T) * 2;
 
 			//allocate memory on the GPU for the array
@@ -50,7 +55,7 @@ namespace stim{
 			HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) );
 
 			//call the GPU version of this function
-			gpu_cart2polar<T>(gpuA, N);
+			gpu_cart2polar<T>(gpuA, x, y);
 
 			//copy the array back to the CPU
 			HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) );
diff --git a/stim/cuda/ivote/local_max.cuh b/stim/cuda/ivote/local_max.cuh
index 057f544..a2bd2fe 100644
--- a/stim/cuda/ivote/local_max.cuh
+++ b/stim/cuda/ivote/local_max.cuh
@@ -1,7 +1,6 @@
 #ifndef STIM_CUDA_LOCAL_MAX_H
 #define STIM_CUDA_LOCAL_MAX_H
 
-
 # include <iostream>
 # include <cuda.h>
 #include <stim/cuda/cudatools.h>
@@ -9,7 +8,6 @@
 namespace stim{
 	namespace cuda{
 
-
 		// this kernel calculates the local maximum for finding the cell centers
 		template<typename T>
 		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){
@@ -20,13 +18,10 @@ namespace stim{
 			
 			if(xi >= x || yi >= y)
 				return;
-			
-			
+						
 			// convert 2D coordinates to 1D
 			int i = yi * x + xi;			
 			
-			// START DAVID
-			
 			gpuCenters[i] = 0;		//initialize the value at this location to zero
 			
 			T val = gpuVote[i];
@@ -46,100 +41,22 @@ namespace stim{
 			}
 			
 			gpuCenters[i] = 1;
-			
-			// END DAVID
-			/*
-			//calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
-			int xl = xi - conn;
-			int yl = yi - conn;
-			
-			
-			// use zero for the lowest limits if the xi or yi is less than conn.
-			if (xi <= conn)
-				xl = 0;
-			if (yi <= conn)
-				yl = 0;
-
-			//calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
-			int xh = xi + conn;
-			int yh = yi + conn;
-
-			// use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn.
-			if (xi >= x - conn)
-				xh = x;
-			if (yi>= y - conn)
-				yh = y;
-
-			// calculate the limits for finding the local maximum location in the connected neighbors for the current pixel
-			int n_l = yl * x + xl;
-			int n_h = yh * x + xh;
-			
-			//initial the centers image to zero
-			gpuCenters[i] = 0;
-
-
-			int n = n_l;
-
-			float l_value = 0;
-
-			if (i < x * y)
-
-				// check if the vote value for this pixel is greater than threshold, so this pixel may be a local max.
-				if (gpuVote[i]>final_t){
-
-					// compare the vote value for this pixel with the vote values with its neighbors.
-					while (n<=n_h){
-
-						// check if this vote value is a local max in its neighborhood.
-							if (gpuVote[i] < gpuVote[n]){
-								l_value  = 0;
-								n =n_h+1;
-							}
-							else if (n == n_h){
-								l_value = 1;
-								n = n+1;
-							}
-							// check if the current neighbor is the last one at the current row
-							else if ((n - n_l - 2*conn)% x ==0){
-								n = n + x - 2*conn -1;
-								n ++;						
-							}
-							else
-								n ++;
-				}
-					// set the center value for this pixel to high if it's a local max ,and to low if not.
-					gpuCenters[i] = l_value ;
-				}
-				*/
-
 		}
-
-
-
+		
 		template<typename T>
 		void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
 
-			
-			
-			
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(max_threads, 1);
 			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
 			
-			
-			
 			//call the kernel to find the local maximum.
 			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
-
-
 		}
 
-
-
 		template<typename T>
 		void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
 		
-
 			//calculate the number of bytes in the array
 			unsigned int bytes = x * y * sizeof(T);
 
@@ -147,28 +64,22 @@ namespace stim{
 			T* gpuCenters;
 			cudaMalloc(&gpuCenters, bytes);		
 
-
 			//allocate space on the GPU for the input Vote Image
 			T* gpuVote;
 			cudaMalloc(&gpuVote, bytes);		
 
-			
 			//copy the Vote image data to the GPU
 			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
-
-			
+						
 			//call the GPU version of the local max function
 			gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y);
-
-				
+							
 			//copy the cell centers data to the CPU
 			cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ;
-
-			
+						
 			//free allocated memory
 			cudaFree(gpuCenters);
 			cudaFree(gpuVote);
-			cudaFree(gpuGrad);
 		}
 		
 	}
diff --git a/stim/cuda/ivote/update_dir.cuh b/stim/cuda/ivote/update_dir.cuh
index a4e5d62..5b3d37e 100644
--- a/stim/cuda/ivote/update_dir.cuh
+++ b/stim/cuda/ivote/update_dir.cuh
@@ -1,7 +1,6 @@
 #ifndef STIM_CUDA_UPDATE_DIR_H
 #define STIM_CUDA_UPDATE_DIR_H
 
-
 # include <iostream>
 # include <cuda.h>
 #include <stim/cuda/cudatools.h>
@@ -48,8 +47,7 @@ namespace stim{
 			int rmax_sq = rmax * rmax;
 			int tx_rmax = threadIdx.x + rmax;
 			int bxs = bxi - rmax;
-			
-			
+						
 			for(int yr = -rmax; yr <= rmax; yr++){
 
 				//copy the portion of the image necessary for this block to shared memory
@@ -66,8 +64,7 @@ namespace stim{
 
 						// calculate the angle between the voter and the current pixel in x and y directions
 						float atan_angle = gpuTable[ind_t];
-
-
+						
 						// calculate the voting direction based on the grtadient direction
 						int idx_share_update = xr + tx_rmax ;
 						float share_vote = s_vote[idx_share_update];
@@ -86,11 +83,8 @@ namespace stim{
 					}
 				}
 			}
-			
-				
-		//float new_angle = atan2(dy, dx);
+							
 		unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
-
 		float new_angle = gpuTable[ind_m];
 
 		if(xi < x && yi < y)
@@ -102,35 +96,27 @@ namespace stim{
 		template<typename T>
 		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
 
-			//************ when the number of threads are (1024,1) *************
-			
 			// calculate the 2D coordinates for this current thread.
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
 			int yi = blockIdx.y * blockDim.y + threadIdx.y;
 		
 			// convert 2D coordinates to 1D
 			int i = yi * x + xi;
-
-
+			
 			//update the gradient image with the vote direction
 			gpuGrad[2*i] = gpuDir[i];
 		}
-
-
+		
 		template<typename T>
 		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
 
 			//get the number of pixels in the image
 			unsigned int pixels = x * y;
 			unsigned int bytes = sizeof(T) * pixels;
-
-			
+						
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(max_threads, 1);
 			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
-			//dim3 threads(1, 1);
-			//dim3 blocks(x, y);
-			// Allocate CUDA array in device memory
 			
 			//define a channel descriptor for a single 32-bit channel
 			cudaChannelFormatDesc channelDesc =
@@ -174,14 +160,12 @@ namespace stim{
 
 			//call the kernel to update the gradient direction
 			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
-
-
+			
 			//free allocated memory
 			cudaFree(gpuDir);
 
 		}
-
-
+		
 		template<typename T>
 		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
 
@@ -211,12 +195,10 @@ namespace stim{
 
 			//copy the atan2 values to the GPU
 			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-
-			
+						
 			//call the GPU version of the update direction function
 			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-
-				
+							
 			//copy the new gradient image back to the CPU
 			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
 
@@ -229,6 +211,4 @@ namespace stim{
 	}
 }
 
-
-
 #endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/vote.cuh b/stim/cuda/ivote/vote.cuh
index 54daf54..fdc3677 100644
--- a/stim/cuda/ivote/vote.cuh
+++ b/stim/cuda/ivote/vote.cuh
@@ -1,13 +1,11 @@
 #ifndef STIM_CUDA_VOTE_H
 #define STIM_CUDA_VOTE_H
 
-
 # include <iostream>
 # include <cuda.h>
 #include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
 
-
 namespace stim{
 	namespace cuda{
 
@@ -26,13 +24,10 @@ namespace stim{
 			int yi = blockIdx.y * blockDim.y + threadIdx.y;
 			// convert 2D coordinates to 1D
 			int i = yi * x + xi;
-			
-			
-			
+						
 			// define a local variable to sum the votes from the voters
 			float sum = 0;
 			
-
 			//calculate the width of the shared memory block
 			int swidth = 2 * rmax + blockDim.x;
 			
@@ -59,8 +54,7 @@ namespace stim{
 
 							// calculate the angle between the pixel and the current voter in x and y directions
 							float atan_angle = gpuTable[id_t];
-							
-					
+												
 							// calculate the voting direction based on the grtadient direction
 							int idx_share = xr + tx_rmax ;
 							float2 g = s_grad[idx_share];
@@ -71,7 +65,6 @@ namespace stim{
 								sum += g.y;		
 
 							}
-				
 					}
 				
 				}
@@ -87,15 +80,11 @@ namespace stim{
 			//get the number of pixels in the image
 			unsigned int pixels = x * y;
 			unsigned int bytes = sizeof(T) * pixels;
-
-			
+						
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-			//unsigned int thread_dim = sqrt(max_threads);
 			dim3 threads(max_threads, 1);
 			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
-			//dim3 threads(1,1);
-			//dim3 blocks(x, y);
-
+			
 			// Allocate CUDA array in device memory
 			
 			//define a channel descriptor for a single 32-bit channel
@@ -131,7 +120,6 @@ namespace stim{
 			// specify  share memory
 			unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
 			
-
 			//call the kernel to do the voting
 			
 			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y);
@@ -165,14 +153,10 @@ namespace stim{
 
 			//copy the atan2 values to the GPU
 			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
-
-			//cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table );
-
-			
+						
 			//call the GPU version of the vote calculation function
 			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-
-				
+							
 			//copy the Vote Data back to the CPU
 			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
 
@@ -185,6 +169,4 @@ namespace stim{
 	}
 }
 
-
-
 #endif
\ No newline at end of file
diff --git a/stim/cuda/templates/conv2sep.cuh b/stim/cuda/templates/conv2sep.cuh
index d63d4e0..b7156c1 100644
--- a/stim/cuda/templates/conv2sep.cuh
+++ b/stim/cuda/templates/conv2sep.cuh
@@ -49,7 +49,7 @@ namespace stim{
 			__syncthreads();
 			
 			//if the current pixel is outside of the image
-			if(xi > x || yi > y)
+			if(xi >= x || yi >= y)
 				return;
 			
 
@@ -61,7 +61,7 @@ namespace stim{
 			
 			//for each element of the kernel
 			for(int ki = -kr; ki <= kr; ki++){
-				sum += g[ki + kr] * s[si + ki];
+				sum += s[si + ki] * g[ki + kr];
 			}
 			
 			//calculate the 1D image index for this thread
diff --git a/stim/cuda/templates/gradient.cuh b/stim/cuda/templates/gradient.cuh
index 25e225e..d0718b2 100644
--- a/stim/cuda/templates/gradient.cuh
+++ b/stim/cuda/templates/gradient.cuh
@@ -11,12 +11,12 @@ namespace stim{
 		template<typename T>
 		__global__ void gradient_2d(T* out, T* in, int x, int y){
 
-			//calculate the 1D image index for this thread
-			int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-			//convert this to 2D pixel coordinates
-			int yi = i / x;
-			int xi = i - (yi * x);
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
 
 			//return if the pixel is outside of the image
 			if(xi >= x || yi >= y) return;
@@ -25,16 +25,16 @@ namespace stim{
 			int i_xp = yi * x + (xi + 1);
 			int i_yp = (yi + 1) * x + xi;
 
+			//calculate indices for the backward difference
+			int i_xn = yi * x + (xi - 1);
+			int i_yn = (yi - 1) * x + xi;
+
 			//use forward differences if a coordinate is zero
 			if(xi == 0)
 				out[i * 2 + 0] = in[i_xp] - in[i];
 			if(yi == 0)
 				out[i * 2 + 1] = in[i_yp] - in[i];
 
-			//calculate indices for the backward difference
-			int i_xn = yi * x + (xi - 1);
-			int i_yn = (yi - 1) * x + xi;
-
 			//use backward differences if the coordinate is at the maximum edge
 			if(xi == x-1)
 				out[i * 2 + 0] = in[i] - in[i_xn];
@@ -51,28 +51,16 @@ namespace stim{
 		}
 
 		template<typename T>
-		//void gpu_gradient_2d(T* gpuOut, T* gpuIn, unsigned int x, unsigned int y){
 		void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){
 
 			//get the number of pixels in the image
 			unsigned int pixels = x * y;
 			
-			//allocate space on the GPU for the input image
-			//T* gpuI;
-			//HANDLE_ERROR(cudaMalloc(&gpuI, bytes));
-
-			//cudaMemcpy(gpuI, gpuI0, bytes, cudaMemcpyDeviceToDevice);
-
-
-			//allocate space on the GPU for the output gradient image
-			//T* gpuGrad;
-			//cudaMalloc(&gpuGrad, bytes * 2);		//the output image will have two channels (x, y)
-
 			//get the maximum number of threads per block for the CUDA device
-			int threads = stim::maxThreadsPerBlock();
-
-			//calculate the number of blocks
-			int blocks = pixels / threads + (pixels%threads == 0 ? 0:1);
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + 1 , y);
+			
 
 			//call the GPU kernel to determine the gradient
 			gradient_2d<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y);
@@ -105,6 +93,7 @@ namespace stim{
 
 			//free allocated memory
 			cudaFree(gpuOut);
+			cudaFree(gpuIn);
 		}
 
 	}
--
libgit2 0.21.4