diff --git a/stim/cuda/arraymath/array_cart2polar.cuh b/stim/cuda/arraymath/array_cart2polar.cuh
index 7f4d0e2..46a78bb 100644
--- a/stim/cuda/arraymath/array_cart2polar.cuh
+++ b/stim/cuda/arraymath/array_cart2polar.cuh
@@ -4,14 +4,14 @@
 namespace stim{
 	namespace cuda{
 		template<typename T>
-		__global__ void cuda_cart2polar(T* a, int x, int y, float rotation){
+		__global__ void cuda_cart2polar(T* a, size_t x, size_t y, float rotation){
 
 			
 			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
+			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
 			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
+			size_t i = yi * x + xi;
 			
 
 			if(xi >= x|| yi >= y) return;
@@ -27,11 +27,11 @@ namespace stim{
 
 
 		template<typename T>
-		void gpu_cart2polar(T* gpuGrad, unsigned int x, unsigned int y, float rotation = 0){
+		void gpu_cart2polar(T* gpuGrad, size_t x, size_t y, float rotation = 0){
 
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			dim3 blocks(((unsigned int)x/threads.x) + ((unsigned int)x %threads.x == 0 ? 0:1) , (unsigned int)y);
 			
 			//call the kernel to do the multiplication
 			cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y, rotation);
@@ -40,11 +40,11 @@ namespace stim{
 
 
 		template<typename T>
-		void cpu_cart2polar(T* a, unsigned int x, unsigned int y){
+		void cpu_cart2polar(T* a, size_t x, size_t y){
 
 			//calculate the number of bytes in the array
-			unsigned int N = x *y;
-			unsigned int bytes = N * sizeof(T) * 2;
+			size_t N = x *y;
+			size_t bytes = N * sizeof(T) * 2;
 
 			//allocate memory on the GPU for the array
 			T* gpuA;
diff --git a/stim/cuda/ivote/local_max.cuh b/stim/cuda/ivote/local_max.cuh
index b65f8a0..38b7096 100644
--- a/stim/cuda/ivote/local_max.cuh
+++ b/stim/cuda/ivote/local_max.cuh
@@ -10,17 +10,17 @@ namespace stim{
 
 		// this kernel calculates the local maximum for finding the cell centers
 		template<typename T>
-		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x, int y){
+		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, size_t x, size_t y){
 
 			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
+			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
 			
 			if(xi >= x || yi >= y)
 				return;
 						
 			// convert 2D coordinates to 1D
-			int i = yi * x + xi;			
+			size_t i = yi * x + xi;
 			
 			gpuCenters[i] = 0;		//initialize the value at this location to zero
 			
@@ -61,13 +61,13 @@ namespace stim{
 		}
 		
 		template<typename T>
-		void gpu_local_max(T* gpuCenters, T* gpuVote,  unsigned int conn, unsigned int x, unsigned int y){
+		void gpu_local_max(T* gpuCenters, T* gpuVote,  unsigned int conn, size_t x, size_t y){
 
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			/*dim3 threads(max_threads, 1);
 			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/
-			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
-			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+			dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
+			dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
 			
 			//call the kernel to find the local maximum.
 			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, conn, x, y);
diff --git a/stim/cuda/ivote/update_dir_bb.cuh b/stim/cuda/ivote/update_dir_bb.cuh
index f3c324d..43869ec 100644
--- a/stim/cuda/ivote/update_dir_bb.cuh
+++ b/stim/cuda/ivote/update_dir_bb.cuh
@@ -81,26 +81,26 @@ namespace stim{
 
 		// this kernel updates the gradient direction by the calculated voting direction.
 		template<typename T>
-		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, size_t x, size_t y){
 
 			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
+			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
 
 			if(xi >= x || yi >= y) return;
 		
 			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
+			size_t i = yi * x + xi;
 			
 			//update the gradient image with the vote direction
 			gpuGrad[2*i] = gpuDir[i];
 		}
 		
 		template<typename T>
-		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y){
 
 			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
+			size_t bytes = x * y * sizeof(T);
 			
 			// allocate space on the GPU for the updated vote direction
 			T* gpuDir;
@@ -108,14 +108,14 @@ namespace stim{
 
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			
-			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
-			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+			dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
+			dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
 
 			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
 			//size_t curtain = 2 * rmax;
 			//size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain);
 			size_t shared_mem_req = table_bytes;// + template_bytes;
-			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
+			if (DEBUG) std::cout << "Shared Memory required: " << shared_mem_req << std::endl;
 
 			size_t shared_mem = stim::sharedMemPerBlock();
 			if(shared_mem_req > shared_mem){
@@ -124,16 +124,10 @@ namespace stim{
 			}
 
 			//call the kernel to calculate the new voting direction
-			cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-			//stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
-
-			//exit(0);
-
-			//threads = dim3( sqrt(max_threads), sqrt(max_threads) );
-			//blocks = dim3(x/threads.x + 1, y/threads.y + 1);
+			cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, (int)x , (int)y);
 
 			//call the kernel to update the gradient direction
-			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, (int)x , (int)y);
 			//free allocated memory
 			HANDLE_ERROR( cudaFree(gpuDir) );
 
diff --git a/stim/cuda/ivote/vote_atomic_bb.cuh b/stim/cuda/ivote/vote_atomic_bb.cuh
index 32de21f..5a05001 100644
--- a/stim/cuda/ivote/vote_atomic_bb.cuh
+++ b/stim/cuda/ivote/vote_atomic_bb.cuh
@@ -9,12 +9,14 @@
 #include <stim/visualization/colormap.h>
 #include <math.h>
 
+
+
 namespace stim{
 	namespace cuda{
 
 		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
 		template<typename T>
-		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, size_t x, size_t y, bool gradmag = true){
 
 			extern __shared__ T S[];
 			T* shared_atan = S;
@@ -22,12 +24,12 @@ namespace stim{
 			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
 			
 			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
+			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
 			
 			if(xi >= x || yi >= y) return;			
 			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
+			size_t i = yi * x + xi;
 
 			// calculate the voting direction based on the grtadient direction
 			float theta = gpuGrad[2*i];
@@ -50,7 +52,7 @@ namespace stim{
 			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
 			bb.trim_high(x-1, y-1);
 
-			int by, bx;
+			size_t by, bx;
 			int dx, dy;					
 			
 			unsigned int ind_g;											//initialize the maximum vote value to zero
@@ -66,7 +68,8 @@ namespace stim{
 					alpha = shared_atan[lut_i];
 					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
 						ind_g = (by)*x + (bx);
-						atomicAdd(&gpuVote[ind_g], mag);
+						if(gradmag) atomicAdd(&gpuVote[ind_g], mag);			//add the gradient magnitude (if the gradmag flag is enabled)
+						else		atomicAdd(&gpuVote[ind_g], 1.0f);			//otherwise just add 1
 					
 					}
 				}
@@ -75,16 +78,22 @@ namespace stim{
 		}
 	
 
+		/// Iterative voting for an image
+		/// @param gpuVote is the resulting vote image
+		/// @param gpuGrad is the gradient of the input image
+		/// @param gpuTable is the pre-computed atan2() table
+		/// @param phi is the angle of the vote region
+		/// @param rmax is the estimated radius of the blob (defines the "width" of the vote region)
+		/// @param x and y are the spatial dimensions of the gradient image
+		/// @param gradmag defines whether or not the gradient magnitude is taken into account during the vote
 		template<typename T>
-		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-							
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y, bool gradmag = true){
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
-			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+			dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
+			dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
 			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
 			size_t shared_mem_req = table_bytes;// + template_bytes;
-			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;		
+			if (DEBUG) std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
 			size_t shared_mem = stim::sharedMemPerBlock();
 			if(shared_mem_req > shared_mem){
 				std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
@@ -92,7 +101,7 @@ namespace stim{
 			}
 
 			//call the kernel to do the voting
-			cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+			cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y, gradmag);
 
 		}
 
diff --git a/stim/cuda/ivote_atomic_bb.cuh b/stim/cuda/ivote_atomic_bb.cuh
index 14e002d..07f3224 100644
--- a/stim/cuda/ivote_atomic_bb.cuh
+++ b/stim/cuda/ivote_atomic_bb.cuh
@@ -1,6 +1,7 @@
 #ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H
 #define STIM_CUDA_IVOTE_ATOMIC_BB_H
 
+extern bool DEBUG;
 #include <stim/cuda/ivote/down_sample.cuh>
 #include <stim/cuda/ivote/local_max.cuh>
 #include <stim/cuda/ivote/update_dir_bb.cuh>
diff --git a/stim/cuda/templates/gradient.cuh b/stim/cuda/templates/gradient.cuh
index febd57b..c3816ec 100644
--- a/stim/cuda/templates/gradient.cuh
+++ b/stim/cuda/templates/gradient.cuh
@@ -9,25 +9,25 @@ namespace stim{
 	namespace cuda{
 
 		template<typename T>
-		__global__ void gradient_2d(T* out, T* in, int x, int y){
+		__global__ void gradient_2d(T* out, T* in, size_t x, size_t y){
 
 			
 			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
+			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
 			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
+			size_t i = yi * x + xi;
 
 			//return if the pixel is outside of the image
 			if(xi >= x || yi >= y) return;
 
 			//calculate indices for the forward difference
-			int i_xp = yi * x + (xi + 1);
-			int i_yp = (yi + 1) * x + xi;
+			size_t i_xp = yi * x + (xi + 1);
+			size_t i_yp = (yi + 1) * x + xi;
 
 			//calculate indices for the backward difference
-			int i_xn = yi * x + (xi - 1);
-			int i_yn = (yi - 1) * x + xi;
+			size_t i_xn = yi * x + (xi - 1);
+			size_t i_yn = (yi - 1) * x + xi;
 
 			//use forward differences if a coordinate is zero
 			if(xi == 0)
@@ -47,12 +47,12 @@ namespace stim{
 		}
 
 		template<typename T>
-		void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){
+		void gpu_gradient_2d(T* gpuGrad, T* gpuI, size_t x, size_t y){
 			
 			//get the maximum number of threads per block for the CUDA device
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + 1 , y);
+			dim3 blocks((unsigned int)(x/threads.x) + 1 , (unsigned int)y);
 			
 
 			//call the GPU kernel to determine the gradient
diff --git a/stim/image/image.h b/stim/image/image.h
index fbb5499..bd96c67 100644
--- a/stim/image/image.h
+++ b/stim/image/image.h
@@ -122,9 +122,9 @@ public:
 	}
 
 	stim::image<T>& operator=(const stim::image<T>& I){
-		init();
 		if(&I == this)									//handle self-assignment
 			return *this;
+		init();
 		allocate(I.X(), I.Y(), I.C());
 		memcpy(img, I.img, bytes());
 		return *this;
@@ -423,6 +423,33 @@ public:
 		return result;
 	}
 
+	/// Adds curcular padding for the specified number of pixels - in this case replicating the boundary pixels
+	image<T> pad_replicate(size_t p) {
+		image<T> result(width() + p * 2, height() + p * 2, channels());						//create an output image
+		result = 0;
+		//result = value;														//assign the border value to all pixels in the new image
+		for (size_t y = 0; y < height(); y++) {								//for each pixel in the original image
+			for (size_t x = 0; x < width(); x++) {
+				size_t n = (y + p) * (width() + p * 2) + x + p;				//calculate the index of the corresponding pixel in the result image
+				size_t n0 = idx(x, y);										//calculate the index for this pixel in the original image
+				result.data()[n] = img[n0];									// copy the original image to the result image afer the border area
+			}
+		}
+		size_t l = p;
+		size_t r = p + width() - 1;
+		size_t t = p;
+		size_t b = p + height() - 1;
+		for (size_t y = 0; y < p; y++) for (size_t x = l; x <= r; x++) result(x, y) = result(x, t);						//pad the top
+		for (size_t y = b + 1; y < result.height(); y++) for (size_t x = l; x <= r; x++) result(x, y) = result(x, b);	//pad the bottom
+		for (size_t y = t; y <= b; y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, y);						//pad the left
+		for (size_t y = t; y <= b; y++) for (size_t x = r+1; x < result.width(); x++) result(x, y) = result(r, y);		//pad the right
+		for (size_t y = 0; y < t; y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, t);						//pad the top left
+		for (size_t y = 0; y < t; y++) for (size_t x = r+1; x < result.width(); x++) result(x, y) = result(r, t);		//pad the top right
+		for (size_t y = b+1; y < result.height(); y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, b);		//pad the bottom left
+		for (size_t y = b+1; y < result.height(); y++) for (size_t x = r + 1; x < result.width(); x++) result(x, y) = result(r, b);		//pad the bottom right
+		return result;
+	}
+
 	/// Copy the given data to the specified channel
 
 	/// @param c is the channel number that the data will be copied to
@@ -546,7 +573,7 @@ public:
 	}
 
 	//crop regions given by an array of 1D index values
-	std::vector<image<T>> crop_idx(size_t w, size_t h, std::vector<size_t> idx) {
+	std::vector< image<T> > crop_idx(size_t w, size_t h, std::vector<size_t> idx) {
 		std::vector<image<T>> result(idx.size());										//create an array of image files to return
 		for (size_t i = 0; i < idx.size(); i++) {										//for each specified index point
 			size_t y = idx[i] / X();													//calculate the y coordinate from the 1D index (center of ROI)
diff --git a/stim/math/filters/gauss2.cuh b/stim/math/filters/gauss2.cuh
index 7e50ece..bd23d42 100644
--- a/stim/math/filters/gauss2.cuh
+++ b/stim/math/filters/gauss2.cuh
@@ -18,8 +18,8 @@ namespace stim {
 	///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel
 	template<typename T, typename K>
 	stim::image<T> cpu_gauss2(const stim::image<T>& in, K stdx, K stdy, size_t nstds = 3) {
-		size_t kx = stdx * nstds * 2 + 1;					//calculate the kernel sizes
-		size_t ky = stdy * nstds * 2 + 1;
+		size_t kx = (size_t)(stdx * nstds * 2) + 1;					//calculate the kernel sizes
+		size_t ky = (size_t)(stdy * nstds * 2) + 1;
 		size_t X = in.width() - kx + 1;					//calculate the size of the output image
 		size_t Y = in.height() - ky + 1;
 		stim::image<T> r(X, Y, in.channels());		//create an output image
diff --git a/stim/math/filters/sepconv2.cuh b/stim/math/filters/sepconv2.cuh
index 5946c6a..f733a76 100644
--- a/stim/math/filters/sepconv2.cuh
+++ b/stim/math/filters/sepconv2.cuh
@@ -20,12 +20,12 @@ namespace stim {
 		cudaDeviceProp p;
 		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
 		size_t tmax = p.maxThreadsPerBlock;
-		dim3 nt(sqrt(tmax), sqrt(tmax));						//calculate the block dimensions
+		dim3 nt((unsigned int)sqrt(tmax), (unsigned int)sqrt(tmax));						//calculate the block dimensions
 		size_t X = sx - kx + 1;									//calculate the x size of the output image
 		T* temp;												//declare a temporary variable to store the intermediate image
 		HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T)));	//allocate memory for the intermediate image
 
-		dim3 nb(X / nt.x + 1, sy / nt.y + 1);							//calculate the grid dimensions
+		dim3 nb((unsigned int)(X / nt.x) + 1, (unsigned int)(sy / nt.y) + 1);							//calculate the grid dimensions
 		size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T);					//shared memory bytes required to store block data
 		if (sm > p.sharedMemPerBlock) {
 			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
@@ -34,7 +34,7 @@ namespace stim {
 		kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1);	//launch the kernel to compute the intermediate image
 
 		size_t Y = sy - ky + 1;									//calculate the y size of the output image
-		nb.y = Y / nt.y + 1;									//update the grid dimensions to reflect the Y-axis size of the output image
+		nb.y = (unsigned int)(Y / nt.y) + 1;									//update the grid dimensions to reflect the Y-axis size of the output image
 		sm = nt.x * (nt.y + ky - 1) * sizeof(T);				//calculate the amount of shared memory needed for the second pass
 		if (sm > p.sharedMemPerBlock) {
 			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
diff --git a/stim/parser/arguments.h b/stim/parser/arguments.h
index 7a84ce3..06e258b 100644
--- a/stim/parser/arguments.h
+++ b/stim/parser/arguments.h
@@ -535,7 +535,13 @@ namespace stim{
 
         /// @param a is the index of the requested argument
         std::string arg(size_t a){
-        	return args[a];
+			if (a < nargs()) {
+				return args[a];
+			}
+			else {
+				std::cout << "stim::arguments ERROR: argument index exceeds number of available arguments" << std::endl;
+				exit(1);
+			}
         }
 	
 	/// Returns an std::vector of argument strings
--
libgit2 0.21.4