diff --git a/cpp/cudafunc.cu b/cpp/cudafunc.cu
index db74079..0d7d4a4 100644
--- a/cpp/cudafunc.cu
+++ b/cpp/cudafunc.cu
@@ -6,11 +6,11 @@
 #include "update_dir3_aabb.cuh"
 #include "local_max3.cuh"
 #include <fstream>
-
+#include <sstream>
 void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
 			int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){
 
-	
+	cudaSetDevice(0);
 	size_t bytes = x * y * z * sizeof(float);				// compute the number of bytes in the input data
 
 	float* gpuI0;											//assign memory on gpu for the input data
@@ -37,18 +37,13 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
 		cudaMemset(gpu_vote, 0, bytes);
 		gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z);
 		cudaDeviceSynchronize();
-		if (i == 0) {
-			cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
-			std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary);
-			fvote.write((char*)img, bytes);
-			fvote.close();
-		}
-		if (i == 1) {
-			cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
-			std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary);
-			fvote.write((char*)img, bytes);
-			fvote.close();
-		}
+		
+		cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); 
+		std::string filename = "0-vote";
+		std::ofstream fvote(filename + std::to_string(i+1) + "_aabb.vol", std::ofstream::out | std::ofstream::binary);
+		fvote.write((char*)img, bytes);
+		fvote.close();
+		
 			gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z);
 		cudaDeviceSynchronize();
 		phi = phi - d_phi;
@@ -64,7 +59,7 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
 }
 
 void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){
-	unsigned int bytes = x * y * z * sizeof(float);
+	size_t bytes = x * y * z * sizeof(float);
 
 	cudaSetDevice(0);
 
diff --git a/cpp/gaussian_blur3.cuh b/cpp/gaussian_blur3.cuh
index f668a50..38eb25c 100644
--- a/cpp/gaussian_blur3.cuh
+++ b/cpp/gaussian_blur3.cuh
@@ -167,8 +167,8 @@
 			size_t bytes = sizeof(T) * pixels;
 
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(sqrt (max_threads),sqrt (max_threads));
-			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
+			dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
+			dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
 
 			//allocate temporary space on the GPU
 			T* gpuIb_x;
@@ -179,13 +179,13 @@
 			cudaMalloc(&gpuIb_y, bytes);
 
 			// blur the original image along the x direction
-			blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z);
+			blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], (int)x, (int)y, (int)z);
 			
 			// blur the x-blurred image along the y direction
-			blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z);
+			blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], (int)x, (int)y, (int)z);
 			
 			// blur the xy-blurred image along the z direction
-			blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z);
+			blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], (int)x, (int)y, (int)z);
 
 			//cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice);
 
diff --git a/cpp/gradient3.cuh b/cpp/gradient3.cuh
index 92f0ccf..b515257 100644
--- a/cpp/gradient3.cuh
+++ b/cpp/gradient3.cuh
@@ -63,11 +63,11 @@ void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){
 
 			
 	unsigned int max_threads = stim::maxThreadsPerBlock();
-	dim3 threads(sqrt (max_threads),sqrt (max_threads));
-	dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
+	dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
+	dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
 	
 	//call the GPU kernel to determine the gradient
-	gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z);
+	gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, (int)x, (int)y, (int)z);
 
 }
 
diff --git a/cpp/local_max3.cuh b/cpp/local_max3.cuh
index a13bd56..677f3cb 100644
--- a/cpp/local_max3.cuh
+++ b/cpp/local_max3.cuh
@@ -49,15 +49,15 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn
 }
 
 template<typename T>
-void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
+void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], size_t x, size_t y, size_t z){
 
 	//find the max number of threads per block.
 	unsigned int max_threads = stim::maxThreadsPerBlock();
-	dim3 threads(sqrt (max_threads),sqrt (max_threads));
-	dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
+	dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
+	dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
 
 	//call the kernel to find the local max
-	cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, conn[0], conn[1], conn[2], x, y, z);
+	cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, (int)conn[0], (int)conn[1], (int)conn[2], (int)x, (int)y, (int)z);
 
 
 
diff --git a/cpp/main.cpp b/cpp/main.cpp
index 1f0d7a2..72acb2d 100644
--- a/cpp/main.cpp
+++ b/cpp/main.cpp
@@ -93,8 +93,8 @@ void init_args(int argc, char* argv[]) {
 	iter = args["iter"].as_int();
 	rmax = (unsigned int)args["rmax"].as_int();
 	nlmax = (unsigned int)args["conn"].as_int();
-	t = args["t"].as_float();
-	sigma = args["sigma"].as_float();
+	t = (float)args["t"].as_float();
+	sigma = (float)args["sigma"].as_float();
 	phi = (float)args["phi"].as_float() * (float)stim::PI / 180;
 
 }
@@ -108,8 +108,8 @@ int main(int argc, char** argv) {
 		cudaGetDeviceProperties(&prop, i);
 		printf("current device ID: %d\n", i);
 		printf("device name: %s\n", prop.name);
-		printf("total global mem: %lu\n", prop.totalGlobalMem);
-		printf("shared memory per block: %lu\n", prop.sharedMemPerBlock);
+		printf("total global mem: %zu\n", prop.totalGlobalMem);
+		printf("shared memory per block: %zu\n", prop.sharedMemPerBlock);
 	}
 
 	init_args(argc, argv);
@@ -139,18 +139,16 @@ int main(int argc, char** argv) {
 
 
 	ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z);			// call the ivote function
-
+	/*
 	std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary);
 	fvote.write((char*)cpuI, bytes);
 	fvote.close();
+	*/
 
 	//allocate space on the cpu for the output result
 	float* cpu_out = (float*)malloc(bytes * 3);
 
-	//write the output file.
-	//for (int t0=0; t0<=5000; t0+=100){
-	//	float t1 = t0;
-	int t0 = t;
+	
 	lmax(cpu_out, cpuI, t, conn, x, y, z);
 	
 	std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary);
@@ -168,7 +166,7 @@ int main(int argc, char** argv) {
 			for (int iy = 0; iy<y; iy++) {
 				for (int ix = 0; ix<x; ix++) {
 
-					int idx = iz * x * y + iy * x + ix;
+					size_t idx = iz * x * y + iy * x + ix;
 					if (cpu_out[idx]>0) {
 						nod++;
 						list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n';
@@ -181,8 +179,6 @@ int main(int argc, char** argv) {
 		list.close();
 	}
 
-
-	//}
 	cudaDeviceReset();
 
 }
\ No newline at end of file
diff --git a/cpp/update_dir3_aabb.cuh b/cpp/update_dir3_aabb.cuh
index 5491985..bf8e0a7 100644
--- a/cpp/update_dir3_aabb.cuh
+++ b/cpp/update_dir3_aabb.cuh
@@ -125,14 +125,14 @@
 		void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){
 
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(sqrt (max_threads),sqrt (max_threads));
-			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
+			dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
+			dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
 				
 			T* gpu_dir;												// allocate space on the GPU for the updated vote direction
 			cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3);	
 
 			//call the kernel to calculate the new voting direction
-			update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, r[0], r[1], r[2], x , y, z);
+			update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x , (int)y, (int)z);
 			
 			//call the kernel to update the gradient direction
 			update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z);
diff --git a/cpp/vote3_atomic_aabb.cuh b/cpp/vote3_atomic_aabb.cuh
index f5f21d8..8e8556a 100644
--- a/cpp/vote3_atomic_aabb.cuh
+++ b/cpp/vote3_atomic_aabb.cuh
@@ -90,9 +90,9 @@ void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], siz
 
 
 	unsigned int max_threads = stim::maxThreadsPerBlock();
-	dim3 threads(sqrt(max_threads), sqrt(max_threads));
-	dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
-	vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z);			//call the kernel to do the voting
+	dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads));
+	dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
+	vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x, (int)y, (int)z);			//call the kernel to do the voting
 
 }
 
--
libgit2 0.21.4