diff --git a/Matlab_3D/gt-128.vol b/Matlab_3D/gt-128.vol
new file mode 100644
index 0000000..1399c1a
Binary files /dev/null and b/Matlab_3D/gt-128.vol differ
diff --git a/Matlab_3D/gt2.vol b/Matlab_3D/gt2.vol
deleted file mode 100644
index 1399c1a..0000000
Binary files a/Matlab_3D/gt2.vol and /dev/null differ
diff --git a/Matlab_3D/nissl-float-64.64.64.vol b/Matlab_3D/nissl-float-64.64.64.vol
new file mode 100644
index 0000000..020b564
Binary files /dev/null and b/Matlab_3D/nissl-float-64.64.64.vol differ
diff --git a/Matlab_3D/validation.m b/Matlab_3D/validation.m
index 6e0056c..4c45605 100644
--- a/Matlab_3D/validation.m
+++ b/Matlab_3D/validation.m
@@ -1,6 +1,6 @@
 clear all;
 %for t=100:100:5000
-t=2350;
+t=2200;
 X = 128;
 Y = 128;
 Z = 128;
@@ -12,7 +12,7 @@ r2=10;
 itr=5;
 vote=7;
 std = [5 5];
-gt_filename = 'gt2.vol';
+gt_filename = 'gt-128.vol';  
 % out_filename = sprintf('128-128-128/0-nissl-std%d.%d-t0%d-r%d.%d-t%d-out%d.%d.vol',std(1), std(2),t0,r1,r2,t,itr,vote);
 out_filename = sprintf('D:/build/ivote3-bld/0-out.%d.vol',t);
 % txt_filename = sprintf('128-128-128/0-validation-nissl-std%d.%d-r%d.%d-t%d-out%d.%d-D%d.txt',std(1), std(2),r1,r2,t,itr,vote,D);
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 387d47e..ed32ace 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -51,3 +51,4 @@ target_link_libraries(ivote3
 #copy an image test case
 configure_file(nissl-raw-data/nissl-float-256.256.256.vol nissl-float-256.256.256.vol COPYONLY)
 configure_file(nissl-raw-data/nissl-float-128.128.128.vol nissl-float-128.128.128.vol COPYONLY)
+configure_file(nissl-raw-data/nissl-float-64.64.64.vol nissl-float-64.64.64.vol COPYONLY)
diff --git a/cpp/cudafunc.cu b/cpp/cudafunc.cu
index fe5734d..8ecbe66 100644
--- a/cpp/cudafunc.cu
+++ b/cpp/cudafunc.cu
@@ -10,7 +10,7 @@ gpu_test3(gpu_out, gpu_grad, rmax, phi, n, x, y, z);
 #include "gradient3.cuh"
 #include "mag3.cuh"
 #include "vote3_atomic_aabb.cuh"
-#include "update_dir3.cuh"
+#include "update_dir3_aabb.cuh"
 #include "local_max3.cuh"
 
 
@@ -19,27 +19,21 @@ void ivote3(float* img, float sigma[], float anisotropy, float phi, float d_phi,
 
 	
 	cudaSetDevice(1);
-	// compute the number of bytes in the input data
-	unsigned int bytes = x * y * z * sizeof(float);
+	
+	unsigned int bytes = x * y * z * sizeof(float);				// compute the number of bytes in the input data
 
-	//assign memory on gpu for the input data.z
-	float* gpuI0;
+	float* gpuI0;											//assign memory on gpu for the input data
 	cudaMalloc(&gpuI0, bytes);	
+	cudaMemcpy(gpuI0, img, bytes, cudaMemcpyHostToDevice);				//copy the image data to the GPU.
 
-	//copy the image data to the GPU.
-	cudaMemcpy(gpuI0, img, bytes, cudaMemcpyHostToDevice);
-
-	//call the blurring function from the gpu.
-	gpu_gaussian_blur3<float>(gpuI0, sigma, x, y, z);
-	//cudaMemcpy(img, gpuI0, bytes, cudaMemcpyDeviceToHost);
+	
+	gpu_gaussian_blur3<float>(gpuI0, sigma, x, y, z);							//call the blurring function from the gpu.
 	cudaDeviceSynchronize();
 		
-	//assign memory on the gpu for the gradient along the X, y, z.
-	float* gpu_grad;
+	float* gpu_grad;												//assign memory on the gpu for the gradient along the X, y, z.
 	cudaMalloc(&gpu_grad, bytes*3);
 	
-	//call the gradient function from the gpu.
-	gpu_gradient3<float>(gpu_grad, gpuI0, anisotropy, x, y, z);
+	gpu_gradient3<float>(gpu_grad, gpuI0, anisotropy, x, y, z);				//call the gradient function from the gpu.
 	cudaFree(gpuI0);
 	
 	float* gpu_vote;
@@ -51,11 +45,11 @@ void ivote3(float* img, float sigma[], float anisotropy, float phi, float d_phi,
 	for (int i = 0; i < iter; i++){
 	
 		cudaMemset(gpu_vote, 0, bytes);
-		gpu_vote3<float>(gpu_vote, gpu_grad, cos_phi, r, x, y, z);
+		gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z);
 		cudaDeviceSynchronize();
 	
 		//if (phi >= d_phi){	
-			gpu_update_dir3<float>(gpu_grad, gpu_vote, cos_phi, r, x, y, z);
+			gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z);
 			cudaDeviceSynchronize();
 			phi = phi - d_phi;
 			cos_phi = cos(phi);
@@ -87,21 +81,18 @@ void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, u
 
 	cudaSetDevice(1);
 
-	//assign memory on gpu for the input data.
-	float* gpuV;
+	
+	float* gpuV;					//assign memory on gpu for the input data.
 	cudaMalloc(&gpuV, bytes);	
 
-	//copy the image data to the GPU.
-	cudaMemcpy(gpuV, in, bytes, cudaMemcpyHostToDevice);
+	cudaMemcpy(gpuV, in, bytes, cudaMemcpyHostToDevice);			//copy the image data to the GPU.
 
 	float* gpuOut;
 	cudaMalloc(&gpuOut, bytes);
 
-	//call the local max function
-	gpu_local_max3<float>(gpuOut, gpuV, t, conn, x, y, z);
+	gpu_local_max3<float>(gpuOut, gpuV, t, conn, x, y, z);				//call the local max function
 
-	//copy the final result to the cpu.
-	cudaMemcpy(out, gpuOut, bytes, cudaMemcpyDeviceToHost);
+	cudaMemcpy(out, gpuOut, bytes, cudaMemcpyDeviceToHost);				//copy the final result to the cpu.
 	
 	cudaFree(gpuV);
 	cudaFree(gpuOut);
diff --git a/cpp/main.cpp b/cpp/main.cpp
index d2a9636..dbb6ae8 100644
--- a/cpp/main.cpp
+++ b/cpp/main.cpp
@@ -9,14 +9,13 @@
 #include <stim/image/image.h>
 #define pi	3.14159
 
-#define M_PI	3.14159
-#include <stim/math/circle.h>
-#include <stim/math/vec3.h>
-#include <stim/math/plane.h>
-#include <stim/math/vector.h>
-//#include <cuda.h>
-//#include <stim/cuda/cudatools.h>
-//#include <stim/cuda/cudatools/error.h>
+//#define M_PI	3.14159
+//#include <stim/math/circle.h>
+//#include <stim/math/vec3.h>
+//#include <stim/math/plane.h>
+//#include <stim/math/vector.h>
+//#include <stim/visualization/aabb3.h>
+
 
 
 /*void test_3(float* gpu_out, float* gpu_grad, float rmax, float phi, int n, int x, int y, int z);
@@ -52,42 +51,78 @@ int main(){
 	}
 	list.close();
 	*/
-	/*
-	int  n=10;
-	stim::circle<float>  cir;
-	float* c0= (float*) malloc(3*sizeof(float));
-	c0[0] =-4;
-	c0[1]=0;
-	c0[2] = 3;
-	stim::vec3<float> c(c0[0],c0[1],c0[2]);
-	float len = c.len();
-	stim::vec3<float> norm(c0[0]/len,c0[1]/len,c0[2]/len);
-	std::cout<< len << '\n';
-	std::cout<< norm << '\n';
-	cir.center(c);
-	cir.normal(norm);
-	cir.scale(2);
-	stim::vec3<float> out = cir.p(45);
-	std::vector<stim::vec3<float>> out2 = cir.getPoints(n);
+/*
+	int main(){
+
+
+		stim::vec3<float> g(-44,-3.4,-0.005);   // form a vec3 variable for the gradient vector
+			stim::vec3<float> g_sph = g.cart2sph();			//convert cartesian coordinate to spherical for the gradient vector
+			int n =36;										//set the number of points to find the boundaries of the conical voting area
+			int xi = 105;
+			int yi = 17;
+			int zi = 23;
+			float xc = 12 * cos(g_sph[1]) * sin(g_sph[2]);			//calculate the center point of the surface of the voting area for the voter
+			float yc = 10 * sin(g_sph[1]) * sin(g_sph[2]) ;
+			float zc = 10 * cos(g_sph[2]) ;
+			float r = sqrt(xc*xc + yc*yc + zc*zc);
+			xc+=xi;
+			yc+=yi;
+			zc+=zi;
+			stim::vec3<float> center(xc,yc,zc);
+			
+			float d = 2 * r * tan(25*pi/180 );		//find the diameter of the conical voting area
+			stim::vec3<float> norm = g.norm();			//compute the normalize gradient vector
+			float step = 360.0/(float) n;
+			stim::circle<float>  cir(center, d, norm);
+			stim::aabb3<int> bb(xi,yi,zi);
+			bb.insert(xc,yc,zc);
+			for(float j = 0; j <360.0; j += step){
+				stim::vec3<float> out = cir.p(j);
+				bb.insert(out[0], out[1], out[2]);
+			}
+
+			bb.trim_low(0,0,0);
+			bb.trim_high(128-1, 128-1, 128-1);
+
+			std::cout<< bb.low[0] << '\t' << bb.low[1] << '\t' << bb.low[2] << '\n';
+			std::cout<< bb.high[0] << '\t' << bb.high[1] << '\t' << bb.high[2] << '\n';
+			std::cin >> n;
+*/
+		/*int  n=10;
+		stim::circle<float>  cir;
+		float* c0= (float*) malloc(3*sizeof(float));
+		c0[0] =-4;
+		c0[1]=0;
+		c0[2] = 3;
+		stim::vec3<float> c(c0[0],c0[1],c0[2]);
+		float len = c.len();
+		stim::vec3<float> norm(c0[0]/len,c0[1]/len,c0[2]/len);
+		std::cout<< len << '\n';
+		std::cout<< norm << '\n';
+		cir.center(c);
+		cir.normal(norm);
+		cir.scale(2);
+		stim::vec3<float> out = cir.p(45);
+		std::vector<stim::vec3<float>> out2 = cir.getPoints(n);
 	
-	std::cout<< out << '\n';
-	std::cout <<out[0] << '\t' << out[1] << '\t' << out[2] <<'\n';
-	std::cout<< c << '\n';
+		std::cout<< out << '\n';
+		std::cout <<out[0] << '\t' << out[1] << '\t' << out[2] <<'\n';
+		std::cout<< c << '\n';
 	
-	for (std::vector<stim::vec3<float>>::const_iterator i = out2.begin(); i != out2.end(); ++i)
-    std::cout << *i << '\n';
-	std::ofstream list("circle_check.txt");
-	if (list.is_open()){
-		for (std::vector<stim::vec3<float>>::const_iterator j = out2.begin(); j != out2.end(); ++j)
-			list << *j << '\n';
-	}
-	list.close();
-	std::cin >> n;
+		for (std::vector<stim::vec3<float>>::const_iterator i = out2.begin(); i != out2.end(); ++i)
+		std::cout << *i << '\n';
+		std::ofstream list("circle_check.txt");
+		if (list.is_open()){
+			for (std::vector<stim::vec3<float>>::const_iterator j = out2.begin(); j != out2.end(); ++j)
+				list << *j << '\n';
+		}
+		list.close();
+		std::cin >> n;
 
 }
-
 */
 
+
 void ivote3(float* img, float std[], float anisotropy, float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], 
 			unsigned int x, unsigned int y, unsigned int z);
 void lmax(float* center, float* vote, float t1, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z);
@@ -140,7 +175,7 @@ int main(int argc, char** argv){
 	args.add("z", "size of the dataset along Z axis", "positive value");
 	args.add("t", "threshold value for the final result", "positive valu");
 	args.add("invert", "to invert the input data set", "string");
-	args.add("anisotropy", "anisotropy value of the imaging", "positive value");
+	args.add("anisotropy", "anisotropy value of the imaging", "1");
 	//parse the command line arguments.
 	args.parse(argc, argv);
 
diff --git a/cpp/nissl-raw-data/nissl-float-64.64.64.vol b/cpp/nissl-raw-data/nissl-float-64.64.64.vol
new file mode 100644
index 0000000..020b564
Binary files /dev/null and b/cpp/nissl-raw-data/nissl-float-64.64.64.vol differ
diff --git a/cpp/update_dir3.cuh b/cpp/update_dir3.cuh
index da9837b..54104c2 100644
--- a/cpp/update_dir3.cuh
+++ b/cpp/update_dir3.cuh
@@ -129,7 +129,7 @@
 		}
 		
 		template<typename T>
-		void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+		void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
 
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(sqrt (max_threads),sqrt (max_threads));
diff --git a/cpp/update_dir3_aabb.cuh b/cpp/update_dir3_aabb.cuh
index 1d533c5..bd64026 100644
--- a/cpp/update_dir3_aabb.cuh
+++ b/cpp/update_dir3_aabb.cuh
@@ -14,8 +14,7 @@
 
 		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
 		template<typename T>
-		__global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz,  int x,  int y, int z){
-			//extern __shared__ float s_vote[];
+		__global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T phi, T cos_phi, int rx, int ry, int rz,  int x,  int y, int z){
 			
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;			//calculate x,y,z coordinates for this thread
 			
@@ -26,26 +25,13 @@
 			if(xi >= x|| yi >= y || zi>= z) return;
 			int i = zi * x * y + yi * x + xi;				//compute the global 1D index for this pixel
 
-			
-			// find the starting points for this block along the x and y directions
-			//int bxi = blockIdx.x * blockDim.x;
-			//int byi = blockidx_y * blockDim.y;
-			//find the starting points and the size of the window, which will be copied to the 2D-shared memory
-			//int bxs = bxi - rx;
-			//int bys = byi - ry;
-			//int xwidth = 2 * rx + blockDim.x;
-			//int ywidth = 2 * ry + blockDim.y;
-			//compute the coordinations of this pixel in the 2D-shared memory.
-			//int sx_rx = threadIdx.x + rx;
-			//int sy_ry = threadIdx.y + ry;
-
 			float rx_sq = rx * rx;        // compute the square for rmax 
 			float ry_sq = ry * ry;
 			float rz_sq = rz * rz;
 			
 			stim::vec3<float> g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]);   // form a vec3 variable for the gradient vector
 			stim::vec3<float> g_sph = g.cart2sph();			//convert cartesian coordinate to spherical for the gradient vector
-			int n =4;										//set the number of points to find the boundaries of the conical voting area
+			float n =8;										//set the number of points to find the boundaries of the conical voting area
 			float xc = rx * cos(g_sph[1]) * sin(g_sph[2]) ;			//calculate the center point of the surface of the voting area for the voter
 			float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ;
 			float zc = rz * cos(g_sph[2]) ;
@@ -54,9 +40,10 @@
 			yc+=yi;
 			zc+=zi;
 			stim::vec3<float> center(xc,yc,zc);
-			float d = 2 * r * tan(acos(cos_phi) );		//find the diameter of the conical voting area
+
+			float d = 2 * r * tan(phi);		//find the diameter of the conical voting area
 			stim::vec3<float> norm = g.norm();			//compute the normalize gradient vector
-			float step = 360.0/(float) n;
+			float step = 360.0/n;
 			stim::circle<float>  cir(center, d, norm);
 			stim::aabb3<int> bb(xi,yi,zi);
 			bb.insert(xc,yc,zc);
@@ -64,13 +51,13 @@
 				stim::vec3<float> out = cir.p(j);
 				bb.insert(out[0], out[1], out[2]);
 			}
-
+			bb.trim_low(xi-rx, yi-ry, zi-rz);
 			bb.trim_low(0,0,0);
+			bb.trim_high(xi+rx, yi+ry, zi+rz);
 			bb.trim_high(x-1, y-1, z-1);
 			int bx,by,bz;
 			int dx, dy, dz;
 			float dx_sq, dy_sq, dz_sq;
-			
 			float dist, cos_diff;
 			int idx_c;
 
@@ -118,42 +105,38 @@
 		template<typename T>
 		__global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){
 
-			//calculate x,y,z coordinates for this thread
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			//find the grid size along y
-			int grid_y = y / blockDim.y;
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;				//calculate x,y,z coordinates for this thread
+			
+			int grid_y = y / blockDim.y;							//find the grid size along y
 			int blockidx_y = blockIdx.y % grid_y;
 			int yi = blockidx_y * blockDim.y + threadIdx.y;
 			int zi = blockIdx.y / grid_y;
 			int i = zi * x * y + yi * x + xi;
 
 			if(xi >= x || yi >= y || zi >= z) return;
-			//update the gradient image with the new direction direction
-			gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0];
+			
+			gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0];			//update the gradient image with the new direction direction
 			gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1];
 			gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2];
 		}
 		
 		template<typename T>
-		void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+		void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
 
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(sqrt (max_threads),sqrt (max_threads));
 			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
-			//unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*sizeof(T);			
-			// allocate space on the GPU for the updated vote direction
-			T* gpu_dir;
+				
+			T* gpu_dir;												// allocate space on the GPU for the updated vote direction
 			cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3);	
 
 			//call the kernel to calculate the new voting direction
-			update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z);
-			
+			update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, r[0], r[1], r[2], x , y, z);
 			
 			//call the kernel to update the gradient direction
 			update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z);
-			
-			//free allocated memory
-			cudaFree(gpu_dir);
+						
+			cudaFree(gpu_dir);														//free allocated memory
 
 		}
 		
diff --git a/cpp/vote3.cuh b/cpp/vote3.cuh
index 5af3cef..6b06fb7 100644
--- a/cpp/vote3.cuh
+++ b/cpp/vote3.cuh
@@ -101,7 +101,7 @@
 		}
 
 		template<typename T>
-		void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+		void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
 
 			
 			unsigned int max_threads = stim::maxThreadsPerBlock();
diff --git a/cpp/vote3_atomic.cuh b/cpp/vote3_atomic.cuh
index 8d7f609..96e40c3 100644
--- a/cpp/vote3_atomic.cuh
+++ b/cpp/vote3_atomic.cuh
@@ -27,14 +27,14 @@
 
 			float mag_v = sqrt(gx_v*gx_v + gy_v*gy_v + gz_v*gz_v);  // compute the gradient magnitude for the voter
 
-			float gx_v_n = gx_v/mag_v;      // normalize the gradient vector for the voter
-			float gy_v_n = gy_v/mag_v;
-			float gz_v_n = gz_v/mag_v;
+			//float gx_v_n = gx_v/mag_v;      // normalize the gradient vector for the voter
+			//float gy_v_n = gy_v/mag_v;
+			//float gz_v_n = gz_v/mag_v;
 
 			float rx_sq = rx * rx;        // compute the square for rmax 
 			float ry_sq = ry * ry;
 			float rz_sq = rz * rz;
-			float x_sq, y_sq, z_sq, d_c, cos_diff;
+			float x_sq, y_sq, z_sq, dist, cos_diff;
 			int xi_c, yi_c, zi_c, idx_c;
 
 			for (int z_c=-rz; z_c<=rz; z_c++){
@@ -49,8 +49,8 @@
 								xi_c = xi + x_c;
 								if (xi_c < x && xi_c>=0){
 									x_sq = x_c * x_c;
-									d_c = sqrt(x_sq + y_sq + z_sq);			//calculate the distance between the voter and the current counter
-									cos_diff = (gx_v_n * x_c + gy_v_n * y_c +  gz_v_n * z_c)/(d_c);			 // calculate the cosine of angle between the voter and the current counter
+									dist = sqrt(x_sq + y_sq + z_sq);			//calculate the distance between the voter and the current counter
+									cos_diff = (gx_v * x_c + gy_v * y_c +  gz_v * z_c)/(dist * mag_v);			 // calculate the cosine of angle between the voter and the current counter
 									if ( ( (x_sq/rx_sq + y_sq/ry_sq + z_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){      
 										idx_c = (zi_c * y + yi_c) * x + xi_c;			//calculate the 1D index for the current counter
 										atomicAdd (&gpu_vote[idx_c] , mag_v);
@@ -64,7 +64,7 @@
 		}
 
 		template<typename T>
-		void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+		void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
 
 			
 			unsigned int max_threads = stim::maxThreadsPerBlock();
diff --git a/cpp/vote3_atomic_aabb.cuh b/cpp/vote3_atomic_aabb.cuh
index ab0b802..25d263f 100644
--- a/cpp/vote3_atomic_aabb.cuh
+++ b/cpp/vote3_atomic_aabb.cuh
@@ -15,7 +15,7 @@
 
 		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
 		template<typename T>
-		__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
+		__global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
 
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;						//calculate x,y,z coordinates for this thread
 			
@@ -32,13 +32,10 @@
 			float rx_sq = rx * rx;        // compute the square for rmax 
 			float ry_sq = ry * ry;
 			float rz_sq = rz * rz;
-			float dist, cos_diff;
-			int idx_c;
-
-			//float rmax = sqrt(rx_sq + ry_sq + rz_sq);
+			
 			stim::vec3<float> g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]);   // form a vec3 variable for the gradient vector
 			stim::vec3<float> g_sph = g.cart2sph();			//convert cartesian coordinate to spherical for the gradient vector
-			int n =4;										//set the number of points to find the boundaries of the conical voting area
+			float n =8;										//set the number of points to find the boundaries of the conical voting area
 			float xc = rx * cos(g_sph[1]) * sin(g_sph[2]);			//calculate the center point of the surface of the voting area for the voter
 			float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ;
 			float zc = rz * cos(g_sph[2]) ;
@@ -48,9 +45,9 @@
 			zc+=zi;
 			stim::vec3<float> center(xc,yc,zc);
 			
-			float d = 2 * r * tan(acos(cos_phi) );		//find the diameter of the conical voting area
+			float d = 2 * r * tan(phi);		//find the diameter of the conical voting area
 			stim::vec3<float> norm = g.norm();			//compute the normalize gradient vector
-			float step = 360.0/(float) n;
+			float step = 360.0/n;
 			stim::circle<float>  cir(center, d, norm);
 			stim::aabb3<int> bb(xi,yi,zi);
 			bb.insert(xc,yc,zc);
@@ -58,12 +55,15 @@
 				stim::vec3<float> out = cir.p(j);
 				bb.insert(out[0], out[1], out[2]);
 			}
-
+			bb.trim_low(xi-rx, yi-ry, zi-rz);
 			bb.trim_low(0,0,0);
+			bb.trim_high(xi+rx, yi+ry, zi+rz);
 			bb.trim_high(x-1, y-1, z-1);
 			int bx,by,bz;
 			int dx, dy, dz;
 			float dx_sq, dy_sq, dz_sq;
+			float dist, cos_diff;
+			int idx_c;
 			for (bz=bb.low[2]; bz<=bb.high[2]; bz++){
 				dz = bz - zi;							//compute the distance bw the voter and the current counter along z axis
 				dz_sq = dz * dz;
@@ -86,13 +86,13 @@
 		}
 
 		template<typename T>
-		void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+		void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
 
 			
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(sqrt (max_threads),sqrt (max_threads));
 			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
-			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);			//call the kernel to do the voting
+			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x , y, z);			//call the kernel to do the voting
 
 		}
 
--
libgit2 0.21.4