upload the ivote3 functions using the new method for finding voting area (boundaries of voting area)'

upload the ivote3 functions using the new method for finding voting area (boundaries of voting area)'
Laila Saadatifard
1 parent 46820b25
Showing 9 changed files with 657 additions and 122 deletions Show diff stats
Matlab_3D/gt2.vol
Matlab_3D/validation.m
cpp/CMakeLists.txt
cpp/args.txt
cpp/cudafunc.cu
cpp/main.cpp
cpp/update_dir3_aabb.cuh
cpp/vote3_atomic.cuh
cpp/vote3_atomic_aabb.cuh
 clear all;
-disp('***************** NEW RUN *********************');
+%for t=100:100:5000
+t=2350;
 X = 128;
 Y = 128;
 Z = 128;
@@ -7,15 +8,15 @@ D = 10;
 t0=1;
 r1=12;
 r2=10;
-t=2300;
-itr=8;
-vote=10;
+% t=2300;
+itr=5;
+vote=7;
 std = [5 5];
 gt_filename = 'gt2.vol';
 % out_filename = sprintf('128-128-128/0-nissl-std%d.%d-t0%d-r%d.%d-t%d-out%d.%d.vol',std(1), std(2),t0,r1,r2,t,itr,vote);
-out_filename = sprintf('D:/build/ivote3-bld/shared2D-v8/centers.%d.vol',t);
+out_filename = sprintf('D:/build/ivote3-bld/0-out.%d.vol',t);
 % txt_filename = sprintf('128-128-128/0-validation-nissl-std%d.%d-r%d.%d-t%d-out%d.%d-D%d.txt',std(1), std(2),r1,r2,t,itr,vote,D);
-txt_filename = sprintf('0-t%d--1.txt',t);
+txt_filename = sprintf('D:/build/ivote3-bld/0-t%d-atomic-aabb.txt',t);
 spec = sprintf('Nissl-std%d.%d-r%d.%d-t%d-out%d.%d',std(1), std(2),r1,r2,t,itr,vote);
 fid0 = fopen(gt_filename);
 gt = fread(fid0,[X Y*Z], 'single');
@@ -79,73 +80,73 @@ for k=1:size(id_ref,1)
 		oseg = oseg+1;
 	end
 end
-%****
-% check if the oversegmented detected cells could be assigned to the gt's element with no detected cell!
-%find the indices of gt's element with no detected cell
-u_idx1 = find(b==0);
-%find the coordinates of gt's
-ref1 = ref(u_idx1,:);
-% find the indices of detected cells which are assigned to a gt's element
-ub = unique(b(b>0));
-% set the number of detected cell which are not assigned to a gt's element to zero
-mm=0;
-% find the indices of detected cell which are not assigned to a gt's element
-for j=1:size(cent,1)
-	z = sum(ub(:)==j);
-	if z==0
-		mm = mm+1;
-		cent_id1(mm) = j;
-	end
-end
-% find the coordinated of of detected cells
-cent1 = cent(cent_id1,:);
-
-% check if there are osme low enough distances
-[idx1, dist1] = knnsearch(cent1,ref1);
-ind_eq_idx_rc = zeros(numel(idx1), 10);
-%set number of cells with the same detected cell center to zero.
-noc_rc=0;
-for i_rc=1:numel(idx1)	
-	% check the current element hasn't considered yet
-	s_ind_rc = sum(ind_eq_idx_rc==i_rc);
-	if s_ind_rc==0
-		% check how many of the elemnts assigned to the same detected cell
-		s_rc = sum(idx1==idx1(i_rc));		
-		if s_rc>1 
-			noc_rc = noc_rc+1;
-			%save the index and number of gt's elements with the same assigned detected cells.
-			id_ref_rc(noc_rc,1) = i_rc;
-			id_ref_rc(noc_rc,2) = s_rc;
-			ind1_rc = find(idx1==idx1(i_rc));
-			ind_eq_idx_rc(i_rc, 1:numel(ind1_rc)) = ind1_rc;
-		end
-	end
-end
-% determine those indices which hs assigned to only one detected cell
-b_rc = idx1;
-u_eq_idx_rc = unique(ind_eq_idx_rc(ind_eq_idx_rc>0));
-b_rc(u_eq_idx_rc)=0;
-% set the number of over sefgmented cells to zero.
-oseg_rc=0;
-for k_rc=1:size(id_ref_rc,1)
-	k1_rc = id_ref_rc(k_rc,1);
-	k2_rc = id_ref_rc(k_rc,2);
-	%find the minimum distance to the detected cell happened for which of the gt's element
-	l_rc = ind_eq_idx_rc(k1_rc,1:k2_rc);
-	[~, local_id_min_rc] = min(dist1(l_rc));
-	% set the element with minimum distance to the corresponding detected cell
-	b_rc(l_rc(local_id_min_rc)) = idx1(k1_rc);
-	%remove the proper element from the list of indices with same designated detected cells
-	u_eq_idx_rc (u_eq_idx_rc == l_rc(local_id_min_rc))=0;
-	% check if the indices remained in the list has distance less than the value is set for validation
-	ly_rc = l_rc (l_rc~=l_rc(local_id_min_rc));
-	distl_rc = dist1(ly_rc);
-	sl_rc = sum(distl_rc<D);
-	% if the distance is low enought, consider the corresponding cell as oversegmented
-	if sl_rc>0
-		oseg_rc = oseg_rc+1;
-	end
-end
+% %****
+% % check if the oversegmented detected cells could be assigned to the gt's element with no detected cell!
+% %find the indices of gt's element with no detected cell
+% u_idx1 = find(b==0);
+% %find the coordinates of gt's
+% ref1 = ref(u_idx1,:);
+% % find the indices of detected cells which are assigned to a gt's element
+% ub = unique(b(b>0));
+% % set the number of detected cell which are not assigned to a gt's element to zero
+% mm=0;
+% % find the indices of detected cell which are not assigned to a gt's element
+% for j=1:size(cent,1)
+% 	z = sum(ub(:)==j);
+% 	if z==0
+% 		mm = mm+1;
+% 		cent_id1(mm) = j;
+% 	end
+% end
+% % find the coordinated of of detected cells
+% cent1 = cent(cent_id1,:);
+% 
+% % check if there are osme low enough distances
+% [idx1, dist1] = knnsearch(cent1,ref1);
+% ind_eq_idx_rc = zeros(numel(idx1), 10);
+% %set number of cells with the same detected cell center to zero.
+% noc_rc=0;
+% for i_rc=1:numel(idx1)	
+% 	% check the current element hasn't considered yet
+% 	s_ind_rc = sum(ind_eq_idx_rc==i_rc);
+% 	if s_ind_rc==0
+% 		% check how many of the elemnts assigned to the same detected cell
+% 		s_rc = sum(idx1==idx1(i_rc));		
+% 		if s_rc>1 
+% 			noc_rc = noc_rc+1;
+% 			%save the index and number of gt's elements with the same assigned detected cells.
+% 			id_ref_rc(noc_rc,1) = i_rc;
+% 			id_ref_rc(noc_rc,2) = s_rc;
+% 			ind1_rc = find(idx1==idx1(i_rc));
+% 			ind_eq_idx_rc(i_rc, 1:numel(ind1_rc)) = ind1_rc;
+% 		end
+% 	end
+% end
+% % determine those indices which hs assigned to only one detected cell
+% b_rc = idx1;
+% u_eq_idx_rc = unique(ind_eq_idx_rc(ind_eq_idx_rc>0));
+% b_rc(u_eq_idx_rc)=0;
+% % set the number of over sefgmented cells to zero.
+% oseg_rc=0;
+% for k_rc=1:size(id_ref_rc,1)
+% 	k1_rc = id_ref_rc(k_rc,1);
+% 	k2_rc = id_ref_rc(k_rc,2);
+% 	%find the minimum distance to the detected cell happened for which of the gt's element
+% 	l_rc = ind_eq_idx_rc(k1_rc,1:k2_rc);
+% 	[~, local_id_min_rc] = min(dist1(l_rc));
+% 	% set the element with minimum distance to the corresponding detected cell
+% 	b_rc(l_rc(local_id_min_rc)) = idx1(k1_rc);
+% 	%remove the proper element from the list of indices with same designated detected cells
+% 	u_eq_idx_rc (u_eq_idx_rc == l_rc(local_id_min_rc))=0;
+% 	% check if the indices remained in the list has distance less than the value is set for validation
+% 	ly_rc = l_rc (l_rc~=l_rc(local_id_min_rc));
+% 	distl_rc = dist1(ly_rc);
+% 	sl_rc = sum(distl_rc<D);
+% 	% if the distance is low enought, consider the corresponding cell as oversegmented
+% 	if sl_rc>0
+% 		oseg_rc = oseg_rc+1;
+% 	end
+% end
 %******
 % b include the gt's element and its detected cells, for those element with no cell detection, b has zero value
@@ -154,11 +155,11 @@ b_dist = dist;
 % remove the disatances for those elements with no detected cells from distance array
 b_dist(b_ind)=-1;
-b_ind_rc = find(b_rc==0);
-b_dist_rc = dist1;
-b_dist_rc(b_ind_rc)=-1;
+% b_ind_rc = find(b_rc==0);
+% b_dist_rc = dist1;
+% b_dist_rc(b_ind_rc)=-1;
 % calculate Ttrue Positive, number of detected cells that have low enough distance to one of the gt's elements.
-TP = sum(b_dist>=0) - sum(b_dist>D) + (sum(b_dist_rc>=0) - sum(b_dist_rc>D));
+TP = sum(b_dist>=0) - sum(b_dist>D); % + (sum(b_dist_rc>=0) - sum(b_dist_rc>D));
 % calculate False Negative, number of gt's elements with no detected cells.
 FN = size(ref, 1) - TP;
 % calculate False Positive, number of detected cells, which their distance to the gt's element is long
@@ -179,3 +180,5 @@ fprintf(fid_text, &#39;Precision = %f\n&#39;, Precision);
 fprintf(fid_text, 'Accuracy = %f\n', Accuracy);
 fprintf(fid_text, 'Recall = %f\n', Recall);
 fclose(fid_text);
+clear all;
+%end
 \ No newline at end of file
@@ -10,6 +10,9 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} &quot;${CMAKE_SOURCE_DIR}&quot;)
 #set up CUDA
 find_package(CUDA REQUIRED)
+#set up opencv
+find_package(opencv)
+
 #find the STIM library
 find_package(STIM REQUIRED)
+nissl-float-128.128.128.vol 0-out --t 2350 --x 128 --y 128 --z 128 --anisotropy 1 --invert
 \ No newline at end of file
+/*#include "circle_check.cuh"
+
+void test_3(float* gpu_out, float* gpu_grad, float rmax, float phi, int n, int x, int y, int z){
+gpu_test3(gpu_out, gpu_grad, rmax, phi, n, x, y, z);
+}
+*/
+
+
 #include "gaussian_blur3.cuh"
 #include "gradient3.cuh"
 #include "mag3.cuh"
-#include "vote3.cuh"
+#include "vote3_atomic_aabb.cuh"
 #include "update_dir3.cuh"
 #include "local_max3.cuh"
@@ -10,11 +18,11 @@ void ivote3(float* img, float sigma[], float anisotropy, float phi, float d_phi,
 			int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
-	cudaSetDevice(0);
+	cudaSetDevice(1);
 	// compute the number of bytes in the input data
 	unsigned int bytes = x * y * z * sizeof(float);
-	//assign memory on gpu for the input data.
+	//assign memory on gpu for the input data.z
 	float* gpuI0;
 	cudaMalloc(&gpuI0, bytes);	
@@ -41,18 +49,17 @@ void ivote3(float* img, float sigma[], float anisotropy, float phi, float d_phi,
 	//call the vote function.
 	for (int i = 0; i < iter; i++){
-
+	
+		cudaMemset(gpu_vote, 0, bytes);
 		gpu_vote3<float>(gpu_vote, gpu_grad, cos_phi, r, x, y, z);
 		cudaDeviceSynchronize();
-		/*if (i==7)
-			cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);*/
-		
-		if (phi >= d_phi){	
+	
+		//if (phi >= d_phi){	
 			gpu_update_dir3<float>(gpu_grad, gpu_vote, cos_phi, r, x, y, z);
 			cudaDeviceSynchronize();
 			phi = phi - d_phi;
 			cos_phi = cos(phi);
-		}
+		//}
 	}
@@ -78,6 +85,8 @@ void ivote3(float* img, float sigma[], float anisotropy, float phi, float d_phi,
 void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
 	unsigned int bytes = x * y * z * sizeof(float);
+	cudaSetDevice(1);
+
 	//assign memory on gpu for the input data.
 	float* gpuV;
 	cudaMalloc(&gpuV, bytes);	
@@ -96,4 +105,5 @@ void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, u
 	cudaFree(gpuV);
 	cudaFree(gpuOut);
-}
 \ No newline at end of file
+}
+
@@ -9,6 +9,84 @@
 #include <stim/image/image.h>
 #define pi	3.14159
+#define M_PI	3.14159
+#include <stim/math/circle.h>
+#include <stim/math/vec3.h>
+#include <stim/math/plane.h>
+#include <stim/math/vector.h>
+//#include <cuda.h>
+//#include <stim/cuda/cudatools.h>
+//#include <stim/cuda/cudatools/error.h>
+
+
+/*void test_3(float* gpu_out, float* gpu_grad, float rmax, float phi, int n, int x, int y, int z);
+
+int main(){
+	
+	int  n=20;
+	float rmax;
+	float phi_deg;
+	float phi;
+	rmax=4;
+	phi_deg = 15;
+	phi = phi_deg * pi/180;
+	int x,y,z;
+	x=y=z=1;
+	unsigned int size = x*y*z*sizeof (float);
+	float* cpu_grad = (float*) malloc(3*size);
+	float* gpu_grad;
+	cudaMalloc(&gpu_grad, 3*size);
+	cpu_grad[0]=1;
+	cpu_grad[1]=0;
+	cpu_grad[2]=-0.5;
+	cudaMemcpy(gpu_grad, cpu_grad, 3*size, cudaMemcpyHostToDevice);
+	float* cpu_out = (float*) malloc(3*size*(n+1));
+	float* gpu_out;
+	cudaMalloc(&gpu_out, 3*size*(n+1));
+	test_3(gpu_out, gpu_grad, rmax, phi, n, x, y, z);
+	cudaMemcpy(cpu_out, gpu_out, 3*size*(n+1), cudaMemcpyDeviceToHost);
+	std::ofstream list("circle_check_cuda1.txt");
+	if (list.is_open()){
+		for (int j=0; j<=n; ++j)
+			list << cpu_out[3*j] << '\t' << cpu_out[3*j +1] << '\t' << cpu_out[3*j + 2] << '\n';
+	}
+	list.close();
+	*/
+	/*
+	int  n=10;
+	stim::circle<float>  cir;
+	float* c0= (float*) malloc(3*sizeof(float));
+	c0[0] =-4;
+	c0[1]=0;
+	c0[2] = 3;
+	stim::vec3<float> c(c0[0],c0[1],c0[2]);
+	float len = c.len();
+	stim::vec3<float> norm(c0[0]/len,c0[1]/len,c0[2]/len);
+	std::cout<< len << '\n';
+	std::cout<< norm << '\n';
+	cir.center(c);
+	cir.normal(norm);
+	cir.scale(2);
+	stim::vec3<float> out = cir.p(45);
+	std::vector<stim::vec3<float>> out2 = cir.getPoints(n);
+	
+	std::cout<< out << '\n';
+	std::cout <<out[0] << '\t' << out[1] << '\t' << out[2] <<'\n';
+	std::cout<< c << '\n';
+	
+	for (std::vector<stim::vec3<float>>::const_iterator i = out2.begin(); i != out2.end(); ++i)
+    std::cout << *i << '\n';
+	std::ofstream list("circle_check.txt");
+	if (list.is_open()){
+		for (std::vector<stim::vec3<float>>::const_iterator j = out2.begin(); j != out2.end(); ++j)
+			list << *j << '\n';
+	}
+	list.close();
+	std::cin >> n;
+
+}
+
+*/
 void ivote3(float* img, float std[], float anisotropy, float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], 
 			unsigned int x, unsigned int y, unsigned int z);
@@ -98,7 +176,7 @@ int main(int argc, char** argv){
 	float t = args["t"].as_float();
 	//set the anisotropy
 	float anisotropy =  args["anisotropy"].as_float();
-	unsigned int rmax = 10 ;
+	unsigned int rmax = 10;
 	unsigned int r[3] = { 12, rmax, rmax};
 	float std = 5;
 	float sigma[3] = { std, std, std};
@@ -137,50 +215,51 @@ int main(int argc, char** argv){
 	std::ofstream fblur("shared2D-v8/vote8.vol", std::ofstream::out | std::ofstream::binary);
 	fblur.write((char*)cpuI, bytes);
 	fblur.close();
-	/*
-	stim::image<float>imgrad3;
-	imgrad3.set_interleaved3(cpu_out, 128,128,128,3);
-	std::ofstream fgx("syn/gx-128.vol", std::ofstream::out | std::ofstream::binary);
-	fgx.write((char*)imgrad3.channel(0).data(), bytes);
-	fgx.close();
-	*/
+	
+	//stim::image<float>imgrad3;
+	//imgrad3.set_interleaved3(cpu_out, 128,128,128,3);
+	//std::ofstream fgx("syn/gx-128.vol", std::ofstream::out | std::ofstream::binary);
+	//fgx.write((char*)imgrad3.channel(0).data(), bytes);
+	//fgx.close();
+	
 	//write the output file.
-	//for (int t0=2000; t0<=2500; t0+=100){
+	//for (int t0=0; t0<=5000; t0+=100){
+	//	float t1 = t0;
 		int t0 = t;
 			lmax(cpu_out, cpuI, t, conn, x, y, z);
 			//std::ofstream fo("shared2D-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary);
-			std::ofstream fo("shared2D-v8/" + OutName.str()+std::to_string(t0)+".vol", std::ofstream::out | std::ofstream::binary);
+			std::ofstream fo( OutName.str()+std::to_string(t0)+".vol", std::ofstream::out | std::ofstream::binary);
 			fo.write((char*)cpu_out, bytes);
 			fo.close();
 	// creat a file for saving the list centers
-		std::ofstream list("shared2D-v8/" + OutName.str()+std::to_string(t0)+".obj");
-		// set the number of detected cells to zero.
-		int nod = 0;
-		if (list.is_open()){
-
-				for (int iz=0; iz<z; iz++){
-					for (int iy=0; iy<y; iy++){
-						for (int ix=0; ix<x; ix++){
-
-							int idx = iz * x * y + iy * x + ix;
-							if (cpu_out[idx]==1){
-								nod++;
-								list << "v" << "\t" << ix << "\t" << iy << "\t"<< iz << '\n' ;
-					
-							}
-						}
-					}
-				}
-				list << "p" << "\t";
-				for (unsigned int i_nod =1 ; i_nod <=nod; i_nod++){
-					list << i_nod << "\t";
-				}
+		//std::ofstream list("shared2D-v8/" + OutName.str()+std::to_string(t0)+".obj");
+		//// set the number of detected cells to zero.
+		//int nod = 0;
+		//if (list.is_open()){
-		list.close();
-		}
-	
+		//		for (int iz=0; iz<z; iz++){
+		//			for (int iy=0; iy<y; iy++){
+		//				for (int ix=0; ix<x; ix++){
+
+		//					int idx = iz * x * y + iy * x + ix;
+		//					if (cpu_out[idx]==1){
+		//						nod++;
+		//						list << "v" << "\t" << ix << "\t" << iy << "\t"<< iz << '\n' ;
+		//			
+		//					}
+		//				}
+		//			}
+		//		}
+		//		list << "p" << "\t";
+		//		for (unsigned int i_nod =1 ; i_nod <=nod; i_nod++){
+		//			list << i_nod << "\t";
+		//		}
+
+		//list.close();
+		//}
+	//}
 		cudaDeviceReset();       
 }
+#ifndef STIM_CUDA_UPDATE_DIR3_AABB_H
+#define STIM_CUDA_UPDATE_DIR3_AABB_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include "cpyToshare.cuh"
+#define M_PI	3.14159
+#include <stim/math/circle.h>
+#include <stim/math/vec3.h>
+#include <stim/math/plane.h>
+#include <stim/math/vector.h>
+#include <stim/visualization/aabb3.h>
+
+		// Kernel: computes the voting direction of every voxel (the "voter") for the next iteration.
+		// The voter scans an axis-aligned bounding box around its conical voting area, finds the voxel
+		// with the largest vote value inside the cone, and writes to gpu_dir a vector that points from
+		// the voter to that voxel, scaled by g_sph[0].
+		//
+		//	gpu_dir		output, 3 interleaved components per voxel
+		//	gpu_grad	input gradient, 3 interleaved components per voxel
+		//	gpu_vote	input vote image, 1 value per voxel
+		//	cos_phi		cosine of the half-angle of the voting cone
+		//	rx, ry, rz	semi-axes of the ellipsoid that limits the voting area
+		//	x, y, z		volume dimensions
+		//
+		// Launch layout: 2D blocks; blockIdx.x walks x, blockIdx.y packs (y-block, z-slice).
+		template<typename T>
+		__global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz,  int x,  int y, int z){
+
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;			//calculate x,y,z coordinates for this thread
+
+			// Number of blocks per z-slice along y. A ceiling division is required: with a truncating
+			// division the last partial block of rows is never visited when y is not a multiple of
+			// blockDim.y, and grid_y becomes 0 (modulo by zero) when y < blockDim.y.
+			int grid_y = (y + blockDim.y - 1) / blockDim.y;
+			int blockidx_y = blockIdx.y % grid_y;
+			int yi = blockidx_y * blockDim.y + threadIdx.y;
+			int zi = blockIdx.y / grid_y;
+			if(xi >= x || yi >= y || zi >= z) return;			//surplus threads/blocks have nothing to do
+			int i = zi * x * y + yi * x + xi;					//compute the global 1D index for this voxel
+
+			stim::vec3<float> g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]);   // form a vec3 variable for the gradient vector
+
+			// A zero gradient has no direction: normalising it yields NaN, which would end up in gpu_dir
+			// (0 * NaN) and poison every later iteration. Keep the direction at zero instead.
+			if (g[0] == 0 && g[1] == 0 && g[2] == 0){
+				gpu_dir[i * 3 + 0] = 0;
+				gpu_dir[i * 3 + 1] = 0;
+				gpu_dir[i * 3 + 2] = 0;
+				return;
+			}
+
+			float rx_sq = rx * rx;        // squared semi-axes of the limiting ellipsoid
+			float ry_sq = ry * ry;
+			float rz_sq = rz * rz;
+
+			// spherical form of the gradient; g_sph[1] and g_sph[2] are used as angles and g_sph[0] as
+			// the magnitude below (NOTE(review): confirm against stim::vec3::cart2sph component order)
+			stim::vec3<float> g_sph = g.cart2sph();
+			int n = 4;										//number of points sampled on the cone's rim to bound the voting area
+			float xc = rx * cos(g_sph[1]) * sin(g_sph[2]) ;	//offset from the voter to the center of the cone's far surface
+			float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ;
+			float zc = rz * cos(g_sph[2]) ;
+			float r = sqrt(xc*xc + yc*yc + zc*zc);
+			xc+=xi;											//make the center absolute
+			yc+=yi;
+			zc+=zi;
+			stim::vec3<float> center(xc,yc,zc);
+			float d = 2 * r * tan(acos(cos_phi) );			//find the diameter of the conical voting area
+			stim::vec3<float> norm = g.norm();				//compute the normalized gradient vector
+			float step = 360.0/(float) n;
+			stim::circle<float>  cir(center, d, norm);
+
+			// grow a bounding box from the voter, the cone's far center and n points on its rim,
+			// then clip the box to the volume
+			stim::aabb3<int> bb(xi,yi,zi);
+			bb.insert(xc,yc,zc);
+			for(float j = 0; j <360.0; j += step){
+				stim::vec3<float> out = cir.p(j);
+				bb.insert(out[0], out[1], out[2]);
+			}
+			bb.trim_low(0,0,0);
+			bb.trim_high(x-1, y-1, z-1);
+
+			float best_vote = 0;			// largest vote value found so far inside the voting area
+
+			float id_x = g[0];				// resulting direction; stays at the current gradient when no larger vote is found
+			float id_y = g[1];
+			float id_z = g[2];
+
+			for (int bz=bb.low[2]; bz<=bb.high[2]; bz++){
+				int dz = bz - zi;							//offset between the voter and the current counter along z
+				float dz_sq = dz * dz;
+				for (int by=bb.low[1]; by<=bb.high[1]; by++){
+					int dy = by - yi;						//offset along y
+					float dy_sq = dy * dy;
+					for (int bx=bb.low[0]; bx<=bb.high[0]; bx++){
+						int dx = bx - xi;					//offset along x
+						float dx_sq = dx * dx;
+
+						float dist = sqrt(dx_sq + dy_sq + dz_sq);			//distance between the voter and the current counter
+						// cosine of the angle between the gradient and the voter->counter vector; for the
+						// voter's own voxel dist is 0, the quotient is NaN and the test below rejects it
+						float cos_diff = (norm[0] * dx + norm[1] * dy +  norm[2] * dz)/dist;
+						if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){			//counter lies inside the ellipsoid and the cone
+							int idx_c = (bz* y + by) * x + bx;			//1D index of the current counter
+							float l_vote = gpu_vote[idx_c];
+							if  (l_vote>best_vote) {
+								best_vote = l_vote;
+								id_x = dx;
+								id_y = dy;
+								id_z = dz;
+							}
+						}
+					}
+				}
+			}
+
+			// unit vector toward the strongest vote, scaled by g_sph[0]
+			float m_id = sqrt (id_x*id_x + id_y*id_y + id_z*id_z);
+			gpu_dir[i * 3 + 0] = g_sph[0] * (id_x/m_id);
+			gpu_dir[i * 3 + 1] = g_sph[0] * (id_y/m_id);
+			gpu_dir[i * 3 + 2] = g_sph[0] * (id_z/m_id);
+		}
+
+
+
+		// Kernel: overwrites the gradient image with the voting directions computed into gpu_dir.
+		// Both arrays hold 3 interleaved components per voxel; x, y, z are the volume dimensions.
+		// Launch layout: 2D blocks; blockIdx.x walks x, blockIdx.y packs (y-block, z-slice).
+		template<typename T>
+		__global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){
+
+			//calculate x,y,z coordinates for this thread
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			// Blocks per z-slice along y. Ceiling division: a truncating division skips the last partial
+			// block of rows when y is not a multiple of blockDim.y and is 0 when y < blockDim.y.
+			int grid_y = (y + blockDim.y - 1) / blockDim.y;
+			int blockidx_y = blockIdx.y % grid_y;
+			int yi = blockidx_y * blockDim.y + threadIdx.y;
+			int zi = blockIdx.y / grid_y;
+
+			//threads that fall outside the volume have nothing to copy
+			if(xi >= x || yi >= y || zi >= z) return;
+
+			//only form the 1D index once the coordinates are known to be valid
+			int i = zi * x * y + yi * x + xi;
+
+			//update the gradient image with the new direction
+			gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0];
+			gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1];
+			gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2];
+		}
+		
+		// Host wrapper: updates the gradient image in place (on the GPU) with the new voting directions.
+		//
+		//	gpu_grad	device pointer, 3 interleaved components per voxel (input and output)
+		//	gpu_vote	device pointer, 1 vote value per voxel
+		//	cos_phi		cosine of the half-angle of the voting cone
+		//	r			{rx, ry, rz} semi-axes of the voting area
+		//	x, y, z		volume dimensions
+		//
+		// If the temporary direction buffer cannot be allocated the function reports the failure on
+		// std::cerr and leaves gpu_grad untouched.
+		template<typename T>
+		void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+
+			// square thread blocks whose side is the integer square root of the device limit
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			unsigned int side = (unsigned int) sqrt((double) max_threads);
+			dim3 threads(side, side);
+			// blocks.y packs the y-blocks of every z-slice; one extra block per axis covers partial tiles
+			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
+
+			// allocate space on the GPU for the updated vote direction (size computed in size_t so that
+			// large volumes do not wrap a 32-bit product)
+			size_t dir_bytes = (size_t) x * y * z * 3 * sizeof(T);
+			T* gpu_dir = 0;
+			if (cudaMalloc(&gpu_dir, dir_bytes) != cudaSuccess){
+				// launching the kernels with an invalid buffer would corrupt memory or fault later
+				std::cerr << "gpu_update_dir3: unable to allocate " << dir_bytes << " bytes on the GPU" << std::endl;
+				return;
+			}
+
+			//call the kernel to calculate the new voting direction
+			update_dir3<T> <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z);
+
+			//call the kernel to update the gradient direction (same stream, so it runs after update_dir3)
+			update_grad3<T> <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z);
+
+			//free allocated memory
+			cudaFree(gpu_dir);
+
+		}
+		
+		// Host wrapper working on CPU buffers: uploads the vote and gradient images, runs
+		// gpu_update_dir3 and downloads the updated gradient into cpu_grad.
+		//
+		//	cpu_grad	host pointer, 3 interleaved components per voxel (input and output)
+		//	cpu_vote	host pointer, 1 vote value per voxel
+		//	cos_phi		cosine of the half-angle of the voting cone
+		//	r			{rx, ry, rz} semi-axes of the voting area
+		//	x, y, z		volume dimensions
+		template<typename T>
+		void cpu_update_dir3(T* cpu_grad, T* cpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+
+			// number of bytes in one scalar volume; kept in size_t because the 32-bit product
+			// (and three times it, for the gradient) wraps around for large volumes
+			size_t bytes = (size_t) x * y * z * sizeof(T);
+			size_t grad_bytes = bytes * 3;
+
+			//allocate space on the GPU for the Vote data and copy it over
+			T* gpu_vote;
+			cudaMalloc(&gpu_vote, bytes);
+			cudaMemcpy(gpu_vote, cpu_vote, bytes, cudaMemcpyHostToDevice);
+
+			//allocate space on the GPU for the Gradient data and copy it over
+			T* gpu_grad;
+			cudaMalloc(&gpu_grad, grad_bytes);
+			cudaMemcpy(gpu_grad, cpu_grad, grad_bytes, cudaMemcpyHostToDevice);
+
+			//call the GPU version of the update direction function
+			gpu_update_dir3<T>(gpu_grad, gpu_vote, cos_phi, r, x , y, z);
+
+			//copy the new gradient image back to the CPU (blocking copy, waits for the kernels)
+			cudaMemcpy(cpu_grad, gpu_grad, grad_bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpu_vote);
+			cudaFree(gpu_grad);
+		}
+		
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_VOTE3_ATOMIC_H
+#define STIM_CUDA_VOTE3_ATOMIC_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/cudatools/error.h>
+#include "cpyToshare.cuh"
+
+		// Kernel: every voxel (the "voter") adds its gradient magnitude to each voxel (the "counter")
+		// that lies inside its voting area: within the ellipsoid with semi-axes (rx, ry, rz) around the
+		// voter and within the cone of half-angle acos(cos_phi) around the voter's gradient.
+		//
+		//	gpu_vote	output vote image, accumulated with atomicAdd; the caller must zero it first
+		//	gpu_grad	input gradient, 3 interleaved components per voxel
+		//	x, y, z		volume dimensions
+		//
+		// Launch layout: 2D blocks; blockIdx.x walks x, blockIdx.y packs (y-block, z-slice).
+		template<typename T>
+		__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
+
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;						//calculate x,y,z coordinates for this thread
+
+			// Blocks per z-slice along y. Ceiling division: a truncating division skips the last partial
+			// block of rows when y is not a multiple of blockDim.y and is 0 when y < blockDim.y.
+			int grid_y = (y + blockDim.y - 1) / blockDim.y;
+			int blockidx_y = blockIdx.y % grid_y;
+			int yi = blockidx_y * blockDim.y + threadIdx.y;
+			int zi = blockIdx.y / grid_y;
+
+			if(xi>=x || yi>=y || zi>=z) return;
+
+			int i = zi * x * y + yi * x + xi;	  // calculate the 1D index of the voter
+			float gx_v = gpu_grad[3*i];           // gradient of the voter in cartesian coordinates - global memory fetch
+			float gy_v = gpu_grad[3*i+1];
+			float gz_v = gpu_grad[3*i+2];
+
+			float mag_v = sqrt(gx_v*gx_v + gy_v*gy_v + gz_v*gz_v);  // compute the gradient magnitude for the voter
+
+			// A voter without gradient has no direction and nothing to add; leaving early avoids the
+			// division by zero below and a pointless scan of the whole window.
+			if (mag_v == 0) return;
+
+			float gx_v_n = gx_v/mag_v;      // normalize the gradient vector for the voter
+			float gy_v_n = gy_v/mag_v;
+			float gz_v_n = gz_v/mag_v;
+
+			float rx_sq = rx * rx;        // squared semi-axes of the limiting ellipsoid
+			float ry_sq = ry * ry;
+			float rz_sq = rz * rz;
+
+			for (int z_c=-rz; z_c<=rz; z_c++){
+				int zi_c = zi + z_c;              // z position of the current counter
+				if (zi_c < 0 || zi_c >= z) continue;      // counter must be inside the image
+				float z_sq = z_c * z_c;
+				for (int y_c=-ry; y_c<=ry; y_c++){
+					int yi_c = yi + y_c;
+					if (yi_c < 0 || yi_c >= y) continue;
+					float y_sq = y_c * y_c;
+					for (int x_c=-rx; x_c<=rx; x_c++){
+						int xi_c = xi + x_c;
+						if (xi_c < 0 || xi_c >= x) continue;
+						float x_sq = x_c * x_c;
+						float d_c = sqrt(x_sq + y_sq + z_sq);			//distance between the voter and the current counter
+						// cosine of the angle between the gradient and the voter->counter vector; for the
+						// voter's own voxel d_c is 0, the quotient is NaN and the test below rejects it
+						float cos_diff = (gx_v_n * x_c + gy_v_n * y_c +  gz_v_n * z_c)/(d_c);
+						if ( ( (x_sq/rx_sq + y_sq/ry_sq + z_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){
+							int idx_c = (zi_c * y + yi_c) * x + xi_c;			//1D index of the current counter
+							atomicAdd (&gpu_vote[idx_c] , mag_v);			//many voters may hit the same counter
+						}
+					}
+				}
+			}
+		}
+
+		// Host wrapper: launches the vote3 kernel on device buffers.
+		// gpu_vote receives the accumulated votes, gpu_grad holds the gradient, r = {rx, ry, rz} are the
+		// semi-axes of the voting area and x, y, z the volume dimensions.
+		template<typename T>
+		void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+
+			// square thread blocks sized from the device limit
+			unsigned int block_limit = stim::maxThreadsPerBlock();
+			dim3 threads(sqrt (block_limit),sqrt (block_limit));
+
+			// blocks.y packs the y-blocks of every z-slice
+			unsigned int blocks_x = x / threads.x + 1;
+			unsigned int blocks_yz = (y / threads.y + 1) * z;
+			dim3 blocks(blocks_x, blocks_yz);
+
+			//run the voting kernel
+			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
+
+		}
+
+
+		// Host wrapper working on CPU buffers: uploads the gradient image, computes the vote image on
+		// the GPU with gpu_vote3 and downloads it into cpu_vote.
+		//
+		//	cpu_vote	host pointer, receives 1 vote value per voxel
+		//	cpu_grad	host pointer, 3 interleaved gradient components per voxel
+		//	cos_phi		cosine of the half-angle of the voting cone
+		//	r			{rx, ry, rz} semi-axes of the voting area
+		//	x, y, z		volume dimensions
+		template<typename T>
+		void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+
+			// number of bytes in one scalar volume; kept in size_t because the 32-bit product
+			// (and three times it, for the gradient) wraps around for large volumes
+			size_t bytes = (size_t) x * y * z * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpu_vote;
+			cudaMalloc(&gpu_vote, bytes);
+			// the kernel only accumulates (atomicAdd) into the vote image, so it has to start from
+			// zero; cudaMalloc does not clear the memory it returns
+			cudaMemset(gpu_vote, 0, bytes);
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpu_grad;
+			cudaMalloc(&gpu_grad, bytes*3);
+
+			//copy the Gradient data to the GPU
+			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
+
+			//call the GPU version of the vote calculation function
+			gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
+
+			//copy the Vote Data back to the CPU (blocking copy, waits for the kernel)
+			cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpu_vote);
+			cudaFree(gpu_grad);
+
+		}
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_VOTE3_ATOMIC_AABB_H
+#define STIM_CUDA_VOTE3_ATOMIC_AABB_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/cudatools/error.h>
+#include "cpyToshare.cuh"
+#define M_PI	3.14159
+#include <stim/math/circle.h>
+#include <stim/math/vec3.h>
+#include <stim/math/plane.h>
+#include <stim/math/vector.h>
+#include <stim/visualization/aabb3.h>
+
+		// Kernel: every voxel (the "voter") adds g_sph[0] (used here as its gradient magnitude) to each
+		// voxel (the "counter") inside its voting area: within the ellipsoid with semi-axes (rx, ry, rz)
+		// around the voter and within the cone of half-angle acos(cos_phi) around the voter's gradient.
+		// Instead of scanning the full (2r+1)^3 window, only an axis-aligned bounding box built around
+		// the cone is visited.
+		//
+		//	gpu_vote	output vote image, accumulated with atomicAdd; the caller must zero it first
+		//	gpu_grad	input gradient, 3 interleaved components per voxel
+		//	x, y, z		volume dimensions
+		//
+		// Launch layout: 2D blocks; blockIdx.x walks x, blockIdx.y packs (y-block, z-slice).
+		template<typename T>
+		__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
+
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;						//calculate x,y,z coordinates for this thread
+
+			// Blocks per z-slice along y. Ceiling division: a truncating division skips the last partial
+			// block of rows when y is not a multiple of blockDim.y and is 0 when y < blockDim.y.
+			int grid_y = (y + blockDim.y - 1) / blockDim.y;
+			int blockidx_y = blockIdx.y % grid_y;
+			int yi = blockidx_y * blockDim.y + threadIdx.y;
+			int zi = blockIdx.y / grid_y;
+
+			if(xi>=x || yi>=y || zi>=z) return;
+
+			int i = zi * x * y + yi * x + xi;	  // calculate the 1D index of the voter
+
+			stim::vec3<float> g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]);   // form a vec3 variable for the gradient vector
+
+			// A voter without gradient has no direction and nothing to add. Leaving early keeps NaN
+			// (from normalising a zero vector) out of the circle and bounding-box construction below.
+			if (g[0] == 0 && g[1] == 0 && g[2] == 0) return;
+
+			float rx_sq = rx * rx;        // squared semi-axes of the limiting ellipsoid
+			float ry_sq = ry * ry;
+			float rz_sq = rz * rz;
+
+			// spherical form of the gradient; g_sph[1] and g_sph[2] are used as angles and g_sph[0] as
+			// the magnitude below (NOTE(review): confirm against stim::vec3::cart2sph component order)
+			stim::vec3<float> g_sph = g.cart2sph();
+			int n = 4;										//number of points sampled on the cone's rim to bound the voting area
+			float xc = rx * cos(g_sph[1]) * sin(g_sph[2]);	//offset from the voter to the center of the cone's far surface
+			float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ;
+			float zc = rz * cos(g_sph[2]) ;
+			float r = sqrt(xc*xc + yc*yc + zc*zc);
+			xc+=xi;											//make the center absolute
+			yc+=yi;
+			zc+=zi;
+			stim::vec3<float> center(xc,yc,zc);
+
+			float d = 2 * r * tan(acos(cos_phi) );			//find the diameter of the conical voting area
+			stim::vec3<float> norm = g.norm();				//compute the normalized gradient vector
+			float step = 360.0/(float) n;
+			stim::circle<float>  cir(center, d, norm);
+
+			// grow a bounding box from the voter, the cone's far center and n points on its rim,
+			// then clip the box to the volume
+			stim::aabb3<int> bb(xi,yi,zi);
+			bb.insert(xc,yc,zc);
+			for(float j = 0; j <360.0; j += step){
+				stim::vec3<float> out = cir.p(j);
+				bb.insert(out[0], out[1], out[2]);
+			}
+			bb.trim_low(0,0,0);
+			bb.trim_high(x-1, y-1, z-1);
+
+			for (int bz=bb.low[2]; bz<=bb.high[2]; bz++){
+				int dz = bz - zi;							//offset between the voter and the current counter along z
+				float dz_sq = dz * dz;
+				for (int by=bb.low[1]; by<=bb.high[1]; by++){
+					int dy = by - yi;						//offset along y
+					float dy_sq = dy * dy;
+					for (int bx=bb.low[0]; bx<=bb.high[0]; bx++){
+						int dx = bx - xi;					//offset along x
+						float dx_sq = dx * dx;
+
+						float dist = sqrt(dx_sq + dy_sq + dz_sq);			//distance between the voter and the current counter
+						// cosine of the angle between the gradient and the voter->counter vector; for the
+						// voter's own voxel dist is 0, the quotient is NaN and the test below rejects it
+						float cos_diff = (norm[0] * dx + norm[1] * dy +  norm[2] * dz)/dist;
+						if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){			//counter lies inside the ellipsoid and the cone
+							int idx_c = (bz* y + by) * x + bx;			//1D index of the current counter
+							atomicAdd (&gpu_vote[idx_c] , g_sph[0]);	//many voters may hit the same counter
+						}
+					}
+				}
+			}
+		}
+
+		// Host wrapper: launches the bounding-box vote3 kernel on device buffers.
+		// gpu_vote receives the accumulated votes, gpu_grad holds the gradient, r = {rx, ry, rz} are the
+		// semi-axes of the voting area and x, y, z the volume dimensions.
+		template<typename T>
+		void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+
+			// square thread blocks sized from the device limit
+			unsigned int block_limit = stim::maxThreadsPerBlock();
+			dim3 threads(sqrt (block_limit),sqrt (block_limit));
+
+			// blocks.y packs the y-blocks of every z-slice
+			unsigned int blocks_x = x / threads.x + 1;
+			unsigned int blocks_yz = (y / threads.y + 1) * z;
+			dim3 blocks(blocks_x, blocks_yz);
+
+			//call the kernel to do the voting
+			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
+
+		}
+
+
+		// Host wrapper working on CPU buffers: uploads the gradient image, computes the vote image on
+		// the GPU with gpu_vote3 and downloads it into cpu_vote.
+		//
+		//	cpu_vote	host pointer, receives 1 vote value per voxel
+		//	cpu_grad	host pointer, 3 interleaved gradient components per voxel
+		//	cos_phi		cosine of the half-angle of the voting cone
+		//	r			{rx, ry, rz} semi-axes of the voting area
+		//	x, y, z		volume dimensions
+		template<typename T>
+		void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+
+			// number of bytes in one scalar volume; kept in size_t because the 32-bit product
+			// (and three times it, for the gradient) wraps around for large volumes
+			size_t bytes = (size_t) x * y * z * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpu_vote;
+			cudaMalloc(&gpu_vote, bytes);
+			// the kernel only accumulates (atomicAdd) into the vote image, so it has to start from
+			// zero; cudaMalloc does not clear the memory it returns
+			cudaMemset(gpu_vote, 0, bytes);
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpu_grad;
+			cudaMalloc(&gpu_grad, bytes*3);
+
+			//copy the Gradient data to the GPU
+			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
+
+			//call the GPU version of the vote calculation function
+			gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
+
+			//copy the Vote Data back to the CPU (blocking copy, waits for the kernel)
+			cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpu_vote);
+			cudaFree(gpu_grad);
+
+		}
+
+
+#endif
 \ No newline at end of file