From a744d027c4d9d6583db594dc75811c6ff90eaef6 Mon Sep 17 00:00:00 2001
From: laila Saadatifard <saadatileila@gmail.com>
Date: Thu, 7 Jul 2016 10:44:37 -0500
Subject: [PATCH] upload the ivote3 functions using the new method for finding voting area (boundaries of voting area)'

---
 Matlab_3D/gt2.vol         | Bin 0 -> 8388608 bytes
 Matlab_3D/validation.m    | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------------------------------------------------------
 cpp/CMakeLists.txt        |   3 +++
 cpp/args.txt              |   1 +
 cpp/cudafunc.cu           |  30 ++++++++++++++++++++----------
 cpp/main.cpp              | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
 cpp/update_dir3_aabb.cuh  | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 cpp/vote3_atomic.cuh      | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 cpp/vote3_atomic_aabb.cuh | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 657 insertions(+), 122 deletions(-)
 create mode 100644 Matlab_3D/gt2.vol
 create mode 100644 cpp/args.txt
 create mode 100644 cpp/update_dir3_aabb.cuh
 create mode 100644 cpp/vote3_atomic.cuh
 create mode 100644 cpp/vote3_atomic_aabb.cuh

diff --git a/Matlab_3D/gt2.vol b/Matlab_3D/gt2.vol
new file mode 100644
index 0000000..1399c1a
Binary files /dev/null and b/Matlab_3D/gt2.vol differ
diff --git a/Matlab_3D/validation.m b/Matlab_3D/validation.m
index 99b6aba..6e0056c 100644
--- a/Matlab_3D/validation.m
+++ b/Matlab_3D/validation.m
@@ -1,5 +1,6 @@
 clear all;
-disp('***************** NEW RUN *********************');
+%for t=100:100:5000
+t=2350;
 X = 128;
 Y = 128;
 Z = 128;
@@ -7,15 +8,15 @@ D = 10;
 t0=1;
 r1=12;
 r2=10;
-t=2300;
-itr=8;
-vote=10;
+% t=2300;
+itr=5;
+vote=7;
 std = [5 5];
 gt_filename = 'gt2.vol';
 % out_filename = sprintf('128-128-128/0-nissl-std%d.%d-t0%d-r%d.%d-t%d-out%d.%d.vol',std(1), std(2),t0,r1,r2,t,itr,vote);
-out_filename = sprintf('D:/build/ivote3-bld/shared2D-v8/centers.%d.vol',t);
+out_filename = sprintf('D:/build/ivote3-bld/0-out.%d.vol',t);
 % txt_filename = sprintf('128-128-128/0-validation-nissl-std%d.%d-r%d.%d-t%d-out%d.%d-D%d.txt',std(1), std(2),r1,r2,t,itr,vote,D);
-txt_filename = sprintf('0-t%d--1.txt',t);
+txt_filename = sprintf('D:/build/ivote3-bld/0-t%d-atomic-aabb.txt',t);
 spec = sprintf('Nissl-std%d.%d-r%d.%d-t%d-out%d.%d',std(1), std(2),r1,r2,t,itr,vote);
 fid0 = fopen(gt_filename);
 gt = fread(fid0,[X Y*Z], 'single');
@@ -79,73 +80,73 @@ for k=1:size(id_ref,1)
 		oseg = oseg+1;
 	end
 end
-%****
-% check if the oversegmented detected cells could be assigned to the gt's element with no detected cell!
-%find the indices of gt's element with no detected cell
-u_idx1 = find(b==0);
-%find the coordinates of gt's
-ref1 = ref(u_idx1,:);
-% find the indices of detected cells which are assigned to a gt's element
-ub = unique(b(b>0));
-% set the number of detected cell which are not assigned to a gt's element to zero
-mm=0;
-% find the indices of detected cell which are not assigned to a gt's element
-for j=1:size(cent,1)
-	z = sum(ub(:)==j);
-	if z==0
-		mm = mm+1;
-		cent_id1(mm) = j;
-	end
-end
-% find the coordinated of of detected cells
-cent1 = cent(cent_id1,:);
-
-% check if there are osme low enough distances
-[idx1, dist1] = knnsearch(cent1,ref1);
-ind_eq_idx_rc = zeros(numel(idx1), 10);
-%set number of cells with the same detected cell center to zero.
-noc_rc=0;
-for i_rc=1:numel(idx1)	
-	% check the current element hasn't considered yet
-	s_ind_rc = sum(ind_eq_idx_rc==i_rc);
-	if s_ind_rc==0
-		% check how many of the elemnts assigned to the same detected cell
-		s_rc = sum(idx1==idx1(i_rc));		
-		if s_rc>1 
-			noc_rc = noc_rc+1;
-			%save the index and number of gt's elements with the same assigned detected cells.
-			id_ref_rc(noc_rc,1) = i_rc;
-			id_ref_rc(noc_rc,2) = s_rc;
-			ind1_rc = find(idx1==idx1(i_rc));
-			ind_eq_idx_rc(i_rc, 1:numel(ind1_rc)) = ind1_rc;
-		end
-	end
-end
-% determine those indices which hs assigned to only one detected cell
-b_rc = idx1;
-u_eq_idx_rc = unique(ind_eq_idx_rc(ind_eq_idx_rc>0));
-b_rc(u_eq_idx_rc)=0;
-% set the number of over sefgmented cells to zero.
-oseg_rc=0;
-for k_rc=1:size(id_ref_rc,1)
-	k1_rc = id_ref_rc(k_rc,1);
-	k2_rc = id_ref_rc(k_rc,2);
-	%find the minimum distance to the detected cell happened for which of the gt's element
-	l_rc = ind_eq_idx_rc(k1_rc,1:k2_rc);
-	[~, local_id_min_rc] = min(dist1(l_rc));
-	% set the element with minimum distance to the corresponding detected cell
-	b_rc(l_rc(local_id_min_rc)) = idx1(k1_rc);
-	%remove the proper element from the list of indices with same designated detected cells
-	u_eq_idx_rc (u_eq_idx_rc == l_rc(local_id_min_rc))=0;
-	% check if the indices remained in the list has distance less than the value is set for validation
-	ly_rc = l_rc (l_rc~=l_rc(local_id_min_rc));
-	distl_rc = dist1(ly_rc);
-	sl_rc = sum(distl_rc<D);
-	% if the distance is low enought, consider the corresponding cell as oversegmented
-	if sl_rc>0
-		oseg_rc = oseg_rc+1;
-	end
-end
+% %****
+% % check if the oversegmented detected cells could be assigned to the gt's element with no detected cell!
+% %find the indices of gt's element with no detected cell
+% u_idx1 = find(b==0);
+% %find the coordinates of gt's
+% ref1 = ref(u_idx1,:);
+% % find the indices of detected cells which are assigned to a gt's element
+% ub = unique(b(b>0));
+% % set the number of detected cell which are not assigned to a gt's element to zero
+% mm=0;
+% % find the indices of detected cell which are not assigned to a gt's element
+% for j=1:size(cent,1)
+% 	z = sum(ub(:)==j);
+% 	if z==0
+% 		mm = mm+1;
+% 		cent_id1(mm) = j;
+% 	end
+% end
+% % find the coordinated of of detected cells
+% cent1 = cent(cent_id1,:);
+% 
+% % check if there are osme low enough distances
+% [idx1, dist1] = knnsearch(cent1,ref1);
+% ind_eq_idx_rc = zeros(numel(idx1), 10);
+% %set number of cells with the same detected cell center to zero.
+% noc_rc=0;
+% for i_rc=1:numel(idx1)	
+% 	% check the current element hasn't considered yet
+% 	s_ind_rc = sum(ind_eq_idx_rc==i_rc);
+% 	if s_ind_rc==0
+% 		% check how many of the elemnts assigned to the same detected cell
+% 		s_rc = sum(idx1==idx1(i_rc));		
+% 		if s_rc>1 
+% 			noc_rc = noc_rc+1;
+% 			%save the index and number of gt's elements with the same assigned detected cells.
+% 			id_ref_rc(noc_rc,1) = i_rc;
+% 			id_ref_rc(noc_rc,2) = s_rc;
+% 			ind1_rc = find(idx1==idx1(i_rc));
+% 			ind_eq_idx_rc(i_rc, 1:numel(ind1_rc)) = ind1_rc;
+% 		end
+% 	end
+% end
+% % determine those indices which hs assigned to only one detected cell
+% b_rc = idx1;
+% u_eq_idx_rc = unique(ind_eq_idx_rc(ind_eq_idx_rc>0));
+% b_rc(u_eq_idx_rc)=0;
+% % set the number of over sefgmented cells to zero.
+% oseg_rc=0;
+% for k_rc=1:size(id_ref_rc,1)
+% 	k1_rc = id_ref_rc(k_rc,1);
+% 	k2_rc = id_ref_rc(k_rc,2);
+% 	%find the minimum distance to the detected cell happened for which of the gt's element
+% 	l_rc = ind_eq_idx_rc(k1_rc,1:k2_rc);
+% 	[~, local_id_min_rc] = min(dist1(l_rc));
+% 	% set the element with minimum distance to the corresponding detected cell
+% 	b_rc(l_rc(local_id_min_rc)) = idx1(k1_rc);
+% 	%remove the proper element from the list of indices with same designated detected cells
+% 	u_eq_idx_rc (u_eq_idx_rc == l_rc(local_id_min_rc))=0;
+% 	% check if the indices remained in the list has distance less than the value is set for validation
+% 	ly_rc = l_rc (l_rc~=l_rc(local_id_min_rc));
+% 	distl_rc = dist1(ly_rc);
+% 	sl_rc = sum(distl_rc<D);
+% 	% if the distance is low enought, consider the corresponding cell as oversegmented
+% 	if sl_rc>0
+% 		oseg_rc = oseg_rc+1;
+% 	end
+% end
 
 %******
 % b include the gt's element and its detected cells, for those element with no cell detection, b has zero value
@@ -154,11 +155,11 @@ b_dist = dist;
 % remove the disatances for those elements with no detected cells from distance array
 b_dist(b_ind)=-1;
 
-b_ind_rc = find(b_rc==0);
-b_dist_rc = dist1;
-b_dist_rc(b_ind_rc)=-1;
+% b_ind_rc = find(b_rc==0);
+% b_dist_rc = dist1;
+% b_dist_rc(b_ind_rc)=-1;
 % calculate Ttrue Positive, number of detected cells that have low enough distance to one of the gt's elements.
-TP = sum(b_dist>=0) - sum(b_dist>D) + (sum(b_dist_rc>=0) - sum(b_dist_rc>D));
+TP = sum(b_dist>=0) - sum(b_dist>D); % + (sum(b_dist_rc>=0) - sum(b_dist_rc>D));
 % calculate False Negative, number of gt's elements with no detected cells.
 FN = size(ref, 1) - TP;
 % calculate False Positive, number of detected cells, which their distance to the gt's element is long
@@ -179,3 +180,5 @@ fprintf(fid_text, 'Precision = %f\n', Precision);
 fprintf(fid_text, 'Accuracy = %f\n', Accuracy);
 fprintf(fid_text, 'Recall = %f\n', Recall);
 fclose(fid_text);
+clear all;
+%end
\ No newline at end of file
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d21dd51..387d47e 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -10,6 +10,9 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}")
 #set up CUDA
 find_package(CUDA REQUIRED)
 
+#set up opencv
+find_package(opencv)
+
 #find the STIM library
 find_package(STIM REQUIRED)
 
diff --git a/cpp/args.txt b/cpp/args.txt
new file mode 100644
index 0000000..6302d1b
--- /dev/null
+++ b/cpp/args.txt
@@ -0,0 +1 @@
+nissl-float-128.128.128.vol 0-out --t 2350 --x 128 --y 128 --z 128 --anisotropy 1 --invert
\ No newline at end of file
diff --git a/cpp/cudafunc.cu b/cpp/cudafunc.cu
index 5a89eaf..fe5734d 100644
--- a/cpp/cudafunc.cu
+++ b/cpp/cudafunc.cu
@@ -1,7 +1,15 @@
+/*#include "circle_check.cuh"
+
+void test_3(float* gpu_out, float* gpu_grad, float rmax, float phi, int n, int x, int y, int z){
+gpu_test3(gpu_out, gpu_grad, rmax, phi, n, x, y, z);
+}
+*/
+
+
 #include "gaussian_blur3.cuh"
 #include "gradient3.cuh"
 #include "mag3.cuh"
-#include "vote3.cuh"
+#include "vote3_atomic_aabb.cuh"
 #include "update_dir3.cuh"
 #include "local_max3.cuh"
 
@@ -10,11 +18,11 @@ void ivote3(float* img, float sigma[], float anisotropy, float phi, float d_phi,
 			int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
 
 	
-	cudaSetDevice(0);
+	cudaSetDevice(1);
 	// compute the number of bytes in the input data
 	unsigned int bytes = x * y * z * sizeof(float);
 
-	//assign memory on gpu for the input data.
+	//assign memory on gpu for the input data.z
 	float* gpuI0;
 	cudaMalloc(&gpuI0, bytes);	
 
@@ -41,18 +49,17 @@ void ivote3(float* img, float sigma[], float anisotropy, float phi, float d_phi,
 
 	//call the vote function.
 	for (int i = 0; i < iter; i++){
-
+	
+		cudaMemset(gpu_vote, 0, bytes);
 		gpu_vote3<float>(gpu_vote, gpu_grad, cos_phi, r, x, y, z);
 		cudaDeviceSynchronize();
-		/*if (i==7)
-			cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);*/
-		
-		if (phi >= d_phi){	
+	
+		//if (phi >= d_phi){	
 			gpu_update_dir3<float>(gpu_grad, gpu_vote, cos_phi, r, x, y, z);
 			cudaDeviceSynchronize();
 			phi = phi - d_phi;
 			cos_phi = cos(phi);
-		}
+		//}
 	
 	}
 
@@ -78,6 +85,8 @@ void ivote3(float* img, float sigma[], float anisotropy, float phi, float d_phi,
 void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
 	unsigned int bytes = x * y * z * sizeof(float);
 
+	cudaSetDevice(1);
+
 	//assign memory on gpu for the input data.
 	float* gpuV;
 	cudaMalloc(&gpuV, bytes);	
@@ -96,4 +105,5 @@ void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, u
 	
 	cudaFree(gpuV);
 	cudaFree(gpuOut);
-}
\ No newline at end of file
+}
+
diff --git a/cpp/main.cpp b/cpp/main.cpp
index 47d76d8..d2a9636 100644
--- a/cpp/main.cpp
+++ b/cpp/main.cpp
@@ -9,6 +9,84 @@
 #include <stim/image/image.h>
 #define pi	3.14159
 
+#define M_PI	3.14159
+#include <stim/math/circle.h>
+#include <stim/math/vec3.h>
+#include <stim/math/plane.h>
+#include <stim/math/vector.h>
+//#include <cuda.h>
+//#include <stim/cuda/cudatools.h>
+//#include <stim/cuda/cudatools/error.h>
+
+
+/*void test_3(float* gpu_out, float* gpu_grad, float rmax, float phi, int n, int x, int y, int z);
+
+int main(){
+	
+	int  n=20;
+	float rmax;
+	float phi_deg;
+	float phi;
+	rmax=4;
+	phi_deg = 15;
+	phi = phi_deg * pi/180;
+	int x,y,z;
+	x=y=z=1;
+	unsigned int size = x*y*z*sizeof (float);
+	float* cpu_grad = (float*) malloc(3*size);
+	float* gpu_grad;
+	cudaMalloc(&gpu_grad, 3*size);
+	cpu_grad[0]=1;
+	cpu_grad[1]=0;
+	cpu_grad[2]=-0.5;
+	cudaMemcpy(gpu_grad, cpu_grad, 3*size, cudaMemcpyHostToDevice);
+	float* cpu_out = (float*) malloc(3*size*(n+1));
+	float* gpu_out;
+	cudaMalloc(&gpu_out, 3*size*(n+1));
+	test_3(gpu_out, gpu_grad, rmax, phi, n, x, y, z);
+	cudaMemcpy(cpu_out, gpu_out, 3*size*(n+1), cudaMemcpyDeviceToHost);
+	std::ofstream list("circle_check_cuda1.txt");
+	if (list.is_open()){
+		for (int j=0; j<=n; ++j)
+			list << cpu_out[3*j] << '\t' << cpu_out[3*j +1] << '\t' << cpu_out[3*j + 2] << '\n';
+	}
+	list.close();
+	*/
+	/*
+	int  n=10;
+	stim::circle<float>  cir;
+	float* c0= (float*) malloc(3*sizeof(float));
+	c0[0] =-4;
+	c0[1]=0;
+	c0[2] = 3;
+	stim::vec3<float> c(c0[0],c0[1],c0[2]);
+	float len = c.len();
+	stim::vec3<float> norm(c0[0]/len,c0[1]/len,c0[2]/len);
+	std::cout<< len << '\n';
+	std::cout<< norm << '\n';
+	cir.center(c);
+	cir.normal(norm);
+	cir.scale(2);
+	stim::vec3<float> out = cir.p(45);
+	std::vector<stim::vec3<float>> out2 = cir.getPoints(n);
+	
+	std::cout<< out << '\n';
+	std::cout <<out[0] << '\t' << out[1] << '\t' << out[2] <<'\n';
+	std::cout<< c << '\n';
+	
+	for (std::vector<stim::vec3<float>>::const_iterator i = out2.begin(); i != out2.end(); ++i)
+    std::cout << *i << '\n';
+	std::ofstream list("circle_check.txt");
+	if (list.is_open()){
+		for (std::vector<stim::vec3<float>>::const_iterator j = out2.begin(); j != out2.end(); ++j)
+			list << *j << '\n';
+	}
+	list.close();
+	std::cin >> n;
+
+}
+
+*/
 
 void ivote3(float* img, float std[], float anisotropy, float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], 
 			unsigned int x, unsigned int y, unsigned int z);
@@ -98,7 +176,7 @@ int main(int argc, char** argv){
 	float t = args["t"].as_float();
 	//set the anisotropy
 	float anisotropy =  args["anisotropy"].as_float();
-	unsigned int rmax = 10 ;
+	unsigned int rmax = 10;
 	unsigned int r[3] = { 12, rmax, rmax};
 	float std = 5;
 	float sigma[3] = { std, std, std};
@@ -137,50 +215,51 @@ int main(int argc, char** argv){
 	std::ofstream fblur("shared2D-v8/vote8.vol", std::ofstream::out | std::ofstream::binary);
 	fblur.write((char*)cpuI, bytes);
 	fblur.close();
-	/*
-	stim::image<float>imgrad3;
-	imgrad3.set_interleaved3(cpu_out, 128,128,128,3);
-	std::ofstream fgx("syn/gx-128.vol", std::ofstream::out | std::ofstream::binary);
-	fgx.write((char*)imgrad3.channel(0).data(), bytes);
-	fgx.close();
-	*/
+	
+	//stim::image<float>imgrad3;
+	//imgrad3.set_interleaved3(cpu_out, 128,128,128,3);
+	//std::ofstream fgx("syn/gx-128.vol", std::ofstream::out | std::ofstream::binary);
+	//fgx.write((char*)imgrad3.channel(0).data(), bytes);
+	//fgx.close();
+	
 	//write the output file.
-	//for (int t0=2000; t0<=2500; t0+=100){
+	//for (int t0=0; t0<=5000; t0+=100){
+	//	float t1 = t0;
 		int t0 = t;
 			lmax(cpu_out, cpuI, t, conn, x, y, z);
 			//std::ofstream fo("shared2D-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary);
-			std::ofstream fo("shared2D-v8/" + OutName.str()+std::to_string(t0)+".vol", std::ofstream::out | std::ofstream::binary);
+			std::ofstream fo( OutName.str()+std::to_string(t0)+".vol", std::ofstream::out | std::ofstream::binary);
 			fo.write((char*)cpu_out, bytes);
 			fo.close();
 	
 	// creat a file for saving the list centers
 	
-		std::ofstream list("shared2D-v8/" + OutName.str()+std::to_string(t0)+".obj");
-		// set the number of detected cells to zero.
-		int nod = 0;
-		if (list.is_open()){
-
-				for (int iz=0; iz<z; iz++){
-					for (int iy=0; iy<y; iy++){
-						for (int ix=0; ix<x; ix++){
-
-							int idx = iz * x * y + iy * x + ix;
-							if (cpu_out[idx]==1){
-								nod++;
-								list << "v" << "\t" << ix << "\t" << iy << "\t"<< iz << '\n' ;
-					
-							}
-						}
-					}
-				}
-				list << "p" << "\t";
-				for (unsigned int i_nod =1 ; i_nod <=nod; i_nod++){
-					list << i_nod << "\t";
-				}
+		//std::ofstream list("shared2D-v8/" + OutName.str()+std::to_string(t0)+".obj");
+		//// set the number of detected cells to zero.
+		//int nod = 0;
+		//if (list.is_open()){
 
-		list.close();
-		}
-	
+		//		for (int iz=0; iz<z; iz++){
+		//			for (int iy=0; iy<y; iy++){
+		//				for (int ix=0; ix<x; ix++){
+
+		//					int idx = iz * x * y + iy * x + ix;
+		//					if (cpu_out[idx]==1){
+		//						nod++;
+		//						list << "v" << "\t" << ix << "\t" << iy << "\t"<< iz << '\n' ;
+		//			
+		//					}
+		//				}
+		//			}
+		//		}
+		//		list << "p" << "\t";
+		//		for (unsigned int i_nod =1 ; i_nod <=nod; i_nod++){
+		//			list << i_nod << "\t";
+		//		}
+
+		//list.close();
+		//}
+	//}
 		cudaDeviceReset();       
 	
 }
diff --git a/cpp/update_dir3_aabb.cuh b/cpp/update_dir3_aabb.cuh
new file mode 100644
index 0000000..1d533c5
--- /dev/null
+++ b/cpp/update_dir3_aabb.cuh
@@ -0,0 +1,193 @@
+#ifndef STIM_CUDA_UPDATE_DIR3_AABB_H
+#define STIM_CUDA_UPDATE_DIR3_AABB_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include "cpyToshare.cuh"
+#define M_PI	3.14159
+#include <stim/math/circle.h>
+#include <stim/math/vec3.h>
+#include <stim/math/plane.h>
+#include <stim/math/vector.h>
+#include <stim/visualization/aabb3.h>
+
+		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
+		template<typename T>
+		__global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz,  int x,  int y, int z){
+			//extern __shared__ float s_vote[];
+			
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;			//calculate x,y,z coordinates for this thread
+			
+			int grid_y = y / blockDim.y;						//find the grid size along y
+			int blockidx_y = blockIdx.y % grid_y;
+			int yi = blockidx_y * blockDim.y + threadIdx.y;
+			int zi = blockIdx.y / grid_y;
+			if(xi >= x|| yi >= y || zi>= z) return;
+			int i = zi * x * y + yi * x + xi;				//compute the global 1D index for this pixel
+
+			
+			// find the starting points for this block along the x and y directions
+			//int bxi = blockIdx.x * blockDim.x;
+			//int byi = blockidx_y * blockDim.y;
+			//find the starting points and the size of the window, which will be copied to the 2D-shared memory
+			//int bxs = bxi - rx;
+			//int bys = byi - ry;
+			//int xwidth = 2 * rx + blockDim.x;
+			//int ywidth = 2 * ry + blockDim.y;
+			//compute the coordinations of this pixel in the 2D-shared memory.
+			//int sx_rx = threadIdx.x + rx;
+			//int sy_ry = threadIdx.y + ry;
+
+			float rx_sq = rx * rx;        // compute the square for rmax 
+			float ry_sq = ry * ry;
+			float rz_sq = rz * rz;
+			
+			stim::vec3<float> g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]);   // form a vec3 variable for the gradient vector
+			stim::vec3<float> g_sph = g.cart2sph();			//convert cartesian coordinate to spherical for the gradient vector
+			int n =4;										//set the number of points to find the boundaries of the conical voting area
+			float xc = rx * cos(g_sph[1]) * sin(g_sph[2]) ;			//calculate the center point of the surface of the voting area for the voter
+			float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ;
+			float zc = rz * cos(g_sph[2]) ;
+			float r = sqrt(xc*xc + yc*yc + zc*zc);
+			xc+=xi;
+			yc+=yi;
+			zc+=zi;
+			stim::vec3<float> center(xc,yc,zc);
+			float d = 2 * r * tan(acos(cos_phi) );		//find the diameter of the conical voting area
+			stim::vec3<float> norm = g.norm();			//compute the normalize gradient vector
+			float step = 360.0/(float) n;
+			stim::circle<float>  cir(center, d, norm);
+			stim::aabb3<int> bb(xi,yi,zi);
+			bb.insert(xc,yc,zc);
+			for(float j = 0; j <360.0; j += step){
+				stim::vec3<float> out = cir.p(j);
+				bb.insert(out[0], out[1], out[2]);
+			}
+
+			bb.trim_low(0,0,0);
+			bb.trim_high(x-1, y-1, z-1);
+			int bx,by,bz;
+			int dx, dy, dz;
+			float dx_sq, dy_sq, dz_sq;
+			
+			float dist, cos_diff;
+			int idx_c;
+
+			float max = 0;					// define a local variable to maximum value of the vote image in the voting area for this voter
+			float l_vote = 0;
+			
+			float id_x = g[0];				// define local variables for the x, y, and z coordinations point to the vote direction
+			float id_y = g[1];
+			float id_z = g[2];
+
+			for (bz=bb.low[2]; bz<=bb.high[2]; bz++){
+				dz = bz - zi;							//compute the distance bw the voter and the current counter along z axis
+				dz_sq = dz * dz;
+				for (by=bb.low[1]; by<=bb.high[1]; by++){
+					dy = by - yi;								//compute the distance bw the voter and the current counter along y axis
+					dy_sq = dy * dy;
+					for (bx=bb.low[0]; bx<=bb.high[0]; bx++){
+						dx = bx - xi;								//compute the distance bw the voter and the current counter along x axis
+						dx_sq = dx * dx;
+
+						dist = sqrt(dx_sq + dy_sq + dz_sq);			//calculate the distance between the voter and the current counter
+						cos_diff = (norm[0] * dx + norm[1] * dy +  norm[2] * dz)/dist;			 // calculate the cosine of angle between the voter and the current counter
+						if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){			//check if the current counter located in the voting area of the voter    
+							idx_c = (bz* y + by) * x + bx;			//calculate the 1D index for the current counter
+							l_vote = gpu_vote[idx_c];
+							if  (l_vote>max) {
+								max = l_vote;
+								id_x = dx;
+								id_y = dy;
+								id_z = dz;
+							}
+						}								
+					}						
+				}				
+			}
+			float m_id = sqrt (id_x*id_x + id_y*id_y + id_z*id_z);
+			gpu_dir[i * 3 + 0] = g_sph[0] * (id_x/m_id);
+			gpu_dir[i * 3 + 1] = g_sph[0] * (id_y/m_id);
+			gpu_dir[i * 3 + 2] = g_sph[0] * (id_z/m_id);
+		}
+
+
+
+		// this kernel updates the gradient direction by the calculated voting direction.
+		template<typename T>
+		__global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){
+
+			//calculate x,y,z coordinates for this thread
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			//find the grid size along y
+			int grid_y = y / blockDim.y;
+			int blockidx_y = blockIdx.y % grid_y;
+			int yi = blockidx_y * blockDim.y + threadIdx.y;
+			int zi = blockIdx.y / grid_y;
+			int i = zi * x * y + yi * x + xi;
+
+			if(xi >= x || yi >= y || zi >= z) return;
+			//update the gradient image with the new direction direction
+			gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0];
+			gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1];
+			gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2];
+		}
+		
+		template<typename T>
+		void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(sqrt (max_threads),sqrt (max_threads));
+			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
+			//unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*sizeof(T);			
+			// allocate space on the GPU for the updated vote direction
+			T* gpu_dir;
+			cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3);	
+
+			//call the kernel to calculate the new voting direction
+			update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z);
+			
+			
+			//call the kernel to update the gradient direction
+			update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z);
+			
+			//free allocated memory
+			cudaFree(gpu_dir);
+
+		}
+		
+		template<typename T>
+		void cpu_update_dir3(T* cpu_grad, T* cpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * z * sizeof(T);
+
+			//allocate space on the GPU for the Vote data
+			T* gpu_vote;
+			cudaMalloc(&gpu_vote, bytes);
+
+			//copy the input vote data to the GPU
+			cudaMemcpy(gpu_vote, cpu_vote, bytes, cudaMemcpyHostToDevice);	
+
+			//allocate space on the GPU for the Gradient data
+			T* gpu_grad;
+			cudaMalloc(&gpu_grad, bytes*3);
+
+			//copy the Gradient data to the GPU
+			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
+
+			//call the GPU version of the update direction function
+			gpu_update_dir3<T>(gpu_grad, gpu_vote, cos_phi, r, x , y, z);
+							
+			//copy the new gradient image back to the CPU
+			cudaMemcpy(cpu_grad, gpu_grad, bytes*3, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpu_vote);
+			cudaFree(gpu_grad);
+		}
+		
+
+
+#endif
\ No newline at end of file
diff --git a/cpp/vote3_atomic.cuh b/cpp/vote3_atomic.cuh
new file mode 100644
index 0000000..8d7f609
--- /dev/null
+++ b/cpp/vote3_atomic.cuh
@@ -0,0 +1,113 @@
+#ifndef STIM_CUDA_VOTE3_ATOMIC_H
+#define STIM_CUDA_VOTE3_ATOMIC_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/cudatools/error.h>
+#include "cpyToshare.cuh"
+
+		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
+		template<typename T>
+		__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
+
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;						//calculate x,y,z coordinates for this thread
+			
+			int grid_y = y / blockDim.y;										//find the grid size along y
+			int blockidx_y = blockIdx.y % grid_y;
+			int yi = blockidx_y * blockDim.y + threadIdx.y;
+			int zi = blockIdx.y / grid_y;
+						
+			if(xi>=x || yi>=y || zi>=z) return;
+
+			int i = zi * x * y + yi * x + xi;	  // calculate the 1D index of the voter
+			float gx_v = gpu_grad[3*i];           // find the gradient information in cartesian coordinate for the voter - global memory fetch
+			float gy_v = gpu_grad[3*i+1];
+			float gz_v = gpu_grad[3*i+2];
+
+			float mag_v = sqrt(gx_v*gx_v + gy_v*gy_v + gz_v*gz_v);  // compute the gradient magnitude for the voter
+
+			float gx_v_n = gx_v/mag_v;      // normalize the gradient vector for the voter
+			float gy_v_n = gy_v/mag_v;
+			float gz_v_n = gz_v/mag_v;
+
+			float rx_sq = rx * rx;        // compute the square for rmax 
+			float ry_sq = ry * ry;
+			float rz_sq = rz * rz;
+			float x_sq, y_sq, z_sq, d_c, cos_diff;
+			int xi_c, yi_c, zi_c, idx_c;
+
+			for (int z_c=-rz; z_c<=rz; z_c++){
+				zi_c = zi + z_c;              // calculate the z position for the current counter
+				if (zi_c <z && zi_c>=0){      // make sure the current counter is inside the image
+					z_sq = z_c * z_c;
+					for (int y_c=-ry; y_c<=ry; y_c++){
+						yi_c = yi + y_c;
+						if (yi_c < y && yi_c>=0){
+							y_sq = y_c * y_c;
+							for (int x_c=-rx; x_c<=rx; x_c++){
+								xi_c = xi + x_c;
+								if (xi_c < x && xi_c>=0){
+									x_sq = x_c * x_c;
+									d_c = sqrt(x_sq + y_sq + z_sq);			//calculate the distance between the voter and the current counter
+									cos_diff = (gx_v_n * x_c + gy_v_n * y_c +  gz_v_n * z_c)/(d_c);			 // calculate the cosine of angle between the voter and the current counter
+									if ( ( (x_sq/rx_sq + y_sq/ry_sq + z_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){      
+										idx_c = (zi_c * y + yi_c) * x + xi_c;			//calculate the 1D index for the current counter
+										atomicAdd (&gpu_vote[idx_c] , mag_v);
+									}
+								}
+							}
+						}
+					}
+				}
+			}	
+		}
+
+		template<typename T>
+		void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+
+			
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(sqrt (max_threads),sqrt (max_threads));
+			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
+			//unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T);			
+			//call the kernel to do the voting
+			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
+
+		}
+
+
+		template<typename T>
+		void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * z * sizeof(T);
+
+			
+			//allocate space on the GPU for the Vote Image
+			T* gpu_vote;
+			cudaMalloc(&gpu_vote, bytes);		
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpu_grad;
+			cudaMalloc(&gpu_grad, bytes*3);
+			
+			
+			//copy the Gradient data to the GPU
+			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
+			
+					
+			//call the GPU version of the vote calculation function
+			gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
+							
+			//copy the Vote Data back to the CPU
+			cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpu_vote);
+			cudaFree(gpu_grad);
+			
+		}
+
+
+#endif
\ No newline at end of file
diff --git a/cpp/vote3_atomic_aabb.cuh b/cpp/vote3_atomic_aabb.cuh
new file mode 100644
index 0000000..ab0b802
--- /dev/null
+++ b/cpp/vote3_atomic_aabb.cuh
@@ -0,0 +1,133 @@
+#ifndef STIM_CUDA_VOTE3_ATOMIC_AABB_H
+#define STIM_CUDA_VOTE3_ATOMIC_AABB_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/cudatools/error.h>
+#include "cpyToshare.cuh"
+#define M_PI	3.14159
+#include <stim/math/circle.h>
+#include <stim/math/vec3.h>
+#include <stim/math/plane.h>
+#include <stim/math/vector.h>
+#include <stim/visualization/aabb3.h>
+
+		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
+		template<typename T>
+		__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
+
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;						//calculate x,y,z coordinates for this thread
+			
+			int grid_y = y / blockDim.y;										//find the grid size along y
+			int blockidx_y = blockIdx.y % grid_y;
+			int yi = blockidx_y * blockDim.y + threadIdx.y;
+			int zi = blockIdx.y / grid_y;
+						
+			if(xi>=x || yi>=y || zi>=z) return;
+
+			int i = zi * x * y + yi * x + xi;	  // calculate the 1D index of the voter
+			
+
+			float rx_sq = rx * rx;        // compute the square for rmax 
+			float ry_sq = ry * ry;
+			float rz_sq = rz * rz;
+			float dist, cos_diff;
+			int idx_c;
+
+			//float rmax = sqrt(rx_sq + ry_sq + rz_sq);
+			stim::vec3<float> g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]);   // form a vec3 variable for the gradient vector
+			stim::vec3<float> g_sph = g.cart2sph();			//convert cartesian coordinate to spherical for the gradient vector
+			int n =4;										//set the number of points to find the boundaries of the conical voting area
+			float xc = rx * cos(g_sph[1]) * sin(g_sph[2]);			//calculate the center point of the surface of the voting area for the voter
+			float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ;
+			float zc = rz * cos(g_sph[2]) ;
+			float r = sqrt(xc*xc + yc*yc + zc*zc);
+			xc+=xi;
+			yc+=yi;
+			zc+=zi;
+			stim::vec3<float> center(xc,yc,zc);
+			
+			float d = 2 * r * tan(acos(cos_phi) );		//find the diameter of the conical voting area
+			stim::vec3<float> norm = g.norm();			//compute the normalize gradient vector
+			float step = 360.0/(float) n;
+			stim::circle<float>  cir(center, d, norm);
+			stim::aabb3<int> bb(xi,yi,zi);
+			bb.insert(xc,yc,zc);
+			for(float j = 0; j <360.0; j += step){
+				stim::vec3<float> out = cir.p(j);
+				bb.insert(out[0], out[1], out[2]);
+			}
+
+			bb.trim_low(0,0,0);
+			bb.trim_high(x-1, y-1, z-1);
+			int bx,by,bz;
+			int dx, dy, dz;
+			float dx_sq, dy_sq, dz_sq;
+			for (bz=bb.low[2]; bz<=bb.high[2]; bz++){
+				dz = bz - zi;							//compute the distance bw the voter and the current counter along z axis
+				dz_sq = dz * dz;
+				for (by=bb.low[1]; by<=bb.high[1]; by++){
+					dy = by - yi;								//compute the distance bw the voter and the current counter along y axis
+					dy_sq = dy * dy;
+					for (bx=bb.low[0]; bx<=bb.high[0]; bx++){
+						dx = bx - xi;								//compute the distance bw the voter and the current counter along x axis
+						dx_sq = dx * dx;
+
+						dist = sqrt(dx_sq + dy_sq + dz_sq);			//calculate the distance between the voter and the current counter
+						cos_diff = (norm[0] * dx + norm[1] * dy +  norm[2] * dz)/dist;			 // calculate the cosine of angle between the voter and the current counter
+						if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){			//check if the current counter located in the voting area of the voter    
+							idx_c = (bz* y + by) * x + bx;			//calculate the 1D index for the current counter
+							atomicAdd (&gpu_vote[idx_c] , g_sph[0]);
+						}								
+					}						
+				}				
+			}	
+		}
+
+		template<typename T>
+		void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+
+			
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(sqrt (max_threads),sqrt (max_threads));
+			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
+			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);			//call the kernel to do the voting
+
+		}
+
+
+		template<typename T>
+		void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * z * sizeof(T);
+
+			
+			//allocate space on the GPU for the Vote Image
+			T* gpu_vote;
+			cudaMalloc(&gpu_vote, bytes);		
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpu_grad;
+			cudaMalloc(&gpu_grad, bytes*3);
+			
+			
+			//copy the Gradient data to the GPU
+			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
+			
+					
+			//call the GPU version of the vote calculation function
+			gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
+							
+			//copy the Vote Data back to the CPU
+			cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpu_vote);
+			cudaFree(gpu_grad);
+			
+		}
+
+
+#endif
\ No newline at end of file
--
libgit2 0.21.4