update the ivote3 project

Laila Saadatifard
1 parent 2558ee86
Showing 7 changed files with 257 additions and 243 deletions Show diff stats
Matlab_3D/local_max.m
cpp/cudafunc.cu
cpp/gaussian_blur3.cuh
cpp/gradient3.cuh
cpp/main.cpp
cpp/update_dir3_aabb.cuh
cpp/vote3_atomic_aabb.cuh
@@ -26,8 +26,8 @@ for li = 1: nl
 	ind_m = sub2ind(size(Ivote), cx, cy, cz);
 	lm_Ivote = A(ind_m);
 	[~, lm_indx] = max(lm_Ivote(:));
-	if ind_f(li) == ind_m(lm_indx);
-		Icenter(ind_f(li)) = 255;
+	if ind_f(li) == ind_m(lm_indx)
+		Icenter(ind_f(li)) = A(ind_f(li));  %255
 	end
 end
  
@@ -5,71 +5,65 @@
 #include "vote3_atomic_aabb.cuh"
 #include "update_dir3_aabb.cuh"
 #include "local_max3.cuh"
-
+#include <fstream>
  
 void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
-			int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
+			int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){
  
  
-	cudaSetDevice(0);
-	
-	unsigned int bytes = x * y * z * sizeof(float);				// compute the number of bytes in the input data
+	size_t bytes = x * y * z * sizeof(float);				// compute the number of bytes in the input data
  
 	float* gpuI0;											//assign memory on gpu for the input data
 	cudaMalloc(&gpuI0, bytes);	
 	cudaMemcpy(gpuI0, img, bytes, cudaMemcpyHostToDevice);				//copy the image data to the GPU.
  
-	
 	gpu_gaussian_blur3<float>(gpuI0, sigma, x, y, z);							//call the blurring function from the gpu.
 	cudaDeviceSynchronize();
  
 	float* gpu_grad;												//assign memory on the gpu for the gradient along the X, y, z.
 	cudaMalloc(&gpu_grad, bytes*3);
  
-	gpu_gradient3<float>(gpu_grad, gpuI0, 1, x, y, z);				//call the gradient function from the gpu.
+	gpu_gradient3<float>(gpu_grad, gpuI0, x, y, z);				//call the gradient function from the gpu.
 	cudaFree(gpuI0);
  
 	float* gpu_vote;
 	cudaMalloc(&gpu_vote, bytes);
  
 	float cos_phi = cos(phi);
-
+	
 	//call the vote function.
 	for (int i = 0; i < iter; i++){
  
 		cudaMemset(gpu_vote, 0, bytes);
 		gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z);
 		cudaDeviceSynchronize();
-	
-		//if (phi >= d_phi){	
+		if (i == 0) {
+			cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
+			std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary);
+			fvote.write((char*)img, bytes);
+			fvote.close();
+		}
+		if (i == 1) {
+			cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
+			std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary);
+			fvote.write((char*)img, bytes);
+			fvote.close();
+		}
 			gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z);
-			cudaDeviceSynchronize();
-			phi = phi - d_phi;
-			cos_phi = cos(phi);
-		//}
-	
+		cudaDeviceSynchronize();
+		phi = phi - d_phi;
+		cos_phi = cos(phi);
 	}
  
 	cudaFree(gpu_grad);
 	cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
  
-	//allocate space on the gpu for the final detected cells.
-	//float* gpu_output;
-	//cudaMalloc(&gpu_output, bytes);
-
-	////call the local max function
-	//gpu_local_max3<float>(gpu_output, gpu_vote, t, conn, x, y, z);
-
-	////copy the final result to the cpu.
-	//cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost);
-	//	
-	//
 	cudaFree(gpu_vote);
 	//cudaFree(gpu_output);
  
 }
  
-void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
+void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){
 	unsigned int bytes = x * y * z * sizeof(float);
  
 	cudaSetDevice(0);
@@ -89,5 +83,4 @@ void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, u
  
 	cudaFree(gpuV);
 	cudaFree(gpuOut);
-}
-
+}
 \ No newline at end of file
@@ -8,6 +8,7 @@
  
 #define pi	3.14159
  
+
  
 		template<typename T>
 		__global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){
@@ -22,7 +23,7 @@
 			int i = zi * x * y + yi * x + xi;
  
 			// calculate the kernel size			
-			int k_size = sigma * 4;
+			T k_size = sigma * 4;
  
 			//if the current pixel is outside of the image
 			if(xi >= x || yi >= y || zi>=z)
@@ -33,14 +34,21 @@
 			T G;
 			T sum = 0;		//running weighted sum across the kernel
 			out[i] = 0;
-			T sigma_sq = 2 * sigma * sigma;	
-			T a = 1.0 / (sigma * sqrt(2 * pi));
+			T sigma_sq = 2.0 * sigma * sigma;	
+			T a = 1.0 / (sigma * sqrt(2.0 * pi));
+
+			//handle the boundary in x direction
+			int kl = -(int)k_size;
+			//if (xi < k_size) kl = -xi;
+			int kh = (int)k_size;
+			//if (xi >= x - (int)k_size) kh = x - xi - 1;
+
  
 			//for each element of the kernel
-			for(int ki = -k_size; ki <= k_size; ki++){
+			for(int ki = kl; ki <= kh; ki++){
  
 				//calculate the gaussian value
-				G = a * exp(-(ki*ki) / (sigma_sq));
+				G = a * exp(-(T)(ki*ki) / (sigma_sq));
 				//calculate the global coordinates for this point in the kernel
 				gx = (xi + ki) % x;					
 				gi = zi * x * y + yi * x + gx;
@@ -65,7 +73,7 @@
 			int i = zi * x * y + yi * x + xi;
  
 			// calculate the kernel size			
-			int k_size = sigma * 4;
+			T k_size = sigma * 4;
  
 			//if the current pixel is outside of the image
 			if(xi >= x || yi >= y || zi>=z)
@@ -76,14 +84,22 @@
 			T G;
 			T sum = 0;		//running weighted sum across the kernel
 			out[i] = 0;
-			T sigma_sq = 2 * sigma * sigma;	
-			T a = 1.0 / (sigma * sqrt(2 * pi));	
+			T sigma_sq = 2.0 * sigma * sigma;	
+			T a = 1.0 / (sigma * sqrt(2.0 * pi));	
+
+			//handle the boundary in y direction
+			int kl = -(int)k_size;
+			//if (yi < k_size) kl = -yi;
+			int kh = (int)k_size;
+			//if (yi >= y - (int)k_size) kh = y - yi - 1;
+
+
  
 			//for each element of the kernel
-			for(int ki = -k_size; ki <= k_size; ki++){
+			for(int ki = kl; ki <= kh; ki++){
  
 				//calculate the gaussian value
-				G = a * exp(-(ki*ki) / sigma_sq);
+				G = a * exp(-(T)(ki*ki) / sigma_sq);
 				//calculate the global coordinates for this point in the kernel
 				gy = (yi + ki ) % y;				
 				gi = zi * x * y + gy * x + xi;
@@ -108,7 +124,7 @@
 			int i = zi * x * y + yi * x + xi;
  
 			// calculate the kernel size			
-			int k_size = sigma * 4;
+			T k_size = sigma * 4;
  
 			//if the current pixel is outside of the image
 			if(xi >= x || yi >= y || zi>=z)
@@ -119,14 +135,20 @@
 			T G;
 			T sum = 0;		//running weighted sum across the kernel
 			out[i] = 0;
-			T sigma_sq = 2 * sigma * sigma;	
-			T a = 1.0 / (sigma * sqrt(2 * pi));	
+			T sigma_sq = 2.0 * sigma * sigma;	
+			T a = 1.0 / (sigma * sqrt(2.0 * pi));	
+
+			//handle the boundary in z direction
+			int kl = -(int)k_size;
+			//if (zi < k_size) kl = -zi;
+			int kh = (int)k_size;
+			//if (zi >= z - (int)k_size) kh = z - zi - 1;
  
 			//for each element of the kernel
-			for(int ki = -k_size; ki <= k_size; ki++){
+			for(int ki = kl; ki <= kh; ki++){
  
 				//calculate the gaussian value
-				G = a * exp(-(ki*ki) / sigma_sq);
+				G = a * exp(-(T)(ki*ki) / sigma_sq);
 				//calculate the global coordinates for this point in the kernel
 				gz = (zi + ki) % z;				
 				gi = gz * x * y + yi * x + xi;					
@@ -138,13 +160,13 @@
 		}
  
 		template<typename T>
-		void gpu_gaussian_blur3(T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){
+		void gpu_gaussian_blur3(T* image, T sigma[], size_t x, size_t y, size_t z){
  
 			//get the number of pixels in the image
-			unsigned int pixels = x * y * z;
-			unsigned int bytes = sizeof(T) * pixels;
+			size_t pixels = x * y * z;
+			size_t bytes = sizeof(T) * pixels;
  
-			int max_threads = stim::maxThreadsPerBlock();
+			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(sqrt (max_threads),sqrt (max_threads));
 			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  
@@ -7,7 +7,7 @@
 #include <stim/cuda/cudatools/error.h>
  
 template<typename T>
-__global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){
+__global__ void gradient3(T* out, T* in, int x, int y, int z){
  
 	//calculate x,y,z coordinates for this thread
 	int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -55,30 +55,28 @@ __global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){
 	if(zi > 0 && zi < z-1)
 		out[i * 3 + 2] = (in[i_zp] - in[i_zn]) / 2;
  
-	out[i * 3 + 2] *= 1/anisotropy;
-
 }
  
 template<typename T>
  
-void gpu_gradient3(T* gpuGrad, T* gpuI, float anisotropy, unsigned int x, unsigned int y, unsigned int z){
+void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){
  
  
-	int max_threads = stim::maxThreadsPerBlock();
+	unsigned int max_threads = stim::maxThreadsPerBlock();
 	dim3 threads(sqrt (max_threads),sqrt (max_threads));
 	dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  
 	//call the GPU kernel to determine the gradient
-	gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, anisotropy, x, y, z);
+	gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z);
  
 }
  
 template<typename T>
-void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int y, unsigned int z){
+void cpu_gradient3(T* out, T* in, size_t x, size_t y, size_t z){
  
 	//get the number of pixels in the image
-	unsigned int pixels = x * y * z;
-	unsigned int bytes = pixels * sizeof(T);
+	size_t pixels = x * y * z;
+	size_t bytes = pixels * sizeof(T);
  
 	//allocate space on the GPU for the input image
 	T* gpuIn;
@@ -92,7 +90,7 @@ void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int
 	cudaMalloc(&gpuOut, bytes * 3);		//the output image will have two channels (x, y)
  
 	//call the GPU version of this function
-	gpu_gradient3(gpuOut, gpuIn, anisotropy, x, y, z);	
+	gpu_gradient3(gpuOut, gpuIn, x, y, z);	
  
 	//copy the results to the CPU
 	cudaMemcpy(out, gpuOut, bytes * 3, cudaMemcpyDeviceToHost);
@@ -19,21 +19,22 @@ float phi;
 size_t x, y, z;
  
  
-void ivote3(float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], 
-			unsigned int x, unsigned int y, unsigned int z);
-void lmax(float* center, float* vote, float t1, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z);
-
-void invert_data(float* cpuI, unsigned int x, unsigned int y, unsigned int z){
-		for(int ix = 0; ix < x; ix++){
-			for (int iy = 0; iy < y; iy++){
-				for (int iz = 0; iz < z; iz++){
-					int idx = iz * x * y + iy * x + ix;
-					cpuI[idx] = 255 - cpuI[idx];
-				}
+void ivote3(float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[],
+	size_t x, size_t y, size_t z);
+void lmax(float* center, float* vote, float t1, unsigned int conn[], size_t x, size_t y, size_t z);
+
+void invert_data(float* cpuI, size_t x, size_t y, size_t z) {
+	size_t idx;
+	for (size_t ix = 0; ix < x; ix++) {
+		for (size_t iy = 0; iy < y; iy++) {
+			for (size_t iz = 0; iz < z; iz++) {
+				idx = iz * x * y + iy * x + ix;
+				cpuI[idx] = 255 - cpuI[idx];
 			}
 		}
 	}
-	
+}
+
  
  
 void advertise() {
@@ -57,12 +58,12 @@ void init_args(int argc, char* argv[]) {
 	args.add("x", "size of the dataset along X axis", "positive value");
 	args.add("y", "size of the dataset along Y axis", "positive value");
 	args.add("z", "size of the dataset along Z axis", "positive value");
-	args.add("t", "threshold value for the final result", "positive valu");
+	args.add("t", "threshold value for the final result", "0", "positive valu");
 	args.add("invert", "to invert the input data set", "string");
 	args.add("rmax", "maximum possible radius of the cells in the input image", "10", "[positive value]");
 	args.add("phi", "starting angle for the vote region (in degrees)", "25.0", "0 <= phi < 180");
 	args.add("iter", "number of iterations for voting", "8", "i > 0");
-	args.add("sigma", "the gaussian blur standard deviation", "5", "s >=0 (s = 0, no blurring)");
+	args.add("sigma", "the gaussian blur standard deviation", "3", "s >=0 (s = 0, no blurring)");
 	args.add("conn", "the number of connected neighbors for calculating the local maxima", "5", "[positive value]");
 	//parse the command line arguments.
 	args.parse(argc, argv);
@@ -95,94 +96,93 @@ void init_args(int argc, char* argv[]) {
 	t = args["t"].as_float();
 	sigma = args["sigma"].as_float();
 	phi = (float)args["phi"].as_float() * (float)stim::PI / 180;
-	
+
 }
-int main(int argc, char** argv){
+int main(int argc, char** argv) {
  
  
 	cudaDeviceProp prop;
 	int count;
 	cudaGetDeviceCount(&count);
-	for (int i=0; i<count; i++){
+	for (int i = 0; i<count; i++) {
 		cudaGetDeviceProperties(&prop, i);
 		printf("current device ID: %d\n", i);
 		printf("device name: %s\n", prop.name);
 		printf("total global mem: %lu\n", prop.totalGlobalMem);
 		printf("shared memory per block: %lu\n", prop.sharedMemPerBlock);
 	}
-		
+
 	init_args(argc, argv);
  
-	unsigned int r[3] = { rmax  , rmax, rmax};
-	
-	float sigma3[3] = { sigma, sigma, sigma};
-	unsigned int conn[3] = { nlmax, nlmax, nlmax};
-	float d_phi = phi/(iter+2);
-	
-	size_t bytes = x*y*z*sizeof(float);
+	unsigned int r[3] = { rmax , rmax, rmax };
+
+	float sigma3[3] = { sigma, sigma, sigma };
+	unsigned int conn[3] = { nlmax, nlmax, nlmax };
+	float d_phi = phi / (iter);
+
+	size_t bytes = x*y*z * sizeof(float);
  
 	//allocate space on the cpu for the input data
-	float* cpuI = (float*) malloc(bytes);
+	float* cpuI = (float*)malloc(bytes);
  
 	//load the input file into the cpuI
 	std::ifstream nissl(args.arg(0), std::ios::in | std::ios::binary);
 	nissl.read((char*)cpuI, bytes);
 	nissl.close();
-	if(args["invert"].is_set())
+	if (args["invert"].is_set())
 		invert_data(cpuI, x, y, z);
-	
+
 	//write a new file from the cpuI.
 	std::ofstream original("0-inv-128.vol", std::ofstream::out | std::ofstream::binary);
 	original.write((char*)cpuI, bytes);
 	original.close();
-	
-	
+
+
 	ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z);			// call the ivote function
-		
-	std::ofstream fvote("0-vote8.vol", std::ofstream::out | std::ofstream::binary);
+
+	std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary);
 	fvote.write((char*)cpuI, bytes);
 	fvote.close();
-	
+
 	//allocate space on the cpu for the output result
 	float* cpu_out = (float*)malloc(bytes * 3);
  
 	//write the output file.
 	//for (int t0=0; t0<=5000; t0+=100){
 	//	float t1 = t0;
-		int t0 = t;
-			lmax(cpu_out, cpuI, t, conn, x, y, z);
-			//std::ofstream fo("shared2D-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary);
-			std::ofstream fo( args.arg(1), std::ofstream::out | std::ofstream::binary);
-			fo.write((char*)cpu_out, bytes);
-			fo.close();
+	int t0 = t;
+	lmax(cpu_out, cpuI, t, conn, x, y, z);
  
+	std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary);
+	fo.write((char*)cpu_out, bytes);
+	fo.close();
+
 	// creat a file for saving the list centers
-	
-		std::ofstream list(args.arg(2));
-		// set the number of detected cells to zero.
-		int nod = 0;
-		if (list.is_open()){
-
-				for (int iz=0; iz<z; iz++){
-					for (int iy=0; iy<y; iy++){
-						for (int ix=0; ix<x; ix++){
-
-							int idx = iz * x * y + iy * x + ix;
-							if (cpu_out[idx]>0){
-								nod++;
-								list << ix << " " << iy << " "<< iz << " " << cpu_out[idx] << '\n' ;
-					
-							}
-						}
+
+	std::ofstream list(args.arg(2));
+	// set the number of detected cells to zero.
+	int nod = 0;
+	if (list.is_open()) {
+
+		for (int iz = 0; iz<z; iz++) {
+			for (int iy = 0; iy<y; iy++) {
+				for (int ix = 0; ix<x; ix++) {
+
+					int idx = iz * x * y + iy * x + ix;
+					if (cpu_out[idx]>0) {
+						nod++;
+						list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n';
+
 					}
 				}
-				
-		list.close();
+			}
 		}
  
+		list.close();
+	}
+
  
 	//}
-		cudaDeviceReset();       
-	
-}
+	cudaDeviceReset();
  
+}
 \ No newline at end of file
@@ -46,7 +46,7 @@
 			float step = 360.0/n;
 			stim::circle<float>  cir(center, d, norm);
 			stim::aabb3<int> bb(xi,yi,zi);
-			bb.insert(xc,yc,zc);
+			bb.insert((int) xc, (int)yc, (int)zc);
 			for(float j = 0; j <360.0; j += step){
 				stim::vec3<float> out = cir.p(j);
 				bb.insert(out[0], out[1], out[2]);
@@ -67,7 +67,7 @@
 			float id_x = g[0];				// define local variables for the x, y, and z coordinations point to the vote direction
 			float id_y = g[1];
 			float id_z = g[2];
-
+			
 			for (bz=bb.low[2]; bz<=bb.high[2]; bz++){
 				dz = bz - zi;							//compute the distance bw the voter and the current counter along z axis
 				dz_sq = dz * dz;
@@ -85,9 +85,9 @@
 							l_vote = gpu_vote[idx_c];
 							if  (l_vote>max) {
 								max = l_vote;
-								id_x = dx;
-								id_y = dy;
-								id_z = dz;
+								id_x = (float)dx;
+								id_y = (float)dy;
+								id_z = (float)dz;
 							}
 						}								
 					}						
@@ -103,25 +103,26 @@
  
 		// this kernel updates the gradient direction by the calculated voting direction.
 		template<typename T>
-		__global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){
+		__global__ void update_grad3(T* gpu_grad, T* gpu_dir, size_t x, size_t y, size_t z){
  
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;				//calculate x,y,z coordinates for this thread
+			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;				//calculate x,y,z coordinates for this thread
+			
+			size_t grid_y = y / blockDim.y;							//find the grid size along y
+			size_t blockidx_y = blockIdx.y % grid_y;
+			size_t yi = blockidx_y * blockDim.y + threadIdx.y;
+			size_t zi = blockIdx.y / grid_y;
  
-			int grid_y = y / blockDim.y;							//find the grid size along y
-			int blockidx_y = blockIdx.y % grid_y;
-			int yi = blockidx_y * blockDim.y + threadIdx.y;
-			int zi = blockIdx.y / grid_y;
-			int i = zi * x * y + yi * x + xi;
-
 			if(xi >= x || yi >= y || zi >= z) return;
  
+			size_t i = zi * x * y + yi * x + xi;
+
 			gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0];			//update the gradient image with the new direction direction
 			gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1];
 			gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2];
 		}
  
 		template<typename T>
-		void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+		void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){
  
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(sqrt (max_threads),sqrt (max_threads));
@@ -13,121 +13,121 @@
 #include <stim/math/vector.h>
 #include <stim/visualization/aabb3.h>
  
-		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
-		template<typename T>
-		__global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
-
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;						//calculate x,y,z coordinates for this thread
-			
-			int grid_y = y / blockDim.y;										//find the grid size along y
-			int blockidx_y = blockIdx.y % grid_y;
-			int yi = blockidx_y * blockDim.y + threadIdx.y;
-			int zi = blockIdx.y / grid_y;
-						
-			if(xi>=x || yi>=y || zi>=z) return;
-
-			int i = zi * x * y + yi * x + xi;	  // calculate the 1D index of the voter
-			
-
-			float rx_sq = rx * rx;        // compute the square for rmax 
-			float ry_sq = ry * ry;
-			float rz_sq = rz * rz;
-			
-			stim::vec3<float> g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]);   // form a vec3 variable for the gradient vector
-			stim::vec3<float> g_sph = g.cart2sph();			//convert cartesian coordinate to spherical for the gradient vector
-			float n =8;										//set the number of points to find the boundaries of the conical voting area
-			float xc = rx * cos(g_sph[1]) * sin(g_sph[2]);			//calculate the center point of the surface of the voting area for the voter
-			float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ;
-			float zc = rz * cos(g_sph[2]) ;
-			float r = sqrt(xc*xc + yc*yc + zc*zc);
-			xc+=xi;
-			yc+=yi;
-			zc+=zi;
-			stim::vec3<float> center(xc,yc,zc);
-			
-			float d = 2 * r * tan(phi);		//find the diameter of the conical voting area
-			stim::vec3<float> norm = g.norm();			//compute the normalize gradient vector
-			float step = 360.0/n;
-			stim::circle<float>  cir(center, d, norm);
-			stim::aabb3<int> bb(xi,yi,zi);
-			bb.insert((int)xc, (int)yc, (int)zc);
-			for(float j = 0; j <360.0; j += step){
-				stim::vec3<float> out = cir.p(j);
-				bb.insert(out[0], out[1], out[2]);
+// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
+template<typename T>
+__global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z) {
+
+	int xi = blockIdx.x * blockDim.x + threadIdx.x;						//calculate x,y,z coordinates for this thread
+
+	int grid_y = y / blockDim.y;										//find the grid size along y
+	int blockidx_y = blockIdx.y % grid_y;
+	int yi = blockidx_y * blockDim.y + threadIdx.y;
+	int zi = blockIdx.y / grid_y;
+
+	if (xi >= x || yi >= y || zi >= z) return;
+
+	int i = zi * x * y + yi * x + xi;	  // calculate the 1D index of the voter
+
+
+	float rx_sq = rx * rx;        // compute the square for rmax 
+	float ry_sq = ry * ry;
+	float rz_sq = rz * rz;
+
+	stim::vec3<float> g(gpu_grad[3 * i], gpu_grad[3 * i + 1], gpu_grad[3 * i + 2]);   // form a vec3 variable for the gradient vector
+	stim::vec3<float> g_sph = g.cart2sph();			//convert cartesian coordinate to spherical for the gradient vector
+	float n = 8;										//set the number of points to find the boundaries of the conical voting area
+	float xc = rx * cos(g_sph[1]) * sin(g_sph[2]);			//calculate the center point of the surface of the voting area for the voter
+	float yc = ry * sin(g_sph[1]) * sin(g_sph[2]);
+	float zc = rz * cos(g_sph[2]);
+	float r = sqrt(xc*xc + yc*yc + zc*zc);
+	xc += xi;
+	yc += yi;
+	zc += zi;
+	stim::vec3<float> center(xc, yc, zc);
+
+	float d = 2 * r * tan(phi);		//find the diameter of the conical voting area
+	stim::vec3<float> norm = g.norm();			//compute the normalize gradient vector
+	float step = 360.0 / n;
+	stim::circle<float>  cir(center, d, norm);
+	stim::aabb3<int> bb(xi, yi, zi);
+	bb.insert((int)xc, (int)yc, (int)zc);
+	for (float j = 0; j <360.0; j += step) {
+		stim::vec3<float> out = cir.p(j);
+		bb.insert(out[0], out[1], out[2]);
+	}
+	bb.trim_low(xi - rx, yi - ry, zi - rz);
+	bb.trim_low(0, 0, 0);
+	bb.trim_high(xi + rx, yi + ry, zi + rz);
+	bb.trim_high(x - 1, y - 1, z - 1);
+	int bx, by, bz;
+	int dx, dy, dz;
+	float dx_sq, dy_sq, dz_sq;
+	float dist, cos_diff;
+	int idx_c;
+	for (bz = bb.low[2]; bz <= bb.high[2]; bz++) {
+		dz = bz - zi;							//compute the distance bw the voter and the current counter along z axis
+		dz_sq = dz * dz;
+		for (by = bb.low[1]; by <= bb.high[1]; by++) {
+			dy = by - yi;								//compute the distance bw the voter and the current counter along y axis
+			dy_sq = dy * dy;
+			for (bx = bb.low[0]; bx <= bb.high[0]; bx++) {
+				dx = bx - xi;								//compute the distance bw the voter and the current counter along x axis
+				dx_sq = dx * dx;
+
+				dist = sqrt(dx_sq + dy_sq + dz_sq);			//calculate the distance between the voter and the current counter
+				cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz) / dist;			 // calculate the cosine of angle between the voter and the current counter
+				if (((dx_sq / rx_sq + dy_sq / ry_sq + dz_sq / rz_sq) <= 1) && (cos_diff >= cos_phi)) {			//check if the current counter located in the voting area of the voter    
+					idx_c = (bz* y + by) * x + bx;			//calculate the 1D index for the current counter
+					atomicAdd(&gpu_vote[idx_c], g_sph[0]);
+				}
 			}
-			bb.trim_low(xi-rx, yi-ry, zi-rz);
-			bb.trim_low(0,0,0);
-			bb.trim_high(xi+rx, yi+ry, zi+rz);
-			bb.trim_high(x-1, y-1, z-1);
-			int bx,by,bz;
-			int dx, dy, dz;
-			float dx_sq, dy_sq, dz_sq;
-			float dist, cos_diff;
-			int idx_c;
-			for (bz=bb.low[2]; bz<=bb.high[2]; bz++){
-				dz = bz - zi;							//compute the distance bw the voter and the current counter along z axis
-				dz_sq = dz * dz;
-				for (by=bb.low[1]; by<=bb.high[1]; by++){
-					dy = by - yi;								//compute the distance bw the voter and the current counter along y axis
-					dy_sq = dy * dy;
-					for (bx=bb.low[0]; bx<=bb.high[0]; bx++){
-						dx = bx - xi;								//compute the distance bw the voter and the current counter along x axis
-						dx_sq = dx * dx;
-
-						dist = sqrt(dx_sq + dy_sq + dz_sq);			//calculate the distance between the voter and the current counter
-						cos_diff = (norm[0] * dx + norm[1] * dy +  norm[2] * dz)/dist;			 // calculate the cosine of angle between the voter and the current counter
-						if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){			//check if the current counter located in the voting area of the voter    
-							idx_c = (bz* y + by) * x + bx;			//calculate the 1D index for the current counter
-							atomicAdd (&gpu_vote[idx_c] , g_sph[0]);
-						}								
-					}						
-				}				
-			}	
 		}
+	}
+}
  
-		template<typename T>
-		void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+template<typename T>
+void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z) {
  
-			
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(sqrt (max_threads),sqrt (max_threads));
-			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
-			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x , y, z);			//call the kernel to do the voting
  
-		}
+	unsigned int max_threads = stim::maxThreadsPerBlock();
+	dim3 threads(sqrt(max_threads), sqrt(max_threads));
+	dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
+	vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z);			//call the kernel to do the voting
  
+}
  
-		template<typename T>
-		void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * z * sizeof(T);
-
-			
-			//allocate space on the GPU for the Vote Image
-			T* gpu_vote;
-			cudaMalloc(&gpu_vote, bytes);		
-
-			//allocate space on the GPU for the input Gradient image
-			T* gpu_grad;
-			cudaMalloc(&gpu_grad, bytes*3);
-			
-			
-			//copy the Gradient data to the GPU
-			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
-			
-					
-			//call the GPU version of the vote calculation function
-			gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
-							
-			//copy the Vote Data back to the CPU
-			cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpu_vote);
-			cudaFree(gpu_grad);
-			
-		}
+
+template<typename T>
+void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z) {
+
+	//calculate the number of bytes in the array
+	unsigned int bytes = x * y * z * sizeof(T);
+
+
+	//allocate space on the GPU for the Vote Image
+	T* gpu_vote;
+	cudaMalloc(&gpu_vote, bytes);
+
+	//allocate space on the GPU for the input Gradient image
+	T* gpu_grad;
+	cudaMalloc(&gpu_grad, bytes * 3);
+
+
+	//copy the Gradient data to the GPU
+	cudaMemcpy(gpu_grad, cpu_grad, bytes * 3, cudaMemcpyHostToDevice);
+
+
+	//call the GPU version of the vote calculation function
+	gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x, y, z);
+
+	//copy the Vote Data back to the CPU
+	cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost);
+
+	//free allocated memory
+	cudaFree(gpu_vote);
+	cudaFree(gpu_grad);
+
+}
  
  
 #endif
 \ No newline at end of file