Update the ivote 2D code to the latest fast-running version, which uses bounding boxes for vote and update dir, together with shared memory and atomic operations

Update the ivote 2D code to the latest fast-running version, which uses bounding boxes for vote and update dir, together with shared memory and atomic operations
Laila Saadatifard
1 parent 4252d827
Showing 8 changed files with 564 additions and 75 deletions Show diff stats
stim/cuda/ivote/local_max.cuh
stim/cuda/ivote/update_dir_global.cuh
stim/cuda/ivote/update_dir_threshold_global.cuh
stim/cuda/ivote/vote_atomic_global.cuh
stim/cuda/ivote/vote_atomic_shared.cuh
stim/cuda/ivote/vote_shared.cuh
stim/cuda/ivote/vote_threshold_global.cuh
stim/cuda/ivote_atomic.cuh
@@ -14,7 +14,7 @@ namespace stim{
 			// calculate the 2D coordinates for this current thread.
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
 			if(xi >= x || yi >= y)
 				return;
@@ -63,8 +63,10 @@ namespace stim{
 		void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			/*dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/
+			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
+			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
 			//call the kernel to find the local maximum.
 			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
@@ -7,8 +7,7 @@
 #include <stim/cuda/sharedmem.cuh>
 #include <stim/visualization/aabb2.h>
 #include <stim/visualization/colormap.h>
-#include <math.h>
-#include "cpyToshare.cuh" 
+#include <math.h> 
 //#define RMAX_TEST	8
@@ -76,68 +75,6 @@ namespace stim{
 			gpuDir[i] = atan2((T)max_dy, (T)max_dx);
 		}
-		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
-		template<typename T>
-		__global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
-
-			
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-
-			if(xi >= x || yi >= y) return;													//if the index is outside of the image, terminate the kernel
-
-			int i = yi * x + xi;												// convert 2D coordinates to 1D
-			
-			float theta = gpuGrad[2*i];											// calculate the voting direction based on the grtadient direction - global memory fetch			
-			gpuDir[i] = 0;														//initialize the vote direction to zero			
-			float max = 0;														// define a local variable to maximum value of the vote image in the voting area for this voter
-			int id_x = 0;														// define two local variables for the x and y position of the maximum
-			int id_y = 0;
-			
-			int x_table = 2*rmax +1;											// compute the size of window which will be checked for finding the voting area for this voter
-			int rmax_sq = rmax * rmax;
-			int tx_rmax = threadIdx.x + rmax;
-			float atan_angle;
-			float vote_c;
-			int xidx, yidx, yr_sq, xr_sq;
-			for(int yr = -rmax; yr <= rmax; yr++){
-				yidx = yi + yr;													//compute the index into the image
-				if (yidx >= 0 && yidx < y){									//if the current y-index is inside the image
-					yr_sq = yr * yr;											//compute the square of yr, to save time later
-					for(int xr = -rmax; xr <= rmax; xr++){
-						xidx = xi + xr;
-						if(xidx >= 0 && xidx < x){
-							xr_sq = xr * xr;
-							unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
-
-							// calculate the angle between the voter and the current pixel in x and y directions
-							atan_angle = gpuTable[ind_t];
-							//atan_angle = atan2((T)yr, (T)xr);
-											
-							// check if the current pixel is located in the voting area of this voter.
-							if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-								
-								vote_c = gpuVote[yidx * x + xidx];				// find the vote value for the current counter
-							// compare the vote value of this pixel with the max value to find the maxima and its index.
-								if  (vote_c>max) {
-
-									max = vote_c;
-									id_x =  xr;
-									id_y =  yr;
-								}
-							}
-						}
-					}
-				}
-			}
-							
-			unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
-			float new_angle = gpuTable[ind_m];
-
-			if(xi < x && yi < y)
-				gpuDir[i] = new_angle;
-		}										//end kernel
 		// this kernel updates the gradient direction by the calculated voting direction.
@@ -168,9 +105,7 @@ namespace stim{
 			HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );	
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-			//dim3 threads(min(x, max_threads), 1);
-			//dim3 blocks(x/threads.x, y);
-
+			
 			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
 			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
@@ -188,12 +123,12 @@ namespace stim{
 			//call the kernel to calculate the new voting direction
 			cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-			stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
+			//stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
 			//exit(0);
-			threads = dim3( sqrt(max_threads), sqrt(max_threads) );
-			blocks = dim3(x/threads.x + 1, y/threads.y + 1);
+			//threads = dim3( sqrt(max_threads), sqrt(max_threads) );
+			//blocks = dim3(x/threads.x + 1, y/threads.y + 1);
 			//call the kernel to update the gradient direction
 			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+// include guard: the macro tested by #ifndef must be the same one that #define sets
+#ifndef STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H
+#define STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"   
+
+namespace stim{
+	namespace cuda{
+	
+		// this kernel calculates the voting direction for the next iteration of every thresholded voter: it scans the
+		// (2*rmax+1) x (2*rmax+1) window around the voter, finds the largest vote that lies inside the voter's voting
+		// area (squared distance below rmax^2 and |table angle - theta| below phi) and stores the table angle toward it.
+		//	gpuDir		output, one direction per voter (th_size entries)
+		//	gpuVote		vote image, indexed as row * x + column
+		//	gpuTh		voter list, 3 values per voter: [3*i] direction, [3*i+2] 1D pixel index ([3*i+1] is not read here)
+		//	gpuTable	angle lookup table, (2*rmax+1) entries per row, indexed by (rmax - dy) * (2*rmax+1) + (rmax - dx)
+		//	phi			angular half-width of the voting area
+		//	rmax		radius of the voting area in pixels
+		//	th_size		number of voters stored in gpuTh
+		//	x, y		image width and height
+		// Launch layout: 1D grid of 1D blocks, one thread per voter.
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x,  int y){
+
+			// calculate the index of the voter handled by this thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+
+			// the launch rounds the grid up, so surplus threads must exit before gpuTh or gpuDir are touched
+			if(xi >= th_size) return;
+
+			// voting direction of this voter
+			float theta = gpuTh[3*xi];
+
+			// position of this voter in the original image, decoded from its stored 1D index
+			unsigned int i_v = gpuTh[3*xi+2];
+			int y_v = (int)(i_v / (unsigned int)x);
+			int x_v = (int)(i_v - (unsigned int)y_v * (unsigned int)x);
+
+			// largest vote found so far inside the voting area, and the offset at which it was found;
+			// if no vote exceeds zero the offset stays (0, 0) and the centre entry of the table is used
+			float max = 0;
+			int id_x = 0;
+			int id_y = 0;
+
+			// width of one row of the angle table and the squared radius, hoisted out of the loops
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+
+			for(int yr = -rmax; yr <= rmax; yr++){
+
+				// skip rows of the window that fall outside the image (they would index outside gpuVote)
+				int py = y_v + yr;
+				if(py < 0 || py >= y) continue;
+
+				for(int xr = -rmax; xr <= rmax; xr++){
+
+					// skip columns of the window that fall outside the image
+					int px = x_v + xr;
+					if(px < 0 || px >= x) continue;
+
+					// find the angle between the voter and the current pixel from the lookup table
+					unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+					float atan_angle = gpuTable[ind_t];
+
+					// check if the current pixel is located in the voting area of this voter.
+					if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+
+						// compare the vote value of this pixel with the running maximum
+						float vote_c = gpuVote[py * x + px];
+						if  (vote_c>max) {
+							max = vote_c;
+							id_x =  xr;
+							id_y =  yr;
+						}
+					}
+				}
+			}
+
+			// the new direction is the table angle toward the offset of the maximum vote
+			unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+			float new_angle = gpuTable[ind_m];
+			gpuDir[xi] = new_angle;
+		}
+
+		// this kernel updates the stored direction of every voter with the newly calculated voting direction.
+		//	gpuTh		voter list, 3 values per voter; entry [3*i] (the direction) is overwritten
+		//	gpuDir		new direction of each voter (th_size entries)
+		//	th_size		number of voters
+		//	x, y		not used by this kernel
+		// Launch layout: 1D grid of 1D blocks, one thread per voter.
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuTh, T* gpuDir, int th_size, int x, int y){
+
+			// calculate the index of the voter handled by this thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+
+			// the launch rounds the grid up, so threads past the last voter must not touch either array
+			if(xi >= th_size) return;
+
+			//update the voter's direction with the vote direction
+			gpuTh[3*xi] = gpuDir[xi];
+		}
+		
+		// Host launcher: calculates the new voting direction of every voter into a temporary device buffer and then
+		// copies it into the direction slot of the voter list. All pointers are passed straight to the kernels.
+		//	gpuVote		vote image
+		//	gpuTh		voter list (3 values per voter, as indexed by the kernels)
+		//	gpuTable	angle lookup table
+		//	phi, rmax	angular half-width and radius of the voting area
+		//	th_size		number of voters
+		//	x, y		image width and height
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the direction array (one value per voter)
+			unsigned int bytes_th = th_size* sizeof(T);
+
+			// one thread per voter; the "+1" always adds a block, so the kernels bounds-check against th_size
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads);
+			dim3 blocks(th_size/threads.x+1);
+			
+			// allocate space on the GPU for the updated vote direction (checked like the other allocations in this file)
+			T* gpuDir;
+			HANDLE_ERROR( cudaMalloc(&gpuDir, bytes_th) );
+
+			//call the kernel to calculate the new voting direction
+			cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
+
+			//call the kernel to update the gradient direction
+			cuda_update_grad <<< blocks, threads >>>(gpuTh, gpuDir, th_size, x , y);
+			
+			//free allocated memory
+			cudaFree(gpuDir);
+
+		}
+		
+		// Host convenience wrapper: uploads the vote image, the gradient data and the angle table, runs
+		// gpu_update_dir on them and downloads the updated gradient data into cpuGrad.
+		//	cpuVote		host vote image, x * y values
+		//	cpuGrad		host gradient data, 2 * x * y values; overwritten with the result
+		//	cpuTable	host angle table, (2*rmax+1) * (2*rmax+1) values
+		//	phi, rmax	angular half-width and radius of the voting area
+		//	x, y		image width and height
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+			//copy the input vote image to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the update direction function
+			// NOTE(review): gpu_update_dir in this header takes 8 arguments (th_size precedes x and y) and indexes a
+			// 3-values-per-voter list, while this call passes 7 arguments and a 2-values-per-pixel image - confirm the
+			// intended th_size and data layout before instantiating this wrapper.
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the new gradient image back to the CPU (checked: a failure here would leave cpuGrad stale)
+			HANDLE_ERROR(cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_VOTE_ATOMIC_GLOBAL_H
+#define STIM_CUDA_VOTE_ATOMIC_GLOBAL_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <stim/visualization/aabb2.h>
+#include <stim/visualization/colormap.h>
+#include <math.h>
+
+namespace stim{
+	namespace cuda{
+
+		// this kernel casts the vote of every pixel: each thread is one voter and atomically adds its gradient
+		// magnitude to every pixel of gpuVote that lies inside its voting area (squared distance below rmax^2 and
+		// |table angle - theta| below phi). Only the bounding box of the voting triangle is scanned.
+		//	gpuVote		vote image, accumulated into with atomicAdd (the caller must clear it beforehand)
+		//	gpuGrad		gradient image, 2 values per pixel: [2*i] direction, [2*i+1] magnitude
+		//	gpuTable	angle lookup table, (2*rmax+1) entries per row; copied to shared memory by each block
+		//	phi, rmax	angular half-width and radius of the voting area
+		//	x, y		image width and height
+		// Launch layout: 2D grid of 2D blocks covering the image; dynamic shared memory of at least
+		// sizeof(T) * (2*rmax+1) * (2*rmax+1) bytes.
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			extern __shared__ T S[];
+			T* shared_atan = S;
+			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
+			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
+
+			// the table is filled cooperatively, so every thread of the block has to reach this barrier before any of
+			// them reads shared_atan; it is placed ahead of the early return so that all threads execute it
+			__syncthreads();
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			
+			if(xi >= x || yi >= y) return;			
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// calculate the voting direction based on the gradient direction
+			float theta = gpuGrad[2*i];
+			//calculate the amount of vote for the voter
+			float mag = gpuGrad[2*i + 1];
+			
+
+			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
+			bb.insert(xi + ceil(rmax * cos(theta)),       yi + ceil(rmax * sin(theta)));			//insert the tip along the voting direction (rounded like the other corners)
+			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
+			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
+			
+			// width of one row of the angle table and the squared radius, hoisted out of the loops
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			
+			int lut_i;
+			T dx_sq, dy_sq;
+
+			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
+			bb.trim_high(x-1, y-1);
+
+			int by, bx;
+			int dx, dy;					
+			
+			unsigned int ind_g;											//1D index of the pixel that receives the vote
+			T alpha;
+			
+			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
+				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
+				dy_sq = dy * dy;
+				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
+					dx = bx - xi;
+					dx_sq = dx * dx;
+					lut_i = (rmax - dy) * x_table + rmax - dx;
+					alpha = shared_atan[lut_i];
+					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
+						// several voters can hit the same pixel, hence the atomic accumulation
+						ind_g = (by)*x + (bx);
+						atomicAdd(&gpuVote[ind_g], mag);
+					
+					}
+				}
+			}			
+			
+		}
+	
+
+		// Host launcher for the atomic voting kernel: covers the image with square 2D blocks and reserves enough
+		// dynamic shared memory for the whole angle table. Terminates the program if the table does not fit.
+		//	gpuVote		vote image accumulated into by the kernel
+		//	gpuGrad		gradient image (direction and magnitude per pixel)
+		//	gpuTable	angle lookup table with (2*rmax+1) * (2*rmax+1) entries
+		//	phi, rmax	angular half-width and radius of the voting area
+		//	x, y		image width and height
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+							
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
+			// the "+1" always adds a block per dimension; the kernel bounds-checks against x and y
+			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
+			size_t shared_mem_req = table_bytes;// + template_bytes;
+			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;		
+			size_t shared_mem = stim::sharedMemPerBlock();
+			if(shared_mem_req > shared_mem){
+				// report the kernel that is actually launched from here
+				std::cout<<"Error: insufficient shared memory for this implementation of cuda_vote()."<<std::endl;
+				exit(1);
+			}
+
+			//call the kernel to do the voting
+			cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		// Host convenience wrapper: uploads the gradient data and the angle table, runs gpu_vote and downloads the
+		// resulting vote image into cpuVote.
+		//	cpuVote		host output, x * y values
+		//	cpuGrad		host gradient data, 2 * x * y values
+		//	cpuTable	host angle table, (2*rmax+1) * (2*rmax+1) values
+		//	phi, rmax	angular half-width and radius of the voting area
+		//	x, y		image width and height
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+			//the kernel only adds to the vote image (atomicAdd) and cudaMalloc does not clear memory, so start from zero
+			HANDLE_ERROR(cudaMemset(gpuVote, 0, bytes));
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the Vote Data back to the CPU (checked: a failure here would leave cpuVote stale)
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
@@ -5,7 +5,7 @@
 # include <cuda.h>
 #include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
-#include "cpyToshare.cuh"
+
 //#include "writebackshared.cuh"
 namespace stim{
 	namespace cuda{
+// include guard: the macro tested by #ifndef must be the same one that #define sets
+#ifndef STIM_CUDA_VOTE_SHARED_H
+#define STIM_CUDA_VOTE_SHARED_H
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		// this kernel calculates the vote value of each pixel by adding up the gradient magnitudes of every voter whose voting area (squared distance below rmax^2, |table angle - theta| below phi) contains this pixel; the result is written to gpuVote[yi * x + xi]
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			//generate a pointer to shared memory (its size is supplied as the dynamic shared-memory launch argument)
+			extern __shared__ float s_grad[];
+
+			//calculate the x coordinate of the first pixel covered by this block
+			int bxi = blockIdx.x * blockDim.x;
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+						
+			// define a local variable to sum the votes from the voters
+			float sum = 0;
+			
+			//calculate the width (in pixels) of the shared memory row: the block width plus rmax extra pixels on each side
+			int swidth = 2 * rmax + blockDim.x;
+			
+			// compute the row width of the atan2 table, the squared radius, and the offsets that map an x offset to a shared-memory position
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			int bxs = bxi - rmax;
+			
+			//for every line (along y) of the window around this pixel
+			for(int yr = -rmax; yr <= rmax; yr++){
+				if (yi+yr<y && yi+yr>=0){
+					//copy the portion of the image necessary for this block to shared memory; the barriers sit inside a branch on yi, so every thread of a block must share the same yi (NOTE(review): holds when blockDim.y is 1 - confirm at the launch site)
+					__syncthreads();
+					cpyG2S1D2ch<float>(s_grad, gpuGrad, bxs, yi + yr , 2*swidth, 1, threadIdx, blockDim, x, y);
+					__syncthreads();
+				
+					if(xi < x && yi < y){
+
+						for(int xr = -rmax; xr <= rmax; xr++){
+					
+								//find the location of this voter in the atan2 table
+								int id_t = (yr + rmax) * x_table + xr + rmax;
+
+								// look up the angle between the pixel and the current voter in the atan2 table
+								float atan_angle = gpuTable[id_t];
+												
+								// fetch the voter's direction and magnitude from shared memory (two interleaved values per pixel)
+								int idx_share = xr + tx_rmax ;
+								float theta = s_grad[idx_share*2];
+								float mag = s_grad[idx_share*2 + 1];
+							
+
+								// check if this pixel is located in the voting area of the current voter.
+								if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+									sum += mag;		
+
+								}
+						}
+				
+					}
+				}
+			}
+			if(xi < x && yi < y)
+				gpuVote[i] = sum;
+			
+		}
+
+		// Host launcher for the shared-memory voting kernel. Blocks are a single row of threads, the grid has one
+		// block row per image row, and the dynamic shared memory holds two values for each of the
+		// (2*rmax + block width) pixels of a row segment.
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			// blocks are as wide as the device allows and one pixel tall
+			unsigned int block_w = stim::maxThreadsPerBlock();
+			dim3 threads(block_w, 1);
+
+			// number of blocks needed along x, rounded up when x is not a multiple of the block width
+			unsigned int blocks_x = x / threads.x;
+			if(x % threads.x != 0)
+				blocks_x++;
+			dim3 blocks(blocks_x, y);
+
+			// dynamic shared memory: two values per pixel of one padded row segment
+			unsigned int share_bytes = (2*rmax + threads.x)*1*2*sizeof(T);
+
+			// launch the voting kernel
+			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		// Host convenience wrapper: uploads the gradient data and the angle table, runs gpu_vote and downloads the
+		// resulting vote image into cpuVote.
+		//	cpuVote		host output, x * y values
+		//	cpuGrad		host gradient data, 2 * x * y values
+		//	cpuTable	host angle table, (2*rmax+1) * (2*rmax+1) values
+		//	phi, rmax	angular half-width and radius of the voting area
+		//	x, y		image width and height
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image (checked like the other allocations below)
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the Vote Data back to the CPU (checked: a failure here would leave cpuVote stale)
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
+#define STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		// this kernel casts the vote of every thresholded voter: each thread is one voter and atomically adds its
+		// gradient magnitude to every image pixel that lies inside its voting area (squared distance below rmax^2
+		// and |table angle - theta| below phi).
+		//	gpuVote		vote image, accumulated into with atomicAdd (the caller must clear it beforehand)
+		//	gpuTh		voter list, 3 values per voter: [3*i] direction, [3*i+1] magnitude, [3*i+2] 1D pixel index
+		//	gpuTable	angle lookup table, (2*rmax+1) entries per row, indexed by (rmax - dy) * (2*rmax+1) + (rmax - dx)
+		//	phi, rmax	angular half-width and radius of the voting area
+		//	th_size		number of voters stored in gpuTh
+		//	x, y		image width and height
+		// Launch layout: 1D grid of 1D blocks, one thread per voter.
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){
+
+			// calculate the index of the voter handled by this thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+
+			// the launch rounds the grid up, so surplus threads must exit before gpuTh is read
+			if(xi >= th_size) return;
+
+			// voting direction of this voter
+			float theta = gpuTh[3*xi];
+			//find the gradient magnitude for the current voter
+			float mag = gpuTh[3*xi + 1];
+
+			// position of this voter in the original image, decoded from its stored 1D index; kept signed so the
+			// comparisons against the image borders below are meaningful for negative offsets
+			unsigned int i_v = gpuTh[3*xi+2];
+			int y_v = (int)(i_v / (unsigned int)x);
+			int x_v = (int)(i_v - (unsigned int)y_v * (unsigned int)x);
+
+			// width of one row of the angle table and the squared radius, hoisted out of the loops
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+
+			for(int yr = -rmax; yr <= rmax; yr++){
+
+				// skip rows of the window that fall outside the image
+				int py = y_v + yr;
+				if(py < 0 || py >= y) continue;
+
+				for(int xr = -rmax; xr <= rmax; xr++){
+
+					// skip columns of the window that fall outside the image
+					int px = x_v + xr;
+					if(px < 0 || px >= x) continue;
+
+					//find the location of the current pixel in the atan2 table
+					unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+					// look up the angle between the voter and the current pixel
+					float atan_angle = gpuTable[ind_t];
+
+					// check if the current pixel is located in the voting area of this voter.
+					if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+						// several voters can hit the same pixel, hence the atomic accumulation
+						unsigned int ind_g = py*x + px;
+						atomicAdd(&gpuVote[ind_g], mag);
+					}
+				}
+			}
+		}
+
+		// Host launcher for the thresholded voting kernel: one thread per voter, arranged in 1D blocks of the
+		// largest size reported by stim::maxThreadsPerBlock().
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
+
+			// threads per block
+			unsigned int block_size = stim::maxThreadsPerBlock();
+			dim3 threads(block_size);
+
+			// always one block more than th_size / block size, so a partially filled last block is covered
+			unsigned int n_blocks = th_size/threads.x + 1;
+			dim3 blocks(n_blocks);
+
+			// launch the voting kernel
+			cuda_vote <<< blocks, threads>>>(gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
+
+		}
+
+
+		// Host convenience wrapper: uploads the gradient data and the angle table, runs gpu_vote and downloads the
+		// resulting vote image into cpuVote.
+		//	cpuVote		host output, x * y values
+		//	cpuGrad		host gradient data, 2 * x * y values
+		//	cpuTable	host angle table, (2*rmax+1) * (2*rmax+1) values
+		//	phi, rmax	angular half-width and radius of the voting area
+		//	x, y		image width and height
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+			//the kernel only adds to the vote image (atomicAdd) and cudaMalloc does not clear memory, so start from zero
+			HANDLE_ERROR(cudaMemset(gpuVote, 0, bytes));
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the vote calculation function
+			// NOTE(review): gpu_vote in this header takes 8 arguments (th_size precedes x and y) and indexes a
+			// 3-values-per-voter list, while this call passes 7 arguments and a 2-values-per-pixel image - confirm the
+			// intended th_size and data layout before instantiating this wrapper.
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the Vote Data back to the CPU (checked: a failure here would leave cpuVote stale)
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
@@ -5,7 +5,7 @@
 #include <stim/cuda/ivote/local_max.cuh>
 #include <stim/cuda/ivote/update_dir_global.cuh>
 //#include <stim/cuda/ivote/vote_shared_32-32.cuh>
-#include <stim/cuda/ivote/vote_atomic_shared.cuh>
+#include <stim/cuda/ivote/vote_atomic_global.cuh>
 //#include <stim/cuda/ivote/re_sample.cuh>
 namespace stim{
 	namespace cuda{