Leila's ivote profiling push

Laila Saadatifard
1 parent 03c403fa
Showing 8 changed files with 814 additions and 4 deletions Show diff stats
stim/cuda/arraymath.cuh
stim/cuda/ivote/local_max.cuh
stim/cuda/ivote/update_dir_global.cuh
stim/cuda/ivote/update_dir_shared.cuh
stim/cuda/ivote/vote_atomic.cuh
stim/cuda/ivote/vote_atomic_shared.cuh
stim/cuda/ivote/vote_shared_32-32.cuh
stim/cuda/ivote_atomic.cuh
@@ -10,7 +10,8 @@
 #include <stim/cuda/arraymath/array_atan.cuh>
 #include <stim/cuda/arraymath/array_abs.cuh>
 #include <stim/cuda/arraymath/array_cart2polar.cuh>
-
+#include <stim/cuda/arraymath/array_threshold.cuh>
+#include <stim/cuda/arraymath/array_add3.cuh>
 namespace stim{
 	namespace cuda{
@@ -29,17 +29,33 @@ namespace stim{
 			//compare to the threshold
 			if(val < final_t) return;
+			//define an array to store indices with same vote value
+			/*int * IdxEq;
+			IdxEq = new int  [2*conn];
+			int n = 0;*/
+			
 			for(int xl = xi - conn; xl < xi + conn; xl++){
 				for(int yl = yi - conn; yl < yi + conn; yl++){
 					if(xl >= 0 && xl < x && yl >= 0 && yl < y){
 						int il = yl * x + xl;
 						if(gpuVote[il] > val){							
 							return;
-						}							
-					}
+							}
+						if (gpuVote[il] == val){
+							/*IdxEq[n] = il;
+							n = n+1;*/
+							 if( il > i){
+								 return;
+							}
+						}
+					}							
 				}
 			}
-			
+			/*if (n!=0){
+				if(IdxEq[n/2] !=i){
+					return;
+				}
+			}	*/	
 			gpuCenters[i] = 1;
 		}
+#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
+#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh" 
+
+#define RMAX_TEST	8
+
+namespace stim{
+	namespace cuda{
+	
+		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
+
+			
+			//calculate the start point for this block
+			int bxi = blockIdx.x * blockDim.x;
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			if(xi >= x) return;													//if the index is outside of the image, terminate the kernel
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;			
+			int i = yi * x + xi;												// convert 2D coordinates to 1D
+			
+			float theta = gpuGrad[2*i];											// calculate the voting direction based on the grtadient direction - global memory fetch			
+			gpuDir[i] = 0;														//initialize the vote direction to zero			
+			float max = 0;														// define a local variable to maximum value of the vote image in the voting area for this voter
+			int id_x = 0;														// define two local variables for the x and y position of the maximum
+			int id_y = 0;
+			
+			int x_table = 2*rmax +1;											// compute the size of window which will be checked for finding the voting area for this voter
+			int rmax_sq = rmax * rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			float atan_angle;
+			float vote_c;
+			for(int yr = -RMAX_TEST; yr <= RMAX_TEST; yr++){
+				if (yi+yr >= 0 && yi + yr < y){
+					for(int xr = -RMAX_TEST; xr <= RMAX_TEST; xr++){
+
+						unsigned int ind_t = (RMAX_TEST - yr) * x_table + RMAX_TEST - xr;
+
+						// calculate the angle between the voter and the current pixel in x and y directions
+						atan_angle = gpuTable[ind_t];
+						
+						// find the vote value for the current counter
+						vote_c = gpuVote[(yi+yr)*x + (xi+xr)];
+						
+						// check if the current pixel is located in the voting area of this voter.
+						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+
+						// compare the vote value of this pixel with the max value to find the maxima and its index.
+							if  (vote_c>max) {
+
+								max = vote_c;
+								id_x =  xr;
+								id_y =  yr;
+							}
+						}
+					}
+				}
+			}
+							
+			unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+			float new_angle = gpuTable[ind_m];
+
+			if(xi < x && yi < y)
+				gpuDir[i] = new_angle;
+		}										//end kernel
+
+		// this kernel updates the gradient direction by the calculated voting direction.
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+		
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+			
+			//update the gradient image with the vote direction
+			gpuGrad[2*i] = gpuDir[i];
+		}
+		
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			
+			// allocate space on the GPU for the updated vote direction
+			T* gpuDir;
+			cudaMalloc(&gpuDir, bytes);	
+
+			//call the kernel to calculate the new voting direction
+			cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//call the kernel to update the gradient direction
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			
+			//free allocated memory
+			cudaFree(gpuDir);
+
+		}
+		
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);
+
+			//copy the input vote image to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the update direction function
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the new gradient image back to the CPU
+			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_UPDATE_DIR_SHARED_H
+#define STIM_CUDA_UPDATE_DIR_SHARED_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+	
+		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
+
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ float s_vote[];
+
+			//calculate the start point for this block
+			int bxi = blockIdx.x * blockDim.x;
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// calculate the voting direction based on the grtadient direction
+			float theta = gpuGrad[2*i];
+
+			//initialize the vote direction to zero
+			gpuDir[i] = 0;
+
+			// define a local variable to maximum value of the vote image in the voting area for this voter
+			float max = 0;
+
+			// define two local variables for the x and y coordinations where the maximum happened
+			int id_x = 0;
+			int id_y = 0;
+
+			//calculate the width of the shared memory block
+			int swidth = 2 * rmax + blockDim.x;
+
+			// compute the size of window which will be checked for finding the voting area for this voter
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			int bxs = bxi - rmax;
+						
+			for(int yr = -rmax; yr <= rmax; yr++){
+				//if (yi+yr >= 0 && yi + yr < y){
+					//copy the portion of the image necessary for this block to shared memory
+					__syncthreads();
+					cpyG2S1D<float>(s_vote, gpuVote, bxs, yi + yr , swidth, 1, threadIdx, blockDim, x, y);
+					__syncthreads();
+				
+					//if the current thread is outside of the image, it doesn't have to be computed
+					if(xi < x && yi < y){
+
+						for(int xr = -rmax; xr <= rmax; xr++){
+
+							unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+							// calculate the angle between the voter and the current pixel in x and y directions
+							float atan_angle = gpuTable[ind_t];
+						
+							// calculate the voting direction based on the grtadient direction
+							int idx_share_update = xr + tx_rmax ;
+							float share_vote = s_vote[idx_share_update];
+						
+							// check if the current pixel is located in the voting area of this voter.
+							if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+
+							// compare the vote value of this pixel with the max value to find the maxima and its index.
+								if  (share_vote>max) {
+
+									max = share_vote;
+									id_x =  xr;
+									id_y =  yr;
+								}
+							}
+						}
+					}
+				//}
+			}
+							
+		unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+		float new_angle = gpuTable[ind_m];
+
+		if(xi < x && yi < y)
+			gpuDir[i] = new_angle;
+
+		}
+
+		// this kernel updates the gradient direction by the calculated voting direction.
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+		
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+			
+			//update the gradient image with the vote direction
+			gpuGrad[2*i] = gpuDir[i];
+		}
+		
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			
+			// specify  share memory
+			unsigned int share_bytes = (2*rmax + threads.x)*(1)*4;
+			
+			// allocate space on the GPU for the updated vote direction
+			T* gpuDir;
+			cudaMalloc(&gpuDir, bytes);	
+
+			//call the kernel to calculate the new voting direction
+			cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//call the kernel to update the gradient direction
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			
+			//free allocated memory
+			cudaFree(gpuDir);
+
+		}
+		
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);
+
+			//copy the input vote image to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the update direction function
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the new gradient image back to the CPU
+			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_VOTE_ATOMIC_H
+#define STIM_CUDA_VOTE_ATOMIC_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// calculate the voting direction based on the grtadient direction
+			float theta = gpuGrad[2*i];
+			//calculate the amount of vote for the voter
+			float mag = gpuGrad[2*i + 1];
+						
+			// compute the size of window which will be checked for finding the proper voters for this pixel
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			if(xi < x && yi < y){
+			//for every line (along y)
+				for(int yr = -rmax; yr <= rmax; yr++){					
+					for(int xr = -rmax; xr <= rmax; xr++){
+						if ((yi+yr)>=0 && (yi+yr)<y && (xi+xr)>=0 && (xi+xr)<x){
+					
+							//find the location of the current pixel in the atan2 table
+							unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+							// calculate the angle between the voter and the current pixel in x and y directions
+							float atan_angle = gpuTable[ind_t];
+						
+							// check if the current pixel is located in the voting area of this voter.
+							if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+								// calculate the 1D index for the current pixel in global memory
+								unsigned int ind_g = (yi+yr)*x + (xi+xr);
+								atomicAdd(&gpuVote[ind_g], mag);
+							
+								}
+						}
+					}
+				}	
+			}
+		}
+
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+							
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+					
+			// specify  share memory
+			//unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
+			
+			//call the kernel to do the voting
+			cuda_vote <<< blocks, threads>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);		
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the Vote Data back to the CPU
+			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_VOTE_ATOMIC_SHARED_H
+#define STIM_CUDA_VOTE_ATOMIC_SHARED_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+//#include "writebackshared.cuh"
+namespace stim{
+	namespace cuda{
+
+		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			//generate a pointer to the shared memory
+			extern __shared__ float s_vote[];
+			// calculate the 2D coordinates for this current thread.
+			int bxi = blockIdx.x * blockDim.x;
+			int byi = blockIdx.y * blockDim.y;
+			int xi = bxi + threadIdx.x;
+			int yi = byi + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// calculate the voting direction based on the gradient direction
+			float theta = gpuGrad[2*i];
+			//calculate the amount of vote for the voter
+			float mag = gpuGrad[2*i + 1];
+			
+			//find the starting points and size of window, wich will be copied to the shared memory
+			int bxs = bxi - rmax;
+			int bys = byi - rmax;
+			int xwidth = 2*rmax + blockDim.x;
+			int ywidth = 2*rmax + blockDim.y;
+			//compute the coordinations of this pixel in the 2D-shared memory.
+			int sx_rx = threadIdx.x + rmax;
+			int sy_ry = threadIdx.y + rmax;
+			// compute the size of window which will be checked for finding the counters for this voter
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			//calculate some parameters for indexing shared memory
+				//calculate the total number of threads available
+				unsigned int tThreads = blockDim.x * blockDim.y;
+				//calculate the current 1D thread ID
+				unsigned int ti =  threadIdx.y * (blockDim.x) + threadIdx.x;
+				//calculate the number of iteration required
+				unsigned int In = xwidth*ywidth/tThreads + 1;
+			if(xi < x && yi < y){
+				__syncthreads();
+				//initialize the shared memory to zero				
+				for (unsigned int i = 0; i < In; i++){								
+					unsigned int sIdx0 = i * tThreads + ti;
+					if (sIdx0< xwidth*ywidth) {
+						s_vote[sIdx0] = 0;
+					}
+				}
+				__syncthreads();
+				//for every line (along y)
+				for(int yr = -rmax; yr <= rmax; yr++){	
+					//compute the position of the current voter in the shared memory along the y axis.
+					unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
+					for(int xr = -rmax; xr <= rmax; xr++){												
+						
+						//find the location of the current pixel in the atan2 table
+						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+						// calculate the angle between the voter and the current pixel in x and y directions
+						float atan_angle = gpuTable[ind_t];
+							
+						// check if the current pixel is located in the voting area of this voter.
+						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+							//compute the position of the current voter in the 2D-shared memory along the x axis.
+							unsigned int sIdx_x = (sx_rx + xr);
+							//find the 1D index of this voter in the 2D-shared memory.
+							unsigned int s_Idx = (sIdx_y1d  + sIdx_x);
+								
+							atomicAdd(&s_vote[s_Idx], mag);								
+							}
+					}
+				}	
+				//write shared memory back to global memory
+				
+				__syncthreads();						
+				for (unsigned int i = 0; i < In; i++){
+				
+					unsigned int sIdx = i * tThreads + ti;
+					if (sIdx>= xwidth*ywidth) return;
+
+					unsigned int sy = sIdx/xwidth;
+					unsigned int sx = sIdx - (sy * xwidth);
+					
+					unsigned int gx = bxs + sx;
+					unsigned int gy = bys + sy;
+					if (gx<x&& gy<y){
+						unsigned int gIdx = gy * x + gx;
+						//write shared to global memory
+						atomicAdd(&gpuVote[gIdx], s_vote[sIdx]);
+						
+					}						
+				}
+				
+			}
+		}
+
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+							
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(sqrt(max_threads), sqrt(max_threads));
+			dim3 blocks(x/threads.x + 1 , y/threads.y+1);
+					
+			// specify  share memory
+			unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*sizeof(T);
+			
+			//call the kernel to do the voting
+			cuda_vote <<< blocks, threads, share_bytes>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);		
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the Vote Data back to the CPU
+			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_VOTE_SHARED_H
+#define STIM_CUDA_VOTE_SHARED
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ float s_grad[];
+
+			//calculate the start point for this block
+			int bxi = blockIdx.x * blockDim.x;
+			int byi = blockIdx.y * blockDim.y;
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			int yi = byi + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+						
+			// define a local variable to sum the votes from the voters
+			float sum = 0;
+			
+			//calculate the width of the shared memory block
+			int xwidth = 2 * rmax + blockDim.x;
+			int ywidth = 2 * rmax + blockDim.y;
+			// compute the size of window which will be checked for finding the proper voters for this pixel
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			int bxs = bxi - rmax;			
+			int bys = byi - rmax;	
+			//compute the coordinations of this pixel in the 2D-shared memory.
+			int sx_rx = threadIdx.x + rmax;
+			int sy_ry = threadIdx.y + rmax;
+			//copy the portion of the image necessary for this block to shared memory
+			__syncthreads();
+			cpyG2S2D2ch<float>(s_grad, gpuGrad, bxs, bys, 2*xwidth, ywidth, threadIdx, blockDim, x, y);
+			__syncthreads();
+			
+			for(int yr = -rmax; yr <= rmax; yr++){
+				int yi_v = (yi + yr) ;
+				//compute the position of the current voter in the shared memory along the y axis.
+				unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
+				//if (yi+yr<y && yi+yr>=0){
+					if(xi < x && yi < y){
+
+						for(int xr = -rmax; xr <= rmax; xr++){
+					
+								//compute the position of the current voter in the 2D-shared memory along the x axis.
+								unsigned int sIdx_x = (sx_rx + xr);
+								//find the 1D index of this voter in the 2D-shared memory.
+								unsigned int s_Idx = (sIdx_y1d  + sIdx_x);
+								unsigned int s_Idx2 = s_Idx * 2;
+								
+								//find the location of this voter in the atan2 table
+								int id_t = (yr + rmax) * x_table + xr + rmax;
+
+								// calculate the angle between the pixel and the current voter in x and y directions
+								float atan_angle = gpuTable[id_t];
+												
+								// calculate the voting direction based on the grtadient direction
+								//int idx_share = xr + tx_rmax ;
+								float theta = s_grad[s_Idx2];
+								float mag = s_grad[s_Idx2 + 1];
+							
+
+								// check if the current voter is located in the voting area of this pixel.
+								if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+									sum += mag;		
+
+								}
+						}
+				
+					}
+				//}
+			}
+			if(xi < x && yi < y)
+				gpuVote[i] = sum;
+			
+		}
+
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+							
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(sqrt(max_threads), sqrt(max_threads));
+			dim3 blocks(x/threads.x + 1 , y/threads.y+1);
+			
+					
+			// specify  share memory
+			unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*2*sizeof(T);
+			
+			//call the kernel to do the voting
+			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);		
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the Vote Data back to the CPU
+			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_IVOTE_ATOMIC_H
+#define STIM_CUDA_IVOTE_ATOMIC_H
+
+#include <stim/cuda/ivote/down_sample.cuh>
+#include <stim/cuda/ivote/local_max.cuh>
+#include <stim/cuda/ivote/update_dir_global.cuh>
+//#include <stim/cuda/ivote/vote_shared_32-32.cuh>
+#include <stim/cuda/ivote/vote_atomic_shared.cuh>
+#include <stim/cuda/ivote/re_sample.cuh>
+namespace stim{
+	namespace cuda{
+	
+	}
+}
+
+
+
+#endif
 \ No newline at end of file