Laila Saadatifard · David Mayerich · Laila Saadatifard · Laila Saadatifard · David Mayerich · David Mayerich
Showing 9 changed files Show diff stats
stim/cuda/ivote/local_max.cuh
stim/cuda/ivote/re_sample.cuh
stim/cuda/ivote/update_dir_global.cuh → stim/cuda/ivote/update_dir_bb.cuh
stim/cuda/ivote/david_update_dir_global.cuh → stim/cuda/ivote/update_dir_threshold_global.cuh
stim/cuda/ivote/vote_atomic_bb.cuh
stim/cuda/ivote/vote_atomic_shared.cuh
stim/cuda/ivote/vote_shared.cuh
stim/cuda/ivote/vote_threshold_global.cuh
stim/cuda/ivote_atomic.cuh → stim/cuda/ivote_atomic_bb.cuh
@@ -14,7 +14,7 @@ namespace stim{
  
 			// calculate the 2D coordinates for this current thread.
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
  
 			if(xi >= x || yi >= y)
 				return;
@@ -63,8 +63,10 @@ namespace stim{
 		void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
  
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			/*dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/
+			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
+			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  
 			//call the kernel to find the local maximum.
 			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
+#ifndef STIM_CUDA_RE_SAMPLE_H
+#define STIM_CUDA_RE_SAMPLE_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/templates/gaussian_blur.cuh>
+
+namespace stim{
+	namespace cuda{
+
+		/// Kernel that spreads a down-sampled image back onto the full-resolution grid.
+		/// Output pixels whose x and y coordinates are both multiples of the sampling stride
+		/// (1/resize, truncated to an integer) receive the matching down-sampled value;
+		/// every other output pixel is set to zero.
+		/// Launch layout: one thread per output pixel on a 2D grid.
+		/// @param gpuI		output image on the device, x * y elements
+		/// @param gpuI0	down-sampled input image on the device, ceil(x/stride) * ceil(y/stride) elements
+		/// @param resize	scale factor of the down-sampled image (stride = 1/resize)
+		/// @param x		width of the output image in pixels
+		/// @param y		height of the output image in pixels
+		template<typename T>
+		__global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
+
+			// sampling stride; a resize factor above 1 truncates to 0, which would make the
+			// modulo and division below undefined, so fall back to a stride of 1
+			unsigned int sigma_ds = 1/resize;
+			if(sigma_ds == 0) sigma_ds = 1;
+
+			// width of the down-sampled image (rounded up)
+			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
+
+			// calculate the 2D coordinates for this current thread.
+			unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			// threads outside of the image have nothing to write
+			if(xi >= x || yi >= y) return;
+
+			// convert 2D coordinates to 1D
+			unsigned int i = yi * x + xi;
+
+			// every output pixel is written exactly once: either a sample or zero
+			if(xi%sigma_ds==0 && yi%sigma_ds==0)
+				gpuI[i] = gpuI0[(yi/sigma_ds)*x_ds + xi/sigma_ds];
+			else
+				gpuI[i] = 0;
+
+		}
+
+
+		/// Re-samples a down-sampled image stored on the GPU back to full resolution.
+		/// Launches cuda_re_sample with one thread per output pixel.
+		/// @param gpuI		device pointer to the full-resolution output image, x * y elements
+		/// @param gpuI0	device pointer to the down-sampled input image
+		/// @param resize	scale factor of the down-sampled image (sampling stride = 1/resize)
+		/// @param x		width of the full-resolution image in pixels
+		/// @param y		height of the full-resolution image in pixels
+		template<typename T>
+		void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
+
+			// one row of threads per block, one block row per image line
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+
+			//resample the image (instantiate the kernel for T, not a hard-coded float)
+			cuda_re_sample<T> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
+
+		}
+
+		/// Re-samples a down-sampled image stored on the CPU back to full resolution.
+		/// The input is copied to the GPU, expanded by gpu_re_sample, and the result is copied back.
+		/// @param out		host buffer that receives the full-resolution image, x * y elements
+		/// @param in		host buffer holding the down-sampled image, ceil(x/stride) * ceil(y/stride) elements
+		/// @param resize	scale factor of the down-sampled image (sampling stride = 1/resize, truncated)
+		/// @param x		width of the full-resolution image in pixels
+		/// @param y		height of the full-resolution image in pixels
+		template<typename T>
+		void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){
+
+			//get the number of pixels and bytes in the full-resolution image
+			unsigned int pixels = x*y;
+			unsigned int bytes = sizeof(T) * pixels;
+
+			//get the size of the down-sampled image (dimensions rounded up)
+			unsigned int sigma_ds = 1/resize;
+			if(sigma_ds == 0) sigma_ds = 1;					//a resize factor above 1 would otherwise divide by zero
+			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
+			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
+			unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
+
+			//allocate space on the GPU for the down-sampled input image
+			T* gpuI0;
+			cudaMalloc(&gpuI0, bytes_ds);
+
+			//copy the down-sampled image data to the GPU
+			cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice);
+
+			//allocate space on the GPU for the full-resolution output image
+			T* gpuI;
+			cudaMalloc(&gpuI, bytes);
+
+			//run the GPU-based version of the algorithm
+			gpu_re_sample<T>(gpuI, gpuI0, resize, x, y);
+
+			//copy the full-resolution result back to the CPU
+			cudaMemcpy(out, gpuI, bytes, cudaMemcpyDeviceToHost);
+
+			//free allocated memory
+			cudaFree(gpuI0);
+			cudaFree(gpuI);
+		}
+	
+	}
+}
+
+#endif
 \ No newline at end of file
-#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
-#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
+#ifndef STIM_CUDA_UPDATE_DIR_BB_H
+#define STIM_CUDA_UPDATE_DIR_BB_H
  
 # include <iostream>
 # include <cuda.h>
@@ -7,8 +7,7 @@
 #include <stim/cuda/sharedmem.cuh>
 #include <stim/visualization/aabb2.h>
 #include <stim/visualization/colormap.h>
-#include <math.h>
-#include "cpyToshare.cuh" 
+#include <math.h> 
  
 //#define RMAX_TEST	8
  
@@ -76,68 +75,6 @@ namespace stim{
 			gpuDir[i] = atan2((T)max_dy, (T)max_dx);
 		}
  
-		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
-		template<typename T>
-		__global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
-
-			
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-
-			if(xi >= x || yi >= y) return;													//if the index is outside of the image, terminate the kernel
-
-			int i = yi * x + xi;												// convert 2D coordinates to 1D
-			
-			float theta = gpuGrad[2*i];											// calculate the voting direction based on the grtadient direction - global memory fetch			
-			gpuDir[i] = 0;														//initialize the vote direction to zero			
-			float max = 0;														// define a local variable to maximum value of the vote image in the voting area for this voter
-			int id_x = 0;														// define two local variables for the x and y position of the maximum
-			int id_y = 0;
-			
-			int x_table = 2*rmax +1;											// compute the size of window which will be checked for finding the voting area for this voter
-			int rmax_sq = rmax * rmax;
-			int tx_rmax = threadIdx.x + rmax;
-			float atan_angle;
-			float vote_c;
-			int xidx, yidx, yr_sq, xr_sq;
-			for(int yr = -rmax; yr <= rmax; yr++){
-				yidx = yi + yr;													//compute the index into the image
-				if (yidx >= 0 && yidx < y){									//if the current y-index is inside the image
-					yr_sq = yr * yr;											//compute the square of yr, to save time later
-					for(int xr = -rmax; xr <= rmax; xr++){
-						xidx = xi + xr;
-						if(xidx >= 0 && xidx < x){
-							xr_sq = xr * xr;
-							unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
-
-							// calculate the angle between the voter and the current pixel in x and y directions
-							atan_angle = gpuTable[ind_t];
-							//atan_angle = atan2((T)yr, (T)xr);
-											
-							// check if the current pixel is located in the voting area of this voter.
-							if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-								
-								vote_c = gpuVote[yidx * x + xidx];				// find the vote value for the current counter
-							// compare the vote value of this pixel with the max value to find the maxima and its index.
-								if  (vote_c>max) {
-
-									max = vote_c;
-									id_x =  xr;
-									id_y =  yr;
-								}
-							}
-						}
-					}
-				}
-			}
-							
-			unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
-			float new_angle = gpuTable[ind_m];
-
-			if(xi < x && yi < y)
-				gpuDir[i] = new_angle;
-		}										//end kernel
  
  
 		// this kernel updates the gradient direction by the calculated voting direction.
@@ -168,9 +105,7 @@ namespace stim{
 			HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );	
  
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-			//dim3 threads(min(x, max_threads), 1);
-			//dim3 blocks(x/threads.x, y);
-
+			
 			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
 			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  
@@ -188,12 +123,12 @@ namespace stim{
  
 			//call the kernel to calculate the new voting direction
 			cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-			stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
+			//stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
  
 			//exit(0);
  
-			threads = dim3( sqrt(max_threads), sqrt(max_threads) );
-			blocks = dim3(x/threads.x + 1, y/threads.y + 1);
+			//threads = dim3( sqrt(max_threads), sqrt(max_threads) );
+			//blocks = dim3(x/threads.x + 1, y/threads.y + 1);
  
 			//call the kernel to update the gradient direction
 			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
-#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
-#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
+#ifndef STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H
+#define STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H
  
 # include <iostream>
 # include <cuda.h>
 #include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
-#include <math.h>
-#include "cpyToshare.cuh" 
-
-#define RMAX_TEST	8
+#include "cpyToshare.cuh"   
  
 namespace stim{
 	namespace cuda{
  
 		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
 		template<typename T>
-		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
-			extern __shared__ T atan2_table[];
-			
-			//calculate the start point for this block
-			//int bxi = blockIdx.x * blockDim.x;
-
-			stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x);
+		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x,  int y){
  
-			__syncthreads();
  
-			// calculate the 2D coordinates for this current thread.
-			//int xi = bxi + threadIdx.x;
+			
+			// calculate the coordinate for this current thread.
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-			if(xi >= x || yi >= y) return;													//if the index is outside of the image, terminate the kernel
-
-			int i = yi * x + xi;												// convert 2D coordinates to 1D
+			// calculate the voting direction based on the grtadient direction
+			float theta = gpuTh[3*xi];
  
-			float theta = gpuGrad[2*i];											// calculate the voting direction based on the grtadient direction - global memory fetch			
-			gpuDir[i] = 0;														//initialize the vote direction to zero			
-			float max = 0;														// define a local variable to maximum value of the vote image in the voting area for this voter
-			int id_x = 0;														// define two local variables for the x and y position of the maximum
-			int id_y = 0;
+			//calculate the position and x, y coordinations of this voter in the original image
+			unsigned int i_v = gpuTh[3*xi+2];
+			unsigned int y_v = i_v/x;
+			unsigned int x_v = i_v - (y_v*x);
  
-			int x_table = 2*rmax +1;											// compute the size of window which will be checked for finding the voting area for this voter
+			//initialize the vote direction to zero
+			gpuDir[xi] = 0;
+
+			// define a local variable to maximum value of the vote image in the voting area for this voter
+			float max = 0;
+
+			// define two local variables for the x and y coordinations where the maximum happened
+			int id_x = 0;
+			int id_y = 0;
+
+			// compute the size of window which will be checked for finding the voting area for this voter
+			int x_table = 2*rmax +1;
 			int rmax_sq = rmax * rmax;
 			int tx_rmax = threadIdx.x + rmax;
-			float atan_angle;
-			float vote_c;
-			unsigned int ind_t;
-			for(int yr = -rmax; yr <= rmax; yr++){					//for each counter in the y direction
-				if (yi+yr >= 0 && yi + yr < y){									//if the counter exists (we aren't looking outside of the image)
-					for(int xr = -rmax; xr <= rmax; xr++){					//for each counter in the x direction
-						if((xr * xr + yr *yr)< rmax_sq){								//if the counter is within range of the voter
-
-							ind_t = (rmax - yr) * x_table + rmax - xr;		//calculate the index to the atan2 table							
-							atan_angle = atan2_table[ind_t];								//retrieve the direction vector from the table						
-
-							//atan_angle = atan2((float)yr, (float)xr);
-							
-							if (abs(atan_angle - theta) <phi){							// check if the current pixel is located in the voting angle of this voter.				
-								vote_c = gpuVote[(yi+yr)*x + (xi+xr)];			// find the vote value for the current counter						
-								if(vote_c>max) {								// compare the vote value of this pixel with the max value to find the maxima and its index.
-									max = vote_c;
-									id_x =  xr;
-									id_y =  yr;
-								}
+			if(xi < th_size){
+				
+				for(int yr = -rmax; yr <= rmax; yr++){
+					
+					for(int xr = -rmax; xr <= rmax; xr++){
+
+						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+						// find the angle between the voter and the current pixel in x and y directions
+						float atan_angle = gpuTable[ind_t];
+										
+						// check if the current pixel is located in the voting area of this voter.
+						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+							// find the vote value for the current counter
+							float vote_c = gpuVote[(y_v+yr)*x + (x_v+xr)];
+							// compare the vote value of this pixel with the max value to find the maxima and its index.
+							if  (vote_c>max) {
+
+								max = vote_c;
+								id_x =  xr;
+								id_y =  yr;
 							}
 						}
 					}
 				}
-			}
+			
  
-			unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
-			float new_angle = gpuTable[ind_m];
+				unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+				float new_angle = gpuTable[ind_m];
+				gpuDir[xi] = new_angle;
+			}
  
-			if(xi < x && yi < y)
-				gpuDir[i] = new_angle;
-		}										//end kernel
+		}
  
 		// this kernel updates the gradient direction by the calculated voting direction.
 		template<typename T>
-		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
+		__global__ void cuda_update_grad(T* gpuTh, T* gpuDir, int th_size, int x, int y){
  
-			// calculate the 2D coordinates for this current thread.
+			// calculate the coordinate for this current thread.
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-		
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
  
+		
 			//update the gradient image with the vote direction
-			gpuGrad[2*i] = gpuDir[i];
+			gpuTh[3*xi] = gpuDir[xi];
 		}
  
 		template<typename T>
-		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			
+		void gpu_update_dir(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
  
 			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
+			unsigned int bytes_th = th_size* sizeof(T);
  
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-
-			dim3 threads(sqrt(max_threads), sqrt(max_threads));
-			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
-
-			
+			dim3 threads(max_threads);
+			dim3 blocks(th_size/threads.x+1);
  
 			// allocate space on the GPU for the updated vote direction
 			T* gpuDir;
-			cudaMalloc(&gpuDir, bytes);	
-
-			size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2);
-			std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl;
+			cudaMalloc(&gpuDir, bytes_th);	
  
 			//call the kernel to calculate the new voting direction
-			cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+			cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
  
 			//call the kernel to update the gradient direction
-			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			cuda_update_grad <<< blocks, threads >>>(gpuTh, gpuDir, th_size, x , y);
  
 			//free allocated memory
 			cudaFree(gpuDir);
+#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
+#define STIM_CUDA_VOTE_ATOMIC_BB_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <stim/visualization/aabb2.h>
+#include <stim/visualization/colormap.h>
+#include <math.h>
+
+namespace stim{
+	namespace cuda{
+
+		// This kernel casts votes: every pixel (voter) adds its gradient magnitude to each pixel inside its
+		// voting area, i.e. closer than rmax and within an angle phi of the voter's direction.
+		// Launch layout: 2D grid of 2D blocks, one thread per pixel.
+		// Dynamic shared memory: (2*rmax+1)^2 * sizeof(T) bytes for a copy of the atan2 look-up table.
+		// gpuGrad stores two values per pixel: [2*i] = direction (theta), [2*i+1] = magnitude.
+		// Votes are accumulated into gpuVote with atomicAdd.
+		// NOTE(review): gpuVote is presumably expected to be zeroed by the caller - confirm.
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			extern __shared__ T S[];
+			T* shared_atan = S;
+			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
+			// NOTE(review): threadedMemcpy presumably splits the copy across the threads of the block - confirm
+			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
+
+			// the shared table must be completely written before any thread reads it; the barrier is placed
+			// before the early return below so that every thread of the block reaches it
+			__syncthreads();
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			if(xi >= x || yi >= y) return;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// calculate the voting direction based on the gradient direction
+			float theta = gpuGrad[2*i];
+			//calculate the amount of vote for the voter
+			float mag = gpuGrad[2*i + 1];
+
+
+			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
+			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));
+			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
+			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
+
+			// width of the atan2 table and squared voting radius
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+
+			int lut_i;
+			T dx_sq, dy_sq;
+
+			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
+			bb.trim_high(x-1, y-1);
+
+			int by, bx;
+			int dx, dy;
+
+			unsigned int ind_g;											//1D index of the pixel that receives the vote
+			T alpha;													//direction from the voter to the current pixel (from the table)
+
+			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
+				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
+				dy_sq = dy * dy;
+				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
+					dx = bx - xi;
+					dx_sq = dx * dx;
+					lut_i = (rmax - dy) * x_table + rmax - dx;			//index of (dx, dy) in the atan2 table
+					alpha = shared_atan[lut_i];
+					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
+						ind_g = (by)*x + (bx);
+						atomicAdd(&gpuVote[ind_g], mag);
+
+					}
+				}
+			}
+
+		}
+	
+
+		// Launches cuda_vote for an image that is already on the GPU.
+		// Uses square thread blocks with one thread per pixel and requests (2*rmax+1)^2 * sizeof(T) bytes
+		// of dynamic shared memory for the atan2 table. Prints an error and terminates the process
+		// if the device cannot provide that much shared memory per block.
+		// gpuVote - device vote image (x * y elements), accumulated by the kernel
+		// gpuGrad - device gradient image, two values per pixel
+		// gpuTable - device atan2 table with (2*rmax+1)^2 entries
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
+			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
+			size_t shared_mem_req = table_bytes;// + template_bytes;
+			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
+			size_t shared_mem = stim::sharedMemPerBlock();
+			if(shared_mem_req > shared_mem){
+				//report the kernel that is actually launched here
+				std::cout<<"Error: insufficient shared memory for this implementation of cuda_vote()."<<std::endl;
+				exit(1);
+			}
+
+			//call the kernel to do the voting
+			cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		// Computes the vote image for gradient data stored on the CPU.
+		// cpuVote - host output, x * y elements
+		// cpuGrad - host gradient image, 2 * x * y elements
+		// cpuTable - host atan2 table, (2*rmax+1)^2 elements
+		// The data is copied to the GPU, gpu_vote is run, and the vote image is copied back.
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+			//the kernel accumulates votes with atomicAdd, so the vote image has to start at zero
+			HANDLE_ERROR(cudaMemset(gpuVote, 0, bytes));
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//copy the Vote Data back to the CPU
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuGrad));
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
@@ -5,7 +5,7 @@
 # include <cuda.h>
 #include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
-#include "cpyToshare.cuh"
+
 //#include "writebackshared.cuh"
 namespace stim{
 	namespace cuda{
+#ifndef STIM_CUDA_VOTE_SHARED_H
+#define STIM_CUDA_VOTE_SHARED_H
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		// Vote kernel, shared-memory variant: each pixel sums the gradient magnitudes of all voters
+		// whose voting area contains it. For every image line in range, the block first stages the
+		// needed part of the gradient image in shared memory, then each thread scans that line.
+		// gpuGrad holds two values per pixel (direction, magnitude); the dynamic shared-memory size
+		// is supplied at launch.
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			//dynamic shared memory holding one staged line of the gradient image
+			extern __shared__ float s_grad[];
+
+			//first column handled by this block and the coordinates of this thread
+			int block_x0 = blockIdx.x * blockDim.x;
+			int xi = block_x0 + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			//true when this thread maps to a pixel inside the image
+			bool inside = (xi < x && yi < y);
+
+			//width of the staged line (block width plus rmax on both sides)
+			int swidth = 2 * rmax + blockDim.x;
+
+			//width of the atan2 table, squared radius, and offsets into the staged line
+			int table_w = 2*rmax +1;
+			int r_sq = rmax * rmax;
+			int s_center = threadIdx.x + rmax;
+			int stage_x0 = block_x0 - rmax;
+
+			//accumulated vote for this pixel
+			float sum = 0;
+
+			//visit every line within rmax of this pixel
+			for(int yr = -rmax; yr <= rmax; yr++){
+
+				//skip lines that fall outside of the image
+				if (!(yi+yr<y && yi+yr>=0)) continue;
+
+				//stage this line in shared memory; barriers separate it from the previous and next use
+				__syncthreads();
+				cpyG2S1D2ch<float>(s_grad, gpuGrad, stage_x0, yi + yr , 2*swidth, 1, threadIdx, blockDim, x, y);
+				__syncthreads();
+
+				//threads outside of the image only help with the copy
+				if(!inside) continue;
+
+				for(int xr = -rmax; xr <= rmax; xr++){
+
+					//angle between the pixel and the current voter, read from the atan2 table
+					float atan_angle = gpuTable[(yr + rmax) * table_w + xr + rmax];
+
+					//direction and magnitude of the current voter, read from the staged line
+					int s_idx = xr + s_center;
+					float theta = s_grad[s_idx*2];
+					float mag = s_grad[s_idx*2 + 1];
+
+					//count the voter if this pixel lies inside its voting area
+					if (((xr * xr + yr *yr)< r_sq) && (abs(atan_angle - theta) <phi))
+						sum += mag;
+				}
+			}
+
+			//store the result for pixels inside the image
+			if(inside)
+				gpuVote[yi * x + xi] = sum;
+
+		}
+
+		// Launches the shared-memory vote kernel for data that is already on the GPU.
+		// Each block is a single row of threads; the grid has one block row per image line.
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//threads per block: as many as the device allows, arranged in one row
+			unsigned int block_width = stim::maxThreadsPerBlock();
+			dim3 threads(block_width, 1);
+
+			//enough blocks along x to cover the image width (rounded up), one per line along y
+			unsigned int blocks_x = x/threads.x;
+			if(x %threads.x != 0) blocks_x++;
+			dim3 blocks(blocks_x, y);
+
+			//shared memory for one staged gradient line: block width plus rmax on each side, two values per pixel
+			unsigned int share_bytes = (2*rmax + threads.x)*1*2*sizeof(T);
+
+			//run the voting kernel
+			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		// Computes the vote image for gradient data stored on the CPU (shared-memory implementation).
+		// cpuVote - host output, x * y elements
+		// cpuGrad - host gradient image, 2 * x * y elements
+		// cpuTable - host atan2 table, (2*rmax+1)^2 elements
+		// The data is copied to the GPU, gpu_vote is run, and the vote image is copied back.
+		// Every CUDA call is checked with HANDLE_ERROR.
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//copy the Vote Data back to the CPU
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuGrad));
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
+#define STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		// Vote kernel for a thresholded voter list: each thread handles one voter and adds its gradient
+		// magnitude to every pixel inside its voting area (closer than rmax, within phi of its direction).
+		// Launch layout: 1D grid of 1D blocks, one thread per voter.
+		// gpuTh holds three values per voter: [3*v] = direction, [3*v+1] = magnitude,
+		// [3*v+2] = 1D index of the voter in the x-by-y image.
+		// Votes are accumulated into gpuVote with atomicAdd.
+		// NOTE(review): gpuVote is presumably expected to be zeroed by the caller - confirm.
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){
+
+
+			// calculate the index of the voter handled by this thread
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+
+			// threads past the end of the voter list must not touch gpuTh at all
+			if(xi >= th_size) return;
+
+			// calculate the voting direction based on the gradient direction
+			float theta = gpuTh[3*xi];
+			//find the gradient magnitude for the current voter
+			float mag = gpuTh[3*xi + 1];
+			//calculate the position and x, y coordinates of this voter in the original image
+			int i_v = gpuTh[3*xi+2];
+			int y_v = i_v/x;
+			int x_v = i_v - (y_v*x);
+
+			// width of the atan2 table and squared voting radius
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+
+			for(int yr = -rmax; yr <= rmax; yr++){
+				// signed coordinates make the image-bounds test explicit
+				int yc = y_v + yr;
+				if(yc < 0 || yc >= y) continue;
+
+				for(int xr = -rmax; xr <= rmax; xr++){
+					int xc = x_v + xr;
+					if(xc < 0 || xc >= x) continue;
+
+					//find the location of the current pixel in the atan2 table
+					unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+					// angle between the voter and the current pixel, read from the table
+					float atan_angle = gpuTable[ind_t];
+
+					// check if the current pixel is located in the voting area of this voter.
+					if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+						// calculate the 1D index for the current pixel in global memory
+						unsigned int ind_g = yc*x + xc;
+						atomicAdd(&gpuVote[ind_g], mag);
+					}
+				}
+			}
+		}
+
+		// Launches the thresholded vote kernel for data that is already on the GPU,
+		// using a 1D launch with one thread per entry of the voter list.
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
+
+			//threads per block: as many as the device allows
+			unsigned int block_size = stim::maxThreadsPerBlock();
+			dim3 threads(block_size);
+
+			//number of blocks along the voter list
+			unsigned int n_blocks = th_size/threads.x + 1;
+			dim3 blocks(n_blocks);
+
+			//run the voting kernel
+			cuda_vote <<< blocks, threads>>>(gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
+
+		}
+
+
+		// Computes the vote image for gradient data stored on the CPU (voter-list implementation).
+		// cpuVote - host output, x * y elements
+		// cpuGrad - host gradient image, 2 * x * y elements ([2*i] = direction, [2*i+1] = magnitude)
+		// cpuTable - host atan2 table, (2*rmax+1)^2 elements
+		// gpu_vote works on a voter list with three values per voter (direction, magnitude, 1D pixel index).
+		// This wrapper receives no threshold, so it builds a list that contains every pixel of the image.
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of pixels and bytes in the array
+			unsigned int pixels = x * y;
+			unsigned int bytes = pixels * sizeof(T);
+
+			//nothing to do for an empty image
+			if(pixels == 0) return;
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//build the voter list on the CPU: every pixel is a voter
+			T* cpuTh = new T[3 * pixels];
+			for(unsigned int p = 0; p < pixels; p++){
+				cpuTh[3*p]     = cpuGrad[2*p];			//voting direction
+				cpuTh[3*p + 1] = cpuGrad[2*p + 1];		//gradient magnitude
+				cpuTh[3*p + 2] = (T)p;					//1D position of the voter in the image
+			}
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+			//the kernel accumulates votes with atomicAdd, so the vote image has to start at zero
+			HANDLE_ERROR(cudaMemset(gpuVote, 0, bytes));
+
+			//allocate space on the GPU for the voter list and copy it over
+			T* gpuTh;
+			HANDLE_ERROR(cudaMalloc(&gpuTh, bytes*3));
+			HANDLE_ERROR(cudaMemcpy(gpuTh, cpuTh, bytes*3, cudaMemcpyHostToDevice));
+			delete[] cpuTh;
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuTh, gpuTable, phi, rmax, pixels, x , y);
+
+			//copy the Vote Data back to the CPU
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuTh));
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
-#ifndef STIM_CUDA_IVOTE_ATOMIC_H
-#define STIM_CUDA_IVOTE_ATOMIC_H
+#ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H
+#define STIM_CUDA_IVOTE_ATOMIC_BB_H
  
 #include <stim/cuda/ivote/down_sample.cuh>
 #include <stim/cuda/ivote/local_max.cuh>
-#include <stim/cuda/ivote/update_dir_global.cuh>
-//#include <stim/cuda/ivote/vote_shared_32-32.cuh>
-#include <stim/cuda/ivote/vote_atomic_shared.cuh>
-//#include <stim/cuda/ivote/re_sample.cuh>
+#include <stim/cuda/ivote/update_dir_bb.cuh>
+#include <stim/cuda/ivote/vote_atomic_bb.cuh>
+
 namespace stim{
 	namespace cuda{