Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib

Pavel Govyadinov
2 parents 4fa3f483 ca99f951
Showing 14 changed files with 1196 additions and 18 deletions Show diff stats
stim/cuda/arraymath.cuh
stim/cuda/ivote/david_update_dir_global.cuh
stim/cuda/ivote/local_max.cuh
stim/cuda/ivote/update_dir_global.cuh
stim/cuda/ivote/update_dir_shared.cuh
stim/cuda/ivote/vote_atomic.cuh
stim/cuda/ivote/vote_atomic_shared.cuh
stim/cuda/ivote/vote_shared_32-32.cuh
stim/cuda/ivote_atomic.cuh
stim/cuda/sharedmem.cuh
stim/cuda/templates/conv2sep.cuh
stim/image/image.h
stim/optics/scalarfield.h
stim/visualization/aabb2.h
@@ -10,7 +10,8 @@
 #include <stim/cuda/arraymath/array_atan.cuh>
 #include <stim/cuda/arraymath/array_abs.cuh>
 #include <stim/cuda/arraymath/array_cart2polar.cuh>
-
+//#include <stim/cuda/arraymath/array_threshold.cuh>
+//#include <stim/cuda/arraymath/array_add3.cuh>
 namespace stim{
 	namespace cuda{
  
+#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
+#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <math.h>
+#include "cpyToshare.cuh" 
+
+#define RMAX_TEST	8
+
+namespace stim{
+	namespace cuda{
+	
+		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
+			extern __shared__ T atan2_table[];
+			
+			//calculate the start point for this block
+			//int bxi = blockIdx.x * blockDim.x;
+
+			stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x);
+
+			__syncthreads();
+			
+			// calculate the 2D coordinates for this current thread.
+			//int xi = bxi + threadIdx.x;
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			if(xi >= x || yi >= y) return;													//if the index is outside of the image, terminate the kernel
+
+			int i = yi * x + xi;												// convert 2D coordinates to 1D
+			
+			float theta = gpuGrad[2*i];											// calculate the voting direction based on the grtadient direction - global memory fetch			
+			gpuDir[i] = 0;														//initialize the vote direction to zero			
+			float max = 0;														// define a local variable to maximum value of the vote image in the voting area for this voter
+			int id_x = 0;														// define two local variables for the x and y position of the maximum
+			int id_y = 0;
+			
+			int x_table = 2*rmax +1;											// compute the size of window which will be checked for finding the voting area for this voter
+			int rmax_sq = rmax * rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			float atan_angle;
+			float vote_c;
+			unsigned int ind_t;
+			for(int yr = -rmax; yr <= rmax; yr++){					//for each counter in the y direction
+				if (yi+yr >= 0 && yi + yr < y){									//if the counter exists (we aren't looking outside of the image)
+					for(int xr = -rmax; xr <= rmax; xr++){					//for each counter in the x direction
+						if((xr * xr + yr *yr)< rmax_sq){								//if the counter is within range of the voter
+
+							ind_t = (rmax - yr) * x_table + rmax - xr;		//calculate the index to the atan2 table							
+							atan_angle = atan2_table[ind_t];								//retrieve the direction vector from the table						
+
+							//atan_angle = atan2((float)yr, (float)xr);
+							
+							if (abs(atan_angle - theta) <phi){							// check if the current pixel is located in the voting angle of this voter.				
+								vote_c = gpuVote[(yi+yr)*x + (xi+xr)];			// find the vote value for the current counter						
+								if(vote_c>max) {								// compare the vote value of this pixel with the max value to find the maxima and its index.
+									max = vote_c;
+									id_x =  xr;
+									id_y =  yr;
+								}
+							}
+						}
+					}
+				}
+			}
+							
+			unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+			float new_angle = gpuTable[ind_m];
+
+			if(xi < x && yi < y)
+				gpuDir[i] = new_angle;
+		}										//end kernel
+
+		// this kernel updates the gradient direction by the calculated voting direction.
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+		
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+			
+			//update the gradient image with the vote direction
+			gpuGrad[2*i] = gpuDir[i];
+		}
+		
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+
+			dim3 threads(sqrt(max_threads), sqrt(max_threads));
+			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+
+			
+			
+			// allocate space on the GPU for the updated vote direction
+			T* gpuDir;
+			cudaMalloc(&gpuDir, bytes);	
+
+			size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2);
+			std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl;
+
+			//call the kernel to calculate the new voting direction
+			cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//call the kernel to update the gradient direction
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			
+			//free allocated memory
+			cudaFree(gpuDir);
+
+		}
+		
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);
+
+			//copy the input vote image to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the update direction function
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the new gradient image back to the CPU
+			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
@@ -29,17 +29,33 @@ namespace stim{
 			//compare to the threshold
 			if(val < final_t) return;
  
+			//define an array to store indices with same vote value
+			/*int * IdxEq;
+			IdxEq = new int  [2*conn];
+			int n = 0;*/
+			
 			for(int xl = xi - conn; xl < xi + conn; xl++){
 				for(int yl = yi - conn; yl < yi + conn; yl++){
 					if(xl >= 0 && xl < x && yl >= 0 && yl < y){
 						int il = yl * x + xl;
 						if(gpuVote[il] > val){							
 							return;
-						}							
-					}
+							}
+						if (gpuVote[il] == val){
+							/*IdxEq[n] = il;
+							n = n+1;*/
+							 if( il > i){
+								 return;
+							}
+						}
+					}							
 				}
 			}
-			
+			/*if (n!=0){
+				if(IdxEq[n/2] !=i){
+					return;
+				}
+			}	*/	
 			gpuCenters[i] = 1;
 		}
  
+#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
+#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <stim/visualization/aabb2.h>
+#include <stim/visualization/colormap.h>
+#include <math.h>
+#include "cpyToshare.cuh" 
+
+//#define RMAX_TEST	8
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
+			extern __shared__ T S[];
+			T* shared_atan = S;
+			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
+			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
+
+			//T* shared_vote = &S[n_table];
+			//size_t template_size_x = (blockDim.x + 2 * rmax);
+			//size_t template_size_y = (blockDim.y + 2 * rmax);
+			//stim::cuda::threadedMemcpy2D((char*)shared_vote, (char*)gpuVote, template_size_x, template_size_y, x,  threadIdx.y * blockDim.x + threadIdx.x, blockDim.x * blockDim.y);
+			
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;				//calculate the 2D coordinates for this current thread.
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			if(xi >= x || yi >= y) return;								//if the index is outside of the image, terminate the kernel
+
+			int i = yi * x + xi;										//convert 2D coordinates to 1D
+			float theta = gpuGrad[2*i];									//calculate the voting direction based on the grtadient direction - global memory fetch
+			
+			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
+			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));
+			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
+			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
+
+			int x_table = 2*rmax +1;
+			int lut_i;
+			T rmax_sq = rmax * rmax;
+			T dx_sq, dy_sq;
+
+			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
+			bb.trim_high(x-1, y-1);
+
+			int by, bx;
+			int dx, dy;													//coordinate relative to (xi, yi)
+			T v;
+			T max_v = 0;												//initialize the maximum vote value to zero
+			T alpha;
+			int max_dx = bb.low[0];
+			int max_dy = bb.low[1];
+			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
+				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
+				dy_sq = dy * dy;
+				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
+					dx = bx - xi;
+					dx_sq = dx * dx;
+					lut_i = (rmax - dy) * x_table + rmax - dx;
+					alpha = shared_atan[lut_i];
+					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
+						v = gpuVote[by * x + bx];				// find the vote value for the current counter
+						if(v > max_v){
+							max_v = v;
+							max_dx = dx;
+							max_dy = dy;
+						}
+					}
+				}
+			}			
+			gpuDir[i] = atan2((T)max_dy, (T)max_dx);
+		}
+	
+		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
+		template<typename T>
+		__global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
+
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			if(xi >= x || yi >= y) return;													//if the index is outside of the image, terminate the kernel
+
+			int i = yi * x + xi;												// convert 2D coordinates to 1D
+			
+			float theta = gpuGrad[2*i];											// calculate the voting direction based on the grtadient direction - global memory fetch			
+			gpuDir[i] = 0;														//initialize the vote direction to zero			
+			float max = 0;														// define a local variable to maximum value of the vote image in the voting area for this voter
+			int id_x = 0;														// define two local variables for the x and y position of the maximum
+			int id_y = 0;
+			
+			int x_table = 2*rmax +1;											// compute the size of window which will be checked for finding the voting area for this voter
+			int rmax_sq = rmax * rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			float atan_angle;
+			float vote_c;
+			int xidx, yidx, yr_sq, xr_sq;
+			for(int yr = -rmax; yr <= rmax; yr++){
+				yidx = yi + yr;													//compute the index into the image
+				if (yidx >= 0 && yidx < y){									//if the current y-index is inside the image
+					yr_sq = yr * yr;											//compute the square of yr, to save time later
+					for(int xr = -rmax; xr <= rmax; xr++){
+						xidx = xi + xr;
+						if(xidx >= 0 && xidx < x){
+							xr_sq = xr * xr;
+							unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+							// calculate the angle between the voter and the current pixel in x and y directions
+							atan_angle = gpuTable[ind_t];
+							//atan_angle = atan2((T)yr, (T)xr);
+											
+							// check if the current pixel is located in the voting area of this voter.
+							if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+								
+								vote_c = gpuVote[yidx * x + xidx];				// find the vote value for the current counter
+							// compare the vote value of this pixel with the max value to find the maxima and its index.
+								if  (vote_c>max) {
+
+									max = vote_c;
+									id_x =  xr;
+									id_y =  yr;
+								}
+							}
+						}
+					}
+				}
+			}
+							
+			unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+			float new_angle = gpuTable[ind_m];
+
+			if(xi < x && yi < y)
+				gpuDir[i] = new_angle;
+		}										//end kernel
+		
+
+		// this kernel updates the gradient direction by the calculated voting direction.
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			if(xi >= x || yi >= y) return;
+		
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+			
+			//update the gradient image with the vote direction
+			gpuGrad[2*i] = gpuDir[i];
+		}
+		
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+			
+			// allocate space on the GPU for the updated vote direction
+			T* gpuDir;
+			HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );	
+
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			//dim3 threads(min(x, max_threads), 1);
+			//dim3 blocks(x/threads.x, y);
+
+			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
+			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+
+			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
+			//size_t curtain = 2 * rmax;
+			//size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain);
+			size_t shared_mem_req = table_bytes;// + template_bytes;
+			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
+
+			size_t shared_mem = stim::sharedMemPerBlock();
+			if(shared_mem_req > shared_mem){
+				std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
+				exit(1);
+			}
+
+			//call the kernel to calculate the new voting direction
+			cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+			stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
+
+			//exit(0);
+
+			threads = dim3( sqrt(max_threads), sqrt(max_threads) );
+			blocks = dim3(x/threads.x + 1, y/threads.y + 1);
+
+			//call the kernel to update the gradient direction
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			//free allocated memory
+			HANDLE_ERROR( cudaFree(gpuDir) );
+
+		}
+		
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);
+
+			//copy the input vote image to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the update direction function
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the new gradient image back to the CPU
+			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_UPDATE_DIR_SHARED_H
+#define STIM_CUDA_UPDATE_DIR_SHARED_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+	
+		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
+
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ float s_vote[];
+
+			//calculate the start point for this block
+			int bxi = blockIdx.x * blockDim.x;
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// calculate the voting direction based on the grtadient direction
+			float theta = gpuGrad[2*i];
+
+			//initialize the vote direction to zero
+			gpuDir[i] = 0;
+
+			// define a local variable to maximum value of the vote image in the voting area for this voter
+			float max = 0;
+
+			// define two local variables for the x and y coordinations where the maximum happened
+			int id_x = 0;
+			int id_y = 0;
+
+			//calculate the width of the shared memory block
+			int swidth = 2 * rmax + blockDim.x;
+
+			// compute the size of window which will be checked for finding the voting area for this voter
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			int bxs = bxi - rmax;
+						
+			for(int yr = -rmax; yr <= rmax; yr++){
+				//if (yi+yr >= 0 && yi + yr < y){
+					//copy the portion of the image necessary for this block to shared memory
+					__syncthreads();
+					cpyG2S1D<float>(s_vote, gpuVote, bxs, yi + yr , swidth, 1, threadIdx, blockDim, x, y);
+					__syncthreads();
+				
+					//if the current thread is outside of the image, it doesn't have to be computed
+					if(xi < x && yi < y){
+
+						for(int xr = -rmax; xr <= rmax; xr++){
+
+							unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+							// calculate the angle between the voter and the current pixel in x and y directions
+							float atan_angle = gpuTable[ind_t];
+						
+							// calculate the voting direction based on the grtadient direction
+							int idx_share_update = xr + tx_rmax ;
+							float share_vote = s_vote[idx_share_update];
+						
+							// check if the current pixel is located in the voting area of this voter.
+							if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+
+							// compare the vote value of this pixel with the max value to find the maxima and its index.
+								if  (share_vote>max) {
+
+									max = share_vote;
+									id_x =  xr;
+									id_y =  yr;
+								}
+							}
+						}
+					}
+				//}
+			}
+							
+		unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+		float new_angle = gpuTable[ind_m];
+
+		if(xi < x && yi < y)
+			gpuDir[i] = new_angle;
+
+		}
+
+		// this kernel updates the gradient direction by the calculated voting direction.
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+		
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+			
+			//update the gradient image with the vote direction
+			gpuGrad[2*i] = gpuDir[i];
+		}
+		
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			
+			// specify  share memory
+			unsigned int share_bytes = (2*rmax + threads.x)*(1)*4;
+			
+			// allocate space on the GPU for the updated vote direction
+			T* gpuDir;
+			cudaMalloc(&gpuDir, bytes);	
+
+			//call the kernel to calculate the new voting direction
+			cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//call the kernel to update the gradient direction
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			
+			//free allocated memory
+			cudaFree(gpuDir);
+
+		}
+		
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);
+
+			//copy the input vote image to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the update direction function
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the new gradient image back to the CPU
+			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_VOTE_ATOMIC_H
+#define STIM_CUDA_VOTE_ATOMIC_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// calculate the voting direction based on the grtadient direction
+			float theta = gpuGrad[2*i];
+			//calculate the amount of vote for the voter
+			float mag = gpuGrad[2*i + 1];
+						
+			// compute the size of window which will be checked for finding the proper voters for this pixel
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			if(xi < x && yi < y){
+			//for every line (along y)
+				for(int yr = -rmax; yr <= rmax; yr++){					
+					for(int xr = -rmax; xr <= rmax; xr++){
+						if ((yi+yr)>=0 && (yi+yr)<y && (xi+xr)>=0 && (xi+xr)<x){
+					
+							//find the location of the current pixel in the atan2 table
+							unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+							// calculate the angle between the voter and the current pixel in x and y directions
+							float atan_angle = gpuTable[ind_t];
+						
+							// check if the current pixel is located in the voting area of this voter.
+							if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+								// calculate the 1D index for the current pixel in global memory
+								unsigned int ind_g = (yi+yr)*x + (xi+xr);
+								atomicAdd(&gpuVote[ind_g], mag);
+							
+								}
+						}
+					}
+				}	
+			}
+		}
+
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+							
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+					
+			// specify  share memory
+			//unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
+			
+			//call the kernel to do the voting
+			cuda_vote <<< blocks, threads>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);		
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the Vote Data back to the CPU
+			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_VOTE_ATOMIC_SHARED_H
+#define STIM_CUDA_VOTE_ATOMIC_SHARED_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+//#include "writebackshared.cuh"
+namespace stim{
+	namespace cuda{
+
+		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			//generate a pointer to the shared memory
+			extern __shared__ float s_vote[];
+			// calculate the 2D coordinates for this current thread.
+			int bxi = blockIdx.x * blockDim.x;
+			int byi = blockIdx.y * blockDim.y;
+			int xi = bxi + threadIdx.x;
+			int yi = byi + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// calculate the voting direction based on the gradient direction
+			float theta = gpuGrad[2*i];
+			//calculate the amount of vote for the voter
+			float mag = gpuGrad[2*i + 1];
+			
+			//find the starting points and size of window, wich will be copied to the shared memory
+			int bxs = bxi - rmax;
+			int bys = byi - rmax;
+			int xwidth = 2*rmax + blockDim.x;
+			int ywidth = 2*rmax + blockDim.y;
+			//compute the coordinations of this pixel in the 2D-shared memory.
+			int sx_rx = threadIdx.x + rmax;
+			int sy_ry = threadIdx.y + rmax;
+			// compute the size of window which will be checked for finding the counters for this voter
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			//calculate some parameters for indexing shared memory
+				//calculate the total number of threads available
+				unsigned int tThreads = blockDim.x * blockDim.y;
+				//calculate the current 1D thread ID
+				unsigned int ti =  threadIdx.y * (blockDim.x) + threadIdx.x;
+				//calculate the number of iteration required
+				unsigned int In = xwidth*ywidth/tThreads + 1;
+			if(xi < x && yi < y){
+				__syncthreads();
+				//initialize the shared memory to zero				
+				for (unsigned int i = 0; i < In; i++){								
+					unsigned int sIdx0 = i * tThreads + ti;
+					if (sIdx0< xwidth*ywidth) {
+						s_vote[sIdx0] = 0;
+					}
+				}
+				__syncthreads();
+				//for every line (along y)
+				for(int yr = -rmax; yr <= rmax; yr++){	
+					//compute the position of the current voter in the shared memory along the y axis.
+					unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
+					for(int xr = -rmax; xr <= rmax; xr++){												
+						
+						//find the location of the current pixel in the atan2 table
+						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+						// calculate the angle between the voter and the current pixel in x and y directions
+						float atan_angle = gpuTable[ind_t];
+							
+						// check if the current pixel is located in the voting area of this voter.
+						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+							//compute the position of the current voter in the 2D-shared memory along the x axis.
+							unsigned int sIdx_x = (sx_rx + xr);
+							//find the 1D index of this voter in the 2D-shared memory.
+							unsigned int s_Idx = (sIdx_y1d  + sIdx_x);
+								
+							atomicAdd(&s_vote[s_Idx], mag);								
+							}
+					}
+				}	
+				//write shared memory back to global memory
+				
+				__syncthreads();						
+				for (unsigned int i = 0; i < In; i++){
+				
+					unsigned int sIdx = i * tThreads + ti;
+					if (sIdx>= xwidth*ywidth) return;
+
+					unsigned int sy = sIdx/xwidth;
+					unsigned int sx = sIdx - (sy * xwidth);
+					
+					unsigned int gx = bxs + sx;
+					unsigned int gy = bys + sy;
+					if (gx<x&& gy<y){
+						unsigned int gIdx = gy * x + gx;
+						//write shared to global memory
+						atomicAdd(&gpuVote[gIdx], s_vote[sIdx]);
+						
+					}						
+				}
+				
+			}
+		}
+
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+							
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(sqrt(max_threads), sqrt(max_threads));
+			dim3 blocks(x/threads.x + 1 , y/threads.y+1);
+					
+			// specify  share memory
+			unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*sizeof(T);
+			
+			//call the kernel to do the voting
+			cuda_vote <<< blocks, threads, share_bytes>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);		
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the Vote Data back to the CPU
+			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_VOTE_SHARED_H
+#define STIM_CUDA_VOTE_SHARED
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ float s_grad[];
+
+			//calculate the start point for this block
+			int bxi = blockIdx.x * blockDim.x;
+			int byi = blockIdx.y * blockDim.y;
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			int yi = byi + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+						
+			// define a local variable to sum the votes from the voters
+			float sum = 0;
+			
+			//calculate the width of the shared memory block
+			int xwidth = 2 * rmax + blockDim.x;
+			int ywidth = 2 * rmax + blockDim.y;
+			// compute the size of window which will be checked for finding the proper voters for this pixel
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			int bxs = bxi - rmax;			
+			int bys = byi - rmax;	
+			//compute the coordinations of this pixel in the 2D-shared memory.
+			int sx_rx = threadIdx.x + rmax;
+			int sy_ry = threadIdx.y + rmax;
+			//copy the portion of the image necessary for this block to shared memory
+			__syncthreads();
+			cpyG2S2D2ch<float>(s_grad, gpuGrad, bxs, bys, 2*xwidth, ywidth, threadIdx, blockDim, x, y);
+			__syncthreads();
+			
+			for(int yr = -rmax; yr <= rmax; yr++){
+				int yi_v = (yi + yr) ;
+				//compute the position of the current voter in the shared memory along the y axis.
+				unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
+				//if (yi+yr<y && yi+yr>=0){
+					if(xi < x && yi < y){
+
+						for(int xr = -rmax; xr <= rmax; xr++){
+					
+								//compute the position of the current voter in the 2D-shared memory along the x axis.
+								unsigned int sIdx_x = (sx_rx + xr);
+								//find the 1D index of this voter in the 2D-shared memory.
+								unsigned int s_Idx = (sIdx_y1d  + sIdx_x);
+								unsigned int s_Idx2 = s_Idx * 2;
+								
+								//find the location of this voter in the atan2 table
+								int id_t = (yr + rmax) * x_table + xr + rmax;
+
+								// calculate the angle between the pixel and the current voter in x and y directions
+								float atan_angle = gpuTable[id_t];
+												
+								// calculate the voting direction based on the grtadient direction
+								//int idx_share = xr + tx_rmax ;
+								float theta = s_grad[s_Idx2];
+								float mag = s_grad[s_Idx2 + 1];
+							
+
+								// check if the current voter is located in the voting area of this pixel.
+								if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+									sum += mag;		
+
+								}
+						}
+				
+					}
+				//}
+			}
+			if(xi < x && yi < y)
+				gpuVote[i] = sum;
+			
+		}
+
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+							
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(sqrt(max_threads), sqrt(max_threads));
+			dim3 blocks(x/threads.x + 1 , y/threads.y+1);
+			
+					
+			// specify  share memory
+			unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*2*sizeof(T);
+			
+			//call the kernel to do the voting
+			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);		
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the Vote Data back to the CPU
+			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_IVOTE_ATOMIC_H
+#define STIM_CUDA_IVOTE_ATOMIC_H
+
+#include <stim/cuda/ivote/down_sample.cuh>
+#include <stim/cuda/ivote/local_max.cuh>
+#include <stim/cuda/ivote/update_dir_global.cuh>
+//#include <stim/cuda/ivote/vote_shared_32-32.cuh>
+#include <stim/cuda/ivote/vote_atomic_shared.cuh>
+//#include <stim/cuda/ivote/re_sample.cuh>
+namespace stim{
+	namespace cuda{
+	
+	}
+}
+
+
+
+#endif
 \ No newline at end of file
@@ -35,10 +35,8 @@ namespace stim{
 			}
 		}
  
-		// Copies values from global memory to shared memory, optimizing threads
-		template<typename T>
-		__device__ void sharedMemcpy(T* dest, T* src, size_t N, size_t tid, size_t nt){
-
+		// Threaded copying of data on a CUDA device.
+		__device__ void threadedMemcpy(char* dest, char* src, size_t N, size_t tid, size_t nt){
 			size_t I = N / nt + 1;	//calculate the number of iterations required to make the copy
 			size_t xi = tid;							//initialize the source and destination index to the thread ID
 			for(size_t i = 0; i < I; i++){ 				//for each iteration
@@ -48,7 +46,37 @@ namespace stim{
 			}
 		}
  
-		
+		/// Threaded copying of 2D data on a CUDA device
+		/// @param dest is a linear destination array of size nx * ny
+		/// @param src is a 2D image stored as a linear array with a pitch of X
+		/// @param X is the number of bytes in a row of src
+		/// @param tid is a 1D id for the current thread
+		/// @param nt is the number of threads in the block
+		template<typename T>
+		__device__ void threadedMemcpy2D(T* dest, size_t nx, size_t ny, 
+										 T* src, size_t x, size_t y, size_t sX, size_t sY,
+										 size_t tid, size_t nt){
+
+			size_t vals = nx * ny;								//calculate the total number of bytes to be copied
+			size_t I = vals / nt + 1;							//calculate the number of iterations required to perform the copy
+
+			size_t src_i, dest_i;
+			size_t dest_x, dest_y, src_x, src_y;
+			for(size_t i = 0; i < I; i++){						//for each iteration
+				dest_i = i * nt + tid;						//calculate the index into the destination array
+				dest_y = dest_i / nx;
+				dest_x = dest_i - dest_y * nx;
+
+				if(dest_y < ny && dest_x < nx){
+
+					src_x = x + dest_x;
+					src_y = y + dest_y;
+
+					src_i = src_y * sX + src_x;
+					dest[dest_i] = src[src_i];
+				}
+			}
+		}
 	}
 }
  
@@ -30,7 +30,8 @@ namespace stim{
 			int byi = blockIdx.y;
  
 			//copy the portion of the image necessary for this block to shared memory
-			stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
+			//stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
+			stim::cuda::sharedMemcpy_tex2D<float>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
  
 			//calculate the thread index
 			int ti = threadIdx.x;
@@ -88,7 +89,8 @@ namespace stim{
 			int byi = blockIdx.y * blockDim.y;
  
 			//copy the portion of the image necessary for this block to shared memory
-			stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
+			//stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
+			stim::cuda::sharedMemcpy_tex2D<float>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
  
 			//calculate the thread index
 			int ti = threadIdx.y;
@@ -160,11 +160,26 @@ public:
 			exit(1);
 		}
 		allocate(cvImage.cols, cvImage.rows, cvImage.channels());			//allocate space for the image
-		T* cv_ptr = (T*)cvImage.data;
+		unsigned char* cv_ptr = (unsigned char*)cvImage.data;
 		if(C() == 1)														//if this is a single-color image, just copy the data
 			memcpy(img, cv_ptr, bytes());
 		if(C() == 3)														//if this is a 3-color image, OpenCV uses BGR interleaving
-			set_interleaved_bgr(cv_ptr, X(), Y());
+			from_opencv(cv_ptr, X(), Y());
+	}
+
+	void from_opencv(unsigned char* buffer, size_t width, size_t height){
+		allocate(width, height, 3);
+		T value;
+		size_t i;
+		for(size_t c = 0; c < C(); c++){								//copy directly
+			for(size_t y = 0; y < Y(); y++){
+				for(size_t x = 0; x < X(); x++){
+					i = y * X() * C() + x * C() + (2-c);
+					value = buffer[i];
+					img[idx(x, y, c)] = value;
+				}
+			}
+		}
 	}
  
 	//save a file
@@ -180,23 +195,35 @@ public:
 		cv::imwrite(filename, cvImage);
 	}
  
+	void set_interleaved(T* buffer, size_t width, size_t height, size_t channels){
+		allocate(width, height, channels);
+		memcpy(img, buffer, bytes());
+	}
+
 	//create an image from an interleaved buffer
 	void set_interleaved_rgb(T* buffer, size_t width, size_t height){
-		allocate(width, height, 3);
-		memcpy(img, buffer, bytes());
+		set_interleaved(buffer, width, height, 3);
 	}
  
 	void set_interleaved_bgr(T* buffer, size_t width, size_t height){
 		allocate(width, height, 3);
+		T value;
+		size_t i;
 		for(size_t c = 0; c < C(); c++){								//copy directly
 			for(size_t y = 0; y < Y(); y++){
 				for(size_t x = 0; x < X(); x++){
-					img[idx(x, y, c)] = buffer[y * X() * C() + x * C() + (2-c)];
+					i = y * X() * C() + x * C() + (2-c);
+					value = buffer[i];
+					img[idx(x, y, c)] = value;
 				}
 			}
 		}
 	}
  
+	void set_interleaved(T* buffer, size_t width, size_t height){
+		set_interleaved_rgb(buffer, width, height);
+	}
+
 	void get_interleaved_bgr(T* data){
  
 		//for each channel
@@ -71,8 +71,8 @@ public:
 	void to_cpu(){
 		if(loc == CPUmem) return;
 		else{
-			stim::complex<T>* host_E = (stim::complex<T>*) malloc(e_bytes());			//allocate space in main memory
-			HANDLE_ERROR( cudaMemcpy(host_E, E, e_bytes(), cudaMemcpyDeviceToHost) );	//copy from GPU to CPU
+			stim::complex<T>* host_E = (stim::complex<T>*) malloc(grid_bytes());			//allocate space in main memory
+			HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) );	//copy from GPU to CPU
 			HANDLE_ERROR( cudaFree(E) );												//free device memory
 			E = host_E;																	//swap pointers
 		}
+#ifndef STIM_AABB2_H
+#define STIM_AABB2_H
+
+#include <stim/cuda/cudatools/callable.h>
+
+namespace stim{
+
+/// Structure for a 2D axis aligned bounding box
+template<typename T>
+struct aabb2{
+
+//protected:
+
+	T low[2];						//top left corner position
+	T high[2];							//dimensions along x and y
+
+//public:
+
+	CUDA_CALLABLE aabb2(T x, T y){					//initialize an axis aligned bounding box of size 0 at the given position
+		low[0] = high[0] = x;					//set the position to the user specified coordinates
+		low[1] = high[1] = y;
+	}
+
+	//insert a point into the bounding box, growing the box appropriately
+	CUDA_CALLABLE void insert(T x, T y){
+		if(x < low[0]) low[0] = x;
+		if(y < low[1]) low[1] = y;
+
+		if(x > high[0]) high[0] = x;		
+		if(y > high[1]) high[1] = y;
+	}
+
+	//trim the bounding box so that the lower bounds are (x, y)
+	CUDA_CALLABLE void trim_low(T x, T y){
+		if(low[0] < x) low[0] = x;
+		if(low[1] < y) low[1] = y;
+	}
+
+	CUDA_CALLABLE void trim_high(T x, T y){
+		if(high[0] > x) high[0] = x;
+		if(high[1] > y) high[1] = y;
+	}
+
+};
+
+}
+
+
+#endif
 \ No newline at end of file