From ca99f95130a6562cb8827cbcd69c163b86592449 Mon Sep 17 00:00:00 2001
From: David <mayerich@uh.edu>
Date: Mon, 20 Jun 2016 10:19:07 -0500
Subject: [PATCH] faster implementation for stim::ivote

---
 stim/cuda/arraymath.cuh                     |   4 ++--
 stim/cuda/ivote/david_update_dir_global.cuh | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stim/cuda/ivote/update_dir_global.cuh       | 165 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------
 stim/cuda/ivote_atomic.cuh                  |   2 +-
 stim/cuda/sharedmem.cuh                     |  38 +++++++++++++++++++++++++++++++++-----
 stim/cuda/templates/conv2sep.cuh            |   6 ++++--
 stim/image/image.h                          |  37 ++++++++++++++++++++++++++++++++-----
 stim/optics/scalarfield.h                   |   4 ++--
 stim/visualization/aabb2.h                  |  49 +++++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 422 insertions(+), 54 deletions(-)
 create mode 100644 stim/cuda/ivote/david_update_dir_global.cuh
 create mode 100644 stim/visualization/aabb2.h
diff --git a/stim/cuda/arraymath.cuh b/stim/cuda/arraymath.cuh
index 548d442..fa9677b 100644
--- a/stim/cuda/arraymath.cuh
+++ b/stim/cuda/arraymath.cuh
@@ -10,8 +10,8 @@
 #include <stim/cuda/arraymath/array_atan.cuh>
 #include <stim/cuda/arraymath/array_abs.cuh>
 #include <stim/cuda/arraymath/array_cart2polar.cuh>
-#include <stim/cuda/arraymath/array_threshold.cuh>
-#include <stim/cuda/arraymath/array_add3.cuh>
+//#include <stim/cuda/arraymath/array_threshold.cuh>
+//#include <stim/cuda/arraymath/array_add3.cuh>
 namespace stim{
 	namespace cuda{
 	
diff --git a/stim/cuda/ivote/david_update_dir_global.cuh b/stim/cuda/ivote/david_update_dir_global.cuh
new file mode 100644
index 0000000..3ae624c
--- /dev/null
+++ b/stim/cuda/ivote/david_update_dir_global.cuh
@@ -0,0 +1,171 @@
+#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
+#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <math.h>
+#include "cpyToshare.cuh" 
+
+#define RMAX_TEST	8
+
+namespace stim{
+	namespace cuda{
+	
+		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
+			extern __shared__ T atan2_table[];
+			
+			//calculate the start point for this block
+			//int bxi = blockIdx.x * blockDim.x;
+
+			stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x);
+
+			__syncthreads();
+			
+			// calculate the 2D coordinates for this current thread.
+			//int xi = bxi + threadIdx.x;
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			if(xi >= x || yi >= y) return;													//if the index is outside of the image, terminate the kernel
+
+			int i = yi * x + xi;												// convert 2D coordinates to 1D
+			
+			float theta = gpuGrad[2*i];											// calculate the voting direction based on the grtadient direction - global memory fetch			
+			gpuDir[i] = 0;														//initialize the vote direction to zero			
+			float max = 0;														// define a local variable to maximum value of the vote image in the voting area for this voter
+			int id_x = 0;														// define two local variables for the x and y position of the maximum
+			int id_y = 0;
+			
+			int x_table = 2*rmax +1;											// compute the size of window which will be checked for finding the voting area for this voter
+			int rmax_sq = rmax * rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			float atan_angle;
+			float vote_c;
+			unsigned int ind_t;
+			for(int yr = -rmax; yr <= rmax; yr++){					//for each counter in the y direction
+				if (yi+yr >= 0 && yi + yr < y){									//if the counter exists (we aren't looking outside of the image)
+					for(int xr = -rmax; xr <= rmax; xr++){					//for each counter in the x direction
+						if((xr * xr + yr *yr)< rmax_sq){								//if the counter is within range of the voter
+
+							ind_t = (rmax - yr) * x_table + rmax - xr;		//calculate the index to the atan2 table							
+							atan_angle = atan2_table[ind_t];								//retrieve the direction vector from the table						
+
+							//atan_angle = atan2((float)yr, (float)xr);
+							
+							if (abs(atan_angle - theta) <phi){							// check if the current pixel is located in the voting angle of this voter.				
+								vote_c = gpuVote[(yi+yr)*x + (xi+xr)];			// find the vote value for the current counter						
+								if(vote_c>max) {								// compare the vote value of this pixel with the max value to find the maxima and its index.
+									max = vote_c;
+									id_x =  xr;
+									id_y =  yr;
+								}
+							}
+						}
+					}
+				}
+			}
+							
+			unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+			float new_angle = gpuTable[ind_m];
+
+			if(xi < x && yi < y)
+				gpuDir[i] = new_angle;
+		}										//end kernel
+
+		// this kernel updates the gradient direction by the calculated voting direction.
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+		
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+			
+			//update the gradient image with the vote direction
+			gpuGrad[2*i] = gpuDir[i];
+		}
+		
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+
+			dim3 threads(sqrt(max_threads), sqrt(max_threads));
+			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+
+			
+			
+			// allocate space on the GPU for the updated vote direction
+			T* gpuDir;
+			cudaMalloc(&gpuDir, bytes);	
+
+			size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2);
+			std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl;
+
+			//call the kernel to calculate the new voting direction
+			cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//call the kernel to update the gradient direction
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			
+			//free allocated memory
+			cudaFree(gpuDir);
+
+		}
+		
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);
+
+			//copy the input vote image to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the update direction function
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the new gradient image back to the CPU
+			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/ivote/update_dir_global.cuh b/stim/cuda/ivote/update_dir_global.cuh
index 71eeebf..f489a8b 100644
--- a/stim/cuda/ivote/update_dir_global.cuh
+++ b/stim/cuda/ivote/update_dir_global.cuh
@@ -5,25 +5,88 @@
 # include <cuda.h>
 #include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
+#include <stim/visualization/aabb2.h>
+#include <stim/visualization/colormap.h>
+#include <math.h>
 #include "cpyToshare.cuh" 
 
-#define RMAX_TEST	8
+//#define RMAX_TEST	8
 
 namespace stim{
 	namespace cuda{
-	
-		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
+
 		template<typename T>
 		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
+			extern __shared__ T S[];
+			T* shared_atan = S;
+			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
+			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
+
+			//T* shared_vote = &S[n_table];
+			//size_t template_size_x = (blockDim.x + 2 * rmax);
+			//size_t template_size_y = (blockDim.y + 2 * rmax);
+			//stim::cuda::threadedMemcpy2D((char*)shared_vote, (char*)gpuVote, template_size_x, template_size_y, x,  threadIdx.y * blockDim.x + threadIdx.x, blockDim.x * blockDim.y);
+			
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;				//calculate the 2D coordinates for this current thread.
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
 
+			if(xi >= x || yi >= y) return;								//if the index is outside of the image, terminate the kernel
+
+			int i = yi * x + xi;										//convert 2D coordinates to 1D
+			float theta = gpuGrad[2*i];									//calculate the voting direction based on the grtadient direction - global memory fetch
 			
-			//calculate the start point for this block
-			int bxi = blockIdx.x * blockDim.x;
+			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
+			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));
+			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
+			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
+
+			int x_table = 2*rmax +1;
+			int lut_i;
+			T rmax_sq = rmax * rmax;
+			T dx_sq, dy_sq;
+
+			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
+			bb.trim_high(x-1, y-1);
+
+			int by, bx;
+			int dx, dy;													//coordinate relative to (xi, yi)
+			T v;
+			T max_v = 0;												//initialize the maximum vote value to zero
+			T alpha;
+			int max_dx = bb.low[0];
+			int max_dy = bb.low[1];
+			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
+				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
+				dy_sq = dy * dy;
+				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
+					dx = bx - xi;
+					dx_sq = dx * dx;
+					lut_i = (rmax - dy) * x_table + rmax - dx;
+					alpha = shared_atan[lut_i];
+					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
+						v = gpuVote[by * x + bx];				// find the vote value for the current counter
+						if(v > max_v){
+							max_v = v;
+							max_dx = dx;
+							max_dy = dy;
+						}
+					}
+				}
+			}			
+			gpuDir[i] = atan2((T)max_dy, (T)max_dx);
+		}
+	
+		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
+		template<typename T>
+		__global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
+
 			
 			// calculate the 2D coordinates for this current thread.
-			int xi = bxi + threadIdx.x;
-			if(xi >= x) return;													//if the index is outside of the image, terminate the kernel
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;			
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			if(xi >= x || yi >= y) return;													//if the index is outside of the image, terminate the kernel
+
 			int i = yi * x + xi;												// convert 2D coordinates to 1D
 			
 			float theta = gpuGrad[2*i];											// calculate the voting direction based on the grtadient direction - global memory fetch			
@@ -37,27 +100,32 @@ namespace stim{
 			int tx_rmax = threadIdx.x + rmax;
 			float atan_angle;
 			float vote_c;
-			for(int yr = -RMAX_TEST; yr <= RMAX_TEST; yr++){
-				if (yi+yr >= 0 && yi + yr < y){
-					for(int xr = -RMAX_TEST; xr <= RMAX_TEST; xr++){
-
-						unsigned int ind_t = (RMAX_TEST - yr) * x_table + RMAX_TEST - xr;
-
-						// calculate the angle between the voter and the current pixel in x and y directions
-						atan_angle = gpuTable[ind_t];
-						
-						// find the vote value for the current counter
-						vote_c = gpuVote[(yi+yr)*x + (xi+xr)];
-						
-						// check if the current pixel is located in the voting area of this voter.
-						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-
-						// compare the vote value of this pixel with the max value to find the maxima and its index.
-							if  (vote_c>max) {
-
-								max = vote_c;
-								id_x =  xr;
-								id_y =  yr;
+			int xidx, yidx, yr_sq, xr_sq;
+			for(int yr = -rmax; yr <= rmax; yr++){
+				yidx = yi + yr;													//compute the index into the image
+				if (yidx >= 0 && yidx < y){									//if the current y-index is inside the image
+					yr_sq = yr * yr;											//compute the square of yr, to save time later
+					for(int xr = -rmax; xr <= rmax; xr++){
+						xidx = xi + xr;
+						if(xidx >= 0 && xidx < x){
+							xr_sq = xr * xr;
+							unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+							// calculate the angle between the voter and the current pixel in x and y directions
+							atan_angle = gpuTable[ind_t];
+							//atan_angle = atan2((T)yr, (T)xr);
+											
+							// check if the current pixel is located in the voting area of this voter.
+							if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+								
+								vote_c = gpuVote[yidx * x + xidx];				// find the vote value for the current counter
+							// compare the vote value of this pixel with the max value to find the maxima and its index.
+								if  (vote_c>max) {
+
+									max = vote_c;
+									id_x =  xr;
+									id_y =  yr;
+								}
 							}
 						}
 					}
@@ -70,6 +138,7 @@ namespace stim{
 			if(xi < x && yi < y)
 				gpuDir[i] = new_angle;
 		}										//end kernel
+		
 
 		// this kernel updates the gradient direction by the calculated voting direction.
 		template<typename T>
@@ -78,6 +147,8 @@ namespace stim{
 			// calculate the 2D coordinates for this current thread.
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
 			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			if(xi >= x || yi >= y) return;
 		
 			// convert 2D coordinates to 1D
 			int i = yi * x + xi;
@@ -91,23 +162,43 @@ namespace stim{
 
 			//calculate the number of bytes in the array
 			unsigned int bytes = x * y * sizeof(T);
-
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
 			
 			// allocate space on the GPU for the updated vote direction
 			T* gpuDir;
-			cudaMalloc(&gpuDir, bytes);	
+			HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );	
+
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			//dim3 threads(min(x, max_threads), 1);
+			//dim3 blocks(x/threads.x, y);
+
+			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
+			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+
+			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
+			//size_t curtain = 2 * rmax;
+			//size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain);
+			size_t shared_mem_req = table_bytes;// + template_bytes;
+			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
+
+			size_t shared_mem = stim::sharedMemPerBlock();
+			if(shared_mem_req > shared_mem){
+				std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
+				exit(1);
+			}
 
 			//call the kernel to calculate the new voting direction
-			cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+			cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+			stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
+
+			//exit(0);
+
+			threads = dim3( sqrt(max_threads), sqrt(max_threads) );
+			blocks = dim3(x/threads.x + 1, y/threads.y + 1);
 
 			//call the kernel to update the gradient direction
 			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
-			
 			//free allocated memory
-			cudaFree(gpuDir);
+			HANDLE_ERROR( cudaFree(gpuDir) );
 
 		}
 		
diff --git a/stim/cuda/ivote_atomic.cuh b/stim/cuda/ivote_atomic.cuh
index c7a827f..4890a9d 100644
--- a/stim/cuda/ivote_atomic.cuh
+++ b/stim/cuda/ivote_atomic.cuh
@@ -6,7 +6,7 @@
 #include <stim/cuda/ivote/update_dir_global.cuh>
 //#include <stim/cuda/ivote/vote_shared_32-32.cuh>
 #include <stim/cuda/ivote/vote_atomic_shared.cuh>
-#include <stim/cuda/ivote/re_sample.cuh>
+//#include <stim/cuda/ivote/re_sample.cuh>
 namespace stim{
 	namespace cuda{
 	
diff --git a/stim/cuda/sharedmem.cuh b/stim/cuda/sharedmem.cuh
index b8d4105..a5f05cc 100644
--- a/stim/cuda/sharedmem.cuh
+++ b/stim/cuda/sharedmem.cuh
@@ -35,10 +35,8 @@ namespace stim{
 			}
 		}
 
-		// Copies values from global memory to shared memory, optimizing threads
-		template<typename T>
-		__device__ void sharedMemcpy(T* dest, T* src, size_t N, size_t tid, size_t nt){
-
+		// Threaded copying of data on a CUDA device.
+		__device__ void threadedMemcpy(char* dest, char* src, size_t N, size_t tid, size_t nt){
 			size_t I = N / nt + 1;	//calculate the number of iterations required to make the copy
 			size_t xi = tid;							//initialize the source and destination index to the thread ID
 			for(size_t i = 0; i < I; i++){ 				//for each iteration
@@ -48,7 +46,37 @@ namespace stim{
 			}
 		}
 
-		
+		/// Threaded copying of 2D data on a CUDA device
+		/// @param dest is a linear destination array of size nx * ny
+		/// @param src is a 2D image stored as a linear array with a pitch of X
+		/// @param X is the number of bytes in a row of src
+		/// @param tid is a 1D id for the current thread
+		/// @param nt is the number of threads in the block
+		template<typename T>
+		__device__ void threadedMemcpy2D(T* dest, size_t nx, size_t ny, 
+										 T* src, size_t x, size_t y, size_t sX, size_t sY,
+										 size_t tid, size_t nt){
+
+			size_t vals = nx * ny;								//calculate the total number of bytes to be copied
+			size_t I = vals / nt + 1;							//calculate the number of iterations required to perform the copy
+
+			size_t src_i, dest_i;
+			size_t dest_x, dest_y, src_x, src_y;
+			for(size_t i = 0; i < I; i++){						//for each iteration
+				dest_i = i * nt + tid;						//calculate the index into the destination array
+				dest_y = dest_i / nx;
+				dest_x = dest_i - dest_y * nx;
+
+				if(dest_y < ny && dest_x < nx){
+
+					src_x = x + dest_x;
+					src_y = y + dest_y;
+
+					src_i = src_y * sX + src_x;
+					dest[dest_i] = src[src_i];
+				}
+			}
+		}
 	}
 }
 
diff --git a/stim/cuda/templates/conv2sep.cuh b/stim/cuda/templates/conv2sep.cuh
index bb33af7..d2e111a 100644
--- a/stim/cuda/templates/conv2sep.cuh
+++ b/stim/cuda/templates/conv2sep.cuh
@@ -30,7 +30,8 @@ namespace stim{
 			int byi = blockIdx.y;
 
 			//copy the portion of the image necessary for this block to shared memory
-			stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
+			//stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
+			stim::cuda::sharedMemcpy_tex2D<float>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
 
 			//calculate the thread index
 			int ti = threadIdx.x;
@@ -88,7 +89,8 @@ namespace stim{
 			int byi = blockIdx.y * blockDim.y;
 
 			//copy the portion of the image necessary for this block to shared memory
-			stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
+			//stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
+			stim::cuda::sharedMemcpy_tex2D<float>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
 
 			//calculate the thread index
 			int ti = threadIdx.y;
diff --git a/stim/image/image.h b/stim/image/image.h
index beae621..5fc8120 100644
--- a/stim/image/image.h
+++ b/stim/image/image.h
@@ -160,11 +160,26 @@ public:
 			exit(1);
 		}
 		allocate(cvImage.cols, cvImage.rows, cvImage.channels());			//allocate space for the image
-		T* cv_ptr = (T*)cvImage.data;
+		unsigned char* cv_ptr = (unsigned char*)cvImage.data;
 		if(C() == 1)														//if this is a single-color image, just copy the data
 			memcpy(img, cv_ptr, bytes());
 		if(C() == 3)														//if this is a 3-color image, OpenCV uses BGR interleaving
-			set_interleaved_bgr(cv_ptr, X(), Y());
+			from_opencv(cv_ptr, X(), Y());
+	}
+
+	void from_opencv(unsigned char* buffer, size_t width, size_t height){
+		allocate(width, height, 3);
+		T value;
+		size_t i;
+		for(size_t c = 0; c < C(); c++){								//copy directly
+			for(size_t y = 0; y < Y(); y++){
+				for(size_t x = 0; x < X(); x++){
+					i = y * X() * C() + x * C() + (2-c);
+					value = buffer[i];
+					img[idx(x, y, c)] = value;
+				}
+			}
+		}
 	}
 
 	//save a file
@@ -180,23 +195,35 @@ public:
 		cv::imwrite(filename, cvImage);
 	}
 
+	void set_interleaved(T* buffer, size_t width, size_t height, size_t channels){
+		allocate(width, height, channels);
+		memcpy(img, buffer, bytes());
+	}
+
 	//create an image from an interleaved buffer
 	void set_interleaved_rgb(T* buffer, size_t width, size_t height){
-		allocate(width, height, 3);
-		memcpy(img, buffer, bytes());
+		set_interleaved(buffer, width, height, 3);
 	}
 
 	void set_interleaved_bgr(T* buffer, size_t width, size_t height){
 		allocate(width, height, 3);
+		T value;
+		size_t i;
 		for(size_t c = 0; c < C(); c++){								//copy directly
 			for(size_t y = 0; y < Y(); y++){
 				for(size_t x = 0; x < X(); x++){
-					img[idx(x, y, c)] = buffer[y * X() * C() + x * C() + (2-c)];
+					i = y * X() * C() + x * C() + (2-c);
+					value = buffer[i];
+					img[idx(x, y, c)] = value;
 				}
 			}
 		}
 	}
 
+	void set_interleaved(T* buffer, size_t width, size_t height){
+		set_interleaved_rgb(buffer, width, height);
+	}
+
 	void get_interleaved_bgr(T* data){
 
 		//for each channel
diff --git a/stim/optics/scalarfield.h b/stim/optics/scalarfield.h
index 1c73002..4d7cf37 100644
--- a/stim/optics/scalarfield.h
+++ b/stim/optics/scalarfield.h
@@ -71,8 +71,8 @@ public:
 	void to_cpu(){
 		if(loc == CPUmem) return;
 		else{
-			stim::complex<T>* host_E = (stim::complex<T>*) malloc(e_bytes());			//allocate space in main memory
-			HANDLE_ERROR( cudaMemcpy(host_E, E, e_bytes(), cudaMemcpyDeviceToHost) );	//copy from GPU to CPU
+			stim::complex<T>* host_E = (stim::complex<T>*) malloc(grid_bytes());			//allocate space in main memory
+			HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) );	//copy from GPU to CPU
 			HANDLE_ERROR( cudaFree(E) );												//free device memory
 			E = host_E;																	//swap pointers
 		}
diff --git a/stim/visualization/aabb2.h b/stim/visualization/aabb2.h
new file mode 100644
index 0000000..fe38f74
--- /dev/null
+++ b/stim/visualization/aabb2.h
@@ -0,0 +1,49 @@
+#ifndef STIM_AABB2_H
+#define STIM_AABB2_H
+
+#include <stim/cuda/cudatools/callable.h>
+
+namespace stim{
+
+/// Structure for a 2D axis aligned bounding box
+template<typename T>
+struct aabb2{
+
+//protected:
+
+	T low[2];						//top left corner position
+	T high[2];							//dimensions along x and y
+
+//public:
+
+	CUDA_CALLABLE aabb2(T x, T y){					//initialize an axis aligned bounding box of size 0 at the given position
+		low[0] = high[0] = x;					//set the position to the user specified coordinates
+		low[1] = high[1] = y;
+	}
+
+	//insert a point into the bounding box, growing the box appropriately
+	CUDA_CALLABLE void insert(T x, T y){
+		if(x < low[0]) low[0] = x;
+		if(y < low[1]) low[1] = y;
+
+		if(x > high[0]) high[0] = x;		
+		if(y > high[1]) high[1] = y;
+	}
+
+	//trim the bounding box so that the lower bounds are (x, y)
+	CUDA_CALLABLE void trim_low(T x, T y){
+		if(low[0] < x) low[0] = x;
+		if(low[1] < y) low[1] = y;
+	}
+
+	CUDA_CALLABLE void trim_high(T x, T y){
+		if(high[0] > x) high[0] = x;
+		if(high[1] > y) high[1] = y;
+	}
+
+};
+
+}
+
+
+#endif
\ No newline at end of file
--
libgit2 0.21.4