Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib

Pavel Govyadinov
2 parents c0e09133 ad2123e6
Showing 40 changed files with 3697 additions and 1625 deletions Show diff stats
stim/biomodels/network.h
stim/cuda/cudatools/callable.h
stim/cuda/cudatools/devices.h
stim/cuda/sharedmem.cuh
stim/envi/bil.h
stim/envi/bip.h
stim/envi/bsq.h
stim/envi/envi.h
stim/image/image.h
stim/math/bessel.h
stim/math/circle.h
stim/math/complex.h
stim/math/constants.h
stim/math/fft.h
stim/math/legendre.h
stim/math/matrix.h
stim/math/meshgrid.h
stim/math/plane.h
stim/math/plane_old.h
stim/math/quad.h
@@ -8,7 +8,7 @@
 #include <algorithm>
 #include <string.h>
 #include <math.h>
-#include <stim/math/vector.h>
+#include <stim/math/vec3.h>
 #include <stim/visualization/obj.h>
 #include <stim/visualization/cylinder.h>
 #include <ANN/ANN.h>
@@ -37,7 +37,7 @@ class network{
 		/// Constructor - creates an edge from a list of points by calling the stim::fiber constructor
 		///@param p is an array of positions in space
-		edge(std::vector< stim::vec<T> > p) : cylinder<T>(p){}
+		edge(std::vector< stim::vec3<T> > p) : cylinder<T>(p){}
 		/// Copy constructor creates an edge from a fiber
 		edge(stim::cylinder<T> f) : cylinder<T>(f) {}
@@ -61,20 +61,20 @@ class network{
 	};
 	///Node class that stores the physical position of the node as well as the edges it is connected to (edges that connect to it), As well as any additional data necessary.
-	class vertex : public stim::vec<T>
+	class vertex : public stim::vec3<T>
 	{
 		public:
 			//std::vector<unsigned int> edges;		//indices of edges connected to this node.
 			std::vector<unsigned int> e[2];			//indices of edges going out (e[0]) and coming in (e[1])
-			//stim::vec<T> p;						//position of this node in physical space.
+			//stim::vec3<T> p;						//position of this node in physical space.
 			//constructor takes a stim::vec
-			vertex(stim::vec<T> p) : stim::vec<T>(p){}
+			vertex(stim::vec3<T> p) : stim::vec3<T>(p){}
 			/// Output the vertex information as a string
 			std::string	str(){
 				std::stringstream ss;
-				ss<<"\t(x, y, z) = "<<stim::vec<T>::str();
+				ss<<"\t(x, y, z) = "<<stim::vec3<T>::str();
 				if(e[0].size() > 0){
 					ss<<"\t> ";
@@ -129,7 +129,11 @@ public:
 			std::vector< stim::vec<T> > c;				//allocate an array of points for the vessel centerline
 			O.getLine(l, c);							//get the fiber centerline
-			edge new_edge = c;							//create an edge from the given centerline
+			std::vector< stim::vec3<T> > c3(c.size());
+			for(size_t j = 0; j < c.size(); j++)
+				c3[j] = c[j];
+
+			edge new_edge = c3;							//create an edge from the given centerline
 			unsigned int I = new_edge.size();			//calculate the number of points on the centerline
 			//get the first and last vertex IDs for the line
@@ -222,7 +226,7 @@ public:
 	/// Evaluates exp(-x / (2 * std * std)).
 	/// NOTE(review): x is not squared here; presumably callers pass an already-squared distance - confirm.
 	/// @param x is the value the function is evaluated at
 	/// @param std is the standard deviation (by default std = 25)
 	float gaussianFunction(float x, float std=25){
 		const float denominator = 2*std*std;		//2 * sigma^2
 		return exp(-x/denominator);
 	}
     // stim 3d vector to annpoint of 3 dimensions
-	void stim2ann(ANNpoint &a, stim::vec<T> b){
+	void stim2ann(ANNpoint &a, stim::vec3<T> b){
 		a[0] = b[0];
 		a[1] = b[1];
 		a[2] = b[2];
@@ -278,10 +282,9 @@ public:
 		ANNdistArray dists = new ANNdist[1];     // near neighbor distances
 		ANNidxArray nnIdx = new ANNidx[1];				// near neighbor indices // allocate near neigh indices
-		stim::vec<T> p0, p1;
-		float m0, m1;
+		stim::vec3<T> p0, p1;
+		float m1;
 		float M = 0;											//stores the total metric value
-		float l;												//stores the segment length
 		float L = 0;											//stores the total network length
 		ANNpoint queryPt = annAllocPt(3);
 		for(unsigned e = 0; e < R.E.size(); e++){					//for each edge in A
@@ -292,7 +295,7 @@ public:
 				p1 = R.E[e][p];									//get the next point in the edge
 				stim2ann(queryPt, p1);
 				kdt->annkSearch( queryPt, 1, nnIdx, dists, eps);	//find the distance between A and the current network
-				m1 = 1.0f - gaussianFunction(dists[0], sigma);		//calculate the metric value based on the distance
+				m1 = 1.0f - gaussianFunction((float)dists[0], sigma);		//calculate the metric value based on the distance
 				R.E[e].set_mag(m1, p, 1);						//set the error for the second point in the segment
 			}
@@ -2,7 +2,7 @@
 //define the CUDA_CALLABLE macro (will prefix all members)
 #ifdef __CUDACC__
-#define CUDA_CALLABLE __host__ __device__
+#define CUDA_CALLABLE __host__ __device__ inline
 #else
 #define CUDA_CALLABLE
 #endif
@@ -15,7 +15,7 @@ int maxThreadsPerBlock()
 }
 extern "C"
-int sharedMemPerBlock()
+size_t sharedMemPerBlock()
 {
 	int device;
 	cudaGetDevice(&device);		//get the id of the current device
@@ -23,6 +23,16 @@ int sharedMemPerBlock()
 	cudaGetDeviceProperties(&props, device);
 	return props.sharedMemPerBlock;
 }
+
+/// Returns the total constant memory (in bytes) reported for the current CUDA device.
+/// Returns 0 if the current device or its properties cannot be queried, instead of
+/// reading a field from an uninitialized property structure.
+extern "C"
+size_t constMem()
+{
+	int device;
+	if(cudaGetDevice(&device) != cudaSuccess)		//get the id of the current device
+		return 0;
+	cudaDeviceProp props;		//device property structure
+	if(cudaGetDeviceProperties(&props, device) != cudaSuccess)
+		return 0;				//props is not valid if the query failed
+	return props.totalConstMem;
+}
 }	//end namespace rts
 #endif
@@ -5,7 +5,7 @@
 namespace stim{
 	namespace cuda{
-		// Copies values from global memory to shared memory, optimizing threads
+		// Copies values from texture memory to shared memory, optimizing threads
 		template<typename T>
 		__device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
 										 unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
@@ -35,6 +35,19 @@ namespace stim{
 			}
 		}
+		// Copies values from global memory to shared memory, optimizing threads
+		/// Thread tid copies elements tid, tid + nt, tid + 2*nt, ... for as long as the index is below N.
+		/// No barrier is issued here: the caller has to synchronize before reading elements of dest
+		/// that were written by other threads.
+		/// @param dest is the destination array (at least N elements)
+		/// @param src is the source array (at least N elements)
+		/// @param N is the number of elements to copy
+		/// @param tid is the index of the calling thread among the threads performing the copy
+		/// @param nt is the number of threads performing the copy
+		template<typename T>
+		__device__ void sharedMemcpy(T* dest, T* src, size_t N, size_t tid, size_t nt){
+
+			if(nt == 0) return;							//no participating threads: nothing to copy and no valid stride
+			for(size_t xi = tid; xi < N; xi += nt)		//stride through the copy region, starting at the thread ID
+				dest[xi] = src[xi];						//perform the copy
+		}
+
+
 	}
 }
@@ -884,7 +884,7 @@ public:
 	///		using the following indexing: i = p*B + b
 	/// @param matrix is the destination for the pixel data
 	/// @param mask is the mask
-	bool sift(T* matrix, unsigned char* mask = NULL){
+	bool sift(T* matrix, unsigned char* mask = NULL, bool PROGRESS = false){
 		size_t Lbytes = sizeof(T) * X();
 		T* line = (T*) malloc( Lbytes );					//allocate space for a line
@@ -903,6 +903,7 @@ public:
 						pl++;								//increment the pixel pointer
 					}
 				}
+				if(PROGRESS) progress = (double)( (y+1)*Z() + 1) / (double)(Y() * Z()) * 100;
 			}
 			p += pl;										//add the line increment to the running pixel index
 		}
@@ -817,7 +817,7 @@ public:
 	///		using the following indexing: i = p*B + b
 	/// @param matrix is the destination for the pixel data
 	/// @param mask is the mask
-	bool sift(T* matrix, unsigned char* mask = NULL){
+	bool sift(T* matrix, unsigned char* mask = NULL, bool PROGRESS = false){
 		size_t Bbytes = sizeof(T) * Z();
 		size_t XY = X() * Y();
 		T* band = (T*) malloc( Bbytes );					//allocate space for a line
@@ -836,6 +836,7 @@ public:
 			}
 			else
 				file.seekg(Bbytes, std::ios::cur);			//otherwise skip this band
+			if(PROGRESS) progress = (double)(xy+1) / (double)XY * 100;
 		}
 		return true;
 	}
@@ -809,7 +809,7 @@ public:
 	///		using the following indexing: i = p*B + b
 	/// @param matrix is the destination for the pixel data
 	/// @param mask is the mask
-	bool sift(T* matrix, unsigned char* mask = NULL){
+	bool sift(T* matrix, unsigned char* mask = NULL, bool PROGRESS = false){
 		unsigned long long XY = X() * Y(); 					//Number of XY pixels
 		unsigned long long L = XY * sizeof(T); 				//size of XY plane (in bytes)
@@ -827,9 +827,8 @@ public:
 				if(mask == NULL || mask[xy] != 0){				//if the pixel is valid
 					matrix[i*Z() + b] = band_image[xy];			//copy it to the appropriate point in the values[] array
 					i++;
-					//std::cout<<i<<std::endl;
 				}
-
+				if(PROGRESS) progress = (double)(xy+1) / (double)XY * 100;
 			}
 		}
@@ -670,13 +670,13 @@ public:
 	///		using the following indexing: i = b*P + p
 	/// @param matrix is the destination for the pixel data
 	/// @param p is the mask
-	bool sift(void* matrix, unsigned char* p = NULL){
+	bool sift(void* matrix, unsigned char* p = NULL, bool PROGRESS = false){
 		if (header.interleave == envi_header::BSQ){		//if the infile is bsq file
 			if (header.data_type == envi_header::float32)
-				return ((bsq<float>*)file)->sift((float*)matrix, p);
+				return ((bsq<float>*)file)->sift((float*)matrix, p, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bsq<double>*)file)->sift((double*)matrix, p);
+				return ((bsq<double>*)file)->sift((double*)matrix, p, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -685,9 +685,9 @@ public:
 		if (header.interleave == envi_header::BIP){
 			if (header.data_type == envi_header::float32)
-				return ((bip<float>*)file)->sift((float*)matrix, p);
+				return ((bip<float>*)file)->sift((float*)matrix, p, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bip<double>*)file)->sift((double*)matrix, p);
+				return ((bip<double>*)file)->sift((double*)matrix, p, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -695,9 +695,9 @@ public:
 		}
 		if (header.interleave == envi_header::BIL){
 			if (header.data_type == envi_header::float32)
-				return ((bil<float>*)file)->sift((float*)matrix, p);
+				return ((bil<float>*)file)->sift((float*)matrix, p, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bil<double>*)file)->sift((double*)matrix, p);
+				return ((bil<double>*)file)->sift((double*)matrix, p, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -6,6 +6,7 @@
 #include <vector>
 #include <iostream>
 #include <limits>
+#include <typeinfo>
 namespace stim{
 /// This static class provides the STIM interface for loading, saving, and storing 2D images.
@@ -24,8 +25,6 @@ class image{
 	size_t Y() const { return R[2]; }
 	size_t C() const { return R[0]; }
-	size_t bytes(){ return size() * sizeof(T); }
-
 	void init(){								//initializes all variables, assumes no memory is allocated
 		memset(R, 0, sizeof(size_t) * 3);		//set the resolution and number of channels to zero
 		img = NULL;
@@ -33,7 +32,6 @@ class image{
 	/// Releases the pixel buffer (if any) and resets the pointer, so that calling
 	/// unalloc() again (directly or through allocate()) cannot free the same memory twice.
 	void unalloc(){								//frees any resources associated with the image
 		if(img)	free(img);						//if memory has been allocated, free it
-		img=NULL;		
+		img = NULL;								//never leave a dangling pointer behind
 	}
@@ -44,16 +42,15 @@ class image{
 	void allocate(){
 		unalloc();
-		img = (T*) malloc( bytes() );	//allocate memory
-		memset(img, 0, bytes());
+		img = (T*) malloc( sizeof(T) * R[0] * R[1] * R[2] );	//allocate memory
 	}
 	void allocate(size_t x, size_t y, size_t c){	//allocate memory based on the resolution
-		unalloc();
 		R[0] = c; R[1] = x; R[2] = y;				//set the resolution
 		allocate();									//allocate memory
 	}
+	size_t bytes(){ return size() * sizeof(T); }
 	size_t idx(size_t x, size_t y, size_t c = 0){
 		return y * C() * X() + x * C() + c;
@@ -61,13 +58,23 @@ class image{
 	int cv_type(){
-		if(std::is_same<T, unsigned char>::value)	return CV_MAKETYPE(CV_8U, (int)C());
-		if(std::is_same<T, char>::value)			return CV_MAKETYPE(CV_8S, (int)C());
-		if(std::is_same<T, unsigned short>::value)	return CV_MAKETYPE(CV_16U, (int)C());
-		if(std::is_same<T, short>::value)			return CV_MAKETYPE(CV_16S, (int)C());
-		if(std::is_same<T, int>::value)				return CV_MAKETYPE(CV_32S, (int)C());
-		if(std::is_same<T, float>::value)			return CV_MAKETYPE(CV_32F, (int)C());
-		if(std::is_same<T, double>::value)			return CV_MAKETYPE(CV_64F, (int)C());
+		// The following is C++ 11 code, but causes problems on some compilers (ex. nvcc). Below is my best approximation to a solution
+
+		//if(std::is_same<T, unsigned char>::value)	return CV_MAKETYPE(CV_8U, (int)C());
+		//if(std::is_same<T, char>::value)			return CV_MAKETYPE(CV_8S, (int)C());
+		//if(std::is_same<T, unsigned short>::value)	return CV_MAKETYPE(CV_16U, (int)C());
+		//if(std::is_same<T, short>::value)			return CV_MAKETYPE(CV_16S, (int)C());
+		//if(std::is_same<T, int>::value)				return CV_MAKETYPE(CV_32S, (int)C());
+		//if(std::is_same<T, float>::value)			return CV_MAKETYPE(CV_32F, (int)C());
+		//if(std::is_same<T, double>::value)			return CV_MAKETYPE(CV_64F, (int)C());
+
+		if(typeid(T) == typeid(unsigned char))		return CV_MAKETYPE(CV_8U, (int)C());
+		if(typeid(T) == typeid(char))				return CV_MAKETYPE(CV_8S, (int)C());
+		if(typeid(T) == typeid(unsigned short))		return CV_MAKETYPE(CV_16U, (int)C());
+		if(typeid(T) == typeid(short))				return CV_MAKETYPE(CV_16S, (int)C());
+		if(typeid(T) == typeid(int))				return CV_MAKETYPE(CV_32S, (int)C());
+		if(typeid(T) == typeid(float))				return CV_MAKETYPE(CV_32F, (int)C());
+		if(typeid(T) == typeid(double))				return CV_MAKETYPE(CV_64F, (int)C());
 		std::cout<<"ERROR in stim::image::cv_type - no valid data type found"<<std::endl;
 		exit(1);
@@ -75,15 +82,26 @@ class image{
 	/// Returns the value for "white" based on the dynamic range (assumes white is 1.0 for floating point images)
 	/// Unsigned integer types return their maximum representable value; float and double return 1.
 	/// Any other pixel type prints an error and terminates the program.
 	T white(){
-		if(std::is_same<T, unsigned char>::value)		return UCHAR_MAX;
-		if(std::is_same<T, unsigned short>::value)		return SHRT_MAX;
-		if(std::is_same<T, unsigned>::value)			return UINT_MAX;
-		if(std::is_same<T, unsigned long>::value)		return ULONG_MAX;
-		if(std::is_same<T, unsigned long long>::value)	return ULLONG_MAX;
-		if(std::is_same<T, float>::value)				return 1.0f;
-		if(std::is_same<T, double>::value)				return 1.0;
+		// The following is C++ 11 code, but causes problems on some compilers (ex. nvcc). Below is my best approximation to a solution
+
+		//if(std::is_same<T, unsigned char>::value)		return UCHAR_MAX;
+		//if(std::is_same<T, unsigned short>::value)		return USHRT_MAX;
+		//if(std::is_same<T, unsigned>::value)			return UINT_MAX;
+		//if(std::is_same<T, unsigned long>::value)		return ULONG_MAX;
+		//if(std::is_same<T, unsigned long long>::value)	return ULLONG_MAX;
+		//if(std::is_same<T, float>::value)				return 1.0f;
+		//if(std::is_same<T, double>::value)				return 1.0;
+
+		if(typeid(T) == typeid(unsigned char))		return UCHAR_MAX;
+		if(typeid(T) == typeid(unsigned short))		return USHRT_MAX;	//maximum of the unsigned type (SHRT_MAX is only half the range)
+		if(typeid(T) == typeid(unsigned))			return UINT_MAX;
+		if(typeid(T) == typeid(unsigned long))		return ULONG_MAX;
+		if(typeid(T) == typeid(unsigned long long))	return ULLONG_MAX;
+		if(typeid(T) == typeid(float))				return 1.0f;
+		if(typeid(T) == typeid(double))				return 1.0;
 		std::cout<<"ERROR in stim::image::white - no white value known for this data type"<<std::endl;
+		exit(1);
 	}
@@ -91,9 +109,7 @@ class image{
 public:
 	/// Default constructor - creates an empty image object
-	image(){
-		init();						//initialize all variables to zero, don't allocate any memory
-	}							
+	image(){ init(); }							//initialize all variables to zero, don't allocate any memory
 	/// Constructor with a filename - loads the specified file
 	image(std::string filename){				//constructor initialize the image with an image file
@@ -115,7 +131,7 @@ public:
 	}
 	/// Copy constructor - duplicates an image object
-	image(const stim::image<T> &I){
+	image(const stim::image<T>& I){
 		init();
 		allocate(I.X(), I.Y(), I.C());
 		memcpy(img, I.img, bytes());
@@ -127,6 +143,7 @@ public:
 	}
 	stim::image<T>& operator=(const stim::image<T>& I){
+		init();
 		if(&I == this)									//handle self-assignment
 			return *this;
 		allocate(I.X(), I.Y(), I.C());
@@ -139,22 +156,15 @@ public:
 		cv::Mat cvImage = cv::imread(filename, CV_LOAD_IMAGE_UNCHANGED);	//use OpenCV to open the image file
 		if(!cvImage.data){
-			std::cout<<"ERROR stim::image::load() - unable to find image "<<filename<<" ["<<__FILE__<<" (line "<<__LINE__<<")]"<<std::endl;
+			std::cout<<"ERROR stim::image::load() - unable to find image "<<filename<<std::endl;
 			exit(1);
 		}
 		allocate(cvImage.cols, cvImage.rows, cvImage.channels());			//allocate space for the image
-		T* cv_ptr = (T*) cvImage.data;
-		if(C() == 1)
-		{
-	//if this is a single-color image, just copy the data
-			memcpy(img, cv_ptr, bytes());     
-		}
-		if(C() == 3)
-		{														//if this is a 3-color image, OpenCV uses BGR interleaving
+		T* cv_ptr = (T*)cvImage.data;
+		if(C() == 1)														//if this is a single-color image, just copy the data
+			memcpy(img, cv_ptr, bytes());
+		if(C() == 3)														//if this is a 3-color image, OpenCV uses BGR interleaving
 			set_interleaved_bgr(cv_ptr, X(), Y());
-		}
-
-		cvImage.release();	
 	}
 	//save a file
@@ -168,18 +178,16 @@ public:
 			get_interleaved_bgr(buffer);
 		cv::Mat cvImage((int)Y(), (int)X(), cv_type(), buffer);
 		cv::imwrite(filename, cvImage);
-		cvImage.release();
-		free(buffer);
 	}
 	//create an image from an interleaved buffer
-	void set_interleaved_rgb(T* buffer, size_t width, size_t height, size_t channels = 3){
-		allocate(width, height, channels);
+	void set_interleaved_rgb(T* buffer, size_t width, size_t height){
+		allocate(width, height, 3);
 		memcpy(img, buffer, bytes());
 	}
-	void set_interleaved_bgr(T* buffer, size_t width, size_t height, size_t channels = 3){
-		allocate(width, height, channels);
+	void set_interleaved_bgr(T* buffer, size_t width, size_t height){
+		allocate(width, height, 3);
 		for(size_t c = 0; c < C(); c++){								//copy directly
 			for(size_t y = 0; y < Y(); y++){
 				for(size_t x = 0; x < X(); x++){
@@ -359,34 +367,6 @@ public:
 		return r;								//return the inverted image
 	}
-	
-	/// Invert an image by calculating I1 = alpha - I0, where alpha is the maximum image value
-	image<T> invert(){
-		size_t N = size();						//calculate the total number of values in the image
-		image<T> r(X(), Y(), C());				//allocate space for the resulting image
-		T white_val = maxv();
-		for(size_t n = 0; n < N; n++)
-			r.img[n] = white_val - img[n];		//perform the inversion
-
-		return r;								//return the inverted image
-	}
-
-	///crops the image from x1 to x0 and y1 to y0 and returns a new (smaller) image.
-	image<T> crop(int x0, int x1, int y0, int y1)
-	{
-		
-		image<T> ret(x1-x0, y1-y0, C());
-		int newWidth = x1-x0;
-		int destidx, srcidx;
-		///for each row, cut what amount of data from the original and put it into the new copy.
-		for(int i = 0; i < (y1-y0); i++)
-		{
-			destidx = i*newWidth*C(); ///destination index one per each row
-			srcidx = ((i+(y0))*X()+x0)*C(); ///source index, one per each row.
-			memcpy(&ret.img[destidx], &img[srcidx], sizeof(T)*newWidth*C());
-		}		
-		return ret;
-	}
 	image<T> srgb2lab(){
 		std::cout<<"ERROR stim::image::srgb2lab - function has been broken, re-implement."<<std::endl;
@@ -405,7 +385,6 @@ public:
 		exit(1);
 	}
-
 	// leila's code for non_interleaving data in 3D
 	//create an data set from an interleaved buffer
 	void set_interleaved3(T* buffer, size_t width, size_t height, size_t depth, size_t channels = 3){
@@ -17,6 +17,11 @@ static complex&lt;double&gt; czero(0.0,0.0);
 template< typename P >
 P gamma(P x)
 {
+	const P EPS = numeric_limits<P>::epsilon();
+	const P FPMIN_MAG = numeric_limits<P>::min();
+	const P FPMIN = numeric_limits<P>::lowest();
+	const P FPMAX = numeric_limits<P>::max();
+
     int i,k,m;
     P ga,gr,r,z;
@@ -47,7 +52,7 @@ P gamma(P x)
        -0.54e-14,
         0.14e-14};
-    if (x > 171.0) return 1e308;    // This value is an overflow flag.
+    if (x > 171.0) return FPMAX;    // This value is an overflow flag.
     if (x == (int)x) {
         if (x > 0.0) {
             ga = 1.0;               // use factorial
@@ -56,7 +61,7 @@ P gamma(P x)
             }
          }
          else
-            ga = 1e308;
+            ga = FPMAX;
      }
      else {
         if (fabs(x) > 1.0) {
@@ -89,6 +94,11 @@ template&lt;typename P&gt;
 int bessjy01a(P x,P &j0,P &j1,P &y0,P &y1,
     P &j0p,P &j1p,P &y0p,P &y1p)
 {
+	const P EPS = numeric_limits<P>::epsilon();
+	const P FPMIN_MAG = numeric_limits<P>::min();
+	const P FPMIN = numeric_limits<P>::lowest();
+	const P FPMAX = numeric_limits<P>::max();
+
     P x2,r,ec,w0,w1,r0,r1,cs0,cs1;
     P cu,p0,q0,p1,q1,t1,t2;
     int k,kz;
@@ -157,12 +167,12 @@ int bessjy01a(P x,P &amp;j0,P &amp;j1,P &amp;y0,P &amp;y1,
     if (x == 0.0) {
         j0 = 1.0;
         j1 = 0.0;
-        y0 = -1e308;
-        y1 = -1e308;
+        y0 = -FPMIN;
+        y1 = -FPMIN;
         j0p = 0.0;
         j1p = 0.5;
-        y0p = 1e308;
-        y1p = 1e308;
+        y0p = FPMAX;
+        y1p = FPMAX;
         return 0;
     }
     x2 = x*x;
@@ -329,7 +339,7 @@ int msta1(P x,int mp)
     for (i=0;i<20;i++) {
         nn = (int)(n1-(n1-n0)/(1.0-f0/f1));
         f = 0.5*log10(6.28*nn)-nn*log10(1.36*a0/nn)-mp;
-        if (abs(nn-n1) < 1) break;
+        if (std::abs(nn-n1) < 1) break;
         n0 = n1;
         f0 = f1;
         n1 = nn;
@@ -361,7 +371,7 @@ int msta2(P x,int n,int mp)
     for (i=0;i<20;i++) {
         nn = (int)(n1-(n1-n0)/(1.0-f0/f1));
         f = 0.5*log10(6.28*nn)-nn*log10(1.36*a0/nn)-obj;
-        if (abs(nn-n1) < 1) break;
+        if (std::abs(nn-n1) < 1) break;
         n0 = n1;
         f0 = f1;
         n1 = nn;
@@ -596,21 +606,26 @@ int bessjyv(P v,P x,P &amp;vm,P *jv,P *yv,
     P b,ec,w0,w1,bju0,bju1,pv0,pv1,byvk;
     int j,k,l,m,n,kz;
+	const P EPS = numeric_limits<P>::epsilon();
+	const P FPMIN_MAG = numeric_limits<P>::min();
+	const P FPMIN = numeric_limits<P>::lowest();
+	const P FPMAX = numeric_limits<P>::max();
+
     x2 = x*x;
     n = (int)v;
     v0 = v-n;
     if ((x < 0.0) || (v < 0.0)) return 1;
-    if (x < 1e-15) {
+    if (x < EPS) {
         for (k=0;k<=n;k++) {
             jv[k] = 0.0;
-            yv[k] = -1e308;
+            yv[k] = FPMIN;
             djv[k] = 0.0;
-            dyv[k] = 1e308;
+            dyv[k] = FPMAX;
             if (v0 == 0.0) {
                 jv[0] = 1.0;
                 djv[1] = 0.5;
             }
-            else djv[0] = 1e308;
+            else djv[0] = FPMAX;
         }
         vm = v;
         return 0;
@@ -623,7 +638,7 @@ int bessjyv(P v,P x,P &amp;vm,P *jv,P *yv,
             for (k=1;k<=40;k++) {
                 r *= -0.25*x2/(k*(k+vl));
                 bjvl += r;
-                if (fabs(r) < fabs(bjvl)*1e-15) break;
+                if (fabs(r) < fabs(bjvl)*EPS) break;
             }
             vg = 1.0 + vl;
             a = pow(0.5*x,vl)/gamma(vg);
@@ -686,7 +701,7 @@ int bessjyv(P v,P x,P &amp;vm,P *jv,P *yv,
         if (m < n) n = m;
         else m = msta2(x,n,15);
         f2 = 0.0;
-        f1 = 1.0e-100;
+        f1 = FPMIN_MAG;
         for (k=m;k>=0;k--) {
             f = 2.0*(v0+k+1.0)*f1/x-f2;
             if (k <= n) jv[k] = f;
@@ -763,20 +778,26 @@ int bessjyv(P v,P x,P &amp;vm,P *jv,P *yv,
 template<typename P>
 int bessjyv_sph(int v, P z, P &vm, P* cjv,
-    P* cyv, P* cjvp, P* cyvp)
-{
+    P* cyv, P* cjvp, P* cyvp){
+	
     //first, compute the bessel functions of fractional order
-    bessjyv(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
+    bessjyv<P>(v + (P)0.5, z, vm, cjv, cyv, cjvp, cyvp);
+
+	if(z == 0){													//handle degenerate case of z = 0
+		memset(cjv, 0, sizeof(P) * (v+1));
+		cjv[0] = 1;
+	}
     //iterate through each and scale
-    for(int n = 0; n<=v; n++)
-    {
+    for(int n = 0; n<=v; n++){
-        cjv[n] = cjv[n] * sqrt(rtsPI/(z * 2.0));
-        cyv[n] = cyv[n] * sqrt(rtsPI/(z * 2.0));
+		if(z != 0){												//handle degenerate case of z = 0
+			cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
+			cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
+		}
-        cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(rtsPI / (z * 2.0));
-        cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(rtsPI / (z * 2.0));
+        cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0));
+        cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0));
     }
 	return 0;
@@ -1237,7 +1258,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
     P a0,v0,pv0,pv1,vl,ga,gb,vg,vv,w0,w1,ya0,yak,ya1,wa;
     int j,n,k,kz,l,lb,lb0,m;
-    a0 = abs(z);
+    a0 = ::abs(z);
     z1 = z;
     z2 = z*z;
     n = (int)v;
@@ -1265,7 +1286,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
         vm = v;
         return 0;
     }
-    if (real(z1) < 0.0) z1 = -z;
+    if (::real(z1) < 0.0) z1 = -z;
     if (a0 <= 12.0) {
         for (l=0;l<2;l++) {
             vl = v0+l;
@@ -1274,7 +1295,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
             for (k=1;k<=40;k++) {
                 cr *= -0.25*z2/(k*(k+vl));
                 cjvl += cr;
-                if (abs(cr) < abs(cjvl)*eps) break;
+                if (::abs(cr) < ::abs(cjvl)*eps) break;
             }
            vg = 1.0 + vl;
            ga = gamma(vg);
@@ -1327,7 +1348,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
                 for (k=1;k<=40;k++) {
                     cr *= -0.25*z2/(k*(k-vl));
                     cjvl += cr;
-                    if (abs(cr) < abs(cjvl)*eps) break;
+                    if (::abs(cr) < ::abs(cjvl)*eps) break;
                 }
                 vg = 1.0-vl;
                 gb = gamma(vg);
@@ -1360,16 +1381,16 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
             cyv1 = M_2_PI*(cec*cjv1-1.0/z1-0.25*z1*cs1);
         }
     }
-    if (real(z) < 0.0) {
+    if (::real(z) < 0.0) {
         cfac0 = exp(pv0*cii);
         cfac1 = exp(pv1*cii);
-        if (imag(z) < 0.0) {
+        if (::imag(z) < 0.0) {
             cyv0 = cfac0*cyv0-(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
             cyv1 = cfac1*cyv1-(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
             cjv0 /= cfac0;
             cjv1 /= cfac1;
         }
-        else if (imag(z) > 0.0) {
+        else if (::imag(z) > 0.0) {
             cyv0 = cyv0/cfac0+(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
             cyv1 = cyv1/cfac1+(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
             cjv0 *= cfac0;
@@ -1400,7 +1421,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
             cf2 = cf1;
             cf1 = cf;
         }
-        if (abs(cjv0) > abs(cjv1)) cs = cjv0/cf;
+        if (::abs(cjv0) > ::abs(cjv1)) cs = cjv0/cf;
         else cs = cjv1/cf2;
         for (k=0;k<=n;k++) {
             cjv[k] *= cs;
@@ -1412,21 +1433,21 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
     }
     cyv[0] = cyv0;
     cyv[1] = cyv1;
-    ya0 = abs(cyv0);
+    ya0 = ::abs(cyv0);
     lb = 0;
     cg0 = cyv0;
     cg1 = cyv1;
     for (k=2;k<=n;k++) {
         cyk = 2.0*(v0+k-1.0)*cg1/z-cg0;
-        yak = abs(cyk);
-        ya1 = abs(cg0);
+        yak = ::abs(cyk);
+        ya1 = ::abs(cg0);
         if ((yak < ya0) && (yak< ya1)) lb = k;
         cyv[k] = cyk;
         cg0 = cg1;
         cg1 = cyk;
     }
     lb0 = 0;
-    if ((lb > 4) && (imag(z) != 0.0)) {
+    if ((lb > 4) && (::imag(z) != 0.0)) {
         while(lb != lb0) {
             ch2 = cone;
             ch1 = czero;
@@ -1449,7 +1470,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
             cp21 = ch2;
             if (lb == n)
                 cjv[lb+1] = 2.0*(lb+v0)*cjv[lb]/z-cjv[lb-1];
-            if (abs(cjv[0]) > abs(cjv[1])) {
+            if (::abs(cjv[0]) > ::abs(cjv[1])) {
                 cyv[lb+1] = (cjv[lb+1]*cyv0-2.0*cp11/(M_PI*z))/cjv[0];
                 cyv[lb] = (cjv[lb]*cyv0+2.0*cp12/(M_PI*z))/cjv[0];
             }
@@ -1474,8 +1495,8 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
                 cyl2 = cylk;
             }
             for (k=2;k<=n;k++) {
-                wa = abs(cyv[k]);
-                if (wa < abs(cyv[k-1])) lb = k;
+                wa = ::abs(cyv[k]);
+                if (wa < ::abs(cyv[k-1])) lb = k;
             }
         }
     }
@@ -1494,15 +1515,21 @@ int cbessjyva_sph(int v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
     //first, compute the bessel functions of fractional order
     cbessjyva<P>(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
+	if(z == 0){													//handle degenerate case of z = 0
+		memset(cjv, 0, sizeof(P) * (v+1));
+		cjv[0] = 1;
+	}
+
     //iterate through each and scale
     for(int n = 0; n<=v; n++)
     {
+		if(z != 0){												//handle degenerate case of z = 0
+			cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
+			cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
+		}
-        cjv[n] = cjv[n] * sqrt(rtsPI/(z * 2.0));
-        cyv[n] = cyv[n] * sqrt(rtsPI/(z * 2.0));
-
-        cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(rtsPI / (z * 2.0));
-        cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(rtsPI / (z * 2.0));
+        cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0));
+        cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0));
     }
 	return 0;
@@ -17,7 +17,7 @@ class circle : plane&lt;T&gt;
 private:
-	stim::vec<T> Y;
+	stim::vec3<T> Y;
 	CUDA_CALLABLE void
 	init()
@@ -48,7 +48,7 @@ public:
 	circle(T size, T z_pos = (T)0) : plane<T>()
 	{
 		init();
-		center(stim::vec<T>(0,0,z_pos));
+		center(stim::vec3<T>(0,0,z_pos));
 		scale(size);
 	}
@@ -56,7 +56,7 @@ public:
 	///@param c: x,y,z location of the center.
 	///@param n: x,y,z direction of the normal.	
 	CUDA_CALLABLE
-	circle(vec<T> c, vec<T> n = vec<T>(0,0,1)) : plane<T>()
+	circle(vec3<T> c, vec3<T> n = vec3<T>(0,0,1)) : plane<T>()
 	{
 		center(c);
 		normal(n);
@@ -68,7 +68,7 @@ public:
 	///@param s: size of the rectangle.
 	///@param n: x,y,z direction of the normal.
 	CUDA_CALLABLE 
-	circle(vec<T> c, T s, vec<T> n = vec<T>(0,0,1)) : plane<T>()
+	circle(vec3<T> c, T s, vec3<T> n = vec3<T>(0,0,1)) : plane<T>()
 	{
 		init();
 		center(c);
@@ -82,7 +82,7 @@ public:
 	///@param n: x,y,z direction of the normal.
 	///@param u: x,y,z direction for the zero vector (from where the rotation starts)
 	CUDA_CALLABLE
-	circle(vec<T> c, T s, vec<T> n = vec<T>(0,0,1), vec<T> u = vec<T>(1, 0, 0)) : plane<T>()
+	circle(vec3<T> c, T s, vec3<T> n = vec3<T>(0,0,1), vec3<T> u = vec3<T>(1, 0, 0)) : plane<T>()
 	{
 		init();
 		setU(u);
@@ -103,16 +103,15 @@ public:
 	///sets the normal for the circle
 	///@param n: x,y,z direction of the normal.
 	CUDA_CALLABLE void
-	normal(vec<T> n)
+	normal(vec3<T> n)
 	{
 		rotate(n, Y);
 	}
 	///sets the center of the circle.
 	///@param p: x,y,z location of the center.
-	CUDA_CALLABLE T
-	center(vec<T> p)
-	{
+	CUDA_CALLABLE void
+	center(vec3<T> p){
 		this->P = p;
 	}
@@ -127,17 +126,17 @@ public:
 	}
 	///get the world space value given the planar coordinates a, b in [0, 1]
-	CUDA_CALLABLE stim::vec<T> p(T a, T b)
+	CUDA_CALLABLE stim::vec3<T> p(T a, T b)
 	{
-		stim::vec<T> result;
+		stim::vec3<T> result;
-		vec<T> A = this->P - this->U * (T)0.5 - Y * (T)0.5;
+		vec3<T> A = this->P - this->U * (T)0.5 - Y * (T)0.5;
 		result = A + this->U * a + Y * b;
 		return result;
 	}
 	///parenthesis operator returns the world space given rectangular coordinates a and b in [0 1]
-	CUDA_CALLABLE stim::vec<T> operator()(T a, T b)
+	CUDA_CALLABLE stim::vec3<T> operator()(T a, T b)
 	{
 		return p(a,b);
 	}
@@ -145,11 +144,11 @@ public:
 	///returns a vector with the points on the initialized circle.
 	///connecting the points results in a circle.
 	///@param n: integer for the number of points representing the circle.
-	std::vector<stim::vec<T> >
+	std::vector<stim::vec3<T> >
 	getPoints(int n)
 	{
-		std::vector<stim::vec<T> > result;
-		stim::vec<T> point;
+		std::vector<stim::vec3<T> > result;
+		stim::vec3<T> point;
 		T x,y;
 		float step = 360.0/(float) n;
 		for(float j = 0; j <= 360.0; j += step)
@@ -164,7 +163,7 @@ public:
 	///returns a vector with the points on the initialized circle.
 	///connecting the points results in a circle.
 	///@param n: integer for the number of points representing the circle.
-	stim::vec<T>
+	stim::vec3<T>
 	p(T theta)
 	{
 		T x,y;
-/*RTS Complex number class.  This class is CUDA compatible,
-and can therefore be used in CUDA code and on CUDA devices.
-*/
+/// CUDA compatible complex number class
-#ifndef RTS_COMPLEX
-#define RTS_COMPLEX
+#ifndef STIM_COMPLEX
+#define STIM_COMPLEX
-#include "../cuda/callable.h"
+#include "../cuda/cudatools/callable.h"
 #include <cmath>
 #include <string>
 #include <sstream>
@@ -13,6 +11,7 @@ and can therefore be used in CUDA code and on CUDA devices.
 namespace stim
 {
+    /// Names a component of a complex value: real part, imaginary part, or magnitude (NOTE(review): its users are not visible in this hunk - confirm against callers)
+    enum complexComponentType {complexReal, complexImaginary, complexMag};
 template <class T>
 struct complex
@@ -230,12 +229,6 @@ struct complex
 		return result;
 	}
-	/*CUDA_CALLABLE complex<T> pow(int y)
-	{
-
-        return pow((double)y);
-	}*/
-
 	CUDA_CALLABLE complex<T> pow(T y)
 	{
 		complex<T> result;
@@ -328,8 +321,31 @@ struct complex
 		return *this;
 	}
+	
+
 };
+/// Extract the real component of every element in an array of complex values
+/// @param r is the output array that receives n real values
+/// @param c is the input array of n complex values
+/// @param n is the number of elements to convert
+template<typename T>
+static void real(T* r, complex<T>* c, size_t n){
+	complex<T>* const stop = c + n;			//one past the last input element
+	for(; c != stop; ++c, ++r)				//walk both arrays front to back
+		*r = c->real();
+}
+
+/// Extract the imaginary component of every element in an array of complex values
+/// @param r is the output array that receives n imaginary values
+/// @param c is the input array of n complex values
+/// @param n is the number of elements to convert
+template<typename T>
+static void imag(T* r, complex<T>* c, size_t n){
+	complex<T>* const stop = c + n;			//one past the last input element
+	for(; c != stop; ++c, ++r)				//walk both arrays front to back
+		*r = c->imag();
+}
+
+/// Compute the magnitude of every element in an array of complex values
+/// @param m is the output array that receives n magnitudes
+/// @param c is the input array of n complex values
+/// @param n is the number of elements to process
+template<typename T>
+static void abs(T* m, complex<T>* c, size_t n){
+	complex<T>* const stop = c + n;			//one past the last input element
+	for(; c != stop; ++c, ++m)				//walk both arrays front to back
+		*m = c->abs();
+}
+
 }	//end stim namespace
 //addition
@@ -432,17 +448,6 @@ CUDA_CALLABLE static T imag(stim::complex<T> a)
     return a.i;
 }
-//trigonometric functions
-//template<class A>
-/*CUDA_CALLABLE static stim::complex<float> sinf(const stim::complex<float> x)
-{
-	stim::complex<float> result;
-	result.r = sinf(x.r) * coshf(x.i);
-	result.i = cosf(x.r) * sinhf(x.i);
-
-	return result;
-}*/
-
 template<class A>
 CUDA_CALLABLE stim::complex<A> sin(const stim::complex<A> x)
 {
@@ -453,17 +458,6 @@ CUDA_CALLABLE stim::complex<A> sin(const stim::complex<A> x)
 	return result;
 }
-//floating point template
-//template<class A>
-/*CUDA_CALLABLE static stim::complex<float> cosf(const stim::complex<float> x)
-{
-	stim::complex<float> result;
-	result.r = cosf(x.r) * coshf(x.i);
-	result.i = -(sinf(x.r) * sinhf(x.i));
-
-	return result;
-}*/
-
 template<class A>
 CUDA_CALLABLE stim::complex<A> cos(const stim::complex<A> x)
 {
@@ -496,10 +490,4 @@ std::istream& operator>>(std::istream& is, stim::complex<A>& x)
     return is;		//return the stream
 }
-//#if __GNUC__ > 3 && __GNUC_MINOR__ > 7
-//template<class T> using rtsComplex = stim::complex<T>;
-//#endif
-
-
-
 #endif
-#ifndef RTS_CONSTANTS_H
-#define RTS_CONSTANTS_H
+#ifndef STIM_CONSTANTS_H
+#define STIM_CONSTANTS_H
-#define stimPI 	3.14159
-#define stimTAU	2 * rtsPI
+#include "stim/cuda/cudatools/callable.h"
+/// Mathematical constants shared across the library, stored as double-precision values
+namespace stim{
+	/// The constant pi; the literal carries more digits than a double can represent, so it is rounded on conversion
+	const double PI		=	3.1415926535897932384626433832795028841971693993751058209749445923078164062862;
+	/// Twice stim::PI (2 * pi)
+	const double TAU	=	2 * stim::PI;
+}
 #endif
+#ifndef STIM_FFT_H
+#define STIM_FFT_H
+
+namespace stim{
+
+	/// Circularly shift a 2D array stored as xdim rows of ydim contiguous elements.
+	/// The input element at (i, j) is written to ((i + xshift) % xdim, (j + yshift) % ydim) in the output.
+	/// @param out is the destination array (xdim * ydim elements); the copy is out-of-place, so it must not overlap in
+	/// @param in is the source array (xdim * ydim elements)
+	/// @param xdim is the number of rows (the slow index)
+	/// @param ydim is the number of elements in each row (the fast index)
+	/// @param xshift is the number of positions each row index moves forward
+	/// @param yshift is the number of positions each column index moves forward
+	template<class T>
+	void circshift(T *out, const T *in, size_t xdim, size_t ydim, size_t xshift, size_t yshift){
+		size_t i, j, ii, jj;
+		for (i =0; i < xdim; i++) {
+			ii = (i + xshift) % xdim;
+			for (j = 0; j < ydim; j++) {
+				jj = (j + yshift) % ydim;
+				out[ii * ydim + jj] = in[i * ydim + j];
+			}
+		}
+	}
+
+	/// Swap the halves of a 2D array along both dimensions (forward shift by floor(dim / 2)).
+	/// @param out is the destination array (xdim * ydim elements), distinct from in
+	/// @param in is the source array (xdim * ydim elements)
+	/// @param xdim is the number of rows
+	/// @param ydim is the number of elements in each row
+	template<typename T>
+	void cpu_fftshift(T* out, T* in, size_t xdim, size_t ydim){
+		circshift(out, in, xdim, ydim, xdim/2, ydim/2);
+	}
+
+	/// Undo cpu_fftshift: applying cpu_fftshift and then cpu_ifftshift restores the original array.
+	/// @param out is the destination array (xdim * ydim elements), distinct from in
+	/// @param in is the source array (xdim * ydim elements)
+	/// @param xdim is the number of rows
+	/// @param ydim is the number of elements in each row
+	template<typename T>
+	void cpu_ifftshift(T* out, T* in, size_t xdim, size_t ydim){
+		//the inverse shift is dim - floor(dim / 2) = ceil(dim / 2); this only equals
+		//	the forward shift when dim is even, so odd sizes need the distinct value
+		circshift(out, in, xdim, ydim, xdim - xdim/2, ydim - ydim/2);
+	}
+
+
+}
+
+#endif
 \ No newline at end of file
 #ifndef RTS_LEGENDRE_H
 #define RTS_LEGENDRE_H
-#include "rts/cuda/callable.h"
+#include "../cuda/cudatools/callable.h"
 namespace stim{
@@ -24,9 +24,11 @@ CUDA_CALLABLE void shift_legendre(int n, T x, T& P0, T& P1)
 	P1 = Pnew;
 }
+/// Iteratively evaluates the Legendre polynomials for orders l = [0 n]
 template <typename T>
 CUDA_CALLABLE void legendre(int n, T x, T* P)
 {
+	if(n < 0) return;
     P[0] = 1;
     if(n >= 1)
@@ -5,6 +5,7 @@
 #include <string.h>
 #include <iostream>
 #include <stim/math/vector.h>
+#include <stim/math/vec3.h>
 #include <stim/cuda/cudatools/callable.h>
 namespace stim{
@@ -50,10 +51,8 @@ struct matrix
 		return *this;
 	}
-
 	template<typename Y>
-	CUDA_CALLABLE vec<Y> operator*(vec<Y> rhs)
-	{
+	vec<Y> operator*(vec<Y> rhs){
 		unsigned int N = rhs.size();
 		vec<Y> result;
@@ -66,6 +65,16 @@ struct matrix
 		return result;
 	}
+	/// Multiply this matrix by a 3D vector, using the entries (r, c) with r and c in [0, 3)
+	/// @param rhs is the vector on the right-hand side of the product
+	/// @return the vector whose r-th component is the sum over c of (*this)(r, c) * rhs[c]
+	template<typename Y>
+	CUDA_CALLABLE vec3<Y> operator*(vec3<Y> rhs){
+		vec3<Y> product((Y)0);								//every component starts at zero
+		for(int row = 0; row < 3; ++row){
+			for(int col = 0; col < 3; ++col){
+				product[row] += (*this)(row, col) * rhs[col];	//accumulate one term of the row-by-vector product
+			}
+		}
+		return product;
+	}
+
 	std::string toStr()
 	{
 		std::stringstream ss;
@@ -82,10 +91,6 @@ struct matrix
 		return ss.str();
 	}
-
-
-
-
 };
 }	//end namespace stim
+#ifndef STIM_MESHGRID_H
+#define STIM_MESHGRID_H
+
+namespace stim{
+
+	/// Build a 2D coordinate grid from two 1D coordinate arrays (see Matlab's meshgrid)
+	/// Both outputs hold ny groups of nx contiguous values: group yi of X is a copy of x,
+	///		and group yi of Y is filled with the single value y[yi].
+	/// @param X is an array of nx * ny elements that receives the X coordinate of each 2D point
+	/// @param Y is an array of nx * ny elements that receives the Y coordinate of each 2D point
+	/// @param x is an [nx] array that provides the positions of grid points in the x direction
+	/// @param nx is the number of grid points in the x direction
+	/// @param y is an [ny] array that provides the positions of grid points in the y direction
+	/// @param ny is the number of grid points in the y direction
+	template<typename T>
+	void meshgrid(T* X, T* Y, T* x, size_t nx, T* y, size_t ny){
+		for(size_t row = 0; row < ny; row++){						//one group of nx outputs per y value
+			T* Xrow = X + row * nx;									//start of this group in X
+			T* Yrow = Y + row * nx;									//start of this group in Y
+			for(size_t col = 0; col < nx; col++){					//one output per x value
+				Xrow[col] = x[col];
+				Yrow[col] = y[row];
+			}
+		}
+	}
+
+	/// Creates an array of n equally spaced values that start at xmin and step towards xmax
+	/// Element i is (1 - i/n) * xmin + (i/n) * xmax, so the spacing is (xmax - xmin) / n and
+	///		xmax itself is never produced (the last element is one step short of it).
+	/// The fraction i/n is evaluated in type T.
+	/// @param X is an array of length n that stores the values
+	/// @param xmin is the value stored in the first element
+	/// @param xmax is the upper limit that the values approach
+	/// @param n is the number of points in the array
+	template<typename T>
+	void linspace(T* X, T xmin, T xmax, size_t n){
+		size_t k = 0;
+		while(k < n){
+			const T alpha = (T)k / (T)n;					//interpolation weight in [0, 1)
+			X[k] = (1 - alpha) * xmin + alpha * xmax;
+			++k;
+		}
+	}
+
+
+}
+
+
+#endif
 \ No newline at end of file
@@ -2,7 +2,7 @@
 #define STIM_PLANE_H
 #include <iostream>
-#include <stim/math/vector.h>
+#include <stim/math/vec3.h>
 #include <stim/cuda/cudatools/callable.h>
 #include <stim/math/quaternion.h>
@@ -22,17 +22,17 @@ template <typename T>
 class plane
 {
 	protected:
-		stim::vec<T> P;
-		stim::vec<T> N;
-		stim::vec<T> U;
+		stim::vec3<T> P;
+		stim::vec3<T> N;
+		stim::vec3<T> U;
 		///Initializes the plane with standard coordinates.
 		///
 		CUDA_CALLABLE void init()
 		{
-			P = stim::vec<T>(0, 0, 0);
-			N = stim::vec<T>(0, 0, 1);
-			U = stim::vec<T>(1, 0, 0);
+			P = stim::vec3<T>(0, 0, 0);
+			N = stim::vec3<T>(0, 0, 1);
+			U = stim::vec3<T>(1, 0, 0);
 		}
 	public:
@@ -42,7 +42,7 @@ class plane
 			init();
 		}
-		CUDA_CALLABLE plane(vec<T> n, vec<T> p = vec<T>(0, 0, 0))
+		CUDA_CALLABLE plane(vec3<T> n, vec3<T> p = vec3<T>(0, 0, 0))
 		{
 			init();
 			P = p;
@@ -56,11 +56,11 @@ class plane
 		}
 		//create a plane from three points (a triangle)
-		CUDA_CALLABLE plane(vec<T> a, vec<T> b, vec<T> c)
+		CUDA_CALLABLE plane(vec3<T> a, vec3<T> b, vec3<T> c)
 		{
 			init();
 			P = c;
-			stim::vec<T> n = (c - a).cross(b - a);
+			stim::vec3<T> n = (c - a).cross(b - a);
 			try
 			{
 				if(n.len() != 0)
@@ -84,17 +84,17 @@ class plane
 		}
-		CUDA_CALLABLE vec<T> n()
+		CUDA_CALLABLE vec3<T> n()
 		{
 			return N;
 		}
-		CUDA_CALLABLE vec<T> p()
+		CUDA_CALLABLE vec3<T> p()
 		{
 			return P;
 		}
-		CUDA_CALLABLE vec<T> u()
+		CUDA_CALLABLE vec3<T> u()
 		{
 			return U;
 		}
@@ -107,7 +107,7 @@ class plane
 		}
 		//determines how a vector v intersects the plane (1 = intersects front, 0 = within plane, -1 = intersects back)
-		CUDA_CALLABLE int face(vec<T> v){
+		CUDA_CALLABLE int face(vec3<T> v){
 			T dprod = v.dot(N);             //get the dot product between v and N
@@ -121,46 +121,46 @@ class plane
 		}
 		//determine on which side of the plane a point lies (1 = front, 0 = on the plane, -1 = back)
-		CUDA_CALLABLE int side(vec<T> p){
+		CUDA_CALLABLE int side(vec3<T> p){
-			vec<T> v = p - P;    //get the vector from P to the query point p
+			vec3<T> v = p - P;    //get the vector from P to the query point p
 			return face(v);
 		}
 		//compute the component of v that is perpendicular to the plane
-		CUDA_CALLABLE vec<T> perpendicular(vec<T> v){
+		CUDA_CALLABLE vec3<T> perpendicular(vec3<T> v){
 			return N * v.dot(N);
 		}
 		//compute the projection of v in the plane
-		CUDA_CALLABLE vec<T> parallel(vec<T> v){
+		CUDA_CALLABLE vec3<T> parallel(vec3<T> v){
 			return v - perpendicular(v);
 		}
-		CUDA_CALLABLE void setU(vec<T> v)
+		CUDA_CALLABLE void setU(vec3<T> v)
 		{
 			U = (parallel(v.norm())).norm();		
 		}
-		CUDA_CALLABLE void decompose(vec<T> v, vec<T>& para, vec<T>& perp){
+		CUDA_CALLABLE void decompose(vec3<T> v, vec3<T>& para, vec3<T>& perp){
 			perp = N * v.dot(N);
 			para = v - perp;
 		}
 		//get both the parallel and perpendicular components of a vector v w.r.t. the plane
+		//	v_perp receives the component of v along the plane normal N (N scaled by v.dot(N)),
+		//	v_par receives the remainder (v - v_perp); this matches decompose()
-		CUDA_CALLABLE void project(vec<T> v, vec<T> &v_par, vec<T> &v_perp){
+		CUDA_CALLABLE void project(vec3<T> v, vec3<T> &v_par, vec3<T> &v_perp){
-			v_perp = v.dot(N);
+			v_perp = N * v.dot(N);		//scale the normal: assigning the bare dot product would set all three components to the same scalar
 			v_par = v - v_perp;
 		}
 		//compute the reflection of v off of the plane
-		CUDA_CALLABLE vec<T> reflect(vec<T> v){
+		CUDA_CALLABLE vec3<T> reflect(vec3<T> v){
 			//compute the reflection using N_prime as the plane normal
-			vec<T> par = parallel(v);
-			vec<T> r = (-v) + par * 2;
+			vec3<T> par = parallel(v);
+			vec3<T> r = (-v) + par * 2;
 			return r;
 		}
@@ -184,7 +184,7 @@ class plane
 		}
-		CUDA_CALLABLE void rotate(vec<T> n)
+		CUDA_CALLABLE void rotate(vec3<T> n)
 		{
 			quaternion<T> q;
 			q.CreateRotation(N, n);
@@ -194,7 +194,7 @@ class plane
 		}
-		CUDA_CALLABLE void rotate(vec<T> n, vec<T> &Y)
+		CUDA_CALLABLE void rotate(vec3<T> n, vec3<T> &Y)
 		{
 			quaternion<T> q;
 			q.CreateRotation(N, n);
@@ -205,7 +205,7 @@ class plane
 		}
-		CUDA_CALLABLE void rotate(vec<T> n, vec<T> &X, vec<T> &Y)
+		CUDA_CALLABLE void rotate(vec3<T> n, vec3<T> &X, vec3<T> &Y)
 		{
 			quaternion<T> q;
 			q.CreateRotation(N, n);
-#ifndef RTS_PLANE_H
-#define RTS_PLANE_H
-
-#include <iostream>
-#include <stim/math/vector.h>
-#include "rts/cuda/callable.h"
-
-
-namespace stim{
-template <typename T, int D> class plane;
-}
-
-template <typename T, int D>
-CUDA_CALLABLE stim::plane<T, D> operator-(stim::plane<T, D> v);
-
-namespace stim{
-
-template <class T, int D = 3>
-class plane{
-
-	//a plane is defined by a point and a normal
-
-private:
-
-	vec<T, D> P;	//point on the plane
-	vec<T, D> N;	//plane normal
-
-	CUDA_CALLABLE void init(){
-		P = vec<T, D>(0, 0, 0);
-		N = vec<T, D>(0, 0, 1);
-	}
-
-
-public:
-
-	//default constructor
-	CUDA_CALLABLE plane(){
-		init();
-	}
-	
-	CUDA_CALLABLE plane(vec<T, D> n, vec<T, D> p = vec<T, D>(0, 0, 0)){
-		P = p;
-		N = n.norm();
-	}
-
-	CUDA_CALLABLE plane(T z_pos){
-		init();
-		P[2] = z_pos;
-	}
-
-	//create a plane from three points (a triangle)
-	CUDA_CALLABLE plane(vec<T, D> a, vec<T, D> b, vec<T, D> c){
-		P = c;
-		N = (c - a).cross(b - a);
-		if(N.len() == 0)	//handle the degenerate case when two vectors are the same, N = 0
-			N = 0;
-		else
-			N = N.norm();
-	}
-
-	template< typename U >
-	CUDA_CALLABLE operator plane<U, D>(){
-
-		plane<U, D> result(N, P);
-		return result;
-	}
-
-	CUDA_CALLABLE vec<T, D> norm(){
-		return N;
-	}
-
-	CUDA_CALLABLE vec<T, D> p(){
-		return P;
-	}
-
-	//flip the plane front-to-back
-	CUDA_CALLABLE plane<T, D> flip(){
-		plane<T, D> result = *this;
-		result.N = -result.N;
-		return result;
-	}
-
-	//determines how a vector v intersects the plane (1 = intersects front, 0 = within plane, -1 = intersects back)
-	CUDA_CALLABLE int face(vec<T, D> v){
-		
-		T dprod = v.dot(N);		//get the dot product between v and N
-
-		//conditional returns the appropriate value
-		if(dprod < 0)
-			return 1;
-		else if(dprod > 0)
-			return -1;
-		else
-			return 0;
-	}
-
-	//determine on which side of the plane a point lies (1 = front, 0 = on the plane, -1 = back)
-	CUDA_CALLABLE int side(vec<T, D> p){
-
-		vec<T, D> v = p - P;	//get the vector from P to the query point p
-
-		return face(v);
-	}
-
-	//compute the component of v that is perpendicular to the plane
-	CUDA_CALLABLE vec<T, D> perpendicular(vec<T, D> v){
-		return N * v.dot(N);
-	}
-
-	//compute the projection of v in the plane
-	CUDA_CALLABLE vec<T, D> parallel(vec<T, D> v){
-		return v - perpendicular(v);
-	}
-
-	CUDA_CALLABLE void decompose(vec<T, D> v, vec<T, D>& para, vec<T, D>& perp){
-		perp = N * v.dot(N);
-		para = v - perp;
-	}
-
-	//get both the parallel and perpendicular components of a vector v w.r.t. the plane
-	CUDA_CALLABLE void project(vec<T, D> v, vec<T, D> &v_par, vec<T, D> &v_perp){
-
-		v_perp = v.dot(N);
-		v_par = v - v_perp;
-	}
-
-	//compute the reflection of v off of the plane
-	CUDA_CALLABLE vec<T, D> reflect(vec<T, D> v){
-
-		//compute the reflection using N_prime as the plane normal
-		vec<T, D> par = parallel(v);
-		vec<T, D> r = (-v) + par * 2;
-
-		/*std::cout<<"----------------REFLECT-----------------------------"<<std::endl;
-		std::cout<<str()<<std::endl;
-		std::cout<<"v: "<<v<<std::endl;
-		std::cout<<"r: "<<r<<std::endl;
-		std::cout<<"Perpendicular: "<<perpendicular(v)<<std::endl;
-		std::cout<<"Parallel: "<<par<<std::endl;*/
-		return r;
-
-	}
-
-	CUDA_CALLABLE rts::plane<T, D> operator-()
-	{
-		rts::plane<T, D> p = *this;
-
-		//negate the normal vector
-		p.N = -p.N;
-
-		return p;
-	}
-
-	//output a string
-	std::string str(){
-		std::stringstream ss;
-		ss<<"P: "<<P<<std::endl;
-		ss<<"N: "<<N;
-		return ss.str();
-	}
-
-	///////Friendship
-	//friend CUDA_CALLABLE rts::plane<T, D> operator- <> (rts::plane<T, D> v);
-
-
-
-};
-
-}
-
-//arithmetic operators
-
-//negative operator flips the plane (front to back)
-//template <typename T, int D>
-
-
-
-
-#endif
-#ifndef RTS_QUAD_H
-#define RTS_QUAD_H
-
-//enable CUDA_CALLABLE macro
-#include <stim/cuda/callable.h>
-#include <stim/math/vector.h>
-#include <stim/math/triangle.h>
-#include <stim/math/quaternion.h>
-#include <iostream>
-#include <iomanip>
-#include <algorithm>
-
-namespace stim{
-
-//template for a quadangle class in ND space
-template <class T, int N = 3>
-struct quad
-{
-	/*
-		B------------------>C
-		^                   ^
-		|                   |
-		Y                   |
-		|                   |
-		|                   |
-		A---------X-------->O
-	*/
-
-	/*T A[N];
-	T B[N];
-	T C[N];*/
-
-	rts::vec<T, N> A;
-	rts::vec<T, N> X;
-	rts::vec<T, N> Y;
-
-
-	CUDA_CALLABLE quad()
-	{
-
-	}
-
-	CUDA_CALLABLE quad(vec<T, N> a, vec<T, N> b, vec<T, N> c)
-	{
-
-		A = a;		
-		Y = b - a;
-		X = c - a - Y;
-
-	}
-
-	/*******************************************************************
-	Constructor - create a quad from a position, normal, and rotation
-	*******************************************************************/
-	CUDA_CALLABLE quad(rts::vec<T, N> c, rts::vec<T, N> normal, T width, T height, T theta)
-	{
-
-        //compute the X direction - start along world-space X
-        Y = rts::vec<T, N>(0, 1, 0);
-        if(Y == normal)
-            Y = rts::vec<T, N>(0, 0, 1);
-
-        X = Y.cross(normal).norm();
-
-        std::cout<<X<<std::endl;
-
-        //rotate the X axis by theta radians
-        rts::quaternion<T> q;
-        q.CreateRotation(theta, normal);
-        X = q.toMatrix3() * X;
-        Y = normal.cross(X);
-
-        //normalize everything
-        X = X.norm();
-        Y = Y.norm();
-
-        //scale to match the quad width and height
-        X = X * width;
-        Y = Y * height;
-
-        //set the corner of the plane
-        A = c - X * 0.5f - Y * 0.5f;
-
-        std::cout<<X<<std::endl;
-	}
-
-	//boolean comparison
-	bool operator==(const quad<T, N> & rhs)
-	{
-		if(A == rhs.A && X == rhs.X && Y == rhs.Y)
-			return true;
-		else
-			return false;
-	}
-
-	/*******************************************
-	Return the normal for the quad
-	*******************************************/
-	CUDA_CALLABLE rts::vec<T, N> n()
-	{
-        return (X.cross(Y)).norm();
-	}
-
-	CUDA_CALLABLE rts::vec<T, N> p(T a, T b)
-	{
-		rts::vec<T, N> result;
-		//given the two parameters a, b = [0 1], returns the position in world space
-		result = A + X * a + Y * b;
-
-		return result;
-	}
-
-	CUDA_CALLABLE rts::vec<T, N> operator()(T a, T b)
-	{
-		return p(a, b);
-	}
-
-	std::string str()
-	{
-		std::stringstream ss;
-
-		ss<<std::left<<"B="<<setfill('-')<<setw(20)<<A + Y<<">"<<"C="<<A + Y + X<<std::endl;
-		ss<<setfill(' ')<<setw(23)<<"|"<<"|"<<std::endl<<setw(23)<<"|"<<"|"<<std::endl;
-		ss<<std::left<<"A="<<setfill('-')<<setw(20)<<A<<">"<<"D="<<A + X;
-
-        return ss.str();
-
-	}
-
-	CUDA_CALLABLE quad<T, N> operator*(T rhs)
-	{
-		//scales the plane by a scalar value
-
-		//compute the center point
-		rts::vec<T, N> c = A + X*0.5f + Y*0.5f;
-
-		//create the new quadangle
-		quad<T, N> result;
-		result.X = X * rhs;
-		result.Y = Y * rhs;
-		result.A = c - result.X*0.5f - result.Y*0.5f;
-
-		return result;
-
-	}
-
-	CUDA_CALLABLE T dist(vec<T, N> p)
-	{
-        //compute the distance between a point and this quad
-
-        //first break the quad up into two triangles
-        triangle<T, N> T0(A, A+X, A+Y);
-        triangle<T, N> T1(A+X+Y, A+X, A+Y);
-
-
-        T d0 = T0.dist(p);
-        T d1 = T1.dist(p);
-
-        if(d0 < d1)
-            return d0;
-        else
-            return d1;
-	}
-
-	CUDA_CALLABLE T dist_max(vec<T, N> p)
-	{
-        T da = (A - p).len();
-        T db = (A+X - p).len();
-        T dc = (A+Y - p).len();
-        T dd = (A+X+Y - p).len();
-
-        return std::max( da, std::max(db, std::max(dc, dd) ) );
-	}
-};
-
-}	//end namespace rts
-
-template <typename T, int N>
-std::ostream& operator<<(std::ostream& os, rts::quad<T, N> R)
-{
-    os<<R.str();
-    return os;
-}
-
-
-#endif
@@ -26,13 +26,13 @@ public:
 	CUDA_CALLABLE void CreateRotation(T theta, T ux, T uy, T uz){
-		vec<T> u(ux, uy, uz);
+		vec3<T> u(ux, uy, uz);
 		CreateRotation(theta, u);		
 	}
-	CUDA_CALLABLE void CreateRotation(T theta, vec<T> u){
+	CUDA_CALLABLE void CreateRotation(T theta, vec3<T> u){
-		vec<T> u_hat = u.norm();
+		vec3<T> u_hat = u.norm();
 		//assign the given Euler rotation to this quaternion
 		w = (T)cos(theta/2);
@@ -41,9 +41,11 @@ public:
 		z = u_hat[2]*(T)sin(theta/2);
 	}
-	void CreateRotation(vec<T> from, vec<T> to){
+	CUDA_CALLABLE void CreateRotation(vec3<T> from, vec3<T> to){
-		vec<T> r = from.cross(to);			//compute the rotation vector
+		from = from.norm();
+		to = to.norm();
+		vec3<T> r = from.cross(to);			//compute the rotation vector
 		T theta = asin(r.len());				//compute the angle of the rotation about r
 		//deal with a zero vector (both k and kn point in the same direction)
 		if(theta == (T)0){
@@ -28,13 +28,10 @@ class rect : plane <T>
 		O---------X--------->
 	*/
-private:
-
-	stim::vec<T> X;
-	stim::vec<T> Y;
-
-	
+protected:
+	stim::vec3<T> X;
+	stim::vec3<T> Y;
 public:
@@ -65,7 +62,7 @@ public:
 	///create a rectangle from a center point, normal
 	///@param c: x,y,z location of the center.
 	///@param n: x,y,z direction of the normal.
-	CUDA_CALLABLE rect(vec<T> c, vec<T> n = vec<T>(0, 0, 1))
+	CUDA_CALLABLE rect(vec3<T> c, vec3<T> n = vec3<T>(0, 0, 1))
 		: plane<T>()
 	{
 		init();			//start with the default setting
@@ -76,7 +73,7 @@ public:
 	///@param c: x,y,z location of the center.
 	///@param s: size of the rectangle.
 	///@param n: x,y,z direction of the normal.
-	CUDA_CALLABLE rect(vec<T> c, T s, vec<T> n = vec<T>(0, 0, 1))
+	CUDA_CALLABLE rect(vec3<T> c, T s, vec3<T> n = vec3<T>(0, 0, 1))
 		: plane<T>()
 	{
 		init();			//start with the default setting
@@ -89,7 +86,7 @@ public:
 	///@param center: x,y,z location of the center.
 	///@param directionX: u,v,w direction of the X vector.
 	///@param directionY: u,v,w direction of the Y vector.
-	CUDA_CALLABLE rect(vec<T> center, vec<T> directionX, vec<T> directionY )
+	CUDA_CALLABLE rect(vec3<T> center, vec3<T> directionX, vec3<T> directionY )
 		 : plane<T>((directionX.cross(directionY)).norm(),center)
 	{
 		X = directionX;
@@ -101,7 +98,7 @@ public:
 	///@param center: x,y,z location of the center.
 	///@param directionX: u,v,w direction of the X vector.
 	///@param directionY: u,v,w direction of the Y vector.
-	CUDA_CALLABLE rect(T size, vec<T> center, vec<T> directionX, vec<T> directionY )
+	CUDA_CALLABLE rect(T size, vec3<T> center, vec3<T> directionX, vec3<T> directionY )
 		: plane<T>((directionX.cross(directionY)).norm(),center)
 	{	
 		X = directionX;
@@ -114,7 +111,7 @@ public:
 	///@param center: x,y,z location of the center.
 	///@param directionX: u,v,w direction of the X vector.
 	///@param directionY: u,v,w direction of the Y vector.
-	CUDA_CALLABLE rect(vec<T> size, vec<T> center, vec<T> directionX, vec<T> directionY)
+	CUDA_CALLABLE rect(vec3<T> size, vec3<T> center, vec3<T> directionX, vec3<T> directionY)
 		: plane<T>((directionX.cross(directionY)).norm(), center)
 	{	
 		X = directionX;
@@ -138,7 +135,7 @@ public:
 	///@param n; vector with the normal.
 	///Orients the rectangle along the normal n.
-	CUDA_CALLABLE void normal(vec<T> n)
+	CUDA_CALLABLE void normal(vec3<T> n)
 	{	
 		//orient the rectangle along the specified normal
 		rotate(n, X, Y);
@@ -147,8 +144,8 @@ public:
 	///general init method that sets a general rectangle.
 	CUDA_CALLABLE void init()
 	{
-		X = vec<T>(1, 0, 0);
-		Y = vec<T>(0, 1, 0);
+		X = vec3<T>(1, 0, 0);
+		Y = vec3<T>(0, 1, 0);
 	}
 	//boolean comparison
@@ -162,18 +159,18 @@ public:
 	//get the world space value given the planar coordinates a, b in [0, 1]
-	CUDA_CALLABLE stim::vec<T> p(T a, T b)
+	CUDA_CALLABLE stim::vec3<T> p(T a, T b)
 	{
-		stim::vec<T> result;
+		stim::vec3<T> result;
 		//given the two parameters a, b = [0 1], returns the position in world space
-		vec<T> A = this->P - X * (T)0.5 - Y * (T)0.5;
+		vec3<T> A = this->P - X * (T)0.5 - Y * (T)0.5;
 		result = A + X * a + Y * b;
 		return result;
 	}
 	//parenthesis operator returns the world space given rectangular coordinates a and b in [0 1]
-	CUDA_CALLABLE stim::vec<T> operator()(T a, T b)
+	CUDA_CALLABLE stim::vec3<T> operator()(T a, T b)
 	{
 		return p(a, b);
 	}
@@ -181,12 +178,12 @@ public:
 	std::string str()
 	{
 		std::stringstream ss;
-		vec<T> A = P - X * (T)0.5 - Y * (T)0.5;
+		vec3<T> A = P - X * (T)0.5 - Y * (T)0.5;
 		ss<<std::left<<"B="<<std::setfill('-')<<std::setw(20)<<A + Y<<">"<<"C="<<A + Y + X<<std::endl;
 		ss<<std::setfill(' ')<<std::setw(23)<<"|"<<"|"<<std::endl<<std::setw(23)<<"|"<<"|"<<std::endl;
 		ss<<std::left<<"A="<<std::setfill('-')<<std::setw(20)<<A<<">"<<"D="<<A + X;
-        	return ss.str();
+        return ss.str();
 	}
@@ -205,11 +202,11 @@ public:
 	///computes the distance between the specified point and this rectangle.
 	///@param p: x, y, z coordinates of the point to calculate distance to.
-	CUDA_CALLABLE T dist(vec<T> p)
+	CUDA_CALLABLE T dist(vec3<T> p)
 	{
         //compute the distance between a point and this rect
-		vec<T> A = P - X * (T)0.5 - Y * (T)0.5;
+		vec3<T> A = P - X * (T)0.5 - Y * (T)0.5;
 		//first break the rect up into two triangles
 		triangle<T> T0(A, A+X, A+Y);
@@ -225,16 +222,16 @@ public:
 		    return d1;
 	}
-	CUDA_CALLABLE T center(vec<T> p)
+	///sets the center of the rectangle.
+	///@param p: x,y,z location of the center.
+	///Returns nothing: the previous declaration promised a T but never returned one.
+	CUDA_CALLABLE void center(vec3<T> p)
 	{
 		this->P = p;
 	}
 	///Returns the maximum distance of the rectangle from a point p to the sides of the rectangle.
 	///@param p: x, y, z point.
-	CUDA_CALLABLE T dist_max(vec<T> p)
+	CUDA_CALLABLE T dist_max(vec3<T> p)
 	{
-		vec<T> A = P - X * (T)0.5 - Y * (T)0.5;
+		vec3<T> A = P - X * (T)0.5 - Y * (T)0.5;
 		T da = (A - p).len();
 		T db = (A+X - p).len();
 		T dc = (A+Y - p).len();
+#ifndef STIM_VEC3_H
+#define STIM_VEC3_H
+
+
+#include <cmath>
+#include <sstream>
+#include <string>
+
+#include <stim/cuda/cudatools/callable.h>
+
+
+namespace stim{
+
+
+/// A class designed to act as a 3D vector with CUDA compatibility
+///
+/// The three components are stored in a fixed-size array of type T and every
+/// arithmetic member is marked CUDA_CALLABLE. Only str() is left unmarked.
+template<typename T>
+class vec3{
+
+protected:
+	T ptr[3];								//the three components, in index order 0, 1, 2 (x, y, z)
+
+public:
+
+	/// Default constructor; the components are left uninitialized
+	CUDA_CALLABLE vec3(){}
+
+	/// Creates a vector with all three components set to the same value
+	/// @param v is the value assigned to every component
+	CUDA_CALLABLE vec3(T v){
+		ptr[0] = ptr[1] = ptr[2] = v;
+	}
+
+	/// Creates a vector from three components
+	CUDA_CALLABLE vec3(T x, T y, T z){
+		ptr[0] = x;
+		ptr[1] = y;
+		ptr[2] = z;
+	}
+
+	//copy constructor
+	CUDA_CALLABLE vec3( const vec3<T>& other){
+		ptr[0] = other.ptr[0];
+		ptr[1] = other.ptr[1];
+		ptr[2] = other.ptr[2];
+	}
+
+	//access an element using an index (no bounds checking is performed)
+	CUDA_CALLABLE T& operator[](int idx){
+		return ptr[idx];
+	}
+
+	//read-only element access, so that const vectors can be indexed as well
+	CUDA_CALLABLE const T& operator[](int idx) const{
+		return ptr[idx];
+	}
+
+/// Casting operator. Creates a new vector with a new type U.
+	template< typename U >
+	CUDA_CALLABLE operator vec3<U>() const{
+		//vec3<U> is a different class from vec3<T>, so its protected array cannot be
+		//	written from here; build the result through the public constructor instead
+		return vec3<U>((U)ptr[0], (U)ptr[1], (U)ptr[2]);
+	}
+
+	// computes the squared Euclidean length (useful for several operations where only >, =, or < matter)
+	CUDA_CALLABLE T len_sq() const{
+		return ptr[0] * ptr[0] + ptr[1] * ptr[1] + ptr[2] * ptr[2];
+	}
+
+	/// computes the Euclidean length of the vector
+	CUDA_CALLABLE T len() const{
+		return sqrt(len_sq());
+	}
+	
+
+	/// Convert the vector from cartesian to spherical coordinates (x, y, z -> r, theta, phi)
+	/// theta is atan2(y, x), so it lies in [-pi, pi]; phi is acos(z / r), so it lies in [0, pi].
+	/// A zero-length vector yields phi = 0.
+	CUDA_CALLABLE vec3<T> cart2sph() const{
+		vec3<T> sph;
+		sph.ptr[0] = len();
+		sph.ptr[1] = std::atan2(ptr[1], ptr[0]);
+		if(sph.ptr[0] == 0)
+			sph.ptr[2] = 0;					//avoid dividing by a zero radius
+		else
+			sph.ptr[2] = std::acos(ptr[2] / sph.ptr[0]);
+		return sph;
+	}
+
+	/// Convert the vector from spherical to cartesian coordinates (r, theta, phi -> x, y, z)
+	/// This is the inverse of the mapping used by cart2sph().
+	CUDA_CALLABLE vec3<T> sph2cart() const{
+		vec3<T> cart;
+		cart.ptr[0] = ptr[0] * std::cos(ptr[1]) * std::sin(ptr[2]);
+		cart.ptr[1] = ptr[0] * std::sin(ptr[1]) * std::sin(ptr[2]);
+		cart.ptr[2] = ptr[0] * std::cos(ptr[2]);
+
+		return cart;
+	}
+
+	/// Computes the normalized vector (where each coordinate is divided by the L2 norm)
+	/// A zero-length vector is not special-cased: the division is then by zero.
+	CUDA_CALLABLE vec3<T> norm() const{
+		return (*this) / len();
+	}
+
+	/// Computes the cross product of a 3-dimensional vector
+	/// @param rhs is the right-hand-side operand of the cross product (this x rhs)
+	CUDA_CALLABLE vec3<T> cross(const vec3<T> rhs) const{
+
+		vec3<T> result;
+
+		result[0] = (ptr[1] * rhs.ptr[2] - ptr[2] * rhs.ptr[1]);
+		result[1] = (ptr[2] * rhs.ptr[0] - ptr[0] * rhs.ptr[2]);
+		result[2] = (ptr[0] * rhs.ptr[1] - ptr[1] * rhs.ptr[0]);
+
+		return result;
+	}
+
+	/// Compute the Euclidean inner (dot) product
+    CUDA_CALLABLE T dot(vec3<T> rhs) const{
+        return ptr[0] * rhs.ptr[0] + ptr[1] * rhs.ptr[1] + ptr[2] * rhs.ptr[2];
+    }
+
+	/// Arithmetic addition operator
+
+    /// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator+(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs.ptr[0];
+		result.ptr[1] = ptr[1] + rhs.ptr[1];
+		result.ptr[2] = ptr[2] + rhs.ptr[2];
+		return result;
+	}
+
+	/// Arithmetic addition to a scalar
+
+	/// @param rhs is the scalar added to every component
+	CUDA_CALLABLE vec3<T> operator+(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs;
+		result.ptr[1] = ptr[1] + rhs;
+		result.ptr[2] = ptr[2] + rhs;
+		return result;
+	}
+
+	/// Arithmetic subtraction operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator-(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs.ptr[0];
+		result.ptr[1] = ptr[1] - rhs.ptr[1];
+		result.ptr[2] = ptr[2] - rhs.ptr[2];
+		return result;
+	}
+	/// Arithmetic subtraction of a scalar
+
+	/// @param rhs is the scalar subtracted from every component
+	CUDA_CALLABLE vec3<T> operator-(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs;
+		result.ptr[1] = ptr[1] - rhs;
+		result.ptr[2] = ptr[2] - rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar multiplication operator
+
+	/// @param rhs is the scalar that multiplies every component
+	CUDA_CALLABLE vec3<T> operator*(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] * rhs;
+		result.ptr[1] = ptr[1] * rhs;
+		result.ptr[2] = ptr[2] * rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar division operator
+
+	/// @param rhs is the scalar that divides every component
+	/// The reciprocal of rhs is computed once, in type T, and then multiplied in.
+	///		NOTE(review): for an integral T that reciprocal truncates - confirm no integer instantiation divides.
+	CUDA_CALLABLE vec3<T> operator/(T rhs) const{
+		return (*this) * ((T)1.0/rhs);
+	}
+
+	/// Multiplication by a scalar, followed by assignment
+	/// @return a copy of the updated vector
+	CUDA_CALLABLE vec3<T> operator*=(T rhs){
+		ptr[0] = ptr[0] * rhs;
+		ptr[1] = ptr[1] * rhs;
+		ptr[2] = ptr[2] * rhs;
+		return *this;
+	}
+
+	/// Addition and assignment
+	/// @param rhs is the vector added component-wise to this one
+	/// @return a copy of the updated vector
+	CUDA_CALLABLE vec3<T> operator+=(vec3<T> rhs){
+		//add matching components (adding the whole vector rhs to a scalar component is ill-formed)
+		ptr[0] = ptr[0] + rhs.ptr[0];
+		ptr[1] = ptr[1] + rhs.ptr[1];
+		ptr[2] = ptr[2] + rhs.ptr[2];
+		return *this;
+	}
+
+	/// Assign a scalar to all values
+	CUDA_CALLABLE vec3<T> & operator=(T rhs){
+		ptr[0] = rhs;
+		ptr[1] = rhs;
+		ptr[2] = rhs;
+		return *this;
+	}
+
+	/// Casting and assignment
+	template<typename Y>
+	CUDA_CALLABLE vec3<T> & operator=(vec3<Y> rhs){
+		//go through the public index operator: the protected array of vec3<Y> is not accessible from vec3<T>
+		ptr[0] = (T)rhs[0];
+		ptr[1] = (T)rhs[1];
+		ptr[2] = (T)rhs[2];
+		return *this;
+	}
+
+	/// Unary minus (returns the negative of the vector)
+	CUDA_CALLABLE vec3<T> operator-() const{
+		vec3<T> result;
+		result.ptr[0] = -ptr[0];
+		result.ptr[1] = -ptr[1];
+		result.ptr[2] = -ptr[2];
+		return result;
+	}
+
+
+	/// Outputs the vector as a string of the form "[a, b, c]"
+	std::string str() const{
+		std::stringstream ss;
+
+		const size_t N = 3;
+
+		ss<<"[";
+		for(size_t i=0; i<N; i++)
+		{
+			ss<<ptr[i];
+			if(i != N-1)
+				ss<<", ";
+		}
+		ss<<"]";
+
+		return ss.str();
+	}
+
+	/// Returns the number of components, which is always 3
+	size_t size() const{ return 3; }
+
+	};						//end class vec3
+}							//end namespace stim
+
+/// Multiply a vector by a constant when the vector is on the right hand side
+/// Marked CUDA_CALLABLE so that scalar * vector is usable wherever vector * scalar is.
+/// @param lhs is the scalar factor
+/// @param rhs is the vector being scaled
+/// @return the same result as rhs * lhs
+template <typename T>
+CUDA_CALLABLE stim::vec3<T> operator*(T lhs, stim::vec3<T> rhs){
+    return rhs * lhs;
+}
+
+/// Stream insertion operator: writes the text produced by rhs.str() to the stream
+/// @param os is the output stream to write to
+/// @param rhs is the vector to print
+/// @return the stream that was passed in, so insertions can be chained
+template<typename T>
+std::ostream& operator<<(std::ostream& os, stim::vec3<T> const& rhs){
+	return os << rhs.str();
+}
+
+#endif
 \ No newline at end of file
-#ifndef RTS_VECTOR_H
-#define RTS_VECTOR_H
+#ifndef STIM_VECTOR_H
+#define STIM_VECTOR_H
 #include <iostream>
 #include <cmath>
 #include <sstream>
 #include <vector>
-
+
 #include <stim/cuda/cudatools/callable.h>
+#include <stim/math/vec3.h>
 namespace stim
 {
-
-
 template <class T>
 struct vec : public std::vector<T>
 {
@@ -72,8 +71,8 @@ struct vec : public std::vector<T>
 		size_t N = other.size();
 		resize(N);							//resize the current vector to match the copy
 		for(size_t i=0; i<N; i++){	//copy each element
-			at(i) = other[i];
-		}
+			at(i) = other[i];
+		}
 	}
 	//I'm not sure what these were doing here.
@@ -329,6 +328,15 @@ struct vec : public std::vector&lt;T&gt;
 		return *this;
 	}
+	/// Cast to a vec3: copies up to the first three components of this vector.
+	///	Components that this vector does not have (size() < 3) are explicitly set to zero.
+	operator stim::vec3<T>() const{
+		stim::vec3<T> r;
+		const size_t n = this->size();			//this-> is needed because size() and at() come from the dependent base std::vector<T>
+		for(size_t i = 0; i < 3; i++)
+			r[i] = (i < n) ? this->at(i) : T(0);
+		return r;
+	}
+
 	/// Casting and assignment
 	template<typename Y>
 	vec<T> & operator=(vec<Y> rhs){
+#ifndef STIM_LENS_H
+#define STIM_LENS_H
+
+#include "scalarwave.h"
+#include "../math/bessel.h"
+#include "../cuda/cudatools/devices.h"
+#include "../visualization/colormap.h"
+#include "../math/fft.h"
+
+#include "cufft.h"
+
+#include <cmath>
+
+namespace stim{
+
+	/// Perform a k-space transform of a scalar field (FFT). The given field has a width of x and the calculated momentum space has a
+	///		width of kx (in radians). The transform is computed on the GPU with cuFFT and the result is re-ordered with
+	///		stim::cpu_fftshift before being stored in K.
+	/// @param K is a pointer to the output array of all plane waves in the field (host memory, nx * ny values)
+	/// @param kx is the width of the frame in momentum space (output)
+	/// @param ky is the height of the frame in momentum space (output)
+	/// @param E is the field to be transformed (host memory, nx * ny values)
+	/// @param x is the width of the field in the spatial domain
+	/// @param y is the height of the field in the spatial domain
+	/// @param nx is the number of pixels representing the field in the x (and kx) direction
+	/// @param ny is the number of pixels representing the field in the y (and ky) direction
+	/// NOTE(review): the buffers are passed to cuFFT as cufftComplex (single precision), so this presumably only works for T = float - confirm.
+	/// NOTE(review): cuFFT documents the first size argument of cufftPlan2d as the slowest-changing dimension; verify the (nx, ny)
+	///		order against the memory layout of E before using non-square fields.
+	template<typename T>
+	void cpu_scalar_to_kspace(stim::complex<T>* K, T& kx, T& ky, stim::complex<T>* E, T x, T y, size_t nx, size_t ny){
+
+		kx = stim::TAU * nx / x;			//calculate the width of the momentum space
+		ky = stim::TAU * ny / y;
+
+		size_t bytes = sizeof(stim::complex<T>) * nx * ny;							//size of one copy of the field
+
+		stim::complex<T>* dev_FFT;
+		HANDLE_ERROR( cudaMalloc(&dev_FFT, bytes) );								//allocate space on the CUDA device for the output array
+
+		stim::complex<T>* dev_E;
+		HANDLE_ERROR( cudaMalloc(&dev_E, bytes) );									//allocate space for the field
+		HANDLE_ERROR( cudaMemcpy(dev_E, E, bytes, cudaMemcpyHostToDevice) );		//copy the field to GPU memory
+
+		cufftResult result;
+		cufftHandle plan;
+		result = cufftPlan2d(&plan, (int)nx, (int)ny, CUFFT_C2C);
+		if(result != CUFFT_SUCCESS){
+			std::cout<<"Error creating cuFFT plan."<<std::endl;
+			exit(1);
+		}
+
+		result = cufftExecC2C(plan, (cufftComplex*)dev_E, (cufftComplex*)dev_FFT, CUFFT_FORWARD);
+		if(result != CUFFT_SUCCESS){
+			std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<<std::endl;
+			exit(1);
+		}
+
+		stim::complex<T>* fft = (stim::complex<T>*) malloc(bytes);					//host staging buffer for the un-shifted transform
+		if(fft == NULL){
+			std::cout<<"Error allocating host memory for the Fourier transform."<<std::endl;
+			exit(1);
+		}
+		HANDLE_ERROR( cudaMemcpy(fft, dev_FFT, bytes, cudaMemcpyDeviceToHost) );	//blocking copy, so the transform has finished when it returns
+
+		cufftDestroy(plan);															//release the plan and the device buffers (previously leaked)
+		HANDLE_ERROR( cudaFree(dev_E) );
+		HANDLE_ERROR( cudaFree(dev_FFT) );
+
+		stim::cpu_fftshift(K, fft, nx, ny);
+		free(fft);																	//release the staging buffer (previously leaked)
+	}
+
+	/// Transform a k-space representation back into a spatial scalar field (inverse FFT). The k-space data is first
+	///		re-ordered with stim::cpu_ifftshift and then transformed on the GPU with cuFFT.
+	/// @param E is a pointer to the output field (host memory, nx * ny values)
+	/// @param x is the width of the field in the spatial domain (output)
+	/// @param y is the height of the field in the spatial domain (output)
+	/// @param K is the k-space representation to be transformed (host memory, nx * ny values)
+	/// @param kx is the width of the frame in momentum space
+	/// @param ky is the height of the frame in momentum space
+	/// @param nx is the number of pixels representing the field in the x (and kx) direction
+	/// @param ny is the number of pixels representing the field in the y (and ky) direction
+	/// NOTE(review): cuFFT transforms are un-normalized, so a to_kspace/from_kspace round trip scales the field by nx * ny;
+	///		no normalization is applied here - confirm whether callers expect one.
+	/// NOTE(review): the buffers are passed to cuFFT as cufftComplex (single precision), so this presumably only works for T = float - confirm.
+	template<typename T>
+	void cpu_scalar_from_kspace(stim::complex<T>* E, T& x, T& y, stim::complex<T>* K, T kx, T ky, size_t nx, size_t ny){
+
+		x = stim::TAU * nx / kx;			//calculate the width of the field in the spatial domain
+		y = stim::TAU * ny / ky;
+
+		size_t bytes = sizeof(stim::complex<T>) * nx * ny;							//size of one copy of the field
+
+		stim::complex<T>* fft = (stim::complex<T>*) malloc(bytes);					//host staging buffer for the un-shifted k-space data
+		if(fft == NULL){
+			std::cout<<"Error allocating host memory for the Fourier transform."<<std::endl;
+			exit(1);
+		}
+		stim::cpu_ifftshift(fft, K, nx, ny);
+
+		stim::complex<T>* dev_FFT;
+		HANDLE_ERROR( cudaMalloc(&dev_FFT, bytes) );								//allocate space on the CUDA device for the k-space data
+		HANDLE_ERROR( cudaMemcpy(dev_FFT, fft, bytes, cudaMemcpyHostToDevice) );	//copy the k-space data to GPU memory
+		free(fft);																	//the staging buffer is no longer needed (previously leaked)
+
+		stim::complex<T>* dev_E;
+		HANDLE_ERROR( cudaMalloc(&dev_E, bytes) );									//allocate space for the field
+
+		cufftResult result;
+		cufftHandle plan;
+		result = cufftPlan2d(&plan, (int)nx, (int)ny, CUFFT_C2C);
+		if(result != CUFFT_SUCCESS){
+			std::cout<<"Error creating cuFFT plan."<<std::endl;
+			exit(1);
+		}
+
+		//going from k-space back to the spatial domain requires the INVERSE transform (the forward transform was used here before)
+		result = cufftExecC2C(plan, (cufftComplex*)dev_FFT, (cufftComplex*)dev_E, CUFFT_INVERSE);
+		if(result != CUFFT_SUCCESS){
+			std::cout<<"Error using cuFFT to perform an inverse Fourier transform of the field."<<std::endl;
+			exit(1);
+		}
+
+		HANDLE_ERROR( cudaMemcpy(E, dev_E, bytes, cudaMemcpyDeviceToHost) );		//blocking copy, so the transform has finished when it returns
+
+		cufftDestroy(plan);															//release the plan and the device buffers (previously leaked)
+		HANDLE_ERROR( cudaFree(dev_E) );
+		HANDLE_ERROR( cudaFree(dev_FFT) );
+	}
+
+	/// Propagate a field slice along its orthogonal direction by a given distance z.
+	///		The field is transformed to k-space, every propagating plane wave (kx^2 + ky^2 < k^2) is multiplied by a phase
+	///		term computed from kz * z, every other component is set to zero, and the result is transformed back.
+	///		As a side effect the k-space magnitude is written to "kspace_pre_shift.bmp" and "kspace_post_shift.bmp".
+	/// @param Enew is the resulting propagated field (host memory, nx * ny values)
+	/// @param E is the field to be propagated (host memory, nx * ny values)
+	/// @param sx is the width of the field in the spatial domain
+	/// @param sy is the height of the field in the spatial domain
+	/// @param z is the propagation distance
+	/// @param k is the magnitude of the wave vector
+	/// @param nx is the number of pixels representing the field in the x direction
+	/// @param ny is the number of pixels representing the field in the y direction
+	template<typename T>
+	void cpu_scalar_propagate(stim::complex<T>* Enew, stim::complex<T>* E, T sx, T sy, T z, T k, size_t nx, size_t ny){
+
+		stim::complex<T>* K = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * nx * ny );
+
+		T Kx, Ky;											//width and height in k space
+		cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny);
+
+		T* mag = (T*) malloc( sizeof(T) * nx * ny );
+		stim::abs(mag, K, nx * ny);
+		stim::cpu2image<T>(mag, "kspace_pre_shift.bmp", nx, ny, stim::cmBrewer);	//mag holds T values, so the image is written as T (was hard-coded to float)
+
+		size_t kxi, kyi;
+		size_t i;
+		T kx, kx_sq, ky, ky_sq;
+		T kz;
+		stim::complex<T> shift;
+		T min_kx = -Kx / 2;
+		T dkx = Kx / (nx);
+		T min_ky = -Ky / 2;
+		T dky = Ky / (ny);
+		T k_sq = k * k;										//loop invariant: squared magnitude of the wave vector
+		for(kyi = 0; kyi < ny; kyi++){						//for each plane wave in the ky direction
+			for(kxi = 0; kxi < nx; kxi++){					//for each plane wave in the kx direction
+				i = kyi * nx + kxi;
+
+				kx = min_kx + kxi * dkx;					//calculate the position of the current plane wave
+				ky = min_ky + kyi * dky;
+
+				kx_sq = kx * kx;
+				ky_sq = ky * ky;
+
+				if(kx_sq + ky_sq < k_sq){
+					kz = sqrt(k_sq - kx_sq - ky_sq);		//component of the wave vector along the propagation direction
+					shift = -exp(stim::complex<T>(0, kz * z));		//NOTE(review): the leading minus sign negates the phase term - confirm it is intended
+					K[i] *= shift;
+				}
+				else{
+					K[i] = 0;								//components with kx^2 + ky^2 >= k^2 are discarded
+				}
+			}
+		}
+
+		stim::abs(mag, K, nx * ny);
+		stim::cpu2image<T>(mag, "kspace_post_shift.bmp", nx, ny, stim::cmBrewer);
+
+		cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny);
+
+		free(mag);											//release the temporary buffers (previously leaked)
+		free(K);
+	}
+
+}
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_MIE_H
+#define STIM_MIE_H
+#include <boost/math/special_functions/bessel.hpp>
+
+#include "scalarwave.h"
+#include "../math/bessel.h"
+#include "../cuda/cudatools/devices.h"
+#include <cmath>
+
+namespace stim{
+
+
+/// Calculate the scattering coefficients for a spherical scatterer
+/// @param B is the output array; entries 0 through Nl are written, so it must hold at least Nl + 1 values
+/// @param a is the radius of the sphere
+/// @param k is the magnitude of the wave vector
+/// @param n is the complex refractive index of the sphere
+/// @param Nl is the highest order that is calculated
+template<typename T>
+void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){
+
+	//temporary variables
+	double vm;															//allocate space to store the return values for the bessel function calculation
+	double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
+
+	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+
+	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
+	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)
+
+	stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
+	stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a
+
+	stim::complex<double> h_ka, dh_ka;
+	stim::complex<double> numerator, denominator;
+	stim::complex<double> i(0, 1);
+	for(int l = 0; l <= Nl; l++){
+		h_ka.r = j_ka[l];												//build h = j + i*y and its derivative from the real-valued results
+		h_ka.i = y_ka[l];
+		dh_ka.r = dj_ka[l];
+		dh_ka.i = dy_ka[l];
+
+		numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l];
+		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
+		B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
+	}
+
+	free(j_ka);															//release the temporary bessel function arrays (previously leaked)
+	free(y_ka);
+	free(dj_ka);
+	free(dy_ka);
+	free(j_kna);
+	free(y_kna);
+	free(dj_kna);
+	free(dy_kna);
+}
+
+/// Calculate the internal-field coefficients for a spherical scatterer
+/// @param A is the output array; entries 0 through Nl are written, so it must hold at least Nl + 1 values
+/// @param a is the radius of the sphere
+/// @param k is the magnitude of the wave vector
+/// @param n is the complex refractive index of the sphere
+/// @param Nl is the highest order that is calculated
+template<typename T>
+void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){
+	//temporary variables
+	double vm;															//allocate space to store the return values for the bessel function calculation
+	double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
+
+	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+
+	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
+	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)
+
+	stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
+	stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a
+
+	stim::complex<double> h_ka, dh_ka;
+	stim::complex<double> numerator, denominator;
+	stim::complex<double> i(0, 1);
+	for(int l = 0; l <= Nl; l++){										//signed index, matching Nl and the loop in B_coefficients
+		h_ka.r = j_ka[l];												//build h = j + i*y and its derivative from the real-valued results
+		h_ka.i = y_ka[l];
+		dh_ka.r = dj_ka[l];
+		dh_ka.i = dy_ka[l];
+
+		numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka;
+		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
+		A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
+	}
+
+	free(j_ka);															//release the temporary bessel function arrays (previously leaked)
+	free(y_ka);
+	free(dj_ka);
+	free(dy_ka);
+	free(j_kna);
+	free(y_kna);
+	free(dj_kna);
+	free(dy_kna);
+}
+
+#define LOCAL_NL	16
+/// CUDA kernel that adds the field scattered by a sphere to E at every sample point that is not inside the sphere.
+///	Each thread processes one point: the h_l * B_l look-up table is linearly interpolated at the radius of the point and the
+///	orders 0 through Nl are summed for every plane wave, using the recursive relation for the Legendre polynomials.
+///	Launch layout: 1D grid of 1D blocks, one thread per point.
+///	Dynamic shared memory: blockDim.x * (Nl - LOCAL_NL) complex values (each thread uses its own slice, so no barrier is needed).
+///	Precondition: Nl >= LOCAL_NL, because the first LOCAL_NL + 1 entries of a LUT row are always read.
+/// @param E is the field that the scattered field is added to (N values)
+/// @param N is the number of points
+/// @param x, y, z are the coordinates of the points relative to the sphere (a NULL array is treated as all zeros)
+/// @param W is the array of plane waves
+/// @param nW is the number of plane waves
+/// @param a is the radius of the sphere
+/// @param n is the refractive index of the sphere (not used by this kernel)
+/// @param hB is the look-up table of h_l * B_l values, stored as N_hB rows of (Nl + 1) orders
+/// @param r_min is the radius of the first row of the LUT
+/// @param dr is the radial distance between rows of the LUT
+/// @param N_hB is the number of rows in the LUT
+/// @param Nl is the highest order stored in the LUT
+template<typename T>
+__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){
+	extern __shared__ stim::complex<T> shared_hB[];					//per-thread storage for the h_l * B_l values of order > LOCAL_NL
+
+	size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;			//get the index into the array (64-bit multiply to avoid overflow)
+	if(i >= N) return;													//exit if this thread is outside the array
+	stim::vec3<T> p;
+	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
+	(y == NULL) ? p[1] = 0 : p[1] = y[i];
+	(z == NULL) ? p[2] = 0 : p[2] = z[i];
+
+	T r = p.len();														//calculate the distance from the sphere
+	if(r < a) return;													//exit if the point is inside the sphere (only the scattered field outside is calculated)
+
+	//locate the two LUT rows that bracket r; the row index is clamped to [0, N_hB - 1] so that the point at the largest
+	//	radius (or any rounding error in the division) can never read past the end of the table
+	T fij = (r - r_min)/dr;												//FP index into the spherical bessel LUT
+	size_t last = (N_hB > 0) ? N_hB - 1 : 0;							//index of the last row in the LUT
+	size_t ij = 0;														//integral index (stays 0 if fij is negative or not a number)
+	T alpha = 0;														//fractional portion of the index
+	if(fij > 0){
+		if(fij >= (T)last){
+			ij = last;													//at or beyond the last row: use the last row as-is
+		}
+		else{
+			ij = (size_t) fij;
+			alpha = fij - ij;
+		}
+	}
+	size_t n0j = ij * (Nl + 1);											//start of the first entry in the LUT
+	size_t n1j = (ij < last) ? (ij+1) * (Nl + 1) : n0j;					//start of the second entry in the LUT (same row when there is no next row)
+
+	T cos_phi;
+	T Pl_2, Pl_1, Pl;													//declare registers to store the previous two Legendre polynomials
+
+	stim::complex<T> Ei = 0;											//create a register to store the result
+	int l;
+
+	stim::complex<T> hlBl[LOCAL_NL+1];									//the first LOCAL_NL components are stored in registers for speed
+	int shared_start = threadIdx.x * (Nl - LOCAL_NL);					//wrap up some operations so that they aren't done in the main loops
+
+	#pragma unroll LOCAL_NL+1											//copy the first LOCAL_NL+1 h_l * B_l components to registers
+	for(l = 0; l <= LOCAL_NL; l++)
+		hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
+
+	for(l = LOCAL_NL+1; l <= Nl; l++)									//copy any additional h_l * B_l components to shared memory
+		shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
+
+	for(size_t w = 0; w < nW; w++){										//for each plane wave
+		cos_phi = p.norm().dot(W[w].kvec().norm());						//calculate the cosine of the angle between the k vector and the direction from the sphere
+		Pl_2 = 1;														//the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
+		Pl_1 = cos_phi;
+		Ei += W[w].E() * hlBl[0] * Pl_2;								//unroll the first two orders using the initial steps of the Legendre recursive relation
+		Ei += W[w].E() * hlBl[1] * Pl_1;
+
+		#pragma unroll LOCAL_NL-1										//unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
+		for(l = 2; l <= LOCAL_NL; l++){
+			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);	//calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
+			Ei += W[w].E() * hlBl[l] * Pl;								//calculate and sum the current field order
+			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+			Pl_1 = Pl;
+		}
+
+		for(l = LOCAL_NL+1; l <= Nl; l++){											//do the same as above, except for any additional orders that are stored in shared memory (not registers)
+			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);				//again, this is where most computation in the kernel occurs
+			Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl;
+			Pl_2 = Pl_1;															//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+			Pl_1 = Pl;
+		}
+	}
+	E[i] += Ei;															//add the result to the field in device memory
+}
+
+/// Launch the cuda_scalar_mie_scatter kernel. All pointers must reference device memory.
+///	The block size is chosen so that one (Nl + 1)-entry array per thread fits in the shared memory available to a block,
+///	rounded down to a multiple of 32 and limited to the maximum block size reported for the device.
+///	The kernel reads LOCAL_NL + 1 entries from every LUT row, so Nl is expected to be at least LOCAL_NL.
+/// @param E is the field that the scattered field is added to (N values)
+/// @param N is the number of points
+/// @param x, y, z are the coordinates of the points relative to the sphere (NULL is treated as all zeros by the kernel)
+/// @param W is the array of plane waves and nW is the number of plane waves
+/// @param a is the radius of the sphere and n is its refractive index
+/// @param hB is the h_l * B_l look-up table (N_hB rows of Nl + 1 orders), starting at radius kr_min with a row spacing of dkr
+/// @param Nl is the highest order stored in the LUT
+template<typename T>
+void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
+
+	size_t max_shared_mem = stim::sharedMemPerBlock();
+	size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
+	std::cout<<"hl*Bl array size:  "<<hBl_array<<std::endl;
+	std::cout<<"shared memory:     "<<max_shared_mem<<std::endl;
+	int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);
+	int max_threads = stim::maxThreadsPerBlock();
+	if(threads > max_threads) threads = max_threads / 32 * 32;						//never request more threads than a block can hold
+	std::cout<<"threads per block: "<<threads<<std::endl;
+	if(threads <= 0){																//too many orders for even one warp: avoid a division by zero and an invalid launch
+		std::cout<<"Error: not enough shared memory to launch the Mie scattering kernel."<<std::endl;
+		exit(1);
+	}
+	dim3 blocks((unsigned)(N / threads + 1));										//calculate the optimal number of blocks
+
+	size_t shared_mem;
+	if(Nl <= LOCAL_NL) shared_mem = 0;
+	else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL);				//amount of shared memory to allocate
+	std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
+	cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl);	//call the kernel
+	HANDLE_ERROR( cudaGetLastError() );												//report an invalid launch configuration immediately
+}
+
+/// CUDA kernel that stores the length of each point (x[i], y[i], z[i]) in r[i].
+///	A NULL coordinate array is treated as all zeros. One thread per point; threads beyond N do nothing.
+template<typename T>
+__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){
+	size_t idx = blockIdx.x * blockDim.x + threadIdx.x;				//index of the point handled by this thread
+	if(idx < N){
+		stim::vec3<T> pos;
+		if(x == NULL) pos[0] = 0;										//substitute zero for any coordinate array that isn't given
+		else pos[0] = x[idx];
+		if(y == NULL) pos[1] = 0;
+		else pos[1] = y[idx];
+		if(z == NULL) pos[2] = 0;
+		else pos[2] = z[idx];
+
+		r[idx] = pos.len();
+	}
+}
+/// Calculate the scalar Mie solution for the scattered field produced by a list of plane waves.
+///	The scattered field is ADDED to the values already stored in E. Points inside the sphere (r < a) are left untouched.
+///	When CUDA_FOUND is defined the field is evaluated on the GPU from a look-up table of h_l * B_l values sampled along r
+///	(the table is also written to "hankel_B.bmp"); otherwise it is evaluated point by point on the CPU.
+///	The number of orders and the scattering coefficients are calculated from the wavenumber of W[0] only.
+///	NOTE(review): the GPU path finds the range of r with cublasIsamin/cublasIsamax, which are the single-precision cuBLAS
+///	routines, so it presumably only works for T = float - confirm.
+
+/// @param E is a pointer to the destination field values
+/// @param N is the number of points used to calculate the field
+/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
+/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
+/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
+/// @param W is an array of planewaves that will be scattered
+/// @param a is the radius of the sphere
+/// @param n is the complex refractive index of the sphere
+/// @param r_spacing is the requested spacing along r between entries of the look-up table (GPU path only)
+template<typename T>
+void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){
+	if(N == 0 || W.empty()) return;								//nothing to calculate (also protects W[0] and the min/max search below)
+
+	//calculate the necessary number of orders required to represent the scattered field
+	T k = W[0].kmag();
+
+	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
+	if(Nl < LOCAL_NL) Nl = LOCAL_NL;							//always do at least the minimum number of local operations (kernel optimization)
+	std::cout<<"Nl: "<<Nl<<std::endl;
+
+	//calculate the scattering coefficients for the sphere
+	stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
+	B_coefficients(B, a, k, n, Nl);
+
+#ifdef CUDA_FOUND
+	stim::complex<T>* dev_E;										//allocate space for the field
+	HANDLE_ERROR( cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)) );
+	HANDLE_ERROR( cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice) );	//the kernel adds to the existing field values
+
+	//	COORDINATES
+	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
+	if(x != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+	T* dev_y = NULL;												//allocate space and copy the Y coordinate (if specified)
+	if(y != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+	T* dev_z = NULL;												//allocate space and copy the Z coordinate (if specified)
+	if(z != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+
+	//	PLANE WAVES
+	stim::scalarwave<T>* dev_W;																//allocate space and copy plane waves
+	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
+	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
+
+	// BESSEL FUNCTION LOOK-UP TABLE
+	//calculate the distance from the sphere center
+	T* dev_r;
+	HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
+
+	int threads = stim::maxThreadsPerBlock();
+	dim3 blocks((unsigned)(N / threads + 1));
+	cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
+	HANDLE_ERROR( cudaGetLastError() );						//report an invalid launch configuration immediately
+
+	//Find the minimum and maximum values of r
+    cublasStatus_t stat;
+    cublasHandle_t handle;
+
+	stat = cublasCreate(&handle);							//create a cuBLAS handle
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS initialization failed\n");
+		exit(1);
+	}
+
+	int i_min, i_max;
+	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS Error: failed to calculate minimum r value.\n");
+		exit(1);
+	}
+	stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS Error: failed to calculate maximum r value.\n");
+		exit(1);
+	}
+	cublasDestroy(handle);									//the handle is no longer needed (previously leaked)
+
+	i_min--;				//cuBLAS uses 1-based indexing for Fortran compatibility
+	i_max--;
+	T r_min, r_max;											//allocate space to store the minimum and maximum values
+	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
+	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
+	HANDLE_ERROR( cudaFree(dev_r) );						//the distances are only needed to find the range of r (previously leaked)
+
+	r_min = max(r_min, a);									//if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere)
+
+	if(r_max >= r_min){										//skip everything if all of the points are inside the sphere (the LUT size would be negative)
+		size_t N_samples = (size_t)((r_max - r_min) / r_spacing + 1);			//number of LUT rows spanning [r_min, r_max] based on the user-specified spacing along r
+		T dr = (N_samples > 1) ? (r_max - r_min) / (N_samples-1) : r_spacing;	//distance between values in the LUT (avoids 0/0 for a single row)
+		size_t N_hB_lut = N_samples + 1;										//one extra row past r_max, because the kernel interpolates between row ij and row ij+1
+
+		//temporary variables
+		double vm;															//allocate space to store the return values for the bessel function calculation
+		double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
+		double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
+		double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
+		double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
+
+		size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;
+		stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes);		//pointer to the look-up table
+		std::cout<<"LUT jl bytes:  "<<hB_bytes<<std::endl;
+		stim::complex<T> hl;
+		for(size_t ri = 0; ri < N_hB_lut; ri++){													//for each value in the LUT
+			stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
+			for(int l = 0; l <= Nl; l++){													//for each order
+				hl.r = (T)jv[l];
+				hl.i = (T)yv[l];
+
+				hB_lut[ri * (Nl + 1) + l] = hl * B[l];										//store the bessel function result
+			}
+		}
+
+		//save the real part of the table as an image (the extra row past r_max is not included)
+		size_t N_image = (size_t)(Nl + 1) * N_samples;
+		T* real_lut = (T*) malloc(sizeof(T) * N_image);
+		stim::real(real_lut, hB_lut, N_image);								//convert every entry that is drawn (only N_hB_lut entries were converted before)
+		stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_samples, stim::cmBrewer);
+		free(real_lut);
+
+		//Allocate device memory and copy everything to the GPU
+		stim::complex<T>* dev_hB_lut;
+		HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
+		HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
+
+		gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl);
+
+		HANDLE_ERROR( cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost) );		//copy the field from device memory
+
+		HANDLE_ERROR( cudaFree(dev_hB_lut) );								//release the look-up table and temporary arrays (previously leaked)
+		free(hB_lut);
+		free(jv);
+		free(yv);
+		free(djv);
+		free(dyv);
+	}
+
+	if(x != NULL) cudaFree(dev_x);														//free everything
+	if(y != NULL) cudaFree(dev_y);
+	if(z != NULL) cudaFree(dev_z);
+	cudaFree(dev_W);
+	cudaFree(dev_E);
+#else
+
+
+	//allocate space to store the bessel function call results
+	double vm;
+	double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
+
+	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );
+
+	T r, kr, cos_phi;
+	stim::complex<T> h;
+	for(size_t i = 0; i < N; i++){
+		stim::vec3<T> p;															//declare a 3D point
+
+		(x == NULL) ? p[0] = 0 : p[0] = x[i];										// test for NULL values and set positions
+		(y == NULL) ? p[1] = 0 : p[1] = y[i];
+		(z == NULL) ? p[2] = 0 : p[2] = z[i];
+		r = p.len();
+		if(r >= a){
+			for(size_t w = 0; w < W.size(); w++){
+				kr = r * W[w].kmag();												//calculate k*r
+				stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr);
+				cos_phi = p.norm().dot(W[w].kvec().norm());							//calculate the cosine of the angle from the propagating direction
+				stim::legendre<T>(Nl, cos_phi, P);
+
+				for(int l = 0; l <= Nl; l++){
+					h.r = j_kr[l];
+					h.i = y_kr[l];
+					E[i] += W[w].E() * B[l] * h * P[l];
+				}
+			}
+		}
+	}
+
+	free(j_kr);																		//release the temporary arrays (previously leaked)
+	free(y_kr);
+	free(dj_kr);
+	free(dy_kr);
+	free(P);
+#endif
+	free(B);																		//release the scattering coefficients (previously leaked)
+}
+
+/// Convenience overload for a single plane wave: wraps w in a one-element list and forwards to the list version
+template<typename T>
+void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){
+	std::vector< stim::scalarwave<T> > W;
+	W.push_back(w);											//the list contains only the given wave
+	cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing);
+}
+
+/// CUDA kernel that calculates the field inside a sphere and stores it in E at every sample point that is not outside the sphere.
+///	Each thread processes one point: the j_l * A_l look-up table is linearly interpolated at the radius of the point and the
+///	orders 0 through Nl are summed for every plane wave, using the recursive relation for the Legendre polynomials.
+///	Unlike the scattering kernel, the result REPLACES the value stored in E.
+///	Launch layout: 1D grid of 1D blocks, one thread per point.
+///	Dynamic shared memory: blockDim.x * (Nl - LOCAL_NL) complex values (each thread uses its own slice, so no barrier is needed).
+///	Precondition: Nl >= LOCAL_NL, because the first LOCAL_NL + 1 entries of a LUT row are always read.
+/// @param E is the destination field (N values)
+/// @param N is the number of points
+/// @param x, y, z are the coordinates of the points relative to the sphere (a NULL array is treated as all zeros)
+/// @param W is the array of plane waves
+/// @param nW is the number of plane waves
+/// @param a is the radius of the sphere
+/// @param n is the refractive index of the sphere (not used by this kernel)
+/// @param jA is the look-up table of j_l * A_l values, stored as N_jA rows of (Nl + 1) orders
+/// @param r_min is the radius of the first row of the LUT
+/// @param dr is the radial distance between rows of the LUT
+/// @param N_jA is the number of rows in the LUT
+/// @param Nl is the highest order stored in the LUT
+template<typename T>
+__global__ void cuda_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, int Nl){
+	extern __shared__ stim::complex<T> shared_jA[];					//per-thread storage for the j_l * A_l values of order > LOCAL_NL
+
+	size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;			//get the index into the array (64-bit multiply to avoid overflow)
+	if(i >= N) return;													//exit if this thread is outside the array
+	stim::vec3<T> p;
+	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
+	(y == NULL) ? p[1] = 0 : p[1] = y[i];
+	(z == NULL) ? p[2] = 0 : p[2] = z[i];
+
+	T r = p.len();														//calculate the distance from the sphere
+	if(r > a) return;													//exit if the point is outside the sphere (we only calculate the internal field)
+
+	//locate the two LUT rows that bracket r; the row index is clamped to [0, N_jA - 1] so that the point at the largest
+	//	radius (or any rounding error in the division) can never read past the end of the table
+	T fij = (r - r_min)/dr;												//FP index into the spherical bessel LUT
+	size_t last = (N_jA > 0) ? N_jA - 1 : 0;							//index of the last row in the LUT
+	size_t ij = 0;														//integral index (stays 0 if fij is negative or not a number)
+	T alpha = 0;														//fractional portion of the index
+	if(fij > 0){
+		if(fij >= (T)last){
+			ij = last;													//at or beyond the last row: use the last row as-is
+		}
+		else{
+			ij = (size_t) fij;
+			alpha = fij - ij;
+		}
+	}
+	size_t n0j = ij * (Nl + 1);											//start of the first entry in the LUT
+	size_t n1j = (ij < last) ? (ij+1) * (Nl + 1) : n0j;					//start of the second entry in the LUT (same row when there is no next row)
+
+	T cos_phi;
+	T Pl_2, Pl_1, Pl;													//declare registers to store the previous two Legendre polynomials
+
+	stim::complex<T> Ei = 0;											//create a register to store the result
+	int l;
+
+	stim::complex<T> jlAl[LOCAL_NL+1];									//the first LOCAL_NL components are stored in registers for speed
+	int shared_start = threadIdx.x * (Nl - LOCAL_NL);					//wrap up some operations so that they aren't done in the main loops
+
+	#pragma unroll LOCAL_NL+1											//copy the first LOCAL_NL+1 j_l * A_l components to registers
+	for(l = 0; l <= LOCAL_NL; l++)
+		jlAl[l] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha );
+
+	for(l = LOCAL_NL+1; l <= Nl; l++)									//copy any additional j_l * A_l components to shared memory
+		shared_jA[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha );
+
+	for(size_t w = 0; w < nW; w++){										//for each plane wave
+		if(r == 0) cos_phi = 0;											//the direction is undefined at the center of the sphere
+		else
+			cos_phi = p.norm().dot(W[w].kvec().norm());						//calculate the cosine of the angle between the k vector and the direction from the sphere
+		Pl_2 = 1;														//the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
+		Pl_1 = cos_phi;
+		Ei += W[w].E() * jlAl[0] * Pl_2;								//unroll the first two orders using the initial steps of the Legendre recursive relation
+		Ei += W[w].E() * jlAl[1] * Pl_1;
+
+		#pragma unroll LOCAL_NL-1										//unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
+		for(l = 2; l <= LOCAL_NL; l++){
+			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);	//calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
+			Ei += W[w].E() * jlAl[l] * Pl;								//calculate and sum the current field order
+			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+			Pl_1 = Pl;
+		}
+
+		for(l = LOCAL_NL+1; l <= Nl; l++){											//do the same as above, except for any additional orders that are stored in shared memory (not registers)
+			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);				//again, this is where most computation in the kernel occurs
+			Ei += W[w].E() * shared_jA[shared_start + l - LOCAL_NL - 1] * Pl;
+			Pl_2 = Pl_1;															//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+			Pl_1 = Pl;
+		}
+	}
+	E[i] = Ei;															//copy the result to device memory
+}
+
+/// Launch the cuda_scalar_mie_internal kernel. All pointers must reference device memory.
+///	The block size is chosen so that one (Nl + 1)-entry array per thread fits in the shared memory available to a block,
+///	rounded down to a multiple of 32 and limited to the maximum block size reported for the device.
+///	The kernel reads LOCAL_NL + 1 entries from every LUT row, so Nl is expected to be at least LOCAL_NL.
+/// @param E is the destination field (N values)
+/// @param N is the number of points
+/// @param x, y, z are the coordinates of the points relative to the sphere (NULL is treated as all zeros by the kernel)
+/// @param W is the array of plane waves and nW is the number of plane waves
+/// @param a is the radius of the sphere and n is its refractive index
+/// @param jA is the j_l * A_l look-up table (N_jA rows of Nl + 1 orders), starting at radius r_min with a row spacing of dr
+/// @param Nl is the highest order stored in the LUT
+template<typename T>
+void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, size_t Nl){
+
+	size_t max_shared_mem = stim::sharedMemPerBlock();
+	size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
+	std::cout<<"hl*Bl array size:  "<<hBl_array<<std::endl;
+	std::cout<<"shared memory:     "<<max_shared_mem<<std::endl;
+	int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);
+	int max_threads = stim::maxThreadsPerBlock();
+	if(threads > max_threads) threads = max_threads / 32 * 32;						//never request more threads than a block can hold
+	std::cout<<"threads per block: "<<threads<<std::endl;
+	if(threads <= 0){																//too many orders for even one warp: avoid a division by zero and an invalid launch
+		std::cout<<"Error: not enough shared memory to launch the internal Mie field kernel."<<std::endl;
+		exit(1);
+	}
+	dim3 blocks((unsigned)(N / threads + 1));										//calculate the optimal number of blocks
+
+	size_t shared_mem;
+	if(Nl <= LOCAL_NL) shared_mem = 0;
+	else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL);				//amount of shared memory to allocate
+	std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
+	cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl);	//call the kernel
+	HANDLE_ERROR( cudaGetLastError() );												//report an invalid launch configuration immediately
+}
+
+/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere
+
+/// @param E is a pointer to the destination field values
+/// @param N is the number of points used to calculate the field
+/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
+/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
+/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
+/// @param w is a planewave that will be scattered
+/// @param a is the radius of the sphere
+/// @param n is the complex refractive index of the sphere
+template<typename T>
+void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, T r_spacing = 0.1){
+//calculate the necessary number of orders required to represent the scattered field
+	T k = W[0].kmag();
+
+	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
+	if(Nl < LOCAL_NL) Nl = LOCAL_NL;							//always do at least the minimum number of local operations (kernel optimization)
+	std::cout<<"Nl: "<<Nl<<std::endl;
+
+	//calculate the scattering coefficients for the sphere
+	stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
+	A_coefficients(A, a, k, n, Nl);
+
+#ifdef CUDA_FOUND
+	stim::complex<T>* dev_E;										//allocate space for the field
+	cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
+	cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
+	//cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>));				//set the field to zero (necessary because a sum is used)
+
+	//	COORDINATES
+	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
+	if(x != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+	T* dev_y = NULL;												//allocate space and copy the Y coordinate (if specified)
+	if(y != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+	T* dev_z = NULL;												//allocate space and copy the Z coordinate (if specified)
+	if(z != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+
+	//	PLANE WAVES
+	stim::scalarwave<T>* dev_W;																//allocate space and copy plane waves
+	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
+	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
+
+	// BESSEL FUNCTION LOOK-UP TABLE
+	//calculate the distance from the sphere center
+	T* dev_r;
+	HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
+		
+	int threads = stim::maxThreadsPerBlock();
+	dim3 blocks((unsigned)(N / threads + 1));
+	cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
+
+	//Find the minimum and maximum values of r
+    cublasStatus_t stat;
+    cublasHandle_t handle;
+
+	stat = cublasCreate(&handle);							//create a cuBLAS handle
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS initialization failed\n");
+		exit(1);
+	}
+
+	int i_min, i_max;
+	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS Error: failed to calculate minimum r value.\n");
+		exit(1);
+	}
+	stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS Error: failed to calculate maximum r value.\n");
+		exit(1);
+	}
+
+	i_min--;				//cuBLAS uses 1-based indexing for Fortran compatibility
+	i_max--;
+	T r_min, r_max;											//allocate space to store the minimum and maximum values
+	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
+	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
+
+	r_max = min(r_max, a);		//the internal field doesn't exist outside of the sphere
+
+	size_t N_jA_lut = (size_t)((r_max - r_min) / r_spacing + 1);
+
+	//temporary variables
+	double vm;															//allocate space to store the return values for the bessel function calculation
+	stim::complex<double>* jv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* yv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* djv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dyv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+
+	size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut;
+	stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes);													//pointer to the look-up table
+	T dr = (r_max - r_min) / (N_jA_lut-1);												//distance between values in the LUT
+	std::cout<<"LUT jl bytes:  "<<jA_bytes<<std::endl;
+	stim::complex<T> hl;
+	stim::complex<double> nd = (stim::complex<double>)n;
+	for(size_t ri = 0; ri < N_jA_lut; ri++){													//for each value in the LUT
+		stim::cbessjyva_sph<double>(Nl, nd * k * (r_min + ri * dr), vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
+		for(size_t l = 0; l <= Nl; l++){													//for each order
+			jA_lut[ri * (Nl + 1) + l] = (stim::complex<T>)(jv[l] * (stim::complex<double>)A[l]);										//store the bessel function result
+		}
+	}
+
+	//Allocate device memory and copy everything to the GPU
+	stim::complex<T>* dev_jA_lut;
+	HANDLE_ERROR( cudaMalloc(&dev_jA_lut, jA_bytes) );
+	HANDLE_ERROR( cudaMemcpy(dev_jA_lut, jA_lut, jA_bytes, cudaMemcpyHostToDevice) );
+
+	gpu_scalar_mie_internal<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_jA_lut, r_min, dr, N_jA_lut, Nl);
+
+	cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost);			//copy the field from device memory
+
+	if(x != NULL) cudaFree(dev_x);														//free everything
+	if(y != NULL) cudaFree(dev_y);
+	if(z != NULL) cudaFree(dev_z);
+	cudaFree(dev_E);
+#else
+
+	//allocate space to store the bessel function call results
+	double vm;										
+	stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* y_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dj_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dy_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+
+	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );
+
+	T r, cos_phi;
+	stim::complex<double> knr;
+	stim::complex<T> h;
+	for(size_t i = 0; i < N; i++){
+		stim::vec3<T> p;									//declare a 3D point
+	
+		(x == NULL) ? p[0] = 0 : p[0] = x[i];				// test for NULL values and set positions
+		(y == NULL) ? p[1] = 0 : p[1] = y[i];
+		(z == NULL) ? p[2] = 0 : p[2] = z[i];
+		r = p.len();
+		if(r < a){
+			E[i] = 0;
+			for(size_t w = 0; w < W.size(); w++){
+				knr = (stim::complex<double>)n * p.len() * W[w].kmag();							//calculate k*n*r
+
+				stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr);
+				if(r == 0)
+					cos_phi = 0;
+				else
+					cos_phi = p.norm().dot(W[w].kvec().norm());				//calculate the cosine of the angle from the propagating direction
+				stim::legendre<T>(Nl, cos_phi, P);
+								
+				for(size_t l = 0; l <= Nl; l++){
+					E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l];
+				}
+			}
+		}
+	}
+#endif
+}
+
+/// Convenience overload for a single incident scalar wave: the wave is placed in a
+/// one-element list and the call is forwarded to the overload that accepts a list of waves.
+/// All other arguments are passed through unchanged.
+template<typename T>
+void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){
+	std::vector< stim::scalarwave<T> > waves;				//list that holds only the given wave
+	waves.push_back(w);
+	cpu_scalar_mie_internal(E, N, x, y, z, waves, a, n, r_spacing);
+}
+
+}
+
+#endif
 \ No newline at end of file
-#ifndef RTS_PLANEWAVE
-#define RTS_PLANEWAVE
+#ifndef STIM_PLANEWAVE_H
+#define STIM_PLANEWAVE_H
 #include <string>
 #include <sstream>
+#include <cmath>
 #include "../math/vector.h"
 #include "../math/quaternion.h"
 #include "../math/constants.h"
 #include "../math/plane.h"
-#include "../cuda/callable.h"
-
-/*Basic conversions used here (assuming a vacuum)
-	lambda =
-*/
+#include "../math/complex.h"
 namespace stim{
+	namespace optics{
+
+		/// Compute the complex scalar field of a plane wave at a single point p = (x, y, z)
+
+		/// @param x is the x-coordinate of the evaluation point
+		/// @param y is the y-coordinate of the evaluation point
+		/// @param z is the z-coordinate of the evaluation point
+		/// @param A is the complex amplitude of the wave, i.e. the field value at the origin
+		/// @param kx is the x component of the k-vector
+		/// @param ky is the y component of the k-vector
+		/// @param kz is the z component of the k-vector
+		/// @return A * exp(i * (k . p))
+		template<typename T>
+		stim::complex<T> planewave_scalar(T x, T y, T z, stim::complex<T> A, T kx, T ky, T kz){
+			const T k_dot_p = x * kx + y * ky + z * kz;			//phase accumulated between the origin and p
+			stim::complex<T> phase_term(0, k_dot_p);			//purely imaginary exponent i * (k . p)
+			return A * exp(phase_term);							//apply the phase shift to the amplitude at the origin
+		}
+
+		/// Evaluate the scalar field of a plane wave at a list of positions (host implementation)
+
+		/// @param field is a caller-allocated array of N complex values that receives the result
+		/// @param N is the number of positions to evaluate
+		/// @param x is an array of N x-coordinates (NULL means every x-coordinate is zero)
+		/// @param y is an array of N y-coordinates (NULL means every y-coordinate is zero)
+		/// @param z is an array of N z-coordinates (NULL means every z-coordinate is zero)
+		/// @param A is the complex amplitude of the wave, i.e. the field value at the origin
+		/// @param kx is the x component of the k-vector
+		/// @param ky is the y component of the k-vector
+		/// @param kz is the z component of the k-vector
+		template<typename T>
+		void cpu_planewave_scalar(stim::complex<T>* field, size_t N, T* x, T* y = NULL, T* z = NULL, stim::complex<T> A = 1.0, T kx = 0.0, T ky = 0.0, T kz = 0.0){
+			for(size_t idx = 0; idx < N; idx++){
+				T px = 0, py = 0, pz = 0;								// a missing coordinate array contributes zero
+				if(x != NULL) px = x[idx];
+				if(y != NULL) py = y[idx];
+				if(z != NULL) pz = z[idx];
+
+				field[idx] = planewave_scalar(px, py, pz, A, kx, ky, kz);	// evaluate the wave at this position
+			}
+		}
 template<typename T>
 class planewave{
 protected:
-	vec<T> k;	//k = tau / lambda
-	vec< complex<T> > E0;		//amplitude
-	//T phi;
-
-	CUDA_CALLABLE planewave<T> bend(rts::vec<T> kn) const{
+	stim::vec<T> k;							//k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
+	stim::vec< stim::complex<T> > E0;		//amplitude (for a scalar plane wave, only E0[0] is used)
-		vec<T> kn_hat = kn.norm();				//normalize the new k
-		vec<T> k_hat = k.norm();				//normalize the current k
+	/// Bend a plane wave via refraction, given that the new propagation direction is known
+	CUDA_CALLABLE planewave<T> bend(stim::vec<T> kn) const{
-		//std::cout<<"PLANE WAVE BENDING------------------"<<std::endl;
-		//std::cout<<"kn_hat: "<<kn_hat<<"     k_hat: "<<k_hat<<std::endl;
+		stim::vec<T> kn_hat = kn.norm();				//normalize the new k
+		stim::vec<T> k_hat = k.norm();					//normalize the current k
-		planewave<T> new_p;						//create a new plane wave
+		planewave<T> new_p;								//create a new plane wave
-		//if kn is equal to k or -k, handle the degenerate case
-		T k_dot_kn = k_hat.dot(kn_hat);
+		T k_dot_kn = k_hat.dot(kn_hat);					//if kn is equal to k or -k, handle the degenerate case
 		//if k . n < 0, then the bend is a reflection
-			//flip k_hat
-		if(k_dot_kn < 0) k_hat = -k_hat;
+		if(k_dot_kn < 0) k_hat = -k_hat;				//flip k_hat
-		//std::cout<<"k dot kn: "<<k_dot_kn<<std::endl;
-
-		//std::cout<<"k_dot_kn: "<<k_dot_kn<<std::endl;
 		if(k_dot_kn == -1){
 			new_p.k = -k;
 			new_p.E0 = E0;
@@ -56,28 +85,11 @@ protected:
 			return new_p;
 		}
-		vec<T> r = k_hat.cross(kn_hat);			//compute the rotation vector
-
-		//std::cout<<"r: "<<r<<std::endl;
-
-		T theta = asin(r.len());				//compute the angle of the rotation about r
-
-		
-
-		//deal with a zero vector (both k and kn point in the same direction)
-		//if(theta == (T)0)
-		//{
-		//	new_p = *this;
-		//	return new_p;
-		//}
-
-		//create a quaternion to capture the rotation
-		quaternion<T> q;
-		q.CreateRotation(theta, r.norm());
-
-		//apply the rotation to E0
-		vec< complex<T> > E0n = q.toMatrix3() * E0;
-
+		vec<T> r = k_hat.cross(kn_hat);					//compute the rotation vector
+		T theta = asin(r.len());						//compute the angle of the rotation about r
+		quaternion<T> q;								//create a quaternion to capture the rotation
+		q.CreateRotation(theta, r.norm());		
+		vec< complex<T> > E0n = q.toMatrix3() * E0;		//apply the rotation to E0
 		new_p.k = kn_hat * kmag();
 		new_p.E0 = E0n;
@@ -86,16 +98,9 @@ protected:
 public:
-
-	///constructor: create a plane wave propagating along z, polarized along x
-	/*planewave(T lambda = (T)1)
-	{
-		k = rts::vec<T>(0, 0, 1) * (TAU/lambda);
-		E0 = rts::vec<T>(1, 0, 0);
-	}*/
-	///constructor: create a plane wave propagating along k, polarized along _E0, at frequency _omega
-	CUDA_CALLABLE planewave(vec<T> kvec = rts::vec<T>(0, 0, rtsTAU), 
-							vec< complex<T> > E = rts::vec<T>(1, 0, 0), T phase = 0)
+	///constructor: create a plane wave propagating along k
+	CUDA_CALLABLE planewave(vec<T> kvec = stim::vec<T>(0, 0, stim::TAU), 
+							vec< complex<T> > E = stim::vec<T>(1, 0, 0))
 	{
 		//phi = phase;
@@ -107,27 +112,23 @@ public:
 		else{
 			vec< complex<T> > s = (k_hat.cross(E)).norm();		//compute an orthogonal side vector
 			vec< complex<T> > E_hat = (s.cross(k)).norm();	//compute a normalized E0 direction vector
-			E0 = E_hat * E_hat.dot(E);					//compute the projection of _E0 onto E0_hat
+			E0 = E_hat;// * E_hat.dot(E);					//compute the projection of _E0 onto E0_hat
 		}
 		E0 = E0 * exp( complex<T>(0, phase) );
 	}
 	///multiplication operator: scale E0
-    CUDA_CALLABLE planewave<T> & operator* (const T & rhs)
-	{
-		
+    CUDA_CALLABLE planewave<T> & operator* (const T & rhs){		
 		E0 = E0 * rhs;
 		return *this;
 	}
-	CUDA_CALLABLE T lambda() const
-	{
-		return rtsTAU / k.len();
+	CUDA_CALLABLE T lambda() const{
+		return stim::TAU / k.len();
 	}
-	CUDA_CALLABLE T kmag() const
-	{
+	CUDA_CALLABLE T kmag() const{
 		return k.len();
 	}
@@ -139,14 +140,11 @@ public:
 		return k;
 	}
-	/*CUDA_CALLABLE T phase(){
-		return phi;
+	/// calculate the value of the field produced by the plane wave given a three-dimensional position
+	CUDA_CALLABLE vec< complex<T> > pos(T x, T y, T z){
+		return pos( stim::vec<T>(x, y, z) );
 	}
-	CUDA_CALLABLE void phase(T p){
-		phi = p;
-	}*/
-
 	CUDA_CALLABLE vec< complex<T> > pos(vec<T> p = vec<T>(0, 0, 0)){
 		vec< complex<T> > result;
@@ -166,18 +164,32 @@ public:
 		return planewave<T>(k * (nt / ni), E0);
 	}
-	CUDA_CALLABLE planewave<T> refract(rts::vec<T> kn) const
-	{
+	CUDA_CALLABLE planewave<T> refract(stim::vec<T> kn) const{
 		return bend(kn);
 	}
-	void scatter(rts::plane<T> P, T nr, planewave<T> &r, planewave<T> &t){
+	/// Calculate the result of a plane wave hitting an interface between two refractive indices
+
+	/// @param P is a plane representing the position and orientation of the surface
+	/// @param n0 is the refractive index outside of the surface (in the direction of the normal)
+	/// @param n1 is the refractive index inside the surface (in the direction away from the normal)
+	/// @param r is the reflected component of the plane wave
+	/// @param t is the transmitted component of the plane wave
+	void scatter(stim::plane<T> P, T n0, T n1, planewave<T> &r, planewave<T> &t){
+		scatter(P, n1/n0, r, t);
+	}
+
+	/// Calculate the scattering result when nr = n1/n0
+
+	/// @param P is a plane representing the position and orientation of the surface
+	/// @param nr is the ratio n1/n0
+	/// @param n1 is the refractive index inside the surface (in the direction away from the normal)
+	/// @param r is the reflected component of the plane wave
+	/// @param t is the transmitted component of the plane wave
+	void scatter(stim::plane<T> P, T nr, planewave<T> &r, planewave<T> &t){
 		int facing = P.face(k);		//determine which direction the plane wave is coming in
-		//if(facing == 0)				//if the wave is tangent to the plane, return an identical wave
-		//	return *this;
-		//else 
 		if(facing == -1){		//if the wave hits the back of the plane, invert the plane and nr
 			P = P.flip();			//flip the plane
 			nr = 1/nr;				//invert the refractive index (now nr = n0/n1)
@@ -192,7 +204,7 @@ public:
 		bool tir = false;						//flag for total internal reflection
 		if(theta_t != theta_t){
 			tir = true;
-			theta_t = rtsPI / (T)2;
+			theta_t = stim::PI / (T)2;
 		}
 		//handle the degenerate case where theta_i is 0 (the plane wave hits head-on)
@@ -205,17 +217,10 @@ public:
 			vec< complex<T> > Et = E0 * tp;
 			T phase_t = P.p().dot(k - kt);	//compute the phase offset
 			T phase_r = P.p().dot(k - kr);
-			//std::cout<<"Degeneracy: Head-On"<<std::endl;
-			//std::cout<<"rs: "<<rp<<"  rp: "<<rp<<"  ts: "<<tp<<"  tp: "<<tp<<std::endl;
-			//std::cout<<"phase r: "<<phase_r<<"  phase t: "<<phase_t<<std::endl;
 			//create the plane waves
 			r = planewave<T>(kr, Er, phase_r);
 			t = planewave<T>(kt, Et, phase_t);
-
-			//std::cout<<"i + r: "<<pos()[0] + r.pos()[0]<<pos()[1] + r.pos()[1]<<pos()[2] + r.pos()[2]<<std::endl;
-			//std::cout<<"t:     "<<t.pos()[0]<<t.pos()[1]<<t.pos()[2]<<std::endl;
-			//std::cout<<"--------------------------------"<<std::endl;
 			return;
 		}
@@ -245,11 +250,9 @@ public:
 		//compute the magnitude of the p- and s-polarized components of the incident E vector
 		complex<T> Ei_s = E0.dot(x_hat);
-		//int sgn = (0 < E0.dot(y_hat)) - (E0.dot(y_hat) < 0);
 		int sgn = E0.dot(y_hat).sgn();
 		vec< complex<T> > cx_hat = x_hat;
 		complex<T> Ei_p = ( E0 - cx_hat * Ei_s ).len() * sgn;
-		//T Ei_p = ( E0 - x_hat * Ei_s ).len();
 		//compute the magnitude of the p- and s-polarized components of the reflected E vector
 		complex<T> Er_s = Ei_s * rs;
 		complex<T> Er_p = Ei_p * rp;
@@ -257,14 +260,6 @@ public:
 		complex<T> Et_s = Ei_s * ts;
 		complex<T> Et_p = Ei_p * tp;
-		//std::cout<<"E0: "<<E0<<std::endl;
-		//std::cout<<"E0 dot y_hat: "<<E0.dot(y_hat)<<std::endl;
-		//std::cout<<"theta i: "<<theta_i<<"  theta t: "<<theta_t<<std::endl;
-		//std::cout<<"x_hat: "<<x_hat<<"  y_hat: "<<y_hat<<"  z_hat: "<<z_hat<<std::endl;
-		//std::cout<<"Ei_s: "<<Ei_s<<"  Ei_p: "<<Ei_p<<"  Er_s: "<<Er_s<<"  Er_p: "<<Er_p<<"  Et_s: "<<Et_s<<"  Et_p: "<<Et_p<<std::endl;
-		//std::cout<<"rs: "<<rs<<"  rp: "<<rp<<"  ts: "<<ts<<"  tp: "<<tp<<std::endl;
-		
-
 		//compute the reflected E vector
 		vec< complex<T> > Er = vec< complex<T> >(y_hat * cos(theta_i) + z_hat * sin(theta_i)) * Er_p + cx_hat * Er_s;
 		//compute the transmitted E vector
@@ -273,29 +268,12 @@ public:
 		T phase_t = P.p().dot(k - kt);
 		T phase_r = P.p().dot(k - kr);
-		//std::cout<<"phase r: "<<phase_r<<"  phase t: "<<phase_t<<std::endl;
-
-		//std::cout<<"phase: "<<phase<<std::endl;
-
 		//create the plane waves
 		r.k = kr;
 		r.E0 = Er * exp( complex<T>(0, phase_r) );
-		//r.phi = phase_r;
-
-		//t = bend(kt);
-		//t.k = t.k * nr;
 		t.k = kt;
 		t.E0 = Et * exp( complex<T>(0, phase_t) );
-		//t.phi = phase_t;
-		//std::cout<<"i: "<<str()<<std::endl;
-		//std::cout<<"r: "<<r.str()<<std::endl;
-		//std::cout<<"t: "<<t.str()<<std::endl;
-
-		//std::cout<<"i + r: "<<pos()[0] + r.pos()[0]<<pos()[1] + r.pos()[1]<<pos()[2] + r.pos()[2]<<std::endl;
-		//std::cout<<"t:     "<<t.pos()[0]<<t.pos()[1]<<t.pos()[2]<<std::endl;
-		//std::cout<<"--------------------------------"<<std::endl;
-
 	}
 	std::string str()
@@ -305,14 +283,15 @@ public:
 		ss<<"	"<<E0<<" e^i ( "<<k<<" . r )";
 		return ss.str();
 	}
-};
-}
+};					//end planewave class
+}					//end namespace optics
+}					//end namespace stim
 template <typename T>
-std::ostream& operator<<(std::ostream& os, rts::planewave<T> p)
+std::ostream& operator<<(std::ostream& os, stim::optics::planewave<T> p)
 {
     os<<p.str();
     return os;
 }
 -#endif
+#endif
 \ No newline at end of file
+#ifndef RTS_BEAM
+#define RTS_BEAM
+#include <boost/math/special_functions/bessel.hpp>
+
+#include "../math/vec3.h"
+#include "../optics/scalarwave.h"
+#include "../math/bessel.h"
+#include "../math/legendre.h"
+#include "../cuda/cudatools/devices.h"
+#include "../cuda/cudatools/timer.h"
+#include "../optics/scalarfield.h"
+#include <cublas_v2.h>
+#include <math_constants.h>
+#include <vector>
+#include <stdlib.h>
+
+
+
+namespace stim{
+
+/// Generate a set of randomly sampled direction vectors that fill the focusing cone of an objective
+
+/// @param N is the number of direction vectors to generate
+/// @param d is the propagation direction of the beam; it is used as-is in a dot product that is treated
+///			as the cosine of an angle, so it is presumably expected to be a unit vector (TODO confirm with callers)
+/// @param NA is the numerical aperture of the focusing optics (outer edge of the cone)
+/// @param NA_in is the numerical aperture of the central obscuration (inner edge of the cone)
+/// @return a list of N direction vectors; samples are drawn with rand(), which is not seeded by this function
+template<typename T>
+std::vector< stim::vec3<T> > generate_focusing_vectors(size_t N, stim::vec3<T> d, T NA, T NA_in = 0){
+
+	std::vector< stim::vec3<T> > dirs(N);					//allocate an array to store the focusing vectors
+
+	///compute the rotation operator to transform (0, 0, 1) to k
+	T cos_angle = d.dot(vec3<T>(0, 0, 1));
+	stim::matrix<T, 3> rotation;							//NOTE(review): the d == +z and d == -z cases below rely on this
+															//	default-constructed matrix being the identity -- confirm
+
+	//if the cosine of the angle is -1, the rotation is just a flip across the z axis
+	if(cos_angle == -1){
+		rotation(2, 2) = -1;
+	}
+	else if(cos_angle != 1.0)
+	{
+		vec3<T> r_axis = vec3<T>(0, 0, 1).cross(d).norm();	//compute the axis of rotation
+		T angle = acos(cos_angle);							//compute the angle of rotation
+		quaternion<T> quat;							//create a quaternion describing the rotation
+		quat.CreateRotation(angle, r_axis);
+		rotation = quat.toMatrix3();							//compute the rotation matrix
+	}
+
+	//find the phi values associated with the cassegrain ring
+	T PHI[2];
+	PHI[0] = (T)asin(NA);
+	PHI[1] = (T)asin(NA_in);
+
+	//calculate the z-axis cylinder coordinates associated with these angles
+	T Z[2];
+	Z[0] = cos(PHI[0]);
+	Z[1] = cos(PHI[1]);
+	T range = Z[0] - Z[1];
+
+	//draw a distribution of random phi, z values
+	T z, phi, theta;
+	for(size_t i=0; i<N; i++){							//for each sample (size_t index matches the type of N)
+		z = (T)((double)rand() / (double)RAND_MAX) * range + Z[1];			//find a random position on the surface of a cylinder
+		theta = (T)(((double)rand() / (double)RAND_MAX) * stim::TAU);
+		phi = acos(z);													//project onto the sphere, computing phi in spherical coordinates
+
+		//compute and store cartesian coordinates
+		vec3<T> spherical(1, theta, phi);								//convert from spherical to cartesian coordinates
+		vec3<T> cart = spherical.sph2cart();
+		dirs[i] = rotation * cart;										//create a sample vector
+	}
+	return dirs;
+}
+
+		
+/// Evaluate the aperture-integral terms for orders [0 Nl] from the numerical aperture and the (optional) center obscuration
+/// @param C is a caller-allocated array of Nl + 1 values that receives the terms
+/// @param Nl is the maximum order
+/// @param NA is the numerical aperture of the objective
+/// @param NA_in is the numerical aperture of the central obscuration
+template<typename T>
+CUDA_CALLABLE void cpu_aperture_integral(T* C, int Nl, T NA, T NA_in = 0){
+
+	size_t bytes_short = (Nl + 1) * sizeof(T);				//size of a table with Nl + 1 entries
+	size_t bytes_long = bytes_short + sizeof(T);			//size of a table with Nl + 2 entries
+	T cos_inner = cos(asin(NA_in));							//cosine of the angle subtended by the central obscuration
+	T cos_outer = cos(asin(NA));							//cosine of the angle subtended by the aperture
+
+	// Four tables are combined below, two per angle.
+	// "prev" tables: entry 0 is fixed to 1 and entries [1 Nl] are filled by stim::legendre (called with order Nl-1)
+	T* prev_inner = (T*) malloc(bytes_short);
+	T* prev_outer = (T*) malloc(bytes_short);
+	stim::legendre<T>(Nl-1, cos_inner, &prev_inner[1]);
+	stim::legendre<T>(Nl-1, cos_outer, &prev_outer[1]);
+	prev_inner[0] = 1;
+	prev_outer[0] = 1;
+
+	// "next" tables: filled by stim::legendre called with order Nl+1; entry l+1 is used for order l
+	T* next_inner = (T*) malloc(bytes_long);
+	T* next_outer = (T*) malloc(bytes_long);
+	stim::legendre<T>(Nl+1, cos_inner, next_inner);
+	stim::legendre<T>(Nl+1, cos_outer, next_outer);
+
+	for(size_t order = 0; order <= Nl; order++){			//combine the four tables into one term per order
+		C[order] = next_inner[order+1] - next_outer[order+1] - prev_inner[order] + prev_outer[order];
+	}
+
+	free(next_outer);										//release the temporary tables
+	free(next_inner);
+	free(prev_outer);
+	free(prev_inner);
+}
+
+/// Linearly interpolate between two consecutive rows of a look-up table
+/// @param lut_values receives the n_vals interpolated values
+/// @param lut is the table, stored as consecutive rows of n_vals entries each
+/// @param val is the coordinate to look up
+/// @param N is not used by this function
+/// @param min_val is the coordinate associated with the first row
+/// @param delta is the coordinate spacing between consecutive rows
+/// @param n_vals is the number of entries in each row
+/// NOTE(review): the row following the one that contains val is always read and no bounds
+///		check is performed, so the caller must guarantee that this row exists.
+template<typename T>
+CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val, T delta, size_t n_vals){
+	T pos = ((val - min_val) / delta);					//fractional row index of val
+	size_t row = (size_t) pos;							//row at or below val
+	T w_hi = pos - row;									//weight of the following row
+	T w_lo = 1 - w_hi;									//weight of this row
+	T* lo = lut + row * n_vals;							//start of this row
+	T* hi = lut + (row+1) * n_vals;						//start of the following row
+	for(size_t c = 0; c < n_vals; c++){
+		lut_values[c] = lo[c] * w_lo + hi[c] * w_hi;
+	}
+}
+
+/// Linear interpolation between two complex values, applied separately to the real and imaginary parts
+/// (t = 0 yields v0, t = 1 yields v1)
+template <typename T>
+CUDA_CALLABLE stim::complex<T> clerp(stim::complex<T> v0, stim::complex<T> v1, T t) {
+    T re = fma(-t, v0.r, v0.r);             //(1 - t) * v0.r
+    re = fma(t, v1.r, re);                  //    + t * v1.r
+    T im = fma(-t, v0.i, v0.i);             //(1 - t) * v0.i
+    im = fma(t, v1.i, im);                  //    + t * v1.i
+    return stim::complex<T>( re, im );
+}
+
+/// Linear interpolation between two values using fused multiply-adds (t = 0 yields v0, t = 1 yields v1)
+template <typename T>
+CUDA_CALLABLE T lerp(T v0, T v1, T t) {
+    T scaled_v0 = fma(-t, v0, v0);          //(1 - t) * v0
+    return fma(t, v1, scaled_v0);           //    + t * v1
+}
+
+#ifdef CUDA_FOUND
+/// CUDA kernel that evaluates the scalar PSF at N points given in spherical coordinates (one thread per point)
+
+/// @param E receives the N complex field values
+/// @param N is the number of points; threads with an index >= N exit immediately
+/// @param r is the radial distance of each point
+/// @param phi is the polar angle of each point
+/// @param A is the amplitude used to scale the result
+/// @param Nl is the maximum order of the series (Nl + 1 terms are summed)
+/// @param C is an array of Nl + 1 aperture-integral terms
+/// @param lut_j is the spherical Bessel look-up table, stored as Nj rows of Nl + 1 orders
+/// @param Nj is the number of rows in lut_j
+/// @param min_r is the radius associated with the first row of lut_j
+/// @param dr is the radial spacing between consecutive rows of lut_j
+template<typename T>
+__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T A, size_t Nl,
+								T* C, 
+								T* lut_j, size_t Nj, T min_r, T dr){
+	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
+	if(i >= N) return;												//exit if this thread is outside the array
+
+	T cos_phi = cos(phi[i]);									//calculate the thread value for cos(phi)
+	stim::complex<T> Ei = 0;									//initialize the value of the field to zero
+	size_t NC = Nl + 1;										//calculate the number of coefficients to be used
+
+	T fij = (r[i] - min_r)/dr;								//FP index into the spherical bessel LUT
+	size_t ij = (size_t) fij;								//convert to an integral index
+	T a = fij - ij;											//calculate the fractional portion of the index
+	if(ij + 1 >= Nj){										//a point at the largest radius maps onto the last row, which has no
+		if(Nj > 1){											//	successor: interpolate from the previous row with full weight on
+			ij = Nj - 2;									//	the last row instead of reading past the end of the table
+			a = 1;
+		}
+		else{												//single-row table: use that row directly
+			ij = 0;
+			a = 0;
+		}
+	}
+	size_t n0j = ij * (NC);									//start of the first entry in the LUT
+	size_t n1j = (Nj > 1) ? (ij+1) * (NC) : n0j;			//start of the second entry in the LUT
+
+	T jl;											//declare register to store the spherical bessel function
+	T Pl_2 = 0, Pl_1 = 0;							//previous two Legendre polynomials (initialized so that the first shift reads defined values)
+	T Pl = 1;										//initialize the current value for the Legendre polynomial
+	stim::complex<T> im(0, 1);						//declare i (imaginary 1)
+	stim::complex<T> i_pow(1, 0);					//i_pow stores the current value of i^l so it doesn't have to be re-computed every iteration
+	for(int l = 0; l <= Nl; l++){					//for each order
+		jl = lerp<T>( lut_j[n0j + l], lut_j[n1j + l], a );	//read jl from the LUT and interpolate the result
+		Ei += i_pow * jl * Pl * C[l];				//calculate the value for the field and sum
+		i_pow *= im;								//multiply i^l * i for the next iteration
+		Pl_2 = Pl_1;								//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+		Pl_1 = Pl;
+		if(l == 0){									//computing Pl is done recursively, where the recursive relation
+			Pl = cos_phi;							//	requires the first two orders. This defines the second.
+		}
+		else{										//if this is not the first iteration, use the recursive relation to calculate Pl
+			Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
+		}
+		
+	}
+	E[i] = Ei * A * 2 * CUDART_PI_F;						//scale the integral by the amplitude
+}
+
+/// Evaluate the scalar PSF on the GPU for N points given in spherical coordinates about the focal point
+
+/// @param E is a device array that receives the N complex field values
+/// @param N is the number of points
+/// @param r is a device array storing the radial distance of each point
+/// @param phi is a device array storing the polar angle of each point
+/// @param lambda is the wavelength (the wavenumber is computed as TAU / lambda)
+/// @param A is the amplitude used to scale the field
+/// @param NA is the numerical aperture of the focusing optics
+/// @param NA_in is the numerical aperture of the central obscuration
+/// @param Nl is the maximum order of the series
+/// @param r_spacing is the requested spacing along r between rows of the spherical Bessel look-up table
+/// NOTE(review): cublasIsamin/cublasIsamax are the single-precision cuBLAS routines, so this function
+///		presumably only compiles for T = float -- confirm before instantiating with another type.
+template<typename T>
+void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl, T r_spacing){
+
+	//Find the minimum and maximum values of r
+    cublasStatus_t stat;
+    cublasHandle_t handle;
+
+	stat = cublasCreate(&handle);							//create a cuBLAS handle
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS initialization failed\n");
+		exit(1);
+	}
+
+	int i_min, i_max;
+	stat = cublasIsamin(handle, (int)N, r, 1, &i_min);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS Error: failed to calculate minimum r value.\n");
+		exit(1);
+	}
+	stat = cublasIsamax(handle, (int)N, r, 1, &i_max);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS Error: failed to calculate maximum r value.\n");
+		exit(1);
+	}
+	cublasDestroy(handle);									//the handle is only needed for the two searches above
+
+	i_min--;												//cuBLAS uses 1-based indexing for Fortran compatibility
+	i_max--;
+	T r_min, r_max;											//allocate space to store the minimum and maximum values
+	HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
+	HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
+
+	T k = (T)stim::TAU / lambda;							//calculate the wavenumber from lambda
+	size_t C_bytes = (Nl + 1) * sizeof(T);
+	T* C = (T*) malloc( C_bytes );							//allocate space for the aperture integral terms
+	cpu_aperture_integral(C, Nl, NA, NA_in);				//calculate the aperture integral terms
+
+	size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1);			//number of values in the look-up table based on the user-specified spacing along r
+	if(Nlut_j < 2) Nlut_j = 2;											//interpolation needs two rows; this also keeps the divisor below non-zero
+
+	size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
+	T* j_lut = (T*) malloc(lutj_bytes);													//pointer to the look-up table
+	T dr = (r_max - r_min) / (Nlut_j-1);												//distance between values in the LUT
+	if(!(dr > 0)) dr = 1;																//every point has the same radius: any positive spacing maps them all to row 0
+	T jl;
+	for(size_t ri = 0; ri < Nlut_j; ri++){													//for each value in the LUT
+		for(size_t l = 0; l <= Nl; l++){													//for each order
+			jl = boost::math::sph_bessel<T>(l, k*(r_min + ri * dr));					//use boost to calculate the spherical bessel function
+			j_lut[ri * (Nl + 1) + l] = jl;												//store the bessel function result
+		}
+	}
+
+	stim::cpu2image<T>(j_lut, "j_lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);		//NOTE(review): looks like debugging output written on every call -- confirm it is still wanted
+
+	//Allocate device memory and copy everything to the GPU
+	T* gpu_C;
+	HANDLE_ERROR( cudaMalloc(&gpu_C, C_bytes) );
+	HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) );
+	T* gpu_j_lut;
+	HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) );
+	HANDLE_ERROR( cudaMemcpy(gpu_j_lut, j_lut, lutj_bytes, cudaMemcpyHostToDevice) );
+
+	free(C);															//the host copies are no longer needed once they are on the device
+	free(j_lut);
+
+	int threads = stim::maxThreadsPerBlock();			//get the maximum number of threads per block for the CUDA device
+	dim3 blocks( (unsigned)(N / threads + 1));						//calculate the optimal number of blocks
+
+	cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, A, Nl, gpu_C, gpu_j_lut, Nlut_j, r_min, dr);
+	HANDLE_ERROR( cudaGetLastError() );								//report an invalid launch configuration immediately
+
+	//free the LUT and condenser tables
+	HANDLE_ERROR( cudaFree(gpu_C) );
+	HANDLE_ERROR( cudaFree(gpu_j_lut) );
+}
+#endif
+
+/// Calculate the analytical solution to a scalar point spread function given a set of spherical coordinates about the PSF (beam propagation along phi = theta = 0)
+
+/// @param F is a caller-allocated array of N complex values that receives the field (it is zeroed first)
+/// @param N is the number of points
+/// @param r is an array storing the radial distance of each point from the focal point
+/// @param phi is an array storing the polar angle of each point
+/// @param lambda is the wavelength (the wavenumber is computed as TAU / lambda)
+/// @param A is the amplitude used to scale the field
+/// @param NA is the numerical aperture of the focusing optics
+/// @param NA_in is the numerical aperture of the central obscuration
+/// @param Nl is the maximum order of the series (Nl + 1 terms are summed per point)
+template<typename T>
+void cpu_scalar_psf_local(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl){
+	T k = (T)stim::TAU / lambda;
+	size_t C_bytes = (Nl + 1) * sizeof(T);
+	T* C = (T*) malloc( C_bytes );					//allocate space for the aperture integral terms
+	cpu_aperture_integral(C, Nl, NA, NA_in);			//calculate the aperture integral terms
+	memset(F, 0, N * sizeof(stim::complex<T>));
+	T jl, Pl, kr, cos_phi;
+
+	double vm;										//output of the bessel routine (not used afterwards)
+	double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
+
+	T* Pl_cos_phi = (T*) malloc((Nl + 1) * sizeof(T));
+
+	for(size_t n = 0; n < N; n++){								//for each point in the field
+		kr = k * r[n];											//calculate kr (the optical distance between the focal point and p)
+		cos_phi = std::cos(phi[n]);								//calculate the cosine of phi
+		stim::bessjyv_sph<double>(Nl, kr, vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
+		stim::legendre<T>(Nl, cos_phi, Pl_cos_phi);				//calculate the [0 Nl] legendre polynomials for this point
+
+		for(int l = 0; l <= Nl; l++){
+			jl = (T)jv[l];
+			Pl = Pl_cos_phi[l];
+			F[n] += pow(complex<T>(0, 1), l) * jl * Pl * C[l];
+		}
+		F[n] *= A * stim::TAU;
+	}
+
+	free(C);
+	free(Pl_cos_phi);
+	free(jv);													//release the bessel function work arrays
+	free(yv);
+	free(djv);
+	free(dyv);
+}
+
+#ifdef CUDA_FOUND
+/// CUDA kernel that converts a set of cartesian points into spherical coordinates surrounding a point spread function (PSF)
+/// (one thread per point; guarded by CUDA_FOUND like the other device code in this file)
+/// @param r is the output distance from the PSF
+/// @param phi is the output polar angle about the PSF, in [0 pi]
+/// @param N is the number of points; threads with an index >= N exit immediately
+/// @param x (x, y, z) are the cartesian coordinates in world space; a NULL array is treated as all zeros
+/// @param y see x
+/// @param z see x
+/// @param f is the focal point of the PSF in cartesian coordinates
+/// @param q is the rotation applied to each point after it is shifted by the focal point
+template<typename T>
+__global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::vec3<T> f, stim::quaternion<T> q){
+
+	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
+	if(i >= N) return;												//exit if this thread is outside the array
+
+	stim::vec3<T> p;									//declare a 3D point
+	
+	(x == NULL) ? p[0] = 0 : p[0] = x[i];				// test for NULL values and set positions
+	(y == NULL) ? p[1] = 0 : p[1] = y[i];
+	(z == NULL) ? p[2] = 0 : p[2] = z[i];
+
+	p = p - f;											//shift the point to the center of the PSF (focal point)
+	p = q.toMatrix3() * p;								//rotate the point to align with the propagation direction
+
+	stim::vec3<T> ps = p.cart2sph();									//convert from cartesian to spherical coordinates
+	r[i] = ps[0];										//store r
+	phi[i] = ps[2];										//phi = [0 pi]
+}
+#endif
+
+#ifdef CUDA_FOUND
+/// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates
+
+/// E, x, y and z are handed directly to CUDA kernels, so they must be device pointers
+/// (x, y and z may be NULL, in which case that coordinate is treated as zero).
+/// @param E is a device array that receives the N complex field values
+/// @param N is the number of points
+/// @param lambda is the wavelength
+/// @param A is the amplitude used to scale the field
+/// @param f is the focal point of the PSF
+/// @param d is the propagation direction of the PSF
+/// @param NA is the numerical aperture of the focusing optics
+/// @param NA_in is the numerical aperture of the central obscuration
+/// @param Nl is the maximum order of the series
+/// @param r_spacing is the requested spacing along r for the Bessel look-up table
+template<typename T>
+void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
+	
+	T* gpu_r;															//allocate space for the coordinates in r
+	HANDLE_ERROR( cudaMalloc(&gpu_r, sizeof(T) * N) );
+	T* gpu_phi;
+	HANDLE_ERROR( cudaMalloc(&gpu_phi, sizeof(T) * N) );
+
+	stim::quaternion<T> q;												//create a quaternion
+	q.CreateRotation(d, stim::vec3<T>(0, 0, 1));						//create a mapping from the propagation direction to the PSF space
+	int threads = stim::maxThreadsPerBlock();							//get the maximum number of threads per block for the CUDA device
+	dim3 blocks( (unsigned)(N / threads + 1));							//calculate the optimal number of blocks
+	cuda_cart2psf<T> <<< blocks, threads >>> (gpu_r, gpu_phi, N, x, y, z, f, q);	//call the CUDA kernel to move the cartesian coordinates to PSF space
+	HANDLE_ERROR( cudaGetLastError() );									//report an invalid launch configuration immediately
+
+	gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing);
+
+	HANDLE_ERROR( cudaFree(gpu_r) );									//release the temporary spherical coordinates
+	HANDLE_ERROR( cudaFree(gpu_phi) );
+}
+#endif
+
+/// Calculate the analytical solution to a scalar point spread function for a set of host-side cartesian points
+
+/// When CUDA_FOUND is defined the points are copied to the GPU, evaluated there, and the result is copied back;
+/// otherwise the points are converted to spherical coordinates and evaluated on the CPU.
+/// @param E is a caller-allocated host array of N complex values that receives the field
+/// @param N is the number of points
+/// @param x (x, y, z) are host arrays of cartesian coordinates; a NULL array is treated as all zeros
+/// @param y see x
+/// @param z see x
+/// @param lambda is the wavelength
+/// @param A is the amplitude used to scale the field
+/// @param f is the focal point of the PSF
+/// @param d is the propagation direction of the PSF
+/// @param NA is the numerical aperture of the focusing optics
+/// @param NA_in is the numerical aperture of the central obscuration
+/// @param Nl is the maximum order of the series
+/// @param r_spacing is the look-up table spacing along r (only used by the GPU path)
+template<typename T>
+void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
+
+// If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel
+#ifdef CUDA_FOUND
+
+	T* gpu_x = NULL;
+	if(x != NULL){
+		HANDLE_ERROR( cudaMalloc(&gpu_x, sizeof(T) * N) );
+		HANDLE_ERROR( cudaMemcpy(gpu_x, x, sizeof(T) * N, cudaMemcpyHostToDevice) );
+	}
+	T* gpu_y = NULL;
+	if(y != NULL){
+		HANDLE_ERROR( cudaMalloc(&gpu_y, sizeof(T) * N) );
+		HANDLE_ERROR( cudaMemcpy(gpu_y, y, sizeof(T) * N, cudaMemcpyHostToDevice) );
+	}
+	T* gpu_z = NULL;
+	if(z != NULL){
+		HANDLE_ERROR( cudaMalloc(&gpu_z, sizeof(T) * N) );
+		HANDLE_ERROR( cudaMemcpy(gpu_z, z, sizeof(T) * N, cudaMemcpyHostToDevice) );
+	}
+
+	stim::complex<T>* gpu_E;											//output buffer: the PSF kernel overwrites every one of its N entries,
+	HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) );	//	so the previous contents of E are not uploaded
+	gpu_scalar_psf_cart<T>(gpu_E, N, gpu_x, gpu_y, gpu_z, lambda, A, f, d, NA, NA_in, Nl, r_spacing);
+	HANDLE_ERROR( cudaMemcpy(E, gpu_E, sizeof(stim::complex<T>) * N, cudaMemcpyDeviceToHost) );
+
+	HANDLE_ERROR( cudaFree(gpu_x) );									//pointers left at NULL are passed through unchanged
+	HANDLE_ERROR( cudaFree(gpu_y) );
+	HANDLE_ERROR( cudaFree(gpu_z) );
+	HANDLE_ERROR( cudaFree(gpu_E) );
+
+#else
+	T* r = (T*) malloc(N * sizeof(T));					//allocate space for p in spherical coordinates
+	T* phi = (T*) malloc(N * sizeof(T));				//	only r and phi are necessary (the scalar PSF is symmetric about theta)
+
+	stim::quaternion<T> q;
+	q.CreateRotation(d, stim::vec3<T>(0, 0, 1));
+	stim::matrix<T, 3> R = q.toMatrix3();
+	stim::vec3<T> p, ps, ds;
+	for(size_t i = 0; i < N; i++){
+		(x == NULL) ? p[0] = 0 : p[0] = x[i];	// test for NULL values and set positions
+		(y == NULL) ? p[1] = 0 : p[1] = y[i];
+		(z == NULL) ? p[2] = 0 : p[2] = z[i];
+
+		p = p - f;								//shift the point to the center of the PSF (focal point)
+
+		p = R * p;								//rotate the cartesian point
+
+		ps = p.cart2sph();						//convert from cartesian to spherical coordinates
+		r[i] = ps[0];							//store r
+		phi[i] = ps[2];							//phi = [0 pi]
+	}
+
+	cpu_scalar_psf_local(E, N, r, phi, lambda, A, NA, NA_in, Nl);		//call the spherical coordinate CPU function
+
+	free(r);
+	free(phi);
+#endif
+}
+		
+/// Class stim::scalarbeam represents a scalar beam of light focused at a point and composed of several plane waves
+template<typename T>
+class scalarbeam
+{
+public:
+	//enum beam_type {Uniform, Bartlett, Hamming, Hanning};
+
+private:
+
+	T NA[2];				//NA[0] = numerical aperture of the focusing optics, NA[1] = NA of the center obscuration
+	vec3<T> f;				//focal point
+	vec3<T> d;				//propagation direction (normalized by the constructor)
+	T A;					//beam amplitude
+	T lambda;				//beam wavelength
+public:
+
+	///constructor: build a default beam (NA=1.0, no center obscuration)
+	scalarbeam(T wavelength = 1, T amplitude = 1, vec3<T> focal_point = vec3<T>(0, 0, 0), vec3<T> direction = vec3<T>(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){
+		lambda = wavelength;
+		A = amplitude;
+		f = focal_point;
+		d = direction.norm();					//make sure that the direction vector is normalized (makes calculations more efficient later on)
+		NA[0] = numerical_aperture;
+		NA[1] = center_obsc;
+	}
+
+	///Numerical Aperture functions
+
+	/// Set the numerical aperture of an unobscured beam
+	/// @param na is stored as the aperture (NA[0]); the center obscuration (NA[1]) is cleared,
+	///		matching the slot layout used by the constructor, mc() and eval()
+	void setNA(T na)
+	{
+		NA[0] = na;
+		NA[1] = (T)0;
+	}
+	/// Set both values in the same order as the constructor
+	/// @param na0 is stored as the aperture (NA[0])
+	/// @param na1 is stored as the center obscuration (NA[1])
+	void setNA(T na0, T na1)
+	{
+		NA[0] = na0;
+		NA[1] = na1;
+	}
+
+	//Monte-Carlo decomposition into plane waves
+	/// @param N is the number of plane waves to generate
+	/// @return N plane waves whose directions come from generate_focusing_vectors and whose phase is referenced to the focal point
+	std::vector< scalarwave<T> > mc(size_t N = 100000) const{
+
+		std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]);	//generate a random set of N vectors forming a focus
+		std::vector< scalarwave<T> > samples(N);											//create a vector of plane waves
+		T kmag = (T)stim::TAU / lambda;								//calculate the wavenumber
+		stim::complex<T> apw;										//allocate space for the amplitude at the focal point
+		T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N);			//constant value weights plane waves based on the aperture and number of samples (N)
+		stim::vec3<T> kpw;											//declare the new k-vector based on the focused plane wave direction
+		for(size_t i=0; i<N; i++){										//for each sample
+			kpw = dirs[i] * kmag;									//calculate the k-vector for the new plane wave
+			apw = a * exp(stim::complex<T>(0, kpw.dot(-f)));				//calculate the amplitude for the new plane wave
+			samples[i] = scalarwave<T>(kpw, apw);			//create a plane wave based on the direction
+		}
+		return samples;
+	}
+
+	/// Evaluate the beam to a scalar field using Debye focusing
+	/// @param E is the field to fill; its coordinate meshes are generated in host memory
+	/// @param order is forwarded to cpu_scalar_psf_cart as the (int) order parameter Nl
+	void eval(stim::scalarfield<T>& E, size_t order = 500){
+		size_t array_size = sizeof(T) * E.size();	//the meshes hold one real value per field sample
+		T* X = (T*) malloc( array_size );			//allocate space for the coordinate meshes
+		T* Y = (T*) malloc( array_size );
+		T* Z = (T*) malloc( array_size );
+
+		E.meshgrid(X, Y, Z, stim::CPUmem);			//calculate the coordinate meshes
+		cpu_scalar_psf_cart<T>(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], (int)order, E.spacing());
+
+		free(X);									//free the coordinate meshes
+		free(Y);
+		free(Z);
+	}
+
+	/// Calculate the field at a given point
+	/// @param x is the x-coordinate of the field point
+	/// @param y is the y-coordinate of the field point
+	/// @param z is the z-coordinate of the field point
+	/// @param O is the approximation accuracy (number of Monte-Carlo plane waves summed)
+	stim::complex<T> field(T x, T y, T z, size_t O){
+		std::vector< scalarwave<T> > W = mc(O);
+		stim::complex<T> result = 0;							//complex accumulator (a real accumulator would discard the phase)
+		for(size_t i = 0; i < W.size(); i++){					//for each plane wave
+			result += W[i].pos(x, y, z);
+		}
+		return result;
+	}
+
+	/// Return a human-readable description of the beam
+	std::string str()
+	{
+		std::stringstream ss;
+		ss<<"Beam:"<<std::endl;
+		//ss<<"	Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl;
+		ss<<"	Beam Direction: "<<d<<std::endl;
+		if(NA[1] == 0)											//no center obscuration: report the aperture alone
+			ss<<"	NA: "<<NA[0];
+		else													//report the range as obscuration -- aperture
+			ss<<"	NA: "<<NA[1]<<" -- "<<NA[0];
+
+		return ss.str();
+	}
+
+
+
+};			//end scalarbeam
+}			//end namespace stim
+
+#endif
+#ifndef STIM_SCALARFIELD_H
+#define STIM_SCALARFIELD_H
+
+#include "../math/rect.h"
+#include "../math/complex.h"
+
+namespace stim{
+
+	/// Identifies where a buffer lives: CPUmem for host memory, GPUmem for CUDA device memory
+	enum locationType {CPUmem, GPUmem};
+
+	/// Class represents a scalar optical field.
+
+	/// In general, this class is designed to operate between the CPU and GPU. So, make sure all functions have an option to create the output on either.
+	///		The field is stored *either* on the GPU or host memory, but not both. This enforces that there can't be different copies of the same field.
+	///		This class is designed to be included in all of the other scalar optics classes, allowing them to render output data so make sure to keep it general and compatible.
+
+/// A rectangular grid of complex field samples stored either in host memory or in device memory (tracked by loc)
+/// NOTE(review): the class owns E but declares no copy constructor or assignment operator, so copying an instance
+///		would free the same buffer twice
+template<typename T>
+class scalarfield : public rect<T>{
+
+protected:
+	stim::complex<T>* E;				//field samples, R[0] * R[1] values
+	size_t R[2];						//grid resolution: R[0] samples along X, R[1] samples along Y
+	locationType loc;					//memory space that E currently points into
+
+public:
+
+	/// Create an X-by-Y field allocated in host memory
+	/// @param X is the resolution along the first axis
+	/// @param Y is the resolution along the second axis
+	/// @param size and z_pos are forwarded to the rect<T> constructor
+	CUDA_CALLABLE scalarfield(size_t X, size_t Y, T size = 1, T z_pos = 0) : rect<T>::rect(size, z_pos){
+		R[0] = X;											//set the field resolution
+		R[1] = Y;
+
+		E = (stim::complex<T>*) malloc(sizeof(stim::complex<T>) * R[0] * R[1]);		//allocate in CPU memory
+		loc = CPUmem;
+	}
+
+	/// Release the field with the deallocator that matches its current location
+	CUDA_CALLABLE ~scalarfield(){
+		if(loc == CPUmem) free(E);
+		else cudaFree(E);
+	}
+
+	/// Returns the number of values in the field
+	CUDA_CALLABLE size_t size(){
+		return R[0] * R[1];
+	}
+
+	/// Returns the number of bytes occupied by the field array
+	CUDA_CALLABLE size_t grid_bytes(){
+		return sizeof(stim::complex<T>) * R[0] * R[1];
+	}
+
+	/// Calculates the distance between points on the grid (the smaller of the two axis spacings)
+	/// NOTE(review): this divides by R, whereas meshgrid() steps by 1/(R-1); confirm which is intended
+	T spacing(){
+		T du = rect<T>::X.len() / R[0];
+		T dv = rect<T>::Y.len() / R[1];
+		return (dv < du) ? dv : du;
+	}
+
+	/// Copy the field array to the GPU, if it isn't already there
+	void to_gpu(){
+		if(loc == GPUmem) return;
+
+		stim::complex<T>* dev_E;
+		HANDLE_ERROR( cudaMalloc(&dev_E, grid_bytes()) );								//allocate GPU memory
+		HANDLE_ERROR( cudaMemcpy(dev_E, E, grid_bytes(), cudaMemcpyHostToDevice) );	//copy the field to the GPU
+		free(E);																		//free the CPU memory
+		E = dev_E;																		//swap pointers
+		loc = GPUmem;																	//record the move so later frees and copies use the right API
+	}
+
+	/// Copy the field array to the CPU, if it isn't already there
+	void to_cpu(){
+		if(loc == CPUmem) return;
+
+		stim::complex<T>* host_E = (stim::complex<T>*) malloc(grid_bytes());			//allocate space in main memory
+		HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) );	//copy from GPU to CPU
+		HANDLE_ERROR( cudaFree(E) );													//free device memory
+		E = host_E;																		//swap pointers
+		loc = CPUmem;																	//record the move so later frees and copies use the right API
+	}
+
+	/// Return a human-readable description: the rect, the resolution and the current memory location
+	std::string str(){
+		std::stringstream ss;
+		ss<<rect<T>::str()<<std::endl;
+		ss<<"[ "<<R[0]<<" x "<<R[1]<<" ]"<<std::endl;
+		ss<<"location: ";
+		if(loc == CPUmem) ss<<"CPU";
+		else ss<<"GPU";
+
+		ss<<std::endl;
+		return ss.str();
+	}
+
+	/// Return the raw field pointer (host or device memory, depending on the current location)
+	stim::complex<T>* ptr(){
+		return E;
+	}
+
+	/// Evaluate the cartesian coordinates of each point in the field.
+	/// @param X, Y, Z are caller-allocated output arrays, each holding at least size() values
+	/// @param location selects where the output arrays live; only CPUmem is implemented
+	void meshgrid(T* X, T* Y, T* Z, locationType location){
+		if(location == CPUmem){
+
+			//parametric step between samples; a single-sample axis stays at parameter 0 instead of dividing by zero
+			T du = (R[0] > 1) ? (T)(1.0 / (double)(R[0] - 1)) : (T)0;
+			T dv = (R[1] > 1) ? (T)(1.0 / (double)(R[1] - 1)) : (T)0;
+
+			size_t ui, vi, i;
+			stim::vec3<T> p;
+			for(vi = 0; vi < R[1]; vi++){
+				i = vi * R[0];							//first index of row vi
+				for(ui = 0; ui < R[0]; ui++){
+					p = rect<T>::p(ui * du, vi * dv);	//position of sample (ui, vi) on the rectangle
+					X[i] = p[0];
+					Y[i] = p[1];
+					Z[i] = p[2];
+					i++;
+				}
+			}
+			//NOTE(review): these look like debugging output - every call writes three images into the working directory
+			stim::cpu2image(X, "X.bmp", R[0], R[1], stim::cmBrewer);
+			stim::cpu2image(Y, "Y.bmp", R[0], R[1], stim::cmBrewer);
+			stim::cpu2image(Z, "Z.bmp", R[0], R[1], stim::cmBrewer);
+		}
+		else{
+			std::cout<<"GPU allocation of a meshgrid isn't supported yet. You'll have to write kernels to do the calculation.";
+			exit(1);
+		}
+	}
+
+	/// Save one real-valued component of the field as an image
+	/// @param filename is the output file name
+	/// @param type selects the magnitude, real part or imaginary part
+	/// @param cmap is the colormap passed to cpu2image
+	void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer){
+
+		if(loc == GPUmem) to_cpu();									//if the field is in the GPU, move it to the CPU
+		T* pixels = (T*) malloc( sizeof(T) * size() );				//allocate space for the real image
+
+		switch(type){												//get the specified component from the complex value
+		case complexMag:
+			stim::abs(pixels, E, size());
+			break;
+		case complexReal:
+			stim::real(pixels, E, size());
+			break;
+		case complexImaginary:
+			stim::imag(pixels, E, size());
+			break;
+		}
+		stim::cpu2image(pixels, filename, R[0], R[1], cmap);		//save the resulting image
+		free(pixels);												//free the real image
+	}
+
+};				//end class scalarfield
+}
+
+/// Stream insertion: writes the text produced by scalarfield::str() and returns the stream for chaining
+template<typename T>
+std::ostream& operator<<(std::ostream& os, stim::scalarfield<T>& rhs){
+	return os << rhs.str();
+}
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_SCALARWAVE_H
+#define STIM_SCALARWAVE_H
+
+
+#include <string>
+#include <sstream>
+#include <cmath>
+
+//#include "../math/vector.h"
+#include "../math/vec3.h"
+#include "../math/quaternion.h"
+#include "../math/constants.h"
+#include "../math/plane.h"
+#include "../math/complex.h"
+
+//CUDA
+#include "../cuda/cudatools/devices.h"
+#include "../cuda/cudatools/error.h"
+#include "../cuda/sharedmem.cuh"
+
+namespace stim{
+
+/// A scalar plane wave, described by a k-vector and a complex amplitude
+template<typename T>
+class scalarwave{
+
+public:
+
+	stim::vec3<T> k;							//k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
+	stim::complex<T> E0;						//amplitude
+
+	/// Bend a plane wave via refraction, given that the new propagation direction is known
+	/// @param kn is the new direction; it is normalized and scaled to the current |k|, the amplitude is kept
+	CUDA_CALLABLE scalarwave<T> bend(stim::vec3<T> kn) const{
+		return scalarwave<T>(kn.norm() * kmag(), E0);
+	}
+
+public:
+
+	///constructor: create a plane wave propagating along k
+	CUDA_CALLABLE scalarwave(vec3<T> kvec = stim::vec3<T>(0, 0, (T)stim::TAU), complex<T> E = 1){
+		k = kvec;
+		E0 = E;
+	}
+
+	///constructor: create a plane wave from the three components of k
+	CUDA_CALLABLE scalarwave(T kx, T ky, T kz, complex<T> E = 1){
+		k = vec3<T>(kx, ky, kz);
+		E0 = E;
+	}
+
+	///multiplication operator: scale E0
+	/// NOTE: this modifies the wave in place and returns a reference to it; it does not create a copy
+	CUDA_CALLABLE scalarwave<T> & operator* (const T & rhs){
+		E0 = E0 * rhs;
+		return *this;
+	}
+
+	/// Returns the wavelength, tau / |k|
+	CUDA_CALLABLE T lambda() const{
+		return stim::TAU / k.len();
+	}
+
+	/// Returns the magnitude of the k-vector
+	CUDA_CALLABLE T kmag() const{
+		return k.len();
+	}
+
+	/// Returns a copy of the complex amplitude
+	CUDA_CALLABLE complex<T> E() const{
+		return E0;
+	}
+
+	/// Returns a copy of the k-vector
+	CUDA_CALLABLE vec3<T> kvec() const{
+		return k;
+	}
+
+	/// calculate the value of the field produced by the plane wave given a three-dimensional position
+	CUDA_CALLABLE complex<T> pos(T x, T y, T z){
+		return pos( stim::vec3<T>(x, y, z) );
+	}
+
+	/// calculate the value of the field at a point p: E0 * exp(i k.p)
+	CUDA_CALLABLE complex<T> pos(vec3<T> p = vec3<T>(0, 0, 0)){
+		return E0 * exp(complex<T>(0, k.dot(p)));
+	}
+
+	//scales k based on a transition from material ni to material nt
+	CUDA_CALLABLE scalarwave<T> n(T ni, T nt){
+		return scalarwave<T>(k * (nt / ni), E0);
+	}
+
+	/// Alias for bend(): returns the wave redirected along kn
+	CUDA_CALLABLE scalarwave<T> refract(stim::vec3<T> kn) const{
+		return bend(kn);
+	}
+
+	/// Calculate the result of a plane wave hitting an interface between two refractive indices
+
+	/// @param P is a plane representing the position and orientation of the surface
+	/// @param n0 is the refractive index outside of the surface (in the direction of the normal)
+	/// @param n1 is the refractive index inside the surface (in the direction away from the normal)
+	/// @param r is the reflected component of the plane wave
+	/// @param t is the transmitted component of the plane wave
+	/// NOTE: forwards to the overload below, whose body is currently disabled, so r and t are not modified
+	void scatter(stim::plane<T> P, T n0, T n1, scalarwave<T> &r, scalarwave<T> &t){
+		scatter(P, n1/n0, r, t);
+	}
+
+	/// Calculate the scattering result when nr = n1/n0
+
+	/// @param P is a plane representing the position and orientation of the surface
+	/// @param nr is the ratio n1/n0
+	/// @param r is the reflected component of the plane wave
+	/// @param t is the transmitted component of the plane wave
+	/// NOTE: the implementation is commented out (it is written in terms of vector fields and planewave<T>),
+	///		so this function currently does nothing and leaves r and t untouched
+	void scatter(stim::plane<T> P, T nr, scalarwave<T> &r, scalarwave<T> &t){
+		/*
+		int facing = P.face(k);		//determine which direction the plane wave is coming in
+
+		if(facing == -1){		//if the wave hits the back of the plane, invert the plane and nr
+			P = P.flip();			//flip the plane
+			nr = 1/nr;				//invert the refractive index (now nr = n0/n1)
+		}
+
+		//use Snell's Law to calculate the transmitted angle
+		T cos_theta_i = k.norm().dot(-P.norm());				//compute the cosine of theta_i
+		T theta_i = acos(cos_theta_i);							//compute theta_i
+		T sin_theta_t = (1/nr) * sin(theta_i);						//compute the sine of theta_t using Snell's law
+		T theta_t = asin(sin_theta_t);							//compute the cosine of theta_t
+
+		bool tir = false;						//flag for total internal reflection
+		if(theta_t != theta_t){
+			tir = true;
+			theta_t = stim::PI / (T)2;
+		}
+
+		//handle the degenerate case where theta_i is 0 (the plane wave hits head-on)
+		if(theta_i == 0){
+			T rp = (1 - nr) / (1 + nr);		//compute the Fresnel coefficients
+			T tp = 2 / (1 + nr);
+			vec3<T> kr = -k;
+			vec3<T> kt = k * nr;			//set the k vectors for theta_i = 0
+			vec3< complex<T> > Er = E0 * rp;		//compute the E vectors
+			vec3< complex<T> > Et = E0 * tp;
+			T phase_t = P.p().dot(k - kt);	//compute the phase offset
+			T phase_r = P.p().dot(k - kr);
+
+			//create the plane waves
+			r = planewave<T>(kr, Er, phase_r);
+			t = planewave<T>(kt, Et, phase_t);
+			return;
+		}
+
+
+		//compute the Fresnel coefficients
+		T rp, rs, tp, ts;
+		rp = tan(theta_t - theta_i) / tan(theta_t + theta_i);
+		rs = sin(theta_t - theta_i) / sin(theta_t + theta_i);
+		
+		if(tir){
+			tp = ts = 0;
+		}
+		else{
+			tp = ( 2 * sin(theta_t) * cos(theta_i) ) / ( sin(theta_t + theta_i) * cos(theta_t - theta_i) );
+			ts = ( 2 * sin(theta_t) * cos(theta_i) ) / sin(theta_t + theta_i);
+		}
+
+		//compute the coordinate space for the plane of incidence
+		vec3<T> z_hat = -P.norm();
+		vec3<T> y_hat = P.parallel(k).norm();
+		vec3<T> x_hat = y_hat.cross(z_hat).norm();
+
+		//compute the k vectors for r and t
+		vec3<T> kr, kt;
+		kr = ( y_hat * sin(theta_i) - z_hat * cos(theta_i) ) * kmag();
+		kt = ( y_hat * sin(theta_t) + z_hat * cos(theta_t) ) * kmag() * nr;
+
+		//compute the magnitude of the p- and s-polarized components of the incident E vector
+		complex<T> Ei_s = E0.dot(x_hat);
+		int sgn = E0.dot(y_hat).sgn();
+		vec3< complex<T> > cx_hat = x_hat;
+		complex<T> Ei_p = ( E0 - cx_hat * Ei_s ).len() * sgn;
+		//compute the magnitude of the p- and s-polarized components of the reflected E vector
+		complex<T> Er_s = Ei_s * rs;
+		complex<T> Er_p = Ei_p * rp;
+		//compute the magnitude of the p- and s-polarized components of the transmitted E vector
+		complex<T> Et_s = Ei_s * ts;
+		complex<T> Et_p = Ei_p * tp;
+
+		//compute the reflected E vector
+		vec3< complex<T> > Er = vec3< complex<T> >(y_hat * cos(theta_i) + z_hat * sin(theta_i)) * Er_p + cx_hat * Er_s;
+		//compute the transmitted E vector
+		vec3< complex<T> > Et = vec3< complex<T> >(y_hat * cos(theta_t) - z_hat * sin(theta_t)) * Et_p + cx_hat * Et_s;
+
+		T phase_t = P.p().dot(k - kt);
+		T phase_r = P.p().dot(k - kr);
+
+		//create the plane waves
+		r.k = kr;
+		r.E0 = Er * exp( complex<T>(0, phase_r) );
+
+		t.k = kt;
+		t.E0 = Et * exp( complex<T>(0, phase_t) );
+		*/
+	}
+
+	/// Return a human-readable description of the wave
+	std::string str()
+	{
+		std::stringstream ss;
+		ss<<"Plane Wave:"<<std::endl;
+		ss<<"	"<<E0<<" e^i ( "<<k<<" . r )";
+		return ss.str();
+	}
+};					//end scalarwave class
+
+
+/// CUDA kernel that adds the field of a batch of plane waves to F at an array of locations
+
+/// Launch layout: 1D grid of 1D blocks, one thread per point; threads whose index is >= N do no work after the barrier.
+/// Shared memory: the launch must provide dynamic shared memory for n_waves scalarwave<T> objects.
+/// @param F is the field array (N values); each thread accumulates into its own element (+=)
+/// @param x, y, z are the coordinate arrays; a NULL array reads as all zeros
+/// @param W is the batch of n_waves plane waves
+template<typename T>
+__global__ void cuda_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t n_waves){
+	extern __shared__ stim::scalarwave<T> shared_W[];		//block-wide copy of the wave batch
+
+	//every thread of the block takes part in the copy and must reach the barrier before any thread leaves
+	stim::cuda::sharedMemcpy(shared_W, W, n_waves, threadIdx.x, blockDim.x);
+	__syncthreads();
+
+	size_t idx = blockIdx.x * blockDim.x + threadIdx.x;		//index of the point handled by this thread
+	if(idx < N){
+		T px, py, pz;										//position of this point
+		if(x == NULL) px = 0;
+		else px = x[idx];
+		if(y == NULL) py = 0;
+		else py = y[idx];
+		if(z == NULL) pz = 0;
+		else pz = z[idx];
+
+		stim::complex<T> sum = 0;							//running total kept in a register
+		for(size_t w = 0; w < n_waves; w++)
+			sum += shared_W[w].pos(px, py, pz);				//evaluate each wave at this point
+		F[idx] += sum;										//accumulate into device memory
+	}
+}
+
+/// evaluate a scalar wave at several points, where all arrays are on the GPU
+
+/// The wave is copied into a temporary one-element device array so that it can be passed to the
+///		cuda_scalarwave kernel, which expects a device pointer, a wave count and dynamic shared memory for the batch.
+/// @param F is the field array in device memory (N values); the wave's contribution is added to it
+/// @param N is the number of points
+/// @param x, y, z are the device coordinate arrays handed to the kernel
+/// @param w is the plane wave to evaluate (host object, passed by value)
+template<typename T>
+void gpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
+
+	if(N == 0) return;													//nothing to evaluate (and a zero-block launch is invalid)
+
+	size_t wave_bytes = sizeof(stim::scalarwave<T>);
+	stim::scalarwave<T>* dev_w = NULL;
+	HANDLE_ERROR( cudaMalloc(&dev_w, wave_bytes) );						//the kernel reads its waves from device memory
+	HANDLE_ERROR( cudaMemcpy(dev_w, &w, wave_bytes, cudaMemcpyHostToDevice) );
+
+	int threads = stim::maxThreadsPerBlock();							//get the maximum number of threads per block for the CUDA device
+	dim3 blocks((unsigned)((N + threads - 1) / threads));				//enough blocks to cover all N points
+	cuda_scalarwave<T><<< blocks, threads, wave_bytes >>>(F, N, x, y, z, dev_w, (size_t)1);	//call the kernel with a batch of one wave
+	HANDLE_ERROR( cudaGetLastError() );									//launch failures are otherwise silent
+
+	HANDLE_ERROR( cudaFree(dev_w) );									//cudaFree waits for the kernel before releasing the wave
+}
+
+/// evaluate a list of scalar waves at several points, where all arrays (including W) are on the GPU
+
+/// The waves are processed in batches sized so that one batch fits in the shared memory of a block; each
+///		batch is one launch of cuda_scalarwave, which adds its contribution to F.
+/// @param F is the field array in device memory (N values); results are accumulated into it
+/// @param N is the number of points
+/// @param x, y, z are the device coordinate arrays handed to the kernel
+/// @param W is the device array of plane waves
+/// @param nW is the number of plane waves in W
+template<typename T>
+void gpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW){
+
+	if(N == 0 || nW == 0) return;										//nothing to evaluate (and a zero-block launch is invalid)
+
+	size_t wave_bytes = sizeof(stim::scalarwave<T>);
+	size_t shared_bytes = stim::sharedMemPerBlock();					//calculate the maximum amount of shared memory available
+	size_t max_batch = shared_bytes / wave_bytes;						//calculate number of plane waves that will fit into shared memory
+
+	int threads = stim::maxThreadsPerBlock();							//get the maximum number of threads per block for the CUDA device
+	dim3 blocks((unsigned)((N + threads - 1) / threads));				//enough blocks to cover all N points
+
+	size_t waves_processed = 0;											//initialize the number of waves processed to zero
+	while(waves_processed < nW){										//while there are still waves to be processed
+		size_t remaining = nW - waves_processed;
+		size_t batch_size = (remaining < max_batch) ? remaining : max_batch;	//process either a whole batch, or whatever is left
+		size_t batch_bytes = batch_size * wave_bytes;
+
+		//W is already in device memory, so the kernel reads the current batch directly from it
+		cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(F, N, x, y, z, W + waves_processed, batch_size);
+		HANDLE_ERROR( cudaGetLastError() );								//a failed launch would otherwise silently drop this batch
+
+		waves_processed += batch_size;									//increment the counter indicating how many waves have been processed
+	}
+}
+
+/// Sums a series of coherent plane waves at an array of points stored in host memory
+
+/// The summed field is ADDED to the values already in F on both code paths, so zero F first if only
+///		the contribution of W is wanted.
+/// @param F is the field array (N values) that the result is accumulated into
+/// @param N is the number of points in the input and output arrays
+/// @param x is an array of x coordinates for the field points (NULL reads as all zeros)
+/// @param y is an array of y coordinates for the field points (NULL reads as all zeros)
+/// @param z is an array of z coordinates for the field points (NULL reads as all zeros)
+/// @param W is the list of plane waves to sum
+template<typename T>
+void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W){
+	size_t S = W.size();											//store the number of waves
+	if(N == 0 || S == 0) return;									//nothing to add (also keeps &W[0] valid below)
+#ifdef __CUDACC__
+	size_t field_bytes = N * sizeof(stim::complex<T>);
+	size_t coord_bytes = N * sizeof(T);
+	size_t wave_bytes = S * sizeof(stim::scalarwave<T>);
+
+	stim::complex<T>* dev_F;										//allocate space for the field and upload the current values
+	HANDLE_ERROR(cudaMalloc(&dev_F, field_bytes));
+	HANDLE_ERROR(cudaMemcpy(dev_F, F, field_bytes, cudaMemcpyHostToDevice));
+
+	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
+	if(x != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_x, coord_bytes));
+		HANDLE_ERROR(cudaMemcpy(dev_x, x, coord_bytes, cudaMemcpyHostToDevice));
+	}
+
+	T* dev_y = NULL;												//allocate space and copy the Y coordinate (if specified)
+	if(y != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_y, coord_bytes));
+		HANDLE_ERROR(cudaMemcpy(dev_y, y, coord_bytes, cudaMemcpyHostToDevice));
+	}
+
+	T* dev_z = NULL;												//allocate space and copy the Z coordinate (if specified)
+	if(z != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_z, coord_bytes));
+		HANDLE_ERROR(cudaMemcpy(dev_z, z, coord_bytes, cudaMemcpyHostToDevice));
+	}
+
+	stim::scalarwave<T>* dev_W;										//upload the plane waves
+	HANDLE_ERROR( cudaMalloc(&dev_W, wave_bytes) );
+	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], wave_bytes, cudaMemcpyHostToDevice) );
+
+	gpu_scalarwaves(dev_F, N, dev_x, dev_y, dev_z, dev_W, S);
+
+	HANDLE_ERROR(cudaMemcpy(F, dev_F, field_bytes, cudaMemcpyDeviceToHost));	//copy the field from device memory
+
+	if(x != NULL) HANDLE_ERROR(cudaFree(dev_x));								//free everything
+	if(y != NULL) HANDLE_ERROR(cudaFree(dev_y));
+	if(z != NULL) HANDLE_ERROR(cudaFree(dev_z));
+	HANDLE_ERROR(cudaFree(dev_W));												//previously leaked
+	HANDLE_ERROR(cudaFree(dev_F));
+#else
+	T px, py, pz;
+	for(size_t i = 0; i < N; i++){										// for each element in the array
+		(x == NULL) ? px = 0 : px = x[i];								// test for NULL values
+		(y == NULL) ? py = 0 : py = y[i];
+		(z == NULL) ? pz = 0 : pz = z[i];
+
+		for(size_t s = 0; s < S; s++){
+			F[i] += W[s].pos(px, py, pz);								//sum all plane waves at this point
+		}
+	}
+#endif
+}
+
+/// Evaluate a single plane wave at an array of points by wrapping it in a one-element list
+/// and calling the list form of cpu_scalarwaves
+template<typename T>
+void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
+	std::vector< stim::scalarwave<T> > single;
+	single.push_back(w);
+	cpu_scalarwaves(F, N, x, y, z, single);
+}
+
+/// Overload accepting a single plane wave: forwards a one-element list to the list form of cpu_scalarwaves
+template<typename T>
+void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
+	cpu_scalarwaves(F, N, x, y, z, std::vector< stim::scalarwave<T> >(1, w));
+}
+
+
+/// Sums a series of coherent plane waves at a specified point
+/// @param x is the x coordinate of the field point
+/// @param y is the y coordinate of the field point
+/// @param z is the z coordinate of the field point
+/// @param W is the list of plane waves to sum
+/// @return the sum of every wave in W evaluated at (x, y, z); zero when W is empty
+template<typename T>
+CUDA_CALLABLE stim::complex<T> cpu_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave<T> > W){
+	stim::complex<T> field(0, 0);										//initialize the field to zero (0)
+	for(size_t i = 0; i < W.size(); i++){								//for each plane wave sample
+		field += W[i].pos(x, y, z);
+	}
+	return field;
+}
+
+}					//end namespace stim
+
+/// Stream insertion: writes the text produced by scalarwave::str() and returns the stream for chaining
+template <typename T>
+std::ostream& operator<<(std::ostream& os, stim::scalarwave<T> p)
+{
+	return os << p.str();
+}
+
+#endif
 \ No newline at end of file
-#ifndef RTS_BEAM
-#define RTS_BEAM
-
-#include "../math/vector.h"
-#include "../math/function.h"
-#include "../optics/planewave.h"
-#include <vector>
-
-namespace stim{
-
-template<typename P>
-class beam : public planewave<P>
-{
-public:
-	enum beam_type {Uniform, Bartlett, Hamming, Hanning};
-
-private:
-	
-	P _na[2];		//numerical aperature of the focusing optics	
-	vec<P> f;		//focal point	
-	function<P, P> apod;	//apodization function
-	unsigned int apod_res;	//resolution of apodization filter functions
-
-	void apod_uniform()
-	{
-		apod = (P)1;
-	}
-	void apod_bartlett()
-	{
-		apod = (P)1;
-		apod.insert((P)1, (P)0);
-	}
-	void apod_hanning()
-	{
-		apod = (P)0;
-		P x, y;
-		for(unsigned int n=0; n<apod_res; n++)
-		{
-			x = (P)n/(P)apod_res;
-			y = pow( cos( ((P)3.14159 * x) / 2 ), 2);
-			apod.insert(x, y);
-		}
-	}
-	void apod_hamming()
-	{
-		apod = (P)0;
-		P x, y;
-		for(unsigned int n=0; n<apod_res; n++)
-		{
-			x = (P)n/(P)apod_res;
-			y = (P)27/(P)50 + ( (P)23/(P)50 ) * cos((P)3.14159 * x);
-			apod.insert(x, y);
-		}
-	}
-
-	void set_apod(beam_type type)
-	{
-		if(type == Uniform)
-			apod_uniform();
-		if(type == Bartlett)
-			apod_bartlett();
-		if(type == Hanning)
-			apod_hanning();
-		if(type == Hamming)
-			apod_hamming();
-	}
-
-public:
-
-	///constructor: build a default beam (NA=1.0)
-	beam(
-		vec<P> k = rts::vec<P>(0, 0, rtsTAU), 
-		vec<P> _E0 = rts::vec<P>(1, 0, 0), 
-		beam_type _apod = Uniform)
-		: planewave<P>(k, _E0)
-	{
-		_na[0] = (P)0.0;
-		_na[1] = (P)1.0;
-		f = vec<P>( (P)0, (P)0, (P)0 );
-		apod_res = 256;						//set the default resolution for apodization filters
-		set_apod(_apod);						//set the apodization function type
-	}
-
-	beam<P> refract(rts::vec<P> kn) const{
-
-		beam<P> new_beam;
-		new_beam._na[0] = _na[0];
-		new_beam._na[1] = _na[1];
-
-
-		rts::planewave<P> pw = planewave<P>::bend(kn);
-		//std::cout<<pw.str()<<std::endl;
-
-		new_beam.k = pw.kvec();
-		new_beam.E0 = pw.E();
-
-		return new_beam;
-	}
-
-	///Numerical Aperature functions
-	void NA(P na)
-	{
-		_na[0] = (P)0;
-		_na[1] = na;
-	}
-	void NA(P na0, P na1)
-	{
-		_na[0] = na0;
-		_na[1] = na1;
-	}
-
-	/*string str() : 
-	{
-		stringstream ss;
-		ss<<"Beam Center: "<<k<<std::endl;
-
-		return ss.str();
-	}*/
-
-	//Monte-Carlo decomposition into plane waves
-	std::vector< planewave<P> > mc(unsigned int N = 100000, unsigned int seed = 0) const
-	{
-		/*Create Monte-Carlo samples of a cassegrain objective by performing uniform sampling
-			of a sphere and projecting these samples onto an inscribed sphere.
-
-			seed	=	seed for the random number generator
-		*/
-		srand(seed);		//seed the random number generator
-
-		vec<P> k_hat = beam::k.norm();
-
-		///compute the rotation operator to transform (0, 0, 1) to k
-		P cos_angle = k_hat.dot(rts::vec<P>(0, 0, 1));
-		rts::matrix<P, 3> rotation;
-
-		//if the cosine of the angle is -1, the rotation is just a flip across the z axis
-		if(cos_angle == -1){
-			rotation(2, 2) = -1;
-		}
-		else if(cos_angle != 1.0)
-		{
-			rts::vec<P> r_axis = rts::vec<P>(0, 0, 1).cross(k_hat).norm();	//compute the axis of rotation
-			P angle = acos(cos_angle);							//compute the angle of rotation
-			rts::quaternion<P> quat;							//create a quaternion describing the rotation
-			quat.CreateRotation(angle, r_axis);
-			rotation = quat.toMatrix3();							//compute the rotation matrix
-		}
-
-		//find the phi values associated with the cassegrain ring
-		P PHI[2];
-		PHI[0] = (P)asin(_na[0]);
-		PHI[1] = (P)asin(_na[1]);
-
-		//calculate the z-axis cylinder coordinates associated with these angles
-		P Z[2];
-		Z[0] = cos(PHI[0]);
-		Z[1] = cos(PHI[1]);
-		P range = Z[0] - Z[1];
-
-		std::vector< planewave<P> > samples;	//create a vector of plane waves
-
-		//draw a distribution of random phi, z values
-		P z, phi, theta;
-		for(int i=0; i<N; i++)								//for each sample
-		{
-			z = ((P)rand() / (P)RAND_MAX) * range + Z[1];	//find a random position on the surface of a cylinder
-			theta = ((P)rand() / (P)RAND_MAX) * 2 * (P)3.14159;
-			phi = acos(z);									//project onto the sphere, computing phi in spherical coordinates
-
-			//compute and store cartesian coordinates
-			rts::vec<P> spherical(1, theta, phi);				//convert from spherical to cartesian coordinates
-			rts::vec<P> cart = spherical.sph2cart();
-			vec<P> k_prime = rotation * cart;				//create a sample vector
-
-			//store a wave refracted along the given direction
-			//std::cout<<"k prime: "<<rotation<<std::endl;
-			samples.push_back(planewave<P>::refract(k_prime) * apod(phi/PHI[1]));
-		}
-
-		return samples;
-	}
-
-	std::string str()
-	{
-		std::stringstream ss;
-		ss<<"Beam:"<<std::endl;
-		//ss<<"	Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl;
-		ss<<"	Central Plane Wave: "<<beam::k<<std::endl;
-		if(_na[0] == 0)
-			ss<<"	NA: "<<_na[1];
-		else
-			ss<<"	NA: "<<_na[0]<<" -- "<<_na[1];
-
-		return ss.str();
-	}
-
-
-
-};
-
-}
-
-#endif
+#ifndef RTS_BEAM
+#define RTS_BEAM
+
+#include "../math/vector.h"
+#include "../math/function.h"
+#include "../optics/planewave.h"
+#include <vector>
+
+namespace stim{
+
+template<typename P>
+class beam : public planewave<P>
+{
+public:
+	enum beam_type {Uniform, Bartlett, Hamming, Hanning};
+
+private:
+	
+	P _na[2];		//numerical aperature of the focusing optics	
+	vec<P> f;		//focal point	
+	function<P, P> apod;	//apodization function
+	unsigned int apod_res;	//resolution of apodization filter functions
+
+	void apod_uniform()
+	{
+		apod = (P)1;
+	}
+	void apod_bartlett()
+	{
+		apod = (P)1;
+		apod.insert((P)1, (P)0);
+	}
+	void apod_hanning()
+	{
+		apod = (P)0;
+		P x, y;
+		for(unsigned int n=0; n<apod_res; n++)
+		{
+			x = (P)n/(P)apod_res;
+			y = pow( cos( ((P)3.14159 * x) / 2 ), 2);
+			apod.insert(x, y);
+		}
+	}
+	void apod_hamming()
+	{
+		apod = (P)0;
+		P x, y;
+		for(unsigned int n=0; n<apod_res; n++)
+		{
+			x = (P)n/(P)apod_res;
+			y = (P)27/(P)50 + ( (P)23/(P)50 ) * cos((P)3.14159 * x);
+			apod.insert(x, y);
+		}
+	}
+
+	void set_apod(beam_type type)
+	{
+		if(type == Uniform)
+			apod_uniform();
+		if(type == Bartlett)
+			apod_bartlett();
+		if(type == Hanning)
+			apod_hanning();
+		if(type == Hamming)
+			apod_hamming();
+	}
+
+public:
+
+	///constructor: build a default beam (NA=1.0)
+	beam(
+		vec<P> k = rts::vec<P>(0, 0, rtsTAU), 
+		vec<P> _E0 = rts::vec<P>(1, 0, 0), 
+		beam_type _apod = Uniform)
+		: planewave<P>(k, _E0)
+	{
+		_na[0] = (P)0.0;
+		_na[1] = (P)1.0;
+		f = vec<P>( (P)0, (P)0, (P)0 );
+		apod_res = 256;						//set the default resolution for apodization filters
+		set_apod(_apod);						//set the apodization function type
+	}
+
+	beam<P> refract(rts::vec<P> kn) const{
+
+		beam<P> new_beam;
+		new_beam._na[0] = _na[0];
+		new_beam._na[1] = _na[1];
+
+
+		rts::planewave<P> pw = planewave<P>::bend(kn);
+		//std::cout<<pw.str()<<std::endl;
+
+		new_beam.k = pw.kvec();
+		new_beam.E0 = pw.E();
+
+		return new_beam;
+	}
+
+	///Numerical Aperature functions
+	void NA(P na)
+	{
+		_na[0] = (P)0;
+		_na[1] = na;
+	}
+	void NA(P na0, P na1)
+	{
+		_na[0] = na0;
+		_na[1] = na1;
+	}
+
+	/*string str() : 
+	{
+		stringstream ss;
+		ss<<"Beam Center: "<<k<<std::endl;
+
+		return ss.str();
+	}*/
+
+	///Monte-Carlo decomposition of the beam into plane waves.
+	///	A height z is drawn uniformly between the cosines of the two aperture angles and an
+	///	angle theta uniformly around the axis; the resulting direction is rotated from
+	///	(0, 0, 1) onto the beam direction and the central wave is refracted along it.
+	///@param N is the number of plane-wave samples to generate
+	///@param seed is the value handed to srand() before sampling
+	///@return N plane waves, each multiplied by the apodization value for its angle
+	std::vector< planewave<P> > mc(unsigned int N = 100000, unsigned int seed = 0) const
+	{
+		srand(seed);		//seed the random number generator
+
+		vec<P> k_hat = beam::k.norm();
+
+		///compute the rotation operator to transform (0, 0, 1) to k
+		P cos_angle = k_hat.dot(rts::vec<P>(0, 0, 1));
+		rts::matrix<P, 3> rotation;
+
+		//if the cosine of the angle is -1, the rotation is just a flip across the z axis
+		if(cos_angle == -1){
+			rotation(2, 2) = -1;
+		}
+		else if(cos_angle != 1.0)
+		{
+			rts::vec<P> r_axis = rts::vec<P>(0, 0, 1).cross(k_hat).norm();	//compute the axis of rotation
+			P angle = acos(cos_angle);							//compute the angle of rotation
+			rts::quaternion<P> quat;							//create a quaternion describing the rotation
+			quat.CreateRotation(angle, r_axis);
+			rotation = quat.toMatrix3();							//compute the rotation matrix
+		}
+
+		//find the phi values associated with the cassegrain ring
+		P PHI[2];
+		PHI[0] = (P)asin(_na[0]);
+		PHI[1] = (P)asin(_na[1]);
+
+		//calculate the z-axis cylinder coordinates associated with these angles
+		P Z[2];
+		Z[0] = cos(PHI[0]);
+		Z[1] = cos(PHI[1]);
+		P range = Z[0] - Z[1];
+
+		//full circle, computed from acos so it carries the full precision of P
+		const P two_pi = 2 * (P)acos((P)(-1));
+
+		std::vector< planewave<P> > samples;	//create a vector of plane waves
+		samples.reserve(N);						//the final size is known, so allocate once
+
+		//draw a distribution of random phi, z values
+		P z, phi, theta;
+		for(unsigned int i=0; i<N; i++)						//for each sample
+		{
+			z = ((P)rand() / (P)RAND_MAX) * range + Z[1];	//find a random position on the surface of a cylinder
+			if(z > (P)1) z = (P)1;							//rounding can push z just past 1, which acos rejects
+			if(z < (P)(-1)) z = (P)(-1);
+			theta = ((P)rand() / (P)RAND_MAX) * two_pi;
+			phi = acos(z);									//project onto the sphere, computing phi in spherical coordinates
+
+			//compute and store cartesian coordinates
+			rts::vec<P> spherical(1, theta, phi);			//convert from spherical to cartesian coordinates
+			rts::vec<P> cart = spherical.sph2cart();
+			vec<P> k_prime = rotation * cart;				//create a sample vector
+
+			//position inside the aperture handed to the apodization function;
+			//	an outer angle of zero would give 0/0, so that case uses 0
+			P a = (PHI[1] == (P)0) ? (P)0 : phi / PHI[1];
+
+			//store a wave refracted along the given direction
+			samples.push_back(planewave<P>::refract(k_prime) * apod(a));
+		}
+
+		return samples;
+	}
+
+	///Builds a short text description: the central wave vector and the numerical aperture.
+	std::string str()
+	{
+		std::stringstream out;
+		out<<"Beam:"<<std::endl;
+		out<<"	Central Plane Wave: "<<beam::k<<std::endl;
+
+		//the inner limit is only printed when it is non-zero
+		out<<"	NA: ";
+		if(_na[0] != 0)
+			out<<_na[0]<<" -- ";
+		out<<_na[1];
+
+		return out.str();
+	}
+
+
+
+};
+
+}
+
+#endif
-#ifndef RTS_MATERIAL_H
-#define RTS_MATERIAL_H
-
-#include <vector>
-#include <ostream>
-#include <iostream>
-#include <fstream>
-#include <complex>
-#include <algorithm>
-#include <sstream>
-#include "../math/complex.h"
-#include "../math/constants.h"
-#include "../math/function.h"
-
-namespace stim{
-
-//Material class - default representation for the material property is the refractive index (RI)
-template<typename T>
-class material : public function< T, complex<T> >{
-
-public:
-    enum wave_property{microns, inverse_cm};
-    enum material_property{ri, absorbance};
-
-private:
-
-    using function< T, complex<T> >::X;
-    using function< T, complex<T> >::Y;
-    using function< T, complex<T> >::insert;
-    using function< T, complex<T> >::bounding;
-
-    std::string name;	//name for the material (defaults to file name)
-
-    void process_header(std::string str, wave_property& wp, material_property& mp){
-
-    	std::stringstream ss(str);	//create a stream from the data string
-    	std::string line;
-    	std::getline(ss, line);		//get the first line as a string
-		while(line[0] == '#'){		//continue looping while the line is a comment
-
-			std::stringstream lstream(line);	//create a stream from the line
-			lstream.ignore();					//ignore the first character ('#')
-
-			std::string prop;		//get the property name
-			lstream>>prop;
-
-			if(prop == "X"){
-				std::string wp_name;
-				lstream>>wp_name;
-				if(wp_name == "microns") wp = microns;
-				else if(wp_name == "inverse_cm") wp = inverse_cm;
-			}
-			else if(prop == "Y"){
-				std::string mp_name;
-				lstream>>mp_name;
-				if(mp_name == "ri") mp = ri;
-				else if(mp_name == "absorbance") mp = absorbance;
-			}
-
-			std::getline(ss, line);		//get the next line
-		}
-
-		function< T, stim::complex<T> >::process_string(str);
-	}
-
-    void from_inverse_cm(){
-    	//convert inverse centimeters to wavelength (in microns)
-    	for(unsigned int i=0; i<X.size(); i++)
-    		X[i] = 10000 / X[i];
-
-    	//reverse the function array
-    	std::reverse(X.begin(), X.end());
-    	std::reverse(Y.begin(), Y.end());
-
-    }
-
-    void init(){
-    	bounding[0] = bounding[1] = stim::complex<T>(1, 0);
-    }
-
-
-public:
-
-    material(std::string filename, wave_property wp, material_property mp){
-    	name = filename;
-    	load(filename, wp, mp);
-    }
-
-    material(std::string filename){
-    	name = filename;
-    	load(filename);
-    }
-
-    material(){
-    	init();
-    }
-
-    complex<T> getN(T lambda){
-    	return function< T, complex<T> >::linear(lambda);
-    }
-
-    void load(std::string filename, wave_property wp, material_property mp){
-
-    	//load the file as a function
-    	function< T, complex<T> >::load(filename);
-    }
-
-    void load(std::string filename){
-
-    	wave_property wp = inverse_cm;
-    	material_property mp = ri;
-    	//turn the file into a string
-    	std::ifstream t(filename.c_str());	//open the file as a stream
-
-    	if(!t){
-    		std::cout<<"ERROR: Couldn't open the material file '"<<filename<<"'"<<std::endl;
-    		exit(1);
-    	}
-		std::string str((std::istreambuf_iterator<char>(t)),
-		std::istreambuf_iterator<char>());
-
-		//process the header information
-		process_header(str, wp, mp);
-
-		//convert units
-		if(wp == inverse_cm)
-			from_inverse_cm();
-		//set the bounding values
-		bounding[0] = Y[0];
-		bounding[1] = Y.back();
-    }
-    std::string str(){
-    	std::stringstream ss;
-    	ss<<name<<std::endl;
-    	ss<<function< T, complex<T> >::str();
-    	return ss.str();
-    }
-    std::string get_name(){
-    	return name;
-    }
-
-    void set_name(std::string str){
-    	name = str;
-    }
-
-};
-
-}
-
-
-
-
-#endif
+#ifndef RTS_MATERIAL_H
+#define RTS_MATERIAL_H
+
+#include <vector>
+#include <ostream>
+#include <iostream>
+#include <fstream>
+#include <complex>
+#include <algorithm>
+#include <sstream>
+#include "../math/complex.h"
+#include "../math/constants.h"
+#include "../math/function.h"
+
+namespace stim{
+
+//Material class - default representation for the material property is the refractive index (RI).
+//	The data is a function from a spectral position to a complex value, read from a text file
+//	whose leading '#' lines may declare the X units ("microns" or "inverse_cm") and the
+//	Y property ("ri" or "absorbance").
+template<typename T>
+class material : public function< T, complex<T> >{
+
+public:
+	enum wave_property{microns, inverse_cm};
+	enum material_property{ri, absorbance};
+
+private:
+
+	using function< T, complex<T> >::X;
+	using function< T, complex<T> >::Y;
+	using function< T, complex<T> >::insert;
+	using function< T, complex<T> >::bounding;
+
+	std::string name;	//name for the material (defaults to file name)
+
+	//Reads the '#' header lines at the start of "str", updating wp and mp when an "X" or "Y"
+	//	declaration is recognized, then hands the whole string to function::process_string.
+	void process_header(std::string str, wave_property& wp, material_property& mp){
+
+		std::stringstream ss(str);	//create a stream from the data string
+		std::string line;
+
+		//The read itself is the loop condition: a failed getline leaves "line" unchanged, so a
+		//	header whose last '#' line has no trailing newline would otherwise repeat forever.
+		//	The empty() test keeps line[0] from being read on a blank line.
+		while(std::getline(ss, line) && !line.empty() && line[0] == '#'){
+
+			std::stringstream lstream(line);	//create a stream from the line
+			lstream.ignore();					//ignore the first character ('#')
+
+			std::string prop;		//get the property name
+			lstream>>prop;
+
+			if(prop == "X"){
+				std::string wp_name;
+				lstream>>wp_name;
+				if(wp_name == "microns") wp = microns;
+				else if(wp_name == "inverse_cm") wp = inverse_cm;
+			}
+			else if(prop == "Y"){
+				std::string mp_name;
+				lstream>>mp_name;
+				if(mp_name == "ri") mp = ri;
+				else if(mp_name == "absorbance") mp = absorbance;
+			}
+		}
+
+		function< T, stim::complex<T> >::process_string(str);
+	}
+
+	//Replaces every X value by 10000 / X (inverse centimeters to microns) and reverses both
+	//	arrays so the samples keep their pairing after the order of X flips.
+	void from_inverse_cm(){
+		for(unsigned int i=0; i<X.size(); i++)
+			X[i] = 10000 / X[i];
+
+		std::reverse(X.begin(), X.end());
+		std::reverse(Y.begin(), Y.end());
+	}
+
+	//Sets both bounding values to 1 + 0i
+	void init(){
+		bounding[0] = bounding[1] = stim::complex<T>(1, 0);
+	}
+
+
+public:
+
+	//Loads "filename" through the three-argument load(); the file name becomes the material name
+	material(std::string filename, wave_property wp, material_property mp){
+		name = filename;
+		load(filename, wp, mp);
+	}
+
+	//Loads "filename", taking units from its header; the file name becomes the material name
+	material(std::string filename){
+		name = filename;
+		load(filename);
+	}
+
+	//Creates a material with no samples and bounding values of 1
+	material(){
+		init();
+	}
+
+	//Returns the linearly interpolated value of the function at "lambda"
+	complex<T> getN(T lambda){
+		return function< T, complex<T> >::linear(lambda);
+	}
+
+	//Loads the file through the base-class loader.
+	//	NOTE(review): wp and mp are accepted but never used here, and no unit conversion or
+	//	bounding assignment is performed in this overload - confirm that this is intended.
+	void load(std::string filename, wave_property wp, material_property mp){
+
+		//load the file as a function
+		function< T, complex<T> >::load(filename);
+	}
+
+	//Reads the whole file, parses its header and data, converts X from inverse centimeters
+	//	when required and sets the bounding values from the first and last samples.
+	//	Prints an error and exits if the file cannot be opened.
+	void load(std::string filename){
+
+		//defaults used when the header does not declare them
+		wave_property wp = inverse_cm;
+		material_property mp = ri;		//parsed from the header, but not used afterwards
+
+		//turn the file into a string
+		std::ifstream t(filename.c_str());	//open the file as a stream
+
+		if(!t){
+			std::cout<<"ERROR: Couldn't open the material file '"<<filename<<"'"<<std::endl;
+			exit(1);
+		}
+		std::string str((std::istreambuf_iterator<char>(t)),
+		std::istreambuf_iterator<char>());
+
+		//process the header information
+		process_header(str, wp, mp);
+
+		//convert units
+		if(wp == inverse_cm)
+			from_inverse_cm();
+
+		//a file without samples has no first or last value to read, so use the defaults
+		if(Y.empty()){
+			init();
+			return;
+		}
+
+		//set the bounding values
+		bounding[0] = Y[0];
+		bounding[1] = Y.back();
+	}
+
+	//Returns the material name followed by the base-class description of the function
+	std::string str(){
+		std::stringstream ss;
+		ss<<name<<std::endl;
+		ss<<function< T, complex<T> >::str();
+		return ss.str();
+	}
+
+	std::string get_name(){
+		return name;
+	}
+
+	void set_name(std::string str){
+		name = str;
+	}
+
+};
+
+}
+
+
+
+
+#endif
-#include "../optics/material.h"
-#include "../math/complexfield.cuh"
-#include "../math/constants.h"
-//#include "../envi/bil.h"
-
-#include "cufft.h"
-
-#include <vector>
-#include <sstream>
-
-namespace stim{
-
-//this function writes a sinc function to "dest" such that an iFFT produces a slab
-template<typename T>
-__global__ void gpu_mirst1d_layer_fft(complex<T>* dest, complex<T>* ri, 
-									  T* src, T* zf, 
-									  T w, unsigned int zR, unsigned int nuR){
-	//dest = complex field representing the sample
-	//ri = refractive indices for each wavelength
-	//src = intensity of the light source for each wavelength
-	//zf = z position of the slab interface for each wavelength (accounting for optical path length)
-	//w = width of the slab (in pixels)
-	//zR = number of z-axis samples
-	//nuR = number of wavelengths
-
-    //get the current coordinate in the plane slice
-	int ifz = blockIdx.x * blockDim.x + threadIdx.x;
-	int inu = blockIdx.y * blockDim.y + threadIdx.y;
-
-	//make sure that the thread indices are in-bounds
-	if(inu >= nuR || ifz >= zR) return;
-
-	int i = inu * zR + ifz;
-
-    T fz;
-    if(ifz < zR/2)
-        fz = ifz / (T)zR;
-    else
-        fz = -(zR - ifz) / (T)zR;
-
-    //if the slab starts outside of the simulation domain, just return
-    if(zf[inu] >= zR) return;
-
-	//fill the array along z with a sinc function representing the Fourier transform of the layer
-
-	T opl = w * ri[inu].real();			//optical path length
-
-	//handle the case where the slab goes outside the simulation domain
-	if(zf[inu] + opl >= zR)
-		opl = zR - zf[inu];
-
-	if(opl == 0) return;
-
-	//T l = w * ri[inu].real();
-	//complex<T> e(0.0, -2 * PI * fz * (zf[inu] + zR/2 - l/2.0));
-	complex<T> e(0, -2 * stimPI * fz * (zf[inu] + opl/2));
-
-	complex<T> eta = ri[inu] * ri[inu] - 1;
-
-	//dest[i] = fz;//exp(e) * m[inu] * src[inu] * sin(PI * fz * l) / (PI * fz);
-	if(ifz == 0)
-        dest[i] += opl * exp(e) * eta * src[inu];
-    else
-        dest[i] += opl * exp(e) * eta * src[inu] * sin(stimPI * fz * opl) / (stimPI * fz * opl);
-}
-
-template<typename T>
-__global__ void gpu_mirst1d_increment_z(T* zf, complex<T>* ri, T w, unsigned int S){
-	//zf = current z depth (optical path length) in pixels
-	//ri = refractive index of the material
-	//w = actual width of the layer (in pixels)
-
-
-	//compute the index for this thread
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if(i >= S) return;
-
-	if(ri == NULL)
-		zf[i] += w;
-	else
-		zf[i] += ri[i].real() * w;
-}
-
-//apply the 1D MIRST filter to an existing sample (overwriting the sample)
-template<typename T>
-__global__ void gpu_mirst1d_apply_filter(complex<T>* sampleFFT, T* lambda, 
-								 T dFz,
-								 T inNA, T outNA, 
-								 unsigned int lambdaR, unsigned int zR, 
-								 T sigma = 0){
-	//sampleFFT = the sample in the Fourier domain (will be overwritten)
-	//lambda = list of wavelengths
-	//dFz = delta along the Fz axis in the frequency domain
-	//inNA = NA of the internal obscuration
-	//outNA = NA of the objective
-	//zR = number of pixels along the Fz axis (same as the z-axis)
-	//lambdaR = number of wavelengths
-	//sigma = width of the Gaussian source
-	int ifz = blockIdx.x * blockDim.x + threadIdx.x;
-	int inu = blockIdx.y * blockDim.y + threadIdx.y;
-
-	if(inu >= lambdaR || ifz >= zR) return;
-
-	//calculate the index into the sample FT
-	int i = inu * zR + ifz;
-
-	//compute the frequency (and set all negative spatial frequencies to zero)
-	T fz;
-	if(ifz < zR / 2)
-	    fz = ifz * dFz;
-	//if the spatial frequency is negative, set it to zero and exit
-	else{
-	    sampleFFT[i] = 0;
-	    return;
-	}
-
-	//compute the frequency in inverse microns
-	T nu = 1/lambda[inu];
-
-	//determine the radius of the integration circle
-	T nu_sq = nu * nu;
-	T fz_sq = (fz * fz) / 4;
-
-	//cut off frequencies above the diffraction limit
-	T r;
-	if(fz_sq < nu_sq)
-	    r = sqrt(nu_sq - fz_sq);
-	else
-	    r = 0;
-
-	//account for the optics
-	T Q = 0;
-	if(r > nu * inNA && r < nu * outNA)
-	    Q = 1;
-
-	//account for the source
-	//T sigma = 30.0;
-	T s = exp( - (r*r * sigma*sigma) / 2 );
-	//T s=1;
-
-	//compute the final filter
-	T mirst = 0;
-	if(fz != 0)
-	    mirst = 2 * stimPI * r * s * Q * (1/fz);
-
-	sampleFFT[i] *= mirst;
-
-}
-
-/*This object performs a 1-dimensional (layered) MIRST simulation
-*/
-template<typename T>
-class mirst1d{
-
-private:
-	unsigned int Z;	//z-axis resolution
-	unsigned int pad;	//pixel padding on either side of the sample
-
-	std::vector< material<T> > matlist;	//list of materials
-	std::vector< T > layers;				//list of layer thicknesses
-
-	std::vector< T > lambdas;		//list of wavelengths that are being simulated
-	unsigned int S;					//number of wavelengths (size of "lambdas")
-
-	T NA[2];						//numerical aperature (central obscuration and outer diameter)
-
-	function<T, T> source_profile;	//profile (spectrum) of the source (expressed in inverse centimeters)
-
-	complexfield<T, 1> scratch;		//scratch GPU memory used to build samples, transforms, etc.
-
-	void fft(int direction = CUFFT_FORWARD){
-
-		unsigned padZ = Z + pad;
-		
-		//create cuFFT handles
-		cufftHandle plan;
-		cufftResult result;
-		
-		if(sizeof(T) == 4)
-			result = cufftPlan1d(&plan, padZ, CUFFT_C2C, lambdas.size());	//single precision
-		else
-			result = cufftPlan1d(&plan, padZ, CUFFT_Z2Z, lambdas.size());	//double precision
-
-		//check for Plan 1D errors
-		if(result != CUFFT_SUCCESS){
-			std::cout<<"Error creating CUFFT plan for computing the FFT:"<<std::endl;
-			CufftError(result);
-			exit(1);
-		}
-
-		if(sizeof(T) == 4)
-			result = cufftExecC2C(plan, (cufftComplex*)scratch.ptr(), (cufftComplex*)scratch.ptr(), direction);
-		else
-			result = cufftExecZ2Z(plan, (cufftDoubleComplex*)scratch.ptr(), (cufftDoubleComplex*)scratch.ptr(), direction);
-
-		//check for FFT errors
-		if(result != CUFFT_SUCCESS){
-			std::cout<<"Error executing CUFFT to compute the FFT."<<std::endl;
-			CufftError(result);
-			exit(1);
-		}
-
-		cufftDestroy(plan);
-	}
-
-
-	//initialize the scratch memory
-	void init_scratch(){
-		scratch = complexfield<T, 1>(Z + pad , lambdas.size());
-		scratch = 0;
-	}
-
-	//get the list of scattering efficiency (eta) values for a specified layer
-	std::vector< complex<T> > layer_etas(unsigned int l){
-
-		std::vector< complex<T> > etas;
-
-		//fill the list of etas
-		for(unsigned int i=0; i<lambdas.size(); i++)
-			etas.push_back( matlist[l].eta(lambdas[i]) );
-		return etas;
-	}
-
-	//calculates the optimal block and grid sizes using information from the GPU
-	void cuda_params(dim3& grids, dim3& blocks){
-		int maxThreads = stim::maxThreadsPerBlock(); //compute the optimal block size
-		int SQRT_BLOCK = (int)std::sqrt((float)maxThreads);
-
-		//create one thread for each detector pixel
-		blocks = dim3(SQRT_BLOCK, SQRT_BLOCK);
-		grids = dim3(((Z + 2 * pad) + SQRT_BLOCK -1)/SQRT_BLOCK, (S + SQRT_BLOCK - 1)/SQRT_BLOCK);
-	}
-
-	//add the fourier transform of layer n to the scratch space
-	void build_layer_fft(unsigned int n, T* zf){
-		unsigned int paddedZ = Z + pad;
-
-		T wpx = layers[n] / dz();	//calculate the width of the layer in pixels
-
-		//allocate memory for the refractive index
-		complex<T>* gpuRi;
-		HANDLE_ERROR(cudaMalloc( (void**)&gpuRi, sizeof(complex<T>) * S));
-
-		//allocate memory for the source profile
-		T* gpuSrc;
-		HANDLE_ERROR(cudaMalloc( (void**)&gpuSrc, sizeof(T) * S));
-
-		complex<T> ri;
-		T source;
-		//store the refractive index and source profile in a CPU array
-		for(int inu=0; inu<S; inu++){
-			//save the refractive index to the GPU
-			ri = matlist[n].getN(lambdas[inu]);
-			HANDLE_ERROR(cudaMemcpy( gpuRi + inu, &ri, sizeof(complex<T>), cudaMemcpyHostToDevice ));
-
-			//save the source profile to the GPU
-			source = source_profile(10000 / lambdas[inu]);
-			HANDLE_ERROR(cudaMemcpy( gpuSrc + inu, &source, sizeof(T), cudaMemcpyHostToDevice ));
-
-		}
-
-		//create one thread for each pixel of the field slice
-		dim3 gridDim, blockDim;
-		cuda_params(gridDim, blockDim);
-		stim::gpu_mirst1d_layer_fft<<<gridDim, blockDim>>>(scratch.ptr(), gpuRi, gpuSrc, zf, wpx, paddedZ, S);
-
-		int linBlock = stim::maxThreadsPerBlock(); //compute the optimal block size
-		int linGrid = S / linBlock + 1;
-		stim::gpu_mirst1d_increment_z <<<linGrid, linBlock>>>(zf, gpuRi, wpx, S);
-
-		//free memory
-		HANDLE_ERROR(cudaFree(gpuRi));
-		HANDLE_ERROR(cudaFree(gpuSrc));
-	}
-
-	void build_sample(){
-		init_scratch();		//initialize the GPU scratch space
-		//build_layer(1);
-
-		T* zf;
-		HANDLE_ERROR(cudaMalloc(&zf, sizeof(T) * S));
-		HANDLE_ERROR(cudaMemset(zf, 0, sizeof(T) * S));
-
-		//render each layer of the sample
-		for(unsigned int l=0; l<layers.size(); l++){
-			build_layer_fft(l, zf);
-		}
-
-		HANDLE_ERROR(cudaFree(zf));
-	}
-
-	void apply_filter(){
-		dim3 gridDim, blockDim;
-		cuda_params(gridDim, blockDim);
-
-		unsigned int Zpad = Z + pad;
-
-		T sim_range = dz() * Zpad;
-    	T dFz = 1 / sim_range;
-
-		//copy the array of wavelengths to the GPU
-		T* gpuLambdas;
-		HANDLE_ERROR(cudaMalloc(&gpuLambdas, sizeof(T) * Zpad));
-		HANDLE_ERROR(cudaMemcpy(gpuLambdas, &lambdas[0], sizeof(T) * Zpad, cudaMemcpyHostToDevice));
-		stim::gpu_mirst1d_apply_filter <<<gridDim, blockDim>>>(scratch.ptr(), gpuLambdas, 
-								 dFz,
-								 NA[0], NA[1], 
-								 S, Zpad);
-	}
-
-	//crop the image to the sample thickness - keep in mind that sample thickness != optical path length
-	void crop(){
-
-		scratch = scratch.crop(Z, S);
-	}
-	
-	//save the scratch field as a binary file
-	void to_binary(std::string filename){
-
-	}
-
-
-public:
-
-	//constructor
-	mirst1d(unsigned int rZ = 100,
-			unsigned int padding = 0){
-		Z = rZ;
-		pad = padding;
-		NA[0] = 0;
-		NA[1] = 0.8;
-		S = 0;
-		source_profile = 1;
-	}
-
-	//add a layer, thickness = microns
-	void add_layer(material<T> mat, T thickness){
-		matlist.push_back(mat);
-		layers.push_back(thickness);
-	}
-
-	void add_layer(std::string filename, T thickness){
-		add_layer(material<T>(filename), thickness);
-	}
-
-	//adds a profile spectrum for the light source
-	void set_source(std::string filename){
-		source_profile.load(filename);
-	}
-
-	//adds a block of wavenumbers (cm^-1) to the simulation parameters
-	void add_wavenumbers(unsigned int start, unsigned int stop, unsigned int step){
-		unsigned int nu = start;
-		while(nu <= stop){
-			lambdas.push_back((T)10000 / nu);
-			nu += step;
-		}
-		S = lambdas.size();		//increment the number of wavelengths (shorthand for later)
-	}
-
-	T thickness(){
-		T t = 0;
-		for(unsigned int l=0; l<layers.size(); l++)
-			t += layers[l];
-		return t;
-	}
-
-	void padding(unsigned int padding = 0){
-		pad = padding;
-	}
-
-	T dz(){
-		return thickness() / Z;		//calculate the z-axis step size
-	}
-
-	void na(T in, T out){
-		NA[0] = in;
-		NA[1] = out;
-	}
-
-	void na(T out){
-		na(0, out);
-	}
-
-	stim::function<T, T> get_source(){
-		return source_profile;
-	}
-
-	void save_sample(std::string filename){
-		//create a sample and save the magnitude as an image
-		build_sample();
-		fft(CUFFT_INVERSE);
-		scratch.toImage(filename, stim::complexfield<T, 1>::magnitude);
-	}
-
-	void save_mirst(std::string filename, bool binary = true){
-		//apply the MIRST filter to a sample and save the image
-
-		//build the sample in the Fourier domain
-		build_sample();
-
-		//apply the MIRST filter
-		apply_filter();
-
-		//apply an inverse FFT to bring the results back into the spatial domain
-		fft(CUFFT_INVERSE);
-
-		crop();
-
-		//save the image
-		if(binary)
-			to_binary(filename);
-		else
-			scratch.toImage(filename, stim::complexfield<T, 1>::magnitude);
-	}
-
-
-
-
-	std::string str(){
-
-		stringstream ss;
-		ss<<"1D MIRST Simulation========================="<<std::endl;
-		ss<<"z-axis resolution: "<<Z<<std::endl;
-		ss<<"simulation domain: ["<<lambdas[0]<<", "<<lambdas.back()<<"]"<<std::endl;
-		ss<<"number of wavelengths: "<<lambdas.size()<<std::endl;
-		ss<<"padding: "<<pad<<std::endl;
-		ss<<"sample thickness: "<<thickness()<<" um"<<std::endl;
-		ss<<"dz: "<<dz()<<" um"<<std::endl;
-		ss<<std::endl;
-		ss<<layers.size()<<" layers-------------"<<std::endl;
-		for(unsigned int l=0; l<layers.size(); l++)
-			ss<<"layer "<<l<<": "<<layers[l]<<" um"<<"---------"<<std::endl<<matlist[l].str()<<std::endl;
-
-		ss<<"source profile-----------"<<std::endl;
-		ss<<get_source().str()<<std::endl;
-
-		return ss.str();
-
-
-	}
-
-
-
-};
-
-}
+#include "../optics/material.h"
+#include "../math/complexfield.cuh"
+#include "../math/constants.h"
+//#include "../envi/bil.h"
+
+#include "cufft.h"
+
+#include <vector>
+#include <sstream>
+
+namespace stim{
+
+//Adds the Fourier transform of one slab (a phase-shifted sinc) to "dest", such that an iFFT
+//	of "dest" produces the slab.
+//	Launch layout: 2D, one thread per (frequency, wavelength) pair; x indexes the zR samples
+//	along z and y indexes the nuR wavelengths. Threads outside that range return at once.
+template<typename T>
+__global__ void gpu_mirst1d_layer_fft(complex<T>* dest, complex<T>* ri, 
+									  T* src, T* zf, 
+									  T w, unsigned int zR, unsigned int nuR){
+	//dest = complex field representing the sample
+	//ri = refractive indices for each wavelength
+	//src = intensity of the light source for each wavelength
+	//zf = z position of the slab interface for each wavelength (accounting for optical path length)
+	//w = width of the slab (in pixels)
+	//zR = number of z-axis samples
+	//nuR = number of wavelengths
+
+	//get the current coordinate in the plane slice
+	int ifz = blockIdx.x * blockDim.x + threadIdx.x;
+	int inu = blockIdx.y * blockDim.y + threadIdx.y;
+
+	//make sure that the thread indices are in-bounds
+	if(inu >= nuR || ifz >= zR) return;
+
+	int i = inu * zR + ifz;
+
+	//normalized frequency: the lower half of the axis is positive, the upper half negative.
+	//	The subtraction is done in T because zR is unsigned: negating (zR - ifz) as an
+	//	unsigned value wraps around to a huge positive number instead of a negative one.
+	T fz;
+	if(ifz < zR/2)
+		fz = ifz / (T)zR;
+	else
+		fz = ((T)ifz - (T)zR) / (T)zR;
+
+	//if the slab starts outside of the simulation domain, just return
+	if(zf[inu] >= zR) return;
+
+	//fill the array along z with a sinc function representing the Fourier transform of the layer
+
+	T opl = w * ri[inu].real();			//optical path length
+
+	//handle the case where the slab goes outside the simulation domain
+	if(zf[inu] + opl >= zR)
+		opl = zR - zf[inu];
+
+	if(opl == 0) return;
+
+	//phase term that places the slab center at zf + opl/2
+	complex<T> e(0, -2 * stimPI * fz * (zf[inu] + opl/2));
+
+	complex<T> eta = ri[inu] * ri[inu] - 1;
+
+	//the sinc factor is 1 at zero frequency, so it is skipped there to avoid 0/0
+	if(ifz == 0)
+		dest[i] += opl * exp(e) * eta * src[inu];
+	else
+		dest[i] += opl * exp(e) * eta * src[inu] * sin(stimPI * fz * opl) / (stimPI * fz * opl);
+}
+
+//Advances the running depth stored in zf by one layer, one thread per entry.
+//	zf = current z depth (optical path length) in pixels, S entries
+//	ri = refractive index of the material for each entry, or NULL
+//	w = actual width of the layer (in pixels)
+//	S = number of entries in zf
+template<typename T>
+__global__ void gpu_mirst1d_increment_z(T* zf, complex<T>* ri, T w, unsigned int S){
+
+	int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+	//only threads that map onto an entry do any work
+	if(idx < S){
+		//with refractive indices the width is scaled by the real part of the index;
+		//	without them the depth advances by the plain width
+		if(ri != NULL)
+			zf[idx] += ri[idx].real() * w;
+		else
+			zf[idx] += w;
+	}
+}
+
+//Multiplies an existing sample (in the Fourier domain) by the 1D MIRST filter, in place.
+//	Launch layout: 2D, x indexes the zR frequency samples and y the lambdaR wavelengths.
+//	sampleFFT = the sample in the Fourier domain (will be overwritten)
+//	lambda = list of wavelengths
+//	dFz = delta along the Fz axis in the frequency domain
+//	inNA = NA of the internal obscuration
+//	outNA = NA of the objective
+//	lambdaR = number of wavelengths
+//	zR = number of pixels along the Fz axis (same as the z-axis)
+//	sigma = width of the Gaussian source
+template<typename T>
+__global__ void gpu_mirst1d_apply_filter(complex<T>* sampleFFT, T* lambda, 
+								 T dFz,
+								 T inNA, T outNA, 
+								 unsigned int lambdaR, unsigned int zR, 
+								 T sigma = 0){
+	int ifz = blockIdx.x * blockDim.x + threadIdx.x;
+	int inu = blockIdx.y * blockDim.y + threadIdx.y;
+
+	if(inu >= lambdaR || ifz >= zR) return;
+
+	//index of this thread's element in the sample FT
+	int idx = inu * zR + ifz;
+
+	//the upper half of the axis holds the negative spatial frequencies, which are zeroed
+	if(!(ifz < zR / 2)){
+		sampleFFT[idx] = 0;
+		return;
+	}
+	T fz = ifz * dFz;
+
+	//frequency in inverse microns
+	T nu = 1/lambda[inu];
+
+	//radius of the integration circle; zero above the diffraction limit
+	T nu_sq = nu * nu;
+	T fz_sq = (fz * fz) / 4;
+	T r = 0;
+	if(fz_sq < nu_sq)
+		r = sqrt(nu_sq - fz_sq);
+
+	//optics: pass only radii strictly between the two aperture limits
+	T Q = (r > nu * inNA && r < nu * outNA) ? 1 : 0;
+
+	//source: Gaussian weighting of the radius
+	T s = exp( - (r*r * sigma*sigma) / 2 );
+
+	//final filter value; the zero-frequency sample is multiplied by zero
+	T mirst = 0;
+	if(fz != 0)
+		mirst = 2 * stimPI * r * s * Q * (1/fz);
+
+	sampleFFT[idx] *= mirst;
+}
+
+/*This object performs a 1-dimensional (layered) MIRST simulation
+*/
+template<typename T>
+class mirst1d{
+
+private:
+	unsigned int Z;	//z-axis resolution
+	unsigned int pad;	//pixel padding on either side of the sample
+
+	std::vector< material<T> > matlist;	//list of materials
+	std::vector< T > layers;				//list of layer thicknesses
+
+	std::vector< T > lambdas;		//list of wavelengths that are being simulated
+	unsigned int S;					//number of wavelengths (size of "lambdas")
+
+	T NA[2];						//numerical aperature (central obscuration and outer diameter)
+
+	function<T, T> source_profile;	//profile (spectrum) of the source (expressed in inverse centimeters)
+
+	complexfield<T, 1> scratch;		//scratch GPU memory used to build samples, transforms, etc.
+
+	//Runs a batched in-place 1D cuFFT over the scratch field: one transform of length
+	//	Z + pad per wavelength, in the given direction. Prints a message and exits on failure.
+	void fft(int direction = CUFFT_FORWARD){
+
+		const unsigned padZ = Z + pad;
+		const bool single = (sizeof(T) == 4);	//a 4-byte T selects the single-precision calls
+
+		//create the plan
+		cufftHandle plan;
+		cufftResult status = cufftPlan1d(&plan, padZ, single ? CUFFT_C2C : CUFFT_Z2Z, lambdas.size());
+		if(status != CUFFT_SUCCESS){
+			std::cout<<"Error creating CUFFT plan for computing the FFT:"<<std::endl;
+			CufftError(status);
+			exit(1);
+		}
+
+		//execute the transform, reading from and writing to the scratch buffer
+		if(single)
+			status = cufftExecC2C(plan, (cufftComplex*)scratch.ptr(), (cufftComplex*)scratch.ptr(), direction);
+		else
+			status = cufftExecZ2Z(plan, (cufftDoubleComplex*)scratch.ptr(), (cufftDoubleComplex*)scratch.ptr(), direction);
+
+		if(status != CUFFT_SUCCESS){
+			std::cout<<"Error executing CUFFT to compute the FFT."<<std::endl;
+			CufftError(status);
+			exit(1);
+		}
+
+		cufftDestroy(plan);
+	}
+
+
+	//Replaces the scratch field with a new one of Z + pad by lambdas.size() samples,
+	//	then assigns 0 to it.
+	void init_scratch(){
+		const unsigned int paddedZ = Z + pad;
+		scratch = complexfield<T, 1>(paddedZ, lambdas.size());
+		scratch = 0;
+	}
+
+	//Collects the scattering efficiency (eta) of layer l at every simulated wavelength.
+	//	NOTE(review): material<T> as declared in material.h exposes getN(); confirm that
+	//	eta() is provided elsewhere (for example by a base class).
+	std::vector< complex<T> > layer_etas(unsigned int l){
+
+		std::vector< complex<T> > result;
+
+		unsigned int i = 0;
+		while(i < lambdas.size()){
+			result.push_back( matlist[l].eta(lambdas[i]) );
+			i++;
+		}
+		return result;
+	}
+
+	//Computes a 2D launch configuration with one thread per (z sample, wavelength) pair.
+	//	Blocks are square with a side of floor(sqrt(max threads per block)); the grid covers
+	//	Z + pad samples along x and S wavelengths along y, rounding up.
+	//	grids = receives the grid dimensions
+	//	blocks = receives the block dimensions
+	void cuda_params(dim3& grids, dim3& blocks){
+		int maxThreads = stim::maxThreadsPerBlock(); //compute the optimal block size
+		int side = (int)std::sqrt((float)maxThreads);
+		if(side < 1) side = 1;		//never build an empty block or divide by zero below
+		const unsigned int SQRT_BLOCK = (unsigned int)side;
+
+		//the field is allocated with Z + pad samples per wavelength and the kernels are
+		//	launched with that same extent, so the grid covers exactly that many
+		const unsigned int paddedZ = Z + pad;
+
+		blocks = dim3(SQRT_BLOCK, SQRT_BLOCK);
+		grids = dim3((paddedZ + SQRT_BLOCK - 1)/SQRT_BLOCK, (S + SQRT_BLOCK - 1)/SQRT_BLOCK);
+	}
+
+	//Adds the Fourier transform of layer n to the scratch space and advances the running
+	//	depth in zf by that layer.
+	//	n = index of the layer in "layers" / "matlist"
+	//	zf = device array of S running depths (in pixels), updated in place
+	void build_layer_fft(unsigned int n, T* zf){
+
+		//without wavelengths there is nothing to render, and a grid of zero blocks is not a valid launch
+		if(S == 0) return;
+
+		unsigned int paddedZ = Z + pad;
+
+		T wpx = layers[n] / dz();	//calculate the width of the layer in pixels
+
+		//evaluate the refractive index and the source profile for every wavelength on the host
+		std::vector< complex<T> > hostRi(S);
+		std::vector< T > hostSrc(S);
+		for(unsigned int inu=0; inu<S; inu++){
+			hostRi[inu] = matlist[n].getN(lambdas[inu]);
+			hostSrc[inu] = source_profile(10000 / lambdas[inu]);
+		}
+
+		//upload each array with a single copy instead of one transfer per element
+		complex<T>* gpuRi;
+		HANDLE_ERROR(cudaMalloc( (void**)&gpuRi, sizeof(complex<T>) * S));
+		HANDLE_ERROR(cudaMemcpy( gpuRi, &hostRi[0], sizeof(complex<T>) * S, cudaMemcpyHostToDevice ));
+
+		T* gpuSrc;
+		HANDLE_ERROR(cudaMalloc( (void**)&gpuSrc, sizeof(T) * S));
+		HANDLE_ERROR(cudaMemcpy( gpuSrc, &hostSrc[0], sizeof(T) * S, cudaMemcpyHostToDevice ));
+
+		//create one thread for each pixel of the field slice
+		dim3 gridDim, blockDim;
+		cuda_params(gridDim, blockDim);
+		stim::gpu_mirst1d_layer_fft<<<gridDim, blockDim>>>(scratch.ptr(), gpuRi, gpuSrc, zf, wpx, paddedZ, S);
+		HANDLE_ERROR(cudaGetLastError());		//report an invalid launch configuration
+
+		//advance the depth of every wavelength past this layer
+		int linBlock = stim::maxThreadsPerBlock(); //compute the optimal block size
+		int linGrid = S / linBlock + 1;
+		stim::gpu_mirst1d_increment_z <<<linGrid, linBlock>>>(zf, gpuRi, wpx, S);
+		HANDLE_ERROR(cudaGetLastError());
+
+		//free memory
+		HANDLE_ERROR(cudaFree(gpuRi));
+		HANDLE_ERROR(cudaFree(gpuSrc));
+	}
+
+	//Resets the scratch field and adds every layer to it in order, tracking one running
+	//	depth per wavelength in a temporary device array that starts at zero.
+	void build_sample(){
+		init_scratch();
+
+		//per-wavelength depth accumulator
+		T* depth;
+		HANDLE_ERROR(cudaMalloc(&depth, sizeof(T) * S));
+		HANDLE_ERROR(cudaMemset(depth, 0, sizeof(T) * S));
+
+		//render the layers from first to last
+		unsigned int l = 0;
+		while(l < layers.size()){
+			build_layer_fft(l, depth);
+			l++;
+		}
+
+		HANDLE_ERROR(cudaFree(depth));
+	}
+
+	//Multiplies the scratch field (in the Fourier domain) by the MIRST filter on the GPU.
+	void apply_filter(){
+
+		//nothing to filter without wavelengths; this also avoids reading lambdas[0] of an empty list
+		if(S == 0) return;
+
+		dim3 gridDim, blockDim;
+		cuda_params(gridDim, blockDim);
+
+		unsigned int Zpad = Z + pad;
+
+		T sim_range = dz() * Zpad;
+		T dFz = 1 / sim_range;
+
+		//copy the array of wavelengths to the GPU: it holds S entries (one per wavelength),
+		//	so S - not the padded z resolution - is the number of elements to allocate and copy
+		T* gpuLambdas;
+		HANDLE_ERROR(cudaMalloc(&gpuLambdas, sizeof(T) * S));
+		HANDLE_ERROR(cudaMemcpy(gpuLambdas, &lambdas[0], sizeof(T) * S, cudaMemcpyHostToDevice));
+		stim::gpu_mirst1d_apply_filter <<<gridDim, blockDim>>>(scratch.ptr(), gpuLambdas, 
+								 dFz,
+								 NA[0], NA[1], 
+								 S, Zpad);
+		HANDLE_ERROR(cudaGetLastError());		//report an invalid launch configuration
+
+		//release the temporary wavelength buffer
+		HANDLE_ERROR(cudaFree(gpuLambdas));
+	}
+
+	//Replaces the scratch field with the result of cropping it to Z samples by S wavelengths,
+	//	which drops the padding. Keep in mind that sample thickness != optical path length.
+	void crop(){
+
+		scratch = scratch.crop(Z, S);
+	}
+	
+	//Intended to save the scratch field as a binary file.
+	//	NOTE(review): the body is empty, so nothing is written; save_mirst() calls this
+	//	function when its "binary" flag is true, which is that flag's default.
+	void to_binary(std::string filename){
+
+	}
+
+
+public:
+
+	//Constructor: stores the z resolution and padding, starts with no wavelengths, sets the
+	//	aperture limits to [0, 0.8] and assigns 1 to the source profile.
+	mirst1d(unsigned int rZ = 100,
+			unsigned int padding = 0)
+		: Z(rZ), pad(padding), S(0){
+
+		//aperture limits: inner value first, outer value second
+		NA[0] = 0;
+		NA[1] = 0.8;
+
+		source_profile = 1;
+	}
+
+	//Appends a layer to the sample: the material and its thickness (in microns) are stored
+	//	at the same index of their respective lists.
+	void add_layer(material<T> mat, T thickness)
+	{
+		matlist.push_back(mat);			//material of the new layer
+		layers.push_back(thickness);	//thickness of the new layer
+	}
+
+	//Appends a layer whose material is loaded from the given file
+	void add_layer(std::string filename, T thickness){
+		material<T> loaded(filename);
+		add_layer(loaded, thickness);
+	}
+
+	/// Loads the light source profile spectrum from a file.
+	/// @param filename is handed to source_profile.load()
+	void set_source(std::string filename){
+		source_profile.load(filename);
+	}
+
+	/// Adds a block of wavenumbers (cm^-1) to the simulation parameters.
+	/// Each wavenumber nu in {start, start+step, ...} that is <= stop is stored
+	/// as the wavelength 10000 / nu, and S is updated to the new list size.
+	/// A step of zero adds the single wavenumber start (if start <= stop)
+	/// instead of looping forever.
+	/// @param start is the first wavenumber
+	/// @param stop is the largest wavenumber that may be added
+	/// @param step is the spacing between consecutive wavenumbers
+	void add_wavenumbers(unsigned int start, unsigned int stop, unsigned int step){
+		if(step == 0){
+			//a zero step would never advance past stop
+			if(start <= stop)
+				lambdas.push_back((T)10000 / start);
+		}
+		else{
+			for(unsigned int nu = start; nu <= stop; nu += step){
+				lambdas.push_back((T)10000 / nu);
+				//stop before the increment passes stop, so nu cannot wrap around
+				if(stop - nu < step) break;
+			}
+		}
+		S = lambdas.size();		//update the number of wavelengths (shorthand for later)
+	}
+
+	/// Returns the total sample thickness (the sum of all layer thicknesses).
+	T thickness(){
+		T total = 0;
+		unsigned int idx = 0;
+		while(idx < layers.size()){
+			total += layers[idx];
+			++idx;
+		}
+		return total;
+	}
+
+	/// Sets the amount of padding.
+	/// @param padding is the new padding value (defaults to none)
+	void padding(unsigned int padding = 0){
+		pad = padding;
+	}
+
+	/// Returns the z-axis step size: total thickness divided by the z resolution.
+	T dz(){
+		T total = thickness();
+		return total / Z;
+	}
+
+	/// Sets both entries of the NA pair.
+	/// @param in is stored in NA[0]
+	/// @param out is stored in NA[1]
+	void na(T in, T out){
+		NA[0] = in;
+		NA[1] = out;
+	}
+
+	/// Sets the NA pair to (0, out).
+	/// @param out is stored in NA[1]
+	void na(T out){
+		na(0, out);			//forward to the two-argument overload
+	}
+
+	/// Returns a copy of the light source profile.
+	stim::function<T, T> get_source(){
+		return source_profile;
+	}
+
+	/// Builds the sample, applies an inverse FFT, and saves the magnitude
+	/// of the resulting field as an image.
+	/// @param filename is the name of the image file
+	void save_sample(std::string filename){
+		build_sample();				//render all layers into the scratch field
+		fft(CUFFT_INVERSE);
+		scratch.toImage(filename, stim::complexfield<T, 1>::magnitude);
+	}
+
+	/// Applies the MIRST filter to the sample and saves the result.
+	/// Pipeline: build the sample (Fourier domain), apply the filter,
+	/// inverse FFT back to the spatial domain, crop, then save.
+	/// @param filename is the name of the output file
+	/// @param binary selects to_binary() (true) or a magnitude image (false)
+	void save_mirst(std::string filename, bool binary = true){
+		build_sample();
+		apply_filter();
+		fft(CUFFT_INVERSE);
+		crop();
+
+		//save the result in the requested format
+		if(!binary){
+			scratch.toImage(filename, stim::complexfield<T, 1>::magnitude);
+			return;
+		}
+		to_binary(filename);
+	}
+
+
+
+
+	/// Returns a human-readable summary of the simulation: resolution,
+	/// wavelength range, padding, thickness, step size, every layer with its
+	/// material, and the source profile.
+	/// The wavelength range is reported as empty when no wavenumbers have
+	/// been added, since reading the first/last element would be invalid.
+	std::string str(){
+
+		std::stringstream ss;
+		ss<<"1D MIRST Simulation========================="<<std::endl;
+		ss<<"z-axis resolution: "<<Z<<std::endl;
+		if(lambdas.size() == 0)
+			ss<<"simulation domain: []"<<std::endl;
+		else
+			ss<<"simulation domain: ["<<lambdas[0]<<", "<<lambdas.back()<<"]"<<std::endl;
+		ss<<"number of wavelengths: "<<lambdas.size()<<std::endl;
+		ss<<"padding: "<<pad<<std::endl;
+		ss<<"sample thickness: "<<thickness()<<" um"<<std::endl;
+		ss<<"dz: "<<dz()<<" um"<<std::endl;
+		ss<<std::endl;
+		ss<<layers.size()<<" layers-------------"<<std::endl;
+		for(unsigned int l=0; l<layers.size(); l++)
+			ss<<"layer "<<l<<": "<<layers[l]<<" um"<<"---------"<<std::endl<<matlist[l].str()<<std::endl;
+
+		ss<<"source profile-----------"<<std::endl;
+		ss<<get_source().str()<<std::endl;
+
+		return ss.str();
+	}
+
+
+
+};
+
+}
+#ifndef RTS_PLANEWAVE
+#define RTS_PLANEWAVE
+
+#include <string>
+#include <sstream>
+
+#include "../math/vector.h"
+#include "../math/quaternion.h"
+#include "../math/constants.h"
+#include "../math/plane.h"
+#include "../cuda/callable.h"
+
+/*Basic conversions used here (assuming a vacuum)
+	lambda = tau / |k|		(equivalently |k| = tau / lambda)
+*/
+
+namespace stim{
+	namespace optics{
+
+template<typename T>
+class planewave{
+
+protected:
+
+	vec<T> k;	//k = tau / lambda
+	vec< complex<T> > E0;		//amplitude
+	//T phi;
+
+	/// Returns a copy of this plane wave rotated so that it propagates along kn.
+	/// The magnitude of k is preserved and E0 is rotated by the same rotation
+	/// that takes the (possibly flipped) current direction onto kn.
+	/// If kn is parallel or anti-parallel to k, no rotation is applied: the
+	/// result keeps E0 and uses k or -k respectively.
+	/// @param kn is the new propagation direction (it is normalized internally)
+	CUDA_CALLABLE planewave<T> bend(rts::vec<T> kn) const{
+
+		vec<T> kn_hat = kn.norm();				//normalize the new k
+		vec<T> k_hat = k.norm();				//normalize the current k
+
+		planewave<T> new_p;						//create a new plane wave
+
+		T k_dot_kn = k_hat.dot(kn_hat);
+
+		//if k . kn < 0, then the bend is a reflection: flip k_hat
+		if(k_dot_kn < 0) k_hat = -k_hat;
+
+		//if kn is equal to k or -k, handle the degenerate case
+		if(k_dot_kn == -1){
+			new_p.k = -k;
+			new_p.E0 = E0;
+			return new_p;
+		}
+		else if(k_dot_kn == 1){
+			new_p.k = k;
+			new_p.E0 = E0;
+			return new_p;
+		}
+
+		vec<T> r = k_hat.cross(kn_hat);			//compute the rotation vector
+		T r_len = r.len();
+
+		//rounding can leave the dot product just short of +/-1 while the cross
+		//product is exactly zero; normalizing r would then produce NaNs, so
+		//treat this like the exact degenerate cases above
+		if(r_len == 0){
+			if(k_dot_kn < 0) new_p.k = -k;
+			else new_p.k = k;
+			new_p.E0 = E0;
+			return new_p;
+		}
+
+		//rounding can also push the length of the cross product of two unit
+		//vectors slightly above 1, which would make asin() return NaN
+		if(r_len > 1) r_len = 1;
+
+		T theta = asin(r_len);					//compute the angle of the rotation about r
+
+		//create a quaternion to capture the rotation
+		quaternion<T> q;
+		q.CreateRotation(theta, r.norm());
+
+		//apply the rotation to E0
+		vec< complex<T> > E0n = q.toMatrix3() * E0;
+
+		new_p.k = kn_hat * kmag();
+		new_p.E0 = E0n;
+
+		return new_p;
+	}
+
+public:
+
+
+	///constructor: create a plane wave propagating along z, polarized along x
+	/*planewave(T lambda = (T)1)
+	{
+		k = rts::vec<T>(0, 0, 1) * (TAU/lambda);
+		E0 = rts::vec<T>(1, 0, 0);
+	}*/
+	///constructor: create a plane wave propagating along kvec with amplitude E and an optional phase offset
+	/// E is projected onto the direction orthogonal to k (within the plane spanned
+	/// by k and E), so the stored E0 has no component along k. The result is then
+	/// multiplied by exp(i * phase).
+	///@param kvec is the k vector (defaults to (0, 0, rtsTAU))
+	///@param E is the requested amplitude vector (defaults to (1, 0, 0))
+	///@param phase is the phase offset folded into E0 (defaults to 0)
+	/// NOTE(review): a zero-length kvec is not handled here - confirm callers never pass one
+	CUDA_CALLABLE planewave(vec<T> kvec = rts::vec<T>(0, 0, rtsTAU), 
+							vec< complex<T> > E = rts::vec<T>(1, 0, 0), T phase = 0)
+	{
+		//phi = phase;
+
+		k = kvec;
+		vec< complex<T> > k_hat = k.norm();		//unit propagation direction, stored as a complex vector
+
+		if(E.len() == 0)			//if the plane wave has an amplitude of 0
+			E0 = vec<T>(0);			//just return it
+		else{
+			vec< complex<T> > s = (k_hat.cross(E)).norm();		//compute an orthogonal side vector
+			vec< complex<T> > E_hat = (s.cross(k)).norm();	//compute a normalized E0 direction vector
+			E0 = E_hat * E_hat.dot(E);					//compute the projection of _E0 onto E0_hat
+		}
+
+		E0 = E0 * exp( complex<T>(0, phase) );		//fold the phase offset into the amplitude
+	}
+
+	///multiplication operator: scales E0 by rhs
+	/// Note that this modifies the plane wave in place and returns a reference to it.
+	///@param rhs is the scale factor applied to E0
+    CUDA_CALLABLE planewave<T> & operator* (const T & rhs)
+	{
+		E0 = E0 * rhs;			//scale the amplitude of this wave
+		return *this;
+	}
+
+	/// Returns the wavelength, computed as rtsTAU divided by the length of k.
+	CUDA_CALLABLE T lambda() const
+	{
+		T k_length = k.len();
+		return rtsTAU / k_length;
+	}
+
+	/// Returns the magnitude (length) of the k vector.
+	CUDA_CALLABLE T kmag() const
+	{
+		return k.len();
+	}
+
+	/// Returns a copy of the complex amplitude vector E0.
+	CUDA_CALLABLE vec< complex<T> > E(){
+		return E0;
+	}
+
+	/// Returns a copy of the k vector.
+	CUDA_CALLABLE vec<T> kvec(){
+		return k;
+	}
+
+	/*CUDA_CALLABLE T phase(){
+		return phi;
+	}
+
+	CUDA_CALLABLE void phase(T p){
+		phi = p;
+	}*/
+
+	/// Evaluates the field at a point: each component of E0 multiplied by exp(i k.p).
+	///@param p is the position at which the field is evaluated (defaults to the origin)
+	CUDA_CALLABLE vec< complex<T> > pos(vec<T> p = vec<T>(0, 0, 0)){
+		//phase factor exp(i * (k . p)) shared by all three components
+		T kdp = k.dot(p);
+		complex<T> x = complex<T>(0, kdp);
+		complex<T> phasor = exp(x);
+
+		vec< complex<T> > field;
+		for(int c = 0; c < 3; c++)
+			field[c] = E0[c] * phasor;
+
+		return field;
+	}
+
+	/// Returns a new plane wave whose k is scaled by (nt / ni) for a transition
+	/// from material ni to material nt; E0 is passed to the constructor unchanged.
+	///@param ni is the refractive index of the current material
+	///@param nt is the refractive index of the material being entered
+	CUDA_CALLABLE planewave<T> n(T ni, T nt){
+		T ratio = nt / ni;
+		return planewave<T>(k * ratio, E0);
+	}
+
+	/// Returns this plane wave bent to propagate along kn (forwards to bend()).
+	///@param kn is the new propagation direction
+	CUDA_CALLABLE planewave<T> refract(rts::vec<T> kn) const
+	{
+		planewave<T> bent = bend(kn);
+		return bent;
+	}
+
+	/// Computes the reflected and transmitted plane waves produced when this
+	/// wave hits a planar interface, using Snell's law and the Fresnel coefficients.
+	/// Total internal reflection is detected when asin() yields NaN; in that
+	/// case theta_t is set to pi/2 and both transmission coefficients are zero.
+	///@param P is the interface plane
+	///@param nr is the refractive index ratio across the plane (it is inverted when
+	///          the wave hits the back face; presumably n1/n0 - TODO confirm with callers)
+	///@param r receives the reflected plane wave
+	///@param t receives the transmitted plane wave
+	void scatter(rts::plane<T> P, T nr, planewave<T> &r, planewave<T> &t){
+
+		int facing = P.face(k);		//determine which direction the plane wave is coming in
+
+		//if(facing == 0)				//if the wave is tangent to the plane, return an identical wave
+		//	return *this;
+		//else 
+		if(facing == -1){		//if the wave hits the back of the plane, invert the plane and nr
+			P = P.flip();			//flip the plane
+			nr = 1/nr;				//invert the refractive index (now nr = n0/n1)
+		}
+
+		//use Snell's Law to calculate the transmitted angle
+		T cos_theta_i = k.norm().dot(-P.norm());				//compute the cosine of theta_i
+		T theta_i = acos(cos_theta_i);							//compute theta_i
+		T sin_theta_t = (1/nr) * sin(theta_i);						//compute the sine of theta_t using Snell's law
+		T theta_t = asin(sin_theta_t);							//compute theta_t (NaN if sin_theta_t is outside [-1, 1])
+
+		bool tir = false;						//flag for total internal reflection
+		if(theta_t != theta_t){					//a NaN compares unequal to itself
+			tir = true;
+			theta_t = rtsPI / (T)2;
+		}
+
+		//handle the degenerate case where theta_i is 0 (the plane wave hits head-on)
+		if(theta_i == 0){
+			T rp = (1 - nr) / (1 + nr);		//compute the Fresnel coefficients
+			T tp = 2 / (1 + nr);
+			vec<T> kr = -k;
+			vec<T> kt = k * nr;			//set the k vectors for theta_i = 0
+			vec< complex<T> > Er = E0 * rp;		//compute the E vectors
+			vec< complex<T> > Et = E0 * tp;
+			T phase_t = P.p().dot(k - kt);	//compute the phase offset
+			T phase_r = P.p().dot(k - kr);
+			//std::cout<<"Degeneracy: Head-On"<<std::endl;
+			//std::cout<<"rs: "<<rp<<"  rp: "<<rp<<"  ts: "<<tp<<"  tp: "<<tp<<std::endl;
+			//std::cout<<"phase r: "<<phase_r<<"  phase t: "<<phase_t<<std::endl;
+
+			//create the plane waves
+			r = planewave<T>(kr, Er, phase_r);
+			t = planewave<T>(kt, Et, phase_t);
+
+			//std::cout<<"i + r: "<<pos()[0] + r.pos()[0]<<pos()[1] + r.pos()[1]<<pos()[2] + r.pos()[2]<<std::endl;
+			//std::cout<<"t:     "<<t.pos()[0]<<t.pos()[1]<<t.pos()[2]<<std::endl;
+			//std::cout<<"--------------------------------"<<std::endl;
+			return;
+		}
+
+
+		//compute the Fresnel coefficients
+		T rp, rs, tp, ts;
+		rp = tan(theta_t - theta_i) / tan(theta_t + theta_i);
+		rs = sin(theta_t - theta_i) / sin(theta_t + theta_i);
+		
+		if(tir){
+			tp = ts = 0;
+		}
+		else{
+			tp = ( 2 * sin(theta_t) * cos(theta_i) ) / ( sin(theta_t + theta_i) * cos(theta_t - theta_i) );
+			ts = ( 2 * sin(theta_t) * cos(theta_i) ) / sin(theta_t + theta_i);
+		}
+
+		//compute the coordinate space for the plane of incidence
+		vec<T> z_hat = -P.norm();
+		vec<T> y_hat = P.parallel(k).norm();
+		vec<T> x_hat = y_hat.cross(z_hat).norm();
+
+		//compute the k vectors for r and t
+		vec<T> kr, kt;
+		kr = ( y_hat * sin(theta_i) - z_hat * cos(theta_i) ) * kmag();
+		kt = ( y_hat * sin(theta_t) + z_hat * cos(theta_t) ) * kmag() * nr;
+
+		//compute the magnitude of the p- and s-polarized components of the incident E vector
+		complex<T> Ei_s = E0.dot(x_hat);
+		//int sgn = (0 < E0.dot(y_hat)) - (E0.dot(y_hat) < 0);
+		int sgn = E0.dot(y_hat).sgn();
+		vec< complex<T> > cx_hat = x_hat;
+		complex<T> Ei_p = ( E0 - cx_hat * Ei_s ).len() * sgn;
+		//T Ei_p = ( E0 - x_hat * Ei_s ).len();
+		//compute the magnitude of the p- and s-polarized components of the reflected E vector
+		complex<T> Er_s = Ei_s * rs;
+		complex<T> Er_p = Ei_p * rp;
+		//compute the magnitude of the p- and s-polarized components of the transmitted E vector
+		complex<T> Et_s = Ei_s * ts;
+		complex<T> Et_p = Ei_p * tp;
+
+		//std::cout<<"E0: "<<E0<<std::endl;
+		//std::cout<<"E0 dot y_hat: "<<E0.dot(y_hat)<<std::endl;
+		//std::cout<<"theta i: "<<theta_i<<"  theta t: "<<theta_t<<std::endl;
+		//std::cout<<"x_hat: "<<x_hat<<"  y_hat: "<<y_hat<<"  z_hat: "<<z_hat<<std::endl;
+		//std::cout<<"Ei_s: "<<Ei_s<<"  Ei_p: "<<Ei_p<<"  Er_s: "<<Er_s<<"  Er_p: "<<Er_p<<"  Et_s: "<<Et_s<<"  Et_p: "<<Et_p<<std::endl;
+		//std::cout<<"rs: "<<rs<<"  rp: "<<rp<<"  ts: "<<ts<<"  tp: "<<tp<<std::endl;
+		
+
+		//compute the reflected E vector
+		vec< complex<T> > Er = vec< complex<T> >(y_hat * cos(theta_i) + z_hat * sin(theta_i)) * Er_p + cx_hat * Er_s;
+		//compute the transmitted E vector
+		vec< complex<T> > Et = vec< complex<T> >(y_hat * cos(theta_t) - z_hat * sin(theta_t)) * Et_p + cx_hat * Et_s;
+
+		//phase offsets at the plane's reference point P.p()
+		T phase_t = P.p().dot(k - kt);
+		T phase_r = P.p().dot(k - kr);
+
+		//std::cout<<"phase r: "<<phase_r<<"  phase t: "<<phase_t<<std::endl;
+
+		//std::cout<<"phase: "<<phase<<std::endl;
+
+		//create the plane waves
+		//(k and E0 are assigned directly here, unlike the head-on branch, which goes through the constructor)
+		r.k = kr;
+		r.E0 = Er * exp( complex<T>(0, phase_r) );
+		//r.phi = phase_r;
+
+		//t = bend(kt);
+		//t.k = t.k * nr;
+
+		t.k = kt;
+		t.E0 = Et * exp( complex<T>(0, phase_t) );
+		//t.phi = phase_t;
+		//std::cout<<"i: "<<str()<<std::endl;
+		//std::cout<<"r: "<<r.str()<<std::endl;
+		//std::cout<<"t: "<<t.str()<<std::endl;
+
+		//std::cout<<"i + r: "<<pos()[0] + r.pos()[0]<<pos()[1] + r.pos()[1]<<pos()[2] + r.pos()[2]<<std::endl;
+		//std::cout<<"t:     "<<t.pos()[0]<<t.pos()[1]<<t.pos()[2]<<std::endl;
+		//std::cout<<"--------------------------------"<<std::endl;
+
+	}
+
+	/// Returns a short text description of the plane wave (its E0 and k vectors).
+	std::string str()
+	{
+		std::stringstream out;
+		out<<"Plane Wave:"<<std::endl<<"	"<<E0<<" e^i ( "<<k<<" . r )";
+		return out.str();
+	}
+};					//end planewave class
+}					//end namespace optics
+}					//end namespace stim
+
+/// Stream insertion operator: writes the plane wave's str() output to the stream.
+/// NOTE(review): the parameter is rts::planewave<T>, while the class defined in
+/// this header lives in stim::optics - confirm which namespace is intended.
+template <typename T>
+std::ostream& operator<<(std::ostream& os, rts::planewave<T> p)
+{
+    return os<<p.str();
+}
+
+#endif
@@ -10,8 +10,8 @@ class aaboundingbox{
 public:
 	bool set;				//has the bounding box been set to include any points?
-	stim::vec<T> A;			//minimum point in the bounding box
-	stim::vec<T> B;			//maximum point in the bounding box
+	stim::vec3<T> A;			//minimum point in the bounding box
+	stim::vec3<T> B;			//maximum point in the bounding box
 	aaboundingbox(){					//constructor generates an empty bounding box
 		set = false;
@@ -21,7 +21,7 @@ public:
 	/// Test if a point is inside of the bounding box and returns true if it is.
 	/// @param p is the point to be tested
-	bool test(stim::vec<T> p){
+	bool test(stim::vec3<T> p){
 		for(unsigned d = 0; d < p.size(); p++){		//for each dimension
 			if(p[d] < A[d]) return false;			//if the point is less than the minimum bound, return false
@@ -33,7 +33,7 @@ public:
 	/// Expand the bounding box to include the specified point.
 	/// @param p is the point to be included
-	void expand(stim::vec<T> p){
+	void expand(stim::vec3<T> p){
 		if(!set){							//if the bounding box is empty, fill it with the current point
 			A = B = p;
@@ -47,12 +47,12 @@ public:
 	}
 	/// Return the center point of the bounding box as a stim::vec
-	stim::vec<T> center(){
+	stim::vec3<T> center(){
 		return (B + A) * 0.5;
 	}
 	/// Return the size of the bounding box as a stim::vec
-	stim::vec<T> size(){
+	stim::vec3<T> size(){
 		return (B - A);
 	}
@@ -11,32 +11,32 @@ namespace stim{
 class camera
 {
-	vec<float> d;	//direction that the camera is pointing
-	vec<float> p;	//position of the camera
-	vec<float> up;	//"up" direction
+	vec3<float> d;	//direction that the camera is pointing
+	vec3<float> p;	//position of the camera
+	vec3<float> up;	//"up" direction
 	float focus;		//focal length of the camera
 	float fov;
 	//private function makes sure that the up vector is orthogonal to the direction vector and both are normalized
 	void stabalize()
 	{
-		vec<float> side = up.cross(d);
+		vec3<float> side = up.cross(d);
 		up = d.cross(side);
 		up = up.norm();
 		d = d.norm();
 	}
 public:
-	void setPosition(vec<float> pos)
+	void setPosition(vec3<float> pos)
 	{
 		p = pos;
 	}
-	void setPosition(float x, float y, float z){setPosition(vec<float>(x, y, z));}
+	void setPosition(float x, float y, float z){setPosition(vec3<float>(x, y, z));}
 	void setFocalDistance(float distance){focus = distance;}
 	void setFOV(float field_of_view){fov = field_of_view;}
-	void LookAt(vec<float> pos)
+	void LookAt(vec3<float> pos)
 	{
 		//find the new direction
 		d = pos - p;
@@ -47,22 +47,22 @@ public:
 		//stabalize the camera
 		stabalize();
 	}
-	void LookAt(float px, float py, float pz){LookAt(vec<float>(px, py, pz));}
-	void LookAt(vec<float> pos, vec<float> new_up){up = new_up; LookAt(pos);}
-	void LookAt(float px, float py, float pz, float ux, float uy, float uz){LookAt(vec<float>(px, py, pz), vec<float>(ux, uy, uz));}
+	void LookAt(float px, float py, float pz){LookAt(vec3<float>(px, py, pz));}
+	void LookAt(vec3<float> pos, vec3<float> new_up){up = new_up; LookAt(pos);}
+	void LookAt(float px, float py, float pz, float ux, float uy, float uz){LookAt(vec3<float>(px, py, pz), vec3<float>(ux, uy, uz));}
 	void LookAtDolly(float lx, float ly, float lz)
 	{
 		//find the current focus point
-		vec<float> f = p + focus*d;
-		vec<float> T = vec<float>(lx, ly, lz) - f;
+		vec3<float> f = p + focus*d;
+		vec3<float> T = vec3<float>(lx, ly, lz) - f;
 		p = p + T;
 	}
-	void Dolly(vec<float> direction)
+	void Dolly(vec3<float> direction)
 	{
 		p = p+direction;
 	}
-	void Dolly(float x, float y, float z){Dolly(vec<float>(x, y, z));}
+	void Dolly(float x, float y, float z){Dolly(vec3<float>(x, y, z));}
 	void Push(float delta)
 	{
 		if(delta > focus)
@@ -80,7 +80,7 @@ public:
 		qx.CreateRotation(theta_x, up[0], up[1], up[2]);
 		//y rotation is around the side axis
-		vec<float> side = up.cross(d);
+		vec3<float> side = up.cross(d);
 		quaternion<float> qy;
 		qy.CreateRotation(theta_y, side[0], side[1], side[2]);
@@ -118,28 +118,28 @@ public:
 	void OrbitFocus(float theta_x, float theta_y)
 	{
 		//find the focal point
-		vec<float> focal_point = p + focus*d;
+		vec3<float> focal_point = p + focus*d;
 		//center the coordinate system on the focal point
-		vec<float> centered = p - (focal_point - vec<float>(0, 0, 0));
+		vec3<float> centered = p - (focal_point - vec3<float>(0, 0, 0));
 		//create the x rotation (around the up vector)
 		quaternion<float> qx;
 		qx.CreateRotation(theta_x, up[0], up[1], up[2]);
-		centered = vec<float>(0, 0, 0) + qx.toMatrix3()*(centered - vec<float>(0, 0, 0));
+		centered = vec3<float>(0, 0, 0) + qx.toMatrix3()*(centered - vec3<float>(0, 0, 0));
 		//get a side vector for theta_y rotation
-		vec<float> side = up.cross((vec<float>(0, 0, 0) - centered).norm());
+		vec3<float> side = up.cross((vec3<float>(0, 0, 0) - centered).norm());
 		quaternion<float> qy;
 		qy.CreateRotation(theta_y, side[0], side[1], side[2]);
-		centered = vec<float>(0, 0, 0) + qy.toMatrix3()*(centered - vec<float>(0, 0, 0));
+		centered = vec3<float>(0, 0, 0) + qy.toMatrix3()*(centered - vec3<float>(0, 0, 0));
 		//perform the rotation on the centered camera position
 		//centered = final.toMatrix()*centered;
 		//re-position the camera
-		p = centered + (focal_point - vec<float>(0, 0, 0));
+		p = centered + (focal_point - vec3<float>(0, 0, 0));
 		//make sure we are looking at the focal point
 		LookAt(focal_point);
@@ -151,17 +151,17 @@ public:
 	void Slide(float u, float v)
 	{
-		vec<float> V = up.norm();
-		vec<float> U = up.cross(d).norm();
+		vec3<float> V = up.norm();
+		vec3<float> U = up.cross(d).norm();
 		p = p + (V * v) + (U * u);
 	}
 	//accessor methods
-	vec<float> getPosition(){return p;}
-	vec<float> getUp(){return up;}
-	vec<float> getDirection(){return d;}
-	vec<float> getLookAt(){return p + focus*d;}
+	vec3<float> getPosition(){return p;}
+	vec3<float> getUp(){return up;}
+	vec3<float> getDirection(){return d;}
+	vec3<float> getLookAt(){return p + focus*d;}
 	float getFOV(){return fov;}
 	//output the camera settings
@@ -182,9 +182,9 @@ public:
 	//constructor
 	camera()
 	{
-		p = vec<float>(0, 0, 0);
-		d = vec<float>(0, 0, 1);
-		up = vec<float>(0, 1, 0);
+		p = vec3<float>(0, 0, 0);
+		d = vec3<float>(0, 0, 1);
+		up = vec3<float>(0, 1, 0);
 		focus = 1;
 	}
@@ -2,7 +2,7 @@
 #define STIM_CYLINDER_H
 #include <iostream>
 #include <stim/math/circle.h>
-#include <stim/math/vector.h>
+#include <stim/math/vec3.h>
 namespace stim
@@ -25,11 +25,11 @@ class cylinder
 		///inits the cylinder from a list of points (inP) and radii (inM)
 		void
-		init(std::vector<stim::vec<T> > inP, std::vector<stim::vec<T> > inM)
+		init(std::vector<stim::vec3<T> > inP, std::vector<stim::vec<T> > inM)
 		{
 			mags = inM;
-			stim::vec<float> v1;
-			stim::vec<float> v2;
+			stim::vec3<float> v1;
+			stim::vec3<float> v2;
 			e.resize(inP.size());
 			if(inP.size() < 2)
 				return;
@@ -38,16 +38,16 @@ class cylinder
 			L.resize(inP.size());
 			T temp = (T)0;
 			L[0] = 0;
-			for(int i = 1; i < L.size(); i++)
+			for(size_t i = 1; i < L.size(); i++)
 			{
 				temp += (inP[i-1] - inP[i]).len();
 				L[i] = temp;
 			}
-			stim::vec<T> dr = (inP[1] - inP[0]).norm();
-			s = stim::circle<T>(inP[0], inM[0][0], dr, stim::vec<T>(1,0,0));
+			stim::vec3<T> dr = (inP[1] - inP[0]).norm();
+			s = stim::circle<T>(inP[0], inM[0][0], dr, stim::vec3<T>(1,0,0));
 			e[0] = s;
-			for(int i = 1; i < inP.size()-1; i++)
+			for(size_t i = 1; i < inP.size()-1; i++)
 			{
 				s.center(inP[i]);
 				v1 = (inP[i] - inP[i-1]).norm();
@@ -67,7 +67,7 @@ class cylinder
 		}
 		///returns the direction vector at point idx.
-		stim::vec<T>
+		stim::vec3<T>
 		d(int idx)
 		{
 			if(idx == 0)
@@ -81,15 +81,15 @@ class cylinder
 			else
 			{
 //				return (e[idx+1].P - e[idx].P).norm();
-				stim::vec<float> v1 = (e[idx].P-e[idx-1].P).norm();
-				stim::vec<float> v2 = (e[idx+1].P-e[idx].P).norm();
+				stim::vec3<float> v1 = (e[idx].P-e[idx-1].P).norm();
+				stim::vec3<float> v2 = (e[idx+1].P-e[idx].P).norm();
 				return (v1+v2).norm();			
 			} 
 	//		return e[idx].N;	
 		}
-		stim::vec<T>
+		stim::vec3<T>
 		d(T l, int idx)
 		{
 			if(idx == 0 || idx == e.size()-1)
@@ -144,13 +144,13 @@ class cylinder
 		///constructor to create a cylinder from a set of points, radii, and the number of sides for the cylinder.
 		///@param inP:  Vector of stim vecs composing the points of the centerline.
 		///@param inM:  Vector of stim vecs composing the radii of the centerline.
-		cylinder(std::vector<stim::vec<T> > inP, std::vector<stim::vec<T> > inM){
+		cylinder(std::vector<stim::vec3<T> > inP, std::vector<stim::vec3<T> > inM){
 			init(inP, inM);
 		}
 		///Constructor defines a cylinder with centerline inP and magnitudes of zero
 		///@param inP: Vector of stim vecs composing the points of the centerline
-		cylinder(std::vector< stim::vec<T> > inP){
+		cylinder(std::vector< stim::vec3<T> > inP){
 			std::vector< stim::vec<T> > inM;						//create an array of arbitrary magnitudes
 			stim::vec<T> zero;
@@ -171,12 +171,12 @@ class cylinder
 		///Returns a position vector at the given p-value (p value ranges from 0 to 1).
 		///interpolates the position along the line.
 		///@param pvalue: the location of the in the cylinder, from 0 (beginning to 1).
-		stim::vec<T>
+		stim::vec3<T>
 		p(T pvalue)
 		{
 			if(pvalue < 0.0 || pvalue > 1.0)
 			{
-				return stim::vec<float>(-1,-1,-1);
+				return stim::vec3<float>(-1,-1,-1);
 			}
 			T l = pvalue*L[L.size()-1];
 			int idx = findIdx(l);
@@ -188,7 +188,7 @@ class cylinder
 		///Interpolates the radius along the line.
 		///@param l: the location of the in the cylinder.
 		///@param idx: integer location of the point closest to l but prior to it.
-		stim::vec<T>
+		stim::vec3<T>
 		p(T l, int idx)
 		{
 				T rat = (l-L[idx])/(L[idx+1]-L[idx]);
@@ -252,16 +252,16 @@ class cylinder
 		///in x, y, z coordinates. Theta is in degrees from 0 to 360.
 		///@param pvalue: the location of the in the cylinder, from 0 (beginning to 1).
 		///@param theta: the angle to the point of a circle.
-		stim::vec<T>
+		stim::vec3<T>
 		surf(T pvalue, T theta)
 		{
 			if(pvalue < 0.0 || pvalue > 1.0)
 			{
-				return stim::vec<float>(-1,-1,-1);
+				return stim::vec3<float>(-1,-1,-1);
 			} else {
 			T l = pvalue*L[L.size()-1];
 			int idx = findIdx(l);
-			stim::vec<T> ps = p(l, idx); 
+			stim::vec3<T> ps = p(l, idx); 
 			T m = r(l, idx);
 			s = e[idx];
 			s.center(ps);
@@ -273,10 +273,10 @@ class cylinder
 		///returns a vector of points necessary to create a circle at every position in the fiber.
 		///@param sides: the number of sides of each circle.	
-		std::vector<std::vector<vec<T> > >
+		std::vector<std::vector<vec3<T> > >
 		getPoints(int sides)
 		{
-			std::vector<std::vector <vec<T> > > points;
+			std::vector<std::vector <vec3<T> > > points;
 			points.resize(e.size());
 			for(int i = 0; i < e.size(); i++)
 			{
@@ -293,7 +293,7 @@ class cylinder
 		}
 		/// Allows a point on the centerline to be accessed using bracket notation
-		vec<T> operator[](unsigned int i){
+		vec3<T> operator[](unsigned int i){
 			return e[i].P;
 		}
@@ -309,7 +309,7 @@ class cylinder
 			T M = 0;						//initialize the integral to zero
 			T m0, m1;						//allocate space for both magnitudes in a single segment
-			//vec<T> p0, p1;					//allocate space for both points in a single segment
+			//vec3<T> p0, p1;					//allocate space for both points in a single segment
 			m0 = mags[0][m];				//initialize the first point and magnitude to the first point in the cylinder
 			//p0 = pos[0];
@@ -325,7 +325,7 @@ class cylinder
 				if(p > 1) len = (L[p-1] - L[p-2]);		//calculate the segment length using the L array
 				//add the average magnitude, weighted by the segment length
-				M += (m0 + m1)/2.0 * len;
+				M += (m0 + m1)/(T)2.0 * len;
 				m0 = m1;								//move to the next segment by shifting points
 			}
@@ -345,21 +345,21 @@ class cylinder
 		/// @param spacing is the maximum spacing allowed between sample points
 		cylinder<T> resample(T spacing){
-			std::vector< vec<T> > result;
+			std::vector< vec3<T> > result;
-			vec<T> p0 = e[0].P;								//initialize p0 to the first point on the centerline
-			vec<T> p1;
+			vec3<T> p0 = e[0].P;								//initialize p0 to the first point on the centerline
+			vec3<T> p1;
 			unsigned N = size();							//number of points in the current centerline
 			//for each line segment on the centerline
 			for(unsigned int i = 1; i < N; i++){
 				p1 = e[i].P;								//get the second point in the line segment
-				vec<T> v = p1 - p0;							//calculate the vector between these two points
+				vec3<T> v = p1 - p0;							//calculate the vector between these two points
 				T d = v.len();								//calculate the distance between these two points (length of the line segment)
-				unsigned nsteps = d / spacing+1;		//calculate the number of steps to take along the segment to meet the spacing criteria
-				T stepsize = 1.0 / nsteps;			//calculate the parametric step size between new centerline points
+				size_t nsteps = (size_t)std::ceil(d / spacing);		//calculate the number of steps to take along the segment to meet the spacing criteria
+				T stepsize = (T)1.0 / nsteps;			//calculate the parametric step size between new centerline points
 				//for each step along the line segment
 				for(unsigned s = 0; s < nsteps; s++){