Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib

David Mayerich
2 parents d31038b3 31262e83
Showing 21 changed files with 918 additions and 309 deletions
stim/biomodels/network.h
stim/cuda/cudatools/callable.h
stim/envi/bil.h
stim/envi/bip.h
stim/envi/bsq.h
stim/envi/envi.h
stim/image/image.h
stim/math/bessel.h
stim/math/circle.h
stim/math/constants.h
stim/math/matrix.h
stim/math/plane.h
stim/math/quaternion.h
stim/math/vec3.h
stim/math/vector.h
stim/optics/mie.h
stim/optics/scalarbeam.h
stim/optics/scalarwave.h
stim/visualization/aaboundingbox.h
stim/visualization/camera.h
@@ -8,7 +8,7 @@
 #include <algorithm>
 #include <string.h>
 #include <math.h>
-#include <stim/math/vector.h>
+#include <stim/math/vec3.h>
 #include <stim/visualization/obj.h>
 #include <stim/visualization/cylinder.h>
 #include <ANN/ANN.h>
@@ -37,7 +37,7 @@ class network{
 		/// Constructor - creates an edge from a list of points by calling the stim::fiber constructor
  
 		///@param p is an array of positions in space
-		edge(std::vector< stim::vec<T> > p) : cylinder<T>(p){}
+		edge(std::vector< stim::vec3<T> > p) : cylinder<T>(p){}
  
 		/// Copy constructor creates an edge from a fiber
 		edge(stim::cylinder<T> f) : cylinder<T>(f) {}
@@ -61,20 +61,20 @@ class network{
 	};
  
 	///Node class that stores the physical position of the node as well as the edges it is connected to (edges that connect to it), As well as any additional data necessary.
-	class vertex : public stim::vec<T>
+	class vertex : public stim::vec3<T>
 	{
 		public:
 			//std::vector<unsigned int> edges;		//indices of edges connected to this node.
 			std::vector<unsigned int> e[2];			//indices of edges going out (e[0]) and coming in (e[1])
-			//stim::vec<T> p;						//position of this node in physical space.
+			//stim::vec3<T> p;						//position of this node in physical space.
  
 			//constructor takes a stim::vec
-			vertex(stim::vec<T> p) : stim::vec<T>(p){}
+			vertex(stim::vec3<T> p) : stim::vec3<T>(p){}
  
 			/// Output the vertex information as a string
 			std::string	str(){
 				std::stringstream ss;
-				ss<<"\t(x, y, z) = "<<stim::vec<T>::str();
+				ss<<"\t(x, y, z) = "<<stim::vec3<T>::str();
  
 				if(e[0].size() > 0){
 					ss<<"\t> ";
@@ -129,7 +129,11 @@ public:
 			std::vector< stim::vec<T> > c;				//allocate an array of points for the vessel centerline
 			O.getLine(l, c);							//get the fiber centerline
  
-			edge new_edge = c;							//create an edge from the given centerline
+			std::vector< stim::vec3<T> > c3(c.size());
+			for(size_t j = 0; j < c.size(); j++)
+				c3[j] = c[j];
+
+			edge new_edge = c3;							//create an edge from the given centerline
 			unsigned int I = new_edge.size();			//calculate the number of points on the centerline
  
 			//get the first and last vertex IDs for the line
@@ -222,7 +226,7 @@ public:
 	float gaussianFunction(float x, float std=25){ return exp(-x/(2*std*std));} // by default std = 25
  
     // stim 3d vector to annpoint of 3 dimensions
-	void stim2ann(ANNpoint &a, stim::vec<T> b){
+	void stim2ann(ANNpoint &a, stim::vec3<T> b){
 		a[0] = b[0];
 		a[1] = b[1];
 		a[2] = b[2];
@@ -278,10 +282,9 @@ public:
 		ANNdistArray dists = new ANNdist[1];     // near neighbor distances
 		ANNidxArray nnIdx = new ANNidx[1];				// near neighbor indices // allocate near neigh indices
  
-		stim::vec<T> p0, p1;
-		float m0, m1;
+		stim::vec3<T> p0, p1;
+		float m1;
 		float M = 0;											//stores the total metric value
-		float l;												//stores the segment length
 		float L = 0;											//stores the total network length
 		ANNpoint queryPt = annAllocPt(3);
 		for(unsigned e = 0; e < R.E.size(); e++){					//for each edge in A
@@ -292,7 +295,7 @@ public:
 				p1 = R.E[e][p];									//get the next point in the edge
 				stim2ann(queryPt, p1);
 				kdt->annkSearch( queryPt, 1, nnIdx, dists, eps);	//find the distance between A and the current network
-				m1 = 1.0f - gaussianFunction(dists[0], sigma);		//calculate the metric value based on the distance
+				m1 = 1.0f - gaussianFunction((float)dists[0], sigma);		//calculate the metric value based on the distance
 				R.E[e].set_mag(m1, p, 1);						//set the error for the second point in the segment
  
 			}
@@ -2,7 +2,7 @@
  
 //define the CUDA_CALLABLE macro (will prefix all members)
 #ifdef __CUDACC__
-#define CUDA_CALLABLE __host__ __device__
+#define CUDA_CALLABLE __host__ __device__ inline
 #else
 #define CUDA_CALLABLE
 #endif
@@ -884,7 +884,7 @@ public:
 	///		using the following indexing: i = p*B + b
 	/// @param matrix is the destination for the pixel data
 	/// @param mask is the mask
-	bool sift(T* matrix, unsigned char* mask = NULL){
+	bool sift(T* matrix, unsigned char* mask = NULL, bool PROGRESS = false){
 		size_t Lbytes = sizeof(T) * X();
 		T* line = (T*) malloc( Lbytes );					//allocate space for a line
  
@@ -903,6 +903,7 @@ public:
 						pl++;								//increment the pixel pointer
 					}
 				}
+				if(PROGRESS) progress = (double)( (y+1)*Z() + 1) / (double)(Y() * Z()) * 100;
 			}
 			p += pl;										//add the line increment to the running pixel index
 		}
@@ -817,7 +817,7 @@ public:
 	///		using the following indexing: i = p*B + b
 	/// @param matrix is the destination for the pixel data
 	/// @param mask is the mask
-	bool sift(T* matrix, unsigned char* mask = NULL){
+	bool sift(T* matrix, unsigned char* mask = NULL, bool PROGRESS = false){
 		size_t Bbytes = sizeof(T) * Z();
 		size_t XY = X() * Y();
 		T* band = (T*) malloc( Bbytes );					//allocate space for a line
@@ -836,6 +836,7 @@ public:
 			}
 			else
 				file.seekg(Bbytes, std::ios::cur);			//otherwise skip this band
+			if(PROGRESS) progress = (double)(xy+1) / (double)XY * 100;
 		}
 		return true;
 	}
@@ -809,7 +809,7 @@ public:
 	///		using the following indexing: i = p*B + b
 	/// @param matrix is the destination for the pixel data
 	/// @param mask is the mask
-	bool sift(T* matrix, unsigned char* mask = NULL){
+	bool sift(T* matrix, unsigned char* mask = NULL, bool PROGRESS = false){
 		unsigned long long XY = X() * Y(); 					//Number of XY pixels
 		unsigned long long L = XY * sizeof(T); 				//size of XY plane (in bytes)
  
@@ -827,9 +827,8 @@ public:
 				if(mask == NULL || mask[xy] != 0){				//if the pixel is valid
 					matrix[i*Z() + b] = band_image[xy];			//copy it to the appropriate point in the values[] array
 					i++;
-					//std::cout<<i<<std::endl;
 				}
-
+				if(PROGRESS) progress = (double)(xy+1) / (double)XY * 100;
 			}
 		}
  
@@ -670,13 +670,13 @@ public:
 	///		using the following indexing: i = b*P + p
 	/// @param matrix is the destination for the pixel data
 	/// @param p is the mask
-	bool sift(void* matrix, unsigned char* p = NULL){
+	bool sift(void* matrix, unsigned char* p = NULL, bool PROGRESS = false){
  
 		if (header.interleave == envi_header::BSQ){		//if the infile is bsq file
 			if (header.data_type == envi_header::float32)
-				return ((bsq<float>*)file)->sift((float*)matrix, p);
+				return ((bsq<float>*)file)->sift((float*)matrix, p, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bsq<double>*)file)->sift((double*)matrix, p);
+				return ((bsq<double>*)file)->sift((double*)matrix, p, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -685,9 +685,9 @@ public:
  
 		if (header.interleave == envi_header::BIP){
 			if (header.data_type == envi_header::float32)
-				return ((bip<float>*)file)->sift((float*)matrix, p);
+				return ((bip<float>*)file)->sift((float*)matrix, p, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bip<double>*)file)->sift((double*)matrix, p);
+				return ((bip<double>*)file)->sift((double*)matrix, p, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -695,9 +695,9 @@ public:
 		}
 		if (header.interleave == envi_header::BIL){
 			if (header.data_type == envi_header::float32)
-				return ((bil<float>*)file)->sift((float*)matrix, p);
+				return ((bil<float>*)file)->sift((float*)matrix, p, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bil<double>*)file)->sift((double*)matrix, p);
+				return ((bil<double>*)file)->sift((double*)matrix, p, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -25,8 +25,6 @@ class image{
 	size_t Y() const { return R[2]; }
 	size_t C() const { return R[0]; }
  
-	size_t bytes(){ return size() * sizeof(T); }
-
 	void init(){								//initializes all variables, assumes no memory is allocated
 		memset(R, 0, sizeof(size_t) * 3);		//set the resolution and number of channels to zero
 		img = NULL;
@@ -34,7 +32,6 @@ class image{
  
 	void unalloc(){								//frees any resources associated with the image
 		if(img)	free(img);						//if memory has been allocated, free it
-		img=NULL;		
 	}
  
  
@@ -45,16 +42,15 @@ class image{
  
 	void allocate(){
 		unalloc();
-		img = (T*) malloc( bytes() );	//allocate memory
-		memset(img, 0, bytes());
+		img = (T*) malloc( sizeof(T) * R[0] * R[1] * R[2] );	//allocate memory
 	}
  
 	void allocate(size_t x, size_t y, size_t c){	//allocate memory based on the resolution
-		unalloc();
 		R[0] = c; R[1] = x; R[2] = y;				//set the resolution
 		allocate();									//allocate memory
 	}
  
+	size_t bytes(){ return size() * sizeof(T); }
  
 	size_t idx(size_t x, size_t y, size_t c = 0){
 		return y * C() * X() + x * C() + c;
@@ -106,15 +102,14 @@ class image{
  
 		std::cout<<"ERROR in stim::image::white - no white value known for this data type"<<std::endl;
 		exit(1);
+
 	}
  
  
 public:
  
 	/// Default constructor - creates an empty image object
-	image(){
-		init();						//initialize all variables to zero, don't allocate any memory
-	}							
+	image(){ init(); }							//initialize all variables to zero, don't allocate any memory
  
 	/// Constructor with a filename - loads the specified file
 	image(std::string filename){				//constructor initialize the image with an image file
@@ -136,7 +131,7 @@ public:
 	}
  
 	/// Copy constructor - duplicates an image object
-	image(const stim::image<T> &I){
+	image(const stim::image<T>& I){
 		init();
 		allocate(I.X(), I.Y(), I.C());
 		memcpy(img, I.img, bytes());
@@ -148,6 +143,7 @@ public:
 	}
  
 	stim::image<T>& operator=(const stim::image<T>& I){
+		init();
 		if(&I == this)									//handle self-assignment
 			return *this;
 		allocate(I.X(), I.Y(), I.C());
@@ -160,22 +156,15 @@ public:
  
 		cv::Mat cvImage = cv::imread(filename, CV_LOAD_IMAGE_UNCHANGED);	//use OpenCV to open the image file
 		if(!cvImage.data){
-			std::cout<<"ERROR stim::image::load() - unable to find image "<<filename<<" ["<<__FILE__<<" (line "<<__LINE__<<")]"<<std::endl;
+			std::cout<<"ERROR stim::image::load() - unable to find image "<<filename<<std::endl;
 			exit(1);
 		}
 		allocate(cvImage.cols, cvImage.rows, cvImage.channels());			//allocate space for the image
-		T* cv_ptr = (T*) cvImage.data;
-		if(C() == 1)
-		{
-	//if this is a single-color image, just copy the data
-			memcpy(img, cv_ptr, bytes());     
-		}
-		if(C() == 3)
-		{														//if this is a 3-color image, OpenCV uses BGR interleaving
+		T* cv_ptr = (T*)cvImage.data;
+		if(C() == 1)														//if this is a single-color image, just copy the data
+			memcpy(img, cv_ptr, bytes());
+		if(C() == 3)														//if this is a 3-color image, OpenCV uses BGR interleaving
 			set_interleaved_bgr(cv_ptr, X(), Y());
-		}
-
-		cvImage.release();	
 	}
  
 	//save a file
@@ -189,18 +178,16 @@ public:
 			get_interleaved_bgr(buffer);
 		cv::Mat cvImage((int)Y(), (int)X(), cv_type(), buffer);
 		cv::imwrite(filename, cvImage);
-		cvImage.release();
-		free(buffer);
 	}
  
 	//create an image from an interleaved buffer
-	void set_interleaved_rgb(T* buffer, size_t width, size_t height, size_t channels = 3){
-		allocate(width, height, channels);
+	void set_interleaved_rgb(T* buffer, size_t width, size_t height){
+		allocate(width, height, 3);
 		memcpy(img, buffer, bytes());
 	}
  
-	void set_interleaved_bgr(T* buffer, size_t width, size_t height, size_t channels = 3){
-		allocate(width, height, channels);
+	void set_interleaved_bgr(T* buffer, size_t width, size_t height){
+		allocate(width, height, 3);
 		for(size_t c = 0; c < C(); c++){								//copy directly
 			for(size_t y = 0; y < Y(); y++){
 				for(size_t x = 0; x < X(); x++){
@@ -380,34 +367,6 @@ public:
  
 		return r;								//return the inverted image
 	}
-	
-	/// Invert an image by calculating I1 = alpha - I0, where alpha is the maximum image value
-	image<T> invert(){
-		size_t N = size();						//calculate the total number of values in the image
-		image<T> r(X(), Y(), C());				//allocate space for the resulting image
-		T white_val = maxv();
-		for(size_t n = 0; n < N; n++)
-			r.img[n] = white_val - img[n];		//perform the inversion
-
-		return r;								//return the inverted image
-	}
-
-	///crops the image from x1 to x0 and y1 to y0 and returns a new (smaller) image.
-	image<T> crop(int x0, int x1, int y0, int y1)
-	{
-		
-		image<T> ret(x1-x0, y1-y0, C());
-		int newWidth = x1-x0;
-		int destidx, srcidx;
-		///for each row, cut what amount of data from the original and put it into the new copy.
-		for(int i = 0; i < (y1-y0); i++)
-		{
-			destidx = i*newWidth*C(); ///destination index one per each row
-			srcidx = ((i+(y0))*X()+x0)*C(); ///source index, one per each row.
-			memcpy(&ret.img[destidx], &img[srcidx], sizeof(T)*newWidth*C());
-		}		
-		return ret;
-	}
  
 	image<T> srgb2lab(){
 		std::cout<<"ERROR stim::image::srgb2lab - function has been broken, re-implement."<<std::endl;
@@ -426,7 +385,6 @@ public:
 		exit(1);
 	}
  
-
 	// leila's code for non_interleaving data in 3D
 	//create an data set from an interleaved buffer
 	void set_interleaved3(T* buffer, size_t width, size_t height, size_t depth, size_t channels = 3){
@@ -1258,7 +1258,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
     P a0,v0,pv0,pv1,vl,ga,gb,vg,vv,w0,w1,ya0,yak,ya1,wa;
     int j,n,k,kz,l,lb,lb0,m;
  
-    a0 = abs(z);
+    a0 = ::abs(z);
     z1 = z;
     z2 = z*z;
     n = (int)v;
@@ -1286,7 +1286,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
         vm = v;
         return 0;
     }
-    if (real(z1) < 0.0) z1 = -z;
+    if (::real(z1) < 0.0) z1 = -z;
     if (a0 <= 12.0) {
         for (l=0;l<2;l++) {
             vl = v0+l;
@@ -1295,7 +1295,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
             for (k=1;k<=40;k++) {
                 cr *= -0.25*z2/(k*(k+vl));
                 cjvl += cr;
-                if (abs(cr) < abs(cjvl)*eps) break;
+                if (::abs(cr) < ::abs(cjvl)*eps) break;
             }
            vg = 1.0 + vl;
            ga = gamma(vg);
@@ -1348,7 +1348,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
                 for (k=1;k<=40;k++) {
                     cr *= -0.25*z2/(k*(k-vl));
                     cjvl += cr;
-                    if (abs(cr) < abs(cjvl)*eps) break;
+                    if (::abs(cr) < ::abs(cjvl)*eps) break;
                 }
                 vg = 1.0-vl;
                 gb = gamma(vg);
@@ -1381,16 +1381,16 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
             cyv1 = M_2_PI*(cec*cjv1-1.0/z1-0.25*z1*cs1);
         }
     }
-    if (real(z) < 0.0) {
+    if (::real(z) < 0.0) {
         cfac0 = exp(pv0*cii);
         cfac1 = exp(pv1*cii);
-        if (imag(z) < 0.0) {
+        if (::imag(z) < 0.0) {
             cyv0 = cfac0*cyv0-(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
             cyv1 = cfac1*cyv1-(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
             cjv0 /= cfac0;
             cjv1 /= cfac1;
         }
-        else if (imag(z) > 0.0) {
+        else if (::imag(z) > 0.0) {
             cyv0 = cyv0/cfac0+(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
             cyv1 = cyv1/cfac1+(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
             cjv0 *= cfac0;
@@ -1421,7 +1421,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
             cf2 = cf1;
             cf1 = cf;
         }
-        if (abs(cjv0) > abs(cjv1)) cs = cjv0/cf;
+        if (::abs(cjv0) > ::abs(cjv1)) cs = cjv0/cf;
         else cs = cjv1/cf2;
         for (k=0;k<=n;k++) {
             cjv[k] *= cs;
@@ -1433,21 +1433,21 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
     }
     cyv[0] = cyv0;
     cyv[1] = cyv1;
-    ya0 = abs(cyv0);
+    ya0 = ::abs(cyv0);
     lb = 0;
     cg0 = cyv0;
     cg1 = cyv1;
     for (k=2;k<=n;k++) {
         cyk = 2.0*(v0+k-1.0)*cg1/z-cg0;
-        yak = abs(cyk);
-        ya1 = abs(cg0);
+        yak = ::abs(cyk);
+        ya1 = ::abs(cg0);
         if ((yak < ya0) && (yak< ya1)) lb = k;
         cyv[k] = cyk;
         cg0 = cg1;
         cg1 = cyk;
     }
     lb0 = 0;
-    if ((lb > 4) && (imag(z) != 0.0)) {
+    if ((lb > 4) && (::imag(z) != 0.0)) {
         while(lb != lb0) {
             ch2 = cone;
             ch1 = czero;
@@ -1470,7 +1470,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
             cp21 = ch2;
             if (lb == n)
                 cjv[lb+1] = 2.0*(lb+v0)*cjv[lb]/z-cjv[lb-1];
-            if (abs(cjv[0]) > abs(cjv[1])) {
+            if (::abs(cjv[0]) > ::abs(cjv[1])) {
                 cyv[lb+1] = (cjv[lb+1]*cyv0-2.0*cp11/(M_PI*z))/cjv[0];
                 cyv[lb] = (cjv[lb]*cyv0+2.0*cp12/(M_PI*z))/cjv[0];
             }
@@ -1495,8 +1495,8 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
                 cyl2 = cylk;
             }
             for (k=2;k<=n;k++) {
-                wa = abs(cyv[k]);
-                if (wa < abs(cyv[k-1])) lb = k;
+                wa = ::abs(cyv[k]);
+                if (wa < ::abs(cyv[k-1])) lb = k;
             }
         }
     }
@@ -1515,12 +1515,18 @@ int cbessjyva_sph(int v,complex<P> z,P &vm,complex<P>*cjv,
     //first, compute the bessel functions of fractional order
     cbessjyva<P>(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
  
+	if(z == 0){													//handle degenerate case of z = 0
+		memset(cjv, 0, sizeof(P) * (v+1));
+		cjv[0] = 1;
+	}
+
     //iterate through each and scale
     for(int n = 0; n<=v; n++)
     {
-
-        cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
-        cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
+		if(z != 0){												//handle degenerate case of z = 0
+			cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
+			cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
+		}
  
         cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0));
         cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0));
@@ -17,7 +17,7 @@ class circle : plane<T>
  
 private:
  
-	stim::vec<T> Y;
+	stim::vec3<T> Y;
  
 	CUDA_CALLABLE void
 	init()
@@ -48,7 +48,7 @@ public:
 	circle(T size, T z_pos = (T)0) : plane<T>()
 	{
 		init();
-		center(stim::vec<T>(0,0,z_pos));
+		center(stim::vec3<T>(0,0,z_pos));
 		scale(size);
 	}
  
@@ -56,7 +56,7 @@ public:
 	///@param c: x,y,z location of the center.
 	///@param n: x,y,z direction of the normal.	
 	CUDA_CALLABLE
-	circle(vec<T> c, vec<T> n = vec<T>(0,0,1)) : plane<T>()
+	circle(vec3<T> c, vec3<T> n = vec3<T>(0,0,1)) : plane<T>()
 	{
 		center(c);
 		normal(n);
@@ -68,7 +68,7 @@ public:
 	///@param s: size of the rectangle.
 	///@param n: x,y,z direction of the normal.
 	CUDA_CALLABLE 
-	circle(vec<T> c, T s, vec<T> n = vec<T>(0,0,1)) : plane<T>()
+	circle(vec3<T> c, T s, vec3<T> n = vec3<T>(0,0,1)) : plane<T>()
 	{
 		init();
 		center(c);
@@ -82,7 +82,7 @@ public:
 	///@param n: x,y,z direction of the normal.
 	///@param u: x,y,z direction for the zero vector (from where the rotation starts)
 	CUDA_CALLABLE
-	circle(vec<T> c, T s, vec<T> n = vec<T>(0,0,1), vec<T> u = vec<T>(1, 0, 0)) : plane<T>()
+	circle(vec3<T> c, T s, vec3<T> n = vec3<T>(0,0,1), vec3<T> u = vec3<T>(1, 0, 0)) : plane<T>()
 	{
 		init();
 		setU(u);
@@ -103,16 +103,15 @@ public:
 	///sets the normal for the cirlce
 	///@param n: x,y,z direction of the normal.
 	CUDA_CALLABLE void
-	normal(vec<T> n)
+	normal(vec3<T> n)
 	{
 		rotate(n, Y);
 	}
  
 	///sets the center of the circle.
 	///@param n: x,y,z location of the center.
-	CUDA_CALLABLE T
-	center(vec<T> p)
-	{
+	CUDA_CALLABLE void
+	center(vec3<T> p){
 		this->P = p;
 	}
  
@@ -127,17 +126,17 @@ public:
 	}
  
 	///get the world space value given the planar coordinates a, b in [0, 1]
-	CUDA_CALLABLE stim::vec<T> p(T a, T b)
+	CUDA_CALLABLE stim::vec3<T> p(T a, T b)
 	{
-		stim::vec<T> result;
+		stim::vec3<T> result;
  
-		vec<T> A = this->P - this->U * (T)0.5 - Y * (T)0.5;
+		vec3<T> A = this->P - this->U * (T)0.5 - Y * (T)0.5;
 		result = A + this->U * a + Y * b;
 		return result;
 	}
  
 	///parenthesis operator returns the world space given rectangular coordinates a and b in [0 1]
-	CUDA_CALLABLE stim::vec<T> operator()(T a, T b)
+	CUDA_CALLABLE stim::vec3<T> operator()(T a, T b)
 	{
 		return p(a,b);
 	}
@@ -145,11 +144,11 @@ public:
 	///returns a vector with the points on the initialized circle.
 	///connecting the points results in a circle.
 	///@param n: integer for the number of points representing the circle.
-	std::vector<stim::vec<T> >
+	std::vector<stim::vec3<T> >
 	getPoints(int n)
 	{
-		std::vector<stim::vec<T> > result;
-		stim::vec<T> point;
+		std::vector<stim::vec3<T> > result;
+		stim::vec3<T> point;
 		T x,y;
 		float step = 360.0/(float) n;
 		for(float j = 0; j <= 360.0; j += step)
@@ -164,7 +163,7 @@ public:
 	///returns a vector with the points on the initialized circle.
 	///connecting the points results in a circle.
 	///@param n: integer for the number of points representing the circle.
-	stim::vec<T>
+	stim::vec3<T>
 	p(T theta)
 	{
 		T x,y;
 #ifndef STIM_CONSTANTS_H
 #define STIM_CONSTANTS_H
  
+#include "stim/cuda/cudatools/callable.h"
 namespace stim{
 	const double PI		=	3.1415926535897932384626433832795028841971693993751058209749445923078164062862;
 	const double TAU	=	2 * stim::PI;
@@ -5,6 +5,7 @@
 #include <string.h>
 #include <iostream>
 #include <stim/math/vector.h>
+#include <stim/math/vec3.h>
 #include <stim/cuda/cudatools/callable.h>
  
 namespace stim{
@@ -2,7 +2,7 @@
 #define STIM_PLANE_H
  
 #include <iostream>
-#include <stim/math/vector.h>
+#include <stim/math/vec3.h>
 #include <stim/cuda/cudatools/callable.h>
 #include <stim/math/quaternion.h>
  
@@ -22,17 +22,17 @@ template <typename T>
 class plane
 {
 	protected:
-		stim::vec<T> P;
-		stim::vec<T> N;
-		stim::vec<T> U;
+		stim::vec3<T> P;
+		stim::vec3<T> N;
+		stim::vec3<T> U;
  
 		///Initializes the plane with standard coordinates.
 		///
 		CUDA_CALLABLE void init()
 		{
-			P = stim::vec<T>(0, 0, 0);
-			N = stim::vec<T>(0, 0, 1);
-			U = stim::vec<T>(1, 0, 0);
+			P = stim::vec3<T>(0, 0, 0);
+			N = stim::vec3<T>(0, 0, 1);
+			U = stim::vec3<T>(1, 0, 0);
 		}
  
 	public:
@@ -42,7 +42,7 @@ class plane
 			init();
 		}
  
-		CUDA_CALLABLE plane(vec<T> n, vec<T> p = vec<T>(0, 0, 0))
+		CUDA_CALLABLE plane(vec3<T> n, vec3<T> p = vec3<T>(0, 0, 0))
 		{
 			init();
 			P = p;
@@ -56,11 +56,11 @@ class plane
 		}
  
 		//create a plane from three points (a triangle)
-		CUDA_CALLABLE plane(vec<T> a, vec<T> b, vec<T> c)
+		CUDA_CALLABLE plane(vec3<T> a, vec3<T> b, vec3<T> c)
 		{
 			init();
 			P = c;
-			stim::vec<T> n = (c - a).cross(b - a);
+			stim::vec3<T> n = (c - a).cross(b - a);
 			try
 			{
 				if(n.len() != 0)
@@ -84,17 +84,17 @@ class plane
  
 		}
  
-		CUDA_CALLABLE vec<T> n()
+		CUDA_CALLABLE vec3<T> n()
 		{
 			return N;
 		}
  
-		CUDA_CALLABLE vec<T> p()
+		CUDA_CALLABLE vec3<T> p()
 		{
 			return P;
 		}
  
-		CUDA_CALLABLE vec<T> u()
+		CUDA_CALLABLE vec3<T> u()
 		{
 			return U;
 		}
@@ -107,7 +107,7 @@ class plane
 		}
  
 		//determines how a vector v intersects the plane (1 = intersects front, 0 = within plane,     -1 = intersects back)
-		CUDA_CALLABLE int face(vec<T> v){
+		CUDA_CALLABLE int face(vec3<T> v){
  
 			T dprod = v.dot(N);             //get the dot product between v and N
  
@@ -121,46 +121,46 @@ class plane
 		}
  
 		//determine on which side of the plane a point lies (1 = front, 0 = on the plane, -1 = bac    k)
-		CUDA_CALLABLE int side(vec<T> p){
+		CUDA_CALLABLE int side(vec3<T> p){
  
-			vec<T> v = p - P;    //get the vector from P to the query point p
+			vec3<T> v = p - P;    //get the vector from P to the query point p
  
 			return face(v);
 		}
  
 		//compute the component of v that is perpendicular to the plane
-		CUDA_CALLABLE vec<T> perpendicular(vec<T> v){
+		CUDA_CALLABLE vec3<T> perpendicular(vec3<T> v){
 			return N * v.dot(N);
 		}
  
 		//compute the projection of v in the plane
-		CUDA_CALLABLE vec<T> parallel(vec<T> v){
+		CUDA_CALLABLE vec3<T> parallel(vec3<T> v){
 			return v - perpendicular(v);
 		}
  
-		CUDA_CALLABLE void setU(vec<T> v)
+		CUDA_CALLABLE void setU(vec3<T> v)
 		{
 			U = (parallel(v.norm())).norm();		
 		}
  
-		CUDA_CALLABLE void decompose(vec<T> v, vec<T>& para, vec<T>& perp){
+		CUDA_CALLABLE void decompose(vec3<T> v, vec3<T>& para, vec3<T>& perp){
 			perp = N * v.dot(N);
 			para = v - perp;
 		}
  
 		//get both the parallel and perpendicular components of a vector v w.r.t. the plane
-		CUDA_CALLABLE void project(vec<T> v, vec<T> &v_par, vec<T> &v_perp){
+		CUDA_CALLABLE void project(vec3<T> v, vec3<T> &v_par, vec3<T> &v_perp){
  
 			v_perp = v.dot(N);
 			v_par = v - v_perp;
 		}
  
 		//compute the reflection of v off of the plane
-		CUDA_CALLABLE vec<T> reflect(vec<T> v){
+		CUDA_CALLABLE vec3<T> reflect(vec3<T> v){
  
 			//compute the reflection using N_prime as the plane normal
-			vec<T> par = parallel(v);
-			vec<T> r = (-v) + par * 2;
+			vec3<T> par = parallel(v);
+			vec3<T> r = (-v) + par * 2;
 			return r;
  
 		}
@@ -184,7 +184,7 @@ class plane
 		}
  
  
-		CUDA_CALLABLE void rotate(vec<T> n)
+		CUDA_CALLABLE void rotate(vec3<T> n)
 		{
 			quaternion<T> q;
 			q.CreateRotation(N, n);
@@ -194,7 +194,7 @@ class plane
  
 		}
  
-		CUDA_CALLABLE void rotate(vec<T> n, vec<T> &Y)
+		CUDA_CALLABLE void rotate(vec3<T> n, vec3<T> &Y)
 		{
 			quaternion<T> q;
 			q.CreateRotation(N, n);
@@ -205,7 +205,7 @@ class plane
  
 		}
  
-		CUDA_CALLABLE void rotate(vec<T> n, vec<T> &X, vec<T> &Y)
+		CUDA_CALLABLE void rotate(vec3<T> n, vec3<T> &X, vec3<T> &Y)
 		{
 			quaternion<T> q;
 			q.CreateRotation(N, n);
@@ -43,6 +43,8 @@ public:
  
 	CUDA_CALLABLE void CreateRotation(vec3<T> from, vec3<T> to){
  
+		from = from.norm();
+		to = to.norm();
 		vec3<T> r = from.cross(to);			//compute the rotation vector
 		T theta = asin(r.len());				//compute the angle of the rotation about r
 		//deal with a zero vector (both k and kn point in the same direction)
@@ -217,12 +217,12 @@ public:
 	std::string str() const{
 		std::stringstream ss;
  
-		size_t N = size();
+		const size_t N = 3;
  
 		ss<<"[";
 		for(size_t i=0; i<N; i++)
 		{
-			ss<<at(i);
+			ss<<ptr[i];
 			if(i != N-1)
 				ss<<", ";
 		}
@@ -230,7 +230,10 @@ public:
  
 		return ss.str();
 	}
-	};						//end class triple
+
+	size_t size(){ return 3; }
+
+	};						//end class vec3
 }							//end namespace stim
  
 /// Multiply a vector by a constant when the vector is on the right hand side
@@ -317,6 +317,15 @@ struct vec : public std::vector&lt;T&gt;
 		return *this;
 	}
  
+	/// Cast to a vec3
+
+	/// Copies up to the first three components of this vector into a vec3. Components
+	///		that this vector does not provide are set to zero.
+	operator vec3<T>() const{
+		vec3<T> r;
+		//size() and at() live in the dependent base class std::vector<T>, so they are qualified with this->
+		const size_t N = std::min<size_t>(this->size(), 3);		//number of components available to copy
+		for(size_t i = 0; i < 3; i++)
+			r[i] = (i < N) ? this->at(i) : (T)0;				//copy when available, otherwise zero-fill
+		return r;
+	}
+
 	/// Casting and assignment
 	template<typename Y>
 	vec<T> & operator=(vec<Y> rhs){
+#ifndef STIM_MIE_H
+#define STIM_MIE_H
+
+#include "scalarwave.h"
+#include "../math/bessel.h"
+#include "../cuda/cudatools/devices.h"
+#include <cmath>
+
+namespace stim{
+
+
+/// Calculate the scattering coefficients for a spherical scatterer
+
+/// @param B is a pointer to Nl + 1 values where the coefficients will be stored
+/// @param a is the radius of the sphere
+/// @param k is the wavenumber of the incident field
+/// @param n is the complex refractive index of the sphere
+/// @param Nl is the maximum order (coefficients are calculated for orders [0 Nl])
+template<typename T>
+void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){
+
+	//temporary variables
+	double vm;															//allocate space to store the return values for the bessel function calculation
+	double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
+
+	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+
+	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
+	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)
+
+	stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
+	stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a
+
+	stim::complex<double> h_ka, dh_ka;
+	stim::complex<double> numerator, denominator;
+	stim::complex<double> i(0, 1);
+	for(int l = 0; l <= Nl; l++){
+		h_ka.r = j_ka[l];												//h = j + i*y (built from the first and second kind results)
+		h_ka.i = y_ka[l];
+		dh_ka.r = dj_ka[l];												//the derivative is assembled the same way
+		dh_ka.i = dy_ka[l];
+
+		numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l];
+		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
+		B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
+		std::cout<<B[l]<<std::endl;										//NOTE(review): looks like leftover debug output - confirm before removing
+	}
+
+	//release the temporary bessel function tables (previously leaked on every call)
+	free(j_ka);
+	free(y_ka);
+	free(dj_ka);
+	free(dy_ka);
+	free(j_kna);
+	free(y_kna);
+	free(dj_kna);
+	free(dy_kna);
+}
+
+/// Calculate the coefficients for the field inside a spherical scatterer (used by cpu_scalar_mie_internal)
+
+/// @param A is a pointer to Nl + 1 values where the coefficients will be stored
+/// @param a is the radius of the sphere
+/// @param k is the wavenumber of the incident field
+/// @param n is the complex refractive index of the sphere
+/// @param Nl is the maximum order (coefficients are calculated for orders [0 Nl])
+template<typename T>
+void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){
+	//temporary variables
+	double vm;															//allocate space to store the return values for the bessel function calculation
+	double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
+
+	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+
+	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
+	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)
+
+	stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
+	stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a
+
+	stim::complex<double> h_ka, dh_ka;
+	stim::complex<double> numerator, denominator;
+	stim::complex<double> i(0, 1);
+	for(size_t l = 0; l <= Nl; l++){
+		h_ka.r = j_ka[l];												//h = j + i*y (built from the first and second kind results)
+		h_ka.i = y_ka[l];
+		dh_ka.r = dj_ka[l];												//the derivative is assembled the same way
+		dh_ka.i = dy_ka[l];
+
+		numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka;
+		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
+		A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
+	}
+
+	//release the temporary bessel function tables (previously leaked on every call)
+	free(j_ka);
+	free(y_ka);
+	free(dj_ka);
+	free(dy_ka);
+	free(j_kna);
+	free(y_kna);
+	free(dj_kna);
+	free(dy_kna);
+}
+
+#define LOCAL_NL	16
+/// CUDA kernel that adds the field scattered by a sphere to the field stored at each sample point
+
+/// @param E is the field at each point (the scattered field is added to the existing value)
+/// @param N is the number of points
+/// @param x, y, z are the coordinates of each point relative to the sphere (a NULL array is treated as all zeros)
+/// @param W is an array of nW plane waves (the wavenumber of W[0] is used to index the look-up table)
+/// @param a is the radius of the sphere (points with r < a are skipped)
+/// @param n is the complex refractive index of the sphere (not read by this kernel)
+/// @param hB is a look-up table of N_hB rows, each storing (Nl + 1) values, sampled from kr_min in steps of dkr
+/// @param Nl is the maximum order; orders [0 LOCAL_NL] are always read, so Nl should be at least LOCAL_NL
+/// Each thread uses (Nl - LOCAL_NL) complex values of the dynamic shared memory array.
+template<typename T>
+__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, int Nl){
+	extern __shared__ stim::complex<T> shared_hB[];		//shared storage for the interpolated LUT values of orders above LOCAL_NL
+
+	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
+	if(i >= N) return;													//exit if this thread is outside the array
+	stim::vec3<T> p;
+	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
+	(y == NULL) ? p[1] = 0 : p[1] = y[i];
+	(z == NULL) ? p[2] = 0 : p[2] = z[i];
+	
+	T r = p.len();														//calculate the distance from the sphere
+	if(r < a) return;													//exit if the point is inside the sphere (only the external field is calculated here)
+	T k = W[0].kmag();
+	size_t NC = Nl + 1;													//calculate the number of coefficients to be used
+	T kr = r * k;														//calculate the thread value for k*r
+	T fij = (kr - kr_min)/dkr;											//FP index into the look-up table
+	size_t ij = (size_t) fij;											//convert to an integral index
+	if(N_hB > 1 && ij > N_hB - 2) ij = N_hB - 2;						//clamp so that the second row (ij + 1) stays inside the table
+	T alpha = fij - ij;													//calculate the fractional portion of the index
+	size_t n0j = ij * (NC);												//start of the first entry in the LUT
+	size_t n1j = (ij+1) * (NC);											//start of the second entry in the LUT
+
+	T cos_phi;	
+	T Pl_2, Pl_1, Pl;													//declare registers to store the previous two Legendre polynomials
+	
+	stim::complex<T> Ei = 0;											//create a register to store the result
+	int l;
+
+	stim::complex<T> hlBl[LOCAL_NL+1];									//interpolated LUT values for orders [0 LOCAL_NL]
+	int shared_start = threadIdx.x * (Nl - LOCAL_NL);					//offset of this thread's values in shared memory
+
+	#pragma unroll LOCAL_NL+1
+	for(l = 0; l <= LOCAL_NL; l++)
+		hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
+	
+	for(l = LOCAL_NL+1; l <= Nl; l++)
+		shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
+
+	for(size_t w = 0; w < nW; w++){
+		cos_phi = p.norm().dot(W[w].kvec().norm());						//calculate the cosine of the angle between the k vector and the direction from the sphere
+		Pl_2 = 1;														//P_0
+		Pl_1 = cos_phi;													//P_1
+		Ei += W[w].E() * hlBl[0] * Pl_2;
+		Ei += W[w].E() * hlBl[1] * Pl_1;		
+
+		#pragma unroll LOCAL_NL-1
+		for(l = 2; l <= LOCAL_NL; l++){
+			//Legendre recurrence for order l:  l * P_l = (2l - 1) * x * P_(l-1) - (l - 1) * P_(l-2)
+			Pl = ( (2 * l - 1) * cos_phi * Pl_1 - (l - 1) * Pl_2 ) / l;
+			Ei += W[w].E() * hlBl[l] * Pl;
+			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+			Pl_1 = Pl;
+		}
+
+		for(l = LOCAL_NL+1; l <= Nl; l++){
+			Pl = ( (2 * l - 1) * cos_phi * Pl_1 - (l - 1) * Pl_2 ) / l;	//same recurrence, continued for the orders held in shared memory
+			Ei += W[w].E() * shared_hB[shared_start + (l - (LOCAL_NL+1))] * Pl;
+			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+			Pl_1 = Pl;
+		}
+	}
+	E[i] += Ei;															//add the result to the field in device memory
+}
+
+/// Launch the Mie scattering kernel with a block size chosen from the available shared memory
+
+/// All arguments are forwarded unchanged to the cuda_scalar_mie_scatter kernel.
+/// The block size is the largest multiple of 32 for which one (Nl + 1)-element complex array
+///		per thread fits in the shared memory reported by stim::sharedMemPerBlock().
+template<typename T>
+void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
+	
+	size_t max_shared_mem = stim::sharedMemPerBlock();	
+	int hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
+	std::cout<<"hl*Bl array size:  "<<hBl_array<<std::endl;
+	std::cout<<"shared memory:     "<<max_shared_mem<<std::endl;
+	int threads = (max_shared_mem / hBl_array) / 32 * 32;
+	std::cout<<"threads per block: "<<threads<<std::endl;
+	if(threads <= 0){															//too many orders: not even 32 threads fit in shared memory
+		printf("ERROR: insufficient shared memory to launch the Mie scattering kernel.\n");
+		exit(1);																//avoid the division by zero and the empty launch below
+	}
+	dim3 blocks((unsigned)(N / threads + 1));										//calculate the optimal number of blocks
+
+	size_t shared_mem;
+	if(Nl <= LOCAL_NL) shared_mem = 0;											//all orders fit in the per-thread local array
+	else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL);				//amount of shared memory to allocate
+	std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
+	cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl);	//call the kernel
+
+}
+
+/// CUDA kernel that stores the length of the position vector (x, y, z) of each point in r
+/// (a NULL coordinate array is treated as all zeros)
+template<typename T>
+__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){
+	const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;		//global index of this thread
+	if(idx < N){													//threads past the end of the array do nothing
+		stim::vec3<T> pos;
+		pos[0] = (x != NULL) ? x[idx] : (T)0;						//missing coordinate arrays contribute zero
+		pos[1] = (y != NULL) ? y[idx] : (T)0;
+		pos[2] = (z != NULL) ? z[idx] : (T)0;
+
+		r[idx] = pos.len();
+	}
+}
+/// Calculate the scalar Mie solution for the scattered field produced by a set of plane waves
+
+/// @param E is a pointer to the destination field values (the scattered field is added to the existing values)
+/// @param N is the number of points used to calculate the field
+/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
+/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
+/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
+/// @param W is an array of planewaves that will be scattered (the scattering coefficients use the wavenumber of W[0])
+/// @param a is the radius of the sphere
+/// @param n is the complex refractive index of the sphere
+/// @param r_spacing is the spacing along r used to build the look-up table (only used when CUDA_FOUND is defined)
+template<typename T>
+void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){
+	if(N == 0 || W.empty()) return;								//nothing to calculate (also protects W[0] and the cuBLAS index look-ups)
+
+	//calculate the necessary number of orders required to represent the scattered field
+	T k = W[0].kmag();
+
+	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
+	if(Nl < LOCAL_NL) Nl = LOCAL_NL;							//always do at least the minimum number of local operations (kernel optimization)
+	std::cout<<"Nl: "<<Nl<<std::endl;
+
+	//calculate the scattering coefficients for the sphere
+	stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
+	B_coefficients(B, a, k, n, Nl);
+
+#ifdef CUDA_FOUND
+	stim::complex<T>* dev_E;										//allocate space for the field
+	cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
+	cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);	//the kernel adds to the existing field
+
+	//	COORDINATES
+	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
+	if(x != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+	T* dev_y = NULL;												//allocate space and copy the Y coordinate (if specified)
+	if(y != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+	T* dev_z = NULL;												//allocate space and copy the Z coordinate (if specified)
+	if(z != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+
+	//	PLANE WAVES
+	stim::scalarwave<T>* dev_W;																//allocate space and copy plane waves
+	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
+	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
+
+	// LOOK-UP TABLE
+	//calculate the distance from the sphere center
+	T* dev_r;
+	HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
+		
+	int threads = stim::maxThreadsPerBlock();
+	dim3 blocks((unsigned)(N / threads + 1));
+	cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
+
+	//Find the minimum and maximum values of r
+	//NOTE(review): cublasIsamin/cublasIsamax take float arrays, so this path presumably only supports T = float - confirm
+    cublasStatus_t stat;
+    cublasHandle_t handle;
+
+	stat = cublasCreate(&handle);							//create a cuBLAS handle
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS initialization failed\n");
+		exit(1);
+	}
+
+	int i_min, i_max;
+	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS Error: failed to calculate minimum r value.\n");
+		exit(1);
+	}
+	stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS Error: failed to calculate maximum r value.\n");
+		exit(1);
+	}
+	cublasDestroy(handle);									//the handle is no longer needed
+
+	T r_min, r_max;											//allocate space to store the minimum and maximum values
+	//cuBLAS returns 1-based indices, so subtract one to get the array offset
+	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + (i_min - 1), sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
+	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + (i_max - 1), sizeof(T), cudaMemcpyDeviceToHost) );
+
+	size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1);	//number of rows in the look-up table based on the user-specified spacing along r
+	if(N_hB_lut < 2) N_hB_lut = 2;									//interpolation always reads two consecutive rows
+
+	T kr_min = k * r_min;
+	T kr_max = k * r_max;
+
+	//temporary variables
+	double vm;															//allocate space to store the return values for the bessel function calculation
+	double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
+
+	size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;
+	stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes);													//pointer to the look-up table
+	T dkr = (kr_max - kr_min) / (N_hB_lut-1);												//distance between values in the LUT
+	if(dkr <= 0) dkr = 1;																	//all points are equidistant: any positive spacing maps them to row 0
+	std::cout<<"LUT jl bytes:  "<<hB_bytes<<std::endl;
+	stim::complex<T> hl;
+	for(size_t kri = 0; kri < N_hB_lut; kri++){													//for each value in the LUT
+		stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
+		for(int l = 0; l <= Nl; l++){														//for each order
+			hl.r = (T)jv[l];
+			hl.i = (T)yv[l];
+
+			hB_lut[kri * (Nl + 1) + l] = hl * B[l];										//store the product of the Hankel function and the coefficient
+		}
+	}
+
+	//Allocate device memory and copy everything to the GPU
+	stim::complex<T>* dev_hB_lut;
+	HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
+	HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
+
+	gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, kr_min, dkr, N_hB_lut, Nl);
+
+	cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost);			//copy the field from device memory
+
+	if(x != NULL) cudaFree(dev_x);														//free everything
+	if(y != NULL) cudaFree(dev_y);
+	if(z != NULL) cudaFree(dev_z);
+	cudaFree(dev_E);
+	cudaFree(dev_W);
+	cudaFree(dev_r);
+	cudaFree(dev_hB_lut);
+
+	free(jv);																			//free the host-side temporaries
+	free(yv);
+	free(djv);
+	free(dyv);
+	free(hB_lut);
+#else
+	
+
+	//allocate space to store the bessel function call results
+	double vm;										
+	double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
+
+	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );										//Legendre polynomials for orders [0 Nl]
+
+	T r, kr, cos_phi;
+	stim::complex<T> h;
+	for(size_t i = 0; i < N; i++){
+		stim::vec3<T> p;															//declare a 3D point
+	
+		(x == NULL) ? p[0] = 0 : p[0] = x[i];										// test for NULL values and set positions
+		(y == NULL) ? p[1] = 0 : p[1] = y[i];
+		(z == NULL) ? p[2] = 0 : p[2] = z[i];
+		r = p.len();
+		if(r >= a){																	//only points outside the sphere receive the scattered field
+			for(size_t w = 0; w < W.size(); w++){
+				kr = r * W[w].kmag();												//calculate k*r
+				stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr);
+				cos_phi = p.norm().dot(W[w].kvec().norm());							//calculate the cosine of the angle from the propagating direction
+				stim::legendre<T>(Nl, cos_phi, P);
+
+				for(int l = 0; l <= Nl; l++){
+					h.r = j_kr[l];
+					h.i = y_kr[l];
+					E[i] += W[w].E() * B[l] * h * P[l];
+				}
+			}
+		}
+	}
+
+	free(j_kr);																		//free the temporaries
+	free(y_kr);
+	free(dj_kr);
+	free(dy_kr);
+	free(P);
+#endif
+	free(B);																			//free the scattering coefficients
+}
+
+/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
+/// (the wave is placed in a one-element list and handed to the multi-wave version)
+template<typename T>
+void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
+	std::vector< stim::scalarwave<T> > single_wave;
+	single_wave.push_back(w);
+	cpu_scalar_mie_scatter(E, N, x, y, z, single_wave, a, n);
+}
+
+/// Calculate the scalar Mie solution for the internal field produced by a set of plane waves scattered by a sphere
+
+/// @param E is a pointer to the destination field values (values at points inside the sphere are overwritten, all others are left untouched)
+/// @param N is the number of points used to calculate the field
+/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
+/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
+/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
+/// @param W is an array of planewaves that will be scattered (the coefficients use the wavenumber of W[0])
+/// @param a is the radius of the sphere
+/// @param n is the complex refractive index of the sphere
+template<typename T>
+void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n){
+	if(N == 0 || W.empty()) return;							//nothing to calculate (also protects the access to W[0])
+
+	//calculate the necessary number of orders required to represent the scattered field
+	T k = W[0].kmag();
+
+	size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
+	std::cout<<"Nl: "<<Nl<<std::endl;
+
+	//calculate the internal field coefficients for the sphere
+	stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the coefficients
+	A_coefficients(A, a, k, n, Nl);
+
+	//allocate space to store the bessel function call results
+	double vm;										
+	stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* y_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dj_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dy_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+
+	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );				//Legendre polynomials for orders [0 Nl]
+
+	T r, cos_phi;
+	stim::complex<double> knr;
+	for(size_t i = 0; i < N; i++){
+		stim::vec3<T> p;									//declare a 3D point
+	
+		(x == NULL) ? p[0] = 0 : p[0] = x[i];				// test for NULL values and set positions
+		(y == NULL) ? p[1] = 0 : p[1] = y[i];
+		(z == NULL) ? p[2] = 0 : p[2] = z[i];
+		r = p.len();
+		if(r < a){											//only points inside the sphere are calculated
+			E[i] = 0;
+			for(size_t w = 0; w < W.size(); w++){
+				knr = (stim::complex<double>)n * r * W[w].kmag();							//calculate k*n*r
+
+				stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr);
+				if(r == 0)
+					cos_phi = 0;							//the direction is undefined at the center of the sphere
+				else
+					cos_phi = p.norm().dot(W[w].kvec().norm());				//calculate the cosine of the angle from the propagating direction
+				stim::legendre<T>(Nl, cos_phi, P);
+								
+				for(size_t l = 0; l <= Nl; l++){
+					E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l];
+				}
+			}
+		}
+	}
+
+	//release the temporaries (previously leaked on every call)
+	free(A);
+	free(j_knr);
+	free(y_knr);
+	free(dj_knr);
+	free(dy_knr);
+	free(P);
+}
+
+/// Calculate the scalar Mie solution for the internal field produced by a single plane wave
+/// (the wave is placed in a one-element list and handed to the multi-wave version)
+template<typename T>
+void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
+	std::vector< stim::scalarwave<T> > single_wave;
+	single_wave.push_back(w);
+	cpu_scalar_mie_internal(E, N, x, y, z, single_wave, a, n);
+}
+
+}
+
+#endif
 \ No newline at end of file
@@ -5,7 +5,12 @@
 #include "../optics/scalarwave.h"
 #include "../math/bessel.h"
 #include "../math/legendre.h"
+#include "../cuda/cudatools/devices.h"
+#include "../cuda/cudatools/timer.h"
+#include <cublas_v2.h>
+#include <math_constants.h>
 #include <vector>
+#include <stdlib.h>
  
 namespace stim{
  
@@ -105,10 +110,11 @@ public:
 		std::vector< scalarwave<T> > samples(N);											//create a vector of plane waves
 		T kmag = (T)stim::TAU / lambda;								//calculate the wavenumber
 		stim::complex<T> apw;										//allocate space for the amplitude at the focal point
+		T a = (T)(stim::TAU * (1 - cos(asin(NA[0]))) / (double)N);
 		stim::vec3<T> kpw;											//declare the new k-vector based on the focused plane wave direction
 		for(size_t i=0; i<N; i++){										//for each sample
 			kpw = dirs[i] * kmag;									//calculate the k-vector for the new plane wave
-			apw = exp(stim::complex<T>(0, kpw.dot(-f)));				//calculate the amplitude for the new plane wave
+			apw = a * exp(stim::complex<T>(0, kpw.dot(-f)));				//calculate the amplitude for the new plane wave
 			samples[i] = scalarwave<T>(kpw, apw);			//create a plane wave based on the direction
 		}
  
@@ -148,7 +154,7 @@ public:
 /// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional)
 /// @param C is a pointer to Nl + 1 values where the terms will be stored
 template<typename T>
-CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){
+CUDA_CALLABLE void cpu_aperture_integral(T* C, int Nl, T NA, T NA_in = 0){
  
 	size_t table_bytes = (Nl + 1) * sizeof(T);				//calculate the number of bytes required to store the terms
 	T cos_alpha_1 = cos(asin(NA_in));						//calculate the cosine of the angle subtended by the central obscuration
@@ -182,23 +188,156 @@ CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){
  
 /// performs linear interpolation into a look-up table
 template<typename T>
-T lut_lookup(T* lut, T val, size_t N, T min_val, T delta, size_t stride = 0){
-	size_t idx = (size_t)((val - min_val) / delta);
-	T alpha = val - idx * delta + min_val;
+CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val, T delta, size_t n_vals){
+	T idx = ((val - min_val) / delta);
+	size_t i = (size_t) idx;
+	T a1 = idx - i;
+	T a0 = 1 - a1;
+	size_t n0 = i * n_vals;
+	size_t n1 = (i+1) * n_vals;
+	for(size_t n = 0; n < n_vals; n++){
+		lut_values[n] = lut[n0 + n] * a0 + lut[n1 + n] * a1;
+	}
+}
+
+/// Linear interpolation between two complex values: returns v0 when t = 0 and v1 when t = 1
+/// (the real and imaginary parts are interpolated independently using fused multiply-adds)
+template <typename T>
+CUDA_CALLABLE stim::complex<T> clerp(stim::complex<T> v0, stim::complex<T> v1, T t) {
+    const auto re = fma(t, v1.r, fma(-t, v0.r, v0.r));     // v0.r * (1 - t) + v1.r * t
+    const auto im = fma(t, v1.i, fma(-t, v0.i, v0.i));     // v0.i * (1 - t) + v1.i * t
+    return stim::complex<T>(re, im);
+}
+
+/// Linear interpolation between two values: returns v0 when t = 0 and v1 when t = 1
+template <typename T>
+CUDA_CALLABLE T lerp(T v0, T v1, T t) {
+    const auto from_v0 = fma(-t, v0, v0);      // v0 * (1 - t)
+    return fma(t, v1, from_v0);                // add the contribution of v1
+}
  
-	if(alpha == 0) return lut[idx];
-	else return lut[idx * stride] * (1 - alpha) + lut[ (idx+1) * stride] * alpha;
+#ifdef __CUDACC__
+/// CUDA kernel that evaluates the scalar point spread function at each sample point
+
+/// @param E is the destination field (one value per point, overwritten)
+/// @param N is the number of points
+/// @param r, phi are the spherical coordinates of each point
+/// @param k is the wavenumber and A is the amplitude used to scale the result
+/// @param Nl is the maximum order and C stores the Nl + 1 terms multiplied into each order
+/// @param lut_j is a look-up table of Nj rows, each storing (Nl + 1) values, sampled from min_kr in steps of dkr
+template<typename T>
+__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T k, T A, size_t Nl,
+								T* C, 
+								T* lut_j, size_t Nj, T min_kr, T dkr){
+	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
+	if(i >= N) return;												//exit if this thread is outside the array
+
+	T cos_phi = cos(phi[i]);									//calculate the thread value for cos(phi)
+	T kr = r[i] * k;											//calculate the thread value for k*r
+	stim::complex<T> Ei = 0;									//initialize the value of the field to zero
+	size_t NC = Nl + 1;										//calculate the number of coefficients to be used
+
+	T fij = (kr - min_kr)/dkr;								//FP index into the spherical bessel LUT
+	size_t ij = (size_t) fij;								//convert to an integral index
+	if(Nj > 1 && ij > Nj - 2) ij = Nj - 2;					//clamp so that the second row (ij + 1) stays inside the table
+	T a = fij - ij;											//calculate the fractional portion of the index
+	size_t n0j = ij * (NC);									//start of the first entry in the LUT
+	size_t n1j = (ij+1) * (NC);								//start of the second entry in the LUT
+
+	T jl;											//declare register to store the spherical bessel function
+	T Pl_2 = 0, Pl_1 = 0;							//previous two Legendre polynomials (initialized because they are shifted before first being set)
+	T Pl = 1;										//initialize the current value for the Legendre polynomial
+	stim::complex<T> im(0, 1);						//declare i (imaginary 1)
+	stim::complex<T> i_pow(1, 0);					//i_pow stores the current value of i^l so it doesn't have to be re-computed every iteration
+	for(int l = 0; l <= Nl; l++){					//for each order
+		jl = lerp<T>( lut_j[n0j + l], lut_j[n1j + l], a );	//read jl from the LUT and interpolate the result
+		Ei += i_pow * jl * Pl * C[l];				//calculate the value for the field and sum
+		i_pow *= im;								//multiply i^l * i for the next iteration
+		Pl_2 = Pl_1;								//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+		Pl_1 = Pl;
+		if(l == 0){									//computing Pl is done recursively, where the recursive relation
+			Pl = cos_phi;							//	requires the first two orders. This defines the second.
+		}
+		else{										//if this is not the first iteration, use the recursive relation to calculate the next order (l + 1)
+			Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
+		}
+		
+	}
+	E[i] = Ei * A * 2 * CUDART_PI_F;						//scale the integral by the amplitude
 }
  
 template<typename T>
-void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){
-	T k = stim::TAU / lambda;
+void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl, T r_spacing){
  
-	T* C = (T*) malloc( (Nl + 1) * sizeof(T) );					//allocate space for the aperture integral terms
+	//Builds a look-up table of spherical bessel functions covering the range of r, then launches
+	//	cuda_scalar_psf to evaluate the field. E, r and phi are passed directly to the kernel and
+	//	r is read with device-to-host copies, so they are expected to be device pointers.
+	if(N == 0) return;										//nothing to calculate (also protects the cuBLAS index look-ups)
+
+	//Find the minimum and maximum values of r
+	//NOTE(review): cublasIsamin/cublasIsamax take float arrays, so this presumably only supports T = float - confirm
+    cublasStatus_t stat;
+    cublasHandle_t handle;
+
+	stat = cublasCreate(&handle);							//create a cuBLAS handle
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS initialization failed\n");
+		exit(1);
+	}
+
+	int i_min, i_max;
+	stat = cublasIsamin(handle, (int)N, r, 1, &i_min);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS Error: failed to calculate minimum r value.\n");
+		exit(1);
+	}
+	stat = cublasIsamax(handle, (int)N, r, 1, &i_max);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS Error: failed to calculate maximum r value.\n");
+		exit(1);
+	}
+	cublasDestroy(handle);									//the handle is no longer needed
+
+	T r_min, r_max;											//allocate space to store the minimum and maximum values
+	//cuBLAS returns 1-based indices, so subtract one to get the array offset
+	HANDLE_ERROR( cudaMemcpy(&r_min, r + (i_min - 1), sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
+	HANDLE_ERROR( cudaMemcpy(&r_max, r + (i_max - 1), sizeof(T), cudaMemcpyDeviceToHost) );
+
+	T k = (T)stim::TAU / lambda;							//calculate the wavenumber from lambda
+	size_t C_bytes = (Nl + 1) * sizeof(T);
+	T* C = (T*) malloc( C_bytes );							//allocate space for the aperture integral terms
+	cpu_aperture_integral(C, Nl, NA, NA_in);				//calculate the aperture integral terms
+
+	size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1);			//number of values in the look-up table based on the user-specified spacing along r
+	if(Nlut_j < 2) Nlut_j = 2;											//interpolation always reads two consecutive rows
+
+	T kr_min = k * r_min;
+	T kr_max = k * r_max;
+
+	//temporary variables
+	double vm;															//allocate space to store the return values for the bessel function calculation
+	double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
+	double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
+
+	size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
+	T* bessel_lut = (T*) malloc(lutj_bytes);													//pointer to the look-up table
+	T delta_kr = (kr_max - kr_min) / (Nlut_j-1);												//distance between values in the LUT
+	if(delta_kr <= 0) delta_kr = 1;																//all points are equidistant: any positive spacing maps them to row 0
+	std::cout<<"LUT jl bytes:  "<<lutj_bytes<<std::endl;
+	for(size_t kri = 0; kri < Nlut_j; kri++){													//for each value in the LUT
+		stim::bessjyv_sph<double>(Nl, kr_min + kri * delta_kr, vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
+		for(int l = 0; l <= Nl; l++){														//for each order
+			bessel_lut[kri * (Nl + 1) + l] = (T)jv[l];										//store the bessel function result
+		}
+	}
+
+	stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);				//NOTE(review): looks like leftover debug output - confirm before removing
+
+	//Allocate device memory and copy everything to the GPU
+
+	T* gpu_C;
+	HANDLE_ERROR( cudaMalloc(&gpu_C, C_bytes) );
+	HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) );
+	T* gpu_j_lut;
+	HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) );
+	HANDLE_ERROR( cudaMemcpy(gpu_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) );
+
+	int threads = stim::maxThreadsPerBlock();			//get the maximum number of threads per block for the CUDA device
+	dim3 blocks( (unsigned)(N / threads + 1));						//calculate the optimal number of blocks
+
+	cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, (T)stim::TAU/lambda, A, Nl, gpu_C, gpu_j_lut, Nlut_j, kr_min, delta_kr);
+
+	//free the LUT and condenser tables
+	HANDLE_ERROR( cudaFree(gpu_C) );
+	HANDLE_ERROR( cudaFree(gpu_j_lut) );
+
+	//free the host-side temporaries (previously leaked on every call)
+	free(C);
+	free(jv);
+	free(yv);
+	free(djv);
+	free(dyv);
+	free(bessel_lut);
+}
+#endif
+
+/// Calculate the analytical solution to a scalar point spread function given a set of spherical coordinates about the PSF (beam propagation along phi = theta = 0)
+template<typename T>
+void cpu_scalar_psf_local(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl){
+	T k = (T)stim::TAU / lambda;
+	size_t C_bytes = (Nl + 1) * sizeof(T);
+	T* C = (T*) malloc( C_bytes );					//allocate space for the aperture integral terms
 	cpu_aperture_integral(C, Nl, NA, NA_in);			//calculate the aperture integral terms
 	memset(F, 0, N * sizeof(stim::complex<T>));
-#ifdef NO_CUDA
-	memset(F, 0, N * sizeof(stim::complex<T>));
 	T jl, Pl, kr, cos_phi;
  
 	double vm;
@@ -225,71 +364,117 @@ void cpu_scalar_psf(stim::complex&lt;T&gt;* F, size_t N, T* r, T* phi, T lambda, T A, 
  
 	free(C);
 	free(Pl_cos_phi);
-#else
-	T min_r = r[0];
-	T max_r = r[0];
-	for(size_t i = 0; i < N; i++){								//find the minimum and maximum values of r (min and max distance from the focal point)
-		if(r[i] < min_r) min_r = r[i];
-		if(r[i] > max_r) max_r = r[i];
-	}
-	T min_kr = k * min_r;
-	T max_kr = k * max_r;
+}
  
-	//temporary variables
-	double vm;
-	double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
-	double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
-	double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
-	double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
+/// Converts a set of cartesian points into spherical coordinates surrounding a point spread function (PSF)
+/// @param r is the output distance from the PSF
+/// @param phi is the non-symmetric direction about the PSF
+/// @param x (x, y, z) are the cartesian coordinates in world space
+/// @param f is the focal point of the PSF in cartesian coordinates (subtracted from each point)
+/// @param q is the quaternion whose rotation matrix is applied to each shifted point to align it with the PSF propagation direction
+template<typename T>
+__global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::vec3<T> f, stim::quaternion<T> q){
  
-	size_t Nlut = (size_t)sqrt(N) * 2;
-	T* bessel_lut = (T*) malloc(sizeof(T) * (Nl+1) * Nlut);
-	T delta_kr = (max_kr - min_kr) / (Nlut-1);
-	for(size_t kri = 0; kri < Nlut; kri++){
-		stim::bessjyv_sph<double>(Nl, min_kr + kri * delta_kr, vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
-		for(size_t l = 0; l <= Nl; l++){
-			bessel_lut[kri * (Nl + 1) + l] = (T)jv[l];
-		}
-	}
+	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
+	if(i >= N) return;												//exit if this thread is outside the array
  
-	T* Pl_cos_phi = (T*) malloc((Nl + 1) * sizeof(T));
-	T kr, cos_phi, jl, Pl;
-	for(size_t n = 0; n < N; n++){								//for each point in the field
-		kr = k * r[n];											//calculate kr (the optical distance between the focal point and p)
-		cos_phi = std::cos(phi[n]);								//calculate the cosine of phi
-		stim::legendre<T>(Nl, cos_phi, Pl_cos_phi);				//calculate the [0 Nl] legendre polynomials for this point
+	stim::vec3<T> p;									//declare a 3D point
+	
+	(x == NULL) ? p[0] = 0 : p[0] = x[i];				// test for NULL values and set positions
+	(y == NULL) ? p[1] = 0 : p[1] = y[i];
+	(z == NULL) ? p[2] = 0 : p[2] = z[i];
  
-		for(int l = 0; l <= Nl; l++){
-			jl = lut_lookup<T>(&bessel_lut[l], kr, Nlut, min_kr, delta_kr, Nl+1);
-			Pl = Pl_cos_phi[l];
-			F[n] += pow(complex<T>(0, 1), l) * jl * Pl * C[l];
-		}
-		F[n] *= A * stim::TAU;
-	}
-#endif
+	p = p - f;											//shift the point to the center of the PSF (focal point)
+	p = q.toMatrix3() * p;								//rotate the point to align with the propagation direction
+
+	stim::vec3<T> ps = p.cart2sph();									//convert from cartesian to spherical coordinates
+	r[i] = ps[0];										//store r
+	phi[i] = ps[2];										//phi = [0 pi]
 }
  
+#ifdef __CUDACC__
+/// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates
+/// @param E is the array of N field values handed to gpu_scalar_psf_local (device memory when called from cpu_scalar_psf_cart)
+/// @param N is the number of points
+/// @param x (x, y, z) are the cartesian coordinates read by the cuda_cart2psf kernel; a NULL array is treated as all zeros
+/// @param lambda, A, NA, NA_in, Nl, r_spacing are forwarded unchanged to gpu_scalar_psf_local
+/// @param f is the focal point of the PSF in cartesian coordinates
+/// @param d is the propagation direction of the PSF; points are rotated by the quaternion created from d and (0, 0, 1)
+template<typename T>
+void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
+	
+	T* gpu_r;															//allocate space for the coordinates in r
+	HANDLE_ERROR( cudaMalloc(&gpu_r, sizeof(T) * N) );
+	T* gpu_phi;															//allocate space for the coordinates in phi
+	HANDLE_ERROR( cudaMalloc(&gpu_phi, sizeof(T) * N) );
+
+	stim::quaternion<T> q;												//create a quaternion
+	q.CreateRotation(d, stim::vec3<T>(0, 0, 1));						//create a mapping from the propagation direction to the PSF space
+	int threads = stim::maxThreadsPerBlock();							//get the maximum number of threads per block for the CUDA device
+	dim3 blocks( (unsigned)(N / threads + 1));							//enough blocks to cover all N points (surplus threads exit inside the kernel)
+	cuda_cart2psf<T> <<< blocks, threads >>> (gpu_r, gpu_phi, N, x, y, z, f, q);	//call the CUDA kernel to move the cartesian coordinates to PSF space
+
+	gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing);
+
+	//release the temporary spherical coordinates (previously leaked on every call)
+	HANDLE_ERROR( cudaFree(gpu_r) );
+	HANDLE_ERROR( cudaFree(gpu_phi) );
+}
+#endif
  
 template<typename T>
-void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){
+void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
+
+// If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel
+#ifdef __CUDACC__
+
+	T* gpu_x = NULL;
+	if(x != NULL){
+		HANDLE_ERROR( cudaMalloc(&gpu_x, sizeof(T) * N) );
+		HANDLE_ERROR( cudaMemcpy(gpu_x, x, sizeof(T) * N, cudaMemcpyHostToDevice) );
+	}
+	T* gpu_y = NULL;
+	if(y != NULL){
+		HANDLE_ERROR( cudaMalloc(&gpu_y, sizeof(T) * N) );
+		HANDLE_ERROR( cudaMemcpy(gpu_y, y, sizeof(T) * N, cudaMemcpyHostToDevice) );
+	}
+	T* gpu_z = NULL;
+	if(z != NULL){
+		HANDLE_ERROR( cudaMalloc(&gpu_z, sizeof(T) * N) );
+		HANDLE_ERROR( cudaMemcpy(gpu_z, z, sizeof(T) * N, cudaMemcpyHostToDevice) );
+	}
+
+	stim::complex<T>* gpu_E;
+	HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) );
+	HANDLE_ERROR( cudaMemcpy(gpu_E, E, sizeof(stim::complex<T>) * N, cudaMemcpyHostToDevice) );
+	gpu_scalar_psf_cart<T>(gpu_E, N, gpu_x, gpu_y, gpu_z, lambda, A, f, d, NA, NA_in, Nl, r_spacing);
+	HANDLE_ERROR( cudaMemcpy(E, gpu_E, sizeof(stim::complex<T>) * N, cudaMemcpyDeviceToHost) );
+
+	HANDLE_ERROR( cudaFree(gpu_x) );
+	HANDLE_ERROR( cudaFree(gpu_y) );
+	HANDLE_ERROR( cudaFree(gpu_z) );
+	HANDLE_ERROR( cudaFree(gpu_E) );
+
+#else
 	T* r = (T*) malloc(N * sizeof(T));					//allocate space for p in spherical coordinates
 	T* phi = (T*) malloc(N * sizeof(T));				//	only r and phi are necessary (the scalar PSF is symmetric about theta)
  
-	stim::vec3<T> p, ps;
+	stim::quaternion<T> q;
+	q.CreateRotation(d, stim::vec3<T>(0, 0, 1));
+	stim::matrix<T, 3> R = q.toMatrix3();
+	stim::vec3<T> p, ps, ds;
 	for(size_t i = 0; i < N; i++){
 		(x == NULL) ? p[0] = 0 : p[0] = x[i];	// test for NULL values and set positions
 		(y == NULL) ? p[1] = 0 : p[1] = y[i];
 		(z == NULL) ? p[2] = 0 : p[2] = z[i];
  
+		p = p - f;
+
+		p = R * p;					//rotate the cartesian point
+
 		ps = p.cart2sph();						//convert from cartesian to spherical coordinates
 		r[i] = ps[0];							//store r
 		phi[i] = ps[2];							//phi = [0 pi]
 	}
  
-	cpu_scalar_psf(F, N, r, phi, lambda, A, f, NA, NA_in, Nl);		//call the spherical coordinate CPU function
+	cpu_scalar_psf_local(F, N, r, phi, lambda, A, NA, NA_in, Nl);		//call the spherical coordinate CPU function
  
 	free(r);
 	free(phi);
+#endif
 }
  
 }			//end namespace stim
@@ -23,7 +23,7 @@ namespace stim{
 template<typename T>
 class scalarwave{
  
-protected:
+public:
  
 	stim::vec3<T> k;							//k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
 	stim::complex<T> E0;						//amplitude
@@ -60,7 +60,7 @@ public:
 		return k.len();
 	}
  
-	CUDA_CALLABLE vec3< complex<T> > E(){
+	CUDA_CALLABLE complex<T> E(){
 		return E0;
 	}
  
@@ -235,6 +235,32 @@ void gpu_scalarwave(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scala
 	cuda_scalarwave<T><<< blocks, threads >>>(F, N, x, y, z, w);			//call the kernel
 }
  
+/// Evaluates a list of scalar plane waves at a set of points, processing the waves in batches sized to fit in shared memory
+/// @param F is the array of N field values passed to the cuda_scalarwave kernel (presumably accumulated into - confirm against the kernel)
+/// @param N is the number of points
+/// @param x (x, y, z) are the coordinate arrays passed to the kernel
+/// @param W is an array of nW plane waves in device memory (it is the source of a device-to-device copy)
+/// @param nW is the number of plane waves in W
+template<typename T>
+void gpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW){
+
+	if(nW == 0) return;													//nothing to evaluate (also avoids a zero-byte allocation)
+
+	size_t wave_bytes = sizeof(stim::scalarwave<T>);
+	size_t shared_bytes = stim::sharedMemPerBlock();									//calculate the maximum amount of shared memory available
+	size_t max_batch = shared_bytes / wave_bytes;				//calculate number of plane waves that will fit into shared memory
+	if(max_batch == 0) max_batch = 1;							//always advance by at least one wave so the loop below terminates; an oversized launch is reported by the error check
+	size_t batch_bytes = min(nW, max_batch) * wave_bytes;				//initialize the batch size (in bytes) to the maximum batch required
+
+	stim::scalarwave<T>* batch_W;
+	HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes));										//allocate memory for a single batch of plane waves
+
+	int threads = stim::maxThreadsPerBlock();							//get the maximum number of threads per block for the CUDA device
+	dim3 blocks((unsigned)(N / threads + 1));										//calculate the optimal number of blocks	
+
+	size_t batch_size;																	//declare a variable to store the size of the current batch
+	size_t waves_processed = 0;															//initialize the number of waves processed to zero
+	while(waves_processed < nW){												//while there are still waves to be processed
+		batch_size = min<size_t>(max_batch, nW - waves_processed);			//process either a whole batch, or whatever is left
+		batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
+		HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice));	//copy the plane waves into global memory
+		cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size);	//call the kernel
+		HANDLE_ERROR(cudaGetLastError());													//report a failed launch instead of silently skipping the batch
+		waves_processed += batch_size;													//increment the counter indicating how many waves have been processed
+	}
+	HANDLE_ERROR(cudaFree(batch_W));
+}
+
 /// Sums a series of coherent plane waves at a specified point
 /// @param field is the output array of field values corresponding to each input point
 /// @param x is an array of x coordinates for the field point
@@ -245,24 +271,13 @@ void gpu_scalarwave(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scala
 /// @param A is the list of amplitudes for each wave
 /// @param S is the list of propagation directions for each wave
 template<typename T>
-void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > w_array){
-	size_t S = w_array.size();											//store the number of waves
-#ifdef NO_CUDA
-	memset(F, 0, N * sizeof(stim::complex<T>));
-	T px, py, pz;
-	for(size_t i = 0; i < N; i++){										// for each element in the array
-		(x == NULL) ? px = 0 : px = x[i];								// test for NULL values
-		(y == NULL) ? py = 0 : py = y[i];
-		(z == NULL) ? pz = 0 : pz = z[i];
-
-		for(size_t s = 0; s < S; s++){
-			F[i] += w_array[s].pos(px, py, pz);						//sum all plane waves at this point
-		}
-	}
-#else
+void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W){
+	size_t S = W.size();											//store the number of waves
+#ifdef __CUDACC__
 	stim::complex<T>* dev_F;										//allocate space for the field
 	cudaMalloc(&dev_F, N * sizeof(stim::complex<T>));
-	cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>));				//set the field to zero (necessary because a sum is used)
+	cudaMemcpy(dev_F, F, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
+	//cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>));				//set the field to zero (necessary because a sum is used)
  
 	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
 	if(x != NULL){
@@ -282,28 +297,11 @@ void cpu_sum_scalarwaves(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, std::v
 		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
 	}
  
-	size_t wave_bytes = sizeof(stim::scalarwave<T>);
-	size_t shared_bytes = stim::sharedMemPerBlock();									//calculate the maximum amount of shared memory available
-	size_t array_bytes = w_array.size() * wave_bytes;			//calculate the maximum number of bytes required for the planewave array
-	size_t max_batch = shared_bytes / wave_bytes;				//calculate number of plane waves that will fit into shared memory
-	size_t num_batches = w_array.size() / max_batch + 1;								//calculate the number of batches required to process all plane waves
-	size_t batch_bytes = min(w_array.size(), max_batch) * wave_bytes;				//initialize the batch size (in bytes) to the maximum batch required
-
-	stim::scalarwave<T>* dev_w;
-	HANDLE_ERROR(cudaMalloc(&dev_w, batch_bytes));										//allocate memory for a single batch of plane waves
-
-	int threads = stim::maxThreadsPerBlock();							//get the maximum number of threads per block for the CUDA device
-	dim3 blocks((unsigned)(N / threads + 1));										//calculate the optimal number of blocks	
+	stim::scalarwave<T>* dev_W;
+	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
+	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
  
-	size_t batch_size;																	//declare a variable to store the size of the current batch
-	size_t waves_processed = 0;															//initialize the number of waves processed to zero
-	while(waves_processed < w_array.size()){												//while there are still waves to be processed
-		batch_size = min<size_t>(max_batch, w_array.size() - waves_processed);			//process either a whole batch, or whatever is left
-		batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
-		HANDLE_ERROR(cudaMemcpy(dev_w, &w_array[waves_processed], batch_bytes, cudaMemcpyHostToDevice));	//copy the plane waves into global memory
-		cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(dev_F, N, dev_x, dev_y, dev_z, dev_w, batch_size);	//call the kernel
-		waves_processed += batch_size;													//increment the counter indicating how many waves have been processed
-	}
+	gpu_scalarwaves(dev_F, N, dev_x, dev_y, dev_z, dev_W, W.size());
  
 	cudaMemcpy(F, dev_F, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost);			//copy the field from device memory
  
@@ -311,15 +309,31 @@ void cpu_sum_scalarwaves(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, std::v
 	if(y != NULL) cudaFree(dev_y);
 	if(z != NULL) cudaFree(dev_z);
 	cudaFree(dev_F);
-	cudaFree(dev_w);
+#else
+	memset(F, 0, N * sizeof(stim::complex<T>));
+	T px, py, pz;
+	for(size_t i = 0; i < N; i++){										// for each element in the array
+		(x == NULL) ? px = 0 : px = x[i];								// test for NULL values
+		(y == NULL) ? py = 0 : py = y[i];
+		(z == NULL) ? pz = 0 : pz = z[i];
  
+		for(size_t s = 0; s < S; s++){
+			F[i] += w_array[s].pos(px, py, pz);						//sum all plane waves at this point
+		}
+	}
 #endif
 }
  
 /// Evaluates a single scalar plane wave w at N points by wrapping it in a one-element list and forwarding to the multi-wave function
 template<typename T>
 void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
 	std::vector< stim::scalarwave<T> > w_array(1, w);		//list containing one copy of w
-	cpu_sum_scalarwaves(F, N, x, y, z, w_array);	
+	cpu_scalarwaves(F, N, x, y, z, w_array);	//forward to the std::vector overload
+}
+
+/// Single-wave convenience overload: evaluates the plane wave w at N points by handing a one-element list to the std::vector overload of cpu_scalarwaves
+template<typename T>
+void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
+	std::vector< stim::scalarwave<T> > single;		//list that will hold exactly one wave
+	single.push_back(w);
+	cpu_scalarwaves(F, N, x, y, z, single);			//resolves to the std::vector overload
 }
  
  
@@ -331,7 +345,7 @@ void cpu_scalarwave(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scala
 /// @param A is the list of amplitudes for each wave
 /// @param S is the list of propagation directions for each wave
 template<typename T>
-CUDA_CALLABLE stim::complex<T> sum_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave<T> > W){
+CUDA_CALLABLE stim::complex<T> cpu_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave<T> > W){
 	size_t N = W.size();												//get the number of plane wave samples
 	stim::complex<T> field(0, 0);										//initialize the field to zero (0)
 	stim::vec3<T> k;													//allocate space for the direction vector
@@ -10,8 +10,8 @@ class aaboundingbox{
  
 public:
 	bool set;				//has the bounding box been set to include any points?
-	stim::vec<T> A;			//minimum point in the bounding box
-	stim::vec<T> B;			//maximum point in the bounding box
+	stim::vec3<T> A;			//minimum point in the bounding box
+	stim::vec3<T> B;			//maximum point in the bounding box
  
 	aaboundingbox(){					//constructor generates an empty bounding box
 		set = false;
@@ -21,7 +21,7 @@ public:
 	/// Test if a point is inside of the bounding box and returns true if it is.
  
 	/// @param p is the point to be tested
-	bool test(stim::vec<T> p){
+	bool test(stim::vec3<T> p){
  
 		for(unsigned d = 0; d < p.size(); p++){		//for each dimension
 			if(p[d] < A[d]) return false;			//if the point is less than the minimum bound, return false
@@ -33,7 +33,7 @@ public:
 	/// Expand the bounding box to include the specified point.
  
 	/// @param p is the point to be included
-	void expand(stim::vec<T> p){
+	void expand(stim::vec3<T> p){
  
 		if(!set){							//if the bounding box is empty, fill it with the current point
 			A = B = p;
@@ -47,12 +47,12 @@ public:
 	}
  
 	/// Return the center point of the bounding box (the midpoint of corners A and B) as a stim::vec3
-	stim::vec<T> center(){
+	stim::vec3<T> center(){
 		return (B + A) * 0.5;		//average of the minimum (A) and maximum (B) corners
 	}
  
 	/// Return the size of the bounding box (B - A, the extent along each axis) as a stim::vec3
-	stim::vec<T> size(){
+	stim::vec3<T> size(){
 		return (B - A);				//maximum corner minus minimum corner
 	}
  
@@ -11,32 +11,32 @@ namespace stim{
  
 class camera
 {
-	vec<float> d;	//direction that the camera is pointing
-	vec<float> p;	//position of the camera
-	vec<float> up;	//"up" direction
+	vec3<float> d;	//direction that the camera is pointing
+	vec3<float> p;	//position of the camera
+	vec3<float> up;	//"up" direction
 	float focus;		//focal length of the camera
 	float fov;
  
 	//private function makes sure that the up vector is orthogonal to the direction vector and both are normalized
 	void stabalize()
 	{
-		vec<float> side = up.cross(d);
+		vec3<float> side = up.cross(d);		//side vector: perpendicular to both up and d
 		up = d.cross(side);					//rebuild up so that it is perpendicular to d and side
 		up = up.norm();						//normalize the up vector
 		d = d.norm();						//normalize the direction vector
 	}
  
 public:
-	void setPosition(vec<float> pos)
+	void setPosition(vec3<float> pos)
 	{
 		p = pos;
 	}
-	void setPosition(float x, float y, float z){setPosition(vec<float>(x, y, z));}
+	void setPosition(float x, float y, float z){setPosition(vec3<float>(x, y, z));}
  
 	void setFocalDistance(float distance){focus = distance;}
 	void setFOV(float field_of_view){fov = field_of_view;}
  
-	void LookAt(vec<float> pos)
+	void LookAt(vec3<float> pos)
 	{
 		//find the new direction
 		d = pos - p;
@@ -47,22 +47,22 @@ public:
 		//stabalize the camera
 		stabalize();
 	}
-	void LookAt(float px, float py, float pz){LookAt(vec<float>(px, py, pz));}
-	void LookAt(vec<float> pos, vec<float> new_up){up = new_up; LookAt(pos);}
-	void LookAt(float px, float py, float pz, float ux, float uy, float uz){LookAt(vec<float>(px, py, pz), vec<float>(ux, uy, uz));}
+	void LookAt(float px, float py, float pz){LookAt(vec3<float>(px, py, pz));}
+	void LookAt(vec3<float> pos, vec3<float> new_up){up = new_up; LookAt(pos);}
+	void LookAt(float px, float py, float pz, float ux, float uy, float uz){LookAt(vec3<float>(px, py, pz), vec3<float>(ux, uy, uz));}
 	//translates the camera (without changing d or up) so that its focus point lands on (lx, ly, lz)
 	void LookAtDolly(float lx, float ly, float lz)
 	{
 		//find the current focus point
-		vec<float> f = p + focus*d;
-		vec<float> T = vec<float>(lx, ly, lz) - f;
+		vec3<float> f = p + focus*d;
+		vec3<float> T = vec3<float>(lx, ly, lz) - f;	//translation from the current focus point to the requested one
 		p = p + T;										//apply the same translation to the camera position
 	}
  
-	void Dolly(vec<float> direction)
+	void Dolly(vec3<float> direction)
 	{
 		p = p+direction;
 	}
-	void Dolly(float x, float y, float z){Dolly(vec<float>(x, y, z));}
+	void Dolly(float x, float y, float z){Dolly(vec3<float>(x, y, z));}
 	void Push(float delta)
 	{
 		if(delta > focus)
@@ -80,7 +80,7 @@ public:
 		qx.CreateRotation(theta_x, up[0], up[1], up[2]);
  
 		//y rotation is around the side axis
-		vec<float> side = up.cross(d);
+		vec3<float> side = up.cross(d);
 		quaternion<float> qy;
 		qy.CreateRotation(theta_y, side[0], side[1], side[2]);
  
@@ -118,28 +118,28 @@ public:
 	void OrbitFocus(float theta_x, float theta_y)
 	{
 		//find the focal point
-		vec<float> focal_point = p + focus*d;
+		vec3<float> focal_point = p + focus*d;
  
 		//center the coordinate system on the focal point
-		vec<float> centered = p - (focal_point - vec<float>(0, 0, 0));
+		vec3<float> centered = p - (focal_point - vec3<float>(0, 0, 0));
  
 		//create the x rotation (around the up vector)
 		quaternion<float> qx;
 		qx.CreateRotation(theta_x, up[0], up[1], up[2]);
-		centered = vec<float>(0, 0, 0) + qx.toMatrix3()*(centered - vec<float>(0, 0, 0));
+		centered = vec3<float>(0, 0, 0) + qx.toMatrix3()*(centered - vec3<float>(0, 0, 0));
  
 		//get a side vector for theta_y rotation
-		vec<float> side = up.cross((vec<float>(0, 0, 0) - centered).norm());
+		vec3<float> side = up.cross((vec3<float>(0, 0, 0) - centered).norm());
  
 		quaternion<float> qy;
 		qy.CreateRotation(theta_y, side[0], side[1], side[2]);
-		centered = vec<float>(0, 0, 0) + qy.toMatrix3()*(centered - vec<float>(0, 0, 0));
+		centered = vec3<float>(0, 0, 0) + qy.toMatrix3()*(centered - vec3<float>(0, 0, 0));
  
 		//perform the rotation on the centered camera position
 		//centered = final.toMatrix()*centered;
  
 		//re-position the camera
-		p = centered + (focal_point - vec<float>(0, 0, 0));
+		p = centered + (focal_point - vec3<float>(0, 0, 0));
  
 		//make sure we are looking at the focal point
 		LookAt(focal_point);
@@ -151,17 +151,17 @@ public:
  
 	//translates the camera position by v along the normalized up vector and by u along the normalized side vector (up x d)
 	void Slide(float u, float v)
 	{
-		vec<float> V = up.norm();
-		vec<float> U = up.cross(d).norm();
+		vec3<float> V = up.norm();				//unit vector along up
+		vec3<float> U = up.cross(d).norm();		//unit vector along the side direction
  
 		p = p + (V * v) + (U * u);				//only the position changes; d and up are untouched
 	}
  
 	//accessor methods
-	vec<float> getPosition(){return p;}
-	vec<float> getUp(){return up;}
-	vec<float> getDirection(){return d;}
-	vec<float> getLookAt(){return p + focus*d;}
+	vec3<float> getPosition(){return p;}
+	vec3<float> getUp(){return up;}
+	vec3<float> getDirection(){return d;}
+	vec3<float> getLookAt(){return p + focus*d;}
 	float getFOV(){return fov;}
  
 	//output the camera settings
@@ -182,9 +182,9 @@ public:
 	//constructor: places the camera at the origin, looking along +z with +y as the up direction and a focal distance of 1
 	//NOTE(review): fov is not assigned here, so getFOV() returns an indeterminate value until setFOV() is called - confirm the intended default
 	camera()
 	{
-		p = vec<float>(0, 0, 0);
-		d = vec<float>(0, 0, 1);
-		up = vec<float>(0, 1, 0);
+		p = vec3<float>(0, 0, 0);		//position
+		d = vec3<float>(0, 0, 1);		//view direction
+		up = vec3<float>(0, 1, 0);		//up direction
 		focus = 1;						//focal distance
  
 	}
@@ -2,7 +2,7 @@
 #define STIM_CYLINDER_H
 #include <iostream>
 #include <stim/math/circle.h>
-#include <stim/math/vector.h>
+#include <stim/math/vec3.h>
  
  
 namespace stim
@@ -25,11 +25,11 @@ class cylinder
  
 		///inits the cylinder from a list of points (inP) and radii (inM)
 		void
-		init(std::vector<stim::vec<T> > inP, std::vector<stim::vec<T> > inM)
+		init(std::vector<stim::vec3<T> > inP, std::vector<stim::vec<T> > inM)
 		{
 			mags = inM;
-			stim::vec<float> v1;
-			stim::vec<float> v2;
+			stim::vec3<float> v1;
+			stim::vec3<float> v2;
 			e.resize(inP.size());
 			if(inP.size() < 2)
 				return;
@@ -38,16 +38,16 @@ class cylinder
 			L.resize(inP.size());
 			T temp = (T)0;
 			L[0] = 0;
-			for(int i = 1; i < L.size(); i++)
+			for(size_t i = 1; i < L.size(); i++)
 			{
 				temp += (inP[i-1] - inP[i]).len();
 				L[i] = temp;
 			}
  
-			stim::vec<T> dr = (inP[1] - inP[0]).norm();
-			s = stim::circle<T>(inP[0], inM[0][0], dr, stim::vec<T>(1,0,0));
+			stim::vec3<T> dr = (inP[1] - inP[0]).norm();
+			s = stim::circle<T>(inP[0], inM[0][0], dr, stim::vec3<T>(1,0,0));
 			e[0] = s;
-			for(int i = 1; i < inP.size()-1; i++)
+			for(size_t i = 1; i < inP.size()-1; i++)
 			{
 				s.center(inP[i]);
 				v1 = (inP[i] - inP[i-1]).norm();
@@ -67,7 +67,7 @@ class cylinder
 		}
  
 		///returns the direction vector at point idx.
-		stim::vec<T>
+		stim::vec3<T>
 		d(int idx)
 		{
 			if(idx == 0)
@@ -81,15 +81,15 @@ class cylinder
 			else
 			{
 //				return (e[idx+1].P - e[idx].P).norm();
-				stim::vec<float> v1 = (e[idx].P-e[idx-1].P).norm();
-				stim::vec<float> v2 = (e[idx+1].P-e[idx].P).norm();
+				stim::vec3<float> v1 = (e[idx].P-e[idx-1].P).norm();
+				stim::vec3<float> v2 = (e[idx+1].P-e[idx].P).norm();
 				return (v1+v2).norm();			
 			} 
 	//		return e[idx].N;	
  
 		}
  
-		stim::vec<T>
+		stim::vec3<T>
 		d(T l, int idx)
 		{
 			if(idx == 0 || idx == e.size()-1)
@@ -144,13 +144,13 @@ class cylinder
 		///constructor to create a cylinder from a set of centerline points and their magnitudes (radii).
 		///@param inP:  Vector of stim vec3s composing the points of the centerline.
 		///@param inM:  Vector of stim vecs composing the radii of the centerline (same element type that init() and the zero-magnitude constructor use).
-		cylinder(std::vector<stim::vec<T> > inP, std::vector<stim::vec<T> > inM){
+		cylinder(std::vector<stim::vec3<T> > inP, std::vector<stim::vec<T> > inM){
 			init(inP, inM);				//init() takes the magnitudes as std::vector< stim::vec<T> >, so inM must keep that type
 		}
  
 		///Constructor defines a cylinder with centerline inP and magnitudes of zero
 		///@param inP: Vector of stim vecs composing the points of the centerline
-		cylinder(std::vector< stim::vec<T> > inP){
+		cylinder(std::vector< stim::vec3<T> > inP){
 			std::vector< stim::vec<T> > inM;						//create an array of arbitrary magnitudes
  
 			stim::vec<T> zero;
@@ -171,12 +171,12 @@ class cylinder
 		///Returns a position vector at the given p-value (p value ranges from 0 to 1).
 		///interpolates the position along the line.
 		///@param pvalue: the location of the in the cylinder, from 0 (beginning to 1).
-		stim::vec<T>
+		stim::vec3<T>
 		p(T pvalue)
 		{
 			if(pvalue < 0.0 || pvalue > 1.0)
 			{
-				return stim::vec<float>(-1,-1,-1);
+				return stim::vec3<float>(-1,-1,-1);
 			}
 			T l = pvalue*L[L.size()-1];
 			int idx = findIdx(l);
@@ -188,7 +188,7 @@ class cylinder
 		///Interpolates the radius along the line.
 		///@param l: the location of the in the cylinder.
 		///@param idx: integer location of the point closest to l but prior to it.
-		stim::vec<T>
+		stim::vec3<T>
 		p(T l, int idx)
 		{
 				T rat = (l-L[idx])/(L[idx+1]-L[idx]);
@@ -252,16 +252,16 @@ class cylinder
 		///in x, y, z coordinates. Theta is in degrees from 0 to 360.
 		///@param pvalue: the location of the in the cylinder, from 0 (beginning to 1).
 		///@param theta: the angle to the point of a circle.
-		stim::vec<T>
+		stim::vec3<T>
 		surf(T pvalue, T theta)
 		{
 			if(pvalue < 0.0 || pvalue > 1.0)
 			{
-				return stim::vec<float>(-1,-1,-1);
+				return stim::vec3<float>(-1,-1,-1);
 			} else {
 			T l = pvalue*L[L.size()-1];
 			int idx = findIdx(l);
-			stim::vec<T> ps = p(l, idx); 
+			stim::vec3<T> ps = p(l, idx); 
 			T m = r(l, idx);
 			s = e[idx];
 			s.center(ps);
@@ -273,10 +273,10 @@ class cylinder
  
 		///returns a vector of points necessary to create a circle at every position in the fiber.
 		///@param sides: the number of sides of each circle.	
-		std::vector<std::vector<vec<T> > >
+		std::vector<std::vector<vec3<T> > >
 		getPoints(int sides)
 		{
-			std::vector<std::vector <vec<T> > > points;
+			std::vector<std::vector <vec3<T> > > points;
 			points.resize(e.size());
 			for(int i = 0; i < e.size(); i++)
 			{
@@ -293,7 +293,7 @@ class cylinder
 		}
 		/// Allows a point on the centerline to be accessed using bracket notation
 		/// @param i is the index of the centerline point; the position is returned by value and no bounds check is performed here
  
-		vec<T> operator[](unsigned int i){
+		vec3<T> operator[](unsigned int i){
 			return e[i].P;				//P member of the i-th element of e
 		}
  
@@ -309,7 +309,7 @@ class cylinder
 			T M = 0;						//initialize the integral to zero
 			T m0, m1;						//allocate space for both magnitudes in a single segment
  
-			//vec<T> p0, p1;					//allocate space for both points in a single segment
+			//vec3<T> p0, p1;					//allocate space for both points in a single segment
  
 			m0 = mags[0][m];				//initialize the first point and magnitude to the first point in the cylinder
 			//p0 = pos[0];
@@ -325,7 +325,7 @@ class cylinder
 				if(p > 1) len = (L[p-1] - L[p-2]);		//calculate the segment length using the L array
  
 				//add the average magnitude, weighted by the segment length
-				M += (m0 + m1)/2.0 * len;
+				M += (m0 + m1)/(T)2.0 * len;
  
 				m0 = m1;								//move to the next segment by shifting points
 			}
@@ -345,21 +345,21 @@ class cylinder
 		/// @param spacing is the maximum spacing allowed between sample points
 		cylinder<T> resample(T spacing){
  
-			std::vector< vec<T> > result;
+			std::vector< vec3<T> > result;
  
-			vec<T> p0 = e[0].P;								//initialize p0 to the first point on the centerline
-			vec<T> p1;
+			vec3<T> p0 = e[0].P;								//initialize p0 to the first point on the centerline
+			vec3<T> p1;
 			unsigned N = size();							//number of points in the current centerline
  
 			//for each line segment on the centerline
 			for(unsigned int i = 1; i < N; i++){
 				p1 = e[i].P;								//get the second point in the line segment
  
-				vec<T> v = p1 - p0;							//calculate the vector between these two points
+				vec3<T> v = p1 - p0;							//calculate the vector between these two points
 				T d = v.len();								//calculate the distance between these two points (length of the line segment)
  
-				unsigned nsteps = d / spacing+1;		//calculate the number of steps to take along the segment to meet the spacing criteria
-				T stepsize = 1.0 / nsteps;			//calculate the parametric step size between new centerline points
+				size_t nsteps = (size_t)std::ceil(d / spacing);		//calculate the number of steps to take along the segment to meet the spacing criteria
+				T stepsize = (T)1.0 / nsteps;			//calculate the parametric step size between new centerline points
  
 				//for each step along the line segment
 				for(unsigned s = 0; s < nsteps; s++){