Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib

David Mayerich
2 parents 0bb6cf51 0288346a
Showing 21 changed files with 2279 additions and 1009 deletions Show diff stats
stim/cuda/cudatools/devices.h
stim/cuda/sharedmem.cuh
stim/math/bessel.h
stim/math/complex.h
stim/math/constants.h
stim/math/legendre.h
stim/math/matrix.h
stim/math/meshgrid.h
stim/math/quaternion.h
stim/math/vec3.h
stim/math/vector.h
stim/optics/planewave.h
stim/optics/scalarbeam.h
stim/optics/scalarwave.h
stim/optics/beam.h → stim/optics_old/beam.h
stim/optics/efield.cuh → stim/optics_old/efield.cuh
stim/optics/esphere.cuh → stim/optics_old/esphere.cuh
stim/optics/halfspace.cuh → stim/optics_old/halfspace.cuh
stim/optics/material.h → stim/optics_old/material.h
stim/optics/mirst-1d.cuh → stim/optics_old/mirst-1d.cuh
@@ -15,7 +15,7 @@ int maxThreadsPerBlock()
 }
  
 extern "C"
-int sharedMemPerBlock()
+size_t sharedMemPerBlock()
 {
 	int device;
 	cudaGetDevice(&device);		//get the id of the current device
@@ -23,6 +23,16 @@ int sharedMemPerBlock()
 	cudaGetDeviceProperties(&props, device);
 	return props.sharedMemPerBlock;
 }
+
+/// Query the total amount of constant memory reported for the current CUDA device.
+/// @return the totalConstMem field of the device properties, or 0 if either runtime query fails
+extern "C"
+size_t constMem()
+{
+	int device;
+	if(cudaGetDevice(&device) != cudaSuccess)						//get the id of the current device
+		return 0;													//no usable device id, so nothing can be queried
+	cudaDeviceProp props;											//device property structure
+	if(cudaGetDeviceProperties(&props, device) != cudaSuccess)		//props is not filled in when the query fails
+		return 0;
+	return props.totalConstMem;
+}
 }	//end namespace rts
  
 #endif
@@ -5,7 +5,7 @@
 namespace stim{
 	namespace cuda{
  
-		// Copies values from global memory to shared memory, optimizing threads
+		// Copies values from texture memory to shared memory, optimizing threads
 		template<typename T>
 		__device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
 										 unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
@@ -35,6 +35,19 @@ namespace stim{
 			}
 		}
  
+		// Cooperative copy of N values from src to dest (written for global -> shared memory copies).
+		// Each caller passes its own index tid and the number of participating threads nt; the
+		// caller with index tid copies elements tid, tid + nt, tid + 2*nt, ... that lie below N.
+		// No synchronization is performed here.
+		template<typename T>
+		__device__ void sharedMemcpy(T* dest, T* src, size_t N, size_t tid, size_t nt){
+
+			//walk the copy region in steps of nt, starting at this thread's own index
+			for(size_t idx = tid; idx < N; idx += nt)
+				dest[idx] = src[idx];
+		}
+
  
 	}
 }
@@ -17,6 +17,11 @@ static complex<double> czero(0.0,0.0);
 template< typename P >
 P gamma(P x)
 {
+	const P EPS = numeric_limits<P>::epsilon();
+	const P FPMIN_MAG = numeric_limits<P>::min();
+	const P FPMIN = numeric_limits<P>::lowest();
+	const P FPMAX = numeric_limits<P>::max();
+
     int i,k,m;
     P ga,gr,r,z;
  
@@ -47,7 +52,7 @@ P gamma(P x)
        -0.54e-14,
         0.14e-14};
  
-    if (x > 171.0) return 1e308;    // This value is an overflow flag.
+    if (x > 171.0) return FPMAX;    // This value is an overflow flag.
     if (x == (int)x) {
         if (x > 0.0) {
             ga = 1.0;               // use factorial
@@ -56,7 +61,7 @@ P gamma(P x)
             }
          }
          else
-            ga = 1e308;
+            ga = FPMAX;
      }
      else {
         if (fabs(x) > 1.0) {
@@ -89,6 +94,11 @@ template<typename P>
 int bessjy01a(P x,P &j0,P &j1,P &y0,P &y1,
     P &j0p,P &j1p,P &y0p,P &y1p)
 {
+	const P EPS = numeric_limits<P>::epsilon();
+	const P FPMIN_MAG = numeric_limits<P>::min();
+	const P FPMIN = numeric_limits<P>::lowest();
+	const P FPMAX = numeric_limits<P>::max();
+
     P x2,r,ec,w0,w1,r0,r1,cs0,cs1;
     P cu,p0,q0,p1,q1,t1,t2;
     int k,kz;
@@ -157,12 +167,12 @@ int bessjy01a(P x,P &j0,P &j1,P &y0,P &y1,
     if (x == 0.0) {
         j0 = 1.0;
         j1 = 0.0;
-        y0 = -1e308;
-        y1 = -1e308;
+        y0 = -FPMIN;
+        y1 = -FPMIN;
         j0p = 0.0;
         j1p = 0.5;
-        y0p = 1e308;
-        y1p = 1e308;
+        y0p = FPMAX;
+        y1p = FPMAX;
         return 0;
     }
     x2 = x*x;
@@ -329,7 +339,7 @@ int msta1(P x,int mp)
     for (i=0;i<20;i++) {
         nn = (int)(n1-(n1-n0)/(1.0-f0/f1));
         f = 0.5*log10(6.28*nn)-nn*log10(1.36*a0/nn)-mp;
-        if (abs(nn-n1) < 1) break;
+        if (std::abs(nn-n1) < 1) break;
         n0 = n1;
         f0 = f1;
         n1 = nn;
@@ -361,7 +371,7 @@ int msta2(P x,int n,int mp)
     for (i=0;i<20;i++) {
         nn = (int)(n1-(n1-n0)/(1.0-f0/f1));
         f = 0.5*log10(6.28*nn)-nn*log10(1.36*a0/nn)-obj;
-        if (abs(nn-n1) < 1) break;
+        if (std::abs(nn-n1) < 1) break;
         n0 = n1;
         f0 = f1;
         n1 = nn;
@@ -596,21 +606,26 @@ int bessjyv(P v,P x,P &vm,P *jv,P *yv,
     P b,ec,w0,w1,bju0,bju1,pv0,pv1,byvk;
     int j,k,l,m,n,kz;
  
+	const P EPS = numeric_limits<P>::epsilon();
+	const P FPMIN_MAG = numeric_limits<P>::min();
+	const P FPMIN = numeric_limits<P>::lowest();
+	const P FPMAX = numeric_limits<P>::max();
+
     x2 = x*x;
     n = (int)v;
     v0 = v-n;
     if ((x < 0.0) || (v < 0.0)) return 1;
-    if (x < 1e-15) {
+    if (x < EPS) {
         for (k=0;k<=n;k++) {
             jv[k] = 0.0;
-            yv[k] = -1e308;
+            yv[k] = FPMIN;
             djv[k] = 0.0;
-            dyv[k] = 1e308;
+            dyv[k] = FPMAX;
             if (v0 == 0.0) {
                 jv[0] = 1.0;
                 djv[1] = 0.5;
             }
-            else djv[0] = 1e308;
+            else djv[0] = FPMAX;
         }
         vm = v;
         return 0;
@@ -623,7 +638,7 @@ int bessjyv(P v,P x,P &vm,P *jv,P *yv,
             for (k=1;k<=40;k++) {
                 r *= -0.25*x2/(k*(k+vl));
                 bjvl += r;
-                if (fabs(r) < fabs(bjvl)*1e-15) break;
+                if (fabs(r) < fabs(bjvl)*EPS) break;
             }
             vg = 1.0 + vl;
             a = pow(0.5*x,vl)/gamma(vg);
@@ -686,7 +701,7 @@ int bessjyv(P v,P x,P &vm,P *jv,P *yv,
         if (m < n) n = m;
         else m = msta2(x,n,15);
         f2 = 0.0;
-        f1 = 1.0e-100;
+        f1 = FPMIN_MAG;
         for (k=m;k>=0;k--) {
             f = 2.0*(v0+k+1.0)*f1/x-f2;
             if (k <= n) jv[k] = f;
@@ -763,20 +778,26 @@ int bessjyv(P v,P x,P &vm,P *jv,P *yv,
  
 template<typename P>
 int bessjyv_sph(int v, P z, P &vm, P* cjv,
-    P* cyv, P* cjvp, P* cyvp)
-{
+    P* cyv, P* cjvp, P* cyvp){
+	
     //first, compute the bessel functions of fractional order
-    bessjyv(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
+    bessjyv<P>(v + (P)0.5, z, vm, cjv, cyv, cjvp, cyvp);
+
+	if(z == 0){													//handle degenerate case of z = 0
+		memset(cjv, 0, sizeof(P) * (v+1));
+		cjv[0] = 1;
+	}
  
     //iterate through each and scale
-    for(int n = 0; n<=v; n++)
-    {
+    for(int n = 0; n<=v; n++){
  
-        cjv[n] = cjv[n] * sqrt(rtsPI/(z * 2.0));
-        cyv[n] = cyv[n] * sqrt(rtsPI/(z * 2.0));
+		if(z != 0){												//handle degenerate case of z = 0
+			cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
+			cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
+		}
  
-        cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(rtsPI / (z * 2.0));
-        cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(rtsPI / (z * 2.0));
+        cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0));
+        cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0));
     }
  
 	return 0;
@@ -1498,11 +1519,11 @@ int cbessjyva_sph(int v,complex<P> z,P &vm,complex<P>*cjv,
     for(int n = 0; n<=v; n++)
     {
  
-        cjv[n] = cjv[n] * sqrt(rtsPI/(z * 2.0));
-        cyv[n] = cyv[n] * sqrt(rtsPI/(z * 2.0));
+        cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
+        cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
  
-        cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(rtsPI / (z * 2.0));
-        cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(rtsPI / (z * 2.0));
+        cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0));
+        cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0));
     }
  
 	return 0;
-/*RTS Complex number class.  This class is CUDA compatible,
-and can therefore be used in CUDA code and on CUDA devices.
-*/
+/// CUDA compatible complex number class
  
-#ifndef RTS_COMPLEX
-#define RTS_COMPLEX
+#ifndef STIM_COMPLEX
+#define STIM_COMPLEX
  
-#include "../cuda/callable.h"
+#include "../cuda/cudatools/callable.h"
 #include <cmath>
 #include <string>
 #include <sstream>
@@ -230,12 +228,6 @@ struct complex
 		return result;
 	}
  
-	/*CUDA_CALLABLE complex<T> pow(int y)
-	{
-
-        return pow((double)y);
-	}*/
-
 	CUDA_CALLABLE complex<T> pow(T y)
 	{
 		complex<T> result;
@@ -328,8 +320,31 @@ struct complex
 		return *this;
 	}
  
+	
+
 };
  
+/// Extract the real component of every element in an array of complex values
+/// @param r is the output array that receives c[i].real() for each i in [0, n)
+/// @param c is the input array of complex values
+/// @param n is the number of elements to process
+template<typename T>
+static void real(T* r, complex<T>* c, size_t n){
+	size_t idx = 0;
+	while(idx < n){
+		r[idx] = c[idx].real();
+		++idx;
+	}
+}
+
+/// Extract the imaginary component of every element in an array of complex values
+/// @param r is the output array that receives c[i].imag() for each i in [0, n)
+/// @param c is the input array of complex values
+/// @param n is the number of elements to process
+template<typename T>
+static void imag(T* r, complex<T>* c, size_t n){
+	size_t idx = 0;
+	while(idx < n){
+		r[idx] = c[idx].imag();
+		++idx;
+	}
+}
+
+/// Compute the magnitude of every element in an array of complex values
+/// @param m is the output array that receives c[i].abs() for each i in [0, n)
+/// @param c is the input array of complex values
+/// @param n is the number of elements to process
+template<typename T>
+static void abs(T* m, complex<T>* c, size_t n){
+	size_t idx = 0;
+	while(idx < n){
+		m[idx] = c[idx].abs();
+		++idx;
+	}
+}
+
 }	//end RTS namespace
  
 //addition
@@ -432,17 +447,6 @@ CUDA_CALLABLE static T imag(stim::complex<T> a)
     return a.i;
 }
  
-//trigonometric functions
-//template<class A>
-/*CUDA_CALLABLE static stim::complex<float> sinf(const stim::complex<float> x)
-{
-	stim::complex<float> result;
-	result.r = sinf(x.r) * coshf(x.i);
-	result.i = cosf(x.r) * sinhf(x.i);
-
-	return result;
-}*/
-
 template<class A>
 CUDA_CALLABLE stim::complex<A> sin(const stim::complex<A> x)
 {
@@ -453,17 +457,6 @@ CUDA_CALLABLE stim::complex<A> sin(const stim::complex<A> x)
 	return result;
 }
  
-//floating point template
-//template<class A>
-/*CUDA_CALLABLE static stim::complex<float> cosf(const stim::complex<float> x)
-{
-	stim::complex<float> result;
-	result.r = cosf(x.r) * coshf(x.i);
-	result.i = -(sinf(x.r) * sinhf(x.i));
-
-	return result;
-}*/
-
 template<class A>
 CUDA_CALLABLE stim::complex<A> cos(const stim::complex<A> x)
 {
@@ -496,10 +489,4 @@ std::istream& operator>>(std::istream& is, stim::complex<A>& x)
     return is;		//return the stream
 }
  
-//#if __GNUC__ > 3 && __GNUC_MINOR__ > 7
-//template<class T> using rtsComplex = stim::complex<T>;
-//#endif
-
-
-
 #endif
-#ifndef RTS_CONSTANTS_H
-#define RTS_CONSTANTS_H
+#ifndef STIM_CONSTANTS_H
+#define STIM_CONSTANTS_H
  
-#define stimPI 	3.14159
-#define stimTAU	2 * rtsPI
+namespace stim{
+	const double PI		=	3.1415926535897932384626433832795028841971693993751058209749445923078164062862;
+	const double TAU	=	2 * stim::PI;
+}
  
 #endif
 #ifndef RTS_LEGENDRE_H
 #define RTS_LEGENDRE_H
  
-#include "rts/cuda/callable.h"
+#include "../cuda/cudatools/callable.h"
  
 namespace stim{
  
@@ -24,9 +24,11 @@ CUDA_CALLABLE void shift_legendre(int n, T x, T& P0, T& P1)
 	P1 = Pnew;
 }
  
+/// Iteratively evaluates the Legendre polynomials for orders l = [0 n]
 template <typename T>
 CUDA_CALLABLE void legendre(int n, T x, T* P)
 {
+	if(n < 0) return;
     P[0] = 1;
  
     if(n >= 1)
@@ -50,10 +50,8 @@ struct matrix
 		return *this;
 	}
  
-
 	template<typename Y>
-	CUDA_CALLABLE vec<Y> operator*(vec<Y> rhs)
-	{
+	vec<Y> operator*(vec<Y> rhs){
 		unsigned int N = rhs.size();
  
 		vec<Y> result;
@@ -66,6 +64,16 @@ struct matrix
 		return result;
 	}
  
+	/// Multiply this matrix by a 3-element vector, using the entries (row, col) with row, col in [0, 2]
+	/// @param rhs is the vector on the right-hand side of the product
+	template<typename Y>
+	CUDA_CALLABLE vec3<Y> operator*(vec3<Y> rhs){
+		vec3<Y> product = 0;										//every component starts at zero
+		for(int row = 0; row < 3; row++){							//one output component per matrix row
+			for(int col = 0; col < 3; col++){						//accumulate the row/vector inner product
+				product[row] += (*this)(row, col) * rhs[col];
+			}
+		}
+		return product;
+	}
+
 	std::string toStr()
 	{
 		std::stringstream ss;
@@ -82,10 +90,6 @@ struct matrix
  
 		return ss.str();
 	}
-
-
-
-
 };
  
 }	//end namespace rts
+#ifndef STIM_MESHGRID_H
+#define STIM_MESHGRID_H
+
+namespace stim{
+
+	/// Build the coordinate arrays of a 2D grid from a pair of 1D position arrays (see Matlab)
+	/// The grid point (xi, yi) is stored at linear index yi * nx + xi in both output arrays.
+	/// @param X is an [nx x ny] array that will store the X coordinates for each 2D point
+	/// @param Y is an [nx x ny] array that will store the Y coordinates for each 2D point
+	/// @param x is an [nx] array that provides the positions of grid points in the x direction
+	/// @param nx is the number of grid points in the x direction
+	/// @param y is an [ny] array that provides the positions of grid points in the y direction
+	/// @param ny is the number of grid points in the y direction
+	template<typename T>
+	void meshgrid(T* X, T* Y, T* x, size_t nx, T* y, size_t ny){
+		for(size_t row = 0; row < ny; row++){						//one pass for each position in y
+			const size_t offset = row * nx;							//linear index of the first point of this pass
+			for(size_t col = 0; col < nx; col++){					//one grid point for each position in x
+				X[offset + col] = x[col];
+				Y[offset + col] = y[row];
+			}
+		}
+	}
+
+	/// Creates an array of n equally spaced values spanning the closed range [xmin xmax] (see Matlab)
+	/// The first value is exactly xmin and, when n > 1, the last value is exactly xmax.
+	/// @param X is an array of length n that stores the values
+	/// @param xmin is the start point of the array
+	/// @param xmax is the end point of the array
+	/// @param n is the number of points in the array (n = 0 writes nothing, n = 1 writes only xmin)
+	template<typename T>
+	void linspace(T* X, T xmin, T xmax, size_t n){
+		if(n == 0) return;											//nothing to fill
+		X[0] = xmin;												//the first point is always the start of the range
+		T alpha;
+		for(size_t i = 1; i < n; i++){
+			alpha = (T)i / (T)(n - 1);								//dividing by (n - 1) makes the last point land on xmax
+			X[i] = (1 - alpha) * xmin + alpha * xmax;
+		}
+	}
+
+
+}
+
+
+#endif
 \ No newline at end of file
@@ -26,13 +26,13 @@ public:
  
 	CUDA_CALLABLE void CreateRotation(T theta, T ux, T uy, T uz){
  
-		vec<T> u(ux, uy, uz);
+		vec3<T> u(ux, uy, uz);
 		CreateRotation(theta, u);		
 	}
  
-	CUDA_CALLABLE void CreateRotation(T theta, vec<T> u){
+	CUDA_CALLABLE void CreateRotation(T theta, vec3<T> u){
  
-		vec<T> u_hat = u.norm();
+		vec3<T> u_hat = u.norm();
  
 		//assign the given Euler rotation to this quaternion
 		w = (T)cos(theta/2);
@@ -41,9 +41,9 @@ public:
 		z = u_hat[2]*(T)sin(theta/2);
 	}
  
-	void CreateRotation(vec<T> from, vec<T> to){
+	CUDA_CALLABLE void CreateRotation(vec3<T> from, vec3<T> to){
  
-		vec<T> r = from.cross(to);			//compute the rotation vector
+		vec3<T> r = from.cross(to);			//compute the rotation vector
 		T theta = asin(r.len());				//compute the angle of the rotation about r
 		//deal with a zero vector (both k and kn point in the same direction)
 		if(theta == (T)0){
+#ifndef STIM_VEC3_H
+#define STIM_VEC3_H
+
+
+#include <cmath>
+#include <sstream>
+#include <string>
+
+#include <stim/cuda/cudatools/callable.h>
+
+
+namespace stim{
+
+
+/// A class designed to act as a 3D vector with CUDA compatibility
+
+/// The three components are stored in a fixed-size array and are addressed by index (0, 1, 2).
+template<typename T>
+class vec3{
+
+protected:
+	T ptr[3];												//storage for the three components
+
+public:
+
+	/// Default constructor (the components are left uninitialized)
+	CUDA_CALLABLE vec3(){}
+
+	/// Create a vector with all three components set to the same value
+	CUDA_CALLABLE vec3(T v){
+		ptr[0] = ptr[1] = ptr[2] = v;
+	}
+
+	/// Create a vector from three individual components
+	CUDA_CALLABLE vec3(T x, T y, T z){
+		ptr[0] = x;
+		ptr[1] = y;
+		ptr[2] = z;
+	}
+
+	/// Copy constructor
+	CUDA_CALLABLE vec3( const vec3<T>& other){
+		ptr[0] = other.ptr[0];
+		ptr[1] = other.ptr[1];
+		ptr[2] = other.ptr[2];
+	}
+
+	/// Access an element using an index (no bounds checking is performed)
+	CUDA_CALLABLE T& operator[](int idx){
+		return ptr[idx];
+	}
+
+	/// Read-only access to an element, so that constant vectors can be indexed as well
+	CUDA_CALLABLE const T& operator[](int idx) const{
+		return ptr[idx];
+	}
+
+/// Casting operator. Creates a new vector with a new type U.
+	template< typename U >
+	CUDA_CALLABLE operator vec3<U>(){
+		//vec3<U> is a different class than vec3<T>, so its protected storage is not accessible here;
+		//	the result is built through the public three-component constructor instead
+		return vec3<U>((U)ptr[0], (U)ptr[1], (U)ptr[2]);
+	}
+
+	/// Computes the squared Euclidean length (useful for several operations where only >, =, or < matter)
+	CUDA_CALLABLE T len_sq() const{
+		return ptr[0] * ptr[0] + ptr[1] * ptr[1] + ptr[2] * ptr[2];
+	}
+
+	/// Computes the Euclidean length of the vector
+	CUDA_CALLABLE T len() const{
+		return sqrt(len_sq());
+	}
+	
+
+	/// Convert the vector from cartesian to spherical coordinates (x, y, z -> r, theta, phi)
+
+	/// theta = atan2(y, x), which lies in [-pi, pi], and phi = acos(z / r), which lies in [0, pi].
+	/// phi is set to 0 for a zero-length vector in order to avoid a division by zero.
+	CUDA_CALLABLE vec3<T> cart2sph() const{
+		vec3<T> sph;
+		sph.ptr[0] = len();
+		sph.ptr[1] = std::atan2(ptr[1], ptr[0]);
+		if(sph.ptr[0] == 0)
+			sph.ptr[2] = 0;
+		else
+			sph.ptr[2] = std::acos(ptr[2] / sph.ptr[0]);
+		return sph;
+	}
+
+	/// Convert the vector from spherical to cartesian coordinates (r, theta, phi -> x, y, z)
+
+	/// Uses the same convention as cart2sph(): component 0 is r, component 1 is theta and component 2 is phi.
+	CUDA_CALLABLE vec3<T> sph2cart() const{
+		vec3<T> cart;
+		cart.ptr[0] = ptr[0] * std::cos(ptr[1]) * std::sin(ptr[2]);
+		cart.ptr[1] = ptr[0] * std::sin(ptr[1]) * std::sin(ptr[2]);
+		cart.ptr[2] = ptr[0] * std::cos(ptr[2]);
+
+		return cart;
+	}
+
+	/// Computes the normalized vector (where each coordinate is divided by the L2 norm)
+
+	/// No special handling is provided for a zero-length vector (the division is by len() == 0).
+	CUDA_CALLABLE vec3<T> norm() const{
+        T l = len();						//compute the vector length
+        return (*this) / l;
+	}
+
+	/// Computes the cross product of a 3-dimensional vector
+
+	/// @param rhs is the right-hand-side operand (the result is this x rhs)
+	CUDA_CALLABLE vec3<T> cross(const vec3<T> rhs) const{
+
+		vec3<T> result;
+
+		result[0] = (ptr[1] * rhs.ptr[2] - ptr[2] * rhs.ptr[1]);
+		result[1] = (ptr[2] * rhs.ptr[0] - ptr[0] * rhs.ptr[2]);
+		result[2] = (ptr[0] * rhs.ptr[1] - ptr[1] * rhs.ptr[0]);
+
+		return result;
+	}
+
+	/// Compute the Euclidean inner (dot) product
+    CUDA_CALLABLE T dot(vec3<T> rhs) const{
+        return ptr[0] * rhs.ptr[0] + ptr[1] * rhs.ptr[1] + ptr[2] * rhs.ptr[2];
+    }
+
+	/// Arithmetic addition operator
+
+    /// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator+(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs[0];
+		result.ptr[1] = ptr[1] + rhs[1];
+		result.ptr[2] = ptr[2] + rhs[2];
+		return result;
+	}
+
+	/// Arithmetic addition to a scalar
+
+	/// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator+(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs;
+		result.ptr[1] = ptr[1] + rhs;
+		result.ptr[2] = ptr[2] + rhs;
+		return result;
+	}
+
+	/// Arithmetic subtraction operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator-(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs[0];
+		result.ptr[1] = ptr[1] - rhs[1];
+		result.ptr[2] = ptr[2] - rhs[2];
+		return result;
+	}
+	/// Arithmetic subtraction of a scalar
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator-(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs;
+		result.ptr[1] = ptr[1] - rhs;
+		result.ptr[2] = ptr[2] - rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar multiplication operator
+
+	/// @param rhs is the right-hand-side operator for the multiplication
+	CUDA_CALLABLE vec3<T> operator*(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] * rhs;
+		result.ptr[1] = ptr[1] * rhs;
+		result.ptr[2] = ptr[2] * rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar division operator
+
+	/// @param rhs is the right-hand-side operator for the division
+	CUDA_CALLABLE vec3<T> operator/(T rhs) const{
+		return (*this) * ((T)1.0/rhs);
+	}
+
+	/// Multiplication by a scalar, followed by assignment
+	CUDA_CALLABLE vec3<T> operator*=(T rhs){
+		ptr[0] = ptr[0] * rhs;
+		ptr[1] = ptr[1] * rhs;
+		ptr[2] = ptr[2] * rhs;
+		return *this;
+	}
+
+	/// Addition and assignment
+	CUDA_CALLABLE vec3<T> operator+=(vec3<T> rhs){
+		ptr[0] = ptr[0] + rhs[0];			//add component-wise (rhs is a vector, not a scalar)
+		ptr[1] = ptr[1] + rhs[1];
+		ptr[2] = ptr[2] + rhs[2];
+		return *this;
+	}
+
+	/// Assign a scalar to all values
+	CUDA_CALLABLE vec3<T> & operator=(T rhs){
+		ptr[0] = rhs;
+		ptr[1] = rhs;
+		ptr[2] = rhs;
+		return *this;
+	}
+
+	/// Casting and assignment
+	template<typename Y>
+	CUDA_CALLABLE vec3<T> & operator=(vec3<Y> rhs){
+		//read rhs through its public index operator: the storage of vec3<Y> is protected
+		ptr[0] = (T)rhs[0];
+		ptr[1] = (T)rhs[1];
+		ptr[2] = (T)rhs[2];
+		return *this;
+	}
+
+	/// Unary minus (returns the negative of the vector)
+	CUDA_CALLABLE vec3<T> operator-() const{
+		vec3<T> result;
+		result.ptr[0] = -ptr[0];
+		result.ptr[1] = -ptr[1];
+		result.ptr[2] = -ptr[2];
+		return result;
+	}
+
+
+	/// Outputs the vector as a string in the form "[x, y, z]"
+	std::string str() const{
+		std::stringstream ss;
+
+		const size_t N = 3;					//a vec3 always stores exactly three components
+
+		ss<<"[";
+		for(size_t i=0; i<N; i++)
+		{
+			ss<<ptr[i];
+			if(i != N-1)
+				ss<<", ";
+		}
+		ss<<"]";
+
+		return ss.str();
+	}
+	};						//end class vec3
+}							//end namespace stim
+
+/// Multiply a vector by a constant when the vector is on the right hand side
+
+/// Marked CUDA_CALLABLE like the member operator it forwards to.
+/// @param lhs is the scalar on the left-hand side
+/// @param rhs is the vector on the right-hand side
+template <typename T>
+CUDA_CALLABLE stim::vec3<T> operator*(T lhs, stim::vec3<T> rhs){
+    return rhs * lhs;
+}
+
+#endif
 \ No newline at end of file
-#ifndef RTS_VECTOR_H
-#define RTS_VECTOR_H
+#ifndef STIM_VECTOR_H
+#define STIM_VECTOR_H
  
 #include <iostream>
 #include <cmath>
@@ -11,8 +11,6 @@
 namespace stim
 {
  
-
-
 template <class T>
 struct vec : public std::vector<T>
 {
-#ifndef RTS_PLANEWAVE
-#define RTS_PLANEWAVE
+#ifndef STIM_PLANEWAVE_H
+#define STIM_PLANEWAVE_H
  
 #include <string>
 #include <sstream>
+#include <cmath>
  
 #include "../math/vector.h"
 #include "../math/quaternion.h"
 #include "../math/constants.h"
 #include "../math/plane.h"
-#include "../cuda/callable.h"
-
-/*Basic conversions used here (assuming a vacuum)
-	lambda =
-*/
+#include "../math/complex.h"
  
 namespace stim{
+	namespace optics{
+
+		/// Evaluate the scalar field of a plane wave at the single point (x, y, z)
+
+		/// The result is A * exp(i * (k . p)), where p = (x, y, z) and k = (kx, ky, kz).
+		/// @param x is the x-coordinate of the point
+		/// @param y is the y-coordinate of the point
+		/// @param z is the z-coordinate of the point
+		/// @param A is the amplitude of the plane wave, specifically the field at (0, 0, 0)
+		/// @param kx is the k-vector component in the x direction
+		/// @param ky is the k-vector component in the y direction
+		/// @param kz is the k-vector component in the z direction
+		template<typename T>
+		stim::complex<T> planewave_scalar(T x, T y, T z, stim::complex<T> A, T kx, T ky, T kz){
+			T phase = x * kx + y * ky + z * kz;					//dot product k . p gives the phase accumulated at p
+			return A * exp(stim::complex<T>(0, phase));			//apply the phase shift to the amplitude at the origin
+		}
+
+		/// Evaluate the scalar field of a plane wave at N positions, one planewave_scalar() call per position
+
+		/// @param field is a pre-allocated block of memory that will store the complex field at all points
+		/// @param N is the number of field values to be evaluated
+		/// @param x is a set of x coordinates defining positions within the field (NULL implies that all values are zero)
+		/// @param y is a set of y coordinates defining positions within the field (NULL implies that all values are zero)
+		/// @param z is a set of z coordinates defining positions within the field (NULL implies that all values are zero)
+		/// @param A is the amplitude of the plane wave, specifically the field at (0, 0, 0)
+		/// @param kx is the k-vector component in the x direction
+		/// @param ky is the k-vector component in the y direction
+		/// @param kz is the k-vector component in the z direction
+		template<typename T>
+		void cpu_planewave_scalar(stim::complex<T>* field, size_t N, T* x, T* y = NULL, T* z = NULL, stim::complex<T> A = 1.0, T kx = 0.0, T ky = 0.0, T kz = 0.0){
+			for(size_t idx = 0; idx < N; idx++){									// visit every requested position
+				T px = 0;															// a missing coordinate array means the coordinate is zero
+				T py = 0;
+				T pz = 0;
+				if(x != NULL) px = x[idx];
+				if(y != NULL) py = y[idx];
+				if(z != NULL) pz = z[idx];
+
+				field[idx] = planewave_scalar(px, py, pz, A, kx, ky, kz);			// evaluate the field at this single position
+			}
+		}
  
 template<typename T>
 class planewave{
  
 protected:
  
-	vec<T> k;	//k = tau / lambda
-	vec< complex<T> > E0;		//amplitude
-	//T phi;
-
-	CUDA_CALLABLE planewave<T> bend(rts::vec<T> kn) const{
+	stim::vec<T> k;							//k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
+	stim::vec< stim::complex<T> > E0;		//amplitude (for a scalar plane wave, only E0[0] is used)
  
-		vec<T> kn_hat = kn.norm();				//normalize the new k
-		vec<T> k_hat = k.norm();				//normalize the current k
+	/// Bend a plane wave via refraction, given that the new propagation direction is known
+	CUDA_CALLABLE planewave<T> bend(stim::vec<T> kn) const{
  
-		//std::cout<<"PLANE WAVE BENDING------------------"<<std::endl;
-		//std::cout<<"kn_hat: "<<kn_hat<<"     k_hat: "<<k_hat<<std::endl;
+		stim::vec<T> kn_hat = kn.norm();				//normalize the new k
+		stim::vec<T> k_hat = k.norm();					//normalize the current k
  
-		planewave<T> new_p;						//create a new plane wave
+		planewave<T> new_p;								//create a new plane wave
  
-		//if kn is equal to k or -k, handle the degenerate case
-		T k_dot_kn = k_hat.dot(kn_hat);
+		T k_dot_kn = k_hat.dot(kn_hat);					//if kn is equal to k or -k, handle the degenerate case
  
 		//if k . n < 0, then the bend is a reflection
-			//flip k_hat
-		if(k_dot_kn < 0) k_hat = -k_hat;
+		if(k_dot_kn < 0) k_hat = -k_hat;				//flip k_hat
  
-		//std::cout<<"k dot kn: "<<k_dot_kn<<std::endl;
-
-		//std::cout<<"k_dot_kn: "<<k_dot_kn<<std::endl;
 		if(k_dot_kn == -1){
 			new_p.k = -k;
 			new_p.E0 = E0;
@@ -56,28 +85,11 @@ protected:
 			return new_p;
 		}
  
-		vec<T> r = k_hat.cross(kn_hat);			//compute the rotation vector
-
-		//std::cout<<"r: "<<r<<std::endl;
-
-		T theta = asin(r.len());				//compute the angle of the rotation about r
-
-		
-
-		//deal with a zero vector (both k and kn point in the same direction)
-		//if(theta == (T)0)
-		//{
-		//	new_p = *this;
-		//	return new_p;
-		//}
-
-		//create a quaternion to capture the rotation
-		quaternion<T> q;
-		q.CreateRotation(theta, r.norm());
-
-		//apply the rotation to E0
-		vec< complex<T> > E0n = q.toMatrix3() * E0;
-
+		vec<T> r = k_hat.cross(kn_hat);					//compute the rotation vector
+		T theta = asin(r.len());						//compute the angle of the rotation about r
+		quaternion<T> q;								//create a quaternion to capture the rotation
+		q.CreateRotation(theta, r.norm());		
+		vec< complex<T> > E0n = q.toMatrix3() * E0;		//apply the rotation to E0
 		new_p.k = kn_hat * kmag();
 		new_p.E0 = E0n;
  
@@ -86,16 +98,9 @@ protected:
  
 public:
  
-
-	///constructor: create a plane wave propagating along z, polarized along x
-	/*planewave(T lambda = (T)1)
-	{
-		k = rts::vec<T>(0, 0, 1) * (TAU/lambda);
-		E0 = rts::vec<T>(1, 0, 0);
-	}*/
-	///constructor: create a plane wave propagating along k, polarized along _E0, at frequency _omega
-	CUDA_CALLABLE planewave(vec<T> kvec = rts::vec<T>(0, 0, rtsTAU), 
-							vec< complex<T> > E = rts::vec<T>(1, 0, 0), T phase = 0)
+	///constructor: create a plane wave propagating along k
+	CUDA_CALLABLE planewave(vec<T> kvec = stim::vec<T>(0, 0, stim::TAU), 
+							vec< complex<T> > E = stim::vec<T>(1, 0, 0))
 	{
 		//phi = phase;
  
@@ -107,27 +112,23 @@ public:
 		else{
 			vec< complex<T> > s = (k_hat.cross(E)).norm();		//compute an orthogonal side vector
 			vec< complex<T> > E_hat = (s.cross(k)).norm();	//compute a normalized E0 direction vector
-			E0 = E_hat * E_hat.dot(E);					//compute the projection of _E0 onto E0_hat
+			E0 = E_hat;// * E_hat.dot(E);					//compute the projection of _E0 onto E0_hat
 		}
  
 		E0 = E0 * exp( complex<T>(0, phase) );
 	}
  
 	///multiplication operator: scale E0
-    CUDA_CALLABLE planewave<T> & operator* (const T & rhs)
-	{
-		
+    CUDA_CALLABLE planewave<T> & operator* (const T & rhs){		
 		E0 = E0 * rhs;
 		return *this;
 	}
  
-	CUDA_CALLABLE T lambda() const
-	{
-		return rtsTAU / k.len();
+	CUDA_CALLABLE T lambda() const{
+		return stim::TAU / k.len();
 	}
  
-	CUDA_CALLABLE T kmag() const
-	{
+	CUDA_CALLABLE T kmag() const{
 		return k.len();
 	}
  
@@ -139,14 +140,11 @@ public:
 		return k;
 	}
  
-	/*CUDA_CALLABLE T phase(){
-		return phi;
+	/// calculate the value of the field produced by the plane wave given a three-dimensional position
+	CUDA_CALLABLE vec< complex<T> > pos(T x, T y, T z){
+		return pos( stim::vec<T>(x, y, z) );
 	}
  
-	CUDA_CALLABLE void phase(T p){
-		phi = p;
-	}*/
-
 	CUDA_CALLABLE vec< complex<T> > pos(vec<T> p = vec<T>(0, 0, 0)){
 		vec< complex<T> > result;
  
@@ -166,18 +164,32 @@ public:
 		return planewave<T>(k * (nt / ni), E0);
 	}
  
-	CUDA_CALLABLE planewave<T> refract(rts::vec<T> kn) const
-	{
+	CUDA_CALLABLE planewave<T> refract(stim::vec<T> kn) const{
 		return bend(kn);
 	}
  
-	void scatter(rts::plane<T> P, T nr, planewave<T> &r, planewave<T> &t){
+	/// Calculate the result of a plane wave hitting an interface between two refractive indices
+
+	/// Forwards to the four-argument scatter() using the ratio n1/n0.
+	/// @param P is a plane representing the position and orientation of the surface
+	/// @param n0 is the refractive index outside of the surface (in the direction of the normal)
+	/// @param n1 is the refractive index inside the surface (in the direction away from the normal)
+	/// @param r is the reflected component of the plane wave
+	/// @param t is the transmitted component of the plane wave
+	void scatter(stim::plane<T> P, T n0, T n1, planewave<T> &r, planewave<T> &t){
+		T nr = n1 / n0;								//ratio of the two refractive indices
+		scatter(P, nr, r, t);
+	}
+
+	/// Calculate the scattering result when nr = n1/n0
+
+	/// @param P is a plane representing the position and orientation of the surface
+	/// @param nr is the ratio n1/n0
+	/// @param n1 is the refractive index inside the surface (in the direction away from the normal)
+	/// @param r is the reflected component of the plane wave
+	/// @param t is the transmitted component of the plane wave
+	void scatter(stim::plane<T> P, T nr, planewave<T> &r, planewave<T> &t){
  
 		int facing = P.face(k);		//determine which direction the plane wave is coming in
  
-		//if(facing == 0)				//if the wave is tangent to the plane, return an identical wave
-		//	return *this;
-		//else 
 		if(facing == -1){		//if the wave hits the back of the plane, invert the plane and nr
 			P = P.flip();			//flip the plane
 			nr = 1/nr;				//invert the refractive index (now nr = n0/n1)
@@ -192,7 +204,7 @@ public:
 		bool tir = false;						//flag for total internal reflection
 		if(theta_t != theta_t){
 			tir = true;
-			theta_t = rtsPI / (T)2;
+			theta_t = stim::PI / (T)2;
 		}
  
 		//handle the degenerate case where theta_i is 0 (the plane wave hits head-on)
@@ -205,17 +217,10 @@ public:
 			vec< complex<T> > Et = E0 * tp;
 			T phase_t = P.p().dot(k - kt);	//compute the phase offset
 			T phase_r = P.p().dot(k - kr);
-			//std::cout<<"Degeneracy: Head-On"<<std::endl;
-			//std::cout<<"rs: "<<rp<<"  rp: "<<rp<<"  ts: "<<tp<<"  tp: "<<tp<<std::endl;
-			//std::cout<<"phase r: "<<phase_r<<"  phase t: "<<phase_t<<std::endl;
  
 			//create the plane waves
 			r = planewave<T>(kr, Er, phase_r);
 			t = planewave<T>(kt, Et, phase_t);
-
-			//std::cout<<"i + r: "<<pos()[0] + r.pos()[0]<<pos()[1] + r.pos()[1]<<pos()[2] + r.pos()[2]<<std::endl;
-			//std::cout<<"t:     "<<t.pos()[0]<<t.pos()[1]<<t.pos()[2]<<std::endl;
-			//std::cout<<"--------------------------------"<<std::endl;
 			return;
 		}
  
@@ -245,11 +250,9 @@ public:
  
 		//compute the magnitude of the p- and s-polarized components of the incident E vector
 		complex<T> Ei_s = E0.dot(x_hat);
-		//int sgn = (0 < E0.dot(y_hat)) - (E0.dot(y_hat) < 0);
 		int sgn = E0.dot(y_hat).sgn();
 		vec< complex<T> > cx_hat = x_hat;
 		complex<T> Ei_p = ( E0 - cx_hat * Ei_s ).len() * sgn;
-		//T Ei_p = ( E0 - x_hat * Ei_s ).len();
 		//compute the magnitude of the p- and s-polarized components of the reflected E vector
 		complex<T> Er_s = Ei_s * rs;
 		complex<T> Er_p = Ei_p * rp;
@@ -257,14 +260,6 @@ public:
 		complex<T> Et_s = Ei_s * ts;
 		complex<T> Et_p = Ei_p * tp;
  
-		//std::cout<<"E0: "<<E0<<std::endl;
-		//std::cout<<"E0 dot y_hat: "<<E0.dot(y_hat)<<std::endl;
-		//std::cout<<"theta i: "<<theta_i<<"  theta t: "<<theta_t<<std::endl;
-		//std::cout<<"x_hat: "<<x_hat<<"  y_hat: "<<y_hat<<"  z_hat: "<<z_hat<<std::endl;
-		//std::cout<<"Ei_s: "<<Ei_s<<"  Ei_p: "<<Ei_p<<"  Er_s: "<<Er_s<<"  Er_p: "<<Er_p<<"  Et_s: "<<Et_s<<"  Et_p: "<<Et_p<<std::endl;
-		//std::cout<<"rs: "<<rs<<"  rp: "<<rp<<"  ts: "<<ts<<"  tp: "<<tp<<std::endl;
-		
-
 		//compute the reflected E vector
 		vec< complex<T> > Er = vec< complex<T> >(y_hat * cos(theta_i) + z_hat * sin(theta_i)) * Er_p + cx_hat * Er_s;
 		//compute the transmitted E vector
@@ -273,29 +268,12 @@ public:
 		T phase_t = P.p().dot(k - kt);
 		T phase_r = P.p().dot(k - kr);
  
-		//std::cout<<"phase r: "<<phase_r<<"  phase t: "<<phase_t<<std::endl;
-
-		//std::cout<<"phase: "<<phase<<std::endl;
-
 		//create the plane waves
 		r.k = kr;
 		r.E0 = Er * exp( complex<T>(0, phase_r) );
-		//r.phi = phase_r;
-
-		//t = bend(kt);
-		//t.k = t.k * nr;
  
 		t.k = kt;
 		t.E0 = Et * exp( complex<T>(0, phase_t) );
-		//t.phi = phase_t;
-		//std::cout<<"i: "<<str()<<std::endl;
-		//std::cout<<"r: "<<r.str()<<std::endl;
-		//std::cout<<"t: "<<t.str()<<std::endl;
-
-		//std::cout<<"i + r: "<<pos()[0] + r.pos()[0]<<pos()[1] + r.pos()[1]<<pos()[2] + r.pos()[2]<<std::endl;
-		//std::cout<<"t:     "<<t.pos()[0]<<t.pos()[1]<<t.pos()[2]<<std::endl;
-		//std::cout<<"--------------------------------"<<std::endl;
-
 	}
  
 	std::string str()
@@ -305,14 +283,15 @@ public:
 		ss<<"	"<<E0<<" e^i ( "<<k<<" . r )";
 		return ss.str();
 	}
-};
-}
+};					//end planewave class
+}					//end namespace optics
+}					//end namespace stim
  
 template <typename T>
-std::ostream& operator<<(std::ostream& os, rts::planewave<T> p)
+std::ostream& operator<<(std::ostream& os, stim::optics::planewave<T> p)
 {
     os<<p.str();
     return os;
 }
  
 -#endif
+#endif
 \ No newline at end of file
+#ifndef RTS_BEAM
+#define RTS_BEAM
+
+#include "../math/vec3.h"
+#include "../optics/scalarwave.h"
+#include "../math/bessel.h"
+#include "../math/legendre.h"
+#include <vector>
+
+namespace stim{
+
+		/// Generates N random unit vectors that sample the focusing cone of an objective.
+
+		/// Directions are first drawn about the +z axis, with polar angles between asin(NA_in) and asin(NA),
+		///		and are then rotated so that the axis of the cone points along d.
+		/// Samples are drawn with rand(); the generator is not seeded here, so the sequence depends on the
+		///		current state of the C random number generator.
+		/// @param N is the number of direction vectors to generate
+		/// @param d is the direction of the beam axis (it is compared directly against (0, 0, 1), so it is expected to be normalized)
+		/// @param NA is the numerical aperture of the focusing optics
+		/// @param NA_in is the numerical aperture of the central obscuration (optional)
+		/// @return a vector containing N sample directions
+		template<typename T>
+		std::vector< stim::vec3<T> > generate_focusing_vectors(size_t N, stim::vec3<T> d, T NA, T NA_in = 0){
+
+			std::vector< stim::vec3<T> > dirs(N);					//allocate an array to store the focusing vectors
+
+			///compute the rotation operator to transform (0, 0, 1) to d
+			//NOTE(review): the two special cases below appear to rely on stim::matrix<T, 3> being
+			//		default-constructed as the identity -- confirm against matrix.h
+			T cos_angle = d.dot(vec3<T>(0, 0, 1));
+			stim::matrix<T, 3> rotation;
+
+			//if the cosine of the angle is -1, the rotation is just a flip across the z axis
+			if(cos_angle == -1){
+				rotation(2, 2) = -1;
+			}
+			else if(cos_angle != 1.0)
+			{
+				vec3<T> r_axis = vec3<T>(0, 0, 1).cross(d).norm();	//compute the axis of rotation
+				T angle = acos(cos_angle);							//compute the angle of rotation
+				quaternion<T> quat;									//create a quaternion describing the rotation
+				quat.CreateRotation(angle, r_axis);
+				rotation = quat.toMatrix3();						//compute the rotation matrix
+			}
+
+			//find the phi values associated with the cassegrain ring
+			T PHI[2];
+			PHI[0] = (T)asin(NA);
+			PHI[1] = (T)asin(NA_in);
+
+			//calculate the z-axis cylinder coordinates associated with these angles
+			T Z[2];
+			Z[0] = cos(PHI[0]);
+			Z[1] = cos(PHI[1]);
+			T range = Z[0] - Z[1];
+
+			//draw a distribution of random phi, z values
+			T z, phi, theta;
+			for(size_t i = 0; i < N; i++){							//for each sample (the index type matches N, so large N cannot overflow it)
+				z = (T)((double)rand() / (double)RAND_MAX) * range + Z[1];		//find a random position on the surface of a cylinder
+				theta = (T)(((double)rand() / (double)RAND_MAX) * stim::TAU);	//find a random angle around the cone axis
+				phi = acos(z);													//project onto the sphere, computing phi in spherical coordinates
+
+				//compute and store cartesian coordinates
+				vec3<T> spherical(1, theta, phi);								//convert from spherical to cartesian coordinates
+				vec3<T> cart = spherical.sph2cart();
+				dirs[i] = rotation * cart;										//rotate the sample so that it is centered on d
+			}
+			return dirs;
+		}
+		
+/// Class stim::scalarbeam represents a scalar beam of light focused at a point and composed of several plane waves
+
+/// NA[0] stores the numerical aperture of the focusing optics and NA[1] stores the numerical aperture of
+///		the central obscuration. This is the order used by the constructor and by mc().
+template<typename T>
+class scalarbeam
+{
+public:
+	//enum beam_type {Uniform, Bartlett, Hamming, Hanning};
+
+private:
+	
+	T NA[2];				//numerical aperture of the focusing optics [0] and of the center obscuration [1]
+	vec3<T> f;				//focal point
+	vec3<T> d;				//propagation direction
+	stim::complex<T> A;		//beam amplitude
+	T lambda;				//beam wavelength
+public:
+
+	///constructor: build a default beam (NA=1.0)
+	/// @param wavelength is the wavelength of the beam
+	/// @param amplitude is the complex amplitude of the beam
+	/// @param focal_point is the position of the focus
+	/// @param direction is the propagation direction (normalized on construction)
+	/// @param numerical_aperture is the numerical aperture of the focusing optics
+	/// @param center_obsc is the numerical aperture of the center obscuration
+	scalarbeam(T wavelength = 1, stim::complex<T> amplitude = 1, vec3<T> focal_point = vec3<T>(0, 0, 0), vec3<T> direction = vec3<T>(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){
+		lambda = wavelength;
+		A = amplitude;
+		f = focal_point;
+		d = direction.norm();					//make sure that the direction vector is normalized (makes calculations more efficient later on)
+		NA[0] = numerical_aperture;
+		NA[1] = center_obsc;
+	}
+
+	///Numerical Aperture functions
+
+	/// Set the numerical aperture of an unobscured beam
+	void setNA(T na)
+	{
+		NA[0] = na;					//aperture goes in slot 0 and the obscuration is cleared (same layout as the constructor)
+		NA[1] = (T)0;
+	}
+	/// Set both stored numerical aperture values
+	void setNA(T na0, T na1)
+	{
+		NA[0] = na0;
+		NA[1] = na1;
+	}
+
+	//Monte-Carlo decomposition into plane waves
+	/// @param N is the number of plane waves used to sample the beam
+	/// @return a list of N scalar plane waves, each with unit magnitude and a phase of -k.f
+	std::vector< scalarwave<T> > mc(size_t N = 100000) const{
+
+		std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]);	//generate a random set of N vectors forming a focus
+		std::vector< scalarwave<T> > samples(N);											//create a vector of plane waves
+		T kmag = (T)stim::TAU / lambda;								//calculate the wavenumber
+		stim::complex<T> apw;										//allocate space for the amplitude at the focal point
+		stim::vec3<T> kpw;											//declare the new k-vector based on the focused plane wave direction
+		for(size_t i=0; i<N; i++){									//for each sample
+			kpw = dirs[i] * kmag;									//calculate the k-vector for the new plane wave
+			apw = exp(stim::complex<T>(0, kpw.dot(-f)));			//calculate the amplitude for the new plane wave
+			samples[i] = scalarwave<T>(kpw, apw);					//create a plane wave based on the direction
+		}
+
+		return samples;
+	}
+
+	/// Calculate the field at a given point
+	/// @param x is the x-coordinate of the field point
+	/// @param y is the y-coordinate of the field point
+	/// @param z is the z-coordinate of the field point
+	/// @param O is the approximation accuracy (the number of plane waves that are summed)
+	/// @return the sum of the O sampled plane waves evaluated at (x, y, z)
+	stim::complex<T> field(T x, T y, T z, size_t O){
+		std::vector< scalarwave<T> > W = mc(O);
+		stim::complex<T> result(0, 0);							//complex accumulator: a real-valued sum cannot hold the complex contributions
+		for(size_t i = 0; i < O; i++){							//for each plane wave
+			result += W[i].pos(x, y, z);
+		}
+		return result;
+	}
+
+	/// Return a human-readable description of the beam
+	std::string str()
+	{
+		std::stringstream ss;
+		ss<<"Beam:"<<std::endl;
+		//ss<<"	Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl;
+		ss<<"	Beam Direction: "<<d<<std::endl;
+
+		//report the aperture as "inner -- outer" regardless of the order in which the two values were stored
+		T inner = (NA[0] < NA[1]) ? NA[0] : NA[1];
+		T outer = (NA[0] < NA[1]) ? NA[1] : NA[0];
+		if(inner == 0)
+			ss<<"	NA: "<<outer;
+		else
+			ss<<"	NA: "<<inner<<" -- "<<outer;
+
+		return ss.str();
+	}
+
+
+
+};			//end scalarbeam
+
+/// Calculate the [0 Nl] terms for the aperture integral based on the given numerical aperture and center obscuration (optional)
+
+/// Each term is computed as  C[l] = P_{l+1}(a1) - P_{l+1}(a2) - P_{l-1}(a1) + P_{l-1}(a2),  where
+///		a1 = cos(asin(NA_in)), a2 = cos(asin(NA)) and the P_{l-1} entry for l = 0 is taken to be 1.
+/// @param C is a pointer to Nl + 1 values where the terms will be stored
+/// @param Nl is the highest order that is calculated
+/// @param NA is the numerical aperture of the focusing optics
+/// @param NA_in is the numerical aperture of the center obscuration (optional)
+template<typename T>
+CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){
+
+	size_t table_bytes = (Nl + 1) * sizeof(T);				//calculate the number of bytes required to store the terms
+	T cos_alpha_1 = cos(asin(NA_in));						//calculate the cosine of the angle subtended by the central obscuration
+	T cos_alpha_2 = cos(asin(NA));							//calculate the cosine of the angle subtended by the aperture
+
+	// the aperture integral is computed using four individual Legendre polynomials, each a function of the angles subtended
+	//		by the objective and central obscuration
+
+	// tables shifted by one order: entry l holds P_{l-1}, with entry 0 fixed to 1
+	T* Pln_a1 = (T*) malloc(table_bytes);
+	T* Pln_a2 = (T*) malloc(table_bytes);
+	if(Nl > 0){												//for Nl == 0 the table has a single entry and Nl-1 would wrap around
+		stim::legendre<T>(Nl-1, cos_alpha_1, &Pln_a1[1]);
+		stim::legendre<T>(Nl-1, cos_alpha_2, &Pln_a2[1]);
+	}
+	Pln_a1[0] = 1;
+	Pln_a2[0] = 1;
+
+	// tables holding orders [0 Nl+1] (one value more than the output)
+	T* Plp_a1 = (T*) malloc(table_bytes+sizeof(T));
+	stim::legendre<T>(Nl+1, cos_alpha_1, Plp_a1);
+
+	T* Plp_a2 = (T*) malloc(table_bytes+sizeof(T));
+	stim::legendre<T>(Nl+1, cos_alpha_2, Plp_a2);
+
+	for(size_t l = 0; l <= Nl; l++){
+		C[l] = Plp_a1[l+1] - Plp_a2[l+1] - Pln_a1[l] + Pln_a2[l];
+	}
+
+	free(Pln_a1);
+	free(Pln_a2);
+	free(Plp_a1);
+	free(Plp_a2);
+}
+
+/// performs linear interpolation into a look-up table
+
+/// The table holds N samples of a function taken at min_val, min_val + delta, min_val + 2 * delta, ...
+/// Values that fall outside of the sampled range return the nearest end of the table.
+/// @param lut is a pointer to the first sample of the table
+/// @param val is the position at which the table is evaluated
+/// @param N is the number of samples in the table
+/// @param min_val is the position of the first sample
+/// @param delta is the spacing between two consecutive samples
+/// @param stride is the number of elements between two consecutive samples (0 is treated as 1, a contiguous table)
+/// @return the interpolated value, or 0 if the table is empty
+template<typename T>
+T lut_lookup(T* lut, T val, size_t N, T min_val, T delta, size_t stride = 0){
+	if(stride == 0) stride = 1;								//a stride of zero would read element 0 for every sample
+	if(N == 0) return (T)0;									//empty table: nothing can be read
+	if(N == 1 || delta == 0) return lut[0];					//degenerate table: only one position is available
+
+	T pos = (val - min_val) / delta;						//position of val measured in samples
+	if(!(pos > 0)) return lut[0];							//before the first sample (this test also catches NaN)
+	if(pos >= (T)(N - 1)) return lut[(N - 1) * stride];		//at or beyond the last sample: never read past the end
+
+	size_t idx = (size_t)pos;								//index of the sample to the left of val
+	T alpha = pos - (T)idx;									//normalized distance from that sample, in [0, 1)
+
+	if(alpha == 0) return lut[idx * stride];
+	else return lut[idx * stride] * (1 - alpha) + lut[ (idx+1) * stride] * alpha;
+}
+
+/// Calculates the scalar point spread function of a focused beam at N points given in spherical coordinates
+
+/// Each output value is the sum over l = [0 Nl] of  i^l * j_l(k r) * P_l(cos phi) * C[l],  scaled by A * TAU,
+///		where C holds the aperture integral terms returned by cpu_aperture_integral().
+/// If NO_CUDA is defined, the spherical Bessel functions are evaluated directly at every point. Otherwise they
+///		are tabulated once over [min(k r), max(k r)] and interpolated using lut_lookup().
+/// @param F is the output array of N field values (any previous content is overwritten)
+/// @param N is the number of field points
+/// @param r is an array of N distances from the focal point
+/// @param phi is an array of N polar angles
+/// @param lambda is the wavelength
+/// @param A is the amplitude of the beam
+/// @param f is the focal point (not used by this overload)
+/// @param NA is the numerical aperture of the focusing optics
+/// @param NA_in is the numerical aperture of the center obscuration
+/// @param Nl is the highest order included in the sum
+template<typename T>
+void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){
+	memset(F, 0, N * sizeof(stim::complex<T>));				//the field is accumulated, so it has to start at zero
+	if(N == 0 || Nl < 0) return;							//nothing to evaluate (also keeps r[0] from being read in an empty array)
+
+	T k = stim::TAU / lambda;								//calculate the wavenumber
+	size_t n_terms = (size_t)Nl + 1;						//number of orders in the sum
+
+	T* C = (T*) malloc( n_terms * sizeof(T) );				//allocate space for the aperture integral terms
+	cpu_aperture_integral(C, Nl, NA, NA_in);				//calculate the aperture integral terms
+
+	//temporary variables for the Bessel function evaluation (only jv is read, the other arrays are outputs of the same call)
+	double vm;
+	double* jv = (double*) malloc( n_terms * sizeof(double) );
+	double* yv = (double*) malloc( n_terms * sizeof(double) );
+	double* djv= (double*) malloc( n_terms * sizeof(double) );
+	double* dyv= (double*) malloc( n_terms * sizeof(double) );
+
+	T* Pl_cos_phi = (T*) malloc(n_terms * sizeof(T));		//storage for the [0 Nl] Legendre polynomials at one point
+	T kr, cos_phi, jl, Pl;
+
+#ifdef NO_CUDA
+	for(size_t n = 0; n < N; n++){								//for each point in the field
+		kr = k * r[n];											//calculate kr (the optical distance between the focal point and p)
+		cos_phi = std::cos(phi[n]);								//calculate the cosine of phi
+		stim::bessjyv_sph<double>(Nl, kr, vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
+		stim::legendre<T>(Nl, cos_phi, Pl_cos_phi);				//calculate the [0 Nl] legendre polynomials for this point
+
+		for(int l = 0; l <= Nl; l++){
+			jl = (T)jv[l];
+			Pl = Pl_cos_phi[l];
+			F[n] += pow(complex<T>(0, 1), l) * jl * Pl * C[l];
+		}
+		F[n] *= A * stim::TAU;
+	}
+#else
+	T min_r = r[0];
+	T max_r = r[0];
+	for(size_t i = 0; i < N; i++){								//find the minimum and maximum values of r (min and max distance from the focal point)
+		if(r[i] < min_r) min_r = r[i];
+		if(r[i] > max_r) max_r = r[i];
+	}
+	T min_kr = k * min_r;
+	T max_kr = k * max_r;
+
+	//build a look-up table of the spherical Bessel functions: one row of Nl + 1 orders per sampled value of kr
+	size_t Nlut = (size_t)sqrt((double)N) * 2;					//N >= 1 here, so the table always has at least two rows
+	T* bessel_lut = (T*) malloc(sizeof(T) * n_terms * Nlut);
+	T delta_kr = (max_kr - min_kr) / (Nlut-1);
+	if(delta_kr == 0) delta_kr = 1;								//all points share the same kr: use a non-zero step so that the look-up never divides by zero
+	for(size_t kri = 0; kri < Nlut; kri++){
+		stim::bessjyv_sph<double>(Nl, min_kr + kri * delta_kr, vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
+		for(size_t l = 0; l < n_terms; l++){
+			bessel_lut[kri * n_terms + l] = (T)jv[l];
+		}
+	}
+
+	for(size_t n = 0; n < N; n++){								//for each point in the field
+		kr = k * r[n];											//calculate kr (the optical distance between the focal point and p)
+		cos_phi = std::cos(phi[n]);								//calculate the cosine of phi
+		stim::legendre<T>(Nl, cos_phi, Pl_cos_phi);				//calculate the [0 Nl] legendre polynomials for this point
+
+		for(int l = 0; l <= Nl; l++){
+			jl = lut_lookup<T>(&bessel_lut[l], kr, Nlut, min_kr, delta_kr, n_terms);	//order l is a column of the table, so the stride is one row
+			Pl = Pl_cos_phi[l];
+			F[n] += pow(complex<T>(0, 1), l) * jl * Pl * C[l];
+		}
+		F[n] *= A * stim::TAU;
+	}
+
+	free(bessel_lut);
+#endif
+
+	//release every temporary buffer (both code paths allocate the same set)
+	free(C);
+	free(jv);
+	free(yv);
+	free(djv);
+	free(dyv);
+	free(Pl_cos_phi);
+}
+
+
+/// Calculates the scalar point spread function at N points given in cartesian coordinates
+
+/// The points are converted to spherical coordinates and passed on to the (r, phi) version of cpu_scalar_psf().
+/// Any of the coordinate arrays may be NULL, in which case that coordinate is zero for every point.
+/// NOTE(review): the focal point f is forwarded unchanged and is not subtracted from the positions here -- confirm that this is intended.
+/// @param F is the output array of N field values
+/// @param N is the number of field points
+/// @param x is an array of N x coordinates (or NULL)
+/// @param y is an array of N y coordinates (or NULL)
+/// @param z is an array of N z coordinates (or NULL)
+template<typename T>
+void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){
+	//only the radius and the polar angle are required (the scalar PSF is symmetric about theta)
+	T* radius = (T*) malloc(N * sizeof(T));
+	T* polar = (T*) malloc(N * sizeof(T));
+
+	stim::vec3<T> cart, sph;
+	for(size_t n = 0; n < N; n++){
+		//a missing coordinate array contributes a zero for that axis
+		if(x == NULL) cart[0] = 0;
+		else cart[0] = x[n];
+		if(y == NULL) cart[1] = 0;
+		else cart[1] = y[n];
+		if(z == NULL) cart[2] = 0;
+		else cart[2] = z[n];
+
+		sph = cart.cart2sph();					//convert the point to spherical coordinates
+		radius[n] = sph[0];						//first component: r
+		polar[n] = sph[2];						//third component: phi = [0 pi]
+	}
+
+	//evaluate the PSF using the spherical coordinate CPU function
+	cpu_scalar_psf(F, N, radius, polar, lambda, A, f, NA, NA_in, Nl);
+
+	free(polar);
+	free(radius);
+}
+
+}			//end namespace stim
+
+#endif
+#ifndef STIM_SCALARWAVE_H
+#define STIM_SCALARWAVE_H
+
+
+#include <string>
+#include <sstream>
+#include <cmath>
+
+//#include "../math/vector.h"
+#include "../math/vec3.h"
+#include "../math/quaternion.h"
+#include "../math/constants.h"
+#include "../math/plane.h"
+#include "../math/complex.h"
+
+//CUDA
+#include "../cuda/cudatools/devices.h"
+#include "../cuda/cudatools/error.h"
+#include "../cuda/sharedmem.cuh"
+
+namespace stim{
+
+/// Class stim::scalarwave represents a scalar plane wave, described by a k-vector and a complex amplitude
+
+/// The value of the wave at a position p is  E0 * exp(i k.p)  (see pos()).
+template<typename T>
+class scalarwave{
+
+protected:
+
+	stim::vec3<T> k;							//k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
+	stim::complex<T> E0;						//amplitude
+
+	/// Bend a plane wave via refraction, given that the new propagation direction is known
+	/// @param kn is the new propagation direction (normalized here, then scaled to the current magnitude of k)
+	CUDA_CALLABLE scalarwave<T> bend(stim::vec3<T> kn) const{
+		return scalarwave<T>(kn.norm() * kmag(), E0);
+	}
+
+public:
+
+	///constructor: create a plane wave propagating along k
+	/// @param kvec is the k-vector of the wave (defaults to a wave along z with |k| = TAU)
+	/// @param E is the complex amplitude of the wave
+	CUDA_CALLABLE scalarwave(vec3<T> kvec = stim::vec3<T>(0, 0, (T)stim::TAU), complex<T> E = 1){
+		k = kvec;
+		E0 = E;
+	}
+
+	///constructor: create a plane wave from the three components of the k-vector
+	CUDA_CALLABLE scalarwave(T kx, T ky, T kz, complex<T> E = 1){
+		k = vec3<T>(kx, ky, kz);
+		E0 = E;
+	}
+
+	///multiplication operator: scale E0
+	/// Note that this scales the wave in place and returns a reference to it (it does not create a copy)
+    CUDA_CALLABLE scalarwave<T> & operator* (const T & rhs){		
+		E0 = E0 * rhs;
+		return *this;
+	}
+
+	/// Return the wavelength, calculated from the magnitude of the k-vector
+	CUDA_CALLABLE T lambda() const{
+		return stim::TAU / k.len();
+	}
+
+	/// Return the magnitude of the k-vector
+	CUDA_CALLABLE T kmag() const{
+		return k.len();
+	}
+
+	/// Return the amplitude of the wave
+	/// NOTE(review): E0 is a scalar but the return type is a three-component vector -- presumably left over from the vector plane wave class; confirm
+	CUDA_CALLABLE vec3< complex<T> > E(){
+		return E0;
+	}
+
+	/// Return the k-vector of the wave
+	CUDA_CALLABLE vec3<T> kvec(){
+		return k;
+	}
+
+	/// calculate the value of the field produced by the plane wave given a three-dimensional position
+	CUDA_CALLABLE complex<T> pos(T x, T y, T z){
+		return pos( stim::vec3<T>(x, y, z) );
+	}
+
+	/// calculate the value of the field at the position p:  E0 * exp(i k.p)
+	CUDA_CALLABLE complex<T> pos(vec3<T> p = vec3<T>(0, 0, 0)){
+		return E0 * exp(complex<T>(0, k.dot(p)));
+	}
+
+	//scales k based on a transition from material ni to material nt
+	/// @param ni is the refractive index of the material that the wave is leaving
+	/// @param nt is the refractive index of the material that the wave is entering
+	CUDA_CALLABLE scalarwave<T> n(T ni, T nt){
+		return scalarwave<T>(k * (nt / ni), E0);
+	}
+
+	/// Return a copy of the wave propagating along the direction kn (public access to bend())
+	CUDA_CALLABLE scalarwave<T> refract(stim::vec3<T> kn) const{
+		return bend(kn);
+	}
+
+	/// Calculate the result of a plane wave hitting an interface between two refractive indices
+
+	/// @param P is a plane representing the position and orientation of the surface
+	/// @param n0 is the refractive index outside of the surface (in the direction of the normal)
+	/// @param n1 is the refractive index inside the surface (in the direction away from the normal)
+	/// @param r is the reflected component of the plane wave
+	/// @param t is the transmitted component of the plane wave
+	void scatter(stim::plane<T> P, T n0, T n1, scalarwave<T> &r, scalarwave<T> &t){
+		scatter(P, n1/n0, r, t);
+	}
+
+	/// Calculate the scattering result when nr = n1/n0
+
+	/// The implementation is currently disabled: the entire body is commented out, so r and t are left unmodified.
+	/// @param P is a plane representing the position and orientation of the surface
+	/// @param nr is the ratio n1/n0
+	/// @param r is the reflected component of the plane wave
+	/// @param t is the transmitted component of the plane wave
+	void scatter(stim::plane<T> P, T nr, scalarwave<T> &r, scalarwave<T> &t){
+		/*
+		(disabled code, kept for reference: it treats E0 as a vector and creates planewave objects)
+
+		int facing = P.face(k);		//determine which direction the plane wave is coming in
+
+		if(facing == -1){		//if the wave hits the back of the plane, invert the plane and nr
+			P = P.flip();			//flip the plane
+			nr = 1/nr;				//invert the refractive index (now nr = n0/n1)
+		}
+
+		//use Snell's Law to calculate the transmitted angle
+		T cos_theta_i = k.norm().dot(-P.norm());				//compute the cosine of theta_i
+		T theta_i = acos(cos_theta_i);							//compute theta_i
+		T sin_theta_t = (1/nr) * sin(theta_i);						//compute the sine of theta_t using Snell's law
+		T theta_t = asin(sin_theta_t);							//compute theta_t
+
+		bool tir = false;						//flag for total internal reflection
+		if(theta_t != theta_t){
+			tir = true;
+			theta_t = stim::PI / (T)2;
+		}
+
+		//handle the degenerate case where theta_i is 0 (the plane wave hits head-on)
+		if(theta_i == 0){
+			T rp = (1 - nr) / (1 + nr);		//compute the Fresnel coefficients
+			T tp = 2 / (1 + nr);
+			vec3<T> kr = -k;
+			vec3<T> kt = k * nr;			//set the k vectors for theta_i = 0
+			vec3< complex<T> > Er = E0 * rp;		//compute the E vectors
+			vec3< complex<T> > Et = E0 * tp;
+			T phase_t = P.p().dot(k - kt);	//compute the phase offset
+			T phase_r = P.p().dot(k - kr);
+
+			//create the plane waves
+			r = planewave<T>(kr, Er, phase_r);
+			t = planewave<T>(kt, Et, phase_t);
+			return;
+		}
+
+
+		//compute the Fresnel coefficients
+		T rp, rs, tp, ts;
+		rp = tan(theta_t - theta_i) / tan(theta_t + theta_i);
+		rs = sin(theta_t - theta_i) / sin(theta_t + theta_i);
+		
+		if(tir){
+			tp = ts = 0;
+		}
+		else{
+			tp = ( 2 * sin(theta_t) * cos(theta_i) ) / ( sin(theta_t + theta_i) * cos(theta_t - theta_i) );
+			ts = ( 2 * sin(theta_t) * cos(theta_i) ) / sin(theta_t + theta_i);
+		}
+
+		//compute the coordinate space for the plane of incidence
+		vec3<T> z_hat = -P.norm();
+		vec3<T> y_hat = P.parallel(k).norm();
+		vec3<T> x_hat = y_hat.cross(z_hat).norm();
+
+		//compute the k vectors for r and t
+		vec3<T> kr, kt;
+		kr = ( y_hat * sin(theta_i) - z_hat * cos(theta_i) ) * kmag();
+		kt = ( y_hat * sin(theta_t) + z_hat * cos(theta_t) ) * kmag() * nr;
+
+		//compute the magnitude of the p- and s-polarized components of the incident E vector
+		complex<T> Ei_s = E0.dot(x_hat);
+		int sgn = E0.dot(y_hat).sgn();
+		vec3< complex<T> > cx_hat = x_hat;
+		complex<T> Ei_p = ( E0 - cx_hat * Ei_s ).len() * sgn;
+		//compute the magnitude of the p- and s-polarized components of the reflected E vector
+		complex<T> Er_s = Ei_s * rs;
+		complex<T> Er_p = Ei_p * rp;
+		//compute the magnitude of the p- and s-polarized components of the transmitted E vector
+		complex<T> Et_s = Ei_s * ts;
+		complex<T> Et_p = Ei_p * tp;
+
+		//compute the reflected E vector
+		vec3< complex<T> > Er = vec3< complex<T> >(y_hat * cos(theta_i) + z_hat * sin(theta_i)) * Er_p + cx_hat * Er_s;
+		//compute the transmitted E vector
+		vec3< complex<T> > Et = vec3< complex<T> >(y_hat * cos(theta_t) - z_hat * sin(theta_t)) * Et_p + cx_hat * Et_s;
+
+		T phase_t = P.p().dot(k - kt);
+		T phase_r = P.p().dot(k - kr);
+
+		//create the plane waves
+		r.k = kr;
+		r.E0 = Er * exp( complex<T>(0, phase_r) );
+
+		t.k = kt;
+		t.E0 = Et * exp( complex<T>(0, phase_t) );
+		*/
+	}
+
+	/// Return a human-readable description of the wave (its amplitude and k-vector)
+	std::string str()
+	{
+		std::stringstream ss;
+		ss<<"Plane Wave:"<<std::endl;
+		ss<<"	"<<E0<<" e^i ( "<<k<<" . r )";
+		return ss.str();
+	}
+};					//end scalarwave class
+
+
+/// CUDA kernel for computing the field produced by a batch of plane waves at an array of locations
+
+/// Launch layout: one-dimensional grid of one-dimensional blocks, one thread per field point (threads beyond N exit).
+/// Shared memory: the launch has to provide n_waves * sizeof(stim::scalarwave<T>) bytes of dynamic shared memory.
+/// The result is ADDED to F, so F has to be initialized before the first launch.
+/// @param F is the device array of N field values that the waves are accumulated into
+/// @param N is the number of field points
+/// @param x is a device array of N x coordinates (NULL is treated as all zeros)
+/// @param y is a device array of N y coordinates (NULL is treated as all zeros)
+/// @param z is a device array of N z coordinates (NULL is treated as all zeros)
+/// @param W is a device array of n_waves plane waves
+/// @param n_waves is the number of plane waves in this batch
+template<typename T>
+__global__ void cuda_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t n_waves){
+	//The dynamic shared array is declared as raw bytes and cast to the wave type. Declaring it with a type that
+	//	depends on T gives conflicting declarations as soon as the kernel is instantiated for more than one T.
+	extern __shared__ __align__(16) unsigned char shared_W_bytes[];
+	stim::scalarwave<T>* shared_W = reinterpret_cast< stim::scalarwave<T>* >(shared_W_bytes);	//list of waves in shared memory
+
+	stim::cuda::sharedMemcpy(shared_W, W, n_waves, threadIdx.x, blockDim.x);	//copy the plane waves into shared memory for faster access
+	__syncthreads();															//synchronize threads to ensure all data is copied
+
+	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
+	if(i >= N) return;												//exit if this thread is outside the array (after the barrier, so every thread reaches it)
+
+	T px = (x == NULL) ? (T)0 : x[i];								// test for NULL values and set positions
+	T py = (y == NULL) ? (T)0 : y[i];
+	T pz = (z == NULL) ? (T)0 : z[i];
+	
+	stim::complex<T> f = 0;											//create a register to store the result
+	for(size_t w = 0; w < n_waves; w++)
+		f += shared_W[w].pos(px, py, pz);							//evaluate the plane wave
+	F[i] += f;														//add the result of this batch to device memory
+}
+
+/// evaluate a scalar wave at several points, where all arrays are on the GPU
+
+/// The wave is copied into device memory and evaluated with the cuda_scalarwave kernel as a batch of one.
+/// The kernel accumulates its result, so the value of the wave is ADDED to the existing content of F.
+/// @param F is a device array of N field values that the wave is added to
+/// @param N is the number of field points
+/// @param x is a device array of N x coordinates (or NULL)
+/// @param y is a device array of N y coordinates (or NULL)
+/// @param z is a device array of N z coordinates (or NULL)
+/// @param w is the plane wave to evaluate (stored on the host)
+template<typename T>
+void gpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
+	if(N == 0) return;													//nothing to evaluate
+
+	size_t wave_bytes = sizeof(stim::scalarwave<T>);					//the kernel reads its waves from device memory
+	stim::scalarwave<T>* dev_w;
+	HANDLE_ERROR(cudaMalloc(&dev_w, wave_bytes));
+	HANDLE_ERROR(cudaMemcpy(dev_w, &w, wave_bytes, cudaMemcpyHostToDevice));
+
+	int threads = stim::maxThreadsPerBlock();							//get the maximum number of threads per block for the CUDA device
+	dim3 blocks((unsigned)((N + (size_t)threads - 1) / (size_t)threads));	//calculate the number of blocks required to cover all N points
+	cuda_scalarwave<T><<< blocks, threads, wave_bytes >>>(F, N, x, y, z, dev_w, (size_t)1);	//call the kernel with shared memory for a single wave
+	HANDLE_ERROR(cudaGetLastError());									//report an invalid launch configuration
+
+	HANDLE_ERROR(cudaFree(dev_w));										//release the temporary copy of the wave
+}
+
+/// Sums a series of coherent plane waves at an array of points, where all arrays are stored on the host
+
+/// If NO_CUDA is defined the sum is evaluated on the CPU. Otherwise the points are copied to the current CUDA device
+///		and the waves are processed in batches that fit into the shared memory of one thread block.
+/// @param F is the output array of field values corresponding to each input point (any previous content is overwritten)
+/// @param N is the number of points in the input and output arrays
+/// @param x is an array of x coordinates for the field points (NULL is treated as all zeros)
+/// @param y is an array of y coordinates for the field points (NULL is treated as all zeros)
+/// @param z is an array of z coordinates for the field points (NULL is treated as all zeros)
+/// @param w_array is the list of plane waves that are summed
+template<typename T>
+void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > w_array){
+	size_t S = w_array.size();											//store the number of waves
+#ifdef NO_CUDA
+	memset(F, 0, N * sizeof(stim::complex<T>));
+	T px, py, pz;
+	for(size_t i = 0; i < N; i++){										// for each element in the array
+		(x == NULL) ? px = 0 : px = x[i];								// test for NULL values
+		(y == NULL) ? py = 0 : py = y[i];
+		(z == NULL) ? pz = 0 : pz = z[i];
+
+		for(size_t s = 0; s < S; s++){
+			F[i] += w_array[s].pos(px, py, pz);						//sum all plane waves at this point
+		}
+	}
+#else
+	if(N == 0) return;													//no field points: nothing to compute or copy
+	if(S == 0){															//no waves: the field is zero everywhere
+		memset(F, 0, N * sizeof(stim::complex<T>));
+		return;
+	}
+
+	size_t field_bytes = N * sizeof(stim::complex<T>);
+	stim::complex<T>* dev_F;											//allocate space for the field
+	HANDLE_ERROR(cudaMalloc(&dev_F, field_bytes));
+	HANDLE_ERROR(cudaMemset(dev_F, 0, field_bytes));					//set the field to zero (necessary because a sum is used)
+
+	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
+	if(x != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+
+	T* dev_y = NULL;												//allocate space and copy the Y coordinate (if specified)
+	if(y != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+
+	T* dev_z = NULL;												//allocate space and copy the Z coordinate (if specified)
+	if(z != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+
+	size_t wave_bytes = sizeof(stim::scalarwave<T>);
+	size_t shared_bytes = stim::sharedMemPerBlock();					//calculate the maximum amount of shared memory available
+	size_t max_batch = shared_bytes / wave_bytes;						//calculate number of plane waves that will fit into shared memory
+	if(max_batch == 0) max_batch = 1;									//a batch can never be empty (if one wave does not fit, the launch below fails and is reported)
+	size_t batch_bytes = ((S < max_batch) ? S : max_batch) * wave_bytes;	//size (in bytes) of the largest batch that will be processed
+
+	stim::scalarwave<T>* dev_w;
+	HANDLE_ERROR(cudaMalloc(&dev_w, batch_bytes));						//allocate memory for a single batch of plane waves
+
+	int threads = stim::maxThreadsPerBlock();							//get the maximum number of threads per block for the CUDA device
+	dim3 blocks((unsigned)(N / threads + 1));							//calculate the number of blocks required to cover all N points
+
+	size_t batch_size;													//declare a variable to store the size of the current batch
+	size_t waves_processed = 0;											//initialize the number of waves processed to zero
+	while(waves_processed < S){											//while there are still waves to be processed
+		size_t waves_left = S - waves_processed;
+		batch_size = (waves_left < max_batch) ? waves_left : max_batch;	//process either a whole batch, or whatever is left
+		batch_bytes = batch_size * wave_bytes;
+		HANDLE_ERROR(cudaMemcpy(dev_w, &w_array[waves_processed], batch_bytes, cudaMemcpyHostToDevice));	//copy the plane waves into global memory
+		cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(dev_F, N, dev_x, dev_y, dev_z, dev_w, batch_size);	//call the kernel
+		HANDLE_ERROR(cudaGetLastError());								//report an invalid launch configuration
+		waves_processed += batch_size;									//increment the counter indicating how many waves have been processed
+	}
+
+	HANDLE_ERROR(cudaMemcpy(F, dev_F, field_bytes, cudaMemcpyDeviceToHost));	//copy the field from device memory (this also reports errors raised while the kernels were running)
+
+	if(x != NULL) HANDLE_ERROR(cudaFree(dev_x));						//free everything
+	if(y != NULL) HANDLE_ERROR(cudaFree(dev_y));
+	if(z != NULL) HANDLE_ERROR(cudaFree(dev_z));
+	HANDLE_ERROR(cudaFree(dev_F));
+	HANDLE_ERROR(cudaFree(dev_w));
+
+#endif
+}
+
+/// Evaluates a single plane wave at an array of points stored on the host
+
+/// The wave is wrapped in a one-element list and passed to cpu_sum_scalarwaves().
+template<typename T>
+void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
+	std::vector< stim::scalarwave<T> > single_wave;		//list containing only the given wave
+	single_wave.push_back(w);
+	cpu_sum_scalarwaves(F, N, x, y, z, single_wave);
+}
+
+
+/// Sums a series of coherent plane waves at a specified point
+/// @param x is the x coordinate of the field point
+/// @param y is the y coordinate of the field point
+/// @param z is the z coordinate of the field point
+/// @param W is the list of plane waves that are summed
+/// @return the sum of all waves in W evaluated at (x, y, z), which is zero for an empty list
+template<typename T>
+CUDA_CALLABLE stim::complex<T> sum_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave<T> > W){
+	size_t N = W.size();												//get the number of plane wave samples
+	stim::complex<T> field(0, 0);										//initialize the field to zero (0)
+	for(size_t i = 0; i < N; i++){
+		field += W[i].pos(x, y, z);										//add the value of each wave at this point
+	}
+	return field;
+}
+
+}					//end namespace stim
+
+/// Stream insertion operator: writes the string representation of a scalar wave (see scalarwave::str())
+template <typename T>
+std::ostream& operator<<(std::ostream& os, stim::scalarwave<T> p)
+{
+    return os << p.str();
+}
+
+#endif
 \ No newline at end of file
-#ifndef RTS_BEAM
-#define RTS_BEAM
-
-#include "../math/vector.h"
-#include "../math/function.h"
-#include "../optics/planewave.h"
-#include <vector>
-
-namespace stim{
-
-template<typename P>
-class beam : public planewave<P>
-{
-public:
-	enum beam_type {Uniform, Bartlett, Hamming, Hanning};
-
-private:
-	
-	P _na[2];		//numerical aperature of the focusing optics	
-	vec<P> f;		//focal point	
-	function<P, P> apod;	//apodization function
-	unsigned int apod_res;	//resolution of apodization filter functions
-
-	void apod_uniform()
-	{
-		apod = (P)1;
-	}
-	void apod_bartlett()
-	{
-		apod = (P)1;
-		apod.insert((P)1, (P)0);
-	}
-	void apod_hanning()
-	{
-		apod = (P)0;
-		P x, y;
-		for(unsigned int n=0; n<apod_res; n++)
-		{
-			x = (P)n/(P)apod_res;
-			y = pow( cos( ((P)3.14159 * x) / 2 ), 2);
-			apod.insert(x, y);
-		}
-	}
-	void apod_hamming()
-	{
-		apod = (P)0;
-		P x, y;
-		for(unsigned int n=0; n<apod_res; n++)
-		{
-			x = (P)n/(P)apod_res;
-			y = (P)27/(P)50 + ( (P)23/(P)50 ) * cos((P)3.14159 * x);
-			apod.insert(x, y);
-		}
-	}
-
-	void set_apod(beam_type type)
-	{
-		if(type == Uniform)
-			apod_uniform();
-		if(type == Bartlett)
-			apod_bartlett();
-		if(type == Hanning)
-			apod_hanning();
-		if(type == Hamming)
-			apod_hamming();
-	}
-
-public:
-
-	///constructor: build a default beam (NA=1.0)
-	beam(
-		vec<P> k = rts::vec<P>(0, 0, rtsTAU), 
-		vec<P> _E0 = rts::vec<P>(1, 0, 0), 
-		beam_type _apod = Uniform)
-		: planewave<P>(k, _E0)
-	{
-		_na[0] = (P)0.0;
-		_na[1] = (P)1.0;
-		f = vec<P>( (P)0, (P)0, (P)0 );
-		apod_res = 256;						//set the default resolution for apodization filters
-		set_apod(_apod);						//set the apodization function type
-	}
-
-	beam<P> refract(rts::vec<P> kn) const{
-
-		beam<P> new_beam;
-		new_beam._na[0] = _na[0];
-		new_beam._na[1] = _na[1];
-
-
-		rts::planewave<P> pw = planewave<P>::bend(kn);
-		//std::cout<<pw.str()<<std::endl;
-
-		new_beam.k = pw.kvec();
-		new_beam.E0 = pw.E();
-
-		return new_beam;
-	}
-
-	///Numerical Aperature functions
-	void NA(P na)
-	{
-		_na[0] = (P)0;
-		_na[1] = na;
-	}
-	void NA(P na0, P na1)
-	{
-		_na[0] = na0;
-		_na[1] = na1;
-	}
-
-	/*string str() : 
-	{
-		stringstream ss;
-		ss<<"Beam Center: "<<k<<std::endl;
-
-		return ss.str();
-	}*/
-
-	//Monte-Carlo decomposition into plane waves
-	std::vector< planewave<P> > mc(unsigned int N = 100000, unsigned int seed = 0) const
-	{
-		/*Create Monte-Carlo samples of a cassegrain objective by performing uniform sampling
-			of a sphere and projecting these samples onto an inscribed sphere.
-
-			seed	=	seed for the random number generator
-		*/
-		srand(seed);		//seed the random number generator
-
-		vec<P> k_hat = beam::k.norm();
-
-		///compute the rotation operator to transform (0, 0, 1) to k
-		P cos_angle = k_hat.dot(rts::vec<P>(0, 0, 1));
-		rts::matrix<P, 3> rotation;
-
-		//if the cosine of the angle is -1, the rotation is just a flip across the z axis
-		if(cos_angle == -1){
-			rotation(2, 2) = -1;
-		}
-		else if(cos_angle != 1.0)
-		{
-			rts::vec<P> r_axis = rts::vec<P>(0, 0, 1).cross(k_hat).norm();	//compute the axis of rotation
-			P angle = acos(cos_angle);							//compute the angle of rotation
-			rts::quaternion<P> quat;							//create a quaternion describing the rotation
-			quat.CreateRotation(angle, r_axis);
-			rotation = quat.toMatrix3();							//compute the rotation matrix
-		}
-
-		//find the phi values associated with the cassegrain ring
-		P PHI[2];
-		PHI[0] = (P)asin(_na[0]);
-		PHI[1] = (P)asin(_na[1]);
-
-		//calculate the z-axis cylinder coordinates associated with these angles
-		P Z[2];
-		Z[0] = cos(PHI[0]);
-		Z[1] = cos(PHI[1]);
-		P range = Z[0] - Z[1];
-
-		std::vector< planewave<P> > samples;	//create a vector of plane waves
-
-		//draw a distribution of random phi, z values
-		P z, phi, theta;
-		for(int i=0; i<N; i++)								//for each sample
-		{
-			z = ((P)rand() / (P)RAND_MAX) * range + Z[1];	//find a random position on the surface of a cylinder
-			theta = ((P)rand() / (P)RAND_MAX) * 2 * (P)3.14159;
-			phi = acos(z);									//project onto the sphere, computing phi in spherical coordinates
-
-			//compute and store cartesian coordinates
-			rts::vec<P> spherical(1, theta, phi);				//convert from spherical to cartesian coordinates
-			rts::vec<P> cart = spherical.sph2cart();
-			vec<P> k_prime = rotation * cart;				//create a sample vector
-
-			//store a wave refracted along the given direction
-			//std::cout<<"k prime: "<<rotation<<std::endl;
-			samples.push_back(planewave<P>::refract(k_prime) * apod(phi/PHI[1]));
-		}
-
-		return samples;
-	}
-
-	std::string str()
-	{
-		std::stringstream ss;
-		ss<<"Beam:"<<std::endl;
-		//ss<<"	Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl;
-		ss<<"	Central Plane Wave: "<<beam::k<<std::endl;
-		if(_na[0] == 0)
-			ss<<"	NA: "<<_na[1];
-		else
-			ss<<"	NA: "<<_na[0]<<" -- "<<_na[1];
-
-		return ss.str();
-	}
-
-
-
-};
-
-}
-
-#endif
+#ifndef RTS_BEAM
+#define RTS_BEAM
+
+#include "../math/vector.h"
+#include "../math/function.h"
+#include "../optics/planewave.h"
+#include <vector>
+
+namespace stim{
+
+template<typename P>
+class beam : public planewave<P>
+{
+public:
+	enum beam_type {Uniform, Bartlett, Hamming, Hanning};
+
+private:
+	
+	P _na[2];		//numerical aperature of the focusing optics	
+	vec<P> f;		//focal point	
+	function<P, P> apod;	//apodization function
+	unsigned int apod_res;	//resolution of apodization filter functions
+
+	void apod_uniform()
+	{
+		apod = (P)1;
+	}
+	void apod_bartlett()
+	{
+		apod = (P)1;
+		apod.insert((P)1, (P)0);
+	}
+	void apod_hanning()
+	{
+		apod = (P)0;
+		P x, y;
+		for(unsigned int n=0; n<apod_res; n++)
+		{
+			x = (P)n/(P)apod_res;
+			y = pow( cos( ((P)3.14159 * x) / 2 ), 2);
+			apod.insert(x, y);
+		}
+	}
+	/// Hamming apodization: assigns (P)0 to the function, then tabulates
+	/// y = 27/50 + (23/50) * cos(pi * x) at the apod_res positions x = n / apod_res (n = 0 .. apod_res-1).
+	/// The position x = 1 itself is never inserted.
+	void apod_hamming()
+	{
+		apod = (P)0;
+		for(unsigned int idx = 0; idx < apod_res; idx++)
+		{
+			P pos = (P)idx/(P)apod_res;													//normalized position in [0, 1)
+			P weight = (P)27/(P)50 + ( (P)23/(P)50 ) * cos((P)3.14159 * pos);			//filter value at this position
+			apod.insert(pos, weight);
+		}
+	}
+
+	/// Rebuilds the apodization function for the requested filter type by
+	/// dispatching to the matching apod_*() builder.
+	void set_apod(beam_type type)
+	{
+		switch(type)
+		{
+		case Uniform:
+			apod_uniform();
+			break;
+		case Bartlett:
+			apod_bartlett();
+			break;
+		case Hanning:
+			apod_hanning();
+			break;
+		case Hamming:
+			apod_hamming();
+			break;
+		default:			//unknown value: leave the current function untouched
+			break;
+		}
+	}
+
+public:
+
+	/// Constructor: builds a beam around the central plane wave (k, _E0).
+	/// Defaults: NA range [0, 1], focal point at the origin, 256-sample apodization filters.
+	///   k     = wave vector handed to the planewave base class
+	///   _E0   = field vector handed to the planewave base class
+	///   _apod = apodization filter type
+	beam(
+		vec<P> k = rts::vec<P>(0, 0, rtsTAU), 
+		vec<P> _E0 = rts::vec<P>(1, 0, 0), 
+		beam_type _apod = Uniform)
+		: planewave<P>(k, _E0)
+	{
+		f = vec<P>( (P)0, (P)0, (P)0 );		//focal point at the origin
+		NA((P)1.0);							//inner NA = 0, outer NA = 1
+		apod_res = 256;						//must be set before the filter is built
+		set_apod(_apod);					//build the requested apodization function
+	}
+
+	/// Returns a new beam whose central plane wave is this one bent toward kn
+	/// (via planewave<P>::bend). The NA range is copied; every other property of
+	/// the returned beam (focal point, apodization) keeps its default-constructed value.
+	beam<P> refract(rts::vec<P> kn) const{
+
+		//bend the central plane wave first
+		rts::planewave<P> bent = planewave<P>::bend(kn);
+
+		//start from a default beam and carry over the aperture and the bent wave
+		beam<P> result;
+		result.NA(_na[0], _na[1]);
+		result.k = bent.kvec();
+		result.E0 = bent.E();
+
+		return result;
+	}
+
+	///Numerical Aperture functions
+	/// Sets the NA range to [0, na] (no inner bound).
+	void NA(P na)
+	{
+		_na[0] = (P)0;
+		_na[1] = na;
+	}
+	/// Sets the NA range to [na0, na1]; mc() converts both bounds to angles with asin().
+	void NA(P na0, P na1)
+	{
+		_na[0] = na0;
+		_na[1] = na1;
+	}
+
+	/*string str() : 
+	{
+		stringstream ss;
+		ss<<"Beam Center: "<<k<<std::endl;
+
+		return ss.str();
+	}*/
+
+	/// Monte-Carlo decomposition of the beam into plane waves.
+	///   N    = number of plane-wave samples to generate
+	///   seed = seed passed to srand(); note that this re-seeds the global C random number generator
+	/// Returns N plane waves, each produced by planewave<P>::refract() along a sampled direction
+	/// and scaled by the apodization function evaluated at phi / phi_max.
+	std::vector< planewave<P> > mc(unsigned int N = 100000, unsigned int seed = 0) const
+	{
+		/*Create Monte-Carlo samples of a cassegrain objective by uniformly sampling the
+			surface of a cylinder (random z and theta) and projecting these samples onto
+			the unit sphere.
+		*/
+		srand(seed);		//seed the random number generator
+
+		vec<P> k_hat = beam::k.norm();
+
+		///compute the rotation operator to transform (0, 0, 1) to k
+		P cos_angle = k_hat.dot(rts::vec<P>(0, 0, 1));
+		rts::matrix<P, 3> rotation;
+
+		//if the cosine of the angle is -1, the rotation is just a flip across the z axis
+		//NOTE(review): this relies on a default-constructed matrix being the identity -- confirm
+		if(cos_angle == -1){
+			rotation(2, 2) = -1;
+		}
+		else if(cos_angle != 1.0)
+		{
+			rts::vec<P> r_axis = rts::vec<P>(0, 0, 1).cross(k_hat).norm();	//compute the axis of rotation
+			P angle = acos(cos_angle);							//compute the angle of rotation
+			rts::quaternion<P> quat;							//create a quaternion describing the rotation
+			quat.CreateRotation(angle, r_axis);
+			rotation = quat.toMatrix3();							//compute the rotation matrix
+		}
+
+		//find the phi values associated with the cassegrain ring
+		P PHI[2];
+		PHI[0] = (P)asin(_na[0]);
+		PHI[1] = (P)asin(_na[1]);
+
+		//calculate the z-axis cylinder coordinates associated with these angles
+		P Z[2];
+		Z[0] = cos(PHI[0]);
+		Z[1] = cos(PHI[1]);
+		P range = Z[0] - Z[1];
+
+		std::vector< planewave<P> > samples;	//create a vector of plane waves
+		samples.reserve(N);						//avoid repeated reallocation while filling
+
+		//draw a distribution of random phi, z values
+		//(the two rand() calls stay in their original order so a given seed yields the same sequence)
+		P z, phi, theta;
+		for(unsigned int i=0; i<N; i++)						//for each sample (unsigned index matches N)
+		{
+			z = ((P)rand() / (P)RAND_MAX) * range + Z[1];	//find a random position on the surface of a cylinder
+			theta = ((P)rand() / (P)RAND_MAX) * 2 * (P)3.14159;
+
+			//guard acos() against rounding that pushes z just outside [-1, 1]
+			if(z > (P)1) z = (P)1;
+			if(z < (P)-1) z = (P)-1;
+			phi = acos(z);									//project onto the sphere, computing phi in spherical coordinates
+
+			//compute and store cartesian coordinates
+			rts::vec<P> spherical(1, theta, phi);				//convert from spherical to cartesian coordinates
+			rts::vec<P> cart = spherical.sph2cart();
+			vec<P> k_prime = rotation * cart;				//create a sample vector
+
+			//normalized aperture position; an outer NA of 0 would otherwise give 0/0
+			P apod_x = (PHI[1] != (P)0) ? phi / PHI[1] : (P)0;
+
+			//store a wave refracted along the given direction
+			samples.push_back(planewave<P>::refract(k_prime) * apod(apod_x));
+		}
+
+		return samples;
+	}
+
+	/// Returns a short text description of the beam: the central wave vector and the NA
+	/// (a single value when the inner NA is 0, otherwise the "inner -- outer" range).
+	std::string str()
+	{
+		std::stringstream out;
+		out<<"Beam:"<<std::endl;
+		out<<"	Central Plane Wave: "<<beam::k<<std::endl;
+
+		if(_na[0] != 0)
+			out<<"	NA: "<<_na[0]<<" -- "<<_na[1];		//obscured aperture: print the range
+		else
+			out<<"	NA: "<<_na[1];						//no inner bound: print the outer NA only
+
+		return out.str();
+	}
+
+
+
+};
+
+}
+
+#endif
-#ifndef RTS_MATERIAL_H
-#define RTS_MATERIAL_H
-
-#include <vector>
-#include <ostream>
-#include <iostream>
-#include <fstream>
-#include <complex>
-#include <algorithm>
-#include <sstream>
-#include "../math/complex.h"
-#include "../math/constants.h"
-#include "../math/function.h"
-
-namespace stim{
-
-//Material class - default representation for the material property is the refractive index (RI)
-template<typename T>
-class material : public function< T, complex<T> >{
-
-public:
-    enum wave_property{microns, inverse_cm};
-    enum material_property{ri, absorbance};
-
-private:
-
-    using function< T, complex<T> >::X;
-    using function< T, complex<T> >::Y;
-    using function< T, complex<T> >::insert;
-    using function< T, complex<T> >::bounding;
-
-    std::string name;	//name for the material (defaults to file name)
-
-    void process_header(std::string str, wave_property& wp, material_property& mp){
-
-    	std::stringstream ss(str);	//create a stream from the data string
-    	std::string line;
-    	std::getline(ss, line);		//get the first line as a string
-		while(line[0] == '#'){		//continue looping while the line is a comment
-
-			std::stringstream lstream(line);	//create a stream from the line
-			lstream.ignore();					//ignore the first character ('#')
-
-			std::string prop;		//get the property name
-			lstream>>prop;
-
-			if(prop == "X"){
-				std::string wp_name;
-				lstream>>wp_name;
-				if(wp_name == "microns") wp = microns;
-				else if(wp_name == "inverse_cm") wp = inverse_cm;
-			}
-			else if(prop == "Y"){
-				std::string mp_name;
-				lstream>>mp_name;
-				if(mp_name == "ri") mp = ri;
-				else if(mp_name == "absorbance") mp = absorbance;
-			}
-
-			std::getline(ss, line);		//get the next line
-		}
-
-		function< T, stim::complex<T> >::process_string(str);
-	}
-
-    void from_inverse_cm(){
-    	//convert inverse centimeters to wavelength (in microns)
-    	for(unsigned int i=0; i<X.size(); i++)
-    		X[i] = 10000 / X[i];
-
-    	//reverse the function array
-    	std::reverse(X.begin(), X.end());
-    	std::reverse(Y.begin(), Y.end());
-
-    }
-
-    void init(){
-    	bounding[0] = bounding[1] = stim::complex<T>(1, 0);
-    }
-
-
-public:
-
-    material(std::string filename, wave_property wp, material_property mp){
-    	name = filename;
-    	load(filename, wp, mp);
-    }
-
-    material(std::string filename){
-    	name = filename;
-    	load(filename);
-    }
-
-    material(){
-    	init();
-    }
-
-    complex<T> getN(T lambda){
-    	return function< T, complex<T> >::linear(lambda);
-    }
-
-    void load(std::string filename, wave_property wp, material_property mp){
-
-    	//load the file as a function
-    	function< T, complex<T> >::load(filename);
-    }
-
-    void load(std::string filename){
-
-    	wave_property wp = inverse_cm;
-    	material_property mp = ri;
-    	//turn the file into a string
-    	std::ifstream t(filename.c_str());	//open the file as a stream
-
-    	if(!t){
-    		std::cout<<"ERROR: Couldn't open the material file '"<<filename<<"'"<<std::endl;
-    		exit(1);
-    	}
-		std::string str((std::istreambuf_iterator<char>(t)),
-		std::istreambuf_iterator<char>());
-
-		//process the header information
-		process_header(str, wp, mp);
-
-		//convert units
-		if(wp == inverse_cm)
-			from_inverse_cm();
-		//set the bounding values
-		bounding[0] = Y[0];
-		bounding[1] = Y.back();
-    }
-    std::string str(){
-    	std::stringstream ss;
-    	ss<<name<<std::endl;
-    	ss<<function< T, complex<T> >::str();
-    	return ss.str();
-    }
-    std::string get_name(){
-    	return name;
-    }
-
-    void set_name(std::string str){
-    	name = str;
-    }
-
-};
-
-}
-
-
-
-
-#endif
+#ifndef RTS_MATERIAL_H
+#define RTS_MATERIAL_H
+
+#include <vector>
+#include <ostream>
+#include <iostream>
+#include <fstream>
+#include <complex>
+#include <algorithm>
+#include <sstream>
+#include "../math/complex.h"
+#include "../math/constants.h"
+#include "../math/function.h"
+
+namespace stim{
+
+//Material class - default representation for the material property is the refractive index (RI)
+template<typename T>
+class material : public function< T, complex<T> >{
+
+public:
+    enum wave_property{microns, inverse_cm};
+    enum material_property{ri, absorbance};
+
+private:
+
+    using function< T, complex<T> >::X;
+    using function< T, complex<T> >::Y;
+    using function< T, complex<T> >::insert;
+    using function< T, complex<T> >::bounding;
+
+    std::string name;	//name for the material (defaults to file name)
+
+    /// Parses the leading '#' comment lines of a material file and then hands the whole
+    /// string to function<>::process_string().
+    ///   str = complete file contents
+    ///   wp  = set to microns / inverse_cm when a "# X <unit>" header line is found
+    ///   mp  = set to ri / absorbance when a "# Y <property>" header line is found
+    /// wp and mp are left unchanged when the corresponding header line is absent or unrecognized.
+    void process_header(std::string str, wave_property& wp, material_property& mp){
+
+    	std::stringstream ss(str);	//create a stream from the data string
+    	std::string line;
+
+		//Read header lines until the first non-comment line or the end of the input.
+		//The getline() result must be tested: once the stream is exhausted a failed getline()
+		//leaves "line" unchanged, so a file made only of comment lines would loop forever.
+		while(std::getline(ss, line) && !line.empty() && line[0] == '#'){
+
+			std::stringstream lstream(line);	//create a stream from the line
+			lstream.ignore();					//ignore the first character ('#')
+
+			std::string prop;		//get the property name
+			lstream>>prop;
+
+			if(prop == "X"){
+				std::string wp_name;
+				lstream>>wp_name;
+				if(wp_name == "microns") wp = microns;
+				else if(wp_name == "inverse_cm") wp = inverse_cm;
+			}
+			else if(prop == "Y"){
+				std::string mp_name;
+				lstream>>mp_name;
+				if(mp_name == "ri") mp = ri;
+				else if(mp_name == "absorbance") mp = absorbance;
+			}
+		}
+
+		function< T, stim::complex<T> >::process_string(str);
+	}
+
+    /// Converts the X axis from wavenumber (cm^-1) to wavelength (microns) using
+    /// X = 10000 / X, and flips the order of both X and Y.
+    void from_inverse_cm(){
+    	//flip the sample order of both arrays
+    	std::reverse(X.begin(), X.end());
+    	std::reverse(Y.begin(), Y.end());
+
+    	//convert every (already reordered) sample position to microns
+    	for(unsigned int n = 0; n < X.size(); n++)
+    		X[n] = 10000 / X[n];
+    }
+
+    /// Sets both bounding values to the complex number (1, 0).
+    /// NOTE(review): presumably this is the refractive index returned outside the sampled range -- confirm in function<>.
+    void init(){
+    	bounding[0] = bounding[1] = stim::complex<T>(1, 0);
+    }
+
+
+public:
+
+    /// Builds a material from a file; the file name doubles as the material name.
+    /// NOTE(review): the three-argument load() called here ignores wp and mp and does not set
+    /// the bounding values, and init() is not called on this path -- confirm this is intended.
+    material(std::string filename, wave_property wp, material_property mp){
+    	name = filename;
+    	load(filename, wp, mp);
+    }
+
+    /// Builds a material from a file, reading the units from the file's '#' header lines;
+    /// the file name doubles as the material name.
+    material(std::string filename){
+    	name = filename;
+    	load(filename);
+    }
+
+    /// Default constructor: no samples are loaded; init() sets both bounding values to (1, 0).
+    material(){
+    	init();
+    }
+
+    /// Returns the material value at lambda, evaluated with the base class's linear().
+    /// NOTE(review): lambda is presumably a wavelength in microns (load() converts cm^-1 to microns) -- confirm.
+    complex<T> getN(T lambda){
+    	return function< T, complex<T> >::linear(lambda);
+    }
+
+    /// Loads the file through the base class's load().
+    /// NOTE(review): wp and mp are accepted but never used here -- no unit conversion is applied
+    /// and the bounding values are not updated, unlike load(filename). Confirm whether that is intended.
+    void load(std::string filename, wave_property wp, material_property mp){
+
+    	//load the file as a function
+    	function< T, complex<T> >::load(filename);
+    }
+
+    /// Loads a material file, taking the X/Y units from its '#' header lines.
+    /// Defaults when the header does not say otherwise: X in inverse centimeters, Y as refractive index.
+    /// Wavenumber data is converted to wavelength, and the bounding values are set to the
+    /// first and last samples. Prints an error and exits if the file cannot be opened or
+    /// holds no data samples.
+    void load(std::string filename){
+
+    	wave_property wp = inverse_cm;
+    	material_property mp = ri;
+    	//turn the file into a string
+    	std::ifstream t(filename.c_str());	//open the file as a stream
+
+    	if(!t){
+    		std::cout<<"ERROR: Couldn't open the material file '"<<filename<<"'"<<std::endl;
+    		exit(1);
+    	}
+		std::string str((std::istreambuf_iterator<char>(t)),
+		std::istreambuf_iterator<char>());
+
+		//process the header information
+		process_header(str, wp, mp);
+
+		//convert units
+		if(wp == inverse_cm)
+			from_inverse_cm();
+
+		//a file without samples would make Y[0] / Y.back() undefined behavior
+		if(Y.empty()){
+			std::cout<<"ERROR: The material file '"<<filename<<"' contains no data"<<std::endl;
+			exit(1);
+		}
+
+		//set the bounding values
+		bounding[0] = Y[0];
+		bounding[1] = Y.back();
+    }
+    /// Returns the material name on its own line followed by the base class's str() output.
+    std::string str(){
+    	std::stringstream ss;
+    	ss<<name<<std::endl;
+    	ss<<function< T, complex<T> >::str();
+    	return ss.str();
+    }
+    /// Returns the material name (the file name when the material was built from a file).
+    std::string get_name(){
+    	return name;
+    }
+
+    /// Replaces the material name.
+    void set_name(std::string str){
+    	name = str;
+    }
+
+};
+
+}
+
+
+
+
+#endif
-#include "../optics/material.h"
-#include "../math/complexfield.cuh"
-#include "../math/constants.h"
-//#include "../envi/bil.h"
-
-#include "cufft.h"
-
-#include <vector>
-#include <sstream>
-
-namespace stim{
-
-//this function writes a sinc function to "dest" such that an iFFT produces a slab
-template<typename T>
-__global__ void gpu_mirst1d_layer_fft(complex<T>* dest, complex<T>* ri, 
-									  T* src, T* zf, 
-									  T w, unsigned int zR, unsigned int nuR){
-	//dest = complex field representing the sample
-	//ri = refractive indices for each wavelength
-	//src = intensity of the light source for each wavelength
-	//zf = z position of the slab interface for each wavelength (accounting for optical path length)
-	//w = width of the slab (in pixels)
-	//zR = number of z-axis samples
-	//nuR = number of wavelengths
-
-    //get the current coordinate in the plane slice
-	int ifz = blockIdx.x * blockDim.x + threadIdx.x;
-	int inu = blockIdx.y * blockDim.y + threadIdx.y;
-
-	//make sure that the thread indices are in-bounds
-	if(inu >= nuR || ifz >= zR) return;
-
-	int i = inu * zR + ifz;
-
-    T fz;
-    if(ifz < zR/2)
-        fz = ifz / (T)zR;
-    else
-        fz = -(zR - ifz) / (T)zR;
-
-    //if the slab starts outside of the simulation domain, just return
-    if(zf[inu] >= zR) return;
-
-	//fill the array along z with a sinc function representing the Fourier transform of the layer
-
-	T opl = w * ri[inu].real();			//optical path length
-
-	//handle the case where the slab goes outside the simulation domain
-	if(zf[inu] + opl >= zR)
-		opl = zR - zf[inu];
-
-	if(opl == 0) return;
-
-	//T l = w * ri[inu].real();
-	//complex<T> e(0.0, -2 * PI * fz * (zf[inu] + zR/2 - l/2.0));
-	complex<T> e(0, -2 * stimPI * fz * (zf[inu] + opl/2));
-
-	complex<T> eta = ri[inu] * ri[inu] - 1;
-
-	//dest[i] = fz;//exp(e) * m[inu] * src[inu] * sin(PI * fz * l) / (PI * fz);
-	if(ifz == 0)
-        dest[i] += opl * exp(e) * eta * src[inu];
-    else
-        dest[i] += opl * exp(e) * eta * src[inu] * sin(stimPI * fz * opl) / (stimPI * fz * opl);
-}
-
-template<typename T>
-__global__ void gpu_mirst1d_increment_z(T* zf, complex<T>* ri, T w, unsigned int S){
-	//zf = current z depth (optical path length) in pixels
-	//ri = refractive index of the material
-	//w = actual width of the layer (in pixels)
-
-
-	//compute the index for this thread
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if(i >= S) return;
-
-	if(ri == NULL)
-		zf[i] += w;
-	else
-		zf[i] += ri[i].real() * w;
-}
-
-//apply the 1D MIRST filter to an existing sample (overwriting the sample)
-template<typename T>
-__global__ void gpu_mirst1d_apply_filter(complex<T>* sampleFFT, T* lambda, 
-								 T dFz,
-								 T inNA, T outNA, 
-								 unsigned int lambdaR, unsigned int zR, 
-								 T sigma = 0){
-	//sampleFFT = the sample in the Fourier domain (will be overwritten)
-	//lambda = list of wavelengths
-	//dFz = delta along the Fz axis in the frequency domain
-	//inNA = NA of the internal obscuration
-	//outNA = NA of the objective
-	//zR = number of pixels along the Fz axis (same as the z-axis)
-	//lambdaR = number of wavelengths
-	//sigma = width of the Gaussian source
-	int ifz = blockIdx.x * blockDim.x + threadIdx.x;
-	int inu = blockIdx.y * blockDim.y + threadIdx.y;
-
-	if(inu >= lambdaR || ifz >= zR) return;
-
-	//calculate the index into the sample FT
-	int i = inu * zR + ifz;
-
-	//compute the frequency (and set all negative spatial frequencies to zero)
-	T fz;
-	if(ifz < zR / 2)
-	    fz = ifz * dFz;
-	//if the spatial frequency is negative, set it to zero and exit
-	else{
-	    sampleFFT[i] = 0;
-	    return;
-	}
-
-	//compute the frequency in inverse microns
-	T nu = 1/lambda[inu];
-
-	//determine the radius of the integration circle
-	T nu_sq = nu * nu;
-	T fz_sq = (fz * fz) / 4;
-
-	//cut off frequencies above the diffraction limit
-	T r;
-	if(fz_sq < nu_sq)
-	    r = sqrt(nu_sq - fz_sq);
-	else
-	    r = 0;
-
-	//account for the optics
-	T Q = 0;
-	if(r > nu * inNA && r < nu * outNA)
-	    Q = 1;
-
-	//account for the source
-	//T sigma = 30.0;
-	T s = exp( - (r*r * sigma*sigma) / 2 );
-	//T s=1;
-
-	//compute the final filter
-	T mirst = 0;
-	if(fz != 0)
-	    mirst = 2 * stimPI * r * s * Q * (1/fz);
-
-	sampleFFT[i] *= mirst;
-
-}
-
-/*This object performs a 1-dimensional (layered) MIRST simulation
-*/
-template<typename T>
-class mirst1d{
-
-private:
-	unsigned int Z;	//z-axis resolution
-	unsigned int pad;	//pixel padding on either side of the sample
-
-	std::vector< material<T> > matlist;	//list of materials
-	std::vector< T > layers;				//list of layer thicknesses
-
-	std::vector< T > lambdas;		//list of wavelengths that are being simulated
-	unsigned int S;					//number of wavelengths (size of "lambdas")
-
-	T NA[2];						//numerical aperature (central obscuration and outer diameter)
-
-	function<T, T> source_profile;	//profile (spectrum) of the source (expressed in inverse centimeters)
-
-	complexfield<T, 1> scratch;		//scratch GPU memory used to build samples, transforms, etc.
-
-	void fft(int direction = CUFFT_FORWARD){
-
-		unsigned padZ = Z + pad;
-		
-		//create cuFFT handles
-		cufftHandle plan;
-		cufftResult result;
-		
-		if(sizeof(T) == 4)
-			result = cufftPlan1d(&plan, padZ, CUFFT_C2C, lambdas.size());	//single precision
-		else
-			result = cufftPlan1d(&plan, padZ, CUFFT_Z2Z, lambdas.size());	//double precision
-
-		//check for Plan 1D errors
-		if(result != CUFFT_SUCCESS){
-			std::cout<<"Error creating CUFFT plan for computing the FFT:"<<std::endl;
-			CufftError(result);
-			exit(1);
-		}
-
-		if(sizeof(T) == 4)
-			result = cufftExecC2C(plan, (cufftComplex*)scratch.ptr(), (cufftComplex*)scratch.ptr(), direction);
-		else
-			result = cufftExecZ2Z(plan, (cufftDoubleComplex*)scratch.ptr(), (cufftDoubleComplex*)scratch.ptr(), direction);
-
-		//check for FFT errors
-		if(result != CUFFT_SUCCESS){
-			std::cout<<"Error executing CUFFT to compute the FFT."<<std::endl;
-			CufftError(result);
-			exit(1);
-		}
-
-		cufftDestroy(plan);
-	}
-
-
-	//initialize the scratch memory
-	void init_scratch(){
-		scratch = complexfield<T, 1>(Z + pad , lambdas.size());
-		scratch = 0;
-	}
-
-	//get the list of scattering efficiency (eta) values for a specified layer
-	std::vector< complex<T> > layer_etas(unsigned int l){
-
-		std::vector< complex<T> > etas;
-
-		//fill the list of etas
-		for(unsigned int i=0; i<lambdas.size(); i++)
-			etas.push_back( matlist[l].eta(lambdas[i]) );
-		return etas;
-	}
-
-	//calculates the optimal block and grid sizes using information from the GPU
-	void cuda_params(dim3& grids, dim3& blocks){
-		int maxThreads = stim::maxThreadsPerBlock(); //compute the optimal block size
-		int SQRT_BLOCK = (int)std::sqrt((float)maxThreads);
-
-		//create one thread for each detector pixel
-		blocks = dim3(SQRT_BLOCK, SQRT_BLOCK);
-		grids = dim3(((Z + 2 * pad) + SQRT_BLOCK -1)/SQRT_BLOCK, (S + SQRT_BLOCK - 1)/SQRT_BLOCK);
-	}
-
-	//add the fourier transform of layer n to the scratch space
-	void build_layer_fft(unsigned int n, T* zf){
-		unsigned int paddedZ = Z + pad;
-
-		T wpx = layers[n] / dz();	//calculate the width of the layer in pixels
-
-		//allocate memory for the refractive index
-		complex<T>* gpuRi;
-		HANDLE_ERROR(cudaMalloc( (void**)&gpuRi, sizeof(complex<T>) * S));
-
-		//allocate memory for the source profile
-		T* gpuSrc;
-		HANDLE_ERROR(cudaMalloc( (void**)&gpuSrc, sizeof(T) * S));
-
-		complex<T> ri;
-		T source;
-		//store the refractive index and source profile in a CPU array
-		for(int inu=0; inu<S; inu++){
-			//save the refractive index to the GPU
-			ri = matlist[n].getN(lambdas[inu]);
-			HANDLE_ERROR(cudaMemcpy( gpuRi + inu, &ri, sizeof(complex<T>), cudaMemcpyHostToDevice ));
-
-			//save the source profile to the GPU
-			source = source_profile(10000 / lambdas[inu]);
-			HANDLE_ERROR(cudaMemcpy( gpuSrc + inu, &source, sizeof(T), cudaMemcpyHostToDevice ));
-
-		}
-
-		//create one thread for each pixel of the field slice
-		dim3 gridDim, blockDim;
-		cuda_params(gridDim, blockDim);
-		stim::gpu_mirst1d_layer_fft<<<gridDim, blockDim>>>(scratch.ptr(), gpuRi, gpuSrc, zf, wpx, paddedZ, S);
-
-		int linBlock = stim::maxThreadsPerBlock(); //compute the optimal block size
-		int linGrid = S / linBlock + 1;
-		stim::gpu_mirst1d_increment_z <<<linGrid, linBlock>>>(zf, gpuRi, wpx, S);
-
-		//free memory
-		HANDLE_ERROR(cudaFree(gpuRi));
-		HANDLE_ERROR(cudaFree(gpuSrc));
-	}
-
-	void build_sample(){
-		init_scratch();		//initialize the GPU scratch space
-		//build_layer(1);
-
-		T* zf;
-		HANDLE_ERROR(cudaMalloc(&zf, sizeof(T) * S));
-		HANDLE_ERROR(cudaMemset(zf, 0, sizeof(T) * S));
-
-		//render each layer of the sample
-		for(unsigned int l=0; l<layers.size(); l++){
-			build_layer_fft(l, zf);
-		}
-
-		HANDLE_ERROR(cudaFree(zf));
-	}
-
-	void apply_filter(){
-		dim3 gridDim, blockDim;
-		cuda_params(gridDim, blockDim);
-
-		unsigned int Zpad = Z + pad;
-
-		T sim_range = dz() * Zpad;
-    	T dFz = 1 / sim_range;
-
-		//copy the array of wavelengths to the GPU
-		T* gpuLambdas;
-		HANDLE_ERROR(cudaMalloc(&gpuLambdas, sizeof(T) * Zpad));
-		HANDLE_ERROR(cudaMemcpy(gpuLambdas, &lambdas[0], sizeof(T) * Zpad, cudaMemcpyHostToDevice));
-		stim::gpu_mirst1d_apply_filter <<<gridDim, blockDim>>>(scratch.ptr(), gpuLambdas, 
-								 dFz,
-								 NA[0], NA[1], 
-								 S, Zpad);
-	}
-
-	//crop the image to the sample thickness - keep in mind that sample thickness != optical path length
-	void crop(){
-
-		scratch = scratch.crop(Z, S);
-	}
-	
-	//save the scratch field as a binary file
-	void to_binary(std::string filename){
-
-	}
-
-
-public:
-
-	//constructor
-	mirst1d(unsigned int rZ = 100,
-			unsigned int padding = 0){
-		Z = rZ;
-		pad = padding;
-		NA[0] = 0;
-		NA[1] = 0.8;
-		S = 0;
-		source_profile = 1;
-	}
-
-	//add a layer, thickness = microns
-	void add_layer(material<T> mat, T thickness){
-		matlist.push_back(mat);
-		layers.push_back(thickness);
-	}
-
-	void add_layer(std::string filename, T thickness){
-		add_layer(material<T>(filename), thickness);
-	}
-
-	//adds a profile spectrum for the light source
-	void set_source(std::string filename){
-		source_profile.load(filename);
-	}
-
-	//adds a block of wavenumbers (cm^-1) to the simulation parameters
-	void add_wavenumbers(unsigned int start, unsigned int stop, unsigned int step){
-		unsigned int nu = start;
-		while(nu <= stop){
-			lambdas.push_back((T)10000 / nu);
-			nu += step;
-		}
-		S = lambdas.size();		//increment the number of wavelengths (shorthand for later)
-	}
-
-	T thickness(){
-		T t = 0;
-		for(unsigned int l=0; l<layers.size(); l++)
-			t += layers[l];
-		return t;
-	}
-
-	void padding(unsigned int padding = 0){
-		pad = padding;
-	}
-
-	T dz(){
-		return thickness() / Z;		//calculate the z-axis step size
-	}
-
-	void na(T in, T out){
-		NA[0] = in;
-		NA[1] = out;
-	}
-
-	void na(T out){
-		na(0, out);
-	}
-
-	stim::function<T, T> get_source(){
-		return source_profile;
-	}
-
-	void save_sample(std::string filename){
-		//create a sample and save the magnitude as an image
-		build_sample();
-		fft(CUFFT_INVERSE);
-		scratch.toImage(filename, stim::complexfield<T, 1>::magnitude);
-	}
-
-	void save_mirst(std::string filename, bool binary = true){
-		//apply the MIRST filter to a sample and save the image
-
-		//build the sample in the Fourier domain
-		build_sample();
-
-		//apply the MIRST filter
-		apply_filter();
-
-		//apply an inverse FFT to bring the results back into the spatial domain
-		fft(CUFFT_INVERSE);
-
-		crop();
-
-		//save the image
-		if(binary)
-			to_binary(filename);
-		else
-			scratch.toImage(filename, stim::complexfield<T, 1>::magnitude);
-	}
-
-
-
-
-	std::string str(){
-
-		stringstream ss;
-		ss<<"1D MIRST Simulation========================="<<std::endl;
-		ss<<"z-axis resolution: "<<Z<<std::endl;
-		ss<<"simulation domain: ["<<lambdas[0]<<", "<<lambdas.back()<<"]"<<std::endl;
-		ss<<"number of wavelengths: "<<lambdas.size()<<std::endl;
-		ss<<"padding: "<<pad<<std::endl;
-		ss<<"sample thickness: "<<thickness()<<" um"<<std::endl;
-		ss<<"dz: "<<dz()<<" um"<<std::endl;
-		ss<<std::endl;
-		ss<<layers.size()<<" layers-------------"<<std::endl;
-		for(unsigned int l=0; l<layers.size(); l++)
-			ss<<"layer "<<l<<": "<<layers[l]<<" um"<<"---------"<<std::endl<<matlist[l].str()<<std::endl;
-
-		ss<<"source profile-----------"<<std::endl;
-		ss<<get_source().str()<<std::endl;
-
-		return ss.str();
-
-
-	}
-
-
-
-};
-
-}
+#include "../optics/material.h"
+#include "../math/complexfield.cuh"
+#include "../math/constants.h"
+//#include "../envi/bil.h"
+
+#include "cufft.h"
+
+#include <vector>
+#include <sstream>
+
+namespace stim{
+
+//Adds the Fourier transform of one slab (a phase-shifted sinc) to "dest", such that an iFFT produces the slab.
+//Expects a 2D launch: x indexes the z/frequency samples, y indexes the wavelengths; out-of-range threads exit.
+template<typename T>
+__global__ void gpu_mirst1d_layer_fft(complex<T>* dest, complex<T>* ri, 
+									  T* src, T* zf, 
+									  T w, unsigned int zR, unsigned int nuR){
+	//dest = complex field representing the sample
+	//ri = refractive indices for each wavelength
+	//src = intensity of the light source for each wavelength
+	//zf = z position of the slab interface for each wavelength (accounting for optical path length)
+	//w = width of the slab (in pixels)
+	//zR = number of z-axis samples
+	//nuR = number of wavelengths
+
+    //get the current coordinate in the plane slice
+	int ifz = blockIdx.x * blockDim.x + threadIdx.x;
+	int inu = blockIdx.y * blockDim.y + threadIdx.y;
+
+	//make sure that the thread indices are in-bounds
+	if(inu >= nuR || ifz >= zR) return;
+
+	int i = inu * zR + ifz;
+
+    //normalized frequency: the upper half of the array holds the negative frequencies
+    T fz;
+    if(ifz < zR/2)
+        fz = ifz / (T)zR;
+    else
+        //convert to T BEFORE negating: (zR - ifz) is unsigned, and negating an unsigned value
+        //wraps around to a huge positive number instead of producing a negative frequency
+        fz = -(T)(zR - ifz) / (T)zR;
+
+    //if the slab starts outside of the simulation domain, just return
+    if(zf[inu] >= zR) return;
+
+	//fill the array along z with a sinc function representing the Fourier transform of the layer
+
+	T opl = w * ri[inu].real();			//optical path length
+
+	//handle the case where the slab goes outside the simulation domain
+	if(zf[inu] + opl >= zR)
+		opl = zR - zf[inu];
+
+	if(opl == 0) return;
+
+	//phase shift that moves the slab center to zf + opl/2
+	complex<T> e(0, -2 * stimPI * fz * (zf[inu] + opl/2));
+
+	complex<T> eta = ri[inu] * ri[inu] - 1;
+
+	//the sinc term is 1 at zero frequency, so it is skipped there to avoid 0/0
+	if(ifz == 0)
+        dest[i] += opl * exp(e) * eta * src[inu];
+    else
+        dest[i] += opl * exp(e) * eta * src[inu] * sin(stimPI * fz * opl) / (stimPI * fz * opl);
+}
+
+//Advances the per-wavelength depth zf by the thickness of one layer.
+//Expects a 1D launch with one thread per wavelength; threads past S do nothing.
+template<typename T>
+__global__ void gpu_mirst1d_increment_z(T* zf, complex<T>* ri, T w, unsigned int S){
+	//zf = current z depth (optical path length) in pixels
+	//ri = refractive index of the material (may be NULL)
+	//w = actual width of the layer (in pixels)
+	//S = number of entries in zf
+
+	int tid = blockIdx.x * blockDim.x + threadIdx.x;		//one thread per entry
+
+	if(tid < S){
+		if(ri != NULL)
+			zf[tid] += ri[tid].real() * w;		//scale the width by the real refractive index
+		else
+			zf[tid] += w;						//no refractive index given: use the physical width
+	}
+}
+
+//Multiplies an existing sample (in the Fourier domain) by the 1D MIRST filter, overwriting the sample.
+//Expects a 2D launch: x indexes the Fz samples, y indexes the wavelengths; out-of-range threads exit.
+template<typename T>
+__global__ void gpu_mirst1d_apply_filter(complex<T>* sampleFFT, T* lambda, 
+								 T dFz,
+								 T inNA, T outNA, 
+								 unsigned int lambdaR, unsigned int zR, 
+								 T sigma = 0){
+	//sampleFFT = the sample in the Fourier domain (will be overwritten)
+	//lambda = list of wavelengths
+	//dFz = delta along the Fz axis in the frequency domain
+	//inNA = NA of the internal obscuration
+	//outNA = NA of the objective
+	//zR = number of pixels along the Fz axis (same as the z-axis)
+	//lambdaR = number of wavelengths
+	//sigma = width of the Gaussian source
+	int ifz = blockIdx.x * blockDim.x + threadIdx.x;
+	int inu = blockIdx.y * blockDim.y + threadIdx.y;
+
+	if(inu >= lambdaR || ifz >= zR) return;
+
+	//index into the sample FT
+	int idx = inu * zR + ifz;
+
+	//the upper half of the array holds the negative spatial frequencies: zero them and exit
+	if(ifz >= zR / 2){
+	    sampleFFT[idx] = 0;
+	    return;
+	}
+	T fz = ifz * dFz;
+
+	//frequency in inverse microns
+	T nu = 1/lambda[inu];
+
+	//radius of the integration circle (zero above the diffraction limit)
+	T nu_sq = nu * nu;
+	T fz_sq = (fz * fz) / 4;
+	T r = 0;
+	if(fz_sq < nu_sq)
+	    r = sqrt(nu_sq - fz_sq);
+
+	//optics: pass only radii strictly between the inner and outer NA
+	T Q = (r > nu * inNA && r < nu * outNA) ? 1 : 0;
+
+	//source: Gaussian weighting (equals 1 when sigma is 0)
+	T s = exp( - (r*r * sigma*sigma) / 2 );
+
+	//final filter; the zero-frequency term is dropped to avoid dividing by zero
+	T mirst = (fz != 0) ? 2 * stimPI * r * s * Q * (1/fz) : 0;
+
+	sampleFFT[idx] *= mirst;
+
+}
+
+/*This object performs a 1-dimensional (layered) MIRST simulation
+*/
+template<typename T>
+class mirst1d{
+
+private:
+	unsigned int Z;	//z-axis resolution
+	unsigned int pad;	//pixel padding on either side of the sample
+
+	std::vector< material<T> > matlist;	//list of materials
+	std::vector< T > layers;				//list of layer thicknesses
+
+	std::vector< T > lambdas;		//list of wavelengths that are being simulated
+	unsigned int S;					//number of wavelengths (size of "lambdas")
+
+	T NA[2];						//numerical aperature (central obscuration and outer diameter)
+
+	function<T, T> source_profile;	//profile (spectrum) of the source (expressed in inverse centimeters)
+
+	complexfield<T, 1> scratch;		//scratch GPU memory used to build samples, transforms, etc.
+
+	//Runs an in-place batched 1D FFT over the scratch field: one transform of length Z + pad
+	//per wavelength. "direction" is CUFFT_FORWARD or CUFFT_INVERSE.
+	//Single precision is used when sizeof(T) == 4, double precision otherwise.
+	//Prints a message and exits if the plan cannot be created or the transform fails.
+	void fft(int direction = CUFFT_FORWARD){
+
+		const bool single = (sizeof(T) == 4);		//choose the cuFFT precision from the size of T
+		unsigned padZ = Z + pad;
+
+		//create the cuFFT plan
+		cufftHandle plan;
+		cufftResult result = cufftPlan1d(&plan, padZ, single ? CUFFT_C2C : CUFFT_Z2Z, lambdas.size());
+
+		//check for Plan 1D errors
+		if(result != CUFFT_SUCCESS){
+			std::cout<<"Error creating CUFFT plan for computing the FFT:"<<std::endl;
+			CufftError(result);
+			exit(1);
+		}
+
+		//execute the transform in place
+		if(single)
+			result = cufftExecC2C(plan, (cufftComplex*)scratch.ptr(), (cufftComplex*)scratch.ptr(), direction);
+		else
+			result = cufftExecZ2Z(plan, (cufftDoubleComplex*)scratch.ptr(), (cufftDoubleComplex*)scratch.ptr(), direction);
+
+		//check for FFT errors
+		if(result != CUFFT_SUCCESS){
+			std::cout<<"Error executing CUFFT to compute the FFT."<<std::endl;
+			CufftError(result);
+			exit(1);
+		}
+
+		cufftDestroy(plan);
+	}
+
+
+	//Initializes the scratch memory: replaces "scratch" with a new field of
+	//(Z + pad) x (number of wavelengths) samples and then assigns 0 to it.
+	void init_scratch(){
+		scratch = complexfield<T, 1>(Z + pad , lambdas.size());
+		scratch = 0;
+	}
+
+	//Returns the scattering efficiency (eta) of layer l at every simulated wavelength,
+	//in the same order as "lambdas".
+	//NOTE(review): relies on material<T>::eta(), which the material class in optics_old/material.h
+	//does not declare -- confirm which material.h this file is meant to include.
+	std::vector< complex<T> > layer_etas(unsigned int l){
+
+		std::vector< complex<T> > result;
+
+		//one eta value per wavelength
+		for(typename std::vector< T >::iterator wl = lambdas.begin(); wl != lambdas.end(); ++wl)
+			result.push_back( matlist[l].eta(*wl) );
+
+		return result;
+	}
+
+	//Computes a 2D launch configuration: square blocks whose side is the integer square root of
+	//the device's maximum threads per block, and enough blocks to cover (Z + 2 * pad) x S threads.
+	//NOTE(review): the rest of the class sizes the field as Z + pad, so the x dimension may be
+	//over-provisioned here; the kernels bounds-check their indices -- confirm which is intended.
+	void cuda_params(dim3& grids, dim3& blocks){
+		//side length of the square thread block
+		int side = (int)std::sqrt((float)stim::maxThreadsPerBlock());
+		blocks = dim3(side, side);
+
+		//number of blocks needed along each axis (rounded up)
+		unsigned int gx = ((Z + 2 * pad) + side -1)/side;
+		unsigned int gy = (S + side - 1)/side;
+		grids = dim3(gx, gy);
+	}
+
+	//Adds the Fourier transform of layer n to the scratch space and advances the
+	//per-wavelength depth array zf (device memory, S entries) past that layer.
+	//Does nothing when no wavelengths have been added.
+	void build_layer_fft(unsigned int n, T* zf){
+		if(S == 0) return;				//nothing to simulate (and a zero-sized grid is an invalid launch)
+
+		unsigned int paddedZ = Z + pad;
+
+		T wpx = layers[n] / dz();	//calculate the width of the layer in pixels
+
+		//build the refractive index and source profile for every wavelength on the host first,
+		//so that each array reaches the GPU in a single transfer instead of S small blocking copies
+		std::vector< complex<T> > hostRi(S);
+		std::vector< T > hostSrc(S);
+		for(unsigned int inu=0; inu<S; inu++){
+			hostRi[inu] = matlist[n].getN(lambdas[inu]);
+			hostSrc[inu] = source_profile(10000 / lambdas[inu]);
+		}
+
+		//allocate and fill memory for the refractive index
+		complex<T>* gpuRi;
+		HANDLE_ERROR(cudaMalloc( (void**)&gpuRi, sizeof(complex<T>) * S));
+		HANDLE_ERROR(cudaMemcpy( gpuRi, &hostRi[0], sizeof(complex<T>) * S, cudaMemcpyHostToDevice ));
+
+		//allocate and fill memory for the source profile
+		T* gpuSrc;
+		HANDLE_ERROR(cudaMalloc( (void**)&gpuSrc, sizeof(T) * S));
+		HANDLE_ERROR(cudaMemcpy( gpuSrc, &hostSrc[0], sizeof(T) * S, cudaMemcpyHostToDevice ));
+
+		//create one thread for each pixel of the field slice
+		dim3 gridDim, blockDim;
+		cuda_params(gridDim, blockDim);
+		stim::gpu_mirst1d_layer_fft<<<gridDim, blockDim>>>(scratch.ptr(), gpuRi, gpuSrc, zf, wpx, paddedZ, S);
+		HANDLE_ERROR(cudaGetLastError());		//launches do not return errors directly
+
+		int linBlock = stim::maxThreadsPerBlock(); //compute the optimal block size
+		int linGrid = S / linBlock + 1;
+		stim::gpu_mirst1d_increment_z <<<linGrid, linBlock>>>(zf, gpuRi, wpx, S);
+		HANDLE_ERROR(cudaGetLastError());
+
+		//free memory
+		HANDLE_ERROR(cudaFree(gpuRi));
+		HANDLE_ERROR(cudaFree(gpuSrc));
+	}
+
+	//renders every layer of the sample into the scratch field
+	void build_sample(){
+		init_scratch();		//initialize the GPU scratch space
+
+		//device array of S values, zero-filled, handed to every layer pass
+		const size_t zfBytes = sizeof(T) * S;
+		T* gpuZf;
+		HANDLE_ERROR(cudaMalloc(&gpuZf, zfBytes));
+		HANDLE_ERROR(cudaMemset(gpuZf, 0, zfBytes));
+
+		//process the layers in the order in which they were added
+		unsigned int layer = 0;
+		while(layer < layers.size()){
+			build_layer_fft(layer, gpuZf);
+			layer++;
+		}
+
+		HANDLE_ERROR(cudaFree(gpuZf));
+	}
+
+	//launches the MIRST filter kernel on the scratch field
+	//	the kernel receives the wavelengths, the frequency step dFz, the NA range, and the field dimensions (S, Z + pad)
+	void apply_filter(){
+		//without wavelengths there is nothing to filter (and &lambdas[0] would be invalid)
+		if(S == 0) return;
+
+		dim3 gridDim, blockDim;
+		cuda_params(gridDim, blockDim);
+
+		unsigned int Zpad = Z + pad;
+
+		T sim_range = dz() * Zpad;		//extent of the padded simulation along z
+		T dFz = 1 / sim_range;			//reciprocal of that extent
+
+		//copy the array of wavelengths to the GPU
+		//	lambdas holds S values, so exactly S elements are allocated and copied
+		//	(copying Z + pad elements reads past the end of the host array whenever Z + pad > S)
+		T* gpuLambdas;
+		HANDLE_ERROR(cudaMalloc(&gpuLambdas, sizeof(T) * S));
+		HANDLE_ERROR(cudaMemcpy(gpuLambdas, &lambdas[0], sizeof(T) * S, cudaMemcpyHostToDevice));
+		stim::gpu_mirst1d_apply_filter <<<gridDim, blockDim>>>(scratch.ptr(), gpuLambdas,
+								 dFz,
+								 NA[0], NA[1],
+								 S, Zpad);
+		HANDLE_ERROR(cudaGetLastError());		//a kernel launch does not report configuration errors by itself
+
+		//release the wavelength array (previously leaked on every call)
+		HANDLE_ERROR(cudaFree(gpuLambdas));
+	}
+
+	//crop the image to the sample thickness - keep in mind that sample thickness != optical path length
+	void crop(){
+
+		scratch = scratch.crop(Z, S);		//replace the scratch field with the result of cropping it to (Z, S)
+	}
+	
+	//NOTE(review): meant to save the scratch field as a binary file, but the body is empty - nothing is written
+	void to_binary(std::string filename){		//filename is currently unused
+
+	}
+
+
+public:
+
+	//constructor
+	//	rZ      = z-axis resolution (stored in Z)
+	//	padding = amount of padding (stored in pad)
+	//	the NA range starts at [0, 0.8], no wavelengths are defined, and source_profile is assigned 1
+	mirst1d(unsigned int rZ = 100,
+			unsigned int padding = 0){
+		//spatial sampling
+		pad = padding;
+		Z = rZ;
+
+		//spectral sampling and light source
+		S = 0;
+		source_profile = 1;
+
+		//numerical aperture range
+		NA[1] = 0.8;
+		NA[0] = 0;
+	}
+
+	//add a layer, thickness = microns
+	void add_layer(material<T> mat, T thickness){
+		matlist.push_back(mat);		//material of the new layer
+		layers.push_back(thickness);		//thickness of the new layer, stored at the same index as its material
+	}
+
+	//add a layer whose material is constructed from a file name, thickness = microns
+	void add_layer(std::string filename, T thickness){
+		material<T> mat(filename);
+		add_layer(mat, thickness);
+	}
+
+	//sets the profile spectrum of the light source by passing filename to source_profile.load()
+	void set_source(std::string filename){
+		source_profile.load(filename);
+	}
+
+	//adds a block of wavenumbers (cm^-1) to the simulation parameters
+	//	every wavenumber nu = start, start + step, ... with nu <= stop is stored as the wavelength 10000 / nu
+	//	a wavenumber of 0 is skipped because it has no finite wavelength
+	//	step = 0 adds only start (it used to loop forever)
+	void add_wavenumbers(unsigned int start, unsigned int stop, unsigned int step){
+		unsigned int nu = start;
+		while(nu <= stop){
+			if(nu != 0)
+				lambdas.push_back((T)10000 / nu);
+
+			//stop as soon as another step would pass stop
+			//	this also prevents unsigned wrap-around near the maximum value and the endless loop for step = 0
+			if(step == 0 || stop - nu < step) break;
+			nu += step;
+		}
+		S = lambdas.size();		//update the number of wavelengths (shorthand for later)
+	}
+
+	//returns the total sample thickness: the sum of the thicknesses of all layers
+	T thickness(){
+		T total = 0;
+		unsigned int l = 0;
+		while(l < layers.size()){
+			total += layers[l];
+			l++;
+		}
+		return total;
+	}
+
+	void padding(unsigned int padding = 0){		//sets the amount of padding; calling it without an argument resets it to 0
+		pad = padding;
+	}
+
+	//returns the z-axis step size: the total sample thickness divided by the z-axis resolution Z
+	T dz(){
+		T total = thickness();
+		return total / Z;
+	}
+
+	void na(T in, T out){		//sets both ends of the NA range
+		NA[0] = in;		//first value of the range
+		NA[1] = out;		//second value of the range
+	}
+
+	//sets the NA range to [0, out]
+	void na(T out){
+		NA[0] = 0;
+		NA[1] = out;
+	}
+
+	stim::function<T, T> get_source(){		//returns a copy of the source profile
+		return source_profile;
+	}
+
+	void save_sample(std::string filename){
+		//build the sample, apply an inverse FFT, and save the magnitude as an image (no filter, no crop)
+		build_sample();
+		fft(CUFFT_INVERSE);
+		scratch.toImage(filename, stim::complexfield<T, 1>::magnitude);
+	}
+
+	//applies the MIRST filter to a sample and saves the result
+	//	binary = true:  the result is handed to to_binary()
+	//	binary = false: the magnitude of the result is saved as an image
+	void save_mirst(std::string filename, bool binary = true){
+		build_sample();			//build the sample in the Fourier domain
+		apply_filter();			//apply the MIRST filter
+		fft(CUFFT_INVERSE);		//inverse FFT: bring the results back into the spatial domain
+		crop();
+
+		//save the image
+		if(!binary){
+			scratch.toImage(filename, stim::complexfield<T, 1>::magnitude);
+			return;
+		}
+		to_binary(filename);
+	}
+
+
+
+
+	//returns a human-readable summary of the simulation: resolution, wavelength range, padding,
+	//	thickness, step size, every layer with its material, and the source profile
+	std::string str(){
+
+		std::stringstream ss;
+		ss<<"1D MIRST Simulation========================="<<std::endl;
+		ss<<"z-axis resolution: "<<Z<<std::endl;
+
+		//lambdas is empty until add_wavenumbers() has been called; reading its first or last element would be undefined
+		if(lambdas.empty())
+			ss<<"simulation domain: []"<<std::endl;
+		else
+			ss<<"simulation domain: ["<<lambdas[0]<<", "<<lambdas.back()<<"]"<<std::endl;
+
+		ss<<"number of wavelengths: "<<lambdas.size()<<std::endl;
+		ss<<"padding: "<<pad<<std::endl;
+		ss<<"sample thickness: "<<thickness()<<" um"<<std::endl;
+		ss<<"dz: "<<dz()<<" um"<<std::endl;
+		ss<<std::endl;
+		ss<<layers.size()<<" layers-------------"<<std::endl;
+		for(unsigned int l=0; l<layers.size(); l++)
+			ss<<"layer "<<l<<": "<<layers[l]<<" um"<<"---------"<<std::endl<<matlist[l].str()<<std::endl;
+
+		ss<<"source profile-----------"<<std::endl;
+		ss<<get_source().str()<<std::endl;
+
+		return ss.str();
+	}
+
+
+
+};
+
+}
+#ifndef RTS_PLANEWAVE
+#define RTS_PLANEWAVE
+
+#include <string>
+#include <sstream>
+
+#include "../math/vector.h"
+#include "../math/quaternion.h"
+#include "../math/constants.h"
+#include "../math/plane.h"
+#include "../cuda/callable.h"
+
+/*Basic conversions used here (assuming a vacuum)
+	lambda = rtsTAU / |k|		(see lambda() and kmag())
+*/
+
+namespace stim{
+	namespace optics{
+
+template<typename T>
+class planewave{
+
+protected:
+
+	vec<T> k;	//k = tau / lambda
+	vec< complex<T> > E0;		//amplitude
+	//T phi;
+
+	//returns a copy of this plane wave redirected along kn
+	//	the magnitude of k is preserved and E0 is rotated by the rotation that takes the old direction to the new one
+	CUDA_CALLABLE planewave<T> bend(rts::vec<T> kn) const{
+
+		planewave<T> bent;						//the redirected plane wave
+
+		vec<T> to = kn.norm();					//unit vector along the new k
+		vec<T> from = k.norm();					//unit vector along the current k
+		T cosine = from.dot(to);
+
+		//if k . kn < 0, then the bend is a reflection: rotate from the flipped direction instead
+		if(cosine < 0) from = -from;
+
+		//degenerate cases: kn is exactly anti-parallel or parallel to k, so there is no rotation axis
+		if(cosine == -1){
+			bent.k = -k;
+			bent.E0 = E0;
+			return bent;
+		}
+		if(cosine == 1){
+			bent.k = k;
+			bent.E0 = E0;
+			return bent;
+		}
+
+		//rotation axis and angle: the cross product of two unit vectors has length sin(angle)
+		vec<T> axis = from.cross(to);
+		T angle = asin(axis.len());
+
+		//create a quaternion to capture the rotation
+		quaternion<T> rot;
+		rot.CreateRotation(angle, axis.norm());
+
+		//point k along kn and apply the rotation to E0
+		bent.k = to * kmag();
+		bent.E0 = rot.toMatrix3() * E0;
+
+		return bent;
+	}
+
+public:
+
+
+	///constructor: create a plane wave propagating along z, polarized along x
+	/*planewave(T lambda = (T)1)
+	{
+		k = rts::vec<T>(0, 0, 1) * (TAU/lambda);
+		E0 = rts::vec<T>(1, 0, 0);
+	}*/
+	///constructor: create a plane wave with wave vector kvec and amplitude E, multiplied by exp(i * phase)
+	///	only the part of E along a unit direction orthogonal to kvec is stored in E0
+	CUDA_CALLABLE planewave(vec<T> kvec = rts::vec<T>(0, 0, rtsTAU), 
+							vec< complex<T> > E = rts::vec<T>(1, 0, 0), T phase = 0)
+	{
+		k = kvec;
+		vec< complex<T> > k_hat = k.norm();
+
+		if(E.len() != 0){
+			vec< complex<T> > side = (k_hat.cross(E)).norm();		//vector orthogonal to both k and E
+			vec< complex<T> > E_hat = (side.cross(k)).norm();		//unit direction for E0, orthogonal to k
+			E0 = E_hat * E_hat.dot(E);								//projection of E onto E_hat
+		}
+		else
+			E0 = vec<T>(0);			//a plane wave with an amplitude of 0 stays 0
+
+		//apply the phase offset
+		E0 = E0 * exp( complex<T>(0, phase) );
+	}
+
+	///multiplication operator: scales E0 of THIS wave in place by rhs and returns a reference to it (no copy is made)
+    CUDA_CALLABLE planewave<T> & operator* (const T & rhs)
+	{
+		
+		E0 = E0 * rhs;
+		return *this;
+	}
+
+	//returns the wavelength: rtsTAU divided by the magnitude of k
+	CUDA_CALLABLE T lambda() const
+	{
+		return rtsTAU / kmag();
+	}
+
+	CUDA_CALLABLE T kmag() const		//returns the magnitude of the k vector
+	{
+		return k.len();
+	}
+
+	CUDA_CALLABLE vec< complex<T> > E(){		//returns a copy of the complex amplitude E0
+		return E0;
+	}
+
+	CUDA_CALLABLE vec<T> kvec(){		//returns a copy of the k vector
+		return k;
+	}
+
+	/*CUDA_CALLABLE T phase(){
+		return phi;
+	}
+
+	CUDA_CALLABLE void phase(T p){
+		phi = p;
+	}*/
+
+	//evaluates the field at the point p: each component of E0 multiplied by exp(i * (k . p))
+	//	with the default p = (0, 0, 0) the exponent is 0
+	CUDA_CALLABLE vec< complex<T> > pos(vec<T> p = vec<T>(0, 0, 0)){
+		//complex phase factor at p
+		complex<T> phasor = exp( complex<T>(0, k.dot(p)) );
+
+		//scale the three components of the amplitude
+		vec< complex<T> > field;
+		for(int c = 0; c < 3; c++)
+			field[c] = E0[c] * phasor;
+
+		return field;
+	}
+
+	//returns a new plane wave whose k is scaled by nt / ni (a transition from material ni to material nt)
+	//	the new wave is built by the constructor from the scaled k and the current E0
+	CUDA_CALLABLE planewave<T> n(T ni, T nt){
+		T ratio = nt / ni;
+		return planewave<T>(k * ratio, E0);
+	}
+
+	CUDA_CALLABLE planewave<T> refract(rts::vec<T> kn) const		//public wrapper: returns bend(kn)
+	{
+		return bend(kn);
+	}
+
+	//computes the reflected and transmitted plane waves produced when this wave meets the plane P
+	//	P  = interface plane (flipped internally when the wave arrives from its back side)
+	//	nr = refractive index ratio across the interface; the transmitted k vector is scaled by nr
+	//	r  = output: reflected plane wave
+	//	t  = output: transmitted plane wave (its amplitude is zero under total internal reflection)
+	void scatter(rts::plane<T> P, T nr, planewave<T> &r, planewave<T> &t){
+
+		int facing = P.face(k);		//determine which direction the plane wave is coming in
+
+		//a wave tangent to the plane (facing == 0) receives no special treatment
+		if(facing == -1){		//if the wave hits the back of the plane, invert the plane and nr
+			P = P.flip();			//flip the plane
+			nr = 1/nr;				//invert the refractive index (now nr = n0/n1)
+		}
+
+		//use Snell's Law to calculate the transmitted angle
+		T cos_theta_i = k.norm().dot(-P.norm());				//compute the cosine of theta_i
+
+		//rounding can push the dot product of two unit vectors slightly outside [-1, 1];
+		//	acos() would then return NaN, which corrupts every result and is mistaken for total internal reflection below
+		if(cos_theta_i > 1) cos_theta_i = 1;
+		else if(cos_theta_i < -1) cos_theta_i = -1;
+
+		T theta_i = acos(cos_theta_i);							//compute theta_i
+		T sin_theta_t = (1/nr) * sin(theta_i);					//compute the sine of theta_t using Snell's law
+		T theta_t = asin(sin_theta_t);							//compute theta_t
+
+		//asin() returns NaN when there is no transmitted angle; NaN is the only value that differs from itself
+		bool tir = false;						//flag for total internal reflection
+		if(theta_t != theta_t){
+			tir = true;
+			theta_t = rtsPI / (T)2;
+		}
+
+		//handle the degenerate case where theta_i is 0 (the plane wave hits head-on)
+		if(theta_i == 0){
+			T rp = (1 - nr) / (1 + nr);		//compute the Fresnel coefficients
+			T tp = 2 / (1 + nr);
+			vec<T> kr = -k;
+			vec<T> kt = k * nr;			//set the k vectors for theta_i = 0
+			vec< complex<T> > Er = E0 * rp;		//compute the E vectors
+			vec< complex<T> > Et = E0 * tp;
+			T phase_t = P.p().dot(k - kt);	//compute the phase offset
+			T phase_r = P.p().dot(k - kr);
+
+			//create the plane waves
+			r = planewave<T>(kr, Er, phase_r);
+			t = planewave<T>(kt, Et, phase_t);
+			return;
+		}
+
+		//compute the Fresnel coefficients
+		T rp, rs, tp, ts;
+		rp = tan(theta_t - theta_i) / tan(theta_t + theta_i);
+		rs = sin(theta_t - theta_i) / sin(theta_t + theta_i);
+
+		if(tir){
+			tp = ts = 0;			//nothing is transmitted under total internal reflection
+		}
+		else{
+			tp = ( 2 * sin(theta_t) * cos(theta_i) ) / ( sin(theta_t + theta_i) * cos(theta_t - theta_i) );
+			ts = ( 2 * sin(theta_t) * cos(theta_i) ) / sin(theta_t + theta_i);
+		}
+
+		//compute the coordinate space for the plane of incidence
+		vec<T> z_hat = -P.norm();
+		vec<T> y_hat = P.parallel(k).norm();
+		vec<T> x_hat = y_hat.cross(z_hat).norm();
+
+		//compute the k vectors for r and t
+		vec<T> kr, kt;
+		kr = ( y_hat * sin(theta_i) - z_hat * cos(theta_i) ) * kmag();
+		kt = ( y_hat * sin(theta_t) + z_hat * cos(theta_t) ) * kmag() * nr;
+
+		//compute the magnitude of the p- and s-polarized components of the incident E vector
+		complex<T> Ei_s = E0.dot(x_hat);
+		int sgn = E0.dot(y_hat).sgn();
+		vec< complex<T> > cx_hat = x_hat;
+		complex<T> Ei_p = ( E0 - cx_hat * Ei_s ).len() * sgn;
+
+		//compute the magnitude of the p- and s-polarized components of the reflected E vector
+		complex<T> Er_s = Ei_s * rs;
+		complex<T> Er_p = Ei_p * rp;
+
+		//compute the magnitude of the p- and s-polarized components of the transmitted E vector
+		complex<T> Et_s = Ei_s * ts;
+		complex<T> Et_p = Ei_p * tp;
+
+		//compute the reflected E vector
+		vec< complex<T> > Er = vec< complex<T> >(y_hat * cos(theta_i) + z_hat * sin(theta_i)) * Er_p + cx_hat * Er_s;
+		//compute the transmitted E vector
+		vec< complex<T> > Et = vec< complex<T> >(y_hat * cos(theta_t) - z_hat * sin(theta_t)) * Et_p + cx_hat * Et_s;
+
+		//compute the phase offsets at the plane
+		T phase_t = P.p().dot(k - kt);
+		T phase_r = P.p().dot(k - kr);
+
+		//create the plane waves (the members are set directly, so the constructor does not re-project E)
+		r.k = kr;
+		r.E0 = Er * exp( complex<T>(0, phase_r) );
+
+		t.k = kt;
+		t.E0 = Et * exp( complex<T>(0, phase_t) );
+	}
+
+	std::string str()		//returns a two-line text description of the plane wave
+	{
+		std::stringstream ss;
+		ss<<"Plane Wave:"<<std::endl;
+		ss<<"	"<<E0<<" e^i ( "<<k<<" . r )";		//amplitude followed by the exponent written symbolically with k
+		return ss.str();
+	}
+};					//end planewave class
+}					//end namespace optics
+}					//end namespace stim
+
+//stream insertion: writes the text returned by p.str() to os and returns os
+//NOTE(review): the class above is declared in stim::optics while this overload names rts::planewave - confirm that rts resolves to it
+template <typename T>
+std::ostream& operator<<(std::ostream& os, rts::planewave<T> p)
+{
+    return os<<p.str();
+}
+
+#endif