ERROR in plane wave refraction.

David Mayerich
1 parent a9275be5
Showing 12 changed files with 373 additions and 862 deletions Show diff stats
math/complex.h
math/function.h
math/quad.h
math/quaternion.h
math/spherical_bessel.h
math/vector.h
optics/beam.h
optics/efield.cuh
optics/planewave.h
visualization/colormap.h
visualization/scalarfield.cuh
visualization/vectorfield.cuh
@@ -161,6 +161,17 @@ struct complex
         	return *this;
     }
  
+	CUDA_CALLABLE complex<T> operator-=(const complex<T> &rhs)	//in-place subtraction of a complex value
+    {
+		*this = *this - rhs;	//delegate to the binary operator- and store the difference in this object
+        	return *this;	//returned by value, so the caller receives a copy of the updated object
+    }
+    CUDA_CALLABLE complex<T> operator-=(const T &rhs)	//in-place subtraction of a value of the component type T
+    {
+		*this = *this - rhs;	//delegate to the binary operator- taking a T on the right-hand side
+        	return *this;	//returned by value, so the caller receives a copy of the updated object
+    }
+
     CUDA_CALLABLE complex<T> operator*=(const complex<T> &rhs)
     {
 		*this = *this * rhs;
@@ -220,7 +231,7 @@ struct complex
 	{
 		complex<T> result;
  
-		result = (T)log() * y;
+		result = log() * y;
  
 		return result.exp();
 	}
@@ -267,6 +278,27 @@ struct complex
         return false;
     }
  
+	//CASTING operators
+	//	These templates convert between complex numbers with different component
+	//	types by casting the real (r) and imaginary (i) parts individually.
+	//	They are marked CUDA_CALLABLE like the other members of this struct so
+	//	that the conversions are also available wherever the rest of the
+	//	interface is callable.
+
+	//conversion operator: produce a complex<otherT> holding the cast components
+	//NOTE(review): kept non-const as in the original; making it const may make
+	//	copy-initialization ambiguous with the converting constructor below - verify before changing
+	template < typename otherT >
+	CUDA_CALLABLE operator complex<otherT>()
+	{
+		complex<otherT> result((otherT)r, (otherT)i);
+		return result;
+	}
+	//converting constructor: build this value from a complex of another component type
+	template< typename otherT >
+	CUDA_CALLABLE complex( const complex<otherT> &rhs)
+	{
+		r = (T)rhs.r;
+		i = (T)rhs.i;
+	}
+	//converting assignment: overwrite this value with the cast components of rhs
+	template< typename otherT >
+	CUDA_CALLABLE complex& operator=(const complex<otherT> &rhs)
+	{
+		r = (T)rhs.r;
+		i = (T)rhs.i;
+		return *this;
+	}
+
 };
  
 }	//end RTS namespace
@@ -319,6 +351,11 @@ CUDA_CALLABLE static rts::complex&lt;T&gt; pow(rts::complex&lt;T&gt; x, T y)
 {
 	return x.pow(y);
 }
+template<typename T>
+CUDA_CALLABLE static rts::complex<T> pow(rts::complex<T> x, int y)	//free-function form of the member pow for an int exponent
+{
+	return x.pow(y);	//forwards to the member function pow of x
+}
  
 //log function
 template<typename T>
@@ -31,33 +31,40 @@ public:
 		//insert(0, 0);
 	}
  
-	Y linear(X x)
+	Y linear(X x) const
 	{
 		if(f.size() == 0)	return (Y)0;	//return zero if the function is empty
 		//declare an iterator
-        typename std::vector< dataPoint >::iterator it;
+		typedef typename std::vector< dataPoint >::iterator f_iter;
+        f_iter it;
  
-		dataPoint s;
-		s.x = x;
+		//dataPoint s;
+		//s.x = x;
  
-        it = search(f.begin(), f.end(), &s, &s + 1, &function<X, Y>::findCeiling);
+        //it = search(f.begin(), f.end(), &s, &s + 1, &function<X, Y>::findCeiling);
+        unsigned int i;
+        for(i = 0; i<f.size(); i++)
+        {
+        	if(f[i].x > x)
+        		break;
+        }
  
         //if the wavelength is past the end of the list, return the back
-        if(it == f.end())
+        if(i == f.size())
             return f.back().y;
         //if the wavelength is before the beginning of the list, return the front
-        else if(it == f.begin())
+        else if(i == 0)
             return f.front().y;
         //otherwise interpolate
         else
         {
-            X xMax = (*it).x;
-            X xMin = (*(it - 1)).x;
+            X xMax = f[i].x;
+            X xMin = f[i - 1].x;
             //std::cout<<lMin<<"----------"<<lMax<<std::endl;
  
             X a = (x - xMin) / (xMax - xMin);
-            Y riMin = (*(it - 1)).y;
-            Y riMax = (*it).y;
+            Y riMin = f[i - 1].y;
+            Y riMax = f[i].y;
             Y interp;
             interp = riMax * a + riMin * (1.0 - a);
             return interp;
@@ -92,35 +99,35 @@ public:
  
 	}
  
-	X getX(unsigned int i)
+	X getX(unsigned int i) const	//return the x value of the i-th stored data point
 	{
 		return f[i].x;	//f is indexed directly; i is not range-checked here
 	}
  
-	Y getY(unsigned int i)
+	Y getY(unsigned int i) const	//return the y value of the i-th stored data point
 	{
 		return f[i].y;	//f is indexed directly; i is not range-checked here
 	}
  
 	///get the number of data points in the function
-	unsigned int getN()
+	unsigned int getN() const
 	{
 		return f.size();	//the size of f is converted to unsigned int on return
 	}
  
 	//look up an indexed component: returns a copy of the i-th data point (i is not range-checked)
-	dataPoint operator[](int i)
+	dataPoint operator[](int i) const
 	{
 		return f[i];
 	}
  
 	///linear interpolation: evaluate the function at x by forwarding to linear()
-	Y operator()(X x)
+	Y operator()(X x) const
 	{
 		return linear(x);
 	}
  
-	function<X, Y> operator+(Y r)
+	function<X, Y> operator+(Y r) const
 	{
 		function<X, Y> result;
  
@@ -134,13 +141,13 @@ public:
 		return result;
 	}
  
-	function<X, Y> & operator= (const Y & rhs)
-	{		
-		f.clear();
-		if(rhs != 0)			//if the RHS is zero, just clear, otherwise add one value of RHS
-			insert(0, rhs);
-
-		return *this;
+	function<X, Y> & operator= (const Y & rhs)	//assign a single value, discarding all existing data points
+	{		
+		f.clear();	//remove every stored data point
+		if(rhs != 0)			//if the RHS is zero, just clear, otherwise add one value of RHS
+			insert(0, rhs);	//presumably stores the point (x = 0, y = rhs) - confirm against insert()
+
+		return *this;	//allow chained assignment
 	}
  
  
@@ -7,6 +7,7 @@
 #include "../math/triangle.h"
 #include "../math/quaternion.h"
 #include <iostream>
+#include <iomanip>
 #include <algorithm>
  
 namespace rts{
@@ -48,37 +49,6 @@ struct quad
  
 	}
  
-    /****************************************************************
-    Constructor - create a quad from two points and a normal
-    ****************************************************************/
-	/*CUDA_CALLABLE quad(rts::vec<T, N> pMin, rts::vec<T, N> pMax, rts::vec<T, N> normal)
-	{
-
-        //assign the corner point
-        A = pMin;
-
-        //compute the vector from pMin to pMax
-        rts::vec<T, 3> v0;
-        v0 = pMax - pMin;
-
-        //compute the cross product of A and the plane normal
-        rts::vec<T, 3> v1;
-        v1 = v0.cross(normal);
-
-
-        //calculate point B
-        rts::vec<T, 3> B;
-        B = A + v0 * 0.5f + v1 * 0.5f;
-
-        //calculate rtsPoint C
-        rts::vec<T, 3> C;
-        C = A  + v0 * 0.5f - v1 * 0.5f;
-
-        //calculate X and Y
-        X = B - A;
-        Y = C - A;
-	}*/
-
 	/*******************************************************************
 	Constructor - create a quad from a position, normal, and rotation
 	*******************************************************************/
@@ -114,6 +84,15 @@ struct quad
         std::cout<<X<<std::endl;
 	}
  
+	//boolean comparison: two quads are equal when their corner point (A) and
+	//	both edge vectors (X and Y) compare equal
+	//	Declared const so that const quads (and const references to quads) can be compared.
+	bool operator==(const quad<T, N> & rhs) const
+	{
+		return A == rhs.A && X == rhs.X && Y == rhs.Y;
+	}
+
 	/*******************************************
 	Return the normal for the quad
 	*******************************************/
@@ -140,10 +119,9 @@ struct quad
 	{
 		std::stringstream ss;
  
-		ss<<"A = "<<A<<std::endl;
-		ss<<"B = "<<A + Y<<std::endl;
-		ss<<"C = "<<A + Y + X<<std::endl;
-		ss<<"D = "<<A + X<<std::endl;
+		ss<<std::left<<"B="<<setfill('-')<<setw(20)<<A + Y<<">"<<"C="<<A + Y + X<<std::endl;
+		ss<<setfill(' ')<<setw(23)<<"|"<<"|"<<std::endl<<setw(23)<<"|"<<"|"<<std::endl;
+		ss<<std::left<<"A="<<setfill('-')<<setw(20)<<A<<">"<<"D="<<A + X;
  
         return ss.str();
  
@@ -38,19 +38,24 @@ void quaternion&lt;T&gt;::normalize()
 }
  
 template<typename T>
-void quaternion<T>::CreateRotation(T theta, T axis_x, T axis_y, T axis_z)
+void quaternion<T>::CreateRotation(T theta, T ux, T uy, T uz)	//set this quaternion to a rotation of angle theta about the axis (ux, uy, uz)
 {
-	//assign the given Euler rotation to this quaternion
-	w = (T)cos(theta/2);
-	x = axis_x*(T)sin(theta/2);
-	y = axis_y*(T)sin(theta/2);
-	z = axis_z*(T)sin(theta/2);
+	vec<T, 3> u(ux, uy, uz);	//pack the axis components into a vector
+
+	CreateRotation(theta, u);	//delegate to the vector overload, which normalizes the axis
+	
 }
  
 template<typename T>
-void quaternion<T>::CreateRotation(T theta, vec<T, 3> axis)
+void quaternion<T>::CreateRotation(T theta, vec<T, 3> u)	//set this quaternion to a rotation of angle theta about the axis u
 {
-	CreateRotation(theta, axis[0], axis[1], axis[2]);
+	vec<T, 3> u_hat = u.norm();	//normalized axis; the result for a zero-length u depends on vec::norm() (not visible here)
+
+	//assign the axis-angle rotation (theta about u_hat) to this quaternion
+	w = (T)cos(theta/2);
+	x = u_hat[0]*(T)sin(theta/2);
+	y = u_hat[1]*(T)sin(theta/2);
+	z = u_hat[2]*(T)sin(theta/2);
 }
  
 template<typename T>
-#ifndef RTS_SBESSEL_H
-#define RTS_SBESSEL_H
-#include <math.h>
-
-
-namespace rts{
-
-#define RTS_BESSEL_CONVERGENCE_MIN		0.0145f
-#define RTS_BESSEL_CONVERGENCE_MAX		0.4f
-#define RTS_BESSEL_MAXIMUM_FLOAT		-1e33f
-
-template <typename T>
-CUDA_CALLABLE void sbesselj(int n, complex<T> x, complex<T>* j)
-{
-    //compute the first bessel function
-    if(n >= 0)
-        j[0] = (T)sin(x) / x;
-
-    //compute the second bessel function
-    if(n >= 1)
-        j[1] = j[0] / x - (T)cos(x) / x;
-
-    //use the recurrence relation to compute the rest
-    for(int i = 2; i <= n; i++)
-    {
-        j[i] = ( (2 * i - 1) / x ) * j[i-1] - j[i-2];
-    }
-
-    //if x = 0, deal with the degenerate case
-    /*if( isnan(j[0].r) )
-    {
-        j[0] = (T)1.0;
-        for(int i = 1; i<=n; i++)
-            j[i] = (T)0.0;
-    }*/
-}
-
-template <typename T>
-CUDA_CALLABLE void sbessely(int n, complex<T> x, complex<T>* y)
-{
-    //compute the first bessel function
-    if(n >= 0)
-        y[0] = -(T)cos(x) / x;
-
-    //compute the second bessel function
-    if(n >= 1)
-        y[1] = y[0] / x - (T)sin(x) / x;
-
-    //use the recurrence relation to compute the rest
-    for(int i = 2; i <= n; i++)
-    {
-        y[i] = ( (2 * i - 1) / x ) * y[i-1] - y[i-2];
-    }
-
-}
-
-//spherical Hankel functions of the first kind
-template <typename T>
-CUDA_CALLABLE void sbesselh1(int n, complex<T> x, complex<T>* h)
-{
-    //compute j_0 and j_1
-    complex<T> j[2];
-    sbesselj(1, x, j);
-
-    //compute y_0 and y_1
-    complex<T> y[2];
-    sbessely(1, x, y);
-
-    //compute the first-order Hhankel function
-    if(n >= 0)
-        h[0] = j[0] + y[0].imul();
-
-    //compute the second bessel function
-    if(n >= 1)
-        h[1] = j[1] + y[1].imul();
-
-    //use the recurrence relation to compute the rest
-    for(int i = 2; i <= n; i++)
-    {
-        h[i] = ( (2 * i - 1) / x ) * h[i-1] - h[i-2];
-    }
-}
-
-template <typename T>
-CUDA_CALLABLE void init_sbesselj(T x, T* j)
-{
-	//compute the first 2 bessel functions
-	j[0] = (T)sin(x) / x;
-
-	j[1] = j[0] / x - (T)cos(x) / x;
-}
-
-template <typename T>
-CUDA_CALLABLE void init_sbessely(T x, T* y)
-{
-	//compute the first 2 bessel functions
-	y[0] = -(T)cos(x) / x;
-
-	y[1] = y[0] / x - (T)sin(x) / x;
-}
-
-template <typename T>
-CUDA_CALLABLE void shift_sbesselj(int n, T x, T* b)//, T stability = 1.4)
-{
-
-	T bnew;
-
-	//compute the next (order n) Bessel function
-	bnew = ((2 * n - 1) * b[1])/x - b[0];
-
-	//if(n > stability*x)
-	if(n > real(x))
-		if(real(bnew) < RTS_BESSEL_CONVERGENCE_MIN || real(bnew) > RTS_BESSEL_CONVERGENCE_MAX)
-			bnew = 0;
-
-	//shift and add the new value to the array
-	b[0] = b[1];
-	b[1] = bnew;
-}
-
-template <typename T>
-CUDA_CALLABLE void shift_sbessely(int n, T x, T* b)//, T stability = 1.4)
-{
-
-	T bnew;
-
-	//compute the next (order n) Bessel function
-	bnew = ((2 * n - 1) * b[1])/x - b[0];
-
-	if(bnew < RTS_BESSEL_MAXIMUM_FLOAT ||
-	   (n > x && bnew > 0))
-	{
-		bnew = 0;
-		b[1] = 0;
-	}
-
-
-	//shift and add the new value to the array
-	b[0] = b[1];
-	b[1] = bnew;
-}
-
-
-
-}   //end namespace rts
-
-
-
-#endif
@@ -54,7 +54,7 @@ struct vec
 		memcpy(v, data, sizeof(T) * N);
 	}
  
-	CUDA_CALLABLE T len()
+	CUDA_CALLABLE T len() const
 	{
         //compute and return the vector length
         T sum_sq = (T)0;
@@ -66,7 +66,7 @@ struct vec
  
 	}
  
-	CUDA_CALLABLE vec<T, N> cart2sph()
+	CUDA_CALLABLE vec<T, N> cart2sph() const
 	{
 		//convert the vector from cartesian to spherical coordinates
 		//x, y, z -> r, theta, phi (where theta = 0 to 2*pi)
@@ -79,7 +79,7 @@ struct vec
 		return sph;
 	}
  
-	CUDA_CALLABLE vec<T, N> sph2cart()
+	CUDA_CALLABLE vec<T, N> sph2cart() const
 	{
 		//convert the vector from cartesian to spherical coordinates
 		//r, theta, phi -> x, y, z (where theta = 0 to 2*pi)
@@ -92,7 +92,7 @@ struct vec
 		return cart;
 	}
  
-	CUDA_CALLABLE vec<T, N> norm()
+	CUDA_CALLABLE vec<T, N> norm() const
 	{
         //compute and return the vector norm
         vec<T, N> result;
@@ -109,19 +109,19 @@ struct vec
         return result;
 	}
  
-	CUDA_CALLABLE vec<T, 3> cross(vec<T, 3> rhs)
+	CUDA_CALLABLE vec<T, 3> cross(const vec<T, 3> rhs) const	//return the cross product (this x rhs)
 	{
 		vec<T, 3> result;
  
 		//compute the cross product (only valid for 3D vectors)
-		result[0] = v[1] * rhs[2] - v[2] * rhs[1];
-		result[1] = v[2] * rhs[0] - v[0] * rhs[2];
-		result[2] = v[0] * rhs[1] - v[1] * rhs[0];
+		result[0] = v[1] * rhs.v[2] - v[2] * rhs.v[1];	//rhs components are read through rhs.v rather than through operator[]
+		result[1] = v[2] * rhs.v[0] - v[0] * rhs.v[2];
+		result[2] = v[0] * rhs.v[1] - v[1] * rhs.v[0];
  
 		return result;
 	}
  
-    CUDA_CALLABLE T dot(vec<T, N> rhs)
+    CUDA_CALLABLE T dot(vec<T, N> rhs) const
     {
         T result = (T)0;
  
@@ -133,7 +133,7 @@ struct vec
     }
  
 	//arithmetic
-	CUDA_CALLABLE vec<T, N> operator+(vec<T, N> rhs)
+	CUDA_CALLABLE vec<T, N> operator+(vec<T, N> rhs) const
 	{
         vec<T, N> result;
  
@@ -142,7 +142,7 @@ struct vec
  
         return result;
 	}
-	CUDA_CALLABLE vec<T, N> operator-(vec<T, N> rhs)
+	CUDA_CALLABLE vec<T, N> operator-(vec<T, N> rhs) const
 	{
         vec<T, N> result;
  
@@ -151,7 +151,7 @@ struct vec
  
         return result;
 	}
-	CUDA_CALLABLE vec<T, N> operator*(T rhs)
+	CUDA_CALLABLE vec<T, N> operator*(T rhs) const
 	{
         vec<T, N> result;
  
@@ -160,7 +160,7 @@ struct vec
  
         return result;
 	}
-	CUDA_CALLABLE vec<T, N> operator/(T rhs)
+	CUDA_CALLABLE vec<T, N> operator/(T rhs) const
 	{
         vec<T, N> result;
  
@@ -179,7 +179,7 @@ struct vec
 		return *this;
 	}*/
  
-	CUDA_CALLABLE bool operator==(vec<T, N> rhs)
+	CUDA_CALLABLE bool operator==(vec<T, N> rhs) const
 	{
         if ( (rhs.v[0] == v[0]) && (rhs.v[1] == v[1]) && (rhs.v[2] == v[2]) )
             return true;
@@ -187,7 +187,7 @@ struct vec
         return false;
 	}
  
-	std::string toStr()
+	std::string toStr() const
 	{
 		std::stringstream ss;
  
@@ -203,8 +203,8 @@ struct vec
 		return ss.str();
 	}
  
-	//bracket operator
-	CUDA_CALLABLE T& operator[](int i)
+	//bracket operator - allows assignment to the vector
+	CUDA_CALLABLE T& operator[](const unsigned int i)	//returns a mutable reference to component i (i is not range-checked)
 	{
         return v[i];
     }
@@ -3,12 +3,13 @@
  
 #include "../math/vector.h"
 #include "../math/function.h"
+#include "../optics/planewave.h"
 #include <vector>
  
 namespace rts{
  
 template<typename P>
-class beam
+class beam : public planewave<P>
 {
 public:
 	enum beam_type {Uniform, Bartlett, Hamming, Hanning};
@@ -17,10 +18,6 @@ private:
  
 	P na[2];		//numerical aperature of the focusing optics	
 	vec<P> f;		//focal point	
-	vec<P> k;		//direction vector	
-	vec<P> E0;		//polarization direction
-	P omega;		//frequency
-
 	function<P, P> apod;	//apodization function
 	unsigned int apod_res;	//resolution of complex apodization filters
  
@@ -71,17 +68,17 @@ private:
 public:
  
 	///constructor: build a beam from a wave vector _k, a polarization _E0 and an apodization type (defaults: na = [0, 1], focal point at the origin)
-	beam(beam_type _apod = Uniform)
+	beam(
+		vec<P> _k = rts::vec<P>(0, 0, TAU), 	//wave vector handed to the planewave base class
+		vec<P> _E0 = rts::vec<P>(1, 0, 0), 	//polarization handed to the planewave base class
+		beam_type _apod = Uniform)	//apodization function type
+		: planewave<P>(_k, _E0)	//k and E0 are now stored by the planewave base class
 	{
 		na[0] = (P)0.0;
 		na[1] = (P)1.0;
-		f = vec<P>( (P)0.0, (P)0.0, (P)0.0 );
-		k = vec<P>( (P)0.0, (P)0.0, (P)1.0 );
-		E0 = vec<P>( (P)1.0, (P)0.0, (P)0.0 );
-		omega = (P)2 * (P)3.14159;
+		f = vec<P>( (P)0, (P)0, (P)0 );	//place the focal point at the origin
 		apod_res = 256;						//set the default resolution for apodization filters
 		set_apod(_apod);						//set the apodization function type
-		
 	}
  
 	///Numerical Aperature functions
@@ -96,28 +93,32 @@ public:
 		na[1] = _na1;
 	}
  
+	/*string str() : 
+	{
+		stringstream ss;
+		ss<<"Beam Center: "<<k<<std::endl;
+
+		return ss.str();
+	}*/
  
 	//Monte-Carlo decomposition into plane waves
-	std::vector< planewave<P> > mc(unsigned int N, unsigned int seed = 0)
+	std::vector< planewave<P> > mc(unsigned int N = 100000, unsigned int seed = 0) const
 	{
 		/*Create Monte-Carlo samples of a cassegrain objective by performing uniform sampling
 			of a sphere and projecting these samples onto an inscribed sphere.
  
-			samples = rtsPointer to sample vectors specified as normalized cartesian coordinates
-			N       = number of samples
-			kSph	= incident light direction in spherical coordinates
-			NAin    = internal obscuration NA
-			NAout   = outer cassegrain NA
+			seed	=	seed for the random number generator
 		*/
-
 		srand(seed);		//seed the random number generator
  
+		vec<P> k_hat = beam::k.norm();
+
 		///compute the rotation operator to transform (0, 0, 1) to k
-		P cos_angle = k.dot(rts::vec<P>(0, 0, 1));
+		P cos_angle = k_hat.dot(rts::vec<P>(0, 0, 1));
 		rts::matrix<P, 3> rotation;
 		if(cos_angle != 1.0)
 		{
-			rts::vec<P> r_axis = rts::vec<P>(0, 0, 1).cross(k).norm();	//compute the axis of rotation
+			rts::vec<P> r_axis = rts::vec<P>(0, 0, 1).cross(k_hat).norm();	//compute the axis of rotation
 			P angle = acos(cos_angle);							//compute the angle of rotation
 			rts::quaternion<P> quat;							//create a quaternion describing the rotation
 			quat.CreateRotation(angle, r_axis);
@@ -137,8 +138,6 @@ public:
  
 		std::vector< planewave<P> > samples;	//create a vector of plane waves
  
-		planewave<P> beam_center(k, E0, omega);	//create a plane wave representing the beam center
-
 		//draw a distribution of random phi, z values
 		P z, phi, theta;
 		for(int i=0; i<N; i++)								//for each sample
@@ -153,7 +152,7 @@ public:
 			vec<P> k_prime = rotation * cart;				//create a sample vector
  
 			//store a wave refracted along the given direction
-			samples.push_back(beam_center.refract(k_prime) * apod(phi/PHI[1]));
+			samples.push_back(beam::refract(k_prime) * apod(phi/PHI[1]));
 		}
  
 		return samples;
+#ifndef	RTS_EFIELD
+#define RTS_EFIELD
+
 #include "../math/complex.h"
+#include "../math/realfield.cuh"
 #include "../visualization/colormap.h"
-#include "../visualization/scalarfield.cuh"
-#include "../visualization/vectorfield.cuh"
 #include "../optics/planewave.h"
 #include "../cuda/devices.h"
+#include "../optics/beam.h"
  
  
  
@@ -26,7 +29,7 @@ __global__ void gpu_planewave2efield(complex&lt;T&gt;* X, complex&lt;T&gt;* Y, complex&lt;T&gt;* Z
 	vec<T> p = q( (T)iu/(T)r0, (T)iv/(T)r1 );
 	vec<T> r(p[0], p[1], p[2]);
  
-	complex<T> x( 0.0f, w.omega * (w.k_hat.dot(r)) );
+	complex<T> x( 0.0f, w.k.dot(r) );
  
     if(Y == NULL)                       //if this is a scalar simulation
         X[i] += w.E0.len() * exp(x);    //use the vector magnitude as the plane wave amplitude
@@ -77,7 +80,24 @@ __global__ void gpu_efield_polarization(complex&lt;T&gt;* X, complex&lt;T&gt;* Y, complex&lt;T&gt;
     Px[i] = X[i].abs();
     Py[i] = Y[i].abs();
     Pz[i] = Z[i].abs();
+}
+
+/*	Element-wise accumulation kernel: adds every sample of src to the matching sample of dest
+	(dest[i] += src[i]) for a field of r0 x r1 samples.
+	Each thread handles the sample addressed by its global 2D thread index; threads whose
+	index falls outside the r0 x r1 extent do nothing.
+*/
+template<typename T>
+__global__ void gpu_efield_sum(complex<T>* dest, complex<T>* src, unsigned int r0, unsigned int r1)
+{
+	//global 2D position of this thread
+	int col = blockIdx.x * blockDim.x + threadIdx.x;
+	int row = blockIdx.y * blockDim.y + threadIdx.y;
  
+	//only threads that map onto a field sample do any work
+	if(col < r0 && row < r1)
+	{
+		int idx = row*r0 + col;		//flat offset of the sample (col varies fastest)
+		dest[idx] += src[idx];		//accumulate the source sample into the destination
+	}
 }
  
 /*  This class implements a discrete representation of an electromagnetic field
@@ -86,9 +106,9 @@ __global__ void gpu_efield_polarization(complex&lt;T&gt;* X, complex&lt;T&gt;* Y, complex&lt;T&gt;
 template<typename P>
 class efield
 {
-private:
+protected:
  
-    bool scalar;
+    bool vector;
  
     //gpu pointer to the field data
     rts::complex<P>* X;
@@ -107,27 +127,14 @@ private:
         //create one thread for each detector pixel
         dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
         dim3 dimGrid((R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
+        
  
-        gpu_planewave2efield<float> <<<dimGrid, dimBlock>>> (X, Y, Z, R[0], R[1], p, pos);
+        gpu_planewave2efield<P> <<<dimGrid, dimBlock>>> (X, Y, Z, R[0], R[1], p, pos);
     }
  
-	void clear()
-	{
-		cudaMemset(X, 0, sizeof(rts::complex<P>) * R[0] * R[1]);
-
-		if(!scalar)
-        {
-			cudaMemset(Y, 0, sizeof(rts::complex<P>) * R[0] * R[1]);
-			cudaMemset(Z, 0, sizeof(rts::complex<P>) * R[0] * R[1]);
-		}
-	}
-
-
-public:
-
-    efield(unsigned int res0, unsigned int res1, bool _scalar = false)
+    void init(unsigned int res0, unsigned int res1, bool _vector)
     {
-        scalar = _scalar;           //initialize field type
+    	vector = _vector;           //initialize field type
  
         X = Y = Z = NULL;           //initialize all pointers to NULL
         R[0] = res0;
@@ -137,9 +144,9 @@ public:
         cudaMalloc(&X, sizeof(rts::complex<P>) * R[0] * R[1]);
 		cudaMemset(X, 0, sizeof(rts::complex<P>) * R[0] * R[1]);
  
-        if(!scalar)
+        if(vector)
         {
-            std::cout<<"scalar:";
+            std::cout<<"vector:";
             cudaMalloc(&Y, sizeof(rts::complex<P>) * R[0] * R[1]);
 			cudaMemset(Y, 0, sizeof(rts::complex<P>) * R[0] * R[1]);
  
@@ -148,12 +155,66 @@ public:
         }
     }
  
+    //release the GPU buffers owned by this field
+    //	After the call X, Y and Z are NULL, so destroy() may safely be called again
+    //	(for example by the destructor) and stale pointers can never be freed twice.
+    void destroy()
+    {
+		if(X != NULL) cudaFree(X);
+		if(Y != NULL) cudaFree(Y);
+		if(Z != NULL) cudaFree(Z);
+
+		//forget the released buffers
+		X = Y = Z = NULL;
+    }
+
+    //copy the description of src (field type, resolution and position in space)
+    //	without touching any GPU buffer
+    void shallowcpy(const rts::efield<P> & src)
+    {
+    	vector = src.vector;
+    	R[0] = src.R[0];
+    	R[1] = src.R[1];
+
+    	//the position is part of the field description: operator+= compares it, so a
+    	//	copy that kept its old position could not be summed with its own source
+    	pos = src.pos;
+    }
+
+    //turn this field into an independent copy of src (description and GPU data)
+    //	Precondition: the buffers previously owned by this field have already been
+    //	released (see destroy()); the X, Y and Z pointers are overwritten here.
+    void deepcpy(const rts::efield<P> & src)
+    {
+    	//perform a shallow copy
+    	shallowcpy(src);
+
+    	//components that src does not store must end up NULL rather than keep a stale pointer
+    	X = Y = Z = NULL;
+
+    	//size in bytes of one field component
+    	size_t bytes = sizeof(rts::complex<P>) * R[0] * R[1];
+
+    	//allocate memory on the gpu and copy each component that src stores
+    	if(src.X != NULL)
+    	{
+    		cudaMalloc(&X, bytes);
+    		cudaMemcpy(X, src.X, bytes, cudaMemcpyDeviceToDevice);
+    	}
+    	if(src.Y != NULL)
+    	{
+    		cudaMalloc(&Y, bytes);
+    		cudaMemcpy(Y, src.Y, bytes, cudaMemcpyDeviceToDevice);
+    	}
+    	if(src.Z != NULL)
+    	{
+    		cudaMalloc(&Z, bytes);
+    		cudaMemcpy(Z, src.Z, bytes, cudaMemcpyDeviceToDevice);
+    	}
+    }
+
+public:
+    efield(unsigned int res0, unsigned int res1, bool _vector = true)	//create a res0 x res1 field; _vector selects whether Y and Z are stored in addition to X
+    {
+        init(res0, res1, _vector);	//store the field type and resolution and allocate the GPU buffers
+        pos = rts::quad<P>(rts::vec<P>(-10, 0, -10), rts::vec<P>(-10, 0, 10), rts::vec<P>(10, 0, 10));	//default position of the field in space
+    }
+
     //destructor: releases the GPU buffers through destroy()
     ~efield()
     {
-        if(X != NULL) cudaFree(X);
-        if(Y != NULL) cudaFree(Y);
-        if(Z != NULL) cudaFree(Z);
+    	destroy();	//frees X, Y and Z when they are not NULL
+    }
+
+    ///Reset every sample of the field to zero
+    void clear()
+    {
+        //size in bytes of one field component
+        const size_t bytes = sizeof(rts::complex<P>) * R[0] * R[1];
+
+        cudaMemset(X, 0, bytes);		//X is cleared for every field type
+        if(!vector) return;				//nothing else to clear unless this is a vector field
+
+        cudaMemset(Y, 0, bytes);
+        cudaMemset(Z, 0, bytes);
     }
  
     void position(quad<P> _p)
@@ -172,56 +233,88 @@ public:
         return ss.str();
     }
  
+    //assignment operator: assignment from another electric field
+    //	Replaces this field with a deep copy of rhs (description and GPU data).
+    //	Assigning a field to itself is a no-op: without the guard the buffers would be
+    //	freed by destroy() and then used as the source of the copy.
+    efield<P> & operator= (const efield<P> & rhs)
+    {
+    	if(this == &rhs)
+    		return *this;		//self-assignment: nothing to do
+
+    	destroy();				//destroy any previous information about this field
+    	deepcpy(rhs);			//create a deep copy
+    	return *this;			//return the current object
+    }
+
     //assignment operator: build an electric field from a plane wave (the previous contents are discarded)
     efield<P> & operator= (const planewave<P> & rhs)
 	{
  
 		clear();				//clear any previous field data
 		from_planewave(rhs);	//create a field from the planewave
-		return *this;
+		return *this;			//return the current object
 	}
  
-	//assignment operator: add the electric field from a plane wave
-    efield<P> & operator+= (const planewave<P> & rhs)
+	//assignment operator: add an existing electric field
+	//	Adds rhs to this field sample by sample. Both fields must have the same resolution
+	//	and the same position (pos); otherwise an error is printed and the program exits
+	//	with status 1. The Y and Z components are summed only when both fields store them,
+	//	so a NULL pointer is never handed to the kernel.
+	efield<P> & operator+= (const efield<P> & rhs)
 	{
-		//create a field from the planewave
-		from_planewave(rhs);
+		//if this field and the source field represent the same regions in space
+		if(R[0] == rhs.R[0] && R[1] == rhs.R[1] && pos == rhs.pos)
+		{
+			int maxThreads = rts::maxThreadsPerBlock(); //compute the optimal block size
+			int SQRT_BLOCK = (int)std::sqrt((float)maxThreads);
+
+			//create one thread for each detector pixel
+			dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
+			dim3 dimGrid((R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
+
+			//sum the fields
+			gpu_efield_sum <<<dimGrid, dimBlock>>> (X, rhs.X, R[0], R[1]);
+
+			//the remaining components exist only in vector fields: require them on both sides
+			if(Y != NULL && Z != NULL && rhs.Y != NULL && rhs.Z != NULL)
+			{
+				gpu_efield_sum <<<dimGrid, dimBlock>>> (Y, rhs.Y, R[0], R[1]);
+				gpu_efield_sum <<<dimGrid, dimBlock>>> (Z, rhs.Z, R[0], R[1]);
+			}
+		}
+		else
+		{
+			std::cout<<"ERROR in efield: The two summed fields do not represent the same geometry."<<std::endl;
+			exit(1);
+		}
  
-		return *this;
+		return *this;		//return this object
 	}
  
-	//assignment operator: build an electric field from a list of plane waves
-	efield<P> & operator= (const std::vector< planewave<P> > & rhs)
-	{
-		clear();				//clear any previous field data
-		for(unsigned int i = 0; i < rhs.size(); i++)
-			from_planewave(rhs[i]);
-		return *this;
-	}
+    //assignment operator: build an electric field from a beam
+    efield<P> & operator= (const rts::beam<P> & rhs)
+    {
+        //decompose the beam into plane waves (Monte-Carlo samples, default arguments of mc())
+        const std::vector< rts::planewave<P> > waves = rhs.mc();
+
+        //discard the previous field data, then accumulate every sampled plane wave
+        clear();
+        for(unsigned int w = 0; w < waves.size(); w++)
+        {
+            from_planewave(waves[w]);
+        }
+
+        return *this;
+    }
+
  
 	//return a scalar field representing field magnitude
-    scalarfield<P> mag()
+    //	The result has the same resolution as this field. The kernel is instantiated for the
+    //	field precision P (not a fixed float) so that it matches the X, Y, Z buffers.
+    realfield<P, 1, true> mag()
     {
-        int maxThreads = rts::maxThreadsPerBlock(); //compute the optimal block size
-        int SQRT_BLOCK = (int)std::sqrt((float)maxThreads);
+		int maxThreads = rts::maxThreadsPerBlock(); //compute the optimal block size
+		int SQRT_BLOCK = (int)std::sqrt((float)maxThreads);
  
 		//create a scalar field to store the result
-		scalarfield<P> M(R[0], R[1]);
+		realfield<P, 1, true> M(R[0], R[1]);
  
-        //create one thread for each detector pixel
-        dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
-        dim3 dimGrid((R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
+		//create one thread for each detector pixel
+		dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
+		dim3 dimGrid((R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
  
-        //compute the magnitude and store it in a scalar field
-		gpu_efield_magnitude<float> <<<dimGrid, dimBlock>>> (X, Y, Z, R[0], R[1], M.S);
+		//compute the magnitude and store it in a scalar field
+		gpu_efield_magnitude<P> <<<dimGrid, dimBlock>>> (X, Y, Z, R[0], R[1], M.ptr(0));
  
 		return M;
     }
  
     //return a vector field representing field polarization
-    vectorfield<P> polarization()
+    realfield<P, 3, true> polarization()
     {
-        if(scalar)
+        if(!vector)
         {
             std::cout<<"ERROR: Cannot compute polarization of a scalar field."<<std::endl;
             exit(1);
@@ -234,10 +327,10 @@ public:
         dim3 dimGrid((R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
  
  
-        vectorfield<P> Pol(R[0], R[1]);     //create a vector field to store the result
+        realfield<P, 3, true> Pol(R[0], R[1]);     //create a vector field to store the result
  
         //compute the polarization and store it in the vector field
-        gpu_efield_polarization<float> <<<dimGrid, dimBlock>>> (X, Y, Z, R[0], R[1], Pol.S[0], Pol.S[1], Pol.S[2]);
+        gpu_efield_polarization<float> <<<dimGrid, dimBlock>>> (X, Y, Z, R[0], R[1], Pol.ptr(0), Pol.ptr(1), Pol.ptr(2));
  
         return Pol;                         //return the vector field
     }
@@ -248,4 +341,6 @@ public:
  
  
  
-}   //end namespace rts
 \ No newline at end of file
+}   //end namespace rts
+
+#endif
 \ No newline at end of file
@@ -4,8 +4,13 @@
 #include <string>
 #include <sstream>
  
-#include "rts/math/vector.h"
-#include "rts/math/quaternion.h"
+#include "../math/vector.h"
+#include "../math/quaternion.h"
+#include "../math/constants.h"
+
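+/*Basic conversions used here (assuming a vacuum)
+	lambda = TAU / |k|		(wavelength, see planewave::lambda())
+	|k|    = TAU / lambda	(magnitude of the wave vector, see planewave::kmag())
+*/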
  
 namespace rts{
  
@@ -13,37 +18,28 @@ template&lt;typename P&gt;
 class planewave{
  
 public:
-	rts::vec<P> k_hat;	//normalized planewave direction
-	P omega;				//frequency
-
-	rts::vec<P> E0;		//amplitude
+	vec<P> k;	//k = tau / lambda
+	vec<P> E0;		//amplitude
  
  
-
-	planewave()
-	{
-		omega = (P)2 * (P)3.14159;
-		k_hat = rts::vec<P>(0, 0, 1);
-		E0 = rts::vec<P>(1, 0, 0);
-	}
 	///constructor: create a plane wave propagating along z, polarized along x
-	planewave(P _omega)
+	/*planewave(P lambda = (P)1)
 	{
-		omega = _omega;
-		k_hat = rts::vec<P>(0, 0, 1);
+		k = rts::vec<P>(0, 0, 1) * (TAU/lambda);
 		E0 = rts::vec<P>(1, 0, 0);
-	}
+	}*/
 	///constructor: create a plane wave with wave vector _k (direction and magnitude), polarized along _E0
-	planewave(vec<P> _k, vec<P> _E0, P _omega)
+	///	The stored amplitude E0 is the projection of _E0 onto the direction perpendicular to k
+	///	within the k-_E0 plane, so E0 is orthogonal to k.
+	///	NOTE(review): when _E0 is parallel to _k (or zero) the side vector s has zero length;
+	///	the result then depends on how vec::norm() treats a zero vector - confirm.
+	planewave(vec<P> _k = rts::vec<P>(0, 0, TAU), vec<P> _E0 = rts::vec<P>(1, 0, 0))
 	{
-		k_hat = _k.norm();
-		vec<P> s = k_hat.cross(_E0);	//re-orthogonalize
-		E0 = s.cross(k_hat);
-		omega = _omega;
+		k = _k;
+
+		vec<P> s = (k.cross(_E0)).norm();		//compute an orthogonal side vector
+		vec<P> E0_hat = (s.cross(k)).norm();	//compute a normalized E0 direction vector
+		E0 = E0_hat * E0_hat.dot(_E0);					//compute the projection of _E0 onto E0_hat
 	}
  
 	///multiplication operator: scale E0
-	//assignment operator: build an electric field from a plane wave
     planewave<P> & operator* (const P & rhs)
 	{
  
@@ -51,19 +47,34 @@ public:
 		return *this;
 	}
  
+	P lambda() const	//wavelength of the plane wave
+	{
+		return TAU / k.len();	//TAU divided by the magnitude of the wave vector k
+	}
+
+	P kmag() const	//magnitude of the wave vector k
+	{
+		return k.len();
+	}
+
  
-	planewave<P> refract(rts::vec<P> new_k)
+	planewave<P> refract(rts::vec<P> kn) const
 	{
-		new_k = new_k.norm();	//normalize new_k
+		vec<P> kn_hat = kn.norm();	//normalize new_k
+		vec<P> k_hat = k.norm();
  
 		//compute the side vector (around which we will be rotating)
-		rts::vec<P> s = k_hat.cross(E0.norm());
+		vec<P> E0_hat = E0.norm();
+		rts::vec<P> s = k_hat.cross(E0_hat);
  
 		//compute the projection of k' onto the k-E plane
-		rts::vec<P> s_prime = s * (new_k.dot(s));
+		rts::vec<P> s_prime = s * (kn_hat.dot(s));
  
 		//compute the angle between
-		P theta = acos(k_hat.dot( (new_k - s_prime).norm() ));
+		vec<P> kp_hat = (kn_hat - s_prime).norm();
+		P theta = acos(k_hat.dot( kp_hat ));
+		if(kn_hat.dot(E0_hat) < 0)
+			theta = -theta;
  
 		//rotate E0 around s by theta
 		quaternion<P> q;
@@ -71,17 +82,60 @@ public:
 		rts::vec<P> E0_prime = q.toMatrix3() * E0;
  
 		//create the refracted plane wave
-		planewave<P> new_p(omega);
+		planewave<P> new_p;
 		new_p.E0 = E0_prime;
-		new_p.k_hat = new_k;
+		new_p.k = kn_hat * k.len();
  
 		return new_p;
+		/*vec<P> kn_hat = kn.norm();		//normalize kn
+		vec<P> k_hat = k.norm();
+		vec<P> E0_hat = E0.norm();
+
+		vec<P> B = k_hat.cross(E0_hat);
+		planewave<P> newp;
+		newp.k = kn_hat * k.len();
+		newp.E0 = B.cross(kn_hat).norm();
+		std::cout<<newp.k.norm().dot(newp.E0.norm())<<std::endl;
+		return newp;*/
+/*
+		//compute the side vector (around which we will be rotating)
+		rts::vec<P> s_hat = k_hat.cross(E0_hat);
+		//std::cout<<s.len()<<std::endl;
+
+		//project kn_hat into the k-E0 plane
+		rts::vec<P> sp = s_hat * (kn_hat.dot(s_hat));	//project k_new onto s
+		rts::vec<P> kp = (kn_hat - sp);	//correct k_new so it lies on the E0-k plane
+		rts::vec<P> kp_hat = kp.norm();
+
+		//compute the angle and direction between k_prime and k
+		P theta = acos(k_hat.dot(kp_hat));
+		if(kp_hat.dot(E0_hat) < 0)
+			theta = -theta;
+
+		//rotate E0 around s by theta
+		quaternion<P> q;
+		q.CreateRotation(theta, s_hat);
+		rts::vec<P> E0n = q.toMatrix3() * E0;
+		rts::vec<P> E0n_hat = E0n.norm();
+
+		//std::cout<<s_hat.dot(kp_hat)<<"  "<<s_hat.dot(E0n_hat)<<"  "<<s_hat.dot(E0_hat)<<"  "<<s_hat.dot(k_hat)<<"  "<<
+		//	E0_hat.dot(k_hat)<<"  "<<k_hat.dot(kp_hat)<<"  "<<E0_hat.dot(E0n_hat)<<"  "<<E0n_hat.dot(kp_hat)<<std::endl;
+
+		//create the refracted plane wave
+		//std::cout<<"cos: "<<cos(theta)<<"   k*kp: "<<k_hat.dot(kp_hat)<<"  E0*E0p: "<<E0_hat.dot(E0n_hat)<<"  E0p*kp: "<<E0n_hat.dot(kp_hat)<<std::endl;
+
+		//std::cout<<"kp*s: "<<kp.dot(s_hat)<<"   E0n*s: "<<E0n.dot(s_hat)<<"  |E0n|: "<<E0n.len()<<"  E0n*kp: "<<E0n.dot(kp_hat)<<"  E0n*kn: "<<E0n.dot(kn_hat)<<std::endl;
+
+		planewave<P> new_p(kn_hat * k.len(), E0n);				//create the plane wave
+		std::cout<<"|E0n|: "<<new_p.E0.len()<<"  E0n*kn: "<<(new_p.E0.norm()).dot(new_p.k.norm())<<std::endl;
+
+		return new_p;*/
 	}
  
 	std::string str()
 	{
 		std::stringstream ss;
-		ss<<E0<<" e^i ( "<<omega<<"t - "<<omega<<" "<<k_hat * omega<<" . r )";
+		ss<<E0<<" e^i ( "<<k<<" . r )";	//format the wave as "E0 e^i ( k . r )" using the stream output of E0 and k
 		return ss.str();
 	}
 };
@@ -86,7 +86,6 @@ static void initBrewer()
 //Release the CUDA array referenced by gpuBrewer with cudaFreeArray.
 //The call's result is passed to HANDLE_ERROR (defined elsewhere).
 static void destroyBrewer()
 {
     HANDLE_ERROR(cudaFreeArray(gpuBrewer));
-
 }
  
 template<class T>
@@ -99,8 +98,11 @@ __global__ static void applyBrewer(T* gpuSource, unsigned char* gpuDest, unsigne
 	//compute the normalized value on [minVal maxVal]
 	float a = (gpuSource[i] - minVal) / (maxVal - minVal);
  
+    //compensate for the additional space at the edges
+    a *= (T)(BREWER_CTRL_PTS - 1)/(T)(BREWER_CTRL_PTS);
+
 	//lookup the color
-	float shift = 1.0/(2*BREWER_CTRL_PTS);
+	float shift = (T)1/(2*BREWER_CTRL_PTS);
 	float4 color = tex1D(cudaTexBrewer, a+shift);
 	//float4 color = tex1D(cudaTexBrewer, a);
  
-#ifndef RTS_SCALAR_SLICE
-#define RTS_SCALAR_SLICE
-
-#include "../visualization/colormap.h"
-#include "../envi/envi.h"
-#include "../math/quad.h"
-#include "../cuda/devices.h"
-#include "cublas_v2.h"
-#include <cuda_runtime.h>
-
-///Fill a 2D slice with the product of three 1D Gaussians (mostly for testing).
-///Each thread handles one sample (iu, iv) of an r0 x r1 slice, maps it through
-///shape(u, v) to a 3-component position and writes the product of the Gaussian
-///evaluated at each component into dest. Expects a 2D launch covering r0 x r1.
-template<typename T>
-__global__ void gpu_gaussian(T* dest, unsigned int r0, unsigned int r1, T mean, T std, rts::quad<T> shape)
-{
-	//sample coordinates handled by this thread
-	int iu = blockIdx.x * blockDim.x + threadIdx.x;
-	int iv = blockIdx.y * blockDim.y + threadIdx.y;
-
-	//threads that fall outside of the slice have nothing to do
-	if( !(iu < r0 && iv < r1) ) return;
-
-	//position of this sample on the quad, using parameters u = iu/r0 and v = iv/r1
-	T u = (T)iu / (T)r0;
-	T v = (T)iv / (T)r1;
-	rts::vec<T> pos = shape(u, v);
-
-	//terms shared by all three axes
-	T scale = (T)1.0 / (std * (T)sqrt(2 * 3.14159f) );
-	T two_var = 2 * std * std;
-
-	//accumulate the product of the per-axis Gaussians
-	T product = (T)1;
-	for(int axis = 0; axis < 3; axis++)
-		product = product * ( scale * exp( - pow(pos[axis] - mean, 2) / two_var ) );
-
-	//linear (row-major) index of the sample in the destination slice
-	int idx = iv*r0 + iu;
-	dest[idx] = product;
-}
-
-namespace rts {
-///A 2D slice of scalar values stored in GPU memory.
-///The struct owns the device allocation pointed to by S: it is released in the
-///destructor and deep-copied by the copy constructor and assignment operator.
-template<typename P>
-struct scalarfield
-{
-	//gpu pointer to the scalar slice (NULL when no memory is allocated)
-	P* S;
-
-	//resolution of the slice (R[0] x R[1] samples, stored row by row)
-	int R[2];
-
-	//quad handed to the sampling kernel, which evaluates shape(u, v) per sample
-	quad<P> shape;
-
-	///Number of bytes occupied by the slice at the current resolution.
-	size_t sliceBytes() const
-	{
-		return sizeof(P) * (size_t)R[0] * (size_t)R[1];
-	}
-
-	///Create an empty field: zero resolution and no GPU memory.
-	scalarfield()
-	{
-		R[0] = R[1] = 0;
-		shape = quad<P>(vec<P>(-1, -1, 0), vec<P>(-1, 1, 0), vec<P>(1, 1, 0));
-		S = NULL;
-
-		std::cout<<"scalarfield CONSTRUCTOR"<<std::endl;
-	}
-
-	///Create a field of x by y samples and allocate (uninitialized) GPU memory for it.
-	scalarfield(int x, int y)
-	{
-		//set the resolution
-		R[0] = x;
-		R[1] = y;
-
-		shape = quad<P>(vec<P>(-1, -1, 0), vec<P>(-1, 1, 0), vec<P>(1, 1, 0));
-
-		//allocate memory on the GPU
-		S = NULL;
-		HANDLE_ERROR(cudaMalloc( (void**)&S, sliceBytes() ));
-
-		std::cout<<"scalarfield CONSTRUCTOR"<<std::endl;
-	}
-
-	///Release the GPU memory (if any) and reset the resolution.
-	~scalarfield()
-	{
-		if(S != NULL)
-			HANDLE_ERROR(cudaFree(S));
-		S = NULL;
-		R[0] = R[1] = 0;
-
-		std::cout<<"scalarfield DESTRUCTOR"<<std::endl;
-	}
-
-	///Set every byte of the slice to zero; does nothing when no memory is allocated.
-	void clear()
-	{
-		if(S != NULL)
-			HANDLE_ERROR(cudaMemset(S, 0, sliceBytes()));
-	}
-
-	///Save the slice as an image, mapping the range [vmin, vmax] through the color map.
-	void toImage(std::string filename, P vmin, P vmax, rts::colormapType cmap = rts::cmBrewer)
-	{
-		rts::gpu2image<P>(S, filename, R[0], R[1], vmin, vmax, cmap);
-	}
-
-	///Save the slice as an image, scaling by the value with the largest magnitude.
-	///positive = true maps [0, max]; positive = false maps [-|max|, |max|].
-	void toImage(std::string filename, bool positive = true, rts::colormapType cmap = rts::cmBrewer)
-	{
-		//an empty field has no maximum: the 1-based index adjustment below would
-		//otherwise make the copy read one element before the start of S
-		int N = R[0] * R[1];
-		if(S == NULL || N <= 0)
-		{
-			std::cout<<"scalarfield Error: cannot save an empty field to "<<filename<<std::endl;
-			return;
-		}
-
-		cublasStatus_t stat;
-		cublasHandle_t handle;
-
-		//create a CUBLAS handle
-		stat = cublasCreate(&handle);
-		if(stat != CUBLAS_STATUS_SUCCESS)
-		{
-			std::cout<<"CUBLAS Error: initialization failed"<<std::endl;
-			exit(1);
-		}
-
-		//find the index of the value with maximum magnitude
-		int result;
-		if(sizeof(P) == 4)
-			stat = cublasIsamax(handle, N, (float*)S, 1, &result);
-		else
-			stat = cublasIdamax(handle, N, (double*)S, 1, &result);
-
-		if(stat != CUBLAS_STATUS_SUCCESS)
-		{
-			std::cout<<"CUBLAS Error: failure finding maximum value."<<std::endl;
-			exit(1);
-		}
-
-		//adjust for 1-based indexing
-		result -= 1;
-
-		//retrieve the maximum value
-		P maxVal;
-		HANDLE_ERROR(cudaMemcpy(&maxVal, S + result, sizeof(P), cudaMemcpyDeviceToHost));
-
-		//destroy the CUBLAS handle
-		cublasDestroy(handle);
-
-		//output the image
-		if(positive)
-			toImage(filename, 0, maxVal, cmap);
-		else
-		{
-			//compute |maxVal| in type P; an unqualified abs() may resolve to the integer overload
-			P mag = (maxVal < 0) ? -maxVal : maxVal;
-			toImage(filename, -mag, mag, cmap);
-		}
-	}
-
-	///Write the slice as one band of an ENVI file.
-	///append = true opens the file in "a" mode, otherwise in "w" mode.
-	void toEnvi(std::string filename, P wavelength = 0, bool append = false)
-	{
-		std::string mode;
-		if(append) mode = "a";
-		else       mode = "w";
-
-		//open the ENVI file
-		EnviFile outfile(filename, mode);
-
-		//get the scalar slice from the GPU to the CPU
-		size_t memsize = sliceBytes();
-		P* cpuData = (P*) malloc( memsize );
-		if(cpuData == NULL && memsize != 0)
-		{
-			std::cout<<"scalarfield Error: unable to allocate host memory for "<<filename<<std::endl;
-			exit(1);
-		}
-		HANDLE_ERROR(cudaMemcpy( cpuData, S, memsize, cudaMemcpyDeviceToHost));
-
-		//add a band to the ENVI file
-		outfile.addBand(cpuData, R[0], R[1], wavelength);
-
-		outfile.close();
-
-		//the host copy is only needed while the band is written
-		free(cpuData);
-	}
-
-	///Assignment operator: replaces this field with a deep copy of rhs.
-	scalarfield & operator= (const scalarfield & rhs)
-	{
-		//self-assignment: freeing S first would destroy the data to be copied
-		if(this == &rhs)
-			return *this;
-
-		//de-allocate any existing GPU memory
-		if(S != NULL)
-		{
-			HANDLE_ERROR(cudaFree(S));
-			S = NULL;
-		}
-
-		//copy the slice resolution and geometry
-		R[0] = rhs.R[0];
-		R[1] = rhs.R[1];
-		shape = rhs.shape;
-
-		//only allocate and copy when the source actually holds data
-		if(rhs.S != NULL)
-		{
-			HANDLE_ERROR(cudaMalloc((void**)&S, sliceBytes()));
-			HANDLE_ERROR(cudaMemcpy(S, rhs.S, sliceBytes(), cudaMemcpyDeviceToDevice));
-		}
-
-		std::cout<<"Assignment operator."<<std::endl;
-
-		return *this;
-	}
-
-	///Copy constructor: creates a deep copy of rhs (resolution, geometry and data).
-	scalarfield(const scalarfield &rhs)
-	{
-		//first make a shallow copy
-		R[0] = rhs.R[0];
-		R[1] = rhs.R[1];
-		shape = rhs.shape;
-		S = NULL;
-
-		//do we have to make a deep copy?
-		if(rhs.S != NULL)
-		{
-			//allocate the necessary memory
-			HANDLE_ERROR(cudaMalloc((void**)&S, sliceBytes()));
-
-			//copy the slice
-			HANDLE_ERROR(cudaMemcpy(S, rhs.S, sliceBytes(), cudaMemcpyDeviceToDevice));
-		}
-
-		std::cout<<"scalarfield COPY CONSTRUCTOR"<<std::endl;
-	}
-
-	///Fill the slice by launching gpu_gaussian with the given mean and standard deviation.
-	void gaussian(P mean, P std)
-	{
-		//nothing to fill, and a grid with a zero dimension is an invalid launch configuration
-		if(S == NULL || R[0] <= 0 || R[1] <= 0)
-			return;
-
-		int maxThreads = rts::maxThreadsPerBlock(); //compute the optimal block size
-		int SQRT_BLOCK = (int)sqrt((float)maxThreads);
-		//create one thread for each sample, rounding the grid up to cover the slice
-		dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
-		dim3 dimGrid((R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
-
-		//instantiate the kernel for the field's own precision (not a fixed float)
-		gpu_gaussian<P> <<<dimGrid, dimBlock>>> (S, R[0], R[1], mean, std, shape);
-
-		//kernel launches do not return a status; query it explicitly
-		HANDLE_ERROR(cudaGetLastError());
-	}
-
-};
-
-}   //end namespace rts
-
-
-
-#endif
-#ifndef RTS_VECTORFIELD
-#define RTS_VECTORFIELD
-
-#include "../visualization/colormap.h"
-#include "../envi/envi.h"
-#include "../math/quad.h"
-#include "../cuda/devices.h"
-#include "cublas_v2.h"
-#include <cuda_runtime.h>
-#include <iomanip>
-
-#include <qimage.h>
-#include <qcolor.h>
-
-
-namespace rts {
-///A 2D slice of N-component vectors stored in GPU memory, one scalar slice per component.
-///The struct owns the device allocations in S[]: they are released in the destructor
-///and deep-copied by the copy constructor and assignment operator.
-template<typename P, unsigned int N = 3>
-struct vectorfield
-{
-private:
-	///Split a file name mask such as "dir/img???.bmp" into its parts.
-	///  prefix  - text before the first '?' wildcard      ("dir/img")
-	///  postfix - text between the last '?' and extension ("")
-	///  ext     - extension including the period          (".bmp")
-	///  digits  - number of characters spanned by the wildcards (3)
-	///A name without wildcards yields digits = 0 and an empty postfix; a name
-	///without an extension yields an empty ext.
-	void process_filename(std::string name, std::string &prefix, std::string &postfix,
-	                      std::string &ext, unsigned int &digits)
-	{
-		//split at the LAST period, and only if it belongs to the file name rather
-		//than to a directory (ex. "./out/image")
-		size_t dot = name.find_last_of('.');
-		size_t sep = name.find_last_of("/\\");
-		bool has_ext = (dot != std::string::npos) && (sep == std::string::npos || dot > sep);
-
-		std::string stem;
-		if(has_ext)
-		{
-			stem = name.substr(0, dot);
-			ext = name.substr(dot);             //keeps the period
-		}
-		else
-		{
-			stem = name;
-			ext = std::string();
-		}
-
-		size_t i0 = stem.find_first_of('?');    //find the positions of the first and last wildcard ('?')
-		size_t i1 = stem.find_last_of('?');
-
-		if(i0 == std::string::npos)             //no wildcards: the index is appended to the name
-		{
-			prefix = stem;
-			postfix = std::string();
-			digits = 0;
-			return;
-		}
-
-		postfix = stem.substr(i1 + 1);
-		prefix = stem.substr(0, i0);
-		digits = (unsigned int)(i1 - i0 + 1);   //compute the number of wildcards
-	}
-
-	///Absolute value computed in type P (an unqualified abs() may resolve to the integer overload).
-	static P magnitude(P v)
-	{
-		return (v < 0) ? -v : v;
-	}
-
-	///Number of bytes occupied by one component slice at the current resolution.
-	size_t sliceBytes() const
-	{
-		return sizeof(P) * (size_t)R[0] * (size_t)R[1];
-	}
-
-public:
-	//gpu pointers to scalar slices (NULL when no memory is allocated)
-	P* S[N];
-
-	//resolution of the slice
-	int R[2];
-
-	quad<P> shape;
-
-	///Create an empty field: zero resolution and no GPU memory.
-	vectorfield()
-	{
-		R[0] = R[1] = 0;                        //set the initial resolution to 0
-		shape = quad<P>(vec<P>(-1, -1, 0), vec<P>(-1, 1, 0), vec<P>(1, 1, 0));
-		for(unsigned int n=0; n<N; n++)         //set each vector component to NULL
-			S[n] = NULL;
-
-		std::cout<<"vectorfield CONSTRUCTOR"<<std::endl;
-	}
-
-	///Create a field of x by y samples and allocate (uninitialized) GPU memory for each component.
-	vectorfield(int x, int y)
-	{
-		//set the resolution
-		R[0] = x;
-		R[1] = y;
-
-		shape = quad<P>(vec<P>(-1, -1, 0), vec<P>(-1, 1, 0), vec<P>(1, 1, 0));
-
-		//allocate memory on the GPU
-		for(unsigned int n=0; n<N; n++)
-		{
-			S[n] = NULL;
-			HANDLE_ERROR(cudaMalloc( (void**)&S[n], sliceBytes() ));
-		}
-
-		std::cout<<"vectorfield CONSTRUCTOR"<<std::endl;
-	}
-
-	///Release the GPU memory of every component and reset the resolution.
-	~vectorfield()
-	{
-		for(unsigned int n=0; n<N; n++)
-			if(S[n] != NULL)
-			{
-				HANDLE_ERROR(cudaFree(S[n]));
-				S[n] = NULL;
-			}
-		R[0] = R[1] = 0;
-
-		std::cout<<"vectorfield DESTRUCTOR"<<std::endl;
-	}
-
-	///Set every byte of every allocated component slice to zero.
-	void clear()
-	{
-		for(unsigned int n=0; n<N; n++)
-			if(S[n] != NULL)
-				HANDLE_ERROR(cudaMemset(S[n], 0, sliceBytes()));
-	}
-
-	///Save component n as an image, mapping [vmin, vmax] through the color map.
-	void toImage(std::string filename, unsigned int n, P vmin, P vmax, rts::colormapType cmap = rts::cmBrewer)
-	{
-		if(n >= N)                              //S[n] does not exist
-		{
-			std::cout<<"vectorfield Error: component "<<n<<" is out of range."<<std::endl;
-			return;
-		}
-		rts::gpu2image<P>(S[n], filename, R[0], R[1], vmin, vmax, cmap);
-	}
-
-	///Save the first three components as the red, green and blue channels of one image.
-	void toImage3(std::string filename, P vmin, P vmax)
-	{
-		std::cout<<"Implementing a 3-component rainbow colormap: "<<filename.c_str()<<std::endl;
-
-		if(N < 3)                               //S[0..2] are all required below
-		{
-			std::cout<<"vectorfield Error: an RGB image requires three components."<<std::endl;
-			return;
-		}
-
-		//create a buffer for each RGB component
-		size_t bufsize = sizeof(unsigned char) * 3 * (size_t)R[0] * (size_t)R[1];
-		unsigned char* red = (unsigned char*)malloc(bufsize);
-		unsigned char* green = (unsigned char*)malloc(bufsize);
-		unsigned char* blue = (unsigned char*)malloc(bufsize);
-
-		if(red == NULL || green == NULL || blue == NULL)
-		{
-			std::cout<<"vectorfield Error: unable to allocate host image buffers."<<std::endl;
-			free(red);                          //free(NULL) is a no-op
-			free(green);
-			free(blue);
-			return;
-		}
-
-		//retrieve the buffered images for each component
-		rts::gpu2cpu<P>(S[0], red, R[0] * R[1], vmin, vmax);
-		rts::gpu2cpu<P>(S[1], green, R[0] * R[1], vmin, vmax);
-		rts::gpu2cpu<P>(S[2], blue, R[0] * R[1], vmin, vmax);
-
-		QImage image(R[0], R[1], QImage::Format_RGB32);		//create a QImage object
-		if(image.isNull())										//if it didn't work, report an error
-		{
-			std::cout<<"Error creating QImage."<<std::endl;
-		}
-		else
-		{
-			for(int y=0; y<R[1]; y++)
-				for(int x=0; x<R[0]; x++)
-				{
-					//calculate the 1D index
-					int i = y * R[0] + x;
-
-					//each buffer holds three bytes per sample; channel c is read from byte c
-					unsigned char r = red[i * 3 + 0];
-					unsigned char g = green[i * 3 + 1];
-					unsigned char b = blue[i * 3 + 2];
-
-					//set the image pixel
-					QColor color(r, g, b);
-					image.setPixel(x, y, color.rgb());
-				}
-
-			if(!image.save(filename.c_str()))					//if the image didn't save correctly,
-				std::cout<<"Error saving QImage."<<std::endl;	//	report an error
-		}
-
-		//release the host buffers on every path
-		free(red);
-		free(green);
-		free(blue);
-	}
-
-	///Save every component as an image.
-	///  filename  - name mask; '?' wildcards are replaced by the zero-padded component index
-	///  positive  - true maps [0, max], false maps [-max, max]
-	///  cmap      - color map; cmRainbow with N == 3 writes one RGB image instead
-	///  globalmax - true scales all components by the largest magnitude in the field,
-	///              false scales each component by its own maximum
-	void toImages(std::string filename, bool positive = true, rts::colormapType cmap = rts::cmBrewer, bool globalmax = true)
-	{
-		std::string prefix, postfix, extension;
-		unsigned int digits;
-		process_filename(filename, prefix, postfix, extension, digits);      //process the filename for wild cards
-
-		int L = R[0] * R[1];    //compute the number of discrete points in a slice
-
-		//an empty field has no maximum: the 1-based index adjustment below would
-		//otherwise make the copy read one element before the start of the slice
-		bool empty = (L <= 0);
-		for(unsigned int n=0; n<N; n++)
-			if(S[n] == NULL) empty = true;
-		if(empty)
-		{
-			std::cout<<"vectorfield Error: cannot save an empty field to "<<filename<<std::endl;
-			return;
-		}
-
-		cublasStatus_t stat;
-		cublasHandle_t handle;
-
-		//create a CUBLAS handle
-		stat = cublasCreate(&handle);
-		if(stat != CUBLAS_STATUS_SUCCESS)
-		{
-			std::cout<<"CUBLAS Error: initialization failed"<<std::endl;
-			exit(1);
-		}
-
-		int result;             //result of the max operation
-
-		P maxVal[N];            //value with the largest magnitude in each component (signed)
-		P maxAll = 0;           //largest magnitude in the data set
-
-		//compute the maximum value for each vector component
-		for(unsigned int n=0; n<N; n++)
-		{
-			if(sizeof(P) == 4)
-				stat = cublasIsamax(handle, L, (const float*)S[n], 1, &result);
-			else
-				stat = cublasIdamax(handle, L, (const double*)S[n], 1, &result);
-
-			if(stat != CUBLAS_STATUS_SUCCESS)   //if there was a GPU error, terminate
-			{
-				std::cout<<"CUBLAS Error: failure finding maximum value."<<std::endl;
-				exit(1);
-			}
-
-			result -= 1;        //adjust for 1-based indexing
-
-			//retrieve the maximum value for this slice and store it in the maxVal array
-			HANDLE_ERROR(cudaMemcpy(&maxVal[n], S[n] + result, sizeof(P), cudaMemcpyDeviceToHost));
-
-			//the value found has the largest MAGNITUDE and may be negative, so compare magnitudes
-			if(magnitude(maxVal[n]) > maxAll)
-				maxAll = magnitude(maxVal[n]);
-		}
-
-		cublasDestroy(handle);  //destroy the CUBLAS handle
-
-		if(cmap == rts::cmRainbow && N == 3)		//if the user specifies a rainbow colormap, and the vector has 3 elements
-		{
-			//implement a single image with RGB = XYZ
-			if(positive)
-				toImage3(prefix+postfix+extension, 0, maxAll);
-			else
-				toImage3(prefix+postfix+extension, -maxAll, maxAll);
-		}
-		else
-		{
-			for(unsigned int n=0; n<N; n++)     //for each image
-			{
-				std::stringstream ss;           //assemble the file name
-				ss<<prefix<<std::setfill('0')<<std::setw(digits)<<n<<postfix<<extension;
-				std::cout<<ss.str()<<std::endl;
-
-				P localMax = magnitude(maxVal[n]);
-				if(positive)                //if the image is positive
-				{
-					std::cout<<"Positive: "<<n<<std::endl;
-					if(globalmax)           //if the global maximum is used
-						toImage(ss.str(), n, 0, maxAll, cmap);         //save the image using the global maximum
-					else
-						toImage(ss.str(), n, 0, maxVal[n], cmap);      //save the image using the local maximum
-				}
-				else
-				{
-					std::cout<<"Negative: "<<n<<std::endl;
-					if(globalmax)           //if the global maximum is used
-						toImage(ss.str(), n, -maxAll, maxAll, cmap);       //save the image using the global maximum
-					else
-						toImage(ss.str(), n, -localMax, localMax, cmap);   //save the image using the local maximum
-				}
-			}
-		}
-	}
-
-	///Assignment operator: replaces this field with a deep copy of rhs.
-	vectorfield & operator= (const vectorfield & rhs)
-	{
-		//self-assignment: freeing S[] first would destroy the data to be copied
-		if(this == &rhs)
-			return *this;
-
-		//de-allocate any existing GPU memory
-		for(unsigned int n=0; n<N; n++)
-			if(S[n] != NULL)
-			{
-				HANDLE_ERROR(cudaFree(S[n]));
-				S[n] = NULL;
-			}
-
-		//copy the slice resolution and geometry
-		R[0] = rhs.R[0];
-		R[1] = rhs.R[1];
-		shape = rhs.shape;
-
-		for(unsigned int n=0; n<N; n++)
-		{
-			if(rhs.S[n] == NULL)            //nothing to copy for this component
-				continue;
-
-			//allocate the necessary memory
-			HANDLE_ERROR(cudaMalloc((void**)&S[n], sliceBytes()));
-
-			//copy the slice
-			HANDLE_ERROR(cudaMemcpy(S[n], rhs.S[n], sliceBytes(), cudaMemcpyDeviceToDevice));
-		}
-
-		std::cout<<"Assignment operator."<<std::endl;
-
-		return *this;
-	}
-
-	///Copy constructor: creates a deep copy of rhs (resolution, geometry and data).
-	vectorfield(const vectorfield &rhs)
-	{
-		//first make a shallow copy
-		R[0] = rhs.R[0];
-		R[1] = rhs.R[1];
-		shape = rhs.shape;
-
-		for(unsigned int n=0; n<N; n++)
-		{
-			S[n] = NULL;
-
-			//do we have to make a deep copy of this component?
-			if(rhs.S[n] == NULL)
-				continue;
-
-			//allocate the necessary memory
-			HANDLE_ERROR(cudaMalloc((void**)&S[n], sliceBytes()));
-
-			//copy the slice
-			HANDLE_ERROR(cudaMemcpy(S[n], rhs.S[n], sliceBytes(), cudaMemcpyDeviceToDevice));
-		}
-
-		std::cout<<"vectorfield COPY CONSTRUCTOR"<<std::endl;
-	}
-
-};
-
-}   //end namespace rts
-
-
-
-#endif