diff --git a/bessjy.cpp b/bessjy.cpp
index 2dc98af..e72ff28 100644
--- a/bessjy.cpp
+++ b/bessjy.cpp
@@ -13,7 +13,9 @@
 //
 #define _USE_MATH_DEFINES
 #include <math.h>
-#include "bessel.h"
+#include "bessel.h"
+
+#define PI  3.14159265358979323846	//full double precision (3.14159 is only good to ~1e-6)
 
 double gamma(double x);
 //
@@ -426,7 +428,7 @@ int bessjynb(int n,double x,int &nm,double *jn,double *yn,
         0.2775764465332031,
        -1.993531733751297,
         2.724882731126854e1};
-        
+
     int i,k,m;
     nm = n;
     if ((x < 0.0) || (n < 0)) return 1;
@@ -702,5 +704,26 @@ int bessjyv(double v,double x,double &vm,double *jv,double *yv,
     }
     vm = n + v0;
     return 0;
+}
+
+//Spherical Bessel functions j_n(z), y_n(z) and derivatives for n = 0..v, from
+//J_(n+1/2), Y_(n+1/2) scaled by sqrt(pi/(2z)); outputs need v+1 entries each.
+//Returns bessjyv()'s status (0 = success); the scaling is applied regardless.
+int bessjyv_sph(int v, double z, double &vm, double* cjv,
+    double* cyv, double* cjvp, double* cyvp)
+{
+    //first, compute the bessel functions of fractional order
+    const int status = bessjyv(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
+    //sqrt(pi/(2z)), hoisted out of the loop; M_PI instead of the 6-digit PI
+    const double scale = sqrt(M_PI / (z * 2.0));
+    for(int n = 0; n<=v; n++)
+    {
+        cjv[n] *= scale;
+        cyv[n] *= scale;
+        //product rule: (scale * F)' = scale * F' - f / (2z), f = scaled value
+        cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * scale;
+        cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * scale;
+    }
+    return status;
 }
- 
+
diff --git a/cbessjy.cpp b/cbessjy.cpp
index 517dc77..8710800 100644
--- a/cbessjy.cpp
+++ b/cbessjy.cpp
@@ -724,6 +724,7 @@ int cbessjyva_sph(int v,complex<double> z,double &vm,complex<double>*cjv,
     //iterate through each and scale
     for(int n = 0; n<=v; n++)
     {
+
         cjv[n] = cjv[n] * sqrt(PI/(z * 2.0));
         cyv[n] = cyv[n] * sqrt(PI/(z * 2.0));
 
diff --git a/colormap.h b/colormap.h
deleted file mode 100644
index 17b7e34..0000000
--- a/colormap.h
+++ /dev/null
@@ -1,229 +0,0 @@
-#ifndef RTS_COLORMAP_H
-#define RTS_COLORMAP_H
-
-#include <string>
-#include <qimage.h>
-#include <qcolor.h>
-#include "rts/cuda/error.h"
-
-
-#define BREWER_CTRL_PTS 11
-
-#ifdef __CUDACC__
-texture<float4, cudaTextureType1D> cudaTexBrewer;
-static cudaArray* gpuBrewer;
-#endif
-
-
-
-namespace rts{
-	namespace colormap{
-
-enum colormapType {cmBrewer, cmGrayscale};
-
-static void buffer2image(unsigned char* buffer, std::string filename, unsigned int x_size, unsigned int y_size)
-{
-	//create an image object
-	QImage image(x_size, y_size, QImage::Format_RGB32);
-
-	int i;
-	unsigned char r, g, b;
-	unsigned int x, y;
-	for(y=0; y<y_size; y++)
-		for(x=0; x<x_size; x++)
-		{
-			//calculate the 1D index
-			i = y * x_size + x;
-
-			r = buffer[i * 3 + 0];
-			g = buffer[i * 3 + 1];
-			b = buffer[i * 3 + 2];
-
-			//set the image pixel
-			QColor color(r, g, b);
-			image.setPixel(x, y, color.rgb());
-		}
-
-	image.save(filename.c_str());
-}
-
-#ifdef __CUDACC__
-static void initBrewer()
-{
-	//initialize the Brewer colormap
-
-	//allocate CPU space
-	float4 cpuColorMap[BREWER_CTRL_PTS];
-
-	//define control rtsPoints
-	cpuColorMap[0] = make_float4(0.192157f, 0.211765f, 0.584314f, 1.0f);
-	cpuColorMap[1] = make_float4(0.270588f, 0.458824f, 0.705882f, 1.0f);
-	cpuColorMap[2] = make_float4(0.454902f, 0.678431f, 0.819608f, 1.0f);
-	cpuColorMap[3] = make_float4(0.670588f, 0.85098f, 0.913725f, 1.0f);
-	cpuColorMap[4] = make_float4(0.878431f, 0.952941f, 0.972549f, 1.0f);
-	cpuColorMap[5] = make_float4(1.0f, 1.0f, 0.74902f, 1.0f);
-	cpuColorMap[6] = make_float4(0.996078f, 0.878431f, 0.564706f, 1.0f);
-	cpuColorMap[7] = make_float4(0.992157f, 0.682353f, 0.380392f, 1.0f);
-	cpuColorMap[8] = make_float4(0.956863f, 0.427451f, 0.262745f, 1.0f);
-	cpuColorMap[9] = make_float4(0.843137f, 0.188235f, 0.152941f, 1.0f);
-	cpuColorMap[10] = make_float4(0.647059f, 0.0f, 0.14902f, 1.0f);
-
-
-	int width = BREWER_CTRL_PTS;
-	int height = 0;
-
-
-	// allocate array and copy colormap data
-	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
-
-	HANDLE_ERROR(cudaMallocArray(&gpuBrewer, &channelDesc, width, height));
-
-	HANDLE_ERROR(cudaMemcpyToArray(gpuBrewer, 0, 0, cpuColorMap, sizeof(float4)*width, cudaMemcpyHostToDevice));
-
-	// set texture parameters
-    cudaTexBrewer.addressMode[0] = cudaAddressModeClamp;
-	//texBrewer.addressMode[1] = cudaAddressModeClamp;
-    cudaTexBrewer.filterMode = cudaFilterModeLinear;
-    cudaTexBrewer.normalized = true;  // access with normalized texture coordinates
-
-	// Bind the array to the texture
-    HANDLE_ERROR(cudaBindTextureToArray( cudaTexBrewer, gpuBrewer, channelDesc));
-
-}
-
-static void destroyBrewer()
-{
-    HANDLE_ERROR(cudaFreeArray(gpuBrewer));
-
-}
-
-template<class T>
-__global__ static void applyBrewer(T* gpuSource, unsigned char* gpuDest, unsigned int N, T minVal = 0, T maxVal = 1)
-{
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if(i >= N) return;
-
-	//compute the normalized value on [minVal maxVal]
-	float a = (gpuSource[i] - minVal) / (maxVal - minVal);
-
-	//lookup the color
-	float shift = 1.0/BREWER_CTRL_PTS;
-	float4 color = tex1D(cudaTexBrewer, a+shift);
-
-	gpuDest[i * 3 + 0] = 255 * color.x;
-	gpuDest[i * 3 + 1] = 255 * color.y;
-	gpuDest[i * 3 + 2] = 255 * color.z;
-}
-
-template<class T>
-__global__ static void applyGrayscale(T* gpuSource, unsigned char* gpuDest, unsigned int N, T minVal = 0, T maxVal = 1)
-{
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if(i >= N) return;
-
-	//compute the normalized value on [minVal maxVal]
-	float a = (gpuSource[i] - minVal) / (maxVal - minVal);
-
-	gpuDest[i * 3 + 0] = 255 * a;
-	gpuDest[i * 3 + 1] = 255 * a;
-	gpuDest[i * 3 + 2] = 255 * a;
-}
-
-template<class T>
-static void gpu2gpu(T* gpuSource, unsigned char* gpuDest, unsigned int nVals, T minVal = 0, T maxVal = 1, colormapType cm = cmGrayscale, int blockDim = 128)
-{
-	//This function converts a scalar field on the GPU to a color image on the GPU
-	int gridDim = (nVals + blockDim - 1)/blockDim;
-	if(cm == cmGrayscale)
-		applyGrayscale<<<gridDim, blockDim>>>(gpuSource, gpuDest, nVals, minVal, maxVal);
-	else if(cm == cmBrewer)
-	{
-		initBrewer();
-		applyBrewer<<<gridDim, blockDim>>>(gpuSource, gpuDest, nVals, minVal, maxVal);
-		destroyBrewer();
-	}
-
-}
-
-template<class T>
-static void gpu2cpu(T* gpuSource, unsigned char* cpuDest, unsigned int nVals, T minVal, T maxVal, colormapType cm = cmGrayscale)
-{
-    //this function converts a scalar field on the GPU to a color image on the CPU
-
-    //first create the color image on the GPU
-
-    //allocate GPU memory for the color image
-    unsigned char* gpuDest;
-    HANDLE_ERROR(cudaMalloc( (void**)&gpuDest, sizeof(unsigned char) * nVals * 3 ));
-
-	//HANDLE_ERROR(cudaMemset(gpuSource, 0, sizeof(T) * nVals));
-
-    //create the image on the gpu
-    gpu2gpu(gpuSource, gpuDest, nVals, minVal, maxVal, cm);
-
-	//HANDLE_ERROR(cudaMemset(gpuDest, 0, sizeof(unsigned char) * nVals * 3));
-
-    //copy the image from the GPU to the CPU
-    HANDLE_ERROR(cudaMemcpy(cpuDest, gpuDest, sizeof(unsigned char) * nVals * 3, cudaMemcpyDeviceToHost));
-
-	HANDLE_ERROR(cudaFree( gpuDest ));
-
-}
-
-template<typename T>
-static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, unsigned int y_size, T valMin, T valMax, colormapType cm = cmGrayscale)
-{
-	//allocate a color buffer
-	unsigned char* cpuBuffer = (unsigned char*) malloc(sizeof(unsigned char) * 3 * x_size * y_size);
-
-	//do the mapping
-	gpu2cpu<T>(gpuSource, cpuBuffer, x_size * y_size, valMin, valMax, cm);
-
-	//copy the buffer to an image
-	buffer2image(cpuBuffer, fileDest, x_size, y_size);
-
-	free(cpuBuffer);
-}
-
-#endif
-
-template<class T>
-static void cpu2cpu(T* cpuSource, unsigned char* cpuDest, unsigned int nVals, T valMin, T valMax, colormapType cm = cmGrayscale)
-{
-	int i;
-	float a;
-	float range = valMax - valMin;
-	for(i = 0; i<nVals; i++)
-	{
-		//normalize to the range [valMin valMax]
-		a = (cpuSource[i] - valMin) / range;
-
-		cpuDest[i * 3 + 0] = 255 * a;
-		cpuDest[i * 3 + 1] = 255 * a;
-		cpuDest[i * 3 + 2] = 255 * a;
-	}
-
-}
-
-
-
-template<typename T>
-static void cpu2image(T* cpuSource, std::string fileDest, unsigned int x_size, unsigned int y_size, T valMin, T valMax, colormapType cm = cmGrayscale)
-{
-    //allocate a color buffer
-	unsigned char* cpuBuffer = (unsigned char*) malloc(sizeof(unsigned char) * 3 * x_size * y_size);
-
-	//do the mapping
-	cpu2cpu<T>(cpuSource, cpuBuffer, x_size * y_size, valMin, valMax, cm);
-
-	//copy the buffer to an image
-	buffer2image(cpuBuffer, fileDest, x_size, y_size);
-
-	free(cpuBuffer);
-
-}
-
-}}	//end namespace colormap and rts
-
-#endif
-
diff --git a/dataTypes.h b/dataTypes.h
index 9515714..ee2efdf 100644
--- a/dataTypes.h
+++ b/dataTypes.h
@@ -24,6 +24,8 @@ typedef double ptype;
 
 typedef ptype fieldPoint;
 
+extern bool verbose;
+
 //hybrid GPU/CPU complex data typ
 #include "rts/math/complex.h"
 #include "rts/math/vector.h"
diff --git a/defaults.h b/defaults.h
index 5ed9204..f0dd9d5 100644
--- a/defaults.h
+++ b/defaults.h
@@ -15,14 +15,14 @@
 #define DEFAULT_FOCUS_X         0
 #define DEFAULT_FOCUS_Y         0
 #define DEFAULT_FOCUS_Z         0
-#define DEFAULT_INCIDENT_ORDER	100
+//#define DEFAULT_INCIDENT_ORDER	20
 #define DEFAULT_STABILITY_PARM	1.4
 
 //optics
-#define DEFAULT_CONDENSER_MIN   0.0
+#define DEFAULT_CONDENSER_MIN   0
 #define DEFAULT_CONDENSER_MAX   1
 
-#define DEFAULT_OBJECTIVE_MIN   0.0
+#define DEFAULT_OBJECTIVE_MIN   0
 #define DEFAULT_OBJECTIVE_MAX   1
 
 //incident light direction
@@ -36,17 +36,20 @@
 //#define DEFAULT_OUTPUT_POINT	fileoutStruct::imageObjective
 
 
-#define DEFAULT_SLICE_MIN_X     -5
-#define DEFAULT_SLICE_MIN_Y     0
-#define DEFAULT_SLICE_MIN_Z     -5
+#define DEFAULT_PLANE_MIN_X     -5
+#define DEFAULT_PLANE_MIN_Y     0
+#define DEFAULT_PLANE_MIN_Z     -5
 
-#define DEFAULT_SLICE_MAX_X     5
-#define DEFAULT_SLICE_MAX_Y     0
-#define DEFAULT_SLICE_MAX_Z     5
+#define DEFAULT_PLANE_MAX_X     5
+#define DEFAULT_PLANE_MAX_Y     0
+#define DEFAULT_PLANE_MAX_Z     5
 
-#define DEFAULT_SLICE_NORM_X    0
-#define DEFAULT_SLICE_NORM_Y    1
-#define DEFAULT_SLICE_NORM_Z    0
+#define DEFAULT_PLANE_NORM_X    0
+#define DEFAULT_PLANE_NORM_Y    1
+#define DEFAULT_PLANE_NORM_Z    0
+
+#define DEFAULT_PLANE_SIZE		40
+#define DEFAULT_PLANE_POSITION	0
 
 
 /*
@@ -64,21 +67,23 @@
 */
 
 
-#define DEFAULT_FIELD_ORDER     200
+#define DEFAULT_FIELD_ORDER     10
 
-#define DEFAULT_SAMPLES         200
+#define DEFAULT_SAMPLES         400
 
 #define DEFAULT_SLICE_RES		256
 
+#define DEFAULT_SPHERE_THETA_R  1000
+
 #define DEFAULT_PADDING			1
 #define DEFAULT_SUPERSAMPLE		1
 
-#define DEFAULT_INTENSITY_FILE	    "testappend"
+#define DEFAULT_INTENSITY_FILE	    "out_i.bmp"
 #define DEFAULT_TRANSMITTANCE_FILE	""
-#define DEFAULT_ABSORBANCE_FILE	    "out_a"
+#define DEFAULT_ABSORBANCE_FILE	    "out_a.bmp"
 #define DEFAULT_NEAR_FILE		    "out_n.bmp"
 #define DEFAULT_FAR_FILE		    "out_f.bmp"
-#define DEFAULT_EXTENDED_SOURCE     "einstein_small.jpg"
+#define DEFAULT_EXTENDED_SOURCE     ""
 #define DEFAULT_FIELD_TYPE		    "magnitude"
 #define DEFAULT_FORMAT			    fileoutStruct::formatImage
 #define DEFAULT_COLORMAP		    "brewer"
diff --git a/fieldslice.cpp b/fieldslice.cpp
index bf63766..882603d 100644
--- a/fieldslice.cpp
+++ b/fieldslice.cpp
@@ -8,14 +8,16 @@
 using namespace std;
 
 fieldslice::fieldslice(unsigned int x_size, unsigned int y_size)
-{
+{
+    x_hat = y_hat = z_hat = NULL;
+
 	//save the slice resolution
 	R[0] = x_size;
 	R[1] = x_size;
 
 	scalarField = true;
 
-	//init_gpu();
+	init_gpu();
 
 
 }
@@ -101,5 +103,5 @@ fieldslice::fieldslice()
 
 fieldslice::~fieldslice()
 {
-	//kill_gpu();
+	kill_gpu();
 }
diff --git a/fieldslice.cu b/fieldslice.cu
index 1b02998..856a91b 100644
--- a/fieldslice.cu
+++ b/fieldslice.cu
@@ -1,12 +1,15 @@
 #include "fieldslice.h"
 #include "dataTypes.h"
-#include "rts/cuda/error.h"
+#include "rts/cuda/error.h"
+#include "rts/cuda/threads.h"
 
 
 __global__ void field_intensity(bsComplex* x, bsComplex* y, bsComplex* z, ptype* I, unsigned int N)
 {
     //compute the index for this thread
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
+	//int i = blockIdx.x * blockDim.x + threadIdx.x;
+	int i = ThreadIndex1D();
+
 	if(i >= N) return;
 
 	ptype xm = x[i].abs();
@@ -66,7 +69,8 @@ __global__ void resample_intensity(bsComplex* x, bsComplex* y, bsComplex* z, pty
 __global__ void field_real(bsComplex* field_component, ptype* V, unsigned int N)
 {
     //compute the index for this thread
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
+	//int i = blockIdx.x * blockDim.x + threadIdx.x;
+	int i = ThreadIndex1D();
 	if(i >= N) return;
 
 	V[i] = field_component[i].real();
@@ -75,7 +79,8 @@ __global__ void field_real(bsComplex* field_component, ptype* V, unsigned int N)
 __global__ void field_imaginary(bsComplex* field_component, ptype* V, unsigned int N)
 {
     //compute the index for this thread
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
+	//int i = blockIdx.x * blockDim.x + threadIdx.x;
+	int i = ThreadIndex1D();
 	if(i >= N) return;
 
 	V[i] = field_component[i].imag();
@@ -84,7 +89,8 @@ __global__ void field_imaginary(bsComplex* field_component, ptype* V, unsigned i
 __global__ void field_sqrt(ptype* input, ptype* output, unsigned int N)
 {
 	//compute the index for this thread
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
+	//int i = blockIdx.x * blockDim.x + threadIdx.x;
+	int i = ThreadIndex1D();
 	if(i >= N) return;
 
 	output[i] = sqrt(input[i]);
@@ -115,7 +121,8 @@ scalarslice fieldslice::Mag()
 
 	//compute the total number of values in the slice
 	unsigned int N = R[0] * R[1];
-	int gridDim = (N+BLOCK-1)/BLOCK;
+	//int gridDim = (N+BLOCK-1)/BLOCK;
+	dim3 gridDim = GenGrid1D(N, BLOCK);
 
 	field_intensity<<<gridDim, BLOCK>>>(x_hat, y_hat, z_hat, result->S, N);
 	field_sqrt<<<gridDim, BLOCK>>>(result->S, result->S, N);
@@ -132,7 +139,8 @@ scalarslice fieldslice::Real()
 
 	//compute the total number of values in the slice
 	unsigned int N = R[0] * R[1];
-	int gridDim = (N+BLOCK-1)/BLOCK;
+	//int gridDim = (N+BLOCK-1)/BLOCK;
+	dim3 gridDim = GenGrid1D(N, BLOCK);
 
 	field_real<<<gridDim, BLOCK>>>(x_hat, result->S, N);
 
@@ -148,7 +156,8 @@ scalarslice fieldslice::Imag()
 
 	//compute the total number of values in the slice
 	unsigned int N = R[0] * R[1];
-	int gridDim = (N+BLOCK-1)/BLOCK;
+	//int gridDim = (N+BLOCK-1)/BLOCK;
+	dim3 gridDim = GenGrid1D(N, BLOCK);
 
 	field_imaginary<<<gridDim, BLOCK>>>(x_hat, result->S, N);
 
@@ -192,7 +201,6 @@ void fieldslice::ScaleField(ptype v)
 
     //compute the total number of values in the slice
 	unsigned int N = R[0] * R[1];
-	//cout<<"Size of mag field: "<<N<<endl;
 	int gridDim = (N+BLOCK-1)/BLOCK;
 
 	field_scale<<<gridDim, BLOCK>>>(x_hat, y_hat, z_hat, N, v);
@@ -200,19 +208,23 @@ void fieldslice::ScaleField(ptype v)
 }
 
 void fieldslice::init_gpu()
-{
+{
+	//if the field has no size, return
+	if(R[0] == 0 || R[1] == 0)
+		return;
+
+    //free any previous memory allocations
+    if(x_hat)
+        HANDLE_ERROR(cudaFree(x_hat));
+    if(y_hat)
+        HANDLE_ERROR(cudaFree(y_hat));
+    if(z_hat)
+        HANDLE_ERROR(cudaFree(z_hat));
+
     //allocate space on the GPU for the field slice
 	HANDLE_ERROR(cudaMalloc((void**)&x_hat, R[0] * R[1] * sizeof(bsComplex)));
-	//HANDLE_ERROR(cudaMemset(x_hat, 0, R[0] * R[1] * sizeof(bsComplex)));
 
-	//if the field is scalar, y_hat and z_hat are unused
-	if(scalarField)
-	{
-		y_hat = NULL;
-		z_hat = NULL;
-
-	}
-	else
+	if(!scalarField)
 	{
 		HANDLE_ERROR(cudaMalloc((void**)&y_hat, R[0] * R[1] * sizeof(bsComplex)));
 		//HANDLE_ERROR(cudaMemset(y_hat, 0, R[0] * R[1] * sizeof(bsComplex)));
@@ -233,6 +245,8 @@ void fieldslice::kill_gpu()
     if(z_hat != NULL)
         HANDLE_ERROR(cudaFree(z_hat));
 
+	x_hat = y_hat = z_hat = NULL;
+
 }
 
 void fieldslice::clear_gpu()
@@ -275,7 +289,7 @@ fieldslice fieldslice::crop(int u, int v, int su, int sv)
 	result.scalarField = scalarField;
 
 	//allocate space for the new field
-	result.init_gpu();
+	//result.init_gpu();
 
 	//create one thread for each pixel of the field slice
 	dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
@@ -291,3 +305,57 @@ fieldslice fieldslice::crop(int u, int v, int su, int sv)
 
 	return result;
 }
+
+//Copy constructor: deep-copies each GPU buffer of rhs so no memory is shared
+fieldslice::fieldslice(const fieldslice& rhs)
+{
+	R[0] = rhs.R[0];
+	R[1] = rhs.R[1];
+	scalarField = rhs.scalarField;
+	//start empty; channels that rhs has not allocated stay NULL
+	x_hat = y_hat = z_hat = NULL;
+	//size_t: the product is evaluated in size_t and must not be narrowed
+	const size_t bytes = sizeof(bsComplex) * R[0] * R[1];
+	if(rhs.x_hat != NULL)
+	{
+		HANDLE_ERROR(cudaMalloc( (void**)&x_hat, bytes));
+		HANDLE_ERROR(cudaMemcpy( x_hat, rhs.x_hat, bytes, cudaMemcpyDeviceToDevice));
+	}
+	if(rhs.y_hat != NULL)
+	{
+		HANDLE_ERROR(cudaMalloc( (void**)&y_hat, bytes));
+		HANDLE_ERROR(cudaMemcpy( y_hat, rhs.y_hat, bytes, cudaMemcpyDeviceToDevice));
+	}
+	if(rhs.z_hat != NULL)
+	{
+		HANDLE_ERROR(cudaMalloc( (void**)&z_hat, bytes));
+		HANDLE_ERROR(cudaMemcpy( z_hat, rhs.z_hat, bytes, cudaMemcpyDeviceToDevice));
+	}
+}
+
+//Assignment operator: turns this slice into an independent deep copy of rhs.
+fieldslice& fieldslice::operator=(const fieldslice& rhs)
+{
+	//nothing to do on self-assignment
+	if(this == &rhs) return *this;
+
+	//release the old buffers first: kill_gpu() resets the pointers to NULL, so a
+	//channel that init_gpu() does not re-create is never left dangling
+	kill_gpu();
+
+	//adopt the resolution and field type of rhs, then allocate to match
+	R[0] = rhs.R[0];
+	R[1] = rhs.R[1];
+	scalarField = rhs.scalarField;
+	init_gpu();
+
+	//copy each channel that exists on both sides
+	const size_t bytes = sizeof(bsComplex) * R[0] * R[1];
+	if(x_hat != NULL && rhs.x_hat != NULL)
+		HANDLE_ERROR(cudaMemcpy(x_hat, rhs.x_hat, bytes, cudaMemcpyDeviceToDevice));
+	if(y_hat != NULL && rhs.y_hat != NULL)
+		HANDLE_ERROR(cudaMemcpy(y_hat, rhs.y_hat, bytes, cudaMemcpyDeviceToDevice));
+	if(z_hat != NULL && rhs.z_hat != NULL)
+		HANDLE_ERROR(cudaMemcpy(z_hat, rhs.z_hat, bytes, cudaMemcpyDeviceToDevice));
+	return *this;
+}
diff --git a/fieldslice.h b/fieldslice.h
index d5c7bbd..43fdef4 100644
--- a/fieldslice.h
+++ b/fieldslice.h
@@ -31,6 +31,9 @@ struct fieldslice
 
 	~fieldslice();
 
+	//copy constructor
+	fieldslice(const fieldslice& rhs);
+
 	//void setPos(bsPoint pMin, bsPoint pMax, bsVector N);
 
 	scalarslice Mag();
@@ -47,6 +50,7 @@ struct fieldslice
 
 	//crop a region from the field
 	fieldslice crop(int u, int v, int su, int sv);
+	fieldslice& operator=(const fieldslice& rhs);
 
 	void init_gpu();
 	void kill_gpu();
diff --git a/fileout.cu b/fileout.cu
index 31e51c7..37e86b3 100644
--- a/fileout.cu
+++ b/fileout.cu
@@ -186,11 +186,21 @@ void fileoutStruct::Save(microscopeStruct* scope)
 	//save images of the fields in the microscope
 
 	//if the user specifies an extended source
-	if(scope->focalPoints.size() > 1)
+	if(scope->focalPoints.size() > 0)
 	{
 		//simulate the extended source and output the detector image
 		scope->SimulateExtendedSource();
 
+		//saveNearField(&scope->nf);
+		saveFarField(scope);
+
+		//save the detector images
+		saveDetector(scope);
+
+		//simulate scattering for the last point (so that you have a near field image)
+		scope->SimulateScattering();
+        saveNearField(&scope->nf);
+
 	}
 	else
 	{
@@ -203,12 +213,15 @@ void fileoutStruct::Save(microscopeStruct* scope)
 		//run the far-field simulation
 		scope->SimulateImaging();
 
+		//saveNearField(&scope->nf);
 		saveFarField(scope);
 
+		//save the detector images
+		saveDetector(scope);
+
 	}
 
-	//save the detector images
-    saveDetector(scope);
+	
 
 
 }
diff --git a/fileout.h b/fileout.h
index c1d4e58..96d0c70 100644
--- a/fileout.h
+++ b/fileout.h
@@ -5,7 +5,7 @@
 //#include "defaults.h"
 #include "dataTypes.h"
 
-#include "colormap.h"
+#include "rts/graphics/colormap.h"
 #include "fieldslice.h"
 #include "nearfield.h"
 #include "microscope.h"
@@ -34,7 +34,7 @@ struct fileoutStruct{
 	//image_source source;
 
 	//color map info
-	rts::colormap::colormapType colormap;
+	rts::colormapType colormap;
 	ptype colorMax;
 
 	void Save(microscopeStruct* scope);
diff --git a/main.cpp b/main.cpp
index 4773cc0..9440b8f 100644
--- a/main.cpp
+++ b/main.cpp
@@ -24,6 +24,7 @@ microscopeStruct* SCOPE;
 #include "warnings.h"
 
 fileoutStruct gFileOut;
+bool verbose = false;
 using namespace std;
 
 int cbessjyva(double v,complex<double> z,double &vm,complex<double>*cjv,
@@ -31,32 +32,19 @@ int cbessjyva(double v,complex<double> z,double &vm,complex<double>*cjv,
 
 int main(int argc, char *argv[])
 {
-	//test Envi loading and saving
-	//EnviFile envi("testenvi", "w");
-
-	//float* data = (float*)malloc(sizeof(float) * 100 * 100);
-	//envi.addBand(data, 100, 100, 100);
-
-	//envi.close();
-
-	//return 0;
 
 	SCOPE = new microscopeStruct();
 
-    cout<<SCOPE->nf.Uf.R[0]<<endl;
-
     LoadParameters(argc, argv);
 
-    //TestSimulation(NF, SCOPE, &gFileOut);
-
 	//initialize GPU memory for fields
 	SCOPE->init();
 
-    OutputOptions();
-
 	gFileOut.Save(SCOPE);
 
-	//NF->destroy();
+	if(verbose)
+        OutputOptions();
+
 	SCOPE->destroy();
 
 
diff --git a/microscope.cu b/microscope.cu
index 21b5253..3c2dfb2 100644
--- a/microscope.cu
+++ b/microscope.cu
@@ -4,7 +4,7 @@
 #include "rts/tools/progressbar.h"
 #include "rts/cuda/timer.h"
 #include "dataTypes.h"
-#include "colormap.h"
+#include "rts/graphics/colormap.h"
 
 #include <QImage>
 
@@ -112,8 +112,8 @@ void microscopeStruct::getFarField()
     //Compute the Far Field image of the focal plane
 
     //clear the memory from previous detector fields
-    Ud.kill_gpu();
-    Ufd.kill_gpu();
+    //Ud.kill_gpu();
+    //Ufd.kill_gpu();
 
 	//first crop the filtered near-field image of the source and scattered fields
 	Ud = nf.U.crop(padding * Ud.R[0], padding * Ud.R[1], Ud.R[0], Ud.R[1]);
@@ -261,9 +261,14 @@ void microscopeStruct::SimulateExtendedSource()
 		t += gpuStopTimer();
 
 		rtsProgressBar((double)(i+1)/(double)npts * 100);
+		//unsigned char c;
+		//cin>>c;
 	}
-	cout<<endl;
-	cout<<"Time per source: "<<t/npts<<"ms"<<endl;
+	if(verbose)
+	{
+        cout<<endl;
+        cout<<"Time per source: "<<t/npts<<"ms"<<endl;
+    }
 
 }
 
@@ -304,3 +309,15 @@ void microscopeStruct::LoadExtendedSource(std::string filename)
 			}
         }
 }
+
+//Build a text report: the near-field description returned by nf.toStr(),
+//followed by an "Optics" section listing the objective NA range.
+std::string microscopeStruct::toStr()
+{
+	stringstream report;
+	report<<nf.toStr();
+
+	report<<"----------Optics--------------"<<endl<<endl
+	      <<"Objective NA: "<<objective[0]<<" to "<<objective[1]<<endl;
+	return report.str();
+}
diff --git a/microscope.h b/microscope.h
index 06edfdc..b36317c 100644
--- a/microscope.h
+++ b/microscope.h
@@ -63,6 +63,8 @@ struct microscopeStruct
 	scalarslice getTransmittance();
 	scalarslice getIntensity();
 
+	string toStr();
+
 
 
 };
diff --git a/montecarlo.cpp b/montecarlo.cpp
index aec9aa2..edb37dd 100644
--- a/montecarlo.cpp
+++ b/montecarlo.cpp
@@ -35,18 +35,12 @@ void mcSampleNA(bsVector* samples, int N, bsVector k, ptype NAin, ptype NAout)
     ptype inPhi = asin(NAin);
     ptype outPhi = asin(NAout);
 
-    //cout<<"inPhi: "<<inPhi<<endl;
-    //cout<<"outPhi: "<<outPhi<<endl;
-
     //calculate the z-values associated with these angles
     ptype inZ = cos(inPhi);
     ptype outZ = cos(outPhi);
 
     ptype rangeZ = inZ - outZ;
 
-    //cout<<"inZ: "<<inZ<<endl;
-    //cout<<"outZ: "<<outZ<<endl;
-
     //draw a distribution of random phi, z values
     ptype z, phi, theta;
     for(int i=0; i<N; i++)
@@ -58,7 +52,6 @@ void mcSampleNA(bsVector* samples, int N, bsVector k, ptype NAin, ptype NAout)
         phi = acos(z);
 
         //compute and store cartesian coordinates
-        //bsVector spherical(1, theta + kSph[1], phi + kSph[2]);
 		bsVector spherical(1, theta, phi);
 		bsVector cart = spherical.sph2cart();
         samples[i] = rotation * cart;
diff --git a/nearfield.cpp b/nearfield.cpp
index 0bb61d4..5b4af2c 100644
--- a/nearfield.cpp
+++ b/nearfield.cpp
@@ -1,9 +1,21 @@
 #include "nearfield.h"
+#include <time.h>
+#include <math.h>
+
+#ifdef _WIN32
+#define isnan(x) _isnan(x)
+#define isinf(x) (!_finite(x))
+#endif
+
+int bessjyv_sph(int v, double z, double &vm, double* cjv,
+    double* cyv, double* cjvp, double* cyvp);
 
 nearfieldStruct::nearfieldStruct()
 {
     scalarSim = true;
 	planeWave = false;
+	lut_us = true;
+	lut_uf = false;
 
 	nWaves = 0;
 }
@@ -46,6 +58,8 @@ std::string nearfieldStruct::toStr()
 	ss<<"Condenser NA: "<<condenser[0]<<" to "<<condenser[1]<<std::endl;
 	ss<<"Focal Point: "<<focus[0]<<", "<<focus[1]<<", "<<focus[2]<<std::endl;
 	ss<<"Field Slice: "<<std::endl;
+	if(lut_us)
+        ss<<"LUT Parameters --- min: "<<d_min<<"   max: "<<d_max<<std::endl;
 	ss<<pos<<std::endl;
 
 	ss<<std::endl<<"---------Materials-----------"<<std::endl;
@@ -61,6 +75,10 @@ std::string nearfieldStruct::toStr()
 	for(unsigned int s=0; s<sVector.size(); s++)
 		ss<<sVector[s].toStr()<<std::endl;
 
+    ss<<"---------Timings-------------"<<std::endl;
+    ss<<"Uf = "<<t_Uf<<"ms"<<std::endl;
+    ss<<"Us = "<<t_Us<<"ms"<<std::endl;
+
 	return ss.str();
 }
 
@@ -70,7 +88,8 @@ void nearfieldStruct::calcWaves()
     inWaves.resize(nWaves);
 
     //re-seed the random number generator
-    //srand(seed);
+    //srand(time(NULL));
+	srand(NULL);
 
     //calculate the monte-carlo samples
     mcSampleNA(&inWaves[0], nWaves, k, condenser[0], condenser[1]);
@@ -84,6 +103,8 @@ void nearfieldStruct::calcSpheres()
     //calculate all of the constants necessary to evaluate the scattered field
 	//estimate the order required to represent the scattered field for each sphere
 
+
+
 	//for each sphere
 	for(int i=0; i<sVector.size(); i++)
 	{
@@ -91,12 +112,10 @@ void nearfieldStruct::calcSpheres()
 
 		//calculate the required order
 		sVector[i].calcNl(lambda);
-		//std::cout<<sVector[i].Nl<<std::endl;
 
 		//set the refractive index for the sphere
 		int imat = sVector[i].iMaterial;
         rts::rtsComplex<ptype> n = mVector[imat](lambda);
-		//std::cout<<"Sphere refractive index: "<<n<<std::endl;
 
 		//calculate the scattering coefficients
 		sVector[i].calcCoeff(lambda, n);
@@ -104,18 +123,109 @@ void nearfieldStruct::calcSpheres()
 		//save the refractive index
 		sVector[i].n = n;
 
+		//if the LUT is used, calculate Usp(theta, r)
+		if(lut_us)
+		{
+			sVector[i].calcUp(lambda, n, pos, max(U.R[0], U.R[1]));
+        }
+
+
 	}
 
 }
 
+//Compute the scattered field Us, dispatching on the lut_us flag: the
+//look-up-table version (scalarUpLut) or the direct version (scalarUs).
+void nearfieldStruct::calcUs()
+{
+    if(!lut_us)
+        scalarUs();
+    else
+        scalarUpLut();
+}
+
+//Compute the focused field Uf: scalarUfLut() when lut_uf is set, else scalarUf().
+void nearfieldStruct::calcUf()
+{
+	if(!lut_uf) { scalarUf(); return; }
+
+	scalarUfLut();
+}
+
 void nearfieldStruct::Simulate()
 {
+    //initialize timings
+    t_Uf = 0;
+    t_Us = 0;
+
 	//compute a set of plane waves for Monte-Carlo simulation
 	calcWaves();
 
     //the near field has to be simulated no matter what the output rtsPoint is
-    scalarUf();
+    calcUf();
     calcSpheres();
-    scalarUs();
+    calcUs();
     sumUf();
+
+	//U.Mag().toImage("testU.bmp");
+}
+
+void nearfieldStruct::calcBesselLut(ptype* j, ptype d_min, ptype d_max, int dR)
+{
+    /*Compute the look-up-table of spherical Bessel functions of the first kind,
+        j_l(k*d), used for the incident field
+        j       =   output, dR x (Nl + 1) values: row id holds the orders
+                    l = 0..Nl at the distance d = d_min + id * dr
+        d_min   =   distance of the first sample
+        d_max   =   distance of the last sample
+        dR      =   number of samples along d (resolution of j)
+        Nl is taken from the member m and k = 2*PI/lambda
+    */
+
+    //nothing to compute
+    if(dR <= 0) return;
+
+	//compute the wavenumber
+	const ptype k = 2 * PI / lambda;
+	const int Nl = m;
+	const int nOrders = Nl + 1;
+
+    //scratch space for the Bessel functions of the first and second kind and
+    //  their derivatives (bessjyv_sph fills all four, only the first is kept)
+    const size_t bytes = sizeof(double) * nOrders;
+    double* cjv_kd = (double*)malloc(bytes);
+    double* cyv_kd = (double*)malloc(bytes);
+    double* cjvp_kd = (double*)malloc(bytes);
+    double* cyvp_kd = (double*)malloc(bytes);
+    double vm;      //written by bessjyv_sph, not used here
+
+    //sample spacing; a single sample sits at d_min (avoids dividing by zero)
+    const ptype dr = (dR > 1) ? (d_max - d_min) / (dR - 1) : 0;
+
+    //for each sample along d
+    for(int id = 0; id < dR; id++)
+    {
+        const double kd = k * (id * dr + d_min);
+        bessjyv_sph(Nl, kd, vm, cjv_kd, cyv_kd, cjvp_kd, cyvp_kd);
+
+        //copy the values into row id of the table
+        for(int l = 0; l <= Nl; l++)
+        {
+            ptype jv = cjv_kd[l];
+
+            //bessjyv_sph divides by its argument, so the result is not finite
+            //  at kd = 0: substitute the limit j_0(0) = 1 there and 0 for
+            //  every other non-finite entry
+            if(isnan(jv) || isinf(jv))
+                jv = (kd == 0 && l == 0) ? 1 : 0;
+
+            j[id * nOrders + l] = jv;
+        }
+    }
+
+    //release the scratch space
+    free(cjv_kd);
+    free(cyv_kd);
+    free(cjvp_kd);
+    free(cyvp_kd);
 }
diff --git a/nearfield.h b/nearfield.h
index 2b452df..9f36957 100644
--- a/nearfield.h
+++ b/nearfield.h
@@ -31,6 +31,8 @@ struct nearfieldStruct
 
 	//slices for the focused field
 	fieldslice Uf;
+	ptype d_min, d_max;
+
 	//	and total field: Uf + sum(Us)
 	fieldslice U;
 
@@ -43,6 +45,14 @@ struct nearfieldStruct
 	//flag for a plane wave
 	bool planeWave;
 
+	//flag for using a LUT
+	bool lut_uf;
+	bool lut_us;
+
+	//timings
+	float t_Uf;
+	float t_Us;
+
 
 
 	//---------Scatterers------------
@@ -78,10 +88,17 @@ struct nearfieldStruct
 	void setPos(bsPoint pMin, bsPoint pMax, bsVector normal);
 
 	//this function re-computes the focused field
+	void calcUf();
 	void scalarUf();
+	void scalarUfLut();
+
+	void calcBesselLut(ptype* j, ptype d_min, ptype d_max, int dR);
 
 	//compute the field scattered by all of the materials
+	void calcUs();
 	void scalarUs();
+	void scalarUpLut();
+
 
 	//add the incident field to the sum of scattered fields
 	void sumUf();
diff --git a/nfScalarUf.cu b/nfScalarUf.cu
index c19d767..adc61ab 100644
--- a/nfScalarUf.cu
+++ b/nfScalarUf.cu
@@ -5,7 +5,7 @@
 #include "rts/cuda/error.h"
 #include "rts/cuda/timer.h"
 
-
+//Incident field for a single plane wave
 __global__ void gpuScalarUfp(bsComplex* Uf, bsVector k, ptype kmag, bsPoint f, ptype A, bsRect ABCD, int uR, int vR)
 {
 	/*Compute the scalar focused field using Debye focusing
@@ -41,7 +41,8 @@ __global__ void gpuScalarUfp(bsComplex* Uf, bsVector k, ptype kmag, bsPoint f, p
 	Uf[i] = exp(d) * A;
 
 }
-
+
+//Incident field for a focused point source
 __global__ void gpuScalarUf(bsComplex* Uf, bsVector k, ptype kmag, bsPoint f, ptype A, bsRect ABCD, int uR, int vR, ptype cosAlpha, ptype cosBeta, int nl, ptype j_conv = 1.4)
 {
 	/*Compute the scalar focused field using Debye focusing
@@ -151,7 +152,6 @@ __global__ void gpuScalarUf(bsComplex* Uf, bsVector k, ptype kmag, bsPoint f, pt
 		}
 
 		sumUf += il * jl * Pl * (Palpha[1] - Palpha[2] - Pbeta[1] + Pbeta[2]);
-		//sumUf += il * Pl * (Palpha[1] - Palpha[2] - Pbeta[1] + Pbeta[2]);
 
 		il *= im;
 	}
@@ -162,21 +162,12 @@ __global__ void gpuScalarUf(bsComplex* Uf, bsVector k, ptype kmag, bsPoint f, pt
 
 void nearfieldStruct::scalarUf()
 {
-	//Compute the incident field via a scalar simulation
-	//This method uses Debye focusing to approximate the field analytically
-
-	//time the calculation of the focused field
-	//gpuStartTimer();
-
-	//set the field slice to a scalar field
-	//Uf.scalarField = true;
-
-	//initialize the GPU arrays
-	//Uf.init_gpu();
+
+    gpuStartTimer();
 
 	//create one thread for each pixel of the field slice
 	dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
-	dim3 dimGrid((Uf.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (Uf.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
+	dim3 dimGrid((Uf.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (Uf.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
 
 	//if we are computing a plane wave, call the gpuScalarUfp function
 	if(planeWave)
@@ -191,10 +182,7 @@ void nearfieldStruct::scalarUf()
 		ptype cosBeta = cos(asin(condenser[1]));
 		//compute the scalar Uf field (this will be in the x_hat channel of Uf)
 		gpuScalarUf<<<dimGrid, dimBlock>>>(Uf.x_hat, k, 2 * PI / lambda, focus, A, pos, Uf.R[0], Uf.R[1], cosAlpha, cosBeta, m);
-	}
-
-	//float t = gpuStopTimer();
-	//std::cout<<"Scalar Uf Time: "<<t<<"ms"<<std::endl;
-	//std::cout<<focus<<std::endl;
-
+	}
+
+	t_Uf = gpuStopTimer();
 }
diff --git a/nfScalarUfLut.cu b/nfScalarUfLut.cu
new file mode 100644
index 0000000..71abd73
--- /dev/null
+++ b/nfScalarUfLut.cu
@@ -0,0 +1,188 @@
+#include "nearfield.h"
+
+#include "rts/math/legendre.h"
+#include "rts/cuda/error.h"
+#include "rts/cuda/timer.h"
+
+texture<float, cudaTextureType2D> texJ;
+
+__global__ void gpuScalarUfp(bsComplex* Uf, bsVector k, ptype kmag, bsPoint f, ptype A, bsRect ABCD, int uR, int vR);
+
+__global__ void gpuScalarUfLut(bsComplex* Uf, bsRect ABCD, int uR, int vR, bsPoint f, bsVector k, ptype A, ptype cosAlpha, ptype cosBeta, int nl, ptype dmin, ptype dmax, int dR)
+{
+    /*Computes the focused incident field for one 2D slice. Each thread handles one
+    pixel of the slice and reads the Bessel terms from the texJ look-up texture.
+    Uf      =   destination field slice (pixel (iu, iv) is stored at iv*uR + iu)
+    ABCD    =   plane representing the field slice in world space
+    uR, vR  =   resolution of the Uf field
+    f       =   focal point of the condenser
+    k       =   direction of the incident light
+    A       =   amplitude of the incident field
+    cosAlpha=   cosine of the solid angle subtended by the condenser obscuration
+    cosBeta =   cosine of the solid angle subtended by the condenser aperture
+    nl      =   number of orders used to compute the field
+    dmin, dmax = distances from f that map onto the first and last texture sample
+    dR      =   number of Bessel function values in the look-up texture
+    */
+
+    //get the current coordinate in the plane slice
+	int iu = blockIdx.x * blockDim.x + threadIdx.x;
+	int iv = blockIdx.y * blockDim.y + threadIdx.y;
+
+	//make sure that the thread indices are in-bounds
+	if(iu >= uR || iv >= vR) return;
+
+	//compute the index (easier access to the scalar field array)
+	int i = iv*uR + iu;
+
+	//compute the parameters for u and v
+	ptype u = (ptype)iu / (uR);
+	ptype v = (ptype)iv / (vR);
+
+	//u and v are fractions in [0, 1) of the slice extent, because iu < uR and iv < vR
+
+	//get the rtsPoint in world space and then the r vector
+	bsPoint p = ABCD(u, v);
+	bsVector r = p - f;
+	ptype d = r.len();
+	//the focal point itself (d == 0) is written directly, before r is normalized
+	if(d == 0)
+	{
+        Uf[i] = A * 2 * PI * (cosAlpha - cosBeta);
+        return;
+    }
+
+	//normalize r; from here on only its direction is used (the length is kept in d)
+	r = r.norm();
+
+	//compute the imaginary factor i^l
+	bsComplex im = bsComplex(0, 1);
+	bsComplex il = bsComplex(1, 0);
+
+	//Legendre functions are computed dynamically to save memory
+	//initialize the Legendre functions
+
+	ptype P[2];
+	//k.dot(r) is used as the cosine of the angle between k and r (light direction and position vector)
+	ptype cosTheta;
+	cosTheta = k.dot(r);
+
+	rts::init_legendre<ptype>(cosTheta, P[0], P[1]);
+
+	//initialize legendre functions for the cassegrain angles
+	ptype Palpha[3];
+	rts::init_legendre<ptype>(cosAlpha, Palpha[0], Palpha[1]);
+	Palpha[2] = 1;
+
+	ptype Pbeta[3];
+	rts::init_legendre<ptype>(cosBeta, Pbeta[0], Pbeta[1]);
+	Pbeta[2] = 1;
+
+	//sum the series over each order l; di is the fractional texture coordinate for distance d
+	bsComplex sumUf(0.0, 0.0);
+	ptype jl = 0.0;
+	ptype Pl;
+	ptype di = ( (d - dmin)/(dmax - dmin) ) * (dR - 1);
+	for(int l = 0; l<=nl; l++)
+	{
+        jl = tex2D(texJ, l + 0.5, di + 0.5);
+		if(l==0)
+			Pl = P[0];
+		else if(l==1)
+		{
+			Pl = P[1];
+
+			//adjust the cassegrain Legendre function
+			Palpha[2] = Palpha[0];
+			rts::shift_legendre<ptype>(l+1, cosAlpha, Palpha[0], Palpha[1]);
+			Pbeta[2] = Pbeta[0];
+			rts::shift_legendre<ptype>(l+1, cosBeta, Pbeta[0], Pbeta[1]);
+		}
+		else
+		{
+			rts::shift_legendre<ptype>(l, cosTheta, P[0], P[1]);
+
+			Pl = P[1];
+
+			//adjust the cassegrain outer Legendre function
+			Palpha[2] = Palpha[0];
+			rts::shift_legendre<ptype>(l+1, cosAlpha, Palpha[0], Palpha[1]);
+			Pbeta[2] = Pbeta[0];
+			rts::shift_legendre<ptype>(l+1, cosBeta, Pbeta[0], Pbeta[1]);
+		}
+
+		sumUf += il * jl * Pl * (Palpha[1] - Palpha[2] - Pbeta[1] + Pbeta[2]);
+		//sumUf += jl;
+
+		il *= im;
+	}
+
+	Uf[i] = sumUf * 2 * PI * A;
+	//Uf[i] = u;
+	//return;
+}
+
+void nearfieldStruct::scalarUfLut()
+{
+    //Computes the incident field Uf on the GPU; the focused case reads its Bessel terms
+    //from a look-up table bound to texJ. The value of gpuStopTimer() is stored in t_Uf.
+    gpuStartTimer();
+
+    //calculate the minimum and maximum points in the focused field
+    d_min = pos.dist(focus);
+    d_max = pos.dist_max(focus);
+
+    //allocate host space for the Bessel LUT: dR distance samples for each of the m+1 orders
+    int dR = 2 * max(Uf.R[0], Uf.R[1]);
+    ptype* j = (ptype*) malloc(sizeof(ptype) * dR * (m+1));
+	//calculate Bessel function LUT
+	calcBesselLut(j, d_min, d_max, dR);
+
+    //create a CUDA array ((m+1) texels wide, dR texels high) with one 32-bit float channel
+	cudaArray* arrayJ;
+    cudaChannelFormatDesc channelDesc =
+        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+    HANDLE_ERROR(cudaMallocArray(&arrayJ, &channelDesc, m+1, dR));
+
+    //specify texture properties
+    texJ.addressMode[0] = cudaAddressModeMirror;
+    texJ.addressMode[1] = cudaAddressModeMirror;
+    texJ.filterMode     = cudaFilterModeLinear;
+    texJ.normalized     = false;
+
+    //bind the texture to the array
+    HANDLE_ERROR(cudaBindTextureToArray(texJ, arrayJ, channelDesc));
+
+    //copy the CPU Bessel LUT to the GPU-based array
+    //NOTE(review): the pitch and width use sizeof(float) while j holds ptype - confirm they match
+    HANDLE_ERROR( cudaMemcpy2DToArray(arrayJ, 0, 0, j, (m+1)*sizeof(float), (m+1)*sizeof(float), dR, cudaMemcpyHostToDevice));
+
+    //----------------Compute the focused field
+    //create one thread for each pixel of the field slice
+	dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
+	dim3 dimGrid((Uf.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (Uf.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
+
+	//if we are computing a plane wave, call the gpuScalarUfp function
+	if(planeWave)
+	{
+		gpuScalarUfp<<<dimGrid, dimBlock>>>(Uf.x_hat, k, 2 * PI / lambda, focus, A, pos, Uf.R[0], Uf.R[1]);
+	}
+	//otherwise compute the condenser info and create a focused field
+	else
+	{
+		//pre-compute the cosine of the obscuration and objective angles
+		ptype cosAlpha = cos(asin(condenser[0]));
+		ptype cosBeta = cos(asin(condenser[1]));
+		//compute the scalar Uf field (this will be in the x_hat channel of Uf)
+		gpuScalarUfLut<<<dimGrid, dimBlock>>>(Uf.x_hat, pos, Uf.R[0], Uf.R[1], focus, k, A, cosAlpha, cosBeta, m, d_min, d_max, dR);
+	}
+	//a kernel launch returns no status of its own, so fetch any launch error explicitly
+	HANDLE_ERROR(cudaGetLastError());
+
+	//free everything (the texture is unbound first so it never refers to a freed array)
+	free(j);
+	HANDLE_ERROR(cudaUnbindTexture(texJ));
+	HANDLE_ERROR(cudaFreeArray(arrayJ));
+
+	t_Uf = gpuStopTimer();
+}
diff --git a/nfScalarUpLut.cu b/nfScalarUpLut.cu
new file mode 100644
index 0000000..9a2c0a1
--- /dev/null
+++ b/nfScalarUpLut.cu
@@ -0,0 +1,192 @@
+#include "nearfield.h"
+#include "rts/math/spherical_bessel.h"
+#include "rts/math/legendre.h"
+#include <stdlib.h>
+#include "rts/cuda/error.h"
+#include "rts/cuda/timer.h"
+
+texture<float2, cudaTextureType2D> texUsp;
+texture<float2, cudaTextureType2D> texUip;
+
+__global__ void gpuScalarUpLut(bsComplex* Us, bsVector* k, int nk, ptype kmag, ptype a, ptype dmin, ptype dmax, bsPoint f, bsPoint ps, ptype A, bsRect ABCD, int uR, int vR, int dR, int aR, int thetaR)
+{
+    /*This function uses Monte-Carlo integration to sample a texture-based LUT describing the scattered field
+        produced by a plane wave through a sphere.  The MC sampling is used to approximate a focused field.
+        Us  =   final scattered field (this kernel adds its result to the value already stored)
+        k   =   list of incoming plane waves (Monte-Carlo samples)
+        nk  =   number of incoming MC samples
+        kmag=   magnitude of the incoming field 2pi/lambda
+        a   =   radius of the sphere (points with d < a read the Uip texture, all others Usp)
+        dmin, dmax = minimum and maximum distance of the Usp texture
+        f   =   position of the focus
+        ps  =   position of the sphere
+        A   =   total amplitude of the incident field arriving at the focal spot
+        ABCD=   rectangle representing the field slice
+        uR, vR = resolution of the field slice in the u and v directions
+        dR  =   resolution of the Usp texture in the d direction
+        aR  =   resolution of the Uip texture in the d direction
+        thetaR= resolution of the Usp texture in the theta direction
+    */
+
+	//get the current coordinate in the plane slice
+	int iu = blockIdx.x * blockDim.x + threadIdx.x;
+	int iv = blockIdx.y * blockDim.y + threadIdx.y;
+
+	//make sure that the thread indices are in-bounds
+	if(iu >= uR || iv >= vR) return;
+
+	//compute the index (easier access to the scalar field array)
+	int i = iv*uR + iu;
+
+	//compute the parameters for u and v
+	ptype u = (ptype)iu / (uR);
+	ptype v = (ptype)iv / (vR);
+
+	//get the rtsPoint in world space and then the r vector
+	bsPoint p = ABCD(u, v);
+	bsVector r = p - ps;
+	ptype d = r.len();
+	float di = ( (d - max(a, dmin))/(dmax - max(a, dmin)) ) * (dR - 1);
+	float ai = ( (d - dmin)/(a - dmin)) * (aR - 1);
+
+	//normalize r once (it is the same for every wave); at the sphere center (d == 0) r is left as the zero vector so that thetai stays finite
+	if(d > 0)
+		r = r.norm();
+	//offset of the sphere from the focus, shared by every per-wave phase factor
+	bsVector c = ps - f;
+
+    bsComplex sumUs(0, 0);
+    //for each plane wave in the wave list
+    for(int iw = 0; iw < nk; iw++)
+    {
+        //inner product of the direction vectors, clamped to the domain of acos
+        ptype cos_theta = k[iw].dot(r);
+        if(cos_theta < -1)
+            cos_theta = -1;
+        if(cos_theta > 1)
+            cos_theta = 1;
+        float thetai = ( acos(cos_theta) / PI ) * (thetaR - 1);
+
+        //compute the phase factor for spheres that are not at the origin
+        bsComplex phase = exp(bsComplex(0, kmag * k[iw].dot(c)));
+        //compute the internal field if we are inside a sphere
+        if(d < a)
+        {
+			float2 Uip = tex2D(texUip, ai + 0.5, thetai + 0.5);
+			sumUs += (1.0/nk) * A * phase * bsComplex(Uip.x, Uip.y);
+        }
+        //otherwise compute the scattered field
+        else
+        {
+            float2 Usp = tex2D(texUsp, di + 0.5, thetai + 0.5);
+            sumUs += (1.0/nk) * A * phase * bsComplex(Usp.x, Usp.y);
+        }
+    }
+    Us[i] += sumUs;
+}
+
+void nearfieldStruct::scalarUpLut()
+{
+    //Clears U and then adds the field of every sphere in sVector to it, sampling the
+    //per-sphere Usp and Uip look-up tables through textures. The time is stored in t_Us.
+    //get the number of spheres
+	int nSpheres = sVector.size();
+
+	//if there are no spheres, nothing to do here
+	if(nSpheres == 0)
+		return;
+
+	//time the calculation of the scattered field
+	gpuStartTimer();
+
+	//clear the scattered field
+	U.clear_gpu();
+
+	//create one thread for each pixel of the field slice
+	dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
+	dim3 dimGrid((U.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (U.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
+
+    //copy Monte-Carlo samples to the GPU and determine the incident amplitude (plane-wave specific stuff)
+    bsVector* gpuk;
+    int nWaves;
+    ptype subA;
+    if(planeWave)
+    {
+        nWaves = 1;
+        HANDLE_ERROR(cudaMalloc( (void**)&gpuk, sizeof(bsVector) ) );
+        HANDLE_ERROR(cudaMemcpy( gpuk, &k, sizeof(bsVector), cudaMemcpyHostToDevice));
+        subA = A;
+    }
+    else
+    {
+        nWaves = inWaves.size();
+        HANDLE_ERROR(cudaMalloc( (void**)&gpuk, sizeof(bsVector) * nWaves ) );
+        HANDLE_ERROR(cudaMemcpy( gpuk, &inWaves[0], sizeof(bsVector) * nWaves, cudaMemcpyHostToDevice));
+        //compute the amplitude that makes it through the condenser
+        subA = 2 * PI * A * ( (1 - cos(asin(condenser[1]))) - (1 - cos(asin(condenser[0]))) );
+    }
+
+	//for each sphere
+	for(int s = 0; s<nSpheres; s++)
+	{
+        //allocate the cuda arrays that back the Usp and Uip textures of this sphere
+        cudaArray* arrayUsp;
+		cudaArray* arrayUip;
+        cudaChannelFormatDesc channelDescUsp =
+            cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
+		cudaChannelFormatDesc channelDescUip =
+            cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
+        int dR = sVector[s].Usp.R[0];
+        int thetaR = sVector[s].Usp.R[1];
+		int aR = sVector[s].Uip.R[0];
+        HANDLE_ERROR(cudaMallocArray(&arrayUsp, &channelDescUsp, dR, thetaR));
+		HANDLE_ERROR(cudaMallocArray(&arrayUip, &channelDescUip, aR, thetaR));
+
+        texUsp.addressMode[0] = cudaAddressModeMirror;
+        texUsp.addressMode[1] = cudaAddressModeMirror;
+        texUsp.filterMode     = cudaFilterModeLinear;
+        texUsp.normalized     = false;
+
+		texUip.addressMode[0] = cudaAddressModeMirror;
+        texUip.addressMode[1] = cudaAddressModeMirror;
+        texUip.filterMode     = cudaFilterModeLinear;
+        texUip.normalized     = false;
+        HANDLE_ERROR(cudaBindTextureToArray(texUsp, arrayUsp, channelDescUsp));
+		HANDLE_ERROR(cudaBindTextureToArray(texUip, arrayUip, channelDescUip));
+
+        //copy both LUTs into the texture arrays (device-to-device copies)
+        HANDLE_ERROR( cudaMemcpy2DToArray(arrayUsp, 0, 0, sVector[s].Usp.x_hat, dR*sizeof(float2), dR*sizeof(float2), thetaR, cudaMemcpyDeviceToDevice));
+		HANDLE_ERROR( cudaMemcpy2DToArray(arrayUip, 0, 0, sVector[s].Uip.x_hat, aR*sizeof(float2), aR*sizeof(float2), thetaR, cudaMemcpyDeviceToDevice));
+
+        gpuScalarUpLut<<<dimGrid, dimBlock>>>(U.x_hat,
+                                            gpuk,
+                                            nWaves,
+                                            2 * PI / lambda,
+                                            sVector[s].a,
+                                            sVector[s].d_min,
+                                            sVector[s].d_max,
+                                            focus,
+                                            sVector[s].p,
+                                            subA,
+                                            pos,
+                                            U.R[0],
+                                            U.R[1],
+                                            dR,
+                                            aR,
+											thetaR);
+        //a kernel launch returns no status of its own, so fetch any launch error explicitly
+        HANDLE_ERROR(cudaGetLastError());
+
+        //release this sphere's textures: unbind first, then free the backing arrays
+        HANDLE_ERROR(cudaUnbindTexture(texUsp));
+        HANDLE_ERROR(cudaUnbindTexture(texUip));
+        HANDLE_ERROR(cudaFreeArray(arrayUsp));
+        HANDLE_ERROR(cudaFreeArray(arrayUip));
+	}
+
+    //store the time to compute the scattered field
+	t_Us = gpuStopTimer();
+
+	//free monte-carlo samples
+	HANDLE_ERROR(cudaFree(gpuk));
+}
diff --git a/nfScalarUs.cu b/nfScalarUs.cu
index d1a79fc..813fb0b 100644
--- a/nfScalarUs.cu
+++ b/nfScalarUs.cu
@@ -163,7 +163,7 @@ void nearfieldStruct::scalarUs()
 		return;
 
 	//time the calculation of the focused field
-	//gpuStartTimer();
+	gpuStartTimer();
 
 	//clear the scattered field
 	U.clear_gpu();
@@ -251,9 +251,8 @@ void nearfieldStruct::scalarUs()
 	}
 
 
+    //store the time to compute the scattered field
+	t_Us = gpuStopTimer();
 
-	//float t = gpuStopTimer();
-	//std::cout<<"Scalar Us Time: "<<t<<"ms"<<std::endl;
-	//std::cout<<focus<<std::endl;
 
 }
diff --git a/nfSumUf.cu b/nfSumUf.cu
index 69b6e20..db55007 100644
--- a/nfSumUf.cu
+++ b/nfSumUf.cu
@@ -32,7 +32,7 @@ __global__ void gpuScalarUsp(bsComplex* Ufx, bsComplex* Ufy, bsComplex* Ufz,
 	{
 		r = p - ps[is];
 		d = r.len();
-		if(d <= as[is])
+		if(d < as[is])
 			return;
 	}
 
@@ -110,8 +110,5 @@ void nearfieldStruct::sumUf()
     HANDLE_ERROR(cudaFree(gpu_p));
     HANDLE_ERROR(cudaFree(gpu_a));
 
-	//float t = gpuStopTimer();
-	//std::cout<<"Add Us Time: "<<t<<"ms"<<std::endl;
-	//std::cout<<focus<<std::endl;
 
 }
diff --git a/options.h b/options.h
index 0c7aee8..e8cefa6 100644
--- a/options.h
+++ b/options.h
@@ -5,7 +5,7 @@
 
 #include "nearfield.h"
 #include "microscope.h"
-#include "colormap.h"
+#include "rts/graphics/colormap.h"
 #include "fileout.h"
 //extern nearfieldStruct* NF;
 extern microscopeStruct* SCOPE;
@@ -23,7 +23,179 @@ using namespace std;
 #include <boost/program_options.hpp>
 namespace po = boost::program_options;
 
-static void loadSpheres(string sphereList)
+extern bool verbose;
+
+
+
+static void lNearfield(po::variables_map vm)
+{
+	//Copies the incident-field options from the parsed command line into SCOPE->nf.
+	//A malformed "focus" or "k" option prints an error and terminates the program.
+
+	//plane-wave mode: the "plane-wave" flag inverts the compiled-in default
+	bool usePlaneWave = DEFAULT_PLANEWAVE;
+	if(vm.count("plane-wave") != 0)
+		usePlaneWave = !usePlaneWave;
+	SCOPE->nf.planeWave = usePlaneWave;
+
+	//amplitude of the incident field
+	SCOPE->nf.A = vm["amplitude"].as<ptype>();
+
+	//condenser parameters: start from the defaults, then apply what was given
+	SCOPE->nf.condenser[0] = DEFAULT_CONDENSER_MIN;
+	SCOPE->nf.condenser[1] = DEFAULT_CONDENSER_MAX;
+	if(vm.count("condenser") != 0)
+	{
+		const vector<ptype>& condenserArgs = vm["condenser"].as< vector<ptype> >();
+
+		//a single value replaces only entry 1; otherwise the first two values replace both
+		const bool oneValue = (condenserArgs.size() == 1);
+		if(!oneValue)
+			SCOPE->nf.condenser[0] = condenserArgs[0];
+		SCOPE->nf.condenser[1] = oneValue ? condenserArgs[0] : condenserArgs[1];
+	}
+
+	//focal point: defaults unless the "focus" option supplies three components
+	SCOPE->nf.focus[0] = DEFAULT_FOCUS_X;
+	SCOPE->nf.focus[1] = DEFAULT_FOCUS_Y;
+	SCOPE->nf.focus[2] = DEFAULT_FOCUS_Z;
+	if(vm.count("focus") != 0)
+	{
+		const vector<ptype>& focusArgs = vm["focus"].as< vector<ptype> >();
+
+		//anything other than three components is a fatal error
+		if(focusArgs.size() != 3)
+		{
+			cout<<"BIMSIM Error - the incident focal point is incorrectly specified; it must have three components."<<endl;
+			exit(1);
+		}
+
+		for(int c = 0; c < 3; c++)
+			SCOPE->nf.focus[c] = focusArgs[c];
+	}
+
+	//incident light direction (k-vector), kept in spherical form until converted below
+	bsVector kSpherical(1, 0, 0);
+	if(vm.count("k") != 0)
+	{
+		const vector<ptype>& kArgs = vm["k"].as< vector<ptype> >();
+
+		//the option must carry exactly two elements
+		if(kArgs.size() != 2)
+		{
+			cout<<"BIMSIM Error - k-vector is not specified correctly: it must contain two elements"<<endl;
+			exit(1);
+		}
+
+		//the two elements fill components 1 and 2; component 0 stays at 1
+		kSpherical[1] = kArgs[0];
+		kSpherical[2] = kArgs[1];
+	}
+
+	SCOPE->nf.k = kSpherical.sph2cart();
+
+	//incident field order
+	SCOPE->nf.m = vm["field-order"].as<int>();
+
+	//number of Monte-Carlo samples
+	SCOPE->nf.nWaves = vm["samples"].as<int>();
+
+	//seed rand() only when a seed is supplied
+	if(vm.count("seed") != 0)
+		srand(vm["seed"].as<unsigned int>());
+}
+
+
+static void loadOutputParams(po::variables_map vm)
+{
+    //Fills the global gFileOut structure from the output-related options:
+    //append mode, the field component to save, the image file names and the colormap.
+
+    //append to previous binary files when "append" is given, otherwise use the default
+    if(vm.count("append") != 0)
+        gFileOut.append = true;
+    else
+        gFileOut.append = DEFAULT_APPEND;
+
+    //component of the field to be saved; an unlisted name leaves gFileOut.field untouched
+    const std::string fieldName = vm["output-type"].as<string>();
+
+    if(fieldName == "real")
+        gFileOut.field = fileoutStruct::fieldReal;
+    else if(fieldName == "imaginary")
+        gFileOut.field = fileoutStruct::fieldImag;
+    else if(fieldName == "magnitude")
+        gFileOut.field = fileoutStruct::fieldMag;
+    else if(fieldName == "intensity")
+        gFileOut.field = fileoutStruct::fieldIntensity;
+    else if(fieldName == "polarization")
+        gFileOut.field = fileoutStruct::fieldPolar;
+    else if(fieldName == "angular-spectrum")
+        gFileOut.field = fileoutStruct::fieldAngularSpectrum;
+
+    //image file names
+    gFileOut.intFile = vm["intensity"].as<string>();
+    gFileOut.absFile = vm["absorbance"].as<string>();
+    gFileOut.transFile = vm["transmittance"].as<string>();
+    gFileOut.nearFile = vm["near-field"].as<string>();
+    gFileOut.farFile = vm["far-field"].as<string>();
+
+    //colormap; an unknown name is reported and gFileOut.colormap is not assigned
+    const std::string mapName = vm["colormap"].as<string>();
+    if(mapName == "gray")
+        gFileOut.colormap = rts::cmGrayscale;
+    else if(mapName == "brewer")
+        gFileOut.colormap = rts::cmBrewer;
+    else
+        cout<<"color-map value not recognized (using default): "<<mapName<<endl;
+}
+
+void lFlags(po::variables_map vm, po::options_description desc)
+{
+    //Applies the stand-alone flags: help, verbose and the LUT/recursive method selection.
+    //display help and exit
+	if(vm.count("help"))
+	{
+		cout<<desc<<endl;
+		exit(1);
+	}
+
+    //flag for verbose output
+	if(vm.count("verbose"))
+        verbose = true;
+
+    //"recursive" switches both look-up tables off and overrides the two flags below
+    if(vm.count("recursive"))
+    {
+        SCOPE->nf.lut_us = false;
+        SCOPE->nf.lut_uf = false;
+        return;
+    }
+    //"recursive-us" and "lut-uf" are independent of each other: apply each on its own
+    if(vm.count("recursive-us"))
+        SCOPE->nf.lut_us = false;
+    if(vm.count("lut-uf"))
+        SCOPE->nf.lut_uf = true;
+}
+
+void lWavelength(po::variables_map vm)
+{
+    //Sets SCOPE->nf.lambda from either the "nu" or the "lambda" option and records
+    //in gFileOut.wavenumber which of the two supplied it.
+    const bool givenAsNu = (vm.count("nu") != 0);
+
+    //"nu" is converted with lambda = 10000/nu;
+    //"lambda" is stored unchanged
+    if(givenAsNu)
+        SCOPE->nf.lambda = 10000/vm["nu"].as<ptype>();
+    else
+        SCOPE->nf.lambda = vm["lambda"].as<ptype>();
+
+    //true when the value came from "nu", false when it came from "lambda"
+    gFileOut.wavenumber = givenAsNu;
+}
+
+static void lSpheres(string sphereList)
 {
     /*This function loads a list of sphere given in the string sphereList
         The format is:
@@ -58,17 +230,60 @@ static void loadSpheres(string sphereList)
         //check out the next element (this should set the EOF error flag)
         ss.peek();
     }
+}
 
+void lSpheres(po::variables_map vm)
+{
+    //if a sphere is specified at the command line
+    if(vm.count("spheres"))
+    {
+        //convert the sphere to a string
+        vector<ptype> sdesc = vm["spheres"].as< vector<ptype> >();
 
+        //compute the number of spheres specified
+        unsigned int nS;
+        if(sdesc.size() <= 5)
+            nS = 1;
+        else
+        {
+            //if the number of parameters is divisible by 5, compute the number of spheres
+            if(sdesc.size() % 5 == 0)
+                nS = sdesc.size() / 5;
+            else
+            {
+                cout<<"BIMSIM Error: Invalid number of sphere parameters."<<endl;
+                exit(1);
+            }
+        }
 
-}
+        stringstream ss;
+
+        //for each sphere
+        for(unsigned int s=0; s<nS; s++)
+        {
+            //compute the number of sphere parameters
+            unsigned int nP;
+            if(nS == 1) nP = sdesc.size();
+            else nP = 5;
+
+            //store each parameter as a string
+            for(unsigned int i=0; i<nP; i++)
+            {
+                ss<<sdesc[s*5 + i]<<" ";
+            }
+            ss<<endl;
+        }
+
+
+
+        //convert the string to a sphere list
+        lSpheres(ss.str());
+    }
 
-static void loadSpheres(po::variables_map vm)
-{
     //if a files are specified
     if(vm.count("sphere-file"))
     {
-        cout<<"Sphere files detected."<<endl;
+
         vector<string> filenames = vm["sphere-file"].as< vector<string> >();
         //load each file
         for(int iS=0; iS<filenames.size(); iS++)
@@ -85,69 +300,51 @@ static void loadSpheres(po::variables_map vm)
             std::string instr((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
 
             //load the list of spheres from a string
-            loadSpheres(instr);
+            lSpheres(instr);
         }
     }
 
-    //load the sphere from the command line
-    if(vm.count("sx") || vm.count("sy") || vm.count("sz") || vm.count("s"))
-    {
-        //create a new sphere
-        sphere newS;
-
-		//set defaults
-		if(vm.count("sx"))
-            newS.p[0] = vm["sx"].as<ptype>();
-        else
-            newS.p[0] = DEFAULT_SPHERE_X;
-
-
-		if(vm.count("sy"))
-            newS.p[1] = vm["sy"].as<ptype>();
-        else
-            newS.p[1] = DEFAULT_SPHERE_Y;
-
-		if(vm.count("sz"))
-            newS.p[2] = vm["sz"].as<ptype>();
-        else
-            newS.p[2] = DEFAULT_SPHERE_Z;
-
-		if(vm.count("radius"))
-            newS.a = vm["radius"].as<ptype>();
-        else
-            newS.a = DEFAULT_SPHERE_A;
-
-        //add the sphere to the sphere vector
-        SCOPE->nf.sVector.push_back(newS);
+    //make sure the appropriate materials are loaded
+    unsigned int nS = SCOPE->nf.sVector.size();
 
+    //for each sphere
+    for(unsigned int s = 0; s<nS; s++)
+    {
+        //make sure the corresponding material exists
+        if(SCOPE->nf.sVector[s].iMaterial + 1 > SCOPE->nf.mVector.size())
+        {
+            //otherwise output an error
+            cout<<"BIMSIM Error - A material is not loaded for sphere "<<s+1<<"."<<endl;
+            exit(1);
+        }
     }
 }
 
-static void loadMaterials(po::variables_map vm)
+static void lMaterials(po::variables_map vm)
 {
 	//if materials are specified at the command line
 	if(vm.count("materials"))
 	{
 		vector<ptype> matVec = vm["materials"].as< vector<ptype> >();
-		if(matVec.size() %2 != 0)
+		if(matVec.size() == 1)
+		{
+			rts::material<ptype> newM(SCOPE->nf.lambda, matVec[0], 0);
+			SCOPE->nf.mVector.push_back(newM);
+		}
+		else if(matVec.size() %2 != 0)
 		{
 			cout<<"BIMSim Error: materials must be specified in n, k pairs"<<endl;
 			exit(1);
 		}
-
-
-		for(int i=0; i<matVec.size(); i+=2)
+		else
 		{
-			rts::material<ptype> newM(SCOPE->nf.lambda, matVec[i], matVec[i+1]);
-			SCOPE->nf.mVector.push_back(newM);
+			for(int i=0; i<matVec.size(); i+=2)
+			{
+				rts::material<ptype> newM(SCOPE->nf.lambda, matVec[i], matVec[i+1]);
+				SCOPE->nf.mVector.push_back(newM);
+			}
 		}
 	}
-	else
-	{
-		//add the command line material as the default (material 0)
-		rts::material<ptype> newM(SCOPE->nf.lambda, vm["n"].as<ptype>(), vm["k"].as<ptype>());
-		SCOPE->nf.mVector.push_back(newM);
-	}
 
 	//if file names are specified, load the materials
 	if(vm.count("material-file"))
@@ -169,57 +366,109 @@ static void loadMaterials(po::variables_map vm)
 
 }
 
-static void loadNearfieldParams(po::variables_map vm)
+static void lOptics(po::variables_map vm)
 {
-	//test to see if we are simulating a plane wave
-	bool planeWave = DEFAULT_PLANEWAVE;
-	if(vm.count("plane-wave"))
-		planeWave = !planeWave;
-	SCOPE->nf.planeWave = planeWave;
-
-	//get the wavelength
-    //SCOPE->nf.lambda = vm["lambda"].as<ptype>();
-
-	//get the incident field amplitude
-	SCOPE->nf.A = vm["amplitude"].as<ptype>();
-
-	//get the condenser parameters
-    SCOPE->nf.condenser[0] = vm["condenser-min"].as<ptype>();
-    SCOPE->nf.condenser[1] = vm["condenser-max"].as<ptype>();
-
-
-	//get the focal rtsPoint position
-    SCOPE->nf.focus[0] = vm["fx"].as<ptype>();
-    SCOPE->nf.focus[1] = vm["fy"].as<ptype>();
-    SCOPE->nf.focus[2] = vm["fz"].as<ptype>();
-
-	//get the incident light direction (k-vector)
-	bsVector spherical;
-	spherical[0] = 1.0;
-    spherical[1] = vm["theta"].as<ptype>();
-    spherical[2] = vm["phi"].as<ptype>();
-	SCOPE->nf.k = spherical.sph2cart();
-
-
-    //incident field order
-    SCOPE->nf.m = vm["field-order"].as<int>();
-
-    //number of Monte-Carlo samples
-    SCOPE->nf.nWaves = vm["samples"].as<int>();
-
-
+    SCOPE->objective[0] = DEFAULT_OBJECTIVE_MIN;
+    SCOPE->objective[1] = DEFAULT_OBJECTIVE_MAX;
+    if(vm.count("objective"))
+    {
+        vector<ptype> oparams = vm["objective"].as< vector<ptype> >();
 
+        if(oparams.size() == 1)
+            SCOPE->objective[1] = oparams[0];
+        else
+        {
+            SCOPE->objective[0] = oparams[0];
+            SCOPE->objective[1] = oparams[1];
+        }
+    }
 }
 
-static void loadSliceParams(po::variables_map vm)
+static void lImagePlane(po::variables_map vm)
 {
-    //parameters for the sample plane
-
+	bsPoint pMin(DEFAULT_PLANE_MIN_X, DEFAULT_PLANE_MIN_Y, DEFAULT_PLANE_MIN_Z);
+	bsPoint pMax(DEFAULT_PLANE_MAX_X, DEFAULT_PLANE_MAX_Y, DEFAULT_PLANE_MAX_Z);
+	bsVector normal(DEFAULT_PLANE_NORM_X, DEFAULT_PLANE_NORM_Y, DEFAULT_PLANE_NORM_Z);
 
 	//set the default values for the slice position and orientation
-	bsPoint pMin(vm["plane-min-x"].as<ptype>(), vm["plane-min-y"].as<ptype>(), vm["plane-min-z"].as<ptype>());
-	bsPoint pMax(vm["plane-max-x"].as<ptype>(), vm["plane-max-y"].as<ptype>(), vm["plane-max-z"].as<ptype>());
-	bsVector normal(vm["plane-norm-x"].as<ptype>(), vm["plane-norm-y"].as<ptype>(), vm["plane-norm-z"].as<ptype>());
+	if(vm.count("plane-lower-left") && vm.count("plane-upper-right") && vm.count("plane-normal"))
+	{
+		vector<ptype> ll = vm["plane-lower-left"].as< vector<ptype> >();
+		if(ll.size() != 3)
+		{
+			cout<<"BIMSIM Error - The lower-left corner of the image plane is incorrectly specified."<<endl;
+			exit(1);
+		}
+		//the upper-right corner is read from its own option, "plane-upper-right"
+		vector<ptype> ur = vm["plane-upper-right"].as< vector<ptype> >();
+		if(ur.size() != 3)
+		{
+			cout<<"BIMSIM Error - The upper-right corner of the image plane is incorrectly specified."<<endl;
+			exit(1);
+		}
+		//likewise the normal is read from "plane-normal"
+		vector<ptype> norm = vm["plane-normal"].as< vector<ptype> >();
+		if(norm.size() != 3)
+		{
+			cout<<"BIMSIM Error - The normal of the image plane is incorrectly specified."<<endl;
+			exit(1);
+		}
+		//all three vectors have three components: use them as the plane corners and normal
+		pMin = bsPoint(ll[0], ll[1], ll[2]);
+		pMax = bsPoint(ur[0], ur[1], ur[2]);
+		normal = bsVector(norm[0], norm[1], norm[2]);
+	}
+	else if(vm.count("xy"))
+	{
+		//default plane size in microns
+		ptype s = DEFAULT_PLANE_SIZE;
+		ptype pos = DEFAULT_PLANE_POSITION;
+
+		vector<ptype> xy = vm["xy"].as< vector<ptype> >();
+		if(xy.size() >= 1)
+			s = xy[0];
+		if(xy.size() >= 2)
+			pos = xy[1];
+
+		//calculate the plane corners and normal based on the size and position
+		pMin = bsPoint(-s/2, -s/2, pos);
+		pMax = bsPoint(s/2, s/2, pos);
+		normal = bsVector(0, 0, 1);
+	}
+	else if(vm.count("xz"))
+	{
+		//default plane size in microns
+		ptype size = DEFAULT_PLANE_SIZE;
+		ptype pos = DEFAULT_PLANE_POSITION;
+
+		vector<ptype> xz = vm["xz"].as< vector<ptype> >();
+		if(xz.size() >= 1)
+			size = xz[0];
+		if(xz.size() >= 2)
+			pos = xz[1];
+
+		//calculate the plane corners and normal based on the size and position
+		pMin = bsPoint(-size/2, pos, -size/2);
+		pMax = bsPoint(size/2, pos, size/2);
+		normal = bsVector(0, -1, 0);
+	}
+	else if(vm.count("yz"))
+	{
+		//default plane size in microns
+		ptype size = DEFAULT_PLANE_SIZE;
+		ptype pos = DEFAULT_PLANE_POSITION;
+
+		vector<ptype> yz = vm["yz"].as< vector<ptype> >();
+		if(yz.size() >= 1)
+			size = yz[0];
+		if(yz.size() >= 2)
+			pos = yz[1];
+
+		//calculate the plane corners and normal based on the size and position
+		pMin = bsPoint(pos, -size/2, -size/2);
+		pMax = bsPoint(pos, size/2, size/2);
+		normal = bsVector(1, 0, 0);
+	}
 	SCOPE->setPos(pMin, pMax, normal);
 
 	//resolution
@@ -233,175 +482,111 @@ static void loadSliceParams(po::variables_map vm)
 
 
 	SCOPE->setNearfield();
-
-
-
-}
-
-static void loadMicroscopeParams(po::variables_map vm)
-{
-    //objective
-    SCOPE->objective[0] = vm["objective-min"].as<ptype>();
-    SCOPE->objective[1] = vm["objective-max"].as<ptype>();
-
-
-
-
-
-}
-
-static void loadOutputParams(po::variables_map vm)
-{
-    //append simulation results to previous binary files
-    gFileOut.append = DEFAULT_APPEND;
-    if(vm.count("append"))
-        gFileOut.append = true;
-
-	//image parameters
-	//component of the field to be saved
-	std::string fieldStr;
-    fieldStr = vm["output-type"].as<string>();
-
-    if(fieldStr == "magnitude")
-        gFileOut.field = fileoutStruct::fieldMag;
-    else if(fieldStr == "intensity")
-        gFileOut.field = fileoutStruct::fieldIntensity;
-    else if(fieldStr == "polarization")
-        gFileOut.field = fileoutStruct::fieldPolar;
-    else if(fieldStr == "imaginary")
-        gFileOut.field = fileoutStruct::fieldImag;
-    else if(fieldStr == "real")
-        gFileOut.field = fileoutStruct::fieldReal;
-    else if(fieldStr == "angular-spectrum")
-        gFileOut.field = fileoutStruct::fieldAngularSpectrum;
-
-
-	//image file names
-	gFileOut.intFile = vm["intensity"].as<string>();
-	gFileOut.absFile = vm["absorbance"].as<string>();
-	gFileOut.transFile = vm["transmittance"].as<string>();
-	gFileOut.nearFile = vm["near-field"].as<string>();
-	gFileOut.farFile = vm["far-field"].as<string>();
-
-	//colormap
-	std::string cmapStr;
-    cmapStr = vm["colormap"].as<string>();
-    if(cmapStr == "brewer")
-        gFileOut.colormap = rts::colormap::cmBrewer;
-    else if(cmapStr == "gray")
-        gFileOut.colormap = rts::colormap::cmGrayscale;
-    else
-        cout<<"color-map value not recognized (using default): "<<cmapStr<<endl;
 }
 
 static void OutputOptions()
 {
-	cout<<SCOPE->nf.toStr();
+	cout<<SCOPE->toStr();
 
 	cout<<"# of source points: "<<SCOPE->focalPoints.size()<<endl;
 
 }
 
+vector<ptype> test;
 static void SetOptions(po::options_description &desc)
 {
 	desc.add_options()
-		("help,h", "prints this help")
-		("plane-wave,P", "simulates an incident plane wave")
-		("intensity,I", po::value<string>()->default_value(DEFAULT_INTENSITY_FILE), "output measured intensity (filename)")
-		("absorbance,A", po::value<string>()->default_value(DEFAULT_ABSORBANCE_FILE), "output measured absorbance (filename)")
-		("transmittance,T", po::value<string>()->default_value(DEFAULT_TRANSMITTANCE_FILE), "output measured transmittance (filename)")
-		("far-field,F", po::value<string>()->default_value(DEFAULT_FAR_FILE), "output far-field at detector (filename)")
-		("near-field,N", po::value<string>()->default_value(DEFAULT_NEAR_FILE), "output field at focal plane (filename)")
-		("extended-source,X", po::value<string>()->default_value(DEFAULT_EXTENDED_SOURCE), "image of source at focus (filename)")
-		//("sx,x", po::value<ptype>()->default_value(DEFAULT_SPHERE_X), "sphere coordinates")
-		//("sy,y", po::value<ptype>()->default_value(DEFAULT_SPHERE_Y))
-		//("sz,z", po::value<ptype>()->default_value(DEFAULT_SPHERE_Z))
-		("sx,x", po::value<ptype>(), "sphere coordinates")
-		("sy,y", po::value<ptype>())
-		("sz,z", po::value<ptype>())
-		("radius,r", po::value<ptype>()->default_value(DEFAULT_SPHERE_A), "sphere radius")
-		("samples,s", po::value<int>()->default_value(DEFAULT_SAMPLES), "Monte-Carlo samples used to compute Us")
-		("sphere-file,S", po::value< vector<string> >()->multitoken(), "sphere file:\n [x y z radius material]")
-		("amplitude,a", po::value<ptype>()->default_value(DEFAULT_AMPLITUDE), "incident field amplitude")
-		("n,n", po::value<ptype>()->default_value(DEFAULT_N, "1.4"), "sphere phase speed")
-		("k,k", po::value<ptype>()->default_value(DEFAULT_K), "sphere absorption coefficient")
-		("material-file,M", po::value< vector<string> >()->multitoken(), "material file:\n [lambda n k]")
-		("materials", po::value< vector<ptype> >()->multitoken(), "materials specified using n, k pairs:\n ex. --materials n1 k1 n2 k2\n (if used --n and --k are ignored)")
-		("lambda,l", po::value<ptype>()->default_value(DEFAULT_LAMBDA), "incident wavelength")
+		("help", "prints this help")
+		("verbose", "verbose output\n")
+		//all options use long names only (no short aliases are registered); next group: output file names
+		("intensity", po::value<string>()->default_value(DEFAULT_INTENSITY_FILE), "output measured intensity (filename)")
+		("absorbance", po::value<string>()->default_value(DEFAULT_ABSORBANCE_FILE), "output measured absorbance (filename)")
+		("transmittance", po::value<string>()->default_value(DEFAULT_TRANSMITTANCE_FILE), "output measured transmittance (filename)")
+		("far-field", po::value<string>()->default_value(DEFAULT_FAR_FILE), "output far-field at detector (filename)")
+		("near-field", po::value<string>()->default_value(DEFAULT_NEAR_FILE), "output field at focal plane (filename)")
+		("extended-source", po::value<string>()->default_value(DEFAULT_EXTENDED_SOURCE), "image of source at focus (filename)\n")
+		//sample description: spheres and their materials
+		("spheres", po::value< vector<ptype> >()->multitoken(), "sphere position: x y z a m")
+		("sphere-file", po::value< vector<string> >()->multitoken(), "sphere file:\n [x y z radius material]")
+		("materials", po::value< vector<ptype> >()->multitoken(), "refractive indices as n, k pairs:\n ex. --materials n0 k0 n1 k1 n2 k2")
+		("material-file", po::value< vector<string> >()->multitoken(), "material file:\n [lambda n k]\n")
+		//incident field and optics
+		("lambda", po::value<ptype>()->default_value(DEFAULT_LAMBDA), "incident wavelength")
 		("nu", po::value<ptype>(), "incident frequency (in cm^-1)\n(if specified, lambda is ignored)")
-		("theta,t", po::value<ptype>()->default_value(DEFAULT_K_THETA), "light direction (polar coords)")
-		("phi,p", po::value<ptype>()->default_value(DEFAULT_K_PHI))
-		("fx", po::value<ptype>()->default_value(DEFAULT_FOCUS_X), "incident focal point")
-		("fy", po::value<ptype>()->default_value(DEFAULT_FOCUS_Y))
-		("fz", po::value<ptype>()->default_value(DEFAULT_FOCUS_Z))
-		("condenser-max,C", po::value<ptype>()->default_value(DEFAULT_CONDENSER_MAX), "condenser numerical aperature")
-		("condenser-min,c", po::value<ptype>()->default_value(DEFAULT_CONDENSER_MIN), "condenser obscuration NA")
-		("objective-max,O", po::value<ptype>()->default_value(DEFAULT_OBJECTIVE_MAX), "objective numerical aperature")
-		("objective-min,o", po::value<ptype>()->default_value(DEFAULT_OBJECTIVE_MIN), "objective obscuration NA")
-		("field-order", po::value<int>()->default_value(DEFAULT_FIELD_ORDER), "order of the incident field")
-		("output-type,f", po::value<string>()->default_value(DEFAULT_FIELD_TYPE), "output field value:\n magnitude, polarization, real, imaginary, angular-spectrum")
-		("resolution,R", po::value<unsigned int>()->default_value(DEFAULT_SLICE_RES), "resolution of the detector")
-		("padding,d", po::value<unsigned int>()->default_value(DEFAULT_PADDING), "FFT padding for the objective bandpass")
+		("k", po::value< vector<ptype> >()->multitoken(), "k-vector direction: --k theta phi\n theta = [0 2*pi], phi = [0 pi]")
+		("amplitude", po::value<ptype>()->default_value(DEFAULT_AMPLITUDE), "incident field amplitude")
+		("condenser", po::value< vector<ptype> >()->multitoken(), "condenser numerical aperture\nA pair of values can be used to specify an inner obscuration: --condenser NAin NAout")
+		("objective", po::value< vector<ptype> >()->multitoken(), "objective numerical aperture\nA pair of values can be used to specify an inner obscuration: --objective NAin NAout")
+		("focus", po::value< vector<ptype> >()->multitoken(), "focal position for the incident point source\n (default = --focus 0 0 0)")
+		("plane-wave", "simulates an incident plane wave\n")
+		//image plane geometry and detector resolution
+		("resolution", po::value<unsigned int>()->default_value(DEFAULT_SLICE_RES), "resolution of the detector")
+		("plane-lower-left", po::value< vector<ptype> >()->multitoken(), "lower-left position of the image plane")
+		("plane-upper-right", po::value< vector<ptype> >()->multitoken(), "upper-right position of the image plane")
+		("plane-normal", po::value< vector<ptype> >()->multitoken(), "normal for the image plane")
+		("xy", po::value< vector<ptype> >()->multitoken(), "specify an x-y image plane\n (standard microscope)")
+		("xz", po::value< vector<ptype> >()->multitoken(), "specify a x-z image plane\n (cross-section of the focal volume)")
+		("yz", po::value< vector<ptype> >()->multitoken(), "specify a y-z image plane\n (cross-section of the focal volume)\n")
+		//numerical controls of the simulation
+		("samples", po::value<int>()->default_value(DEFAULT_SAMPLES), "Monte-Carlo samples used to compute Us")
+		("padding", po::value<unsigned int>()->default_value(DEFAULT_PADDING), "FFT padding for the objective bandpass")
 		("supersample", po::value<unsigned int>()->default_value(DEFAULT_SUPERSAMPLE), "super-sampling rate for the detector field")
+		("field-order", po::value<int>()->default_value(DEFAULT_FIELD_ORDER), "order of the incident field")
+		("seed", po::value<unsigned int>(), "seed for the Monte-Carlo random number generator")
+		("recursive", "evaluate all Bessel functions recursively\n")
+		("recursive-us", "evaluate scattered-field Bessel functions recursively\n")
+		("lut-uf", "evaluate the focused-field using a look-up table\n")
+		//output formatting
+		("output-type", po::value<string>()->default_value(DEFAULT_FIELD_TYPE), "output field value:\n magnitude, polarization, real, imaginary, angular-spectrum")
 		("colormap", po::value<string>()->default_value(DEFAULT_COLORMAP), "colormap: gray, brewer")
 		("append", "append result to an existing file\n (binary files only)")
-		("plane-min-x,u", po::value<ptype>()->default_value(DEFAULT_SLICE_MIN_X), "lower-left corner of the field slice")
-		("plane-min-y,v", po::value<ptype>()->default_value(DEFAULT_SLICE_MIN_Y))
-		("plane-min-z,w", po::value<ptype>()->default_value(DEFAULT_SLICE_MIN_Z))
-		("plane-max-x,U", po::value<ptype>()->default_value(DEFAULT_SLICE_MAX_X), "upper-right corner of the field slice")
-		("plane-max-y,V", po::value<ptype>()->default_value(DEFAULT_SLICE_MAX_Y))
-		("plane-max-z,W", po::value<ptype>()->default_value(DEFAULT_SLICE_MAX_Z))
-		("plane-norm-x", po::value<ptype>()->default_value(DEFAULT_SLICE_NORM_X), "field slice normal")
-		("plane-norm-y", po::value<ptype>()->default_value(DEFAULT_SLICE_NORM_Y))
-		("plane-norm-z", po::value<ptype>()->default_value(DEFAULT_SLICE_NORM_Z));
+		;
 }
 
 static void LoadParameters(int argc, char *argv[])
 {
 	//create an option description
-	po::options_description desc("Allowed options");
+	po::options_description desc("BimSim arguments");
 
 	//fill it with options
 	SetOptions(desc);
 
     po::variables_map vm;
-	po::store(po::parse_command_line(argc, argv, desc), vm);
+	po::store(po::parse_command_line(argc, argv, desc, po::command_line_style::unix_style ^ po::command_line_style::allow_short), vm);
 	po::notify(vm);
 
-	//display help and exit
-	if(vm.count("help"))
-	{
-		cout<<desc<<endl;
-		exit(1);
-	}
 
-	//load the wavelength
-	if(vm.count("nu"))
-	{
-		//wavelength is given in wavenumber - transform and flag
-		SCOPE->nf.lambda = 10000/vm["nu"].as<ptype>();
-		gFileOut.wavenumber = true;
-	}
-	//otherwise we are using lambda = wavelength
-	else
-	{
-		SCOPE->nf.lambda = vm["lambda"].as<ptype>();
-		gFileOut.wavenumber = false;
-	}
+    //load flags (help, verbose output)
+    lFlags(vm, desc);
+
+    //load the wavelength
+    lWavelength(vm);
+
+    //load materials
+	//loadMaterials(vm);
+	lMaterials(vm);
+
+    //load the sphere data
+    lSpheres(vm);
+
+    //load the optics
+    lOptics(vm);
+
+	//load the position and orientation of the image plane
+	lImagePlane(vm);
 
 	//load spheres
-	loadSpheres(vm);
+	//loadSpheres(vm);
+
 
-	//load materials
-	loadMaterials(vm);
 
-	loadNearfieldParams(vm);
+	lNearfield(vm);
 
 	loadOutputParams(vm);
 
-	loadMicroscopeParams(vm);
+	//loadMicroscopeParams(vm);
 
-	loadSliceParams(vm);
+	//loadSliceParams(vm);
 
     //if an extended source will be used
     if(vm["extended-source"].as<string>() != "")
diff --git a/scalarslice.cu b/scalarslice.cu
index daa609d..c6f215a 100644
--- a/scalarslice.cu
+++ b/scalarslice.cu
@@ -22,16 +22,17 @@ scalarslice::scalarslice()
 
 scalarslice::~scalarslice()
 {
-	HANDLE_ERROR(cudaFree(S));
+	if(S != NULL)
+		HANDLE_ERROR(cudaFree(S));
 	S = NULL;
 }
 
-void scalarslice::toImage(std::string filename, ptype vmin, ptype vmax, rts::colormap::colormapType cmap)
+void scalarslice::toImage(std::string filename, ptype vmin, ptype vmax, rts::colormapType cmap)
 {
-	rts::colormap::gpu2image<ptype>(S, filename, R[0], R[1], vmin, vmax, cmap);
+	rts::gpu2image<ptype>(S, filename, R[0], R[1], vmin, vmax, cmap);
 }
 
-void scalarslice::toImage(std::string filename, bool positive, rts::colormap::colormapType cmap)
+void scalarslice::toImage(std::string filename, bool positive, rts::colormapType cmap)
 {
     cublasStatus_t stat;
     cublasHandle_t handle;
@@ -62,7 +63,7 @@ void scalarslice::toImage(std::string filename, bool positive, rts::colormap::co
 		exit(1);
 	}
 
-    //std::cout<<"Maximum index: "<<result<<std::endl;
+
 
     //retrieve the maximum value
     ptype maxVal;
@@ -75,7 +76,7 @@ void scalarslice::toImage(std::string filename, bool positive, rts::colormap::co
     if(positive)
         toImage(filename, 0, maxVal, cmap);
     else
-        toImage(filename, -maxVal, maxVal, cmap);
+        toImage(filename, -abs(maxVal), abs(maxVal), cmap);
 }
 
 void scalarslice::toEnvi(std::string filename, ptype wavelength, bool append)
diff --git a/scalarslice.h b/scalarslice.h
index 18384a4..9e9af8b 100644
--- a/scalarslice.h
+++ b/scalarslice.h
@@ -2,7 +2,7 @@
 #define RTS_SCALAR_SLICE
 
 #include "dataTypes.h"
-#include "colormap.h"
+#include "rts/graphics/colormap.h"
 
 struct scalarslice
 {
@@ -17,8 +17,8 @@ struct scalarslice
 	~scalarslice();
 	void clear();
 
-	void toImage(std::string filename, ptype vmin, ptype vmax, rts::colormap::colormapType cmap = rts::colormap::cmBrewer);
-	void toImage(std::string filename, bool positive = true, rts::colormap::colormapType cmap = rts::colormap::cmBrewer);
+	void toImage(std::string filename, ptype vmin, ptype vmax, rts::colormapType cmap = rts::cmBrewer);
+	void toImage(std::string filename, bool positive = true, rts::colormapType cmap = rts::cmBrewer);
 	void toEnvi(std::string filename, ptype wavelength = 0, bool append = false);
 
 };
diff --git a/sphere.cpp b/sphere.cpp
index b60c3e3..857f3be 100644
--- a/sphere.cpp
+++ b/sphere.cpp
@@ -1,8 +1,10 @@
 #include "sphere.h"
+#include "defaults.h"
 
 #include "rts/math/complex.h"
 #include <complex>
 #include <stdlib.h>
+#include <fstream>
 
 using namespace rts;
 using namespace std;
@@ -13,6 +15,9 @@ int cbessjyva(double v,complex<double> z,double &vm,complex<double>*cjv,
 int cbessjyva_sph(int v,complex<double> z,double &vm,complex<double>*cjv,
     complex<double>*cyv,complex<double>*cjvp,complex<double>*cyvp);
 
+int bessjyv_sph(int v, double z, double &vm, double* cjv,
+    double* cyv, double* cjvp, double* cyvp);
+
 void sphere::calcCoeff(ptype lambda, rtsComplex<ptype> ri)
 {
     /*  These calculations are done at high-precision on the CPU
@@ -59,12 +64,6 @@ void sphere::calcCoeff(ptype lambda, rtsComplex<ptype> ri)
     cbessjyva_sph(Nl, ka, vm, cjv_ka, cyv_ka, cjvp_ka, cyvp_ka);
     cbessjyva_sph(Nl, kna, vm, cjv_kna, cyv_kna, cjvp_kna, cyvp_kna);
 
-
-    //cout<<"Begin Sphere---------"<<endl;
-    //cout<<"Nl =  "<<Nl<<endl;
-    //cout<<"ka =  "<<ka<<endl;
-    //cout<<"kna = "<<kna<<endl;
-
     //compute A for each order
     complex<double> i(0, 1);
     complex<double> a, b, c, d;
@@ -83,7 +82,7 @@ void sphere::calcCoeff(ptype lambda, rtsComplex<ptype> ri)
         //calculate A and add it to the list
         An = (2.0 * l + 1.0) * pow(i, l) * (a / b);
         A.push_back(bsComplex(An.real(), An.imag()));
-        //cout<<"A:  "<<An<<endl;
+
 
         //Compute B (external scattering coefficient)
         c = cjv_ka[l] * cjvp_kna[l] * nc - cjv_kna[l] * cjvp_ka[l];
@@ -92,7 +91,206 @@ void sphere::calcCoeff(ptype lambda, rtsComplex<ptype> ri)
         //calculate B and add it to the list
         Bn = (2.0 * l + 1.0) * pow(i, l) * (c / d);
         B.push_back(bsComplex(Bn.real(), Bn.imag()));
-        //cout<<"B:  "<<Bn<<endl;
 
+
+    }
+}
+
+void sphere::calcBesselLut(bsComplex* j, ptype k, bsComplex n, int aR)
+{
+    /*Fill the look-up table of spherical Bessel functions j_l(k*n*r) used inside of the sphere
+        j    =   output, aR rows of (Nl + 1) values: j[ir * (Nl+1) + l], sampled at r = ir * a / (aR - 1)
+        k, n =   magnitude of the k vector and refractive index; aR = number of samples along r
+    */
+
+    //scratch buffers for one sample (second-kind values and derivatives are computed but ignored)
+    int bytes = sizeof(complex<double>) * (Nl + 1);
+    complex<double>* cjv_knr = (complex<double>*)malloc(bytes);
+    complex<double>* cyv_knr = (complex<double>*)malloc(bytes);
+    complex<double>* cjvp_knr = (complex<double>*)malloc(bytes);
+    complex<double>* cyvp_knr = (complex<double>*)malloc(bytes);
+
+    //output parameter of cbessjyva_sph; its value is not used here
+    double vm;
+
+    //for each sample along r
+    ptype dr = a / (aR - 1);
+    ptype r;
+    for(int ir = 0; ir < aR; ir++)
+    {
+        r = ir * dr;
+        complex<double> knr( (k*n*r).real(), (k*n*r).imag() );
+        cbessjyva_sph(Nl, knr, vm, cjv_knr, cyv_knr, cjvp_knr, cyvp_knr);
+
+        //copy the double data to the bsComplex array
+        for(int l=0; l<=Nl; l++)
+        {
+            //deal with the NaN case at the origin: store 1 for order 0 and 0 for every other order
+            if(ir == 0)
+                j[ir * (Nl+1) + l] = (l == 0) ? 1 : 0;
+            else
+                j[ir * (Nl+1) + l] = bsComplex(cjv_knr[l].real(), cjv_knr[l].imag());
+        }
+    }
+
+    //release the scratch buffers (they were previously leaked on every call)
+    free(cjv_knr);
+    free(cyv_knr);
+    free(cjvp_knr);
+    free(cyvp_knr);
+
+	/*ofstream outfile("besselout.txt");
+    for(int ir = 0; ir < aR; ir++)
+    {
+        for(int l = 0; l<Nl+1; l++)
+        {
+            outfile<<j[ir * (Nl+1) + l].real()<<"     ";
+        }
+        outfile<<endl;
+    }
+	outfile.close();*/
+}
+
+void sphere::calcHankelLut(bsComplex* h, ptype k, int rR)
+{
+	/*Fill the look-up table of spherical Hankel functions h_l(k*r) = j_l(k*r) + i*y_l(k*r) used outside of the sphere
+        h       =   output, rR rows of (Nl + 1) values: h[ir * (Nl+1) + l]
+        k       =   magnitude of the k vector
+        rR      =   number of samples along r
+        r is sampled uniformly from max(a, d_min) to d_max (both distances are members of the sphere)
+    */
+
+    //scratch buffers for one sample (the derivatives are computed but ignored)
+    int bytes = sizeof(double) * (Nl + 1);
+    double* cjv_kr = (double*)malloc(bytes);
+    double* cyv_kr = (double*)malloc(bytes);
+    double* cjvp_kr = (double*)malloc(bytes);
+    double* cyvp_kr = (double*)malloc(bytes);
+
+    //output parameter of bessjyv_sph; its value is not used here
+    double vm;
+
+    //for each sample along r
+    ptype dr = (d_max - max(a, d_min)) / (rR - 1);
+    ptype r;
+    for(int ir = 0; ir < rR; ir++)
+    {
+        r = ir * dr + max(a, d_min);
+        double kr = k*r;
+        bessjyv_sph(Nl, kr, vm, cjv_kr, cyv_kr, cjvp_kr, cyvp_kr);
+        //store j_l as the real part and y_l as the imaginary part of each entry
+        for(int l=0; l<=Nl; l++)
+			h[ir * (Nl+1) + l] = bsComplex(cjv_kr[l], cyv_kr[l]);
     }
+
+    //release the scratch buffers (they were previously leaked on every call)
+    free(cjv_kr);
+    free(cyv_kr);
+    free(cjvp_kr);
+    free(cyvp_kr);
+
+	/*ofstream outfile("hankelout.txt");
+    for(int ir = 0; ir < rR; ir++)
+    {
+		outfile<<ir*dr + max(a, d_min)<<"     ";
+        for(int l = 0; l<=0; l++)
+        {
+            outfile<<h[ir * (Nl+1) + l].real()<<"     "<<h[ir * (Nl+1) + l].imag()<<"     ";
+        }
+        outfile<<endl;
+    }
+	outfile.close();*/
+}
+
+void sphere::calcLut(bsComplex* j, bsComplex* h, ptype lambda, bsComplex n, int aR, int rR)
+{
+    // Builds both radial look-up tables of this sphere for the wavelength lambda.
+    //   j  : receives the table filled by calcBesselLut, (Nl + 1) x aR values
+    //   h  : receives the table filled by calcHankelLut, (Nl + 1) x rR values
+    //   n  : refractive index, forwarded to calcBesselLut only
+    //   aR : number of radial samples in j
+    //   rR : number of radial samples in h
+
+    // magnitude of the k vector, shared by both tables
+    double waveNumber = 2 * PI / lambda;
+
+    // table used inside of the sphere first, then the one used outside
+    calcBesselLut(j, waveNumber, n, aR);
+    calcHankelLut(h, waveNumber, rR);
+}
+
+void sphere::calcUp(ptype lambda, bsComplex n, rts::rtsQuad<ptype, 3> nfPlane, unsigned int R)
+{
+    //Builds the Usp and Uip look-up fields of this sphere on the GPU for wavelength lambda and refractive
+    //index n, sized from the near-field plane nfPlane and resolution R; also sets the members d_min and d_max.
+    //first find the distance to the closest and furthest points on the nearfield plane
+    d_min = nfPlane.dist(p);
+    d_max = nfPlane.dist_max(p);
+
+    //compute the radius of the cross-section of the sphere with the plane:
+    //a plane at distance d from the center of a sphere of radius a cuts a circle of radius sqrt(a^2 - d^2)
+    ptype a_inter = 0;
+    if(d_min < a)
+        a_inter = sqrt(a * a - d_min * d_min);
+
+	//calculate the resolution of the Usp and Uip lookup tables
+	int aR = 1 + 2 * R * a_inter / (nfPlane(0, 0) - nfPlane(1, 1)).len();
+	int dR = 2 * R;
+	int thetaR = DEFAULT_SPHERE_THETA_R;
+
+	//allocate space for the bessel function LUTs
+	bsComplex* j = (bsComplex*)malloc(sizeof(bsComplex) * (Nl + 1) * aR);
+	bsComplex* h = (bsComplex*)malloc(sizeof(bsComplex) * (Nl + 1) * dR);
+
+	calcLut(j, h, lambda, n, aR, dR);
+
+	//allocate space for the Usp lookup texture
+	Usp.R[0] = dR;
+	Usp.R[1] = thetaR;
+	Usp.init_gpu();
+
+	//allocate space for the Uip lookup texture
+	Uip.R[0] = aR;
+	Uip.R[1] = thetaR;
+	Uip.init_gpu();
+
+	//evaluate both fields on the GPU from the host tables, then write their magnitudes
+	//to Usp.bmp and Uip.bmp (fixed file names)
+	scalarUsp(h, dR, thetaR);
+	scalarUip(j, aR, thetaR);
+
+	scalarslice UspMag = Usp.Mag();
+	UspMag.toImage("Usp.bmp", true);
+
+	scalarslice UipMag = Uip.Mag();
+	UipMag.toImage("Uip.bmp", true);
+
+	//free memory
+	free(j);
+	free(h);
+
+}
+
+sphere& sphere::operator=(const sphere &rhs)
+{
+	//member-wise copy of the sphere description
+	//(Usp, Uip, d_min and d_max are not assigned here)
+	A = rhs.A;  B = rhs.B;
+	Nl = rhs.Nl;
+	n = rhs.n;
+	iMaterial = rhs.iMaterial;
+	a = rhs.a;  p = rhs.p;
+
+	return *this;
+}
+
+sphere::sphere(const sphere &rhs)
+{
+	//Copy constructor. There is no initializer list, so every member is
+	//default-initialized first; p, a, iMaterial, Nl, n, B and A are then
+	//taken from rhs through the member-wise sphere::operator=.
+	//Usp, Uip, d_min and d_max are not copied from rhs.
+
+	*this = rhs;
+
 }
diff --git a/sphere.cu b/sphere.cu
new file mode 100644
index 0000000..8819936
--- /dev/null
+++ b/sphere.cu
@@ -0,0 +1,149 @@
+#include "sphere.h"
+#include "rts/math/legendre.h"
+
+__global__ void gpuScalarUsp(bsComplex* Usp, bsComplex* h, bsComplex* B, int Nl, int rR, int thetaR)
+{
+    // One thread per (ir, itheta) sample. Each thread accumulates
+    //     Usp[itheta * rR + ir] = sum over l = 0..Nl of  B[l] * h[ir * (Nl+1) + l] * P_l
+    // with theta = itheta * PI / (thetaR - 1) and P_l taken from the
+    // rts::init_legendre / rts::shift_legendre sequence evaluated at cos(theta).
+    //
+    //   Usp    : output, thetaR rows of rR entries
+    //   h      : radial table, rR rows of (Nl + 1) entries
+    //   B      : (Nl + 1) coefficients
+    //   launch : 2D blocks on a 2D grid; x covers ir, y covers itheta
+    // NOTE(review): P_l is presumably the order-l Legendre polynomial - confirm
+    //               against rts/math/legendre.h.
+
+    // sample handled by this thread
+    const int ir     = blockIdx.x * blockDim.x + threadIdx.x;
+    const int itheta = blockIdx.y * blockDim.y + threadIdx.y;
+
+    // threads that fall outside of the rR x thetaR domain have nothing to do
+    if(ir >= rR || itheta >= thetaR) return;
+
+    // polar angle of the sample and its cosine
+    ptype angleStep = (PI) / (thetaR - 1);
+    ptype angle = angleStep * itheta;
+    ptype cosAngle = cos(angle);
+
+    // first two terms of the Legendre sequence
+    ptype Pfirst, Psecond;
+    rts::init_legendre<ptype>(cosAngle, Pfirst, Psecond);
+
+    // offset of this thread's row inside the radial table
+    int row = ir * (Nl+1);
+
+    // accumulate the series, order by order
+    bsComplex sum((ptype)0, (ptype)0);
+    for(int l=0; l <= Nl; l++)
+    {
+        // orders 0 and 1 use the initial pair directly; every later order
+        // first advances the sequence by one step
+        if(l > 1)
+            rts::shift_legendre<ptype>(l, cosAngle, Pfirst, Psecond);
+
+        if(l == 0)
+            sum += B[l] * h[row + l] * Pfirst;
+        else
+            sum += B[l] * h[row + l] * Psecond;
+    }
+
+    // store the result for this sample
+    Usp[itheta * rR + ir] = sum;
+}
+
+__global__ void gpuScalarUip(bsComplex* Uip, bsComplex* j, bsComplex* A, int Nl, int aR, int thetaR)
+{
+    // One thread per (ia, itheta) sample. Each thread accumulates
+    //     Uip[itheta * aR + ia] = sum over l = 0..Nl of  A[l] * j[ia * (Nl+1) + l] * P_l
+    // with theta = itheta * PI / (thetaR - 1) and P_l taken from the
+    // rts::init_legendre / rts::shift_legendre sequence evaluated at cos(theta).
+    //   Uip : output, thetaR rows of aR entries;  j : aR rows of (Nl + 1) entries
+    //   A   : (Nl + 1) coefficients;  launch : 2D blocks, x covers ia, y covers itheta
+
+    // sample handled by this thread
+    const int ia     = blockIdx.x * blockDim.x + threadIdx.x;
+    const int itheta = blockIdx.y * blockDim.y + threadIdx.y;
+
+    // threads that fall outside of the aR x thetaR domain have nothing to do
+    if(ia >= aR || itheta >= thetaR) return;
+
+    // polar angle of the sample and its cosine
+    ptype angleStep = (PI) / (thetaR - 1);
+    ptype angle = angleStep * itheta;
+    ptype cosAngle = cos(angle);
+
+    // first two terms of the Legendre sequence
+    ptype Pfirst, Psecond;
+    rts::init_legendre<ptype>(cosAngle, Pfirst, Psecond);
+
+    // offset of this thread's row inside the radial table
+    int row = ia * (Nl+1);
+
+    // accumulate the series, order by order
+    bsComplex sum((ptype)0, (ptype)0);
+    for(int l=0; l <= Nl; l++)
+    {
+        // orders 0 and 1 use the initial pair; later orders advance the sequence first
+        if(l > 1)
+            rts::shift_legendre<ptype>(l, cosAngle, Pfirst, Psecond);
+
+        if(l == 0)
+            sum += A[l] * j[row + l] * Pfirst;
+        else
+            sum += A[l] * j[row + l] * Psecond;
+    }
+
+    Uip[itheta * aR + ia] = sum;
+}
+
+void sphere::scalarUsp(bsComplex* h, int rR, int thetaR)
+{
+	//Evaluate the Usp field on the GPU from the host table h (rR x (Nl + 1) values) and the B coefficients.
+	//The launch grid is sized from Usp.R, so Usp.R[0] / Usp.R[1] are expected to match rR / thetaR (see calcUp).
+    bsComplex* gpu_h;
+    HANDLE_ERROR( cudaMalloc( (void**)&gpu_h, sizeof(bsComplex) * (Nl + 1) * rR ) );
+    HANDLE_ERROR( cudaMemcpy( gpu_h, h, sizeof(bsComplex) * (Nl + 1) * rR, cudaMemcpyHostToDevice ) );
+
+    //copy the (Nl + 1) scattering coefficients to the GPU
+    bsComplex* gpuB;
+    HANDLE_ERROR(cudaMalloc((void**) &gpuB, (Nl+1) * sizeof(bsComplex)));
+    HANDLE_ERROR(cudaMemcpy(gpuB, &B[0], (Nl+1) * sizeof(bsComplex), cudaMemcpyHostToDevice));
+
+    //create one thread for each pixel of the field slice
+	dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
+	dim3 dimGrid((Usp.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (Usp.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
+
+	gpuScalarUsp<<<dimGrid, dimBlock>>>(Usp.x_hat, gpu_h, gpuB, Nl, rR, thetaR);
+	//a kernel launch returns no status of its own: query the launch error explicitly
+	HANDLE_ERROR(cudaGetLastError());
+	//free memory; the calls are checked like every other CUDA call in this function,
+	//so an error reported at this point is no longer silently dropped
+	HANDLE_ERROR(cudaFree(gpu_h));
+	HANDLE_ERROR(cudaFree(gpuB));
+}
+
+void sphere::scalarUip(bsComplex* j, int rR, int thetaR)
+{
+	//Evaluate the Uip field on the GPU from the host table j (rR x (Nl + 1) values) and the A coefficients.
+	//The launch grid is sized from Uip.R, so Uip.R[0] / Uip.R[1] are expected to match rR / thetaR (see calcUp).
+    bsComplex* gpu_j;
+    HANDLE_ERROR( cudaMalloc( (void**)&gpu_j, sizeof(bsComplex) * (Nl + 1) * rR ) );
+    HANDLE_ERROR( cudaMemcpy( gpu_j, j, sizeof(bsComplex) * (Nl + 1) * rR, cudaMemcpyHostToDevice ) );
+
+    //copy the (Nl + 1) scattering coefficients to the GPU
+    bsComplex* gpuA;
+    HANDLE_ERROR(cudaMalloc((void**) &gpuA, (Nl+1) * sizeof(bsComplex)));
+    HANDLE_ERROR(cudaMemcpy(gpuA, &A[0], (Nl+1) * sizeof(bsComplex), cudaMemcpyHostToDevice));
+
+    //create one thread for each pixel of the field slice
+	dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
+	dim3 dimGrid((Uip.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (Uip.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
+
+	gpuScalarUip<<<dimGrid, dimBlock>>>(Uip.x_hat, gpu_j, gpuA, Nl, rR, thetaR);
+	//a kernel launch returns no status of its own: query the launch error explicitly
+	HANDLE_ERROR(cudaGetLastError());
+	//free memory; the calls are checked like every other CUDA call in this function,
+	//so an error reported at this point is no longer silently dropped
+	HANDLE_ERROR(cudaFree(gpu_j));
+	HANDLE_ERROR(cudaFree(gpuA));
+}
diff --git a/sphere.h b/sphere.h
index c0b685d..7723c8c 100644
--- a/sphere.h
+++ b/sphere.h
@@ -22,12 +22,12 @@ struct sphere
     //sphere material index
     int iMaterial;
 
-    //rtsPointer to the scattered field produced by a plane wave
+    //GPU pointer to the scattered field produced by a plane wave
     //  this is a function of cos(theta) and |r| (distance from sphere center)
-    //fieldslice surface;
-
-    //resolution of the scattered field
-    int thetaR, rR;
+    fieldslice Usp;
+    fieldslice Uip;
+    ptype d_min;
+    ptype d_max;
 
 	//sphere order
 	int Nl;
@@ -50,6 +50,12 @@ struct sphere
 		//surface = fieldslice(ang, ang/2);
     }
 
+	//assignment operator
+	sphere & operator=(const sphere &rhs);
+	
+	//copy constructor
+	sphere(const sphere &rhs);
+
 	std::string toStr()
 	{
 		std::stringstream ss;
@@ -66,8 +72,19 @@ struct sphere
         Nl = ceil( (2 * PI * a) / lambda + 4 * pow( (2 * PI * a) / lambda, 1.0/3.0) + 2);
 	}
 
-	void calcCoeff(ptype lambda, rts::rtsComplex<ptype> n);
+    //compute the scattering coefficients
+	void calcCoeff(ptype lambda, bsComplex n);
+
+	//compute the bessel function look-up tables
+	void calcLut(bsComplex* j, bsComplex* h, ptype lambda, bsComplex n, int aR, int rR);
+    void calcBesselLut(bsComplex* j, ptype k, bsComplex n, int aR);
+	void calcHankelLut(bsComplex* h, ptype k, int rR);
+
+	//calculate the scattering domain Us(theta, r)
+	void calcUp(ptype lambda, bsComplex n, rts::rtsQuad<ptype, 3> nfPlane, unsigned int R);
 
+	void scalarUsp(bsComplex* h, int rR, int thetaR);
+	void scalarUip(bsComplex* j, int aR, int thetaR);
 
 
 
--
libgit2 0.21.4