added look-up tables

dmayerich
1 parent b6179de6
Showing 27 changed files with 1518 additions and 588 deletions Show diff stats
bessjy.cpp
cbessjy.cpp
colormap.h
dataTypes.h
defaults.h
fieldslice.cpp
fieldslice.cu
fieldslice.h
fileout.cu
fileout.h
main.cpp
microscope.cu
microscope.h
montecarlo.cpp
nearfield.cpp
nearfield.h
nfScalarUf.cu
nfScalarUfLut.cu
nfScalarUpLut.cu
nfScalarUs.cu
@@ -13,7 +13,9 @@
 //
 #define _USE_MATH_DEFINES
 #include <math.h>
-#include "bessel.h"
+#include "bessel.h"
+
+#define PI  3.14159
  
 double gamma(double x);
 //
@@ -426,7 +428,7 @@ int bessjynb(int n,double x,int &amp;nm,double *jn,double *yn,
         0.2775764465332031,
        -1.993531733751297,
         2.724882731126854e1};
-        
+
     int i,k,m;
     nm = n;
     if ((x < 0.0) || (n < 0)) return 1;
@@ -702,5 +704,26 @@ int bessjyv(double v,double x,double &amp;vm,double *jv,double *yv,
     }
     vm = n + v0;
     return 0;
+}
+
+int bessjyv_sph(int v, double z, double &vm, double* cjv,
+    double* cyv, double* cjvp, double* cyvp)
+{
+    //first, compute the bessel functions of fractional order
+    bessjyv(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
+
+    //iterate through each and scale
+    for(int n = 0; n<=v; n++)
+    {
+
+        cjv[n] = cjv[n] * sqrt(PI/(z * 2.0));
+        cyv[n] = cyv[n] * sqrt(PI/(z * 2.0));
+
+        cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(PI / (z * 2.0));
+        cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(PI / (z * 2.0));
+    }
+
+	return 0;
+
 }
- 
+
@@ -724,6 +724,7 @@ int cbessjyva_sph(int v,complex&lt;double&gt; z,double &amp;vm,complex&lt;double&gt;*cjv,
     //iterate through each and scale
     for(int n = 0; n<=v; n++)
     {
+
         cjv[n] = cjv[n] * sqrt(PI/(z * 2.0));
         cyv[n] = cyv[n] * sqrt(PI/(z * 2.0));
  
-#ifndef RTS_COLORMAP_H
-#define RTS_COLORMAP_H
-
-#include <string>
-#include <qimage.h>
-#include <qcolor.h>
-#include "rts/cuda/error.h"
-
-
-#define BREWER_CTRL_PTS 11
-
-#ifdef __CUDACC__
-texture<float4, cudaTextureType1D> cudaTexBrewer;
-static cudaArray* gpuBrewer;
-#endif
-
-
-
-namespace rts{
-	namespace colormap{
-
-enum colormapType {cmBrewer, cmGrayscale};
-
-static void buffer2image(unsigned char* buffer, std::string filename, unsigned int x_size, unsigned int y_size)
-{
-	//create an image object
-	QImage image(x_size, y_size, QImage::Format_RGB32);
-
-	int i;
-	unsigned char r, g, b;
-	unsigned int x, y;
-	for(y=0; y<y_size; y++)
-		for(x=0; x<x_size; x++)
-		{
-			//calculate the 1D index
-			i = y * x_size + x;
-
-			r = buffer[i * 3 + 0];
-			g = buffer[i * 3 + 1];
-			b = buffer[i * 3 + 2];
-
-			//set the image pixel
-			QColor color(r, g, b);
-			image.setPixel(x, y, color.rgb());
-		}
-
-	image.save(filename.c_str());
-}
-
-#ifdef __CUDACC__
-static void initBrewer()
-{
-	//initialize the Brewer colormap
-
-	//allocate CPU space
-	float4 cpuColorMap[BREWER_CTRL_PTS];
-
-	//define control rtsPoints
-	cpuColorMap[0] = make_float4(0.192157f, 0.211765f, 0.584314f, 1.0f);
-	cpuColorMap[1] = make_float4(0.270588f, 0.458824f, 0.705882f, 1.0f);
-	cpuColorMap[2] = make_float4(0.454902f, 0.678431f, 0.819608f, 1.0f);
-	cpuColorMap[3] = make_float4(0.670588f, 0.85098f, 0.913725f, 1.0f);
-	cpuColorMap[4] = make_float4(0.878431f, 0.952941f, 0.972549f, 1.0f);
-	cpuColorMap[5] = make_float4(1.0f, 1.0f, 0.74902f, 1.0f);
-	cpuColorMap[6] = make_float4(0.996078f, 0.878431f, 0.564706f, 1.0f);
-	cpuColorMap[7] = make_float4(0.992157f, 0.682353f, 0.380392f, 1.0f);
-	cpuColorMap[8] = make_float4(0.956863f, 0.427451f, 0.262745f, 1.0f);
-	cpuColorMap[9] = make_float4(0.843137f, 0.188235f, 0.152941f, 1.0f);
-	cpuColorMap[10] = make_float4(0.647059f, 0.0f, 0.14902f, 1.0f);
-
-
-	int width = BREWER_CTRL_PTS;
-	int height = 0;
-
-
-	// allocate array and copy colormap data
-	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
-
-	HANDLE_ERROR(cudaMallocArray(&gpuBrewer, &channelDesc, width, height));
-
-	HANDLE_ERROR(cudaMemcpyToArray(gpuBrewer, 0, 0, cpuColorMap, sizeof(float4)*width, cudaMemcpyHostToDevice));
-
-	// set texture parameters
-    cudaTexBrewer.addressMode[0] = cudaAddressModeClamp;
-	//texBrewer.addressMode[1] = cudaAddressModeClamp;
-    cudaTexBrewer.filterMode = cudaFilterModeLinear;
-    cudaTexBrewer.normalized = true;  // access with normalized texture coordinates
-
-	// Bind the array to the texture
-    HANDLE_ERROR(cudaBindTextureToArray( cudaTexBrewer, gpuBrewer, channelDesc));
-
-}
-
-static void destroyBrewer()
-{
-    HANDLE_ERROR(cudaFreeArray(gpuBrewer));
-
-}
-
-template<class T>
-__global__ static void applyBrewer(T* gpuSource, unsigned char* gpuDest, unsigned int N, T minVal = 0, T maxVal = 1)
-{
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if(i >= N) return;
-
-	//compute the normalized value on [minVal maxVal]
-	float a = (gpuSource[i] - minVal) / (maxVal - minVal);
-
-	//lookup the color
-	float shift = 1.0/BREWER_CTRL_PTS;
-	float4 color = tex1D(cudaTexBrewer, a+shift);
-
-	gpuDest[i * 3 + 0] = 255 * color.x;
-	gpuDest[i * 3 + 1] = 255 * color.y;
-	gpuDest[i * 3 + 2] = 255 * color.z;
-}
-
-template<class T>
-__global__ static void applyGrayscale(T* gpuSource, unsigned char* gpuDest, unsigned int N, T minVal = 0, T maxVal = 1)
-{
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if(i >= N) return;
-
-	//compute the normalized value on [minVal maxVal]
-	float a = (gpuSource[i] - minVal) / (maxVal - minVal);
-
-	gpuDest[i * 3 + 0] = 255 * a;
-	gpuDest[i * 3 + 1] = 255 * a;
-	gpuDest[i * 3 + 2] = 255 * a;
-}
-
-template<class T>
-static void gpu2gpu(T* gpuSource, unsigned char* gpuDest, unsigned int nVals, T minVal = 0, T maxVal = 1, colormapType cm = cmGrayscale, int blockDim = 128)
-{
-	//This function converts a scalar field on the GPU to a color image on the GPU
-	int gridDim = (nVals + blockDim - 1)/blockDim;
-	if(cm == cmGrayscale)
-		applyGrayscale<<<gridDim, blockDim>>>(gpuSource, gpuDest, nVals, minVal, maxVal);
-	else if(cm == cmBrewer)
-	{
-		initBrewer();
-		applyBrewer<<<gridDim, blockDim>>>(gpuSource, gpuDest, nVals, minVal, maxVal);
-		destroyBrewer();
-	}
-
-}
-
-template<class T>
-static void gpu2cpu(T* gpuSource, unsigned char* cpuDest, unsigned int nVals, T minVal, T maxVal, colormapType cm = cmGrayscale)
-{
-    //this function converts a scalar field on the GPU to a color image on the CPU
-
-    //first create the color image on the GPU
-
-    //allocate GPU memory for the color image
-    unsigned char* gpuDest;
-    HANDLE_ERROR(cudaMalloc( (void**)&gpuDest, sizeof(unsigned char) * nVals * 3 ));
-
-	//HANDLE_ERROR(cudaMemset(gpuSource, 0, sizeof(T) * nVals));
-
-    //create the image on the gpu
-    gpu2gpu(gpuSource, gpuDest, nVals, minVal, maxVal, cm);
-
-	//HANDLE_ERROR(cudaMemset(gpuDest, 0, sizeof(unsigned char) * nVals * 3));
-
-    //copy the image from the GPU to the CPU
-    HANDLE_ERROR(cudaMemcpy(cpuDest, gpuDest, sizeof(unsigned char) * nVals * 3, cudaMemcpyDeviceToHost));
-
-	HANDLE_ERROR(cudaFree( gpuDest ));
-
-}
-
-template<typename T>
-static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, unsigned int y_size, T valMin, T valMax, colormapType cm = cmGrayscale)
-{
-	//allocate a color buffer
-	unsigned char* cpuBuffer = (unsigned char*) malloc(sizeof(unsigned char) * 3 * x_size * y_size);
-
-	//do the mapping
-	gpu2cpu<T>(gpuSource, cpuBuffer, x_size * y_size, valMin, valMax, cm);
-
-	//copy the buffer to an image
-	buffer2image(cpuBuffer, fileDest, x_size, y_size);
-
-	free(cpuBuffer);
-}
-
-#endif
-
-template<class T>
-static void cpu2cpu(T* cpuSource, unsigned char* cpuDest, unsigned int nVals, T valMin, T valMax, colormapType cm = cmGrayscale)
-{
-	int i;
-	float a;
-	float range = valMax - valMin;
-	for(i = 0; i<nVals; i++)
-	{
-		//normalize to the range [valMin valMax]
-		a = (cpuSource[i] - valMin) / range;
-
-		cpuDest[i * 3 + 0] = 255 * a;
-		cpuDest[i * 3 + 1] = 255 * a;
-		cpuDest[i * 3 + 2] = 255 * a;
-	}
-
-}
-
-
-
-template<typename T>
-static void cpu2image(T* cpuSource, std::string fileDest, unsigned int x_size, unsigned int y_size, T valMin, T valMax, colormapType cm = cmGrayscale)
-{
-    //allocate a color buffer
-	unsigned char* cpuBuffer = (unsigned char*) malloc(sizeof(unsigned char) * 3 * x_size * y_size);
-
-	//do the mapping
-	cpu2cpu<T>(cpuSource, cpuBuffer, x_size * y_size, valMin, valMax, cm);
-
-	//copy the buffer to an image
-	buffer2image(cpuBuffer, fileDest, x_size, y_size);
-
-	free(cpuBuffer);
-
-}
-
-}}	//end namespace colormap and rts
-
-#endif
-
@@ -24,6 +24,8 @@ typedef double ptype;
  
 typedef ptype fieldPoint;
  
+extern bool verbose;
+
 //hybrid GPU/CPU complex data typ
 #include "rts/math/complex.h"
 #include "rts/math/vector.h"
@@ -15,14 +15,14 @@
 #define DEFAULT_FOCUS_X         0
 #define DEFAULT_FOCUS_Y         0
 #define DEFAULT_FOCUS_Z         0
-#define DEFAULT_INCIDENT_ORDER	100
+//#define DEFAULT_INCIDENT_ORDER	20
 #define DEFAULT_STABILITY_PARM	1.4
  
 //optics
-#define DEFAULT_CONDENSER_MIN   0.0
+#define DEFAULT_CONDENSER_MIN   0
 #define DEFAULT_CONDENSER_MAX   1
  
-#define DEFAULT_OBJECTIVE_MIN   0.0
+#define DEFAULT_OBJECTIVE_MIN   0
 #define DEFAULT_OBJECTIVE_MAX   1
  
 //incident light direction
@@ -36,17 +36,20 @@
 //#define DEFAULT_OUTPUT_POINT	fileoutStruct::imageObjective
  
  
-#define DEFAULT_SLICE_MIN_X     -5
-#define DEFAULT_SLICE_MIN_Y     0
-#define DEFAULT_SLICE_MIN_Z     -5
+#define DEFAULT_PLANE_MIN_X     -5
+#define DEFAULT_PLANE_MIN_Y     0
+#define DEFAULT_PLANE_MIN_Z     -5
  
-#define DEFAULT_SLICE_MAX_X     5
-#define DEFAULT_SLICE_MAX_Y     0
-#define DEFAULT_SLICE_MAX_Z     5
+#define DEFAULT_PLANE_MAX_X     5
+#define DEFAULT_PLANE_MAX_Y     0
+#define DEFAULT_PLANE_MAX_Z     5
  
-#define DEFAULT_SLICE_NORM_X    0
-#define DEFAULT_SLICE_NORM_Y    1
-#define DEFAULT_SLICE_NORM_Z    0
+#define DEFAULT_PLANE_NORM_X    0
+#define DEFAULT_PLANE_NORM_Y    1
+#define DEFAULT_PLANE_NORM_Z    0
+
+#define DEFAULT_PLANE_SIZE		40
+#define DEFAULT_PLANE_POSITION	0
  
  
 /*
@@ -64,21 +67,23 @@
 */
  
  
-#define DEFAULT_FIELD_ORDER     200
+#define DEFAULT_FIELD_ORDER     10
  
-#define DEFAULT_SAMPLES         200
+#define DEFAULT_SAMPLES         400
  
 #define DEFAULT_SLICE_RES		256
  
+#define DEFAULT_SPHERE_THETA_R  1000
+
 #define DEFAULT_PADDING			1
 #define DEFAULT_SUPERSAMPLE		1
  
-#define DEFAULT_INTENSITY_FILE	    "testappend"
+#define DEFAULT_INTENSITY_FILE	    "out_i.bmp"
 #define DEFAULT_TRANSMITTANCE_FILE	""
-#define DEFAULT_ABSORBANCE_FILE	    "out_a"
+#define DEFAULT_ABSORBANCE_FILE	    "out_a.bmp"
 #define DEFAULT_NEAR_FILE		    "out_n.bmp"
 #define DEFAULT_FAR_FILE		    "out_f.bmp"
-#define DEFAULT_EXTENDED_SOURCE     "einstein_small.jpg"
+#define DEFAULT_EXTENDED_SOURCE     ""
 #define DEFAULT_FIELD_TYPE		    "magnitude"
 #define DEFAULT_FORMAT			    fileoutStruct::formatImage
 #define DEFAULT_COLORMAP		    "brewer"
@@ -8,14 +8,16 @@
 using namespace std;
  
 fieldslice::fieldslice(unsigned int x_size, unsigned int y_size)
-{
+{
+    x_hat = y_hat = z_hat = NULL;
+
 	//save the slice resolution
 	R[0] = x_size;
 	R[1] = x_size;
  
 	scalarField = true;
  
-	//init_gpu();
+	init_gpu();
  
  
 }
@@ -101,5 +103,5 @@ fieldslice::fieldslice()
  
 fieldslice::~fieldslice()
 {
-	//kill_gpu();
+	kill_gpu();
 }
 #include "fieldslice.h"
 #include "dataTypes.h"
-#include "rts/cuda/error.h"
+#include "rts/cuda/error.h"
+#include "rts/cuda/threads.h"
  
  
 __global__ void field_intensity(bsComplex* x, bsComplex* y, bsComplex* z, ptype* I, unsigned int N)
 {
     //compute the index for this thread
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
+	//int i = blockIdx.x * blockDim.x + threadIdx.x;
+	int i = ThreadIndex1D();
+
 	if(i >= N) return;
  
 	ptype xm = x[i].abs();
@@ -66,7 +69,8 @@ __global__ void resample_intensity(bsComplex* x, bsComplex* y, bsComplex* z, pty
 __global__ void field_real(bsComplex* field_component, ptype* V, unsigned int N)
 {
     //compute the index for this thread
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
+	//int i = blockIdx.x * blockDim.x + threadIdx.x;
+	int i = ThreadIndex1D();
 	if(i >= N) return;
  
 	V[i] = field_component[i].real();
@@ -75,7 +79,8 @@ __global__ void field_real(bsComplex* field_component, ptype* V, unsigned int N)
 __global__ void field_imaginary(bsComplex* field_component, ptype* V, unsigned int N)
 {
     //compute the index for this thread
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
+	//int i = blockIdx.x * blockDim.x + threadIdx.x;
+	int i = ThreadIndex1D();
 	if(i >= N) return;
  
 	V[i] = field_component[i].imag();
@@ -84,7 +89,8 @@ __global__ void field_imaginary(bsComplex* field_component, ptype* V, unsigned i
 __global__ void field_sqrt(ptype* input, ptype* output, unsigned int N)
 {
 	//compute the index for this thread
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
+	//int i = blockIdx.x * blockDim.x + threadIdx.x;
+	int i = ThreadIndex1D();
 	if(i >= N) return;
  
 	output[i] = sqrt(input[i]);
@@ -115,7 +121,8 @@ scalarslice fieldslice::Mag()
  
 	//compute the total number of values in the slice
 	unsigned int N = R[0] * R[1];
-	int gridDim = (N+BLOCK-1)/BLOCK;
+	//int gridDim = (N+BLOCK-1)/BLOCK;
+	dim3 gridDim = GenGrid1D(N, BLOCK);
  
 	field_intensity<<<gridDim, BLOCK>>>(x_hat, y_hat, z_hat, result->S, N);
 	field_sqrt<<<gridDim, BLOCK>>>(result->S, result->S, N);
@@ -132,7 +139,8 @@ scalarslice fieldslice::Real()
  
 	//compute the total number of values in the slice
 	unsigned int N = R[0] * R[1];
-	int gridDim = (N+BLOCK-1)/BLOCK;
+	//int gridDim = (N+BLOCK-1)/BLOCK;
+	dim3 gridDim = GenGrid1D(N, BLOCK);
  
 	field_real<<<gridDim, BLOCK>>>(x_hat, result->S, N);
  
@@ -148,7 +156,8 @@ scalarslice fieldslice::Imag()
  
 	//compute the total number of values in the slice
 	unsigned int N = R[0] * R[1];
-	int gridDim = (N+BLOCK-1)/BLOCK;
+	//int gridDim = (N+BLOCK-1)/BLOCK;
+	dim3 gridDim = GenGrid1D(N, BLOCK);
  
 	field_imaginary<<<gridDim, BLOCK>>>(x_hat, result->S, N);
  
@@ -192,7 +201,6 @@ void fieldslice::ScaleField(ptype v)
  
     //compute the total number of values in the slice
 	unsigned int N = R[0] * R[1];
-	//cout<<"Size of mag field: "<<N<<endl;
 	int gridDim = (N+BLOCK-1)/BLOCK;
  
 	field_scale<<<gridDim, BLOCK>>>(x_hat, y_hat, z_hat, N, v);
@@ -200,19 +208,23 @@ void fieldslice::ScaleField(ptype v)
 }
  
 void fieldslice::init_gpu()
-{
+{
+	//if the field has no size, return
+	if(R[0] == 0 || R[1] == 0)
+		return;
+
+    //free any previous memory allocations
+    if(x_hat)
+        HANDLE_ERROR(cudaFree(x_hat));
+    if(y_hat)
+        HANDLE_ERROR(cudaFree(y_hat));
+    if(z_hat)
+        HANDLE_ERROR(cudaFree(z_hat));
+
     //allocate space on the GPU for the field slice
 	HANDLE_ERROR(cudaMalloc((void**)&x_hat, R[0] * R[1] * sizeof(bsComplex)));
-	//HANDLE_ERROR(cudaMemset(x_hat, 0, R[0] * R[1] * sizeof(bsComplex)));
  
-	//if the field is scalar, y_hat and z_hat are unused
-	if(scalarField)
-	{
-		y_hat = NULL;
-		z_hat = NULL;
-
-	}
-	else
+	if(!scalarField)
 	{
 		HANDLE_ERROR(cudaMalloc((void**)&y_hat, R[0] * R[1] * sizeof(bsComplex)));
 		//HANDLE_ERROR(cudaMemset(y_hat, 0, R[0] * R[1] * sizeof(bsComplex)));
@@ -233,6 +245,8 @@ void fieldslice::kill_gpu()
     if(z_hat != NULL)
         HANDLE_ERROR(cudaFree(z_hat));
  
+	x_hat = y_hat = z_hat = NULL;
+
 }
  
 void fieldslice::clear_gpu()
@@ -275,7 +289,7 @@ fieldslice fieldslice::crop(int u, int v, int su, int sv)
 	result.scalarField = scalarField;
  
 	//allocate space for the new field
-	result.init_gpu();
+	//result.init_gpu();
  
 	//create one thread for each pixel of the field slice
 	dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
@@ -291,3 +305,57 @@ fieldslice fieldslice::crop(int u, int v, int su, int sv)
  
 	return result;
 }
+
+fieldslice::fieldslice(const fieldslice& rhs)
+{
+	R[0] = rhs.R[0];
+	R[1] = rhs.R[1];
+	scalarField = rhs.scalarField;
+
+	x_hat = y_hat = z_hat = NULL;
+
+	unsigned int bytes = sizeof(bsComplex) * R[0] * R[1];
+	if(rhs.x_hat != NULL)
+	{
+		HANDLE_ERROR(cudaMalloc( (void**)&x_hat, bytes));
+		HANDLE_ERROR(cudaMemcpy( x_hat, rhs.x_hat, bytes, cudaMemcpyDeviceToDevice));
+	}
+	if(rhs.y_hat != NULL)
+	{
+		HANDLE_ERROR(cudaMalloc( (void**)&y_hat, bytes));
+		HANDLE_ERROR(cudaMemcpy( y_hat, rhs.y_hat, bytes, cudaMemcpyDeviceToDevice));
+	}
+	if(rhs.z_hat != NULL)
+	{
+		HANDLE_ERROR(cudaMalloc( (void**)&z_hat, bytes));
+		HANDLE_ERROR(cudaMemcpy( z_hat, rhs.z_hat, bytes, cudaMemcpyDeviceToDevice));
+	}
+
+}
+
+fieldslice& fieldslice::operator=(const fieldslice& rhs)
+{
+	//make sure this isn't a self-allocation
+	if(this != &rhs)
+	{
+		//make a shallow copy
+		R[0] = rhs.R[0];
+		R[1] = rhs.R[1];
+		scalarField = rhs.scalarField;
+
+		//initialize to new parameters
+		init_gpu();
+
+		//make a deep copy
+		unsigned int bytes = sizeof(bsComplex) * R[0] * R[1];
+		if(x_hat != NULL)
+			HANDLE_ERROR(cudaMemcpy(x_hat, rhs.x_hat, bytes, cudaMemcpyDeviceToDevice));
+		if(y_hat != NULL)
+			HANDLE_ERROR(cudaMemcpy(y_hat, rhs.y_hat, bytes, cudaMemcpyDeviceToDevice));
+		if(z_hat != NULL)
+			HANDLE_ERROR(cudaMemcpy(z_hat, rhs.z_hat, bytes, cudaMemcpyDeviceToDevice));
+	}
+
+	return *this;
+
+}
@@ -31,6 +31,9 @@ struct fieldslice
  
 	~fieldslice();
  
+	//copy constructor
+	fieldslice(const fieldslice& rhs);
+
 	//void setPos(bsPoint pMin, bsPoint pMax, bsVector N);
  
 	scalarslice Mag();
@@ -47,6 +50,7 @@ struct fieldslice
  
 	//crop a region from the field
 	fieldslice crop(int u, int v, int su, int sv);
+	fieldslice& operator=(const fieldslice& rhs);
  
 	void init_gpu();
 	void kill_gpu();
@@ -186,11 +186,21 @@ void fileoutStruct::Save(microscopeStruct* scope)
 	//save images of the fields in the microscope
  
 	//if the user specifies an extended source
-	if(scope->focalPoints.size() > 1)
+	if(scope->focalPoints.size() > 0)
 	{
 		//simulate the extended source and output the detector image
 		scope->SimulateExtendedSource();
  
+		//saveNearField(&scope->nf);
+		saveFarField(scope);
+
+		//save the detector images
+		saveDetector(scope);
+
+		//simulate scattering for the last point (so that you have a near field image)
+		scope->SimulateScattering();
+        saveNearField(&scope->nf);
+
 	}
 	else
 	{
@@ -203,12 +213,15 @@ void fileoutStruct::Save(microscopeStruct* scope)
 		//run the far-field simulation
 		scope->SimulateImaging();
  
+		//saveNearField(&scope->nf);
 		saveFarField(scope);
  
+		//save the detector images
+		saveDetector(scope);
+
 	}
  
-	//save the detector images
-    saveDetector(scope);
+	
  
  
 }
@@ -5,7 +5,7 @@
 //#include "defaults.h"
 #include "dataTypes.h"
  
-#include "colormap.h"
+#include "rts/graphics/colormap.h"
 #include "fieldslice.h"
 #include "nearfield.h"
 #include "microscope.h"
@@ -34,7 +34,7 @@ struct fileoutStruct{
 	//image_source source;
  
 	//color map info
-	rts::colormap::colormapType colormap;
+	rts::colormapType colormap;
 	ptype colorMax;
  
 	void Save(microscopeStruct* scope);
@@ -24,6 +24,7 @@ microscopeStruct* SCOPE;
 #include "warnings.h"
  
 fileoutStruct gFileOut;
+bool verbose = false;
 using namespace std;
  
 int cbessjyva(double v,complex<double> z,double &vm,complex<double>*cjv,
@@ -31,32 +32,19 @@ int cbessjyva(double v,complex&lt;double&gt; z,double &amp;vm,complex&lt;double&gt;*cjv,
  
 int main(int argc, char *argv[])
 {
-	//test Envi loading and saving
-	//EnviFile envi("testenvi", "w");
-
-	//float* data = (float*)malloc(sizeof(float) * 100 * 100);
-	//envi.addBand(data, 100, 100, 100);
-
-	//envi.close();
-
-	//return 0;
  
 	SCOPE = new microscopeStruct();
  
-    cout<<SCOPE->nf.Uf.R[0]<<endl;
-
     LoadParameters(argc, argv);
  
-    //TestSimulation(NF, SCOPE, &gFileOut);
-
 	//initialize GPU memory for fields
 	SCOPE->init();
  
-    OutputOptions();
-
 	gFileOut.Save(SCOPE);
  
-	//NF->destroy();
+	if(verbose)
+        OutputOptions();
+
 	SCOPE->destroy();
  
  
@@ -4,7 +4,7 @@
 #include "rts/tools/progressbar.h"
 #include "rts/cuda/timer.h"
 #include "dataTypes.h"
-#include "colormap.h"
+#include "rts/graphics/colormap.h"
  
 #include <QImage>
  
@@ -112,8 +112,8 @@ void microscopeStruct::getFarField()
     //Compute the Far Field image of the focal plane
  
     //clear the memory from previous detector fields
-    Ud.kill_gpu();
-    Ufd.kill_gpu();
+    //Ud.kill_gpu();
+    //Ufd.kill_gpu();
  
 	//first crop the filtered near-field image of the source and scattered fields
 	Ud = nf.U.crop(padding * Ud.R[0], padding * Ud.R[1], Ud.R[0], Ud.R[1]);
@@ -261,9 +261,14 @@ void microscopeStruct::SimulateExtendedSource()
 		t += gpuStopTimer();
  
 		rtsProgressBar((double)(i+1)/(double)npts * 100);
+		//unsigned char c;
+		//cin>>c;
 	}
-	cout<<endl;
-	cout<<"Time per source: "<<t/npts<<"ms"<<endl;
+	if(verbose)
+	{
+        cout<<endl;
+        cout<<"Time per source: "<<t/npts<<"ms"<<endl;
+    }
  
 }
  
@@ -304,3 +309,15 @@ void microscopeStruct::LoadExtendedSource(std::string filename)
 			}
         }
 }
+
+std::string microscopeStruct::toStr()
+{
+	stringstream ss;
+	ss<<nf.toStr();
+
+	ss<<"----------Optics--------------"<<endl<<endl;
+	ss<<"Objective NA: "<<objective[0]<<" to "<<objective[1]<<endl;
+	return ss.str();
+
+
+}
@@ -63,6 +63,8 @@ struct microscopeStruct
 	scalarslice getTransmittance();
 	scalarslice getIntensity();
  
+	string toStr();
+
  
  
 };
@@ -35,18 +35,12 @@ void mcSampleNA(bsVector* samples, int N, bsVector k, ptype NAin, ptype NAout)
     ptype inPhi = asin(NAin);
     ptype outPhi = asin(NAout);
  
-    //cout<<"inPhi: "<<inPhi<<endl;
-    //cout<<"outPhi: "<<outPhi<<endl;
-
     //calculate the z-values associated with these angles
     ptype inZ = cos(inPhi);
     ptype outZ = cos(outPhi);
  
     ptype rangeZ = inZ - outZ;
  
-    //cout<<"inZ: "<<inZ<<endl;
-    //cout<<"outZ: "<<outZ<<endl;
-
     //draw a distribution of random phi, z values
     ptype z, phi, theta;
     for(int i=0; i<N; i++)
@@ -58,7 +52,6 @@ void mcSampleNA(bsVector* samples, int N, bsVector k, ptype NAin, ptype NAout)
         phi = acos(z);
  
         //compute and store cartesian coordinates
-        //bsVector spherical(1, theta + kSph[1], phi + kSph[2]);
 		bsVector spherical(1, theta, phi);
 		bsVector cart = spherical.sph2cart();
         samples[i] = rotation * cart;
 #include "nearfield.h"
+#include <time.h>
+#include <math.h>
+
+#ifdef _WIN32
+#define isnan(x) _isnan(x)
+#define isinf(x) (!_finite(x))
+#endif
+
+int bessjyv_sph(int v, double z, double &vm, double* cjv,
+    double* cyv, double* cjvp, double* cyvp);
  
 nearfieldStruct::nearfieldStruct()
 {
     scalarSim = true;
 	planeWave = false;
+	lut_us = true;
+	lut_uf = false;
  
 	nWaves = 0;
 }
@@ -46,6 +58,8 @@ std::string nearfieldStruct::toStr()
 	ss<<"Condenser NA: "<<condenser[0]<<" to "<<condenser[1]<<std::endl;
 	ss<<"Focal Point: "<<focus[0]<<", "<<focus[1]<<", "<<focus[2]<<std::endl;
 	ss<<"Field Slice: "<<std::endl;
+	if(lut_us)
+        ss<<"LUT Parameters --- min: "<<d_min<<"   max: "<<d_max<<std::endl;
 	ss<<pos<<std::endl;
  
 	ss<<std::endl<<"---------Materials-----------"<<std::endl;
@@ -61,6 +75,10 @@ std::string nearfieldStruct::toStr()
 	for(unsigned int s=0; s<sVector.size(); s++)
 		ss<<sVector[s].toStr()<<std::endl;
  
+    ss<<"---------Timings-------------"<<std::endl;
+    ss<<"Uf = "<<t_Uf<<"ms"<<std::endl;
+    ss<<"Us = "<<t_Us<<"ms"<<std::endl;
+
 	return ss.str();
 }
  
@@ -70,7 +88,8 @@ void nearfieldStruct::calcWaves()
     inWaves.resize(nWaves);
  
     //re-seed the random number generator
-    //srand(seed);
+    //srand(time(NULL));
+	srand(NULL);
  
     //calculate the monte-carlo samples
     mcSampleNA(&inWaves[0], nWaves, k, condenser[0], condenser[1]);
@@ -84,6 +103,8 @@ void nearfieldStruct::calcSpheres()
     //calculate all of the constants necessary to evaluate the scattered field
 	//estimate the order required to represent the scattered field for each sphere
  
+
+
 	//for each sphere
 	for(int i=0; i<sVector.size(); i++)
 	{
@@ -91,12 +112,10 @@ void nearfieldStruct::calcSpheres()
  
 		//calculate the required order
 		sVector[i].calcNl(lambda);
-		//std::cout<<sVector[i].Nl<<std::endl;
  
 		//set the refractive index for the sphere
 		int imat = sVector[i].iMaterial;
         rts::rtsComplex<ptype> n = mVector[imat](lambda);
-		//std::cout<<"Sphere refractive index: "<<n<<std::endl;
  
 		//calculate the scattering coefficients
 		sVector[i].calcCoeff(lambda, n);
@@ -104,18 +123,109 @@ void nearfieldStruct::calcSpheres()
 		//save the refractive index
 		sVector[i].n = n;
  
+		//if the LUT is used, calculate Usp(theta, r)
+		if(lut_us)
+		{
+			sVector[i].calcUp(lambda, n, pos, max(U.R[0], U.R[1]));
+        }
+
+
 	}
  
 }
  
+void nearfieldStruct::calcUs()
+{
+
+
+    if(lut_us)
+        scalarUpLut();
+    else
+        scalarUs();
+}
+
+void nearfieldStruct::calcUf()
+{
+	if(lut_uf)
+		scalarUfLut();
+	else
+		scalarUf();
+}
+
 void nearfieldStruct::Simulate()
 {
+    //initialize timings
+    t_Uf = 0;
+    t_Us = 0;
+
 	//compute a set of plane waves for Monte-Carlo simulation
 	calcWaves();
  
     //the near field has to be simulated no matter what the output rtsPoint is
-    scalarUf();
+    calcUf();
     calcSpheres();
-    scalarUs();
+    calcUs();
     sumUf();
+
+	//U.Mag().toImage("testU.bmp");
+}
+
+void nearfieldStruct::calcBesselLut(ptype* j, ptype d_min, ptype d_max, int dR)
+{
+    /*Compute the look-up-table for spherical bessel functions used for the incident field
+        j    =   (Nl + 1) x aR array of values
+        aR      =   resolution of j
+    */
+
+	//compute the wavenumber
+	ptype k = 2 * PI / lambda;
+	unsigned int Nl = m;
+
+    //allocate space for the Bessel functions of the first and second kind (and derivatives -- which will be ignored)
+    int bytes = sizeof(double) * (Nl + 1);
+    double* cjv_kd = (double*)malloc(bytes);
+    double* cyv_kd = (double*)malloc(bytes);
+    double* cjvp_kd = (double*)malloc(bytes);
+    double* cyvp_kd = (double*)malloc(bytes);
+
+    //compute the bessel functions using the CPU-based algorithm
+    double vm;
+
+    //for each sample along r
+    ptype dr = (d_max - d_min) / (dR - 1);
+    ptype d;
+    ptype jv;
+    for(int id = 0; id < dR; id++)
+    {
+        d = id * dr + d_min;
+        double kd = k*d;
+        bessjyv_sph(Nl, kd, vm, cjv_kd, cyv_kd, cjvp_kd, cyvp_kd);
+
+        //copy the double data to the bsComplex array
+        for(int l=0; l<=Nl; l++)
+		{
+            jv = cjv_kd[l];
+			if(isnan(jv) || isinf(jv))
+			{
+                if(kd == 0 && l == 0)
+                    jv = 1;
+                else
+                    jv = 0;
+            }
+            j[id * (Nl+1) + l] = jv;
+		}
+    }
+
+	/*ofstream outfile("uf_besselout.txt");
+    for(int ir = 0; ir < dR; ir++)
+    {
+		outfile<<ir*dr + d_min<<endl;
+        for(int l = 0; l<=Nl; l++)
+        {
+            outfile<<j[ir * (Nl+1) + l]<<" --";
+        }
+        outfile<<endl;
+    }
+	outfile.close();*/
+
 }
@@ -31,6 +31,8 @@ struct nearfieldStruct
  
 	//slices for the focused field
 	fieldslice Uf;
+	ptype d_min, d_max;
+
 	//	and total field: Uf + sum(Us)
 	fieldslice U;
  
@@ -43,6 +45,14 @@ struct nearfieldStruct
 	//flag for a plane wave
 	bool planeWave;
  
+	//flag for using a LUT
+	bool lut_uf;
+	bool lut_us;
+
+	//timings
+	float t_Uf;
+	float t_Us;
+
  
  
 	//---------Scatterers------------
@@ -78,10 +88,17 @@ struct nearfieldStruct
 	void setPos(bsPoint pMin, bsPoint pMax, bsVector normal);
  
 	//this function re-computes the focused field
+	void calcUf();
 	void scalarUf();
+	void scalarUfLut();
+
+	void calcBesselLut(ptype* j, ptype d_min, ptype d_max, int dR);
  
 	//compute the field scattered by all of the materials
+	void calcUs();
 	void scalarUs();
+	void scalarUpLut();
+
  
 	//add the incident field to the sum of scattered fields
 	void sumUf();
@@ -5,7 +5,7 @@
 #include "rts/cuda/error.h"
 #include "rts/cuda/timer.h"
  
-
+//Incident field for a single plane wave
 __global__ void gpuScalarUfp(bsComplex* Uf, bsVector k, ptype kmag, bsPoint f, ptype A, bsRect ABCD, int uR, int vR)
 {
 	/*Compute the scalar focused field using Debye focusing
@@ -41,7 +41,8 @@ __global__ void gpuScalarUfp(bsComplex* Uf, bsVector k, ptype kmag, bsPoint f, p
 	Uf[i] = exp(d) * A;
  
 }
-
+
+//Incident field for a focused point source
 __global__ void gpuScalarUf(bsComplex* Uf, bsVector k, ptype kmag, bsPoint f, ptype A, bsRect ABCD, int uR, int vR, ptype cosAlpha, ptype cosBeta, int nl, ptype j_conv = 1.4)
 {
 	/*Compute the scalar focused field using Debye focusing
@@ -151,7 +152,6 @@ __global__ void gpuScalarUf(bsComplex* Uf, bsVector k, ptype kmag, bsPoint f, pt
 		}
  
 		sumUf += il * jl * Pl * (Palpha[1] - Palpha[2] - Pbeta[1] + Pbeta[2]);
-		//sumUf += il * Pl * (Palpha[1] - Palpha[2] - Pbeta[1] + Pbeta[2]);
  
 		il *= im;
 	}
@@ -162,21 +162,12 @@ __global__ void gpuScalarUf(bsComplex* Uf, bsVector k, ptype kmag, bsPoint f, pt
  
 void nearfieldStruct::scalarUf()
 {
-	//Compute the incident field via a scalar simulation
-	//This method uses Debye focusing to approximate the field analytically
-
-	//time the calculation of the focused field
-	//gpuStartTimer();
-
-	//set the field slice to a scalar field
-	//Uf.scalarField = true;
-
-	//initialize the GPU arrays
-	//Uf.init_gpu();
+
+    gpuStartTimer();
  
 	//create one thread for each pixel of the field slice
 	dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
-	dim3 dimGrid((Uf.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (Uf.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
+	dim3 dimGrid((Uf.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (Uf.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
  
 	//if we are computing a plane wave, call the gpuScalarUfp function
 	if(planeWave)
@@ -191,10 +182,7 @@ void nearfieldStruct::scalarUf()
 		ptype cosBeta = cos(asin(condenser[1]));
 		//compute the scalar Uf field (this will be in the x_hat channel of Uf)
 		gpuScalarUf<<<dimGrid, dimBlock>>>(Uf.x_hat, k, 2 * PI / lambda, focus, A, pos, Uf.R[0], Uf.R[1], cosAlpha, cosBeta, m);
-	}
-
-	//float t = gpuStopTimer();
-	//std::cout<<"Scalar Uf Time: "<<t<<"ms"<<std::endl;
-	//std::cout<<focus<<std::endl;
-
+	}
+
+	t_Uf = gpuStopTimer();
 }
+#include "nearfield.h"
+
+#include "rts/math/legendre.h"
+#include "rts/cuda/error.h"
+#include "rts/cuda/timer.h"
+
+texture<float, cudaTextureType2D> texJ;
+
+__global__ void gpuScalarUfp(bsComplex* Uf, bsVector k, ptype kmag, bsPoint f, ptype A, bsRect ABCD, int uR, int vR);
+
+__global__ void gpuScalarUfLut(bsComplex* Uf, bsRect ABCD, int uR, int vR, bsPoint f, bsVector k, ptype A, ptype cosAlpha, ptype cosBeta, int nl, ptype dmin, ptype dmax, int dR)
+{
+    /*This function computes the focused field for a 2D slice
+
+    Uf      =   destination field slice
+    ABCD    =   plane representing the field slice in world space
+    uR, vR  =   resolution of the Uf field
+    f       =   focal point of the condenser
+    k       =   direction of the incident light
+    A       =   amplitude of the incident field
+    cosAlpha=   cosine of the solid angle subtended by the condenser obscuration
+    cosBeta =   cosine of the solid angle subtended by the condenser aperature
+    nl      =   number of orders used to compute the field
+    dR      =   number of Bessel function values in the look-up texture
+
+    */
+
+    //get the current coordinate in the plane slice
+	int iu = blockIdx.x * blockDim.x + threadIdx.x;
+	int iv = blockIdx.y * blockDim.y + threadIdx.y;
+
+	//make sure that the thread indices are in-bounds
+	if(iu >= uR || iv >= vR) return;
+
+	//compute the index (easier access to the scalar field array)
+	int i = iv*uR + iu;
+
+	//compute the parameters for u and v
+	ptype u = (ptype)iu / (uR);
+	ptype v = (ptype)iv / (vR);
+
+	
+
+	//get the rtsPoint in world space and then the r vector
+	bsPoint p = ABCD(u, v);
+	bsVector r = p - f;
+	ptype d = r.len();
+
+	if(d == 0)
+	{
+        Uf[i] = A * 2 * PI * (cosAlpha - cosBeta);
+        return;
+    }
+
+	//get info for the light direction and frequency
+	r = r.norm();
+
+	//compute the imaginary factor i^l
+	bsComplex im = bsComplex(0, 1);
+	bsComplex il = bsComplex(1, 0);
+
+	//Legendre functions are computed dynamically to save memory
+	//initialize the Legendre functions
+
+	ptype P[2];
+	//get the angle between k and r (light direction and position vector)
+	ptype cosTheta;
+	cosTheta = k.dot(r);
+
+	rts::init_legendre<ptype>(cosTheta, P[0], P[1]);
+
+	//initialize legendre functions for the cassegrain angles
+	ptype Palpha[3];
+	rts::init_legendre<ptype>(cosAlpha, Palpha[0], Palpha[1]);
+	Palpha[2] = 1;
+
+	ptype Pbeta[3];
+	rts::init_legendre<ptype>(cosBeta, Pbeta[0], Pbeta[1]);
+	Pbeta[2] = 1;
+
+	//for each order l
+	bsComplex sumUf(0.0, 0.0);
+	ptype jl = 0.0;
+	ptype Pl;
+	ptype di = ( (d - dmin)/(dmax - dmin) ) * (dR - 1);
+	for(int l = 0; l<=nl; l++)
+	{
+        jl = tex2D(texJ, l + 0.5, di + 0.5);
+		if(l==0)
+			Pl = P[0];
+		else if(l==1)
+		{
+			Pl = P[1];
+
+			//adjust the cassegrain Legendre function
+			Palpha[2] = Palpha[0];
+			rts::shift_legendre<ptype>(l+1, cosAlpha, Palpha[0], Palpha[1]);
+			Pbeta[2] = Pbeta[0];
+			rts::shift_legendre<ptype>(l+1, cosBeta, Pbeta[0], Pbeta[1]);
+		}
+		else
+		{
+			rts::shift_legendre<ptype>(l, cosTheta, P[0], P[1]);
+
+			Pl = P[1];
+
+			//adjust the cassegrain outer Legendre function
+			Palpha[2] = Palpha[0];
+			rts::shift_legendre<ptype>(l+1, cosAlpha, Palpha[0], Palpha[1]);
+			Pbeta[2] = Pbeta[0];
+			rts::shift_legendre<ptype>(l+1, cosBeta, Pbeta[0], Pbeta[1]);
+		}
+
+		sumUf += il * jl * Pl * (Palpha[1] - Palpha[2] - Pbeta[1] + Pbeta[2]);
+		//sumUf += jl;
+
+		il *= im;
+	}
+
+	Uf[i] = sumUf * 2 * PI * A;
+	//Uf[i] = u;
+	//return;
+}
+
+void nearfieldStruct::scalarUfLut()
+{
+    gpuStartTimer();
+	
+    //calculate the minimum and maximum points in the focused field
+    d_min = pos.dist(focus);
+    d_max = pos.dist_max(focus);
+
+    //allocate space for the Bessel function
+    int dR = 2 * max(Uf.R[0], Uf.R[1]);
+    ptype* j = NULL;
+	j = (ptype*) malloc(sizeof(ptype) * dR * (m+1));
+
+	//calculate Bessel function LUT
+	calcBesselLut(j, d_min, d_max, dR);
+	
+    //create a CUDA array structure and specify the format description
+	cudaArray* arrayJ;
+    cudaChannelFormatDesc channelDesc =
+        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+	
+    //allocate memory
+    HANDLE_ERROR(cudaMallocArray(&arrayJ, &channelDesc, m+1, dR));
+	
+    //specify texture properties
+    texJ.addressMode[0] = cudaAddressModeMirror;
+    texJ.addressMode[1] = cudaAddressModeMirror;
+    texJ.filterMode     = cudaFilterModeLinear;
+    texJ.normalized     = false;
+
+    //bind the texture to the array
+    HANDLE_ERROR(cudaBindTextureToArray(texJ, arrayJ, channelDesc));
+
+    //copy the CPU Bessel LUT to the GPU-based array
+    HANDLE_ERROR( cudaMemcpy2DToArray(arrayJ, 0, 0, j, (m+1)*sizeof(float), (m+1)*sizeof(float), dR, cudaMemcpyHostToDevice));
+
+    //----------------Compute the focused field
+    //create one thread for each pixel of the field slice
+	dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
+	dim3 dimGrid((Uf.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (Uf.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
+
+	//if we are computing a plane wave, call the gpuScalarUfp function
+	if(planeWave)
+	{
+		gpuScalarUfp<<<dimGrid, dimBlock>>>(Uf.x_hat, k, 2 * PI / lambda, focus, A, pos, Uf.R[0], Uf.R[1]);
+	}
+	//otherwise compute the condenser info and create a focused field
+	else
+	{
+		//pre-compute the cosine of the obscuration and objective angles
+		ptype cosAlpha = cos(asin(condenser[0]));
+		ptype cosBeta = cos(asin(condenser[1]));
+		//compute the scalar Uf field (this will be in the x_hat channel of Uf)
+		gpuScalarUfLut<<<dimGrid, dimBlock>>>(Uf.x_hat, pos, Uf.R[0], Uf.R[1], focus, k, A, cosAlpha, cosBeta, m, d_min, d_max, dR);
+	}
+
+	
+    //free everything
+	free(j);
+	
+	HANDLE_ERROR(cudaFreeArray(arrayJ));
+
+	t_Uf = gpuStopTimer();
+}
+#include "nearfield.h"
+#include "rts/math/spherical_bessel.h"
+#include "rts/math/legendre.h"
+#include <stdlib.h>
+#include "rts/cuda/error.h"
+#include "rts/cuda/timer.h"
+
+texture<float2, cudaTextureType2D> texUsp;
+texture<float2, cudaTextureType2D> texUip;
+
+__global__ void gpuScalarUpLut(bsComplex* Us, bsVector* k, int nk, ptype kmag, ptype a, ptype dmin, ptype dmax, bsPoint f, bsPoint ps, ptype A, bsRect ABCD, int uR, int vR, int dR, int aR, int thetaR)
+{
+    /*This function uses Monte-Carlo integration to sample a texture-based LUT describing the scattered field
+        produced by a plane wave through a sphere.  The MC sampling is used to approximate a focused field.
+
+        Us  =   final scattered field
+        k   =   list of incoming plane waves (Monte-Carlo samples)
+        nk  =   number of incoming MC samples
+        kmag=   magnitude of the incoming field 2pi/lambda
+        dmin=   minimum distance of the Usp texture
+        dmax=   maximum distance of the Usp texture
+        f   =   position of the focus
+        ps  =   position of the sphere
+        A   =   total amplitude of the incident field arriving at the focal spot
+        ABCD=   rectangle representing the field slice
+        uR  =   resolution of the field slice in the u direction
+        vR  =   resolution of the field slice in the v direction
+        dR  =   resolution of the Usp texture in the d direction
+        thetaR= resolution of the Usp texture in the theta direction
+    */
+
+	//get the current coordinate in the plane slice
+	int iu = blockIdx.x * blockDim.x + threadIdx.x;
+	int iv = blockIdx.y * blockDim.y + threadIdx.y;
+
+	//make sure that the thread indices are in-bounds
+	if(iu >= uR || iv >= vR) return;
+
+	//compute the index (easier access to the scalar field array)
+	int i = iv*uR + iu;
+
+	//compute the parameters for u and v
+	ptype u = (ptype)iu / (uR);
+	ptype v = (ptype)iv / (vR);
+
+	//get the rtsPoint in world space and then the r vector
+	bsPoint p = ABCD(u, v);
+	bsVector r = p - ps;
+	ptype d = r.len();
+	float di = ( (d - max(a, dmin))/(dmax - max(a, dmin)) ) * (dR - 1);
+	float ai = ( (d - dmin)/(a - dmin)) * (aR - 1);
+
+    bsComplex sumUs(0, 0);
+    //for each plane wave in the wave list
+    for(int iw = 0; iw < nk; iw++)
+    {
+        //normalize the direction vectors and find their inner product
+        r = r.norm();
+        ptype cos_theta = k[iw].dot(r);
+        if(cos_theta < -1)
+            cos_theta = -1;
+        if(cos_theta > 1)
+            cos_theta = 1;
+        float thetai = ( acos(cos_theta) / PI ) * (thetaR - 1);
+
+        //compute the phase factor for spheres that are not at the origin
+        bsVector c = ps - f;
+        bsComplex phase = exp(bsComplex(0, kmag * k[iw].dot(c)));
+
+        //compute the internal field if we are inside a sphere
+        if(d < a)
+        {
+			float2 Uip = tex2D(texUip, ai + 0.5, thetai + 0.5);
+			sumUs += (1.0/nk) * A * phase * bsComplex(Uip.x, Uip.y);
+        }
+        //otherwise compute the scattered field
+        else
+        {
+            float2 Usp = tex2D(texUsp, di + 0.5, thetai + 0.5);
+            sumUs += (1.0/nk) * A * phase * bsComplex(Usp.x, Usp.y);
+        }
+
+    }
+
+    Us[i] += sumUs;
+}
+
+void nearfieldStruct::scalarUpLut()
+{
+    //get the number of spheres
+	int nSpheres = sVector.size();
+
+	//if there are no spheres, nothing to do here
+	if(nSpheres == 0)
+		return;
+
+	//time the calculation of the focused field
+	gpuStartTimer();
+
+	//clear the scattered field
+	U.clear_gpu();
+
+	//create one thread for each pixel of the field slice
+	dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
+	dim3 dimGrid((U.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (U.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
+
+    //copy Monte-Carlo samples to the GPU and determine the incident amplitude (plane-wave specific stuff)
+    bsVector* gpuk;
+    int nWaves;
+    ptype subA;
+    if(planeWave)
+    {
+        nWaves = 1;
+        HANDLE_ERROR(cudaMalloc( (void**)&gpuk, sizeof(bsVector) ) );
+        HANDLE_ERROR(cudaMemcpy( gpuk, &k, sizeof(bsVector), cudaMemcpyHostToDevice));
+        subA = A;
+    }
+    else
+    {
+        nWaves = inWaves.size();
+        HANDLE_ERROR(cudaMalloc( (void**)&gpuk, sizeof(bsVector) * nWaves ) );
+        HANDLE_ERROR(cudaMemcpy( gpuk, &inWaves[0], sizeof(bsVector) * nWaves, cudaMemcpyHostToDevice));
+        //compute the amplitude that makes it through the condenser
+        subA = 2 * PI * A * ( (1 - cos(asin(condenser[1]))) - (1 - cos(asin(condenser[0]))) );
+    }
+
+	//for each sphere
+	for(int s = 0; s<nSpheres; s++)
+	{
+        //get the current sphere
+		//sphere S = sVector[s];
+
+        //allocate space for the Usp and Uip textures
+        //allocate the cuda array
+        cudaArray* arrayUsp;
+		cudaArray* arrayUip;
+        cudaChannelFormatDesc channelDescUsp =
+            cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
+		cudaChannelFormatDesc channelDescUip =
+            cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
+        int dR = sVector[s].Usp.R[0];
+        int thetaR = sVector[s].Usp.R[1];
+		int aR = sVector[s].Uip.R[0];
+        HANDLE_ERROR(cudaMallocArray(&arrayUsp, &channelDescUsp, dR, thetaR));
+		HANDLE_ERROR(cudaMallocArray(&arrayUip, &channelDescUip, aR, thetaR));
+
+        texUsp.addressMode[0] = cudaAddressModeMirror;
+        texUsp.addressMode[1] = cudaAddressModeMirror;
+        texUsp.filterMode     = cudaFilterModeLinear;
+        texUsp.normalized     = false;
+
+		texUip.addressMode[0] = cudaAddressModeMirror;
+        texUip.addressMode[1] = cudaAddressModeMirror;
+        texUip.filterMode     = cudaFilterModeLinear;
+        texUip.normalized     = false;
+        HANDLE_ERROR(cudaBindTextureToArray(texUsp, arrayUsp, channelDescUsp));
+		HANDLE_ERROR(cudaBindTextureToArray(texUip, arrayUip, channelDescUip));
+
+        //copy the LUT to the Usp texture
+        HANDLE_ERROR( cudaMemcpy2DToArray(arrayUsp, 0, 0, sVector[s].Usp.x_hat, dR*sizeof(float2), dR*sizeof(float2), thetaR, cudaMemcpyDeviceToDevice));
+		HANDLE_ERROR( cudaMemcpy2DToArray(arrayUip, 0, 0, sVector[s].Uip.x_hat, aR*sizeof(float2), aR*sizeof(float2), thetaR, cudaMemcpyDeviceToDevice));
+
+        gpuScalarUpLut<<<dimGrid, dimBlock>>>(U.x_hat,
+                                            gpuk,
+                                            nWaves,
+                                            2 * PI / lambda,
+                                            sVector[s].a,
+                                            sVector[s].d_min,
+                                            sVector[s].d_max,
+                                            focus,
+                                            sVector[s].p,
+                                            subA,
+                                            pos,
+                                            U.R[0],
+                                            U.R[1],
+                                            dR,
+                                            aR,
+											thetaR);
+
+        cudaFreeArray(arrayUsp);
+        cudaFreeArray(arrayUip);
+
+	}
+
+
+    //store the time to compute the scattered field
+	t_Us = gpuStopTimer();
+
+	//free monte-carlo samples
+	cudaFree(gpuk);
+
+}
@@ -163,7 +163,7 @@ void nearfieldStruct::scalarUs()
 		return;
  
 	//time the calculation of the focused field
-	//gpuStartTimer();
+	gpuStartTimer();
  
 	//clear the scattered field
 	U.clear_gpu();
@@ -251,9 +251,8 @@ void nearfieldStruct::scalarUs()
 	}
  
  
+    //store the time to compute the scattered field
+	t_Us = gpuStopTimer();
  
-	//float t = gpuStopTimer();
-	//std::cout<<"Scalar Us Time: "<<t<<"ms"<<std::endl;
-	//std::cout<<focus<<std::endl;
  
 }
@@ -32,7 +32,7 @@ __global__ void gpuScalarUsp(bsComplex* Ufx, bsComplex* Ufy, bsComplex* Ufz,
 	{
 		r = p - ps[is];
 		d = r.len();
-		if(d <= as[is])
+		if(d < as[is])
 			return;
 	}
  
@@ -110,8 +110,5 @@ void nearfieldStruct::sumUf()
     HANDLE_ERROR(cudaFree(gpu_p));
     HANDLE_ERROR(cudaFree(gpu_a));
  
-	//float t = gpuStopTimer();
-	//std::cout<<"Add Us Time: "<<t<<"ms"<<std::endl;
-	//std::cout<<focus<<std::endl;
  
 }
@@ -5,7 +5,7 @@
  
 #include "nearfield.h"
 #include "microscope.h"
-#include "colormap.h"
+#include "rts/graphics/colormap.h"
 #include "fileout.h"
 //extern nearfieldStruct* NF;
 extern microscopeStruct* SCOPE;
@@ -23,7 +23,179 @@ using namespace std;
 #include <boost/program_options.hpp>
 namespace po = boost::program_options;
  
-static void loadSpheres(string sphereList)
+extern bool verbose;
+
+
+
+static void lNearfield(po::variables_map vm)
+{
+	//test to see if we are simulating a plane wave
+	bool planeWave = DEFAULT_PLANEWAVE;
+	if(vm.count("plane-wave"))
+		planeWave = !planeWave;
+	SCOPE->nf.planeWave = planeWave;
+
+	//get the incident field amplitude
+	SCOPE->nf.A = vm["amplitude"].as<ptype>();
+
+	//get the condenser parameters
+    SCOPE->nf.condenser[0] = DEFAULT_CONDENSER_MIN;
+    SCOPE->nf.condenser[1] = DEFAULT_CONDENSER_MAX;
+
+    if(vm.count("condenser"))
+    {
+        vector<ptype> cparams = vm["condenser"].as< vector<ptype> >();
+
+        if(cparams.size() == 1)
+            SCOPE->nf.condenser[1] = cparams[0];
+        else
+        {
+            SCOPE->nf.condenser[0] = cparams[0];
+            SCOPE->nf.condenser[1] = cparams[1];
+        }
+    }
+
+
+	//get the focal rtsPoint position
+    SCOPE->nf.focus[0] = DEFAULT_FOCUS_X;
+    SCOPE->nf.focus[1] = DEFAULT_FOCUS_Y;
+    SCOPE->nf.focus[2] = DEFAULT_FOCUS_Z;
+    if(vm.count("focus"))
+    {
+        vector<ptype> fpos = vm["focus"].as< vector<ptype> >();
+        if(fpos.size() != 3)
+        {
+            cout<<"BIMSIM Error - the incident focal point is incorrectly specified; it must have three components."<<endl;
+            exit(1);
+        }
+        SCOPE->nf.focus[0] = fpos[0];
+        SCOPE->nf.focus[1] = fpos[1];
+        SCOPE->nf.focus[2] = fpos[2];
+    }
+
+	//get the incident light direction (k-vector)
+	bsVector spherical(1, 0, 0);
+
+    //if a k-vector is specified
+    if(vm.count("k"))
+    {
+        vector<ptype> kvec = vm["k"].as< vector<ptype> >();
+        if(kvec.size() != 2)
+        {
+            cout<<"BIMSIM Error - k-vector is not specified correctly: it must contain two elements"<<endl;
+            exit(1);
+        }
+        spherical[1] = kvec[0];
+        spherical[2] = kvec[1];
+    }
+	SCOPE->nf.k = spherical.sph2cart();
+
+
+    //incident field order
+    SCOPE->nf.m = vm["field-order"].as<int>();
+
+    //number of Monte-Carlo samples
+    SCOPE->nf.nWaves = vm["samples"].as<int>();
+
+	//random number seed for Monte-Carlo samples
+	if(vm.count("seed"))
+		srand(vm["seed"].as<unsigned int>());
+
+
+
+}
+
+
+static void loadOutputParams(po::variables_map vm)
+{
+    //append simulation results to previous binary files
+    gFileOut.append = DEFAULT_APPEND;
+    if(vm.count("append"))
+        gFileOut.append = true;
+
+	//image parameters
+	//component of the field to be saved
+	std::string fieldStr;
+    fieldStr = vm["output-type"].as<string>();
+
+    if(fieldStr == "magnitude")
+        gFileOut.field = fileoutStruct::fieldMag;
+    else if(fieldStr == "intensity")
+        gFileOut.field = fileoutStruct::fieldIntensity;
+    else if(fieldStr == "polarization")
+        gFileOut.field = fileoutStruct::fieldPolar;
+    else if(fieldStr == "imaginary")
+        gFileOut.field = fileoutStruct::fieldImag;
+    else if(fieldStr == "real")
+        gFileOut.field = fileoutStruct::fieldReal;
+    else if(fieldStr == "angular-spectrum")
+        gFileOut.field = fileoutStruct::fieldAngularSpectrum;
+
+
+	//image file names
+	gFileOut.intFile = vm["intensity"].as<string>();
+	gFileOut.absFile = vm["absorbance"].as<string>();
+	gFileOut.transFile = vm["transmittance"].as<string>();
+	gFileOut.nearFile = vm["near-field"].as<string>();
+	gFileOut.farFile = vm["far-field"].as<string>();
+
+	//colormap
+	std::string cmapStr;
+    cmapStr = vm["colormap"].as<string>();
+    if(cmapStr == "brewer")
+        gFileOut.colormap = rts::cmBrewer;
+    else if(cmapStr == "gray")
+        gFileOut.colormap = rts::cmGrayscale;
+    else
+        cout<<"color-map value not recognized (using default): "<<cmapStr<<endl;
+}
+
+void lFlags(po::variables_map vm, po::options_description desc)
+{
+    //display help and exit
+	if(vm.count("help"))
+	{
+		cout<<desc<<endl;
+		exit(1);
+	}
+
+    //flag for verbose output
+	if(vm.count("verbose"))
+        verbose = true;
+
+    if(vm.count("recursive"))
+    {
+        SCOPE->nf.lut_us = false;
+        SCOPE->nf.lut_uf = false;
+    }
+    else if(vm.count("recursive-us"))
+    {
+        SCOPE->nf.lut_us = false;
+    }
+    else if(vm.count("lut-uf"))
+    {
+        SCOPE->nf.lut_uf = true;
+    }
+}
+
+void lWavelength(po::variables_map vm)
+{
+    //load the wavelength
+	if(vm.count("nu"))
+	{
+		//wavelength is given in wavenumber - transform and flag
+		SCOPE->nf.lambda = 10000/vm["nu"].as<ptype>();
+		gFileOut.wavenumber = true;
+	}
+	//otherwise we are using lambda = wavelength
+	else
+	{
+		SCOPE->nf.lambda = vm["lambda"].as<ptype>();
+		gFileOut.wavenumber = false;
+	}
+}
+
+static void lSpheres(string sphereList)
 {
     /*This function loads a list of sphere given in the string sphereList
         The format is:
@@ -58,17 +230,60 @@ static void loadSpheres(string sphereList)
         //check out the next element (this should set the EOF error flag)
         ss.peek();
     }
+}
  
+void lSpheres(po::variables_map vm)
+{
+    //if a sphere is specified at the command line
+    if(vm.count("spheres"))
+    {
+        //convert the sphere to a string
+        vector<ptype> sdesc = vm["spheres"].as< vector<ptype> >();
  
+        //compute the number of spheres specified
+        unsigned int nS;
+        if(sdesc.size() <= 5)
+            nS = 1;
+        else
+        {
+            //if the number of parameters is divisible by 4, compute the number of spheres
+            if(sdesc.size() % 5 == 0)
+                nS = sdesc.size() / 5;
+            else
+            {
+                cout<<"BIMSIM Error: Invalid number of sphere parameters."<<endl;
+                exit(1);
+            }
+        }
  
-}
+        stringstream ss;
+
+        //for each sphere
+        for(unsigned int s=0; s<nS; s++)
+        {
+            //compute the number of sphere parameters
+            unsigned int nP;
+            if(nS == 1) nP = sdesc.size();
+            else nP = 5;
+
+            //store each parameter as a string
+            for(unsigned int i=0; i<nP; i++)
+            {
+                ss<<sdesc[s*5 + i]<<" ";
+            }
+            ss<<endl;
+        }
+
+
+
+        //convert the string to a sphere list
+        lSpheres(ss.str());
+    }
  
-static void loadSpheres(po::variables_map vm)
-{
     //if a files are specified
     if(vm.count("sphere-file"))
     {
-        cout<<"Sphere files detected."<<endl;
+
         vector<string> filenames = vm["sphere-file"].as< vector<string> >();
         //load each file
         for(int iS=0; iS<filenames.size(); iS++)
@@ -85,69 +300,51 @@ static void loadSpheres(po::variables_map vm)
             std::string instr((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
  
             //load the list of spheres from a string
-            loadSpheres(instr);
+            lSpheres(instr);
         }
     }
  
-    //load the sphere from the command line
-    if(vm.count("sx") || vm.count("sy") || vm.count("sz") || vm.count("s"))
-    {
-        //create a new sphere
-        sphere newS;
-
-		//set defaults
-		if(vm.count("sx"))
-            newS.p[0] = vm["sx"].as<ptype>();
-        else
-            newS.p[0] = DEFAULT_SPHERE_X;
-
-
-		if(vm.count("sy"))
-            newS.p[1] = vm["sy"].as<ptype>();
-        else
-            newS.p[1] = DEFAULT_SPHERE_Y;
-
-		if(vm.count("sz"))
-            newS.p[2] = vm["sz"].as<ptype>();
-        else
-            newS.p[2] = DEFAULT_SPHERE_Z;
-
-		if(vm.count("radius"))
-            newS.a = vm["radius"].as<ptype>();
-        else
-            newS.a = DEFAULT_SPHERE_A;
-
-        //add the sphere to the sphere vector
-        SCOPE->nf.sVector.push_back(newS);
+    //make sure the appropriate materials are loaded
+    unsigned int nS = SCOPE->nf.sVector.size();
  
+    //for each sphere
+    for(unsigned int s = 0; s<nS; s++)
+    {
+        //make sure the corresponding material exists
+        if(SCOPE->nf.sVector[s].iMaterial + 1 > SCOPE->nf.mVector.size())
+        {
+            //otherwise output an error
+            cout<<"BIMSIM Error - A material is not loaded for sphere "<<s+1<<"."<<endl;
+            exit(1);
+        }
     }
 }
  
-static void loadMaterials(po::variables_map vm)
+static void lMaterials(po::variables_map vm)
 {
 	//if materials are specified at the command line
 	if(vm.count("materials"))
 	{
 		vector<ptype> matVec = vm["materials"].as< vector<ptype> >();
-		if(matVec.size() %2 != 0)
+		if(matVec.size() == 1)
+		{
+			rts::material<ptype> newM(SCOPE->nf.lambda, matVec[0], 0);
+			SCOPE->nf.mVector.push_back(newM);
+		}
+		else if(matVec.size() %2 != 0)
 		{
 			cout<<"BIMSim Error: materials must be specified in n, k pairs"<<endl;
 			exit(1);
 		}
-
-
-		for(int i=0; i<matVec.size(); i+=2)
+		else
 		{
-			rts::material<ptype> newM(SCOPE->nf.lambda, matVec[i], matVec[i+1]);
-			SCOPE->nf.mVector.push_back(newM);
+			for(int i=0; i<matVec.size(); i+=2)
+			{
+				rts::material<ptype> newM(SCOPE->nf.lambda, matVec[i], matVec[i+1]);
+				SCOPE->nf.mVector.push_back(newM);
+			}
 		}
 	}
-	else
-	{
-		//add the command line material as the default (material 0)
-		rts::material<ptype> newM(SCOPE->nf.lambda, vm["n"].as<ptype>(), vm["k"].as<ptype>());
-		SCOPE->nf.mVector.push_back(newM);
-	}
  
 	//if file names are specified, load the materials
 	if(vm.count("material-file"))
@@ -169,57 +366,109 @@ static void loadMaterials(po::variables_map vm)
  
 }
  
-static void loadNearfieldParams(po::variables_map vm)
+static void lOptics(po::variables_map vm)
 {
-	//test to see if we are simulating a plane wave
-	bool planeWave = DEFAULT_PLANEWAVE;
-	if(vm.count("plane-wave"))
-		planeWave = !planeWave;
-	SCOPE->nf.planeWave = planeWave;
-
-	//get the wavelength
-    //SCOPE->nf.lambda = vm["lambda"].as<ptype>();
-
-	//get the incident field amplitude
-	SCOPE->nf.A = vm["amplitude"].as<ptype>();
-
-	//get the condenser parameters
-    SCOPE->nf.condenser[0] = vm["condenser-min"].as<ptype>();
-    SCOPE->nf.condenser[1] = vm["condenser-max"].as<ptype>();
-
-
-	//get the focal rtsPoint position
-    SCOPE->nf.focus[0] = vm["fx"].as<ptype>();
-    SCOPE->nf.focus[1] = vm["fy"].as<ptype>();
-    SCOPE->nf.focus[2] = vm["fz"].as<ptype>();
-
-	//get the incident light direction (k-vector)
-	bsVector spherical;
-	spherical[0] = 1.0;
-    spherical[1] = vm["theta"].as<ptype>();
-    spherical[2] = vm["phi"].as<ptype>();
-	SCOPE->nf.k = spherical.sph2cart();
-
-
-    //incident field order
-    SCOPE->nf.m = vm["field-order"].as<int>();
-
-    //number of Monte-Carlo samples
-    SCOPE->nf.nWaves = vm["samples"].as<int>();
-
-
+    SCOPE->objective[0] = DEFAULT_OBJECTIVE_MIN;
+    SCOPE->objective[1] = DEFAULT_OBJECTIVE_MAX;
+    if(vm.count("objective"))
+    {
+        vector<ptype> oparams = vm["objective"].as< vector<ptype> >();
  
+        if(oparams.size() == 1)
+            SCOPE->objective[1] = oparams[0];
+        else
+        {
+            SCOPE->objective[0] = oparams[0];
+            SCOPE->objective[1] = oparams[1];
+        }
+    }
 }
  
-static void loadSliceParams(po::variables_map vm)
+static void lImagePlane(po::variables_map vm)
 {
-    //parameters for the sample plane
-
+	bsPoint pMin(DEFAULT_PLANE_MIN_X, DEFAULT_PLANE_MIN_Y, DEFAULT_PLANE_MIN_Z);
+	bsPoint pMax(DEFAULT_PLANE_MAX_X, DEFAULT_PLANE_MAX_Y, DEFAULT_PLANE_MAX_Z);
+	bsVector normal(DEFAULT_PLANE_NORM_X, DEFAULT_PLANE_NORM_Y, DEFAULT_PLANE_NORM_Z);
  
 	//set the default values for the slice position and orientation
-	bsPoint pMin(vm["plane-min-x"].as<ptype>(), vm["plane-min-y"].as<ptype>(), vm["plane-min-z"].as<ptype>());
-	bsPoint pMax(vm["plane-max-x"].as<ptype>(), vm["plane-max-y"].as<ptype>(), vm["plane-max-z"].as<ptype>());
-	bsVector normal(vm["plane-norm-x"].as<ptype>(), vm["plane-norm-y"].as<ptype>(), vm["plane-norm-z"].as<ptype>());
+	if(vm.count("plane-lower-left") && vm.count("plane-upper-right") && vm.count("plane-normal"))
+	{
+		vector<ptype> ll = vm["plane-lower-left"].as< vector<ptype> >();
+		if(ll.size() != 3)
+		{
+			cout<<"BIMSIM Error - The lower-left corner of the image plane is incorrectly specified."<<endl;
+			exit(1);
+		}
+
+		vector<ptype> ur = vm["plane-lower-left"].as< vector<ptype> >();
+		if(ur.size() != 3)
+		{
+			cout<<"BIMSIM Error - The upper-right corner of the image plane is incorrectly specified."<<endl;
+			exit(1);
+		}
+
+		vector<ptype> norm = vm["plane-lower-left"].as< vector<ptype> >();
+		if(norm.size() != 3)
+		{
+			cout<<"BIMSIM Error - The normal of the image plane is incorrectly specified."<<endl;
+			exit(1);
+		}
+
+		pMin = bsPoint(ll[0], ll[1], ll[2]);
+		pMax = bsPoint(ur[0], ur[1], ur[2]);
+		normal = bsVector(norm[0], norm[1], norm[2]);
+	}
+	else if(vm.count("xy"))
+	{
+		//default plane size in microns
+		ptype s = DEFAULT_PLANE_SIZE;
+		ptype pos = DEFAULT_PLANE_POSITION;
+
+		vector<ptype> xy = vm["xy"].as< vector<ptype> >();
+		if(xy.size() >= 1)
+			s = xy[0];
+		if(xy.size() >= 2)
+			pos = xy[1];
+
+		//calculate the plane corners and normal based on the size and position
+		pMin = bsPoint(-s/2, -s/2, pos);
+		pMax = bsPoint(s/2, s/2, pos);
+		normal = bsVector(0, 0, 1);
+	}
+	else if(vm.count("xz"))
+	{
+		//default plane size in microns
+		ptype size = DEFAULT_PLANE_SIZE;
+		ptype pos = DEFAULT_PLANE_POSITION;
+
+		vector<ptype> xz = vm["xz"].as< vector<ptype> >();
+		if(xz.size() >= 1)
+			size = xz[0];
+		if(xz.size() >= 2)
+			pos = xz[1];
+
+		//calculate the plane corners and normal based on the size and position
+		pMin = bsPoint(-size/2, pos, -size/2);
+		pMax = bsPoint(size/2, pos, size/2);
+		normal = bsVector(0, -1, 0);
+	}
+	else if(vm.count("yz"))
+	{
+		//default plane size in microns
+		ptype size = DEFAULT_PLANE_SIZE;
+		ptype pos = DEFAULT_PLANE_POSITION;
+
+		vector<ptype> yz = vm["yz"].as< vector<ptype> >();
+		if(yz.size() >= 1)
+			size = yz[0];
+		if(yz.size() >= 2)
+			pos = yz[1];
+
+		//calculate the plane corners and normal based on the size and position
+		pMin = bsPoint(pos, -size/2, -size/2);
+		pMax = bsPoint(pos, size/2, size/2);
+		normal = bsVector(1, 0, 0);
+	}
 	SCOPE->setPos(pMin, pMax, normal);
  
 	//resolution
@@ -233,175 +482,111 @@ static void loadSliceParams(po::variables_map vm)
  
  
 	SCOPE->setNearfield();
-
-
-
-}
-
-static void loadMicroscopeParams(po::variables_map vm)
-{
-    //objective
-    SCOPE->objective[0] = vm["objective-min"].as<ptype>();
-    SCOPE->objective[1] = vm["objective-max"].as<ptype>();
-
-
-
-
-
-}
-
-static void loadOutputParams(po::variables_map vm)
-{
-    //append simulation results to previous binary files
-    gFileOut.append = DEFAULT_APPEND;
-    if(vm.count("append"))
-        gFileOut.append = true;
-
-	//image parameters
-	//component of the field to be saved
-	std::string fieldStr;
-    fieldStr = vm["output-type"].as<string>();
-
-    if(fieldStr == "magnitude")
-        gFileOut.field = fileoutStruct::fieldMag;
-    else if(fieldStr == "intensity")
-        gFileOut.field = fileoutStruct::fieldIntensity;
-    else if(fieldStr == "polarization")
-        gFileOut.field = fileoutStruct::fieldPolar;
-    else if(fieldStr == "imaginary")
-        gFileOut.field = fileoutStruct::fieldImag;
-    else if(fieldStr == "real")
-        gFileOut.field = fileoutStruct::fieldReal;
-    else if(fieldStr == "angular-spectrum")
-        gFileOut.field = fileoutStruct::fieldAngularSpectrum;
-
-
-	//image file names
-	gFileOut.intFile = vm["intensity"].as<string>();
-	gFileOut.absFile = vm["absorbance"].as<string>();
-	gFileOut.transFile = vm["transmittance"].as<string>();
-	gFileOut.nearFile = vm["near-field"].as<string>();
-	gFileOut.farFile = vm["far-field"].as<string>();
-
-	//colormap
-	std::string cmapStr;
-    cmapStr = vm["colormap"].as<string>();
-    if(cmapStr == "brewer")
-        gFileOut.colormap = rts::colormap::cmBrewer;
-    else if(cmapStr == "gray")
-        gFileOut.colormap = rts::colormap::cmGrayscale;
-    else
-        cout<<"color-map value not recognized (using default): "<<cmapStr<<endl;
 }
  
 static void OutputOptions()
 {
-	cout<<SCOPE->nf.toStr();
+	cout<<SCOPE->toStr();
  
 	cout<<"# of source points: "<<SCOPE->focalPoints.size()<<endl;
  
 }
  
+vector<ptype> test;
 static void SetOptions(po::options_description &desc)
 {
 	desc.add_options()
-		("help,h", "prints this help")
-		("plane-wave,P", "simulates an incident plane wave")
-		("intensity,I", po::value<string>()->default_value(DEFAULT_INTENSITY_FILE), "output measured intensity (filename)")
-		("absorbance,A", po::value<string>()->default_value(DEFAULT_ABSORBANCE_FILE), "output measured absorbance (filename)")
-		("transmittance,T", po::value<string>()->default_value(DEFAULT_TRANSMITTANCE_FILE), "output measured transmittance (filename)")
-		("far-field,F", po::value<string>()->default_value(DEFAULT_FAR_FILE), "output far-field at detector (filename)")
-		("near-field,N", po::value<string>()->default_value(DEFAULT_NEAR_FILE), "output field at focal plane (filename)")
-		("extended-source,X", po::value<string>()->default_value(DEFAULT_EXTENDED_SOURCE), "image of source at focus (filename)")
-		//("sx,x", po::value<ptype>()->default_value(DEFAULT_SPHERE_X), "sphere coordinates")
-		//("sy,y", po::value<ptype>()->default_value(DEFAULT_SPHERE_Y))
-		//("sz,z", po::value<ptype>()->default_value(DEFAULT_SPHERE_Z))
-		("sx,x", po::value<ptype>(), "sphere coordinates")
-		("sy,y", po::value<ptype>())
-		("sz,z", po::value<ptype>())
-		("radius,r", po::value<ptype>()->default_value(DEFAULT_SPHERE_A), "sphere radius")
-		("samples,s", po::value<int>()->default_value(DEFAULT_SAMPLES), "Monte-Carlo samples used to compute Us")
-		("sphere-file,S", po::value< vector<string> >()->multitoken(), "sphere file:\n [x y z radius material]")
-		("amplitude,a", po::value<ptype>()->default_value(DEFAULT_AMPLITUDE), "incident field amplitude")
-		("n,n", po::value<ptype>()->default_value(DEFAULT_N, "1.4"), "sphere phase speed")
-		("k,k", po::value<ptype>()->default_value(DEFAULT_K), "sphere absorption coefficient")
-		("material-file,M", po::value< vector<string> >()->multitoken(), "material file:\n [lambda n k]")
-		("materials", po::value< vector<ptype> >()->multitoken(), "materials specified using n, k pairs:\n ex. --materials n1 k1 n2 k2\n (if used --n and --k are ignored)")
-		("lambda,l", po::value<ptype>()->default_value(DEFAULT_LAMBDA), "incident wavelength")
+		("help", "prints this help")
+		("verbose", "verbose output\n")
+
+		("intensity", po::value<string>()->default_value(DEFAULT_INTENSITY_FILE), "output measured intensity (filename)")
+		("absorbance", po::value<string>()->default_value(DEFAULT_ABSORBANCE_FILE), "output measured absorbance (filename)")
+		("transmittance", po::value<string>()->default_value(DEFAULT_TRANSMITTANCE_FILE), "output measured transmittance (filename)")
+		("far-field", po::value<string>()->default_value(DEFAULT_FAR_FILE), "output far-field at detector (filename)")
+		("near-field", po::value<string>()->default_value(DEFAULT_NEAR_FILE), "output field at focal plane (filename)")
+		("extended-source", po::value<string>()->default_value(DEFAULT_EXTENDED_SOURCE), "image of source at focus (filename)\n")
+
+		("spheres", po::value< vector<ptype> >()->multitoken(), "sphere position: x y z a m")
+		("sphere-file", po::value< vector<string> >()->multitoken(), "sphere file:\n [x y z radius material]")
+		("materials", po::value< vector<ptype> >()->multitoken(), "refractive indices as n, k pairs:\n ex. -m n0 k0 n1 k1 n2 k2")
+		("material-file", po::value< vector<string> >()->multitoken(), "material file:\n [lambda n k]\n")
+
+		("lambda", po::value<ptype>()->default_value(DEFAULT_LAMBDA), "incident wavelength")
 		("nu", po::value<ptype>(), "incident frequency (in cm^-1)\n(if specified, lambda is ignored)")
-		("theta,t", po::value<ptype>()->default_value(DEFAULT_K_THETA), "light direction (polar coords)")
-		("phi,p", po::value<ptype>()->default_value(DEFAULT_K_PHI))
-		("fx", po::value<ptype>()->default_value(DEFAULT_FOCUS_X), "incident focal point")
-		("fy", po::value<ptype>()->default_value(DEFAULT_FOCUS_Y))
-		("fz", po::value<ptype>()->default_value(DEFAULT_FOCUS_Z))
-		("condenser-max,C", po::value<ptype>()->default_value(DEFAULT_CONDENSER_MAX), "condenser numerical aperature")
-		("condenser-min,c", po::value<ptype>()->default_value(DEFAULT_CONDENSER_MIN), "condenser obscuration NA")
-		("objective-max,O", po::value<ptype>()->default_value(DEFAULT_OBJECTIVE_MAX), "objective numerical aperature")
-		("objective-min,o", po::value<ptype>()->default_value(DEFAULT_OBJECTIVE_MIN), "objective obscuration NA")
-		("field-order", po::value<int>()->default_value(DEFAULT_FIELD_ORDER), "order of the incident field")
-		("output-type,f", po::value<string>()->default_value(DEFAULT_FIELD_TYPE), "output field value:\n magnitude, polarization, real, imaginary, angular-spectrum")
-		("resolution,R", po::value<unsigned int>()->default_value(DEFAULT_SLICE_RES), "resolution of the detector")
-		("padding,d", po::value<unsigned int>()->default_value(DEFAULT_PADDING), "FFT padding for the objective bandpass")
+		("k", po::value< vector<ptype> >()->multitoken(), "k-vector direction: -k theta phi\n theta = [0 2*pi], phi = [0 pi]")
+		("amplitude", po::value<ptype>()->default_value(DEFAULT_AMPLITUDE), "incident field amplitude")
+		("condenser", po::value< vector<ptype> >()->multitoken(), "condenser numerical aperature\nA pair of values can be used to specify an inner obscuration: -c NAin NAout")
+		("objective", po::value< vector<ptype> >()->multitoken(), "objective numerical aperature\nA pair of values can be used to specify an inner obscuration: -c NAin NAout")
+		("focus", po::value< vector<ptype> >()->multitoken(), "focal position for the incident point source\n (default = --focus 0 0 0)")
+		("plane-wave", "simulates an incident plane wave\n")
+
+		("resolution", po::value<unsigned int>()->default_value(DEFAULT_SLICE_RES), "resolution of the detector")
+		("plane-lower-left", po::value< vector<ptype> >()->multitoken(), "lower-left position of the image plane")
+		("plane-upper-right", po::value< vector<ptype> >()->multitoken(), "upper-right position of the image plane")
+		("plane-normal", po::value< vector<ptype> >()->multitoken(), "normal for the image plane")
+		("xy", po::value< vector<ptype> >()->multitoken(), "specify an x-y image plane\n (standard microscope)")
+		("xz", po::value< vector<ptype> >()->multitoken(), "specify a x-z image plane\n (cross-section of the focal volume)")
+		("yz", po::value< vector<ptype> >()->multitoken(), "specify a y-z image plane\n (cross-section of the focal volume)\n")
+
+		("samples", po::value<int>()->default_value(DEFAULT_SAMPLES), "Monte-Carlo samples used to compute Us")
+		("padding", po::value<unsigned int>()->default_value(DEFAULT_PADDING), "FFT padding for the objective bandpass")
 		("supersample", po::value<unsigned int>()->default_value(DEFAULT_SUPERSAMPLE), "super-sampling rate for the detector field")
+		("field-order", po::value<int>()->default_value(DEFAULT_FIELD_ORDER), "order of the incident field")
+		("seed", po::value<unsigned int>(), "seed for the Monte-Carlo random number generator")
+		("recursive", "evaluate all Bessel functions recursively\n")
+		("recursive-us", "evaluate scattered-field Bessel functions recursively\n")
+		("lut-uf", "evaluate the focused-field using a look-up table\n")
+
+		("output-type", po::value<string>()->default_value(DEFAULT_FIELD_TYPE), "output field value:\n magnitude, polarization, real, imaginary, angular-spectrum")
 		("colormap", po::value<string>()->default_value(DEFAULT_COLORMAP), "colormap: gray, brewer")
 		("append", "append result to an existing file\n (binary files only)")
-		("plane-min-x,u", po::value<ptype>()->default_value(DEFAULT_SLICE_MIN_X), "lower-left corner of the field slice")
-		("plane-min-y,v", po::value<ptype>()->default_value(DEFAULT_SLICE_MIN_Y))
-		("plane-min-z,w", po::value<ptype>()->default_value(DEFAULT_SLICE_MIN_Z))
-		("plane-max-x,U", po::value<ptype>()->default_value(DEFAULT_SLICE_MAX_X), "upper-right corner of the field slice")
-		("plane-max-y,V", po::value<ptype>()->default_value(DEFAULT_SLICE_MAX_Y))
-		("plane-max-z,W", po::value<ptype>()->default_value(DEFAULT_SLICE_MAX_Z))
-		("plane-norm-x", po::value<ptype>()->default_value(DEFAULT_SLICE_NORM_X), "field slice normal")
-		("plane-norm-y", po::value<ptype>()->default_value(DEFAULT_SLICE_NORM_Y))
-		("plane-norm-z", po::value<ptype>()->default_value(DEFAULT_SLICE_NORM_Z));
+		;
 }
  
 static void LoadParameters(int argc, char *argv[])
 {
 	//create an option description
-	po::options_description desc("Allowed options");
+	po::options_description desc("BimSim arguments");
  
 	//fill it with options
 	SetOptions(desc);
  
     po::variables_map vm;
-	po::store(po::parse_command_line(argc, argv, desc), vm);
+	po::store(po::parse_command_line(argc, argv, desc, po::command_line_style::unix_style ^ po::command_line_style::allow_short), vm);
 	po::notify(vm);
  
-	//display help and exit
-	if(vm.count("help"))
-	{
-		cout<<desc<<endl;
-		exit(1);
-	}
  
-	//load the wavelength
-	if(vm.count("nu"))
-	{
-		//wavelength is given in wavenumber - transform and flag
-		SCOPE->nf.lambda = 10000/vm["nu"].as<ptype>();
-		gFileOut.wavenumber = true;
-	}
-	//otherwise we are using lambda = wavelength
-	else
-	{
-		SCOPE->nf.lambda = vm["lambda"].as<ptype>();
-		gFileOut.wavenumber = false;
-	}
+    //load flags (help, verbose output)
+    lFlags(vm, desc);
+
+    //load the wavelength
+    lWavelength(vm);
+
+    //load materials
+	//loadMaterials(vm);
+	lMaterials(vm);
+
+    //load the sphere data
+    lSpheres(vm);
+
+    //load the optics
+    lOptics(vm);
+
+	//load the position and orientation of the image plane
+	lImagePlane(vm);
  
 	//load spheres
-	loadSpheres(vm);
+	//loadSpheres(vm);
+
  
-	//load materials
-	loadMaterials(vm);
  
-	loadNearfieldParams(vm);
+	lNearfield(vm);
  
 	loadOutputParams(vm);
  
-	loadMicroscopeParams(vm);
+	//loadMicroscopeParams(vm);
  
-	loadSliceParams(vm);
+	//loadSliceParams(vm);
  
     //if an extended source will be used
     if(vm["extended-source"].as<string>() != "")
@@ -22,16 +22,17 @@ scalarslice::scalarslice()
  
 scalarslice::~scalarslice()
 {
-	HANDLE_ERROR(cudaFree(S));
+	if(S != NULL)
+		HANDLE_ERROR(cudaFree(S));
 	S = NULL;
 }
  
-void scalarslice::toImage(std::string filename, ptype vmin, ptype vmax, rts::colormap::colormapType cmap)
+void scalarslice::toImage(std::string filename, ptype vmin, ptype vmax, rts::colormapType cmap)
 {
-	rts::colormap::gpu2image<ptype>(S, filename, R[0], R[1], vmin, vmax, cmap);
+	rts::gpu2image<ptype>(S, filename, R[0], R[1], vmin, vmax, cmap);
 }
  
-void scalarslice::toImage(std::string filename, bool positive, rts::colormap::colormapType cmap)
+void scalarslice::toImage(std::string filename, bool positive, rts::colormapType cmap)
 {
     cublasStatus_t stat;
     cublasHandle_t handle;
@@ -62,7 +63,7 @@ void scalarslice::toImage(std::string filename, bool positive, rts::colormap::co
 		exit(1);
 	}
  
-    //std::cout<<"Maximum index: "<<result<<std::endl;
+
  
     //retrieve the maximum value
     ptype maxVal;
@@ -75,7 +76,7 @@ void scalarslice::toImage(std::string filename, bool positive, rts::colormap::co
     if(positive)
         toImage(filename, 0, maxVal, cmap);
     else
-        toImage(filename, -maxVal, maxVal, cmap);
+        toImage(filename, -abs(maxVal), abs(maxVal), cmap);
 }
  
 void scalarslice::toEnvi(std::string filename, ptype wavelength, bool append)
@@ -2,7 +2,7 @@
 #define RTS_SCALAR_SLICE
  
 #include "dataTypes.h"
-#include "colormap.h"
+#include "rts/graphics/colormap.h"
  
 struct scalarslice
 {
@@ -17,8 +17,8 @@ struct scalarslice
 	~scalarslice();
 	void clear();
  
-	void toImage(std::string filename, ptype vmin, ptype vmax, rts::colormap::colormapType cmap = rts::colormap::cmBrewer);
-	void toImage(std::string filename, bool positive = true, rts::colormap::colormapType cmap = rts::colormap::cmBrewer);
+	void toImage(std::string filename, ptype vmin, ptype vmax, rts::colormapType cmap = rts::cmBrewer);
+	void toImage(std::string filename, bool positive = true, rts::colormapType cmap = rts::cmBrewer);
 	void toEnvi(std::string filename, ptype wavelength = 0, bool append = false);
  
 };
 #include "sphere.h"
+#include "defaults.h"
  
 #include "rts/math/complex.h"
 #include <complex>
 #include <stdlib.h>
+#include <fstream>
  
 using namespace rts;
 using namespace std;
@@ -13,6 +15,9 @@ int cbessjyva(double v,complex&lt;double&gt; z,double &amp;vm,complex&lt;double&gt;*cjv,
 int cbessjyva_sph(int v,complex<double> z,double &vm,complex<double>*cjv,
     complex<double>*cyv,complex<double>*cjvp,complex<double>*cyvp);
  
+int bessjyv_sph(int v, double z, double &vm, double* cjv,
+    double* cyv, double* cjvp, double* cyvp);
+
 void sphere::calcCoeff(ptype lambda, rtsComplex<ptype> ri)
 {
     /*  These calculations are done at high-precision on the CPU
@@ -59,12 +64,6 @@ void sphere::calcCoeff(ptype lambda, rtsComplex&lt;ptype&gt; ri)
     cbessjyva_sph(Nl, ka, vm, cjv_ka, cyv_ka, cjvp_ka, cyvp_ka);
     cbessjyva_sph(Nl, kna, vm, cjv_kna, cyv_kna, cjvp_kna, cyvp_kna);
  
-
-    //cout<<"Begin Sphere---------"<<endl;
-    //cout<<"Nl =  "<<Nl<<endl;
-    //cout<<"ka =  "<<ka<<endl;
-    //cout<<"kna = "<<kna<<endl;
-
     //compute A for each order
     complex<double> i(0, 1);
     complex<double> a, b, c, d;
@@ -83,7 +82,7 @@ void sphere::calcCoeff(ptype lambda, rtsComplex&lt;ptype&gt; ri)
         //calculate A and add it to the list
         An = (2.0 * l + 1.0) * pow(i, l) * (a / b);
         A.push_back(bsComplex(An.real(), An.imag()));
-        //cout<<"A:  "<<An<<endl;
+
  
         //Compute B (external scattering coefficient)
         c = cjv_ka[l] * cjvp_kna[l] * nc - cjv_kna[l] * cjvp_ka[l];
@@ -92,7 +91,206 @@ void sphere::calcCoeff(ptype lambda, rtsComplex&lt;ptype&gt; ri)
         //calculate B and add it to the list
         Bn = (2.0 * l + 1.0) * pow(i, l) * (c / d);
         B.push_back(bsComplex(Bn.real(), Bn.imag()));
-        //cout<<"B:  "<<Bn<<endl;
  
+
+    }
+}
+
+void sphere::calcBesselLut(bsComplex* j, ptype k, bsComplex n, int aR)
+{
+    /*Compute the look-up-table for spherical bessel functions used inside of the sphere
+        j    =   (Nl + 1) x aR array of values
+        aR      =   resolution of j
+    */
+
+    //allocate space for the Bessel functions of the first and second kind (and derivatives -- which will be ignored)
+    int bytes = sizeof(complex<double>) * (Nl + 1);
+    complex<double>* cjv_knr = (complex<double>*)malloc(bytes);
+    complex<double>* cyv_knr = (complex<double>*)malloc(bytes);
+    complex<double>* cjvp_knr = (complex<double>*)malloc(bytes);
+    complex<double>* cyvp_knr = (complex<double>*)malloc(bytes);
+
+    //compute the bessel functions using the CPU-based algorithm
+    double vm;
+
+    //for each sample along r
+    ptype dr = a / (aR - 1);
+    ptype r;
+    for(int ir = 0; ir < aR; ir++)
+    {
+        r = ir * dr;
+        complex<double> knr( (k*n*r).real(), (k*n*r).imag() );
+        cbessjyva_sph(Nl, knr, vm, cjv_knr, cyv_knr, cjvp_knr, cyvp_knr);
+
+        //copy the double data to the bsComplex array
+        for(int l=0; l<=Nl; l++)
+		{
+			//deal with the NaN case at the origin
+			if(ir == 0)
+			{
+				if(l == 0)
+					j[ir * (Nl+1)] = 1;
+				else
+					j[ir * (Nl+1) + l] = 0;
+			}
+			else
+				j[ir * (Nl+1) + l] = bsComplex(cjv_knr[l].real(), cjv_knr[l].imag());
+		}
+    }
+
+	/*ofstream outfile("besselout.txt");
+    for(int ir = 0; ir < aR; ir++)
+    {
+        for(int l = 0; l<Nl+1; l++)
+        {
+            outfile<<j[ir * (Nl+1) + l].real()<<"     ";
+        }
+        outfile<<endl;
+    }
+	outfile.close();*/
+
+}
+
+void sphere::calcHankelLut(bsComplex* h, ptype k, int rR)
+{
+	/*Compute the look-up-table for spherical bessel functions used inside of the sphere
+        h_out   =   (Nl + 1) x aR array of values
+		rmin	=	minimum value of r
+		d_max	=	maximum value of r
+        rR      =   resolution of h_out
+    */
+
+    //allocate space for the Bessel functions of the first and second kind (and derivatives -- which will be ignored)
+    int bytes = sizeof(double) * (Nl + 1);
+    double* cjv_kr = (double*)malloc(bytes);
+    double* cyv_kr = (double*)malloc(bytes);
+    double* cjvp_kr = (double*)malloc(bytes);
+    double* cyvp_kr = (double*)malloc(bytes);
+
+    //compute the bessel functions using the CPU-based algorithm
+    double vm;
+
+
+
+    //for each sample along r
+    ptype dr = (d_max - max(a, d_min)) / (rR - 1);
+    ptype r;
+    for(int ir = 0; ir < rR; ir++)
+    {
+        r = ir * dr + max(a, d_min);
+        double kr = k*r;
+        bessjyv_sph(Nl, kr, vm, cjv_kr, cyv_kr, cjvp_kr, cyvp_kr);
+
+        //copy the double data to the bsComplex array
+        for(int l=0; l<=Nl; l++)
+		{
+			//h[ir * (Nl+1) + l] = bsComplex(cjv_kr[l].real(), cyv_kr[l].real());
+			h[ir * (Nl+1) + l] = bsComplex(cjv_kr[l], cyv_kr[l]);
+		}
     }
+
+	/*ofstream outfile("hankelout.txt");
+    for(int ir = 0; ir < rR; ir++)
+    {
+		outfile<<ir*dr + max(a, d_min)<<"     ";
+        for(int l = 0; l<=0; l++)
+        {
+            outfile<<h[ir * (Nl+1) + l].real()<<"     "<<h[ir * (Nl+1) + l].imag()<<"     ";
+        }
+        outfile<<endl;
+    }
+	outfile.close();*/
+}
+
+void sphere::calcLut(bsComplex* j, bsComplex* h, ptype lambda, bsComplex n, int aR, int rR)
+{
+    /*Compute the look-up-tables for spherical bessel functions used both inside and outside of the sphere.
+        j       =   (Nl + 1) x aR array of values
+        j       =   (Nl + 1) x rR array of values
+        d_max    =   maximum distance for the LUT
+        aR      =   resolution of j_in
+        rR      =   resolution of j_out
+    */
+
+    //compute the magnitude of the k vector
+    double k = 2 * PI / lambda;
+
+	calcBesselLut(j, k, n, aR);
+	calcHankelLut(h, k, rR);
+}
+
+void sphere::calcUp(ptype lambda, bsComplex n, rts::rtsQuad<ptype, 3> nfPlane, unsigned int R)
+{
+    //calculate the parameters of the lookup table
+
+    //first find the distance to the closest and furthest points on the nearfield plane
+    d_min = nfPlane.dist(p);
+    d_max = nfPlane.dist_max(p);
+
+    //compute the radius of the cross-section of the sphere with the plane
+    ptype a_inter = 0;
+    if(d_min < a)
+        a_inter = sqrt(a - d_min);
+
+
+	//calculate the resolution of the Usp and Uip lookup tables
+	int aR = 1 + 2 * R * a_inter / (nfPlane(0, 0) - nfPlane(1, 1)).len();
+	int dR = 2 * R;
+	int thetaR = DEFAULT_SPHERE_THETA_R;
+
+	//allocate space for the bessel function LUTs
+	bsComplex* j = (bsComplex*)malloc(sizeof(bsComplex) * (Nl + 1) * aR);
+	bsComplex* h = (bsComplex*)malloc(sizeof(bsComplex) * (Nl + 1) * dR);
+
+	calcLut(j, h, lambda, n, aR, dR);
+
+	//allocate space for the Usp lookup texture
+	Usp.R[0] = dR;
+	Usp.R[1] = thetaR;
+	Usp.init_gpu();
+
+	//allocate space for the Uip lookup texture
+	Uip.R[0] = aR;
+	Uip.R[1] = thetaR;
+	Uip.init_gpu();
+
+
+
+	scalarUsp(h, dR, thetaR);
+	scalarUip(j, aR, thetaR);
+
+	scalarslice UspMag = Usp.Mag();
+	UspMag.toImage("Usp.bmp", true);
+
+	scalarslice UipMag = Uip.Mag();
+	UipMag.toImage("Uip.bmp", true);
+
+	//free memory
+	free(j);
+	free(h);
+
+}
+
+sphere& sphere::operator=(const sphere &rhs)
+{
+	p = rhs.p;
+	a = rhs.a;
+	iMaterial = rhs.iMaterial;
+	Nl = rhs.Nl;
+	n = rhs.n;
+	B = rhs.B;
+	A = rhs.A;
+
+	return *this;
+}
+
+sphere::sphere(const sphere &rhs)
+{
+	p = rhs.p;
+	a = rhs.a;
+	iMaterial = rhs.iMaterial;
+	Nl = rhs.Nl;
+	n = rhs.n;
+	B = rhs.B;
+	A = rhs.A;
 }
+#include "sphere.h"
+#include "rts/math/legendre.h"
+
+__global__ void gpuScalarUsp(bsComplex* Usp, bsComplex* h, bsComplex* B, int Nl, int rR, int thetaR)
+{
+    //get the current coordinate in the plane slice
+	int ir = blockIdx.x * blockDim.x + threadIdx.x;
+	int itheta = blockIdx.y * blockDim.y + threadIdx.y;
+
+	//make sure that the thread indices are in-bounds
+	if(itheta >= thetaR || ir >= rR) return;
+
+	int i = itheta * rR + ir;
+
+	//ptype dr = (rmax - a) / (rR - 1);
+	ptype dtheta = (PI) / (thetaR - 1);
+
+	//comptue the current angle and distance
+	//ptype r = dr * ir + a;
+	ptype theta = dtheta * itheta;
+	ptype cos_theta = cos(theta);
+
+	//initialize the Legendre polynomial
+	ptype P[2];
+	rts::init_legendre<ptype>(cos_theta, P[0], P[1]);
+
+	//initialize the result
+	bsComplex Us((ptype)0, (ptype)0);
+
+    //for each order l
+    for(int l=0; l <= Nl; l++)
+    {
+        if(l == 0)
+        {
+            Us += B[l] * h[ir * (Nl+1) + l] * P[0];
+            //Us += P[0];
+        }
+        else
+        {
+            if(l > 1)
+            {
+                rts::shift_legendre<ptype>(l, cos_theta, P[0], P[1]);
+            }
+            Us += B[l] * h[ir * (Nl+1) + l] * P[1];
+            //Us += P[1];
+        }
+
+
+    }
+	Usp[i] = Us;
+	//Usp[i] = h[ir * (Nl+1)];
+	//Usp[i] = ir;
+
+}
+
+__global__ void gpuScalarUip(bsComplex* Uip, bsComplex* j, bsComplex* A, int Nl, int aR, int thetaR)
+{
+    //get the current coordinate in the plane slice
+	int ia = blockIdx.x * blockDim.x + threadIdx.x;
+	int itheta = blockIdx.y * blockDim.y + threadIdx.y;
+
+	//make sure that the thread indices are in-bounds
+	if(itheta >= thetaR || ia >= aR) return;
+
+	int i = itheta * aR + ia;
+
+	ptype dtheta = (PI) / (thetaR - 1);
+
+	//comptue the current angle and distance
+	ptype theta = dtheta * itheta;
+	ptype cos_theta = cos(theta);
+
+	//initialize the Legendre polynomial
+	ptype P[2];
+	rts::init_legendre<ptype>(cos_theta, P[0], P[1]);
+
+	//initialize the result
+	bsComplex Ui((ptype)0, (ptype)0);
+
+    //for each order l
+    for(int l=0; l <= Nl; l++)
+    {
+        if(l == 0)
+        {
+            Ui += A[l] * j[ia * (Nl+1) + l] * P[0];
+        }
+        else
+        {
+            if(l > 1)
+            {
+                rts::shift_legendre<ptype>(l, cos_theta, P[0], P[1]);
+            }
+            Ui += A[l] * j[ia * (Nl+1) + l] * P[1];
+        }
+
+
+    }
+	Uip[i] = Ui;
+}
+
+void sphere::scalarUsp(bsComplex* h, int rR, int thetaR)
+{
+	//copy the hankel function to the GPU
+    bsComplex* gpu_h;
+    HANDLE_ERROR( cudaMalloc( (void**)&gpu_h, sizeof(bsComplex) * (Nl + 1) * rR ) );
+    HANDLE_ERROR( cudaMemcpy( gpu_h, h, sizeof(bsComplex) * (Nl + 1) * rR, cudaMemcpyHostToDevice ) );
+
+    //allocate memory for the scattering coefficients
+    bsComplex* gpuB;
+    HANDLE_ERROR(cudaMalloc((void**) &gpuB, (Nl+1) * sizeof(bsComplex)));
+    //copy the scattering coefficients to the GPU
+    HANDLE_ERROR(cudaMemcpy(gpuB, &B[0], (Nl+1) * sizeof(bsComplex), cudaMemcpyHostToDevice));
+
+    //create one thread for each pixel of the field slice
+	dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
+	dim3 dimGrid((Usp.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (Usp.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
+
+	gpuScalarUsp<<<dimGrid, dimBlock>>>(Usp.x_hat, gpu_h, gpuB, Nl, rR, thetaR);
+
+	//free memory
+	cudaFree(gpu_h);
+	cudaFree(gpuB);
+
+}
+
+void sphere::scalarUip(bsComplex* j, int rR, int thetaR)
+{
+	//copy the bessel and hankel LUTs to the GPU
+    bsComplex* gpu_j;
+    HANDLE_ERROR( cudaMalloc( (void**)&gpu_j, sizeof(bsComplex) * (Nl + 1) * rR ) );
+    HANDLE_ERROR( cudaMemcpy( gpu_j, j, sizeof(bsComplex) * (Nl + 1) * rR, cudaMemcpyHostToDevice ) );
+
+    //allocate memory for the scattering coefficients
+    bsComplex* gpuA;
+    HANDLE_ERROR(cudaMalloc((void**) &gpuA, (Nl+1) * sizeof(bsComplex)));
+    //copy the scattering coefficients to the GPU
+    HANDLE_ERROR(cudaMemcpy(gpuA, &A[0], (Nl+1) * sizeof(bsComplex), cudaMemcpyHostToDevice));
+
+    //create one thread for each pixel of the field slice
+	dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
+	dim3 dimGrid((Uip.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (Uip.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
+
+	gpuScalarUip<<<dimGrid, dimBlock>>>(Uip.x_hat, gpu_j, gpuA, Nl, rR, thetaR);
+
+	//free memory
+	cudaFree(gpu_j);
+	cudaFree(gpuA);
+
+}
@@ -22,12 +22,12 @@ struct sphere
     //sphere material index
     int iMaterial;
  
-    //rtsPointer to the scattered field produced by a plane wave
+    //GPU pointer to the scattered field produced by a plane wave
     //  this is a function of cos(theta) and |r| (distance from sphere center)
-    //fieldslice surface;
-
-    //resolution of the scattered field
-    int thetaR, rR;
+    fieldslice Usp;
+    fieldslice Uip;
+    ptype d_min;
+    ptype d_max;
  
 	//sphere order
 	int Nl;
@@ -50,6 +50,12 @@ struct sphere
 		//surface = fieldslice(ang, ang/2);
     }
  
+	//assignment operator
+	sphere & operator=(const sphere &rhs);
+	
+	//copy constructor
+	sphere(const sphere &rhs);
+
 	std::string toStr()
 	{
 		std::stringstream ss;
@@ -66,8 +72,19 @@ struct sphere
         Nl = ceil( (2 * PI * a) / lambda + 4 * pow( (2 * PI * a) / lambda, 1.0/3.0) + 2);
 	}
  
-	void calcCoeff(ptype lambda, rts::rtsComplex<ptype> n);
+    //compute the scattering coefficients
+	void calcCoeff(ptype lambda, bsComplex n);
+
+	//compute the bessel function look-up tables
+	void calcLut(bsComplex* j, bsComplex* h, ptype lambda, bsComplex n, int aR, int rR);
+    void calcBesselLut(bsComplex* j, ptype k, bsComplex n, int aR);
+	void calcHankelLut(bsComplex* h, ptype k, int rR);
+
+	//calculate the scattering domain Us(theta, r)
+	void calcUp(ptype lambda, bsComplex n, rts::rtsQuad<ptype, 3> nfPlane, unsigned int R);
  
+	void scalarUsp(bsComplex* h, int rR, int thetaR);
+	void scalarUip(bsComplex* j, int aR, int thetaR);