diff --git a/stim/envi/bsq.h b/stim/envi/bsq.h index b2c116c..5eb5190 100644 --- a/stim/envi/bsq.h +++ b/stim/envi/bsq.h @@ -1021,12 +1021,13 @@ public: //for each band for (unsigned long long z = b0; z < b1; z++) { + //std::cout<(temp), L); //write slice data into target file file.seekg(jumpb, std::ios::cur); diff --git a/stim/math/fft.h b/stim/math/fft.h index 272ace7..d3e9448 100644 --- a/stim/math/fft.h +++ b/stim/math/fft.h @@ -3,7 +3,7 @@ namespace stim{ - template + /*template void circshift(T *out, const T *in, size_t xdim, size_t ydim, size_t xshift, size_t yshift){ size_t i, j, ii, jj; for (i =0; i < xdim; i++) { @@ -13,16 +13,31 @@ namespace stim{ out[ii * ydim + jj] = in[i * ydim + j]; } } + }*/ + + template + void circshift(T *out, const T *in, int xdim, int ydim, int xshift, int yshift) + { + for (int i =0; i < xdim; i++) { + int ii = (i + xshift) % xdim; + if (ii<0) ii = xdim + ii; + for (int j = 0; j < ydim; j++) { + int jj = (j + yshift) % ydim; + if (jj<0) jj = ydim + jj; + //out[ii * ydim + jj] = in[i * ydim + j]; + out[jj * xdim + ii] = in[j * xdim + i]; + } + } } template void cpu_fftshift(T* out, T* in, size_t xdim, size_t ydim){ - circshift(out, in, xdim, ydim, xdim/2, ydim/2); + circshift(out, in, xdim, ydim, std::floor(xdim/2), std::floor(ydim/2)); } template void cpu_ifftshift(T* out, T* in, size_t xdim, size_t ydim){ - circshift(out, in, xdim, ydim, xdim/2, ydim/2); + circshift(out, in, xdim, ydim, std::ceil(xdim/2), std::ceil(ydim/2)); } diff --git a/stim/optics/mie.h b/stim/optics/mie.h deleted file mode 100644 index 12d208e..0000000 --- a/stim/optics/mie.h +++ /dev/null @@ -1,613 +0,0 @@ -#ifndef STIM_MIE_H -#define STIM_MIE_H -#include - -#include "scalarwave.h" -#include "../math/bessel.h" -#include "../cuda/cudatools/devices.h" -#include - -namespace stim{ - - -/// Calculate the scattering coefficients for a spherical scatterer -template -void B_coefficients(stim::complex* B, T a, T k, stim::complex n, int Nl){ - - //temporary variables - double vm; //allocate space to store the return values for the bessel function calculation - double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); - - stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - - double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) - stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) - - stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a - stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a - - stim::complex h_ka, dh_ka; - stim::complex numerator, denominator; - stim::complex i(0, 1); - for(int l = 0; l <= Nl; l++){ - h_ka.r = j_ka[l]; - h_ka.i = y_ka[l]; - dh_ka.r = dj_ka[l]; - dh_ka.i = dy_ka[l]; - - numerator = j_ka[l] * dj_kna[l] * (stim::complex)n - j_kna[l] * dj_ka[l]; - denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; - B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; - } -} - 
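Reviewer note on the fft.h change above: the new circshift correctly treats x as the fast (contiguous) axis and guards against negative shifts, but std::floor(xdim/2) and std::ceil(xdim/2) both act on an already-truncated integer quotient, so std::ceil is a no-op and for odd dimensions fftshift and ifftshift still shift by the same amount. The standalone sketch below (not part of the patch) uses (n + 1) / 2 to express the intended "ceiling" shift and checks that the fftshift/ifftshift pair round-trips on an odd-sized grid.

```cpp
// Minimal round-trip test of the new row-major circshift.
// (xdim + 1) / 2 is the integer form of ceil(xdim / 2) needed for ifftshift on odd sizes.
#include <cstdio>

template<typename T>
void circshift(T* out, const T* in, int xdim, int ydim, int xshift, int yshift){
    for(int i = 0; i < xdim; i++){
        int ii = (i + xshift) % xdim;
        if(ii < 0) ii += xdim;                       // handle negative shifts
        for(int j = 0; j < ydim; j++){
            int jj = (j + yshift) % ydim;
            if(jj < 0) jj += ydim;
            out[jj * xdim + ii] = in[j * xdim + i];  // x is the fast (contiguous) axis
        }
    }
}

int main(){
    const int nx = 5, ny = 3;                        // odd width exposes the floor/ceil asymmetry
    float in[nx * ny], shifted[nx * ny], restored[nx * ny];
    for(int i = 0; i < nx * ny; i++) in[i] = (float)i;

    circshift(shifted, in, nx, ny, nx / 2, ny / 2);                   // fftshift: floor(n/2)
    circshift(restored, shifted, nx, ny, (nx + 1) / 2, (ny + 1) / 2); // ifftshift: ceil(n/2)

    for(int i = 0; i < nx * ny; i++)                 // the pair should reproduce the input exactly
        if(restored[i] != in[i]) printf("mismatch at %d\n", i);
    printf("round trip complete\n");
    return 0;
}
```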
-template -void A_coefficients(stim::complex* A, T a, T k, stim::complex n, int Nl){ - //temporary variables - double vm; //allocate space to store the return values for the bessel function calculation - double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); - - stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - - double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) - stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) - - stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a - stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a - - stim::complex h_ka, dh_ka; - stim::complex numerator, denominator; - stim::complex i(0, 1); - for(size_t l = 0; l <= Nl; l++){ - h_ka.r = j_ka[l]; - h_ka.i = y_ka[l]; - dh_ka.r = dj_ka[l]; - dh_ka.i = dy_ka[l]; - - numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka; - denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; - A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; - } -} - -#define LOCAL_NL 16 -template -__global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* hB, T r_min, T dr, size_t N_hB, int Nl){ - extern __shared__ stim::complex shared_hB[]; //declare the list of waves in shared memory - - size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array - if(i >= N) return; //exit if this thread is outside the array - stim::vec3 p; - (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions - (y == NULL) ? p[1] = 0 : p[1] = y[i]; - (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; - - T r = p.len(); //calculate the distance from the sphere - if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) - T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT - size_t ij = (size_t) fij; //convert to an integral index - T alpha = fij - ij; //calculate the fractional portion of the index - size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT - size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT - - T cos_phi; - T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials - - stim::complex hBl; - stim::complex Ei = 0; //create a register to store the result - int l; - - stim::complex hlBl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed - int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops - - #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers - for(l = 0; l <= LOCAL_NL; l++) - hlBl[l] = clerp( hB[n0j + l], hB[n1j + l], alpha ); - - for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory - shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp( hB[n0j + l], hB[n1j + l], alpha ); - - for(size_t w = 0; w < nW; w++){ //for each plane wave - cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere - Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation - Pl_1 = cos_phi; - Ei += W[w].E() * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation - Ei += W[w].E() * hlBl[1] * Pl_1; - - #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) - for(l = 2; l <= LOCAL_NL; l++){ - Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) - Ei += W[w].E() * hlBl[l] * Pl; //calculate and sum the current field order - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 - Pl_1 = Pl; - } - - for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) - Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs - Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl; - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 - Pl_1 = Pl; - } - } - E[i] += Ei; //copy the result to device memory -} - -template -void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ - - size_t max_shared_mem = stim::sharedMemPerBlock(); - size_t hBl_array = sizeof(stim::complex) * (Nl + 1); - std::cout<<"hl*Bl array size: "<) * (Nl - LOCAL_NL); //amount of shared memory to allocate - std::cout<<"shared memory allocated: "<<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel -} - -template -__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ - size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array - if(i >= N) return; //exit if this thread is outside the array 
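For reference, the recursion the scatter kernel above unrolls is Bonnet's relation for the Legendre polynomials, l*P_l(x) = (2l-1)*x*P_{l-1}(x) - (l-1)*P_{l-2}(x). The host-side sketch below (not in the patch) accumulates a weighted Legendre series the same way; the generic coefficients c[l] stand in for the interpolated h_l*B_l (or j_l*A_l) terms times the wave amplitude, and the check against std::legendre assumes a C++17 <cmath>.

```cpp
// Weighted Legendre series via the same two-term recursion the kernels use.
#include <cmath>
#include <cstdio>

double legendre_sum(const double* c, int Nl, double x){
    double Pl_2 = 1.0;                               // P_0(x)
    double Pl_1 = x;                                 // P_1(x)
    double sum = c[0] * Pl_2 + c[1] * Pl_1;          // first two orders handled explicitly
    for(int l = 2; l <= Nl; l++){
        double Pl = ((2.0 * l - 1.0) * x * Pl_1 - (l - 1.0) * Pl_2) / l;
        sum += c[l] * Pl;                            // accumulate the current order
        Pl_2 = Pl_1;                                 // shift the recurrence window
        Pl_1 = Pl;
    }
    return sum;
}

int main(){
    const int Nl = 16;
    double c[Nl + 1];
    for(int l = 0; l <= Nl; l++) c[l] = 1.0;         // arbitrary test coefficients
    double x = 0.3, ref = 0.0;
    for(int l = 0; l <= Nl; l++) ref += std::legendre(l, x);   // C++17 reference
    printf("recursive: %f  reference: %f\n", legendre_sum(c, Nl, x), ref);
    return 0;
}
```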
- - stim::vec3 p; - (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions - (y == NULL) ? p[1] = 0 : p[1] = y[i]; - (z == NULL) ? p[2] = 0 : p[2] = z[i]; - - r[i] = p.len(); -} -/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave - -/// @param E is a pointer to the destination field values -/// @param N is the number of points used to calculate the field -/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) -/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) -/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) -/// @param W is an array of planewaves that will be scattered -/// @param a is the radius of the sphere -/// @param n is the complex refractive index of the sphere -template -void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n, T r_spacing = 0.1){ - //calculate the necessary number of orders required to represent the scattered field - T k = W[0].kmag(); - - int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); - if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) - std::cout<<"Nl: "<* B = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients - B_coefficients(B, a, k, n, Nl); - -#ifdef CUDA_FOUND - stim::complex* dev_E; //allocate space for the field - cudaMalloc(&dev_E, N * sizeof(stim::complex)); - cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); - //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) - - // COORDINATES - T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) - if(x != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); - } - T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) - if(y != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); - } - T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) - if(z != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); - } - - // PLANE WAVES - stim::scalarwave* dev_W; //allocate space and copy plane waves - HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); - HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave) * W.size(), cudaMemcpyHostToDevice) ); - - // BESSEL FUNCTION LOOK-UP TABLE - //calculate the distance from the sphere center - T* dev_r; - HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); - - int threads = stim::maxThreadsPerBlock(); - dim3 blocks((unsigned)(N / threads + 1)); - cuda_dist <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); - - //Find the minimum and maximum values of r - cublasStatus_t stat; - cublasHandle_t handle; - - stat = cublasCreate(&handle); //create a cuBLAS handle - if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure - printf ("CUBLAS initialization failed\n"); - exit(1); - } - - int i_min, i_max; - stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min); - if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure - printf ("CUBLAS 
Error: failed to calculate minimum r value.\n"); - exit(1); - } - stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max); - if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure - printf ("CUBLAS Error: failed to calculate maximum r value.\n"); - exit(1); - } - - i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility - i_max--; - T r_min, r_max; //allocate space to store the minimum and maximum values - HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU - HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); - - r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere) - - //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r - size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); - - //T kr_min = k * r_min; - //T kr_max = k * r_max; - - //temporary variables - double vm; //allocate space to store the return values for the bessel function calculation - double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); - - size_t hB_bytes = sizeof(stim::complex) * (Nl+1) * N_hB_lut; - stim::complex* hB_lut = (stim::complex*) malloc(hB_bytes); //pointer to the look-up table - T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT - std::cout<<"LUT jl bytes: "< hl; - for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT - stim::bessjyv_sph(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] - for(size_t l = 0; l <= Nl; l++){ //for each order - hl.r = (T)jv[l]; - hl.i = (T)yv[l]; - - hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result - //std::cout<(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); - - //Allocate device memory and copy everything to the GPU - stim::complex* dev_hB_lut; - HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); - HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); - - gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); - - cudaMemcpy(E, dev_E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory - - if(x != NULL) cudaFree(dev_x); //free everything - if(y != NULL) cudaFree(dev_y); - if(z != NULL) cudaFree(dev_z); - cudaFree(dev_E); -#else - - - //allocate space to store the bessel function call results - double vm; - double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); - - T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); - - T r, kr, cos_phi; - stim::complex h; - for(size_t i = 0; i < N; i++){ - stim::vec3 p; //declare a 3D point - - (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions - (y == NULL) ? p[1] = 0 : p[1] = y[i]; - (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; - r = p.len(); - if(r >= a){ - for(size_t w = 0; w < W.size(); w++){ - kr = p.len() * W[w].kmag(); //calculate k*r - stim::bessjyv_sph(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr); - cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction - stim::legendre(Nl, cos_phi, P); - - for(size_t l = 0; l <= Nl; l++){ - h.r = j_kr[l]; - h.i = y_kr[l]; - E[i] += W[w].E() * B[l] * h * P[l]; - } - } - } - } -#endif -} - -template -void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n, T r_spacing = 0.1){ - std::vector< stim::scalarwave > W(1, w); - cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing); -} - -template -__global__ void cuda_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* jA, T r_min, T dr, size_t N_jA, int Nl){ - extern __shared__ stim::complex shared_jA[]; //declare the list of waves in shared memory - - size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array - if(i >= N) return; //exit if this thread is outside the array - stim::vec3 p; - (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions - (y == NULL) ? p[1] = 0 : p[1] = y[i]; - (z == NULL) ? p[2] = 0 : p[2] = z[i]; - - T r = p.len(); //calculate the distance from the sphere - if(r > a) return; //exit if the point is inside the sphere (we only calculate the internal field) - T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT - size_t ij = (size_t) fij; //convert to an integral index - T alpha = fij - ij; //calculate the fractional portion of the index - size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT - size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT - - T cos_phi; - T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials - - stim::complex jAl; - stim::complex Ei = 0; //create a register to store the result - int l; - - stim::complex jlAl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed - int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops - - #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers - for(l = 0; l <= LOCAL_NL; l++) - jlAl[l] = clerp( jA[n0j + l], jA[n1j + l], alpha ); - - for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory - shared_jA[shared_start + (l - (LOCAL_NL+1))] = clerp( jA[n0j + l], jA[n1j + l], alpha ); - - for(size_t w = 0; w < nW; w++){ //for each plane wave - if(r == 0) cos_phi = 0; - else - cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere - Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation - Pl_1 = cos_phi; - Ei += W[w].E() * jlAl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation - Ei += W[w].E() * jlAl[1] * Pl_1; - - #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) - for(l = 2; l <= LOCAL_NL; l++){ - Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) - Ei += W[w].E() * 
jlAl[l] * Pl; //calculate and sum the current field order - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 - Pl_1 = Pl; - } - - for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) - Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs - Ei += W[w].E() * shared_jA[shared_start + l - LOCAL_NL - 1] * Pl; - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 - Pl_1 = Pl; - } - } - E[i] = Ei; //copy the result to device memory -} - -template -void gpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* jA, T r_min, T dr, size_t N_jA, size_t Nl){ - - size_t max_shared_mem = stim::sharedMemPerBlock(); - size_t hBl_array = sizeof(stim::complex) * (Nl + 1); - std::cout<<"hl*Bl array size: "<) * (Nl - LOCAL_NL); //amount of shared memory to allocate - std::cout<<"shared memory allocated: "<<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel -} - -/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere - -/// @param E is a pointer to the destination field values -/// @param N is the number of points used to calculate the field -/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) -/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) -/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) -/// @param w is a planewave that will be scattered -/// @param a is the radius of the sphere -/// @param n is the complex refractive index of the sphere -template -void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > W, T a, stim::complex n, T r_spacing = 0.1){ -//calculate the necessary number of orders required to represent the scattered field - T k = W[0].kmag(); - - int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); - if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) - std::cout<<"Nl: "<* A = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients - A_coefficients(A, a, k, n, Nl); - -#ifdef CUDA_FOUND - stim::complex* dev_E; //allocate space for the field - cudaMalloc(&dev_E, N * sizeof(stim::complex)); - cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); - //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) - - // COORDINATES - T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) - if(x != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); - } - T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) - if(y != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); - } - T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) - if(z != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); - } - - // PLANE WAVES - 
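The shared-memory request made by gpu_scalar_mie_scatter/gpu_scalar_mie_internal grows as threads * sizeof(complex) * (Nl - LOCAL_NL): the first LOCAL_NL+1 orders live in registers and only the overflow orders are staged per thread in dynamic shared memory. The sketch below is illustrative only (the patch just prints the sizes it computes and queries stim::sharedMemPerBlock()); the 48 KB limit, Nl = 40, and the block-shrinking loop are assumptions added here to show how the budget constrains the block size.

```cpp
// How the dynamic shared-memory request scales with block size and order count.
#include <complex>
#include <cstdio>

int main(){
    const int LOCAL_NL = 16;                         // orders kept in registers (matches the #define)
    const size_t max_shared = 48 * 1024;             // assumed per-block limit, in bytes
    int Nl = 40;                                     // example number of Mie orders
    int threads = 1024;                              // candidate block size

    size_t per_thread = (Nl <= LOCAL_NL) ? 0
                      : sizeof(std::complex<float>) * (size_t)(Nl - LOCAL_NL);
    while(threads > 32 && per_thread * threads > max_shared)
        threads /= 2;                                // shrink the block until the request fits

    printf("threads/block: %d, dynamic shared memory: %zu bytes\n",
           threads, per_thread * threads);
    return 0;
}
```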
stim::scalarwave* dev_W; //allocate space and copy plane waves - HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); - HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave) * W.size(), cudaMemcpyHostToDevice) ); - - // BESSEL FUNCTION LOOK-UP TABLE - //calculate the distance from the sphere center - T* dev_r; - HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); - - int threads = stim::maxThreadsPerBlock(); - dim3 blocks((unsigned)(N / threads + 1)); - cuda_dist <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); - - //Find the minimum and maximum values of r - cublasStatus_t stat; - cublasHandle_t handle; - - stat = cublasCreate(&handle); //create a cuBLAS handle - if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure - printf ("CUBLAS initialization failed\n"); - exit(1); - } - - int i_min, i_max; - stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min); - if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure - printf ("CUBLAS Error: failed to calculate minimum r value.\n"); - exit(1); - } - stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max); - if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure - printf ("CUBLAS Error: failed to calculate maximum r value.\n"); - exit(1); - } - - i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility - i_max--; - T r_min, r_max; //allocate space to store the minimum and maximum values - HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU - HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); - - r_max = min(r_max, a); //the internal field doesn't exist outside of the sphere - - size_t N_jA_lut = (size_t)((r_max - r_min) / r_spacing + 1); - - //temporary variables - double vm; //allocate space to store the return values for the bessel function calculation - stim::complex* jv = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* yv = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* djv= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dyv= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - - size_t jA_bytes = sizeof(stim::complex) * (Nl+1) * N_jA_lut; - stim::complex* jA_lut = (stim::complex*) malloc(jA_bytes); //pointer to the look-up table - T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT - std::cout<<"LUT jl bytes: "< hl; - stim::complex nd = (stim::complex)n; - for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT - stim::cbessjyva_sph(Nl, nd * k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] - for(size_t l = 0; l <= Nl; l++){ //for each order - jA_lut[ri * (Nl + 1) + l] = (stim::complex)(jv[l] * (stim::complex)A[l]); //store the bessel function result - } - } - - //Allocate device memory and copy everything to the GPU - stim::complex* dev_jA_lut; - HANDLE_ERROR( cudaMalloc(&dev_jA_lut, jA_bytes) ); - HANDLE_ERROR( cudaMemcpy(dev_jA_lut, jA_lut, jA_bytes, cudaMemcpyHostToDevice) ); - - gpu_scalar_mie_internal(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_jA_lut, r_min, dr, N_jA_lut, Nl); - - cudaMemcpy(E, dev_E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory - - if(x != NULL) cudaFree(dev_x); //free everything - if(y != NULL) cudaFree(dev_y); - if(z != NULL) cudaFree(dev_z); - cudaFree(dev_E); -#else - - //allocate space to store the 
bessel function call results - double vm; - stim::complex* j_knr = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* y_knr = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dj_knr= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dy_knr= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - - T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); - - T r, cos_phi; - stim::complex knr; - stim::complex h; - for(size_t i = 0; i < N; i++){ - stim::vec3 p; //declare a 3D point - - (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions - (y == NULL) ? p[1] = 0 : p[1] = y[i]; - (z == NULL) ? p[2] = 0 : p[2] = z[i]; - r = p.len(); - if(r < a){ - E[i] = 0; - for(size_t w = 0; w < W.size(); w++){ - knr = (stim::complex)n * p.len() * W[w].kmag(); //calculate k*n*r - - stim::cbessjyva_sph(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr); - if(r == 0) - cos_phi = 0; - else - cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction - stim::legendre(Nl, cos_phi, P); - - for(size_t l = 0; l <= Nl; l++){ - E[i] += W[w].E() * A[l] * (stim::complex)j_knr[l] * P[l]; - } - } - } - } -#endif -} - -template -void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n, T r_spacing = 0.1){ - std::vector< stim::scalarwave > W(1, w); - cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, r_spacing); -} - -} - -#endif \ No newline at end of file diff --git a/stim/optics/scalarbeam.h b/stim/optics/scalarbeam.h index 800f4a3..77fc289 100644 --- a/stim/optics/scalarbeam.h +++ b/stim/optics/scalarbeam.h @@ -213,13 +213,13 @@ void gpu_scalar_psf_local(stim::complex* E, size_t N, T* r, T* phi, T lambda, T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT T jl; for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT - for(size_t l = 0; l <= Nl; l++){ //for each order + for(unsigned l = 0; l <= (unsigned)Nl; l++){ //for each order jl = boost::math::sph_bessel(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result } } - stim::cpu2image(j_lut, "j_lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); + //stim::cpu2image(j_lut, "j_lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); //Allocate device memory and copy everything to the GPU T* gpu_C; @@ -316,6 +316,7 @@ void gpu_scalar_psf_cart(stim::complex* E, size_t N, T* x, T* y, T* z, T lamb stim::quaternion q; //create a quaternion q.CreateRotation(d, stim::vec3(0, 0, 1)); //create a mapping from the propagation direction to the PSF space + stim::matrix rot = q.toMatrix3(); int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks cuda_cart2psf <<< blocks, threads >>> (gpu_r, gpu_phi, N, x, y, z, f, q); //call the CUDA kernel to move the cartesian coordinates to PSF space @@ -442,15 +443,19 @@ public: return samples; } + void eval(stim::scalarfield& E, T* X, T* Y, T* Z, int order = 500){ + cpu_scalar_psf_cart(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing()); + } + /// Evaluate the beam to a scalar field using Debye focusing - void eval(stim::scalarfield& E, size_t order = 500){ + void eval(stim::scalarfield& E, int order = 500){ size_t array_size = E.grid_bytes(); T* X = (T*) malloc( array_size ); //allocate 
space for the coordinate meshes T* Y = (T*) malloc( array_size ); T* Z = (T*) malloc( array_size ); E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes - cpu_scalar_psf_cart(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing()); + eval(E, X, Y, Z, order); free(X); //free the coordinate meshes free(Y); diff --git a/stim/optics/scalarfield.h b/stim/optics/scalarfield.h index 4d7cf37..397f49e 100644 --- a/stim/optics/scalarfield.h +++ b/stim/optics/scalarfield.h @@ -1,11 +1,211 @@ #ifndef STIM_SCALARFIELD_H #define STIM_SCALARFIELD_H + #include "../math/rect.h" #include "../math/complex.h" +#include "../math/fft.h" namespace stim{ + /// Perform a k-space transform of a scalar field (FFT). The given field has a width of x and the calculated momentum space has a + /// width of kx (in radians). + /// @param K is a pointer to the output array of all plane waves in the field + /// @param kx is the width of the frame in momentum space + /// @param ky is the height of the frame in momentum space + /// @param E is the field to be transformed + /// @param x is the width of the field in the spatial domain + /// @param y is the height of the field in the spatial domain + /// @param nx is the number of pixels representing the field in the x (and kx) direction + /// @param ny is the number of pixels representing the field in the y (and ky) direction + template + void cpu_scalar_to_kspace(stim::complex* K, T& kx, T& ky, stim::complex* E, T x, T y, size_t nx, size_t ny){ + + kx = stim::TAU * nx / x; //calculate the width of the momentum space + ky = stim::TAU * ny / y; + + stim::complex* dev_FFT; + HANDLE_ERROR( cudaMalloc(&dev_FFT, sizeof(stim::complex) * nx * ny) ); //allocate space on the CUDA device for the output array + + stim::complex* dev_E; + HANDLE_ERROR( cudaMalloc(&dev_E, sizeof(stim::complex) * nx * ny) ); //allocate space for the field + HANDLE_ERROR( cudaMemcpy(dev_E, E, sizeof(stim::complex) * nx * ny, cudaMemcpyHostToDevice) ); //copy the field to GPU memory + + cufftResult result; + cufftHandle plan; + result = cufftPlan2d(&plan, nx, ny, CUFFT_C2C); + if(result != CUFFT_SUCCESS){ + std::cout<<"Error creating cuFFT plan."<* fft = (stim::complex*) malloc(sizeof(stim::complex) * nx * ny); + HANDLE_ERROR( cudaMemcpy(fft, dev_FFT, sizeof(stim::complex) * nx * ny, cudaMemcpyDeviceToHost) ); + + stim::cpu_fftshift(K, fft, nx, ny); + //memcpy(K, fft, sizeof(stim::complex) * nx * ny); + } + + template + void cpu_scalar_from_kspace(stim::complex* E, T& x, T& y, stim::complex* K, T kx, T ky, size_t nx, size_t ny){ + + x = stim::TAU * nx / kx; //calculate the width of the momentum space + y = stim::TAU * ny / ky; + + stim::complex* fft = (stim::complex*) malloc(sizeof(stim::complex) * nx * ny); + stim::cpu_ifftshift(fft, K, nx, ny); + //memcpy(fft, K, sizeof(stim::complex) * nx * ny); + + stim::complex* dev_FFT; + HANDLE_ERROR( cudaMalloc(&dev_FFT, sizeof(stim::complex) * nx * ny) ); //allocate space on the CUDA device for the output array + HANDLE_ERROR( cudaMemcpy(dev_FFT, fft, sizeof(stim::complex) * nx * ny, cudaMemcpyHostToDevice) ); //copy the field to GPU memory + + stim::complex* dev_E; + HANDLE_ERROR( cudaMalloc(&dev_E, sizeof(stim::complex) * nx * ny) ); //allocate space for the field + + cufftResult result; + cufftHandle plan; + result = cufftPlan2d(&plan, nx, ny, CUFFT_C2C); + if(result != CUFFT_SUCCESS){ + std::cout<<"Error creating cuFFT plan."<) * nx * ny, cudaMemcpyDeviceToHost) ); + + + } + + /// Propagate a field slice along its orthogonal 
direction by a given distance z + /// @param Enew is the resulting propogated field + /// @param E is the field to be propogated + /// @param sx is the size of the field in the lateral x direction + /// @param sy is the size of the field in the lateral y direction + /// @param z is the distance to be propagated + /// @param k is the wavenumber 2*pi/lambda + /// @param nx is the number of samples in the field along the lateral x direction + /// @param ny is the number of samples in the field along the lateral y direction + template + void cpu_scalar_propagate(stim::complex* Enew, stim::complex* E, T sx, T sy, T z, T k, size_t nx, size_t ny){ + + stim::complex* K = (stim::complex*) malloc( sizeof(stim::complex) * nx * ny ); + + T Kx, Ky; //width and height in k space + cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); + + T* mag = (T*) malloc( sizeof(T) * nx * ny ); + stim::abs(mag, K, nx * ny); + stim::cpu2image(mag, "kspace_pre_shift.bmp", nx, ny, stim::cmBrewer); + + size_t kxi, kyi; + size_t i; + T kx, kx_sq, ky, ky_sq, k_sq; + T kz; + stim::complex shift; + T min_kx = -Kx / 2; + T dkx = Kx / (nx); + + T min_ky = -Ky / 2; + T dky = Ky / (ny); + + for(kyi = 0; kyi < ny; kyi++){ //for each plane wave in the ky direction + for(kxi = 0; kxi < nx; kxi++){ //for each plane wave in the ky direction + i = kyi * nx + kxi; + + kx = min_kx + kxi * dkx; //calculate the position of the current plane wave + ky = min_ky + kyi * dky; + + kx_sq = kx * kx; + ky_sq = ky * ky; + k_sq = k*k; + + if(kx_sq + ky_sq < k_sq){ + kz = sqrt(k_sq - kx_sq - ky_sq); //estimate kz using the Fresnel approximation + shift = -exp(stim::complex(0, kz * z)); + K[i] *= shift; + K[i] /= (nx*ny); //normalize the DFT + } + else{ + K[i] = 0; + } + } + } + + stim::abs(mag, K, nx * ny); + stim::cpu2image(mag, "kspace_post_shift.bmp", nx, ny, stim::cmBrewer); + + cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); + } + + /// Apply a lowpass filter to a field slice + /// @param Enew is the resulting propogated field + /// @param E is the field to be propogated + /// @param sx is the size of the field in the lateral x direction + /// @param sy is the size of the field in the lateral y direction + /// @param highest is the highest spatial frequency that can pass through the filter + /// @param nx is the number of samples in the field along the lateral x direction + /// @param ny is the number of samples in the field along the lateral y direction + template + void cpu_scalar_lowpass(stim::complex* Enew, stim::complex* E, T sx, T sy, T highest, size_t nx, size_t ny){ + + stim::complex* K = (stim::complex*) malloc( sizeof(stim::complex) * nx * ny ); + + T Kx, Ky; //width and height in k space + cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); + + T* mag = (T*) malloc( sizeof(T) * nx * ny ); + stim::abs(mag, K, nx * ny); + stim::cpu2image(mag, "kspace_pre_lowpass.bmp", nx, ny, stim::cmBrewer); + + size_t kxi, kyi; + size_t i; + T kx, kx_sq, ky, ky_sq, k_sq; + T kz; + stim::complex shift; + T min_kx = -Kx / 2; + T dkx = Kx / (nx); + + T min_ky = -Ky / 2; + T dky = Ky / (ny); + + T highest_sq = highest * highest; + + for(kyi = 0; kyi < ny; kyi++){ //for each plane wave in the ky direction + for(kxi = 0; kxi < nx; kxi++){ //for each plane wave in the ky direction + i = kyi * nx + kxi; + + kx = min_kx + kxi * dkx; //calculate the position of the current plane wave + ky = min_ky + kyi * dky; + + kx_sq = kx * kx; + ky_sq = ky * ky; + + if(kx_sq + ky_sq > highest_sq){ + K[i] = 0; + } + else + K[i] /= nx * ny; //normalize the DFT + } + 
} + + stim::abs(mag, K, nx * ny); + stim::cpu2image(mag, "kspace_post_lowpass.bmp", nx, ny, stim::cmBrewer); + + cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); + } + enum locationType {CPUmem, GPUmem}; /// Class represents a scalar optical field. @@ -22,23 +222,20 @@ protected: size_t R[2]; locationType loc; - - -public: - - CUDA_CALLABLE scalarfield(size_t X, size_t Y, T size = 1, T z_pos = 0) : rect::rect(size, z_pos){ - R[0] = X; //set the field resolution - R[1] = Y; - - E = (stim::complex*) malloc(sizeof(stim::complex) * R[0] * R[1]); //allocate in CPU memory - loc = CPUmem; + /// Convert the field to a k-space representation (do an FFT) + void to_kspace(T& kx, T& ky){ + cpu_scalar_to_kspace(E, kx, ky, E, X.len(), Y.len(), R[0], R[1]); } - CUDA_CALLABLE ~scalarfield(){ - if(loc == CPUmem) free(E); - else cudaFree(E); + void from_kspace(){ + kx = stim::TAU * R[0] / X.len(); //calculate the width of the momentum space + ky = stim::TAU * R[1] / Y.len(); + T x, y; + cpu_scalar_from_kspace(E, x, y, E, kx, ky, R[0], R[1]); } +public: + /// Returns the number of values in the field CUDA_CALLABLE size_t size(){ return R[0] * R[1]; @@ -48,6 +245,20 @@ public: return sizeof(stim::complex) * R[0] * R[1]; } + scalarfield(size_t X, size_t Y, T size = 1, T z_pos = 0) : rect::rect(size, z_pos){ + R[0] = X; //set the field resolution + R[1] = Y; + + E = (stim::complex*) malloc(grid_bytes()); //allocate in CPU memory + memset(E, 0, grid_bytes()); + loc = CPUmem; + } + + ~scalarfield(){ + if(loc == CPUmem) free(E); + else cudaFree(E); + } + /// Calculates the distance between points on the grid T spacing(){ T du = rect::X.len() / R[0]; @@ -78,6 +289,16 @@ public: } } + /// Propagate the field along its orthogonal direction by a distance d + void propagate(T d, T k){ + cpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); + } + + /// Propagate the field along its orthogonal direction by a distance d + void lowpass(T highest){ + cpu_scalar_lowpass(E, E, X.len(), Y.len(), highest, R[0], R[1]); + } + std::string str(){ std::stringstream ss; ss<::str()< p; @@ -114,9 +335,9 @@ public: i++; } } - stim::cpu2image(X, "X.bmp", R[0], R[1], stim::cmBrewer); - stim::cpu2image(Y, "Y.bmp", R[0], R[1], stim::cmBrewer); - stim::cpu2image(Z, "Z.bmp", R[0], R[1], stim::cmBrewer); + //stim::cpu2image(X, "X.bmp", R[0], R[1], stim::cmBrewer); + //stim::cpu2image(Y, "Y.bmp", R[0], R[1], stim::cmBrewer); + //stim::cpu2image(Z, "Z.bmp", R[0], R[1], stim::cmBrewer); } else{ std::cout<<"GPU allocation of a meshgrid isn't supported yet. 
You'll have to write kernels to do the calculation."; @@ -124,6 +345,14 @@ public: } } + //clear the field, setting all values to zero + void clear(){ + if(loc == GPUmem) + HANDLE_ERROR(cudaMemset(E, 0, grid_bytes())); + else + memset(E, 0, grid_bytes()); + } + void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer){ if(loc == GPUmem) to_cpu(); //if the field is in the GPU, move it to the CPU diff --git a/stim/optics/scalarmie.h b/stim/optics/scalarmie.h new file mode 100644 index 0000000..3e1fd7a --- /dev/null +++ b/stim/optics/scalarmie.h @@ -0,0 +1,646 @@ +#ifndef STIM_MIE_H +#define STIM_MIE_H +#include + +#include "scalarwave.h" +#include "../math/bessel.h" +#include "../cuda/cudatools/devices.h" +#include + +namespace stim{ + + +/// Calculate the scattering coefficients for a spherical scatterer +template +void B_coefficients(stim::complex* B, T a, T k, stim::complex n, int Nl){ + + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + + stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) + stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) + + stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a + stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a + + stim::complex h_ka, dh_ka; + stim::complex numerator, denominator; + stim::complex i(0, 1); + for(int l = 0; l <= Nl; l++){ + h_ka.r = j_ka[l]; + h_ka.i = y_ka[l]; + dh_ka.r = dj_ka[l]; + dh_ka.i = dy_ka[l]; + + numerator = j_ka[l] * dj_kna[l] * (stim::complex)n - j_kna[l] * dj_ka[l]; + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; + B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; + } +} + +template +void A_coefficients(stim::complex* A, T a, T k, stim::complex n, int Nl){ + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + + stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) + stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) + + 
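The truncation order used throughout this header is Nl = ceil(k*a + 4*(k*a)^(1/3) + 2), clamped up to LOCAL_NL so the register-resident orders in the kernels are always filled. A quick standalone sketch (wavelength and radius values are illustrative, units arbitrary but consistent):

```cpp
// Number of Mie orders retained for a sphere of radius a illuminated at wavelength lambda.
#include <cmath>
#include <cstdio>

int mie_orders(double lambda, double a){
    const int LOCAL_NL = 16;                          // matches the #define in this header
    const double PI = 3.14159265358979323846;
    double ka = (2.0 * PI / lambda) * a;              // size parameter k*a
    int Nl = (int)std::ceil(ka + 4.0 * std::cbrt(ka) + 2.0);
    return (Nl < LOCAL_NL) ? LOCAL_NL : Nl;           // never fewer than the register-resident orders
}

int main(){
    printf("a = 1,  lambda = 0.5  ->  Nl = %d\n", mie_orders(0.5, 1.0));   // k*a ~ 12.6
    printf("a = 5,  lambda = 0.5  ->  Nl = %d\n", mie_orders(0.5, 5.0));   // k*a ~ 62.8
    return 0;
}
```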
stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a + stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a + + stim::complex h_ka, dh_ka; + stim::complex numerator, denominator; + stim::complex i(0, 1); + for(size_t l = 0; l <= Nl; l++){ + h_ka.r = j_ka[l]; + h_ka.i = y_ka[l]; + dh_ka.r = dj_ka[l]; + dh_ka.i = dy_ka[l]; + + numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka; + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; + A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; + } +} + +#define LOCAL_NL 16 +template +__global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* hB, T r_min, T dr, size_t N_hB, int Nl){ + extern __shared__ stim::complex shared_hB[]; //declare the list of waves in shared memory + + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + if(i >= N) return; //exit if this thread is outside the array + stim::vec3 p; + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? p[2] = 0 : p[2] = z[i]; + + T r = p.len(); //calculate the distance from the sphere + if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT + size_t ij = (size_t) fij; //convert to an integral index + T alpha = fij - ij; //calculate the fractional portion of the index + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT + + T cos_phi; + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials + + stim::complex hBl; + stim::complex Ei = 0; //create a register to store the result + int l; + + stim::complex hlBl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed + int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops + + #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers + for(l = 0; l <= LOCAL_NL; l++) + hlBl[l] = clerp( hB[n0j + l], hB[n1j + l], alpha ); + + for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory + shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp( hB[n0j + l], hB[n1j + l], alpha ); + + for(size_t w = 0; w < nW; w++){ //for each plane wave + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere + Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation + Pl_1 = cos_phi; + Ei += W[w].E() * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation + Ei += W[w].E() * hlBl[1] * Pl_1; + + #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) + for(l = 2; l <= LOCAL_NL; l++){ + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) + Ei += W[w].E() * hlBl[l] * Pl; //calculate and sum the current field order + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; 
+ } + + for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs + Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl; + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; + } + } + E[i] += Ei; //copy the result to device memory +} + +template +void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ + + size_t max_shared_mem = stim::sharedMemPerBlock(); + size_t hBl_array = sizeof(stim::complex) * (Nl + 1); + std::cout<<"hl*Bl array size: "<) * (Nl - LOCAL_NL); //amount of shared memory to allocate + std::cout<<"shared memory allocated: "<<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel +} + +template +__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + if(i >= N) return; //exit if this thread is outside the array + + stim::vec3 p; + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? p[2] = 0 : p[2] = z[i]; + + r[i] = p.len(); +} +/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave + +/// @param E is a pointer to the destination field values +/// @param N is the number of points used to calculate the field +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) +/// @param W is an array of planewaves that will be scattered +/// @param a is the radius of the sphere +/// @param n is the complex refractive index of the sphere +template +void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n, T r_spacing = 0.1){ + //calculate the necessary number of orders required to represent the scattered field + T k = W[0].kmag(); + + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) + std::cout<<"Nl: "<* B = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients + B_coefficients(B, a, k, n, Nl); + +#ifdef CUDA_FOUND + stim::complex* dev_E; //allocate space for the field + cudaMalloc(&dev_E, N * sizeof(stim::complex)); + cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); + //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) + + // COORDINATES + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) + if(x != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) + if(y != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), 
cudaMemcpyHostToDevice)); + } + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) + if(z != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); + } + + // PLANE WAVES + stim::scalarwave* dev_W; //allocate space and copy plane waves + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave) * W.size(), cudaMemcpyHostToDevice) ); + + // BESSEL FUNCTION LOOK-UP TABLE + //calculate the distance from the sphere center + T* dev_r; + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); + + int threads = stim::maxThreadsPerBlock(); + dim3 blocks((unsigned)(N / threads + 1)); + cuda_dist <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); + + //Find the minimum and maximum values of r + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); //create a cuBLAS handle + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS initialization failed\n"); + exit(1); + } + + int i_min, i_max; + stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate minimum r value.\n"); + exit(1); + } + stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate maximum r value.\n"); + exit(1); + } + + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility + i_max--; + T r_min, r_max; //allocate space to store the minimum and maximum values + HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU + HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); + + r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere) + + //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r + size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); + + //T kr_min = k * r_min; + //T kr_max = k * r_max; + + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); + + size_t hB_bytes = sizeof(stim::complex) * (Nl+1) * N_hB_lut; + stim::complex* hB_lut = (stim::complex*) malloc(hB_bytes); //pointer to the look-up table + T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT + std::cout<<"LUT jl bytes: "< hl; + for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT + stim::bessjyv_sph(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] + for(size_t l = 0; l <= Nl; l++){ //for each order + hl.r = (T)jv[l]; + hl.i = (T)yv[l]; + + hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result + //std::cout<(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); + + //Allocate device memory and copy everything to the GPU + stim::complex* dev_hB_lut; + HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); + 
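The look-up table built above is sampled in the kernel through a fractional index: fij = (r - r_min)/dr, ij = floor(fij), alpha = fij - ij, followed by the linear blend that clerp() applies to the bracketing h_l*B_l entries. The CPU sketch below (not part of the patch) reproduces that indexing on a real-valued stand-in table; std::cos takes the place of the complex h_l*B_l values purely for illustration.

```cpp
// Build a radial LUT and sample it back with the kernel's fractional-index interpolation.
#include <cmath>
#include <cstdio>
#include <vector>

int main(){
    const double r_min = 1.0, r_max = 10.0, r_spacing = 0.1;
    const size_t N_lut = (size_t)((r_max - r_min) / r_spacing + 1);
    const double dr = (r_max - r_min) / (N_lut - 1);     // actual spacing between samples

    std::vector<double> lut(N_lut);                      // stand-in for the h_l*B_l table
    for(size_t ri = 0; ri < N_lut; ri++)
        lut[ri] = std::cos(r_min + ri * dr);             // any smooth radial function works here

    double r = 4.321;                                    // query point inside [r_min, r_max]
    double fij = (r - r_min) / dr;                       // floating-point index into the LUT
    size_t ij = (size_t)fij;                             // integer part: lower bracket
    double alpha = fij - ij;                             // fractional part: blend weight
    double approx = lut[ij] * (1.0 - alpha) + lut[ij + 1] * alpha;

    printf("interpolated %f, exact %f\n", approx, std::cos(r));
    return 0;
}
```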
HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); + + gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); + + cudaMemcpy(E, dev_E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory + + if(x != NULL) cudaFree(dev_x); //free everything + if(y != NULL) cudaFree(dev_y); + if(z != NULL) cudaFree(dev_z); + cudaFree(dev_E); +#else + + + //allocate space to store the bessel function call results + double vm; + double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); + + T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); + + T r, kr, cos_phi; + stim::complex h; + for(size_t i = 0; i < N; i++){ + stim::vec3 p; //declare a 3D point + + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? p[2] = 0 : p[2] = z[i]; + r = p.len(); + if(r >= a){ + for(size_t w = 0; w < W.size(); w++){ + kr = p.len() * W[w].kmag(); //calculate k*r + stim::bessjyv_sph(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr); + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction + stim::legendre(Nl, cos_phi, P); + + for(size_t l = 0; l <= Nl; l++){ + h.r = j_kr[l]; + h.i = y_kr[l]; + E[i] += W[w].E() * B[l] * h * P[l]; + } + } + } + } +#endif +} + +template +void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n, T r_spacing = 0.1){ + std::vector< stim::scalarwave > W(1, w); + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing); +} + +template +__global__ void cuda_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* jA, T r_min, T dr, size_t N_jA, int Nl){ + extern __shared__ stim::complex shared_jA[]; //declare the list of waves in shared memory + + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + if(i >= N) return; //exit if this thread is outside the array + stim::vec3 p; + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; + + T r = p.len(); //calculate the distance from the sphere + if(r >= a) return; //exit if the point is inside the sphere (we only calculate the internal field) + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT + size_t ij = (size_t) fij; //convert to an integral index + T alpha = fij - ij; //calculate the fractional portion of the index + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT + + T cos_phi; + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials + + stim::complex jAl; + stim::complex Ei = 0; //create a register to store the result + int l; + + stim::complex jlAl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed + int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops + + #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers + for(l = 0; l <= LOCAL_NL; l++) + jlAl[l] = clerp( jA[n0j + l], jA[n1j + l], alpha ); + + for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory + shared_jA[shared_start + (l - (LOCAL_NL+1))] = clerp( jA[n0j + l], jA[n1j + l], alpha ); + + for(size_t w = 0; w < nW; w++){ //for each plane wave + if(r == 0) cos_phi = 0; + else + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere + Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation + Pl_1 = cos_phi; + Ei += W[w].E() * jlAl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation + Ei += W[w].E() * jlAl[1] * Pl_1; + + #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) + for(l = 2; l <= LOCAL_NL; l++){ + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) + Ei += W[w].E() * jlAl[l] * Pl; //calculate and sum the current field order + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; + } + + for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs + Ei += W[w].E() * shared_jA[shared_start + l - LOCAL_NL - 1] * Pl; + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; + } + } + E[i] = Ei; //copy the result to device memory +} + +template +void gpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* jA, T r_min, T dr, size_t N_jA, size_t Nl){ + + size_t max_shared_mem = stim::sharedMemPerBlock(); + size_t hBl_array = sizeof(stim::complex) * (Nl + 1); + std::cout<<"hl*Bl array size: "<) * (Nl - LOCAL_NL); //amount of shared memory to allocate + std::cout<<"shared memory allocated: "<<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel +} + +/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere + +/// @param E is a pointer to the destination field values +/// @param 
N is the number of points used to calculate the field +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) +/// @param W is a list of plane waves that will be scattered +/// @param a is the radius of the sphere +/// @param n is the complex refractive index of the sphere +template<typename T> +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, T r_spacing = 0.1){ +//calculate the necessary number of orders required to represent the scattered field + T k = W[0].kmag(); + + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) + std::cout<<"Nl: "<<Nl<<std::endl; + + stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients + A_coefficients(A, a, k, n, Nl); + +#ifdef CUDA_FOUND + stim::complex<T>* dev_E; //allocate space for the field + cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); + cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used) + + // COORDINATES + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) + if(x != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) + if(y != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) + if(z != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); + } + + // PLANE WAVES + stim::scalarwave<T>* dev_W; //allocate space and copy plane waves + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) ); + + // BESSEL FUNCTION LOOK-UP TABLE + //calculate the distance from the sphere center + T* dev_r; + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); + + int threads = stim::maxThreadsPerBlock(); + dim3 blocks((unsigned)(N / threads + 1)); + cuda_dist <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); + + //Find the minimum and maximum values of r + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); //create a cuBLAS handle + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS initialization failed\n"); + exit(1); + } + + int i_min, i_max; + stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate minimum r value.\n"); + exit(1); + } + stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate maximum r value.\n"); + exit(1); + } + + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility + i_max--; + T r_min, r_max; 
//allocate space to store the minimum and maximum values + HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU + HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); + + r_max = min(r_max, a); //the internal field doesn't exist outside of the sphere + + size_t N_jA_lut = (size_t)((r_max - r_min) / r_spacing + 1); + + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + stim::complex<double>* jv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + stim::complex<double>* yv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + stim::complex<double>* djv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + stim::complex<double>* dyv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + + size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut; + stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes); //pointer to the look-up table + T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT + std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl; + stim::complex<double> hl; + stim::complex<double> nd = (stim::complex<double>)n; + for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT + stim::cbessjyva_sph(Nl, nd * k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] + for(size_t l = 0; l <= Nl; l++){ //for each order + jA_lut[ri * (Nl + 1) + l] = (stim::complex<T>)(jv[l] * (stim::complex<double>)A[l]); //store the product of the spherical bessel function and the A coefficient + } + } + + //Allocate device memory and copy everything to the GPU + stim::complex<T>* dev_jA_lut; + HANDLE_ERROR( cudaMalloc(&dev_jA_lut, jA_bytes) ); + HANDLE_ERROR( cudaMemcpy(dev_jA_lut, jA_lut, jA_bytes, cudaMemcpyHostToDevice) ); + + gpu_scalar_mie_internal(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_jA_lut, r_min, dr, N_jA_lut, Nl); + + cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory + + if(x != NULL) cudaFree(dev_x); //free everything + if(y != NULL) cudaFree(dev_y); + if(z != NULL) cudaFree(dev_z); + cudaFree(dev_E); +#else + + //allocate space to store the bessel function call results + double vm; + stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + stim::complex<double>* y_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + stim::complex<double>* dj_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + stim::complex<double>* dy_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + + T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); + + T r, cos_phi; + stim::complex<double> knr; + stim::complex<T> h; + for(size_t i = 0; i < N; i++){ + stim::vec3<T> p; //declare a 3D point + + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; + r = p.len(); + if(r < a){ + E[i] = 0; + for(size_t w = 0; w < W.size(); w++){ + knr = (stim::complex<double>)n * p.len() * W[w].kmag(); //calculate k*n*r + + stim::cbessjyva_sph(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr); + if(r == 0) + cos_phi = 0; + else + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction + stim::legendre(Nl, cos_phi, P); + + for(size_t l = 0; l <= Nl; l++){ + E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l]; + } + } + } + } +#endif +} + +template<typename T> +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){ + std::vector< stim::scalarwave<T> > W(1, w); + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, r_spacing); +} + + +/// Class stim::scalarmie represents a scalar Mie scattering model that can be used to calculate the fields produced by a scattering sphere. +template<typename T> +class scalarmie +{ +private: + T radius; //radius of the scattering sphere + stim::complex<T> n; //refractive index of the scattering sphere + +public: + + scalarmie(T r, stim::complex<T> ri){ + radius = r; + n = ri; + } + + void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ + + size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid + float* X = (float*) malloc( array_size ); //allocate space for the coordinate meshes + float* Y = (float*) malloc( array_size ); + float* Z = (float*) malloc( array_size ); + E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes + + b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion + + std::vector< stim::scalarwave<T> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves + stim::cpu_scalar_mie_scatter(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); + stim::cpu_scalar_mie_internal(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); + } + +}; //end stim::scalarmie + +} //end namespace stim + +#endif \ No newline at end of file diff --git a/stim/optics/scalarwave.h b/stim/optics/scalarwave.h index ac620ab..d993567 100644 --- a/stim/optics/scalarwave.h +++ b/stim/optics/scalarwave.h @@ -210,7 +210,7 @@ template<typename T> __global__ void cuda_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t n_waves){ extern __shared__ stim::scalarwave<T> shared_W[]; //declare the list of waves in shared memory - stim::cuda::sharedMemcpy(shared_W, W, n_waves, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access + stim::cuda::threadedMemcpy(shared_W, W, n_waves, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access __syncthreads(); //synchronize threads to ensure all data is copied size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array diff --git a/stim/parser/arguments.h b/stim/parser/arguments.h index 3ca819e..148893f 100644 --- a/stim/parser/arguments.h +++ b/stim/parser/arguments.h @@ -42,11 +42,13 @@ namespace stim{ bool char_numerical(char c){ if( (c >= '0' && c <= '9') || c == '-' || c == '.') return true; + return false; } bool char_integral(char c){ if( (c >= '0' && c <= '9') || c == '-') return true; + return false; } /// Test if a given string contains a numerical (real) value @@ -475,31 +477,31 @@ namespace stim{ ///Determines if a parameter has been set and returns true if it has /// @param _name is the name of the argument - bool 
operator()(std::string _name) - { - size_t i = find(opts.begin(), opts.end(), _name) - opts.begin(); + bool operator()(std::string _name){ + std::vector<cmd_option>::iterator it; + it = std::find(opts.begin(), opts.end(), _name);// - opts.begin(); - if(i < 0){ + if(it == opts.end()){ std::cout<<"ERROR - Unspecified parameter name: "<<_name<<std::endl; exit(1); } - return opts[i].is_set(); + return it->is_set(); } ///Returns the number of parameters for a specified argument /// @param _name is the name of the argument whose parameter number will be returned - size_t nargs(std::string _name) - { - size_t i = find(opts.begin(), opts.end(), _name) - opts.begin(); + size_t nargs(std::string _name){ + std::vector<cmd_option>::iterator it; + it = find(opts.begin(), opts.end(), _name);// - opts.begin(); - if(i < 0){ + if(it == opts.end()){ std::cout<<"ERROR - Unspecified parameter name: "<<_name<<std::endl; exit(1); } - return opts[i].nargs(); + return it->nargs(); } ///Returns the number of arguments that have been set @@ -525,17 +527,16 @@ namespace stim{ ///Returns an object describing the argument /// @param _name is the name of the requested argument - cmd_option operator[](std::string _name) - { - size_t i = find(opts.begin(), opts.end(), _name) - opts.begin(); + cmd_option operator[](std::string _name){ + std::vector<cmd_option>::iterator it; + it = find(opts.begin(), opts.end(), _name);// - opts.begin(); - if(i < 0 || i >= opts.size()) - { + if(it == opts.end()){ std::cout<<"ERROR - Unspecified parameter name: "<<_name<
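Note on the stim/parser/arguments.h hunks above: the old lookups stored the result of find() minus opts.begin() in a size_t, so the "if(i < 0)" guard could never fire and a missing option name indexed past the end of opts. The replacement keeps the iterator returned by std::find and compares it against opts.end(). The sketch below shows the same pattern in isolation; it uses a plain std::vector<std::string> as a stand-in for the library's std::vector<cmd_option>, so the names here are illustrative rather than code from the patch.

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for the option lookup: an unsigned index can never be negative,
// so the not-found case must be detected by comparing the iterator returned
// by std::find against end() rather than testing the index with i < 0.
bool option_exists(const std::vector<std::string>& opts, const std::string& name){
	std::vector<std::string>::const_iterator it = std::find(opts.begin(), opts.end(), name);
	if(it == opts.end()){
		std::cout<<"ERROR - Unspecified parameter name: "<<name<<std::endl;
		return false;
	}
	return true;
}

int main(){
	std::vector<std::string> opts;
	opts.push_back("--input");
	opts.push_back("--help");
	std::cout<<option_exists(opts, "--help")<<std::endl;		//prints 1
	std::cout<<option_exists(opts, "--verbose")<<std::endl;	//prints the error message, then 0
	return 0;
}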
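Note on the internal-field kernel earlier in this patch: the look-up table stores one row of Nl + 1 values of j_l * A_l per radius sample, and the kernel converts a point's radius r into a fractional row index, then linearly interpolates each coefficient between the two bracketing rows (the clerp calls). The standalone CPU sketch below mirrors that indexing; the function name and the use of std::complex<float> are assumptions made for illustration, not code from the patch.

#include <complex>
#include <cstddef>
#include <vector>

// Illustrative CPU version of the kernel's LUT lookup: rows of width (Nl + 1)
// hold j_l * A_l for increasing radius; a sample at radius r is linearly
// interpolated between the two bracketing rows.
std::complex<float> lut_lookup(const std::vector< std::complex<float> >& lut,
                               std::size_t Nl, std::size_t l, float r, float r_min, float dr){
	float fij = (r - r_min) / dr;				//fractional row index
	std::size_t ij = (std::size_t)fij;			//integer row index
	float alpha = fij - (float)ij;				//fractional distance between the two rows
	std::size_t n0 = ij * (Nl + 1);				//offset of the first bracketing row
	std::size_t n1 = (ij + 1) * (Nl + 1);		//offset of the second bracketing row
	return lut[n0 + l] * (1.0f - alpha) + lut[n1 + l] * alpha;	//linear interpolation (the kernel's clerp)
}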