updated files and added features to support BIMSim

David Mayerich
1 parent 0012d210
Showing 8 changed files with 235 additions and 112 deletions Show diff stats
stim/cuda/cudatools/devices.h
stim/image/image.h
stim/math/bessel.h
stim/optics/scalarbeam.h
stim/optics/scalarfield.h
stim/optics/scalarmie.h
stim/optics/scalarwave.h
stim/parser/table.h
@@ -4,35 +4,56 @@
 #include <cuda.h>
 namespace stim{
-extern "C"
-int maxThreadsPerBlock()
-{
-	int device;
-	cudaGetDevice(&device);		//get the id of the current device
-	cudaDeviceProp props;		//device property structure
-	cudaGetDeviceProperties(&props, device);
-	return props.maxThreadsPerBlock;
-}
+	extern "C"
+	int maxThreadsPerBlock(){
+		int device;
+		cudaGetDevice(&device);		//get the id of the current device
+		cudaDeviceProp props;		//device property structure
+		cudaGetDeviceProperties(&props, device);
+		return props.maxThreadsPerBlock;
+	}
-extern "C"
-size_t sharedMemPerBlock()
-{
-	int device;
-	cudaGetDevice(&device);		//get the id of the current device
-	cudaDeviceProp props;		//device property structure
-	cudaGetDeviceProperties(&props, device);
-	return props.sharedMemPerBlock;
-}
+	extern "C"
+	size_t sharedMemPerBlock(){
+		int device;
+		cudaGetDevice(&device);		//get the id of the current device
+		cudaDeviceProp props;		//device property structure
+		cudaGetDeviceProperties(&props, device);
+		return props.sharedMemPerBlock;
+	}
-extern "C"
-size_t constMem()
-{
-	int device;
-	cudaGetDevice(&device);		//get the id of the current device
-	cudaDeviceProp props;		//device property structure
-	cudaGetDeviceProperties(&props, device);
-	return props.totalConstMem;
-}
+	extern "C"
+	size_t constMem(){
+		int device;
+		cudaGetDevice(&device);		//get the id of the current device
+		cudaDeviceProp props;		//device property structure
+		cudaGetDeviceProperties(&props, device);
+		return props.totalConstMem;
+	}
+
+	//tests that a given device ID is valid and provides at least the specified compute capability
+	bool testDevice(int d, int major, int minor){
+		int nd;
+		cudaGetDeviceCount(&nd);		//get the number of CUDA devices
+		if(d < nd && d > 0)	{		//if the given ID has an associated device
+			cudaDeviceProp props;
+			cudaGetDeviceProperties(&props, d);	//get the device properties structure
+			if(props.major >= major && props.minor >= minor)
+					return true;
+		}
+		return false;
+	}
+
+	//tests each device ID in a list and returns the number of devices that fit the desired
+	//	compute capability
+	int testDevices(int* dlist, unsigned n_devices, int major, int minor){
+		int valid = 0;
+		for(int d = 0; d < n_devices; d++){
+			if(testDevice(dlist[d], major, minor))
+				valid++;
+		}
+		return valid;
+	}
 }	//end namespace rts
 #endif
@@ -324,6 +324,12 @@ public:
 	//save a file
 	void save(std::string filename){
+		stim::filename file(filename);
+		if (file.extension() == "raw" || file.extension() == "") {
+			std::ofstream outfile(filename, std::ios::binary);
+			outfile.write((char*)img, sizeof(T) * R[0] * R[1] * R[2]);
+			outfile.close();
+		}
 #ifdef USING_OPENCV
 		//OpenCV uses an interleaved format, so convert first and then output
 		T* buffer = (T*) malloc(bytes());
@@ -17,10 +17,10 @@ static complex&lt;double&gt; czero(0.0,0.0);
 template< typename P >
 P gamma(P x)
 {
-	const P EPS = numeric_limits<P>::epsilon();
-	const P FPMIN_MAG = numeric_limits<P>::min();
-	const P FPMIN = numeric_limits<P>::lowest();
-	const P FPMAX = numeric_limits<P>::max();
+	const P EPS = std::numeric_limits<P>::epsilon();
+	const P FPMIN_MAG = std::numeric_limits<P>::min();
+	const P FPMIN = std::numeric_limits<P>::lowest();
+	const P FPMAX = std::numeric_limits<P>::max();
     int i,k,m;
     P ga,gr,r,z;
@@ -94,10 +94,10 @@ template&lt;typename P&gt;
 int bessjy01a(P x,P &j0,P &j1,P &y0,P &y1,
     P &j0p,P &j1p,P &y0p,P &y1p)
 {
-	const P EPS = numeric_limits<P>::epsilon();
-	const P FPMIN_MAG = numeric_limits<P>::min();
-	const P FPMIN = numeric_limits<P>::lowest();
-	const P FPMAX = numeric_limits<P>::max();
+	const P EPS = std::numeric_limits<P>::epsilon();
+	const P FPMIN_MAG = std::numeric_limits<P>::min();
+	const P FPMIN = std::numeric_limits<P>::lowest();
+	const P FPMAX = std::numeric_limits<P>::max();
     P x2,r,ec,w0,w1,r0,r1,cs0,cs1;
     P cu,p0,q0,p1,q1,t1,t2;
@@ -606,10 +606,10 @@ int bessjyv(P v,P x,P &amp;vm,P *jv,P *yv,
     P b,ec,w0,w1,bju0,bju1,pv0,pv1,byvk;
     int j,k,l,m,n,kz;
-	const P EPS = numeric_limits<P>::epsilon();
-	const P FPMIN_MAG = numeric_limits<P>::min();
-	const P FPMIN = numeric_limits<P>::lowest();
-	const P FPMAX = numeric_limits<P>::max();
+	const P EPS = std::numeric_limits<P>::epsilon();
+	const P FPMIN_MAG = std::numeric_limits<P>::min();
+	const P FPMIN = std::numeric_limits<P>::lowest();
+	const P FPMAX = std::numeric_limits<P>::max();
     x2 = x*x;
     n = (int)v;
@@ -405,7 +405,7 @@ private:
 	T NA[2];				//numerical aperature of the focusing optics	
 	vec3<T> f;				//focal point
 	vec3<T> d;				//propagation direction
-	T A;		//beam amplitude
+	T A;					//beam amplitude
 	T lambda;				//beam wavelength
 public:
@@ -440,10 +440,10 @@ public:
 		stim::complex<T> apw;										//allocate space for the amplitude at the focal point
 		T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N);			//constant value weights plane waves based on the aperture and number of samples (N)
 		stim::vec3<T> kpw;											//declare the new k-vector based on the focused plane wave direction
-		for(size_t i=0; i<N; i++){										//for each sample
+		for(size_t i=0; i<N; i++){									//for each sample
 			kpw = dirs[i] * kmag;									//calculate the k-vector for the new plane wave
-			apw = a * exp(stim::complex<T>(0, kpw.dot(-f)));				//calculate the amplitude for the new plane wave
-			samples[i] = scalarwave<T>(kpw, apw);			//create a plane wave based on the direction
+			apw = a * A * exp(stim::complex<T>(0, kpw.dot(-f)));	//calculate the amplitude for the new plane wave
+			samples[i] = scalarwave<T>(kpw, apw);					//create a plane wave based on the direction
 		}
 		return samples;
 	}
@@ -143,8 +143,8 @@ namespace stim{
 	/// Propagate a field slice along its orthogonal direction by a given distance z
-	/// @param Enew is the resulting propogated field
-	/// @param E is the field to be propogated
+	/// @param Enew is the resulting propagated field
+	/// @param E is the field to be propagated
 	/// @param sx is the size of the field in the lateral x direction
 	/// @param sy is the size of the field in the lateral y direction
 	/// @param z is the distance to be propagated
@@ -187,12 +187,13 @@ namespace stim{
 				if(kx_sq + ky_sq < k_sq){
 					kz = sqrt(k_sq - kx_sq - ky_sq);			//estimate kz using the Fresnel approximation				
-					shift = -exp(stim::complex<T>(0, kz * z));
+					shift = exp(stim::complex<T>(0, kz * z));
+					//std::cout << shift << std::endl;
 					K[i] *= shift;
 					K[i] /= (nx*ny);							//normalize the DFT
 				}
 				else{
-					K[i] = 0;
+					K[i] /= (nx*ny);
 				}
 			}
 		}
@@ -342,7 +343,7 @@ public:
 	T spacing(){
 		T du = rect<T>::X.len() / R[0];
 		T dv = rect<T>::Y.len() / R[1];
-		return min<T>(du, dv);
+		return std::min<T>(du, dv);
 	}
 	/// Copy the field array to the GPU, if it isn't already there
@@ -537,7 +538,7 @@ public:
 			memset(E, 0, grid_bytes());
 	}
-	void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer){
+	void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer, T minval = 0, T maxval = 0){
 		if(loc == GPUmem){
 			T* image;
@@ -559,7 +560,10 @@ public:
 				cuda_intensity<T><<< blocks, threads >>>(image, E, size());
 				break;
 			}
-			stim::gpu2image<T>(image, filename, R[0], R[1], stim::cmBrewer);
+			if (minval == maxval)
+				stim::gpu2image<T>(image, filename, R[0], R[1], cmap);
+			else
+				stim::gpu2image<T>(image, filename, R[0], R[1], minval, maxval, cmap);
 			HANDLE_ERROR( cudaFree(image) );
 		}
 		else{
@@ -579,7 +583,10 @@ public:
 				stim::intensity(image, E, size());
 				break;
 			}
-			stim::cpu2image(image, filename, R[0], R[1], cmap);			//save the resulting image
+			if (minval == maxval)
+				stim::cpu2image(image, filename, R[0], R[1], cmap);			//save the resulting image
+			else
+				stim::cpu2image<T>(image, filename, R[0], R[1], minval, maxval, cmap);
 			free(image);												//free the real image
 		}
 	}
@@ -89,7 +89,7 @@ void A_coefficients(stim::complex&lt;T&gt;* A, T a, T k, stim::complex&lt;T&gt; n, int Nl){
 #define LOCAL_NL	16
 template<typename T>
-__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){
+__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){
 	extern __shared__ stim::complex<T> shared_hB[];		//declare the list of waves in shared memory
 	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
@@ -98,7 +98,7 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* 
 	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
 	(y == NULL) ? p[1] = 0 : p[1] = y[i];
 	(z == NULL) ? p[2] = 0 : p[2] = z[i];
-	
+	p = p - c;
 	T r = p.len();														//calculate the distance from the sphere
 	if(r < a) return;													//exit if the point is inside the sphere (we only calculate the internal field)
 	T fij = (r - r_min)/dr;											//FP index into the spherical bessel LUT
@@ -124,24 +124,27 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* 
 	for(l = LOCAL_NL+1; l <= Nl; l++)									//copy any additional h_l * B_l components to shared memory
 		shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
+	complex<T> e, Ew;
 	for(size_t w = 0; w < nW; w++){										//for each plane wave
 		cos_phi = p.norm().dot(W[w].kvec().norm());						//calculate the cosine of the angle between the k vector and the direction from the sphere
 		Pl_2 = 1;														//the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
 		Pl_1 = cos_phi;
-		Ei += W[w].E() * hlBl[0] * Pl_2;								//unroll the first two orders using the initial steps of the Legendre recursive relation
-		Ei += W[w].E() * hlBl[1] * Pl_1;		
+		e = exp(complex<T>(0, W[w].kvec().dot(c)));
+		Ew = W[w].E() * e;
+		Ei += Ew * hlBl[0] * Pl_2;								//unroll the first two orders using the initial steps of the Legendre recursive relation
+		Ei += Ew * hlBl[1] * Pl_1;		
 		#pragma unroll LOCAL_NL-1										//unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
 		for(l = 2; l <= LOCAL_NL; l++){
 			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);	//calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
-			Ei += W[w].E() * hlBl[l] * Pl;								//calculate and sum the current field order
+			Ei += Ew * hlBl[l] * Pl;								//calculate and sum the current field order
 			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
 			Pl_1 = Pl;
 		}
 		for(l = LOCAL_NL+1; l <= Nl; l++){											//do the same as above, except for any additional orders that are stored in shared memory (not registers)
 			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);				//again, this is where most computation in the kernel occurs
-			Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl;
+			Ei += Ew * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl;
 			Pl_2 = Pl_1;															//shift Pl_1 -> Pl_2 and Pl -> Pl_1
 			Pl_1 = Pl;			
 		}
@@ -149,8 +152,18 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* 
 	E[i] += Ei;															//copy the result to device memory
 }
+///Calculate the scalar Mie scattered field on the GPU
+/// @param E (GPU) is the N x N destination scalar field
+/// @param N is the number fo elements of the scalar field in each direction
+/// @param x (GPU) is the grid of X coordinates for each point in E
+/// @param y (GPU) is the grid of Y coordinates for each point in E
+/// @param z (GPU) is the grid of Z coordinates for each point in E
+/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer
+/// @param a is the radius of the Mie scatterer
+/// @param n is the complex refractive index of the Mie scatterer
+/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables)
 template<typename T>
-void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
+void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
 	size_t max_shared_mem = stim::sharedMemPerBlock();	
 	size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
@@ -164,11 +177,11 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, sti
 	if(Nl <= LOCAL_NL) shared_mem = 0;
 	else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL);				//amount of shared memory to allocate
 	//std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
-	cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl);	//call the kernel
+	cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, c, hB, kr_min, dkr, N_hB, (int)Nl);	//call the kernel
 }
 template<typename T>
-__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){
+__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
 	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
 	if(i >= N) return;													//exit if this thread is outside the array
@@ -177,10 +190,21 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){
 	(y == NULL) ? p[1] = 0 : p[1] = y[i];
 	(z == NULL) ? p[2] = 0 : p[2] = z[i];
-	r[i] = p.len();
+	r[i] = (p - c).len();
 }
+
+///Calculate the scalar Mie scattered field on the GPU
+/// @param E (GPU) is the N x N destination scalar field
+/// @param N is the number fo elements of the scalar field in each direction
+/// @param x (GPU) is the grid of X coordinates for each point in E
+/// @param y (GPU) is the grid of Y coordinates for each point in E
+/// @param z (GPU) is the grid of Z coordinates for each point in E
+/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer
+/// @param a is the radius of the Mie scatterer
+/// @param n is the complex refractive index of the Mie scatterer
+/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables)
 template<typename T>
-void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){
+void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0), T r_spacing = 0.1){
 	//calculate the necessary number of orders required to represent the scattered field
 	T k = W[0].kmag();
@@ -205,7 +229,7 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
 	int threads = stim::maxThreadsPerBlock();
 	dim3 blocks((unsigned)(N / threads + 1));
-	cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N);
+	cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N, c);
 	//Find the minimum and maximum values of r
     cublasStatus_t stat;
@@ -273,10 +297,16 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
 	stim::complex<T>* dev_hB_lut;
 	HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
 	HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
+	//std::cout << "r_min: " << r_min << std::endl;
+	//std::cout << "dr: " << dr << std::endl;
+	gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, c, dev_hB_lut, r_min, dr, N_hB_lut, Nl);
-	gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl);
+	HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost));			//copy the field from device memory
+
+	HANDLE_ERROR(cudaFree(dev_hB_lut));
+	HANDLE_ERROR(cudaFree(dev_r));
+	HANDLE_ERROR(cudaFree(dev_W));
-	cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost);			//copy the field from device memory
 }
 /// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
@@ -289,10 +319,10 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
 /// @param a is the radius of the sphere
 /// @param n is the complex refractive index of the sphere
 template<typename T>
-void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){
+void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
-#ifdef CUDA_FOUND
+/*#ifdef CUDA_FOUND
 	stim::complex<T>* dev_E;										//allocate space for the field
 	cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
 	cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
@@ -315,13 +345,14 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
 		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
 	}
-	gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, r_spacing);
+	gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, c, r_spacing);
 	if(x != NULL) cudaFree(dev_x);														//free everything
 	if(y != NULL) cudaFree(dev_y);
 	if(z != NULL) cudaFree(dev_z);
 	cudaFree(dev_E);
 #else
+*/
 	//calculate the necessary number of orders required to represent the scattered field
 	T k = W[0].kmag();
@@ -345,15 +376,18 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
 	T r, kr, cos_phi;
 	stim::complex<T> h;
+	stim::complex<T> Ew;
 	for(size_t i = 0; i < N; i++){
 		stim::vec3<T> p;															//declare a 3D point
 		(x == NULL) ? p[0] = 0 : p[0] = x[i];										// test for NULL values and set positions
 		(y == NULL) ? p[1] = 0 : p[1] = y[i];
 		(z == NULL) ? p[2] = 0 : p[2] = z[i];
+		p = p - c;
 		r = p.len();
 		if(r >= a){
 			for(size_t w = 0; w < W.size(); w++){
+				Ew = W[w].E() * exp(stim::complex<double>(0, W[w].kvec().dot(c)));
 				kr = p.len() * W[w].kmag();											//calculate k*r
 				stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr);
 				cos_phi = p.norm().dot(W[w].kvec().norm());							//calculate the cosine of the angle from the propagating direction
@@ -362,18 +396,18 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
 				for(size_t l = 0; l <= Nl; l++){
 					h.r = j_kr[l];
 					h.i = y_kr[l];
-					E[i] += W[w].E() * B[l] * h * P[l];
+					E[i] += Ew * B[l] * h * P[l];
 				}
 			}
 		}
 	}
-#endif
+//#endif
 }
 template<typename T>
-void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){
+void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0), T r_spacing = 0.1){
 	std::vector< stim::scalarwave<T> > W(1, w);
-	cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing);
+	cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, c, r_spacing);
 }
 template<typename T>
@@ -389,14 +423,14 @@ __global__ void cuda_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T*
 	T r = p.len();														//calculate the distance from the sphere
 	if(r >= a) return;													//exit if the point is inside the sphere (we only calculate the internal field)
-	T fij = (r - r_min)/dr;											//FP index into the spherical bessel LUT
+	T fij = (r - r_min)/dr;												//FP index into the spherical bessel LUT
 	size_t ij = (size_t) fij;											//convert to an integral index
 	T alpha = fij - ij;													//calculate the fractional portion of the index
-	size_t n0j = ij * (Nl + 1);												//start of the first entry in the LUT
-	size_t n1j = (ij+1) * (Nl + 1);											//start of the second entry in the LUT
+	size_t n0j = ij * (Nl + 1);											//start of the first entry in the LUT
+	size_t n1j = (ij+1) * (Nl + 1);										//start of the second entry in the LUT
 	T cos_phi;	
-	T Pl_2, Pl_1, Pl;														//declare registers to store the previous two Legendre polynomials
+	T Pl_2, Pl_1, Pl;													//declare registers to store the previous two Legendre polynomials
 	stim::complex<T> jAl;
 	stim::complex<T> Ei = 0;											//create a register to store the result
@@ -415,7 +449,7 @@ __global__ void cuda_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T*
 	for(size_t w = 0; w < nW; w++){										//for each plane wave
 		if(r == 0) cos_phi = 0;
 		else
-			cos_phi = p.norm().dot(W[w].kvec().norm());						//calculate the cosine of the angle between the k vector and the direction from the sphere
+			cos_phi = p.norm().dot(W[w].kvec().norm());					//calculate the cosine of the angle between the k vector and the direction from the sphere
 		Pl_2 = 1;														//the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
 		Pl_1 = cos_phi;
 		Ei += W[w].E() * jlAl[0] * Pl_2;								//unroll the first two orders using the initial steps of the Legendre recursive relation
@@ -468,7 +502,7 @@ void gpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
 /// @param a is the radius of the sphere
 /// @param n is the complex refractive index of the sphere
 template<typename T>
-void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, T r_spacing = 0.1){
+void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
 //calculate the necessary number of orders required to represent the scattered field
 	T k = W[0].kmag();
@@ -480,7 +514,7 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
 	stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
 	A_coefficients(A, a, k, n, Nl);
-#ifdef CUDA_FOUND
+/*#ifdef CUDA_FOUND
 	stim::complex<T>* dev_E;										//allocate space for the field
 	cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
 	cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
@@ -538,6 +572,7 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
         printf ("CUBLAS Error: failed to calculate maximum r value.\n");
 		exit(1);
 	}
+	cublasDestroy(handle);									//destroy the CUBLAS handle
 	i_min--;				//cuBLAS uses 1-based indexing for Fortran compatibility
 	i_max--;
@@ -585,9 +620,9 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
 	HANDLE_ERROR( cudaFree(dev_E) );
 	HANDLE_ERROR( cudaFree(dev_W) );
 	HANDLE_ERROR( cudaFree(dev_r) );
-	cudaFree(dev_E);
+	HANDLE_ERROR( cudaFree(dev_E) );
 #else
-
+*/
 	//allocate space to store the bessel function call results
 	double vm;										
 	stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
@@ -600,16 +635,19 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
 	T r, cos_phi;
 	stim::complex<double> knr;
 	stim::complex<T> h;
+	stim::complex<T> Ew;
 	for(size_t i = 0; i < N; i++){
 		stim::vec3<T> p;									//declare a 3D point
 		(x == NULL) ? p[0] = 0 : p[0] = x[i];				// test for NULL values and set positions
 		(y == NULL) ? p[1] = 0 : p[1] = y[i];
 		(z == NULL) ? p[2] = 0 : p[2] = z[i];
+		p = p - c;
 		r = p.len();
 		if(r < a){
 			E[i] = 0;
 			for(size_t w = 0; w < W.size(); w++){
+				Ew = W[w].E() * exp(stim::complex<double>(0, W[w].kvec().dot(c)));
 				knr = (stim::complex<double>)n * p.len() * W[w].kmag();							//calculate k*n*r
 				stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr);
@@ -620,18 +658,18 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
 				stim::legendre<T>(Nl, cos_phi, P);
 				for(size_t l = 0; l <= Nl; l++){
-					E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l];
+					E[i] += Ew * A[l] * (stim::complex<T>)j_knr[l] * P[l];
 				}
 			}
 		}
 	}
-#endif
+//#endif
 }
 template<typename T>
-void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){
+void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
 	std::vector< stim::scalarwave<T> > W(1, w);
-	cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, r_spacing);
+	cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, c);
 }
@@ -639,59 +677,89 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
 template<typename T>
 class scalarmie
 {
-private:
+public:
 	T radius;					//radius of the scattering sphere
 	stim::complex<T> n;			//refractive index of the scattering sphere
+	vec3<T> c;					//position of the sphere in space
 public:
-	scalarmie(T r, stim::complex<T> ri){
+	scalarmie() {				//default constructor
+		radius = 0.5;
+		n = stim::complex<T>(1.4, 0.0);
+		c = stim::vec3<T>(0, 0, 0);
+	}
+
+	scalarmie(T r, stim::complex<T> ri, stim::vec3<T> center = stim::vec3<T>(0, 0, 0)){
 		radius = r;
 		n = ri;
+		c = center;
+		//c = stim::vec3<T>(2, 1, 0);
 	}
-	void sum_scat(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
-		std::vector< stim::scalarwave<float> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves
-		stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
-	}
+	//void sum_scat(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
+	//	std::vector< stim::scalarwave<float> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves
+	//	stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
+	//}
-	void sum_intern(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
-		std::vector< stim::scalarwave<float> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves
-		stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
-	}
+	//void sum_intern(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
+	//	std::vector< stim::scalarwave<float> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves
+	//	stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
+	//}
-	void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int order = 500, int samples = 1000){
-		b.eval(E, X, Y, Z, order);													//evaluate the incident field using a plane wave expansion
-		std::vector< stim::scalarwave<float> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves		
-		sum_scat(E, X, Y, Z, b, samples);
-		sum_intern(E, X, Y, Z, b, samples);
-	}
+	//void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int order = 500, int samples = 1000){
+	//	b.eval(E, X, Y, Z, order);													//evaluate the incident field using a plane wave expansion
+	//	std::vector< stim::scalarwave<float> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves		
+	//	sum_scat(E, X, Y, Z, b, samples);
+	//	sum_intern(E, X, Y, Z, b, samples);
+	//}
 	void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){
-		/*size_t array_size = E.grid_bytes();											//calculate the number of bytes in the scalar grid
-		float* X = (float*) malloc( array_size );									//allocate space for the coordinate meshes
-		float* Y = (float*) malloc( array_size );
-		float* Z = (float*) malloc( array_size );
-		E.meshgrid(X, Y, Z, stim::CPUmem);											//calculate the coordinate meshes
-		*/
 		E.meshgrid();
 		b.eval(E, order);
 		std::vector< stim::scalarwave<float> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves
 		if(E.gpu()){
-			stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing());
+			stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c, E.spacing());
 		}
 		else{
 			stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing());
 			stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing());
 		}
-		//eval(E, X, Y, Z, b, order, samples);										//evaluate the field		
 	}
 };			//end stim::scalarmie
+template<typename T>
+class scalarcluster : public std::vector< scalarmie<T> > {
+public:
+
+	void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000) {
+		E.meshgrid();
+		b.eval(E, order);
+
+		std::vector< stim::scalarwave<float> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves
+
+		T radius;
+		stim::complex<T> n;
+		stim::vec3<T> c;
+		for (size_t si = 0; si < size(); si++) {									//for each sphere in the cluster
+			radius = at(si).radius;
+			n = at(si).n;
+			c = at(si).c;
+			if (E.gpu()) {
+				stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c, E.spacing());
+			}
+			else {
+				stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c);
+				stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c);
+			}
+		}
+	}
+};
+
 }			//end namespace stim
 #endif
 \ No newline at end of file
@@ -252,7 +252,7 @@ void gpu_scalarwaves(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scal
 	size_t batch_size;																	//declare a variable to store the size of the current batch
 	size_t waves_processed = 0;															//initialize the number of waves processed to zero
 	while(waves_processed < nW){												//while there are still waves to be processed
-		batch_size = min<size_t>(max_batch, nW - waves_processed);			//process either a whole batch, or whatever is left
+		batch_size = std::min<size_t>(max_batch, nW - waves_processed);			//process either a whole batch, or whatever is left
 		batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
 		HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice));	//copy the plane waves into global memory
 		cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size);	//call the kernel
-#ifndef STIM_CSV_H
-#define STIM_CSV_H
+#ifndef STIM_TABLE_H
+#define STIM_TABLE_H
 #include <type_traits>
 #include <vector>
@@ -102,6 +102,27 @@ namespace stim{
 			return result;											//return the casted table
 		}
+		size_t rows(){
+			return TABLE.size();
+		}
+
+		size_t cols(size_t ri){
+			return TABLE[ri].size();
+		}
+
+		//returns the maximum number of columns in the table
+		size_t cols(){
+			size_t max_c = 0;
+			for(size_t ri = 0; ri < rows(); ri++){
+				max_c = std::max(max_c, cols(ri));
+			}
+			return max_c;
+		}
+
+		std::string operator()(size_t ri, size_t ci){
+			return TABLE[ri][ci];
+		}
+
 	};
 };