From 9339fbad873457047446ee3a90f52296eda250a5 Mon Sep 17 00:00:00 2001 From: David Date: Mon, 13 Jun 2016 14:29:34 -0500 Subject: [PATCH] implementing mie scattering --- stim/cuda/cudatools/callable.h | 2 +- stim/image/image.h | 32 ++++++++++++-------------------- stim/math/bessel.h | 42 ++++++++++++++++++++++++------------------ stim/math/constants.h | 1 + stim/math/matrix.h | 2 +- stim/math/quaternion.h | 2 ++ stim/optics/mie.h | 373 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ stim/optics/scalarbeam.h | 286 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------------------- stim/optics/scalarwave.h | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------- 9 files changed, 698 insertions(+), 134 deletions(-) create mode 100644 stim/optics/mie.h diff --git a/stim/cuda/cudatools/callable.h b/stim/cuda/cudatools/callable.h index eefc437..52e1c8a 100644 --- a/stim/cuda/cudatools/callable.h +++ b/stim/cuda/cudatools/callable.h @@ -2,7 +2,7 @@ //define the CUDA_CALLABLE macro (will prefix all members) #ifdef __CUDACC__ -#define CUDA_CALLABLE __host__ __device__ +#define CUDA_CALLABLE __host__ __device__ inline #else #define CUDA_CALLABLE #endif diff --git a/stim/image/image.h b/stim/image/image.h index 02f7dc1..674bf9f 100644 --- a/stim/image/image.h +++ b/stim/image/image.h @@ -58,12 +58,12 @@ class image{ int cv_type(){ if(std::is_same::value) return CV_MAKETYPE(CV_8U, (int)C()); - if(std::is_same::value) return CV_MAKETYPE(CV_8S, (int)C()); - if(std::is_same::value) return CV_MAKETYPE(CV_16U, (int)C()); - if(std::is_same::value) return CV_MAKETYPE(CV_16S, (int)C()); - if(std::is_same::value) return CV_MAKETYPE(CV_32S, (int)C()); - if(std::is_same::value) return CV_MAKETYPE(CV_32F, (int)C()); - if(std::is_same::value) return CV_MAKETYPE(CV_64F, (int)C()); + else if(std::is_same::value) return CV_MAKETYPE(CV_8S, (int)C()); + else if(std::is_same::value) return CV_MAKETYPE(CV_16U, (int)C()); + else if(std::is_same::value) return CV_MAKETYPE(CV_16S, (int)C()); + else if(std::is_same::value) return CV_MAKETYPE(CV_32S, (int)C()); + else if(std::is_same::value) return CV_MAKETYPE(CV_32F, (int)C()); + else if(std::is_same::value) return CV_MAKETYPE(CV_64F, (int)C()); std::cout<<"ERROR in stim::image::cv_type - no valid data type found"<::value) return UCHAR_MAX; - if(std::is_same::value) return SHRT_MAX; - if(std::is_same::value) return UINT_MAX; - if(std::is_same::value) return ULONG_MAX; - if(std::is_same::value) return ULLONG_MAX; - if(std::is_same::value) return 1.0f; - if(std::is_same::value) return 1.0; + else if(std::is_same::value) return SHRT_MAX; + else if(std::is_same::value) return UINT_MAX; + else if(std::is_same::value) return ULONG_MAX; + else if(std::is_same::value) return ULLONG_MAX; + else if(std::is_same::value) return 1.0f; + else if(std::is_same::value) return 1.0; std::cout<<"ERROR in stim::image::white - no white value known for this data type"< operator=(const stim::image& I){ - if(&I == this) 
//handle self-assignment - return *this; - allocate(I.X(), I.Y(), I.C()); - memcpy(img, I.img, bytes()); - return *this; - }*/ - stim::image& operator=(const stim::image& I){ init(); if(&I == this) //handle self-assignment diff --git a/stim/math/bessel.h b/stim/math/bessel.h index 6de92a2..6c31c43 100644 --- a/stim/math/bessel.h +++ b/stim/math/bessel.h @@ -1258,7 +1258,7 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, P a0,v0,pv0,pv1,vl,ga,gb,vg,vv,w0,w1,ya0,yak,ya1,wa; int j,n,k,kz,l,lb,lb0,m; - a0 = abs(z); + a0 = ::abs(z); z1 = z; z2 = z*z; n = (int)v; @@ -1286,7 +1286,7 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, vm = v; return 0; } - if (real(z1) < 0.0) z1 = -z; + if (::real(z1) < 0.0) z1 = -z; if (a0 <= 12.0) { for (l=0;l<2;l++) { vl = v0+l; @@ -1295,7 +1295,7 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, for (k=1;k<=40;k++) { cr *= -0.25*z2/(k*(k+vl)); cjvl += cr; - if (abs(cr) < abs(cjvl)*eps) break; + if (::abs(cr) < ::abs(cjvl)*eps) break; } vg = 1.0 + vl; ga = gamma(vg); @@ -1348,7 +1348,7 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, for (k=1;k<=40;k++) { cr *= -0.25*z2/(k*(k-vl)); cjvl += cr; - if (abs(cr) < abs(cjvl)*eps) break; + if (::abs(cr) < ::abs(cjvl)*eps) break; } vg = 1.0-vl; gb = gamma(vg); @@ -1381,16 +1381,16 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, cyv1 = M_2_PI*(cec*cjv1-1.0/z1-0.25*z1*cs1); } } - if (real(z) < 0.0) { + if (::real(z) < 0.0) { cfac0 = exp(pv0*cii); cfac1 = exp(pv1*cii); - if (imag(z) < 0.0) { + if (::imag(z) < 0.0) { cyv0 = cfac0*cyv0-(P)2.0*(complex
<P>
)cii*cos(pv0)*cjv0; cyv1 = cfac1*cyv1-(P)2.0*(complex
<P>
)cii*cos(pv1)*cjv1; cjv0 /= cfac0; cjv1 /= cfac1; } - else if (imag(z) > 0.0) { + else if (::imag(z) > 0.0) { cyv0 = cyv0/cfac0+(P)2.0*(complex
<P>
)cii*cos(pv0)*cjv0; cyv1 = cyv1/cfac1+(P)2.0*(complex
<P>
)cii*cos(pv1)*cjv1; cjv0 *= cfac0; @@ -1421,7 +1421,7 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, cf2 = cf1; cf1 = cf; } - if (abs(cjv0) > abs(cjv1)) cs = cjv0/cf; + if (::abs(cjv0) > ::abs(cjv1)) cs = cjv0/cf; else cs = cjv1/cf2; for (k=0;k<=n;k++) { cjv[k] *= cs; @@ -1433,21 +1433,21 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, } cyv[0] = cyv0; cyv[1] = cyv1; - ya0 = abs(cyv0); + ya0 = ::abs(cyv0); lb = 0; cg0 = cyv0; cg1 = cyv1; for (k=2;k<=n;k++) { cyk = 2.0*(v0+k-1.0)*cg1/z-cg0; - yak = abs(cyk); - ya1 = abs(cg0); + yak = ::abs(cyk); + ya1 = ::abs(cg0); if ((yak < ya0) && (yak< ya1)) lb = k; cyv[k] = cyk; cg0 = cg1; cg1 = cyk; } lb0 = 0; - if ((lb > 4) && (imag(z) != 0.0)) { + if ((lb > 4) && (::imag(z) != 0.0)) { while(lb != lb0) { ch2 = cone; ch1 = czero; @@ -1470,7 +1470,7 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, cp21 = ch2; if (lb == n) cjv[lb+1] = 2.0*(lb+v0)*cjv[lb]/z-cjv[lb-1]; - if (abs(cjv[0]) > abs(cjv[1])) { + if (::abs(cjv[0]) > ::abs(cjv[1])) { cyv[lb+1] = (cjv[lb+1]*cyv0-2.0*cp11/(M_PI*z))/cjv[0]; cyv[lb] = (cjv[lb]*cyv0+2.0*cp12/(M_PI*z))/cjv[0]; } @@ -1495,8 +1495,8 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, cyl2 = cylk; } for (k=2;k<=n;k++) { - wa = abs(cyv[k]); - if (wa < abs(cyv[k-1])) lb = k; + wa = ::abs(cyv[k]); + if (wa < ::abs(cyv[k-1])) lb = k; } } } @@ -1515,12 +1515,18 @@ int cbessjyva_sph(int v,complex
<P> z,P &vm,complex<P>
*cjv, //first, compute the bessel functions of fractional order cbessjyva
<P>
(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp); + if(z == 0){ //handle degenerate case of z = 0 + memset(cjv, 0, sizeof(P) * (v+1)); + cjv[0] = 1; + } + //iterate through each and scale for(int n = 0; n<=v; n++) { - - cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0)); - cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0)); + if(z != 0){ //handle degenerate case of z = 0 + cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0)); + cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0)); + } cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0)); cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0)); diff --git a/stim/math/constants.h b/stim/math/constants.h index 3ea11f9..9d93349 100644 --- a/stim/math/constants.h +++ b/stim/math/constants.h @@ -1,6 +1,7 @@ #ifndef STIM_CONSTANTS_H #define STIM_CONSTANTS_H +#include "stim/cuda/cudatools/callable.h" namespace stim{ const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862; const double TAU = 2 * stim::PI; diff --git a/stim/math/matrix.h b/stim/math/matrix.h index 5d749cf..64bcb28 100644 --- a/stim/math/matrix.h +++ b/stim/math/matrix.h @@ -55,7 +55,7 @@ struct matrix vec operator*(vec rhs){ unsigned int N = rhs.size(); - vec3Y> result; + vec result; result.resize(N); for(int r=0; r from, vec3 to){ + from = from.norm(); + to = to.norm(); vec3 r = from.cross(to); //compute the rotation vector T theta = asin(r.len()); //compute the angle of the rotation about r //deal with a zero vector (both k and kn point in the same direction) diff --git a/stim/optics/mie.h b/stim/optics/mie.h new file mode 100644 index 0000000..59796d8 --- /dev/null +++ b/stim/optics/mie.h @@ -0,0 +1,373 @@ +#ifndef STIM_MIE_H +#define STIM_MIE_H + +#include "scalarwave.h" +#include "../math/bessel.h" +#include + +namespace stim{ + + +/// Calculate the scattering coefficients for a spherical scatterer +template +void B_coefficients(stim::complex* B, T a, T k, stim::complex n, int Nl){ + + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + + stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) + stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) + + stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a + stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a + + stim::complex h_ka, dh_ka; + stim::complex numerator, denominator; + stim::complex i(0, 1); + for(size_t l = 0; l <= Nl; l++){ + h_ka.r = j_ka[l]; + h_ka.i = y_ka[l]; + dh_ka.r = dj_ka[l]; + dh_ka.i = dy_ka[l]; + + numerator = j_ka[l] * dj_kna[l] * (stim::complex)n - j_kna[l] * dj_ka[l]; + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; + B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; + std::cout< +void 
A_coefficients(stim::complex* A, T a, T k, stim::complex n, int Nl){ + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + + stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) + stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) + + stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a + stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a + + stim::complex h_ka, dh_ka; + stim::complex numerator, denominator; + stim::complex i(0, 1); + for(size_t l = 0; l <= Nl; l++){ + h_ka.r = j_ka[l]; + h_ka.i = y_ka[l]; + dh_ka.r = dj_ka[l]; + dh_ka.i = dy_ka[l]; + + numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka; + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; + A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; + } +} + +template +__global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* B, T* j, T kr_min, T dkr, int Nl){ + extern __shared__ stim::scalarwave shared_W[]; //declare the list of waves in shared memory + + stim::cuda::sharedMemcpy(shared_W, W, nW, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access + __syncthreads(); //synchronize threads to insure all data is copied + + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + if(i >= N) return; //exit if this thread is outside the array + stim::vec3 p; + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; + + T r = p.len(); //calculate the distance from the sphere + T k = W[0].kmag(); + if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) + + size_t NC = Nl + 1; //calculate the number of coefficients to be used + T kr = r * k; //calculate the thread value for k*r + T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT + size_t ij = (size_t) fij; //convert to an integral index + T alpha = fij - ij; //calculate the fractional portion of the index + size_t n0j = ij * (NC); //start of the first entry in the LUT + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT + + T cos_phi; + T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials + T Pl = 1; //initialize the current value for the Legendre polynomial + T jl; + stim::complex Ei = 0; //create a register to store the result + int l; + for(size_t w = 0; w < nW; w++){ + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere + for(l = 0; l <= Nl; l++){ + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; + if(l == 0){ //computing Pl is done recursively, where the recursive relation + Pl = cos_phi; // requires the first two orders. This defines the second. + } + else{ //if this is not the first iteration, use the recursive relation to calculate Pl + Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1); + } + + jl = lerp( j[n0j + l], j[n1j + l], alpha ); //read jl from the LUT and interpolate the result + Ei += W[w].E() * B[l] * jl * Pl; + } + //Ei += shared_W[w].pos(p); //evaluate the plane wave + } + E[i] += Ei; //copy the result to device memory +} + +template +void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* B, T* j, T kr_min, T dkr, size_t Nl){ + + size_t wave_bytes = sizeof(stim::scalarwave); + size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available + size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array + size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory + size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves + size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required + + stim::scalarwave* batch_W; + HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves + + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks + + size_t batch_size; //declare a variable to store the size of the current batch + size_t waves_processed = 0; //initialize the number of waves processed to zero + while(waves_processed < nW){ //while there are still waves to be processed + batch_size = min(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left + batch_bytes = batch_size * sizeof(stim::scalarwave); + HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory + cuda_scalar_mie_scatter<<< blocks, threads, batch_bytes >>>(E, N, x, y, z, batch_W, batch_size, a, n, B, j, kr_min, 
dkr, (int)Nl); //call the kernel + waves_processed += batch_size; //increment the counter indicating how many waves have been processed + } + cudaFree(batch_W); +} +/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave + +/// @param E is a pointer to the destination field values +/// @param N is the number of points used to calculate the field +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) +/// @param W is an array of planewaves that will be scattered +/// @param a is the radius of the sphere +/// @param n is the complex refractive index of the sphere +template +void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n){ + //calculate the necessary number of orders required to represent the scattered field + T k = W[0].kmag(); + + size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); + + //calculate the scattering coefficients for the sphere + stim::complex* B = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients + B_coefficients(B, a, k, n, Nl); + +#ifdef __CUDACC__ + stim::complex* dev_E; //allocate space for the field + cudaMalloc(&dev_E, N * sizeof(stim::complex)); + cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); + //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) + + // COORDINATES + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) + if(x != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) + if(y != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) + if(z != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); + } + + // PLANE WAVES + stim::scalarwave* dev_W; //allocate space and copy plane waves + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave) * W.size(), cudaMemcpyHostToDevice) ); + + // SCATTERING COEFFICIENTS + stim::complex* dev_B; + HANDLE_ERROR( cudaMalloc(&dev_B, sizeof(stim::complex) * (Nl+1)) ); + HANDLE_ERROR( cudaMemcpy(dev_B, B, sizeof(stim::complex) * (Nl+1), cudaMemcpyHostToDevice) ); + + // BESSEL FUNCTION LOOK-UP TABLE + //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r + size_t Nlut_j = 1024; + T r_min = 0; + T r_max = 10; + + T kr_min = k * r_min; + T kr_max = k * r_max; + + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dyv= (double*) malloc( (Nl + 1) * 
sizeof(double) ); + + size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j; + T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table + T dkr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT + std::cout<<"LUT jl bytes: "<(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] + for(size_t l = 0; l <= Nl; l++){ //for each order + bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result + } + } + + stim::cpu2image(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); + + //Allocate device memory and copy everything to the GPU + T* dev_j_lut; + HANDLE_ERROR( cudaMalloc(&dev_j_lut, lutj_bytes) ); + HANDLE_ERROR( cudaMemcpy(dev_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) ); + + gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_B, dev_j_lut, kr_min, dkr, Nl); + + cudaMemcpy(E, dev_E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory + + if(x != NULL) cudaFree(dev_x); //free everything + if(y != NULL) cudaFree(dev_y); + if(z != NULL) cudaFree(dev_z); + cudaFree(dev_E); +#else + + + //allocate space to store the bessel function call results + double vm; + double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); + + T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); + + T r, kr, cos_phi; + stim::complex h; + for(size_t i = 0; i < N; i++){ + stim::vec3 p; //declare a 3D point + + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; + r = p.len(); + if(r >= a){ + for(size_t w = 0; w < W.size(); w++){ + kr = p.len() * W[w].kmag(); //calculate k*r + stim::bessjyv_sph(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr); + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction + stim::legendre(Nl, cos_phi, P); + + for(size_t l = 0; l <= Nl; l++){ + h.r = j_kr[l]; + h.i = y_kr[l]; + E[i] += W[w].E() * B[l] * h * P[l]; + } + } + } + } +#endif +} + +template +void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n){ + std::vector< stim::scalarwave > W(1, w); + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n); +} + +/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere + +/// @param E is a pointer to the destination field values +/// @param N is the number of points used to calculate the field +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) +/// @param w is a planewave that will be scattered +/// @param a is the radius of the sphere +/// @param n is the complex refractive index of the sphere +template +void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > W, T a, stim::complex n){ + + //calculate the necessary number of orders required to represent the scattered field + T k = W[0].kmag(); + + size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); + + //calculate the scattering coefficients for the sphere + stim::complex* A = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients + A_coefficients(A, a, k, n, Nl); + + //allocate space to store the bessel function call results + double vm; + stim::complex* j_knr = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* y_knr = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dj_knr= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dy_knr= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + + T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); + + T r, cos_phi; + stim::complex knr; + stim::complex h; + for(size_t i = 0; i < N; i++){ + stim::vec3 p; //declare a 3D point + + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; + r = p.len(); + if(r < a){ + E[i] = 0; + for(size_t w = 0; w < W.size(); w++){ + knr = (stim::complex)n * p.len() * W[w].kmag(); //calculate k*n*r + + stim::cbessjyva_sph(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr); + if(r == 0) + cos_phi = 0; + else + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction + stim::legendre(Nl, cos_phi, P); + + for(size_t l = 0; l <= Nl; l++){ + E[i] += W[w].E() * A[l] * (stim::complex)j_knr[l] * P[l]; + } + } + } + } +} + +template +void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n){ + std::vector< stim::scalarwave > W(1, w); + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n); +} + +} + +#endif \ No newline at end of file diff --git a/stim/optics/scalarbeam.h b/stim/optics/scalarbeam.h index cfb817b..e4dcd06 100644 --- a/stim/optics/scalarbeam.h +++ b/stim/optics/scalarbeam.h @@ -5,7 +5,12 @@ #include "../optics/scalarwave.h" #include "../math/bessel.h" #include "../math/legendre.h" +#include "../cuda/cudatools/devices.h" +#include "../cuda/cudatools/timer.h" +#include +#include #include +#include namespace stim{ @@ -105,10 +110,11 @@ public: std::vector< scalarwave > samples(N); //create a vector of plane waves T kmag = (T)stim::TAU / lambda; //calculate the wavenumber stim::complex apw; //allocate space for the amplitude at the focal point + T a = stim::TAU * (1 - cos(asin(NA[0]))) / (double)N; stim::vec3 kpw; //declare the new k-vector based on the focused plane wave direction for(size_t i=0; i(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave + apw = a * exp(stim::complex(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave samples[i] = scalarwave(kpw, apw); //create a plane wave based on the direction } @@ -148,7 +154,7 @@ public: /// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional) /// @param C is a pointer to Nl + 1 values where the terms will be stored template -CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){ +CUDA_CALLABLE void cpu_aperture_integral(T* C, int Nl, T NA, T NA_in = 0){ size_t table_bytes = (Nl + 1) * sizeof(T); //calculate the number of bytes required to store the terms T cos_alpha_1 = cos(asin(NA_in)); //calculate the cosine of the angle subtended by the central obscuration @@ -182,23 +188,151 @@ CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){ /// performs linear interpolation into a look-up table template -T lut_lookup(T* lut, T val, size_t N, T min_val, T delta, size_t stride = 0){ - size_t idx = (size_t)((val - min_val) / delta); - T alpha = val - idx * delta + min_val; +CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val, T delta, size_t n_vals){ + T idx = ((val - min_val) / delta); + size_t i = (size_t) idx; + T a1 = idx - i; + T a0 = 1 - a1; + size_t n0 = i * n_vals; + size_t n1 = (i+1) * n_vals; + for(size_t n = 0; n < n_vals; n++){ + lut_values[n] = lut[n0 + n] * a0 + lut[n1 + n] * a1; + } +} - if(alpha == 0) return lut[idx]; - else return lut[idx * stride] * (1 - alpha) + lut[ (idx+1) * stride] * alpha; +template +CUDA_CALLABLE T lerp(T v0, T v1, T t) { + return fma(t, v1, fma(-t, v0, v0)); } +#ifdef __CUDACC__ template -void cpu_scalar_psf(stim::complex* F, size_t N, T* r, T* phi, T lambda, T A, stim::vec3 f, T NA, T NA_in, int Nl){ - T k = stim::TAU / lambda; +__global__ void 
cuda_scalar_psf(stim::complex* E, size_t N, T* r, T* phi, T k, T A, size_t Nl, + T* C, + T* lut_j, size_t Nj, T min_kr, T dkr){ + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + if(i >= N) return; //exit if this thread is outside the array + + T cos_phi = cos(phi[i]); //calculate the thread value for cos(phi) + T kr = r[i] * k; //calculate the thread value for k*r + stim::complex Ei = 0; //initialize the value of the field to zero + size_t NC = Nl + 1; //calculate the number of coefficients to be used + + T fij = (kr - min_kr)/dkr; //FP index into the spherical bessel LUT + size_t ij = (size_t) fij; //convert to an integral index + T a = fij - ij; //calculate the fractional portion of the index + size_t n0j = ij * (NC); //start of the first entry in the LUT + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT + + T jl; //declare register to store the spherical bessel function + T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials + T Pl = 1; //initialize the current value for the Legendre polynomial + stim::complex im(0, 1); //declare i (imaginary 1) + stim::complex i_pow(1, 0); //i_pow stores the current value of i^l so it doesn't have to be re-computed every iteration + for(int l = 0; l <= Nl; l++){ //for each order + jl = lerp( lut_j[n0j + l], lut_j[n1j + l], a ); //read jl from the LUT and interpolate the result + Ei += i_pow * jl * Pl * C[l]; //calculate the value for the field and sum + i_pow *= im; //multiply i^l * i for the next iteration + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; + if(l == 0){ //computing Pl is done recursively, where the recursive relation + Pl = cos_phi; // requires the first two orders. This defines the second. + } + else{ //if this is not the first iteration, use the recursive relation to calculate Pl + Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1); + } + + } + E[i] = Ei * A * 2 * CUDART_PI_F; //scale the integral by the amplitude +} + +template +void gpu_scalar_psf_local(stim::complex* E, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl, T r_spacing){ + + //Find the minimum and maximum values of r + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); //create a cuBLAS handle + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS initialization failed\n"); + exit(1); + } + + int i_min, i_max; + stat = cublasIsamin(handle, (int)N, r, 1, &i_min); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate minimum r value.\n"); + exit(1); + } + stat = cublasIsamax(handle, (int)N, r, 1, &i_max); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate maximum r value.\n"); + exit(1); + } + + T r_min, r_max; //allocate space to store the minimum and maximum values + HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU + HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); + + T k = (T)stim::TAU / lambda; //calculate the wavenumber from lambda + size_t C_bytes = (Nl + 1) * sizeof(T); + T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms + cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms + + size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r + + 
T kr_min = k * r_min; + T kr_max = k * r_max; + + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); + + size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j; + T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table + T delta_kr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT + std::cout<<"LUT jl bytes: "<(Nl, kr_min + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] + for(size_t l = 0; l <= Nl; l++){ //for each order + bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result + } + } + + stim::cpu2image(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); + + //Allocate device memory and copy everything to the GPU + + T* gpu_C; + HANDLE_ERROR( cudaMalloc(&gpu_C, C_bytes) ); + HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) ); + T* gpu_j_lut; + HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) ); + HANDLE_ERROR( cudaMemcpy(gpu_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) ); + + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device + dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks - T* C = (T*) malloc( (Nl + 1) * sizeof(T) ); //allocate space for the aperture integral terms + cuda_scalar_psf<<< blocks, threads >>>(E, N, r, phi, (T)stim::TAU/lambda, A, Nl, gpu_C, gpu_j_lut, Nlut_j, kr_min, delta_kr); + + //free the LUT and condenser tables + HANDLE_ERROR( cudaFree(gpu_C) ); + HANDLE_ERROR( cudaFree(gpu_j_lut) ); +} +#endif + +/// Calculate the analytical solution to a scalar point spread function given a set of spherical coordinates about the PSF (beam propagation along phi = theta = 0) +template +void cpu_scalar_psf_local(stim::complex* F, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl){ + T k = (T)stim::TAU / lambda; + size_t C_bytes = (Nl + 1) * sizeof(T); + T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms memset(F, 0, N * sizeof(stim::complex)); -#ifdef NO_CUDA - memset(F, 0, N * sizeof(stim::complex)); T jl, Pl, kr, cos_phi; double vm; @@ -225,71 +359,117 @@ void cpu_scalar_psf(stim::complex* F, size_t N, T* r, T* phi, T lambda, T A, free(C); free(Pl_cos_phi); -#else - T min_r = r[0]; - T max_r = r[0]; - for(size_t i = 0; i < N; i++){ //find the minimum and maximum values of r (min and max distance from the focal point) - if(r[i] < min_r) min_r = r[i]; - if(r[i] > max_r) max_r = r[i]; - } - T min_kr = k * min_r; - T max_kr = k * max_r; +} - //temporary variables - double vm; - double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); +/// Converts a set of cartesian points into spherical coordinates surrounding a point spread function (PSF) +/// @param r is the output distance from the PSF +/// @param phi is the non-symmetric direction about the PSF +/// @param x (x, y, z) are the cartesian coordinates in world space +/// @f is the focal point of the 
PSF in cartesian coordinates +/// @d is the propagation direction of the PSF in cartesian coordinates +template +__global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::vec3 f, stim::quaternion q){ - size_t Nlut = (size_t)sqrt(N) * 2; - T* bessel_lut = (T*) malloc(sizeof(T) * (Nl+1) * Nlut); - T delta_kr = (max_kr - min_kr) / (Nlut-1); - for(size_t kri = 0; kri < Nlut; kri++){ - stim::bessjyv_sph(Nl, min_kr + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] - for(size_t l = 0; l <= Nl; l++){ - bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; - } - } + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + if(i >= N) return; //exit if this thread is outside the array - T* Pl_cos_phi = (T*) malloc((Nl + 1) * sizeof(T)); - T kr, cos_phi, jl, Pl; - for(size_t n = 0; n < N; n++){ //for each point in the field - kr = k * r[n]; //calculate kr (the optical distance between the focal point and p) - cos_phi = std::cos(phi[n]); //calculate the cosine of phi - stim::legendre(Nl, cos_phi, Pl_cos_phi); //calculate the [0 Nl] legendre polynomials for this point + stim::vec3 p; //declare a 3D point + + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? p[2] = 0 : p[2] = z[i]; - for(int l = 0; l <= Nl; l++){ - jl = lut_lookup(&bessel_lut[l], kr, Nlut, min_kr, delta_kr, Nl+1); - Pl = Pl_cos_phi[l]; - F[n] += pow(complex(0, 1), l) * jl * Pl * C[l]; - } - F[n] *= A * stim::TAU; - } -#endif + p = p - f; //shift the point to the center of the PSF (focal point) + p = q.toMatrix3() * p; //rotate the point to align with the propagation direction + + stim::vec3 ps = p.cart2sph(); //convert from cartesian to spherical coordinates + r[i] = ps[0]; //store r + phi[i] = ps[2]; //phi = [0 pi] } +#ifdef __CUDACC__ +/// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates +template +void gpu_scalar_psf_cart(stim::complex* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3 f, stim::vec3 d, T NA, T NA_in, int Nl, T r_spacing = 1){ + + T* gpu_r; //allocate space for the coordinates in r + HANDLE_ERROR( cudaMalloc(&gpu_r, sizeof(T) * N) ); + T* gpu_phi; + HANDLE_ERROR( cudaMalloc(&gpu_phi, sizeof(T) * N) ); + //stim::complex* gpu_E; + //HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex) * N) ); + + stim::quaternion q; //create a quaternion + q.CreateRotation(d, stim::vec3(0, 0, 1)); //create a mapping from the propagation direction to the PSF space + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device + dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks + cuda_cart2psf <<< blocks, threads >>> (gpu_r, gpu_phi, N, x, y, z, f, q); //call the CUDA kernel to move the cartesian coordinates to PSF space + + gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing); + +} +#endif template -void cpu_scalar_psf(stim::complex* F, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3 f, T NA, T NA_in, int Nl){ +void cpu_scalar_psf_cart(stim::complex* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3 f, stim::vec3 d, T NA, T NA_in, int Nl, T r_spacing = 1){ + +// If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel +#ifdef __CUDACC__ + + T* gpu_x = NULL; + if(x != NULL){ + HANDLE_ERROR( cudaMalloc(&gpu_x, sizeof(T) * N) ); + HANDLE_ERROR( 
cudaMemcpy(gpu_x, x, sizeof(T) * N, cudaMemcpyHostToDevice) ); + } + T* gpu_y = NULL; + if(y != NULL){ + HANDLE_ERROR( cudaMalloc(&gpu_y, sizeof(T) * N) ); + HANDLE_ERROR( cudaMemcpy(gpu_y, y, sizeof(T) * N, cudaMemcpyHostToDevice) ); + } + T* gpu_z = NULL; + if(z != NULL){ + HANDLE_ERROR( cudaMalloc(&gpu_z, sizeof(T) * N) ); + HANDLE_ERROR( cudaMemcpy(gpu_z, z, sizeof(T) * N, cudaMemcpyHostToDevice) ); + } + + stim::complex* gpu_E; + HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex) * N) ); + HANDLE_ERROR( cudaMemcpy(gpu_E, E, sizeof(stim::complex) * N, cudaMemcpyHostToDevice) ); + gpu_scalar_psf_cart(gpu_E, N, gpu_x, gpu_y, gpu_z, lambda, A, f, d, NA, NA_in, Nl, r_spacing); + HANDLE_ERROR( cudaMemcpy(E, gpu_E, sizeof(stim::complex) * N, cudaMemcpyDeviceToHost) ); + + HANDLE_ERROR( cudaFree(gpu_x) ); + HANDLE_ERROR( cudaFree(gpu_y) ); + HANDLE_ERROR( cudaFree(gpu_z) ); + HANDLE_ERROR( cudaFree(gpu_E) ); + +#else T* r = (T*) malloc(N * sizeof(T)); //allocate space for p in spherical coordinates T* phi = (T*) malloc(N * sizeof(T)); // only r and phi are necessary (the scalar PSF is symmetric about theta) - stim::vec3 p, ps; + stim::quaternion q; + q.CreateRotation(d, stim::vec3(0, 0, 1)); + stim::matrix R = q.toMatrix3(); + stim::vec3 p, ps, ds; for(size_t i = 0; i < N; i++){ (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions (y == NULL) ? p[1] = 0 : p[1] = y[i]; (z == NULL) ? p[2] = 0 : p[2] = z[i]; + p = p - f; + + p = R * p; //rotate the cartesian point + ps = p.cart2sph(); //convert from cartesian to spherical coordinates r[i] = ps[0]; //store r phi[i] = ps[2]; //phi = [0 pi] } - cpu_scalar_psf(F, N, r, phi, lambda, A, f, NA, NA_in, Nl); //call the spherical coordinate CPU function + cpu_scalar_psf_local(F, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function free(r); free(phi); +#endif } } //end namespace stim diff --git a/stim/optics/scalarwave.h b/stim/optics/scalarwave.h index 54ba253..4e63bed 100644 --- a/stim/optics/scalarwave.h +++ b/stim/optics/scalarwave.h @@ -60,7 +60,7 @@ public: return k.len(); } - CUDA_CALLABLE vec3< complex > E(){ + CUDA_CALLABLE complex E(){ return E0; } @@ -235,6 +235,34 @@ void gpu_scalarwave(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scala cuda_scalarwave<<< blocks, threads >>>(F, N, x, y, z, w); //call the kernel } +template +void gpu_scalarwaves(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW){ + + size_t wave_bytes = sizeof(stim::scalarwave); + size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available + size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array + size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory + size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves + size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required + + stim::scalarwave* batch_W; + HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves + + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks + + size_t batch_size; //declare a variable to store the size of the current batch + size_t waves_processed = 0; 
//initialize the number of waves processed to zero + while(waves_processed < nW){ //while there are still waves to be processed + batch_size = min(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left + batch_bytes = batch_size * sizeof(stim::scalarwave); + HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory + cuda_scalarwave<<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size); //call the kernel + waves_processed += batch_size; //increment the counter indicating how many waves have been processed + } + cudaFree(batch_W); +} + /// Sums a series of coherent plane waves at a specified point /// @param field is the output array of field values corresponding to each input point /// @param x is an array of x coordinates for the field point @@ -245,24 +273,13 @@ void gpu_scalarwave(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scala /// @param A is the list of amplitudes for each wave /// @param S is the list of propagation directions for each wave template -void cpu_sum_scalarwaves(stim::complex* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > w_array){ - size_t S = w_array.size(); //store the number of waves -#ifdef NO_CUDA - memset(F, 0, N * sizeof(stim::complex)); - T px, py, pz; - for(size_t i = 0; i < N; i++){ // for each element in the array - (x == NULL) ? px = 0 : px = x[i]; // test for NULL values - (y == NULL) ? py = 0 : py = y[i]; - (z == NULL) ? pz = 0 : pz = z[i]; - - for(size_t s = 0; s < S; s++){ - F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point - } - } -#else +void cpu_scalarwaves(stim::complex* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > W){ + size_t S = W.size(); //store the number of waves +#ifdef __CUDACC__ stim::complex* dev_F; //allocate space for the field cudaMalloc(&dev_F, N * sizeof(stim::complex)); - cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) + cudaMemcpy(dev_F, F, N * sizeof(stim::complex), cudaMemcpyHostToDevice); + //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) if(x != NULL){ @@ -282,28 +299,11 @@ void cpu_sum_scalarwaves(stim::complex* F, size_t N, T* x, T* y, T* z, std::v HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); } - size_t wave_bytes = sizeof(stim::scalarwave); - size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available - size_t array_bytes = w_array.size() * wave_bytes; //calculate the maximum number of bytes required for the planewave array - size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory - size_t num_batches = w_array.size() / max_batch + 1; //calculate the number of batches required to process all plane waves - size_t batch_bytes = min(w_array.size(), max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required - - stim::scalarwave* dev_w; - HANDLE_ERROR(cudaMalloc(&dev_w, batch_bytes)); //allocate memory for a single batch of plane waves + stim::scalarwave* dev_W; + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave) * W.size(), cudaMemcpyHostToDevice) ); - int threads = stim::maxThreadsPerBlock(); 
//get the maximum number of threads per block for the CUDA device - dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks - - size_t batch_size; //declare a variable to store the size of the current batch - size_t waves_processed = 0; //initialize the number of waves processed to zero - while(waves_processed < w_array.size()){ //while there are still waves to be processed - batch_size = min(max_batch, w_array.size() - waves_processed); //process either a whole batch, or whatever is left - batch_bytes = batch_size * sizeof(stim::scalarwave); - HANDLE_ERROR(cudaMemcpy(dev_w, &w_array[waves_processed], batch_bytes, cudaMemcpyHostToDevice)); //copy the plane waves into global memory - cuda_scalarwave<<< blocks, threads, batch_bytes >>>(dev_F, N, dev_x, dev_y, dev_z, dev_w, batch_size); //call the kernel - waves_processed += batch_size; //increment the counter indicating how many waves have been processed - } + gpu_scalarwaves(dev_F, N, dev_x, dev_y, dev_z, dev_W, W.size()); cudaMemcpy(F, dev_F, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory @@ -311,15 +311,25 @@ void cpu_sum_scalarwaves(stim::complex* F, size_t N, T* x, T* y, T* z, std::v if(y != NULL) cudaFree(dev_y); if(z != NULL) cudaFree(dev_z); cudaFree(dev_F); - cudaFree(dev_w); +#else + memset(F, 0, N * sizeof(stim::complex)); + T px, py, pz; + for(size_t i = 0; i < N; i++){ // for each element in the array + (x == NULL) ? px = 0 : px = x[i]; // test for NULL values + (y == NULL) ? py = 0 : py = y[i]; + (z == NULL) ? pz = 0 : pz = z[i]; + for(size_t s = 0; s < S; s++){ + F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point + } + } #endif } template void cpu_scalarwave(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scalarwave w){ std::vector< stim::scalarwave > w_array(1, w); - cpu_sum_scalarwaves(F, N, x, y, z, w_array); + cpu_scalarwaves(F, N, x, y, z, w_array); } @@ -331,7 +341,7 @@ void cpu_scalarwave(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scala /// @param A is the list of amplitudes for each wave /// @param S is the list of propagation directions for each wave template -CUDA_CALLABLE stim::complex sum_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave > W){ +CUDA_CALLABLE stim::complex cpu_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave > W){ size_t N = W.size(); //get the number of plane wave samples stim::complex field(0, 0); //initialize the field to zero (0) stim::vec3 k; //allocate space for the direction vector -- libgit2 0.21.4
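
Note on the new coefficients: written out, the series terms assembled by B_coefficients() and A_coefficients() in stim/optics/mie.h are the standard scalar Mie coefficients. With j_l the spherical Bessel function of the first kind, h_l = j_l + i*y_l the spherical Hankel function of the first kind, n the (complex) relative refractive index, and primes denoting derivatives with respect to the argument, the code above computes:

    B_l = (2l+1)\,i^l\,\frac{n\,j_l(ka)\,j_l'(kna) - j_l(kna)\,j_l'(ka)}{j_l(kna)\,h_l'(ka) - n\,h_l(ka)\,j_l'(kna)}

    A_l = (2l+1)\,i^l\,\frac{j_l(ka)\,h_l'(ka) - j_l'(ka)\,h_l(ka)}{j_l(kna)\,h_l'(ka) - n\,h_l(ka)\,j_l'(kna)}

Both cpu_scalar_mie_scatter() and cpu_scalar_mie_internal() truncate the series at N_l = \lceil ka + 4\sqrt[3]{ka} + 2 \rceil orders, a Wiscombe-style criterion.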
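Usage: the public entry points added by this patch are the single-wave overloads of cpu_scalar_mie_scatter() and cpu_scalar_mie_internal() at the bottom of stim/optics/mie.h. The driver below is a minimal sketch, not part of the patch: the grid size, extent, wavelength, sphere radius, and refractive index are illustrative assumptions, and the vec3/scalarwave/complex constructors are used as they appear elsewhere in this diff.

    #include <vector>
    #include "stim/optics/mie.h"

    int main(){
        const size_t N = 256 * 256;                        // assumed: sample a 256 x 256 slice through the sphere center
        std::vector<float> x(N), z(N);                     // passing y = NULL evaluates the field on the y = 0 plane
        for(size_t i = 0; i < N; i++){                     // assumed extent: [-10, 10] units along x and z
            x[i] = -10.0f + 20.0f * (i % 256) / 255.0f;
            z[i] = -10.0f + 20.0f * (i / 256) / 255.0f;
        }
        std::vector< stim::complex<float> > E(N, stim::complex<float>(0, 0)); // both functions sum into E, so start it at zero

        float lambda = 1.0f;                               // assumed wavelength
        stim::vec3<float> k(0, 0, (float)stim::TAU / lambda);     // k-vector propagating along +z
        stim::scalarwave<float> w(k, stim::complex<float>(1, 0)); // unit-amplitude incident plane wave
        float a = 2.0f;                                    // assumed sphere radius
        stim::complex<float> n(1.4f, 0.05f);               // assumed complex refractive index

        stim::cpu_scalar_mie_scatter(&E[0], N, &x[0], NULL, &z[0], w, a, n);  // scattered field (evaluated where r >= a)
        stim::cpu_scalar_mie_internal(&E[0], N, &x[0], NULL, &z[0], w, a, n); // internal field (evaluated where r < a)
        return 0;
    }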