From 8309b07a16caaadfd4bc85bfbebdf7041a63f0e0 Mon Sep 17 00:00:00 2001
From: David <mayerich@uh.edu>
Date: Wed, 15 Jun 2016 12:50:08 -0500
Subject: [PATCH] fixed some vec3 errors

---
 stim/math/complex.h      |   1 +
 stim/math/plane_old.h    | 179 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 stim/math/quad.h         | 186 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 stim/math/rect.h         |  47 ++++++++++++++++++++++-------------------------
 stim/math/vec3.h         |   7 +++++++
 stim/math/vector.h       |  11 ++++++-----
 stim/optics/mie.h        | 274 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------
 stim/optics/scalarbeam.h | 317 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------------------------------------------------------------------------------------------------------------------------
 8 files changed, 429 insertions(+), 593 deletions(-)
 delete mode 100644 stim/math/plane_old.h
 delete mode 100644 stim/math/quad.h
diff --git a/stim/math/complex.h b/stim/math/complex.h
index e5d0a35..237b3ca 100644
--- a/stim/math/complex.h
+++ b/stim/math/complex.h
@@ -11,6 +11,7 @@
 
 namespace stim
 {
+    enum complexComponentType {complexReal, complexImaginary, complexMag};
 
 template <class T>
 struct complex
diff --git a/stim/math/plane_old.h b/stim/math/plane_old.h
deleted file mode 100644
index b009cad..0000000
--- a/stim/math/plane_old.h
+++ /dev/null
@@ -1,179 +0,0 @@
-#ifndef RTS_PLANE_H
-#define RTS_PLANE_H
-
-#include <iostream>
-#include <stim/math/vector.h>
-#include "rts/cuda/callable.h"
-
-
-namespace stim{
-template <typename T, int D> class plane;
-}
-
-template <typename T, int D>
-CUDA_CALLABLE stim::plane<T, D> operator-(stim::plane<T, D> v);
-
-namespace stim{
-
-template <class T, int D = 3>
-class plane{
-
-	//a plane is defined by a point and a normal
-
-private:
-
-	vec<T, D> P;	//point on the plane
-	vec<T, D> N;	//plane normal
-
-	CUDA_CALLABLE void init(){
-		P = vec<T, D>(0, 0, 0);
-		N = vec<T, D>(0, 0, 1);
-	}
-
-
-public:
-
-	//default constructor
-	CUDA_CALLABLE plane(){
-		init();
-	}
-	
-	CUDA_CALLABLE plane(vec<T, D> n, vec<T, D> p = vec<T, D>(0, 0, 0)){
-		P = p;
-		N = n.norm();
-	}
-
-	CUDA_CALLABLE plane(T z_pos){
-		init();
-		P[2] = z_pos;
-	}
-
-	//create a plane from three points (a triangle)
-	CUDA_CALLABLE plane(vec<T, D> a, vec<T, D> b, vec<T, D> c){
-		P = c;
-		N = (c - a).cross(b - a);
-		if(N.len() == 0)	//handle the degenerate case when two vectors are the same, N = 0
-			N = 0;
-		else
-			N = N.norm();
-	}
-
-	template< typename U >
-	CUDA_CALLABLE operator plane<U, D>(){
-
-		plane<U, D> result(N, P);
-		return result;
-	}
-
-	CUDA_CALLABLE vec<T, D> norm(){
-		return N;
-	}
-
-	CUDA_CALLABLE vec<T, D> p(){
-		return P;
-	}
-
-	//flip the plane front-to-back
-	CUDA_CALLABLE plane<T, D> flip(){
-		plane<T, D> result = *this;
-		result.N = -result.N;
-		return result;
-	}
-
-	//determines how a vector v intersects the plane (1 = intersects front, 0 = within plane, -1 = intersects back)
-	CUDA_CALLABLE int face(vec<T, D> v){
-		
-		T dprod = v.dot(N);		//get the dot product between v and N
-
-		//conditional returns the appropriate value
-		if(dprod < 0)
-			return 1;
-		else if(dprod > 0)
-			return -1;
-		else
-			return 0;
-	}
-
-	//determine on which side of the plane a point lies (1 = front, 0 = on the plane, -1 = back)
-	CUDA_CALLABLE int side(vec<T, D> p){
-
-		vec<T, D> v = p - P;	//get the vector from P to the query point p
-
-		return face(v);
-	}
-
-	//compute the component of v that is perpendicular to the plane
-	CUDA_CALLABLE vec<T, D> perpendicular(vec<T, D> v){
-		return N * v.dot(N);
-	}
-
-	//compute the projection of v in the plane
-	CUDA_CALLABLE vec<T, D> parallel(vec<T, D> v){
-		return v - perpendicular(v);
-	}
-
-	CUDA_CALLABLE void decompose(vec<T, D> v, vec<T, D>& para, vec<T, D>& perp){
-		perp = N * v.dot(N);
-		para = v - perp;
-	}
-
-	//get both the parallel and perpendicular components of a vector v w.r.t. the plane
-	CUDA_CALLABLE void project(vec<T, D> v, vec<T, D> &v_par, vec<T, D> &v_perp){
-
-		v_perp = v.dot(N);
-		v_par = v - v_perp;
-	}
-
-	//compute the reflection of v off of the plane
-	CUDA_CALLABLE vec<T, D> reflect(vec<T, D> v){
-
-		//compute the reflection using N_prime as the plane normal
-		vec<T, D> par = parallel(v);
-		vec<T, D> r = (-v) + par * 2;
-
-		/*std::cout<<"----------------REFLECT-----------------------------"<<std::endl;
-		std::cout<<str()<<std::endl;
-		std::cout<<"v: "<<v<<std::endl;
-		std::cout<<"r: "<<r<<std::endl;
-		std::cout<<"Perpendicular: "<<perpendicular(v)<<std::endl;
-		std::cout<<"Parallel: "<<par<<std::endl;*/
-		return r;
-
-	}
-
-	CUDA_CALLABLE rts::plane<T, D> operator-()
-	{
-		rts::plane<T, D> p = *this;
-
-		//negate the normal vector
-		p.N = -p.N;
-
-		return p;
-	}
-
-	//output a string
-	std::string str(){
-		std::stringstream ss;
-		ss<<"P: "<<P<<std::endl;
-		ss<<"N: "<<N;
-		return ss.str();
-	}
-
-	///////Friendship
-	//friend CUDA_CALLABLE rts::plane<T, D> operator- <> (rts::plane<T, D> v);
-
-
-
-};
-
-}
-
-//arithmetic operators
-
-//negative operator flips the plane (front to back)
-//template <typename T, int D>
-
-
-
-
-#endif
diff --git a/stim/math/quad.h b/stim/math/quad.h
deleted file mode 100644
index c983008..0000000
--- a/stim/math/quad.h
+++ /dev/null
@@ -1,186 +0,0 @@
-#ifndef RTS_QUAD_H
-#define RTS_QUAD_H
-
-//enable CUDA_CALLABLE macro
-#include <stim/cuda/callable.h>
-#include <stim/math/vector.h>
-#include <stim/math/triangle.h>
-#include <stim/math/quaternion.h>
-#include <iostream>
-#include <iomanip>
-#include <algorithm>
-
-namespace stim{
-
-//template for a quadangle class in ND space
-template <class T, int N = 3>
-struct quad
-{
-	/*
-		B------------------>C
-		^                   ^
-		|                   |
-		Y                   |
-		|                   |
-		|                   |
-		A---------X-------->O
-	*/
-
-	/*T A[N];
-	T B[N];
-	T C[N];*/
-
-	rts::vec<T, N> A;
-	rts::vec<T, N> X;
-	rts::vec<T, N> Y;
-
-
-	CUDA_CALLABLE quad()
-	{
-
-	}
-
-	CUDA_CALLABLE quad(vec<T, N> a, vec<T, N> b, vec<T, N> c)
-	{
-
-		A = a;		
-		Y = b - a;
-		X = c - a - Y;
-
-	}
-
-	/*******************************************************************
-	Constructor - create a quad from a position, normal, and rotation
-	*******************************************************************/
-	CUDA_CALLABLE quad(rts::vec<T, N> c, rts::vec<T, N> normal, T width, T height, T theta)
-	{
-
-        //compute the X direction - start along world-space X
-        Y = rts::vec<T, N>(0, 1, 0);
-        if(Y == normal)
-            Y = rts::vec<T, N>(0, 0, 1);
-
-        X = Y.cross(normal).norm();
-
-        std::cout<<X<<std::endl;
-
-        //rotate the X axis by theta radians
-        rts::quaternion<T> q;
-        q.CreateRotation(theta, normal);
-        X = q.toMatrix3() * X;
-        Y = normal.cross(X);
-
-        //normalize everything
-        X = X.norm();
-        Y = Y.norm();
-
-        //scale to match the quad width and height
-        X = X * width;
-        Y = Y * height;
-
-        //set the corner of the plane
-        A = c - X * 0.5f - Y * 0.5f;
-
-        std::cout<<X<<std::endl;
-	}
-
-	//boolean comparison
-	bool operator==(const quad<T, N> & rhs)
-	{
-		if(A == rhs.A && X == rhs.X && Y == rhs.Y)
-			return true;
-		else
-			return false;
-	}
-
-	/*******************************************
-	Return the normal for the quad
-	*******************************************/
-	CUDA_CALLABLE rts::vec<T, N> n()
-	{
-        return (X.cross(Y)).norm();
-	}
-
-	CUDA_CALLABLE rts::vec<T, N> p(T a, T b)
-	{
-		rts::vec<T, N> result;
-		//given the two parameters a, b = [0 1], returns the position in world space
-		result = A + X * a + Y * b;
-
-		return result;
-	}
-
-	CUDA_CALLABLE rts::vec<T, N> operator()(T a, T b)
-	{
-		return p(a, b);
-	}
-
-	std::string str()
-	{
-		std::stringstream ss;
-
-		ss<<std::left<<"B="<<setfill('-')<<setw(20)<<A + Y<<">"<<"C="<<A + Y + X<<std::endl;
-		ss<<setfill(' ')<<setw(23)<<"|"<<"|"<<std::endl<<setw(23)<<"|"<<"|"<<std::endl;
-		ss<<std::left<<"A="<<setfill('-')<<setw(20)<<A<<">"<<"D="<<A + X;
-
-        return ss.str();
-
-	}
-
-	CUDA_CALLABLE quad<T, N> operator*(T rhs)
-	{
-		//scales the plane by a scalar value
-
-		//compute the center point
-		rts::vec<T, N> c = A + X*0.5f + Y*0.5f;
-
-		//create the new quadangle
-		quad<T, N> result;
-		result.X = X * rhs;
-		result.Y = Y * rhs;
-		result.A = c - result.X*0.5f - result.Y*0.5f;
-
-		return result;
-
-	}
-
-	CUDA_CALLABLE T dist(vec<T, N> p)
-	{
-        //compute the distance between a point and this quad
-
-        //first break the quad up into two triangles
-        triangle<T, N> T0(A, A+X, A+Y);
-        triangle<T, N> T1(A+X+Y, A+X, A+Y);
-
-
-        T d0 = T0.dist(p);
-        T d1 = T1.dist(p);
-
-        if(d0 < d1)
-            return d0;
-        else
-            return d1;
-	}
-
-	CUDA_CALLABLE T dist_max(vec<T, N> p)
-	{
-        T da = (A - p).len();
-        T db = (A+X - p).len();
-        T dc = (A+Y - p).len();
-        T dd = (A+X+Y - p).len();
-
-        return std::max( da, std::max(db, std::max(dc, dd) ) );
-	}
-};
-
-}	//end namespace rts
-
-template <typename T, int N>
-std::ostream& operator<<(std::ostream& os, rts::quad<T, N> R)
-{
-    os<<R.str();
-    return os;
-}
-
-
-#endif
diff --git a/stim/math/rect.h b/stim/math/rect.h
index 969f88b..8890a18 100644
--- a/stim/math/rect.h
+++ b/stim/math/rect.h
@@ -28,13 +28,10 @@ class rect : plane <T>
 		O---------X--------->
 	*/
 
-private:
-
-	stim::vec<T> X;
-	stim::vec<T> Y;
-
-	
+protected:
 
+	stim::vec3<T> X;
+	stim::vec3<T> Y;
 
 public:
 
@@ -65,7 +62,7 @@ public:
 	///create a rectangle from a center point, normal
 	///@param c: x,y,z location of the center.
 	///@param n: x,y,z direction of the normal.
-	CUDA_CALLABLE rect(vec<T> c, vec<T> n = vec<T>(0, 0, 1))
+	CUDA_CALLABLE rect(vec3<T> c, vec3<T> n = vec3<T>(0, 0, 1))
 		: plane<T>()
 	{
 		init();			//start with the default setting
@@ -76,7 +73,7 @@ public:
 	///@param c: x,y,z location of the center.
 	///@param s: size of the rectangle.
 	///@param n: x,y,z direction of the normal.
-	CUDA_CALLABLE rect(vec<T> c, T s, vec<T> n = vec<T>(0, 0, 1))
+	CUDA_CALLABLE rect(vec3<T> c, T s, vec3<T> n = vec3<T>(0, 0, 1))
 		: plane<T>()
 	{
 		init();			//start with the default setting
@@ -89,7 +86,7 @@ public:
 	///@param center: x,y,z location of the center.
 	///@param directionX: u,v,w direction of the X vector.
 	///@param directionY: u,v,w direction of the Y vector.
-	CUDA_CALLABLE rect(vec<T> center, vec<T> directionX, vec<T> directionY )
+	CUDA_CALLABLE rect(vec3<T> center, vec3<T> directionX, vec3<T> directionY )
 		 : plane<T>((directionX.cross(directionY)).norm(),center)
 	{
 		X = directionX;
@@ -101,7 +98,7 @@ public:
 	///@param center: x,y,z location of the center.
 	///@param directionX: u,v,w direction of the X vector.
 	///@param directionY: u,v,w direction of the Y vector.
-	CUDA_CALLABLE rect(T size, vec<T> center, vec<T> directionX, vec<T> directionY )
+	CUDA_CALLABLE rect(T size, vec3<T> center, vec3<T> directionX, vec3<T> directionY )
 		: plane<T>((directionX.cross(directionY)).norm(),center)
 	{	
 		X = directionX;
@@ -114,7 +111,7 @@ public:
 	///@param center: x,y,z location of the center.
 	///@param directionX: u,v,w direction of the X vector.
 	///@param directionY: u,v,w direction of the Y vector.
-	CUDA_CALLABLE rect(vec<T> size, vec<T> center, vec<T> directionX, vec<T> directionY)
+	CUDA_CALLABLE rect(vec3<T> size, vec3<T> center, vec3<T> directionX, vec3<T> directionY)
 		: plane<T>((directionX.cross(directionY)).norm(), center)
 	{	
 		X = directionX;
@@ -138,7 +135,7 @@ public:
 
 	///@param n; vector with the normal.
 	///Orients the rectangle along the normal n.
-	CUDA_CALLABLE void normal(vec<T> n)
+	CUDA_CALLABLE void normal(vec3<T> n)
 	{	
 		//orient the rectangle along the specified normal
 		rotate(n, X, Y);
@@ -147,8 +144,8 @@ public:
 	///general init method that sets a general rectangle.
 	CUDA_CALLABLE void init()
 	{
-		X = vec<T>(1, 0, 0);
-		Y = vec<T>(0, 1, 0);
+		X = vec3<T>(1, 0, 0);
+		Y = vec3<T>(0, 1, 0);
 	}
 
 	//boolean comparison
@@ -162,18 +159,18 @@ public:
 
 
 	//get the world space value given the planar coordinates a, b in [0, 1]
-	CUDA_CALLABLE stim::vec<T> p(T a, T b)
+	CUDA_CALLABLE stim::vec3<T> p(T a, T b)
 	{
-		stim::vec<T> result;
+		stim::vec3<T> result;
 		//given the two parameters a, b = [0 1], returns the position in world space
-		vec<T> A = this->P - X * (T)0.5 - Y * (T)0.5;
+		vec3<T> A = this->P - X * (T)0.5 - Y * (T)0.5;
 		result = A + X * a + Y * b;
 
 		return result;
 	}
 
 	//parenthesis operator returns the world space given rectangular coordinates a and b in [0 1]
-	CUDA_CALLABLE stim::vec<T> operator()(T a, T b)
+	CUDA_CALLABLE stim::vec3<T> operator()(T a, T b)
 	{
 		return p(a, b);
 	}
@@ -181,12 +178,12 @@ public:
 	std::string str()
 	{
 		std::stringstream ss;
-		vec<T> A = P - X * (T)0.5 - Y * (T)0.5;
+		vec3<T> A = P - X * (T)0.5 - Y * (T)0.5;
 		ss<<std::left<<"B="<<std::setfill('-')<<std::setw(20)<<A + Y<<">"<<"C="<<A + Y + X<<std::endl;
 		ss<<std::setfill(' ')<<std::setw(23)<<"|"<<"|"<<std::endl<<std::setw(23)<<"|"<<"|"<<std::endl;
 		ss<<std::left<<"A="<<std::setfill('-')<<std::setw(20)<<A<<">"<<"D="<<A + X;
 
-        	return ss.str();
+        return ss.str();
 
 	}
 
@@ -205,11 +202,11 @@ public:
 
 	///computes the distance between the specified point and this rectangle.
 	///@param p: x, y, z coordinates of the point to calculate distance to.
-	CUDA_CALLABLE T dist(vec<T> p)
+	CUDA_CALLABLE T dist(vec3<T> p)
 	{
         //compute the distance between a point and this rect
 
-		vec<T> A = P - X * (T)0.5 - Y * (T)0.5;
+		vec3<T> A = P - X * (T)0.5 - Y * (T)0.5;
 
 		//first break the rect up into two triangles
 		triangle<T> T0(A, A+X, A+Y);
@@ -225,16 +222,16 @@ public:
 		    return d1;
 	}
 
-	CUDA_CALLABLE T center(vec<T> p)
+	CUDA_CALLABLE T center(vec3<T> p)
 	{
 		this->P = p;
 	}
 
 	///Returns the maximum distance of the rectangle from a point p to the sides of the rectangle.
 	///@param p: x, y, z point.
-	CUDA_CALLABLE T dist_max(vec<T> p)
+	CUDA_CALLABLE T dist_max(vec3<T> p)
 	{
-		vec<T> A = P - X * (T)0.5 - Y * (T)0.5;
+		vec3<T> A = P - X * (T)0.5 - Y * (T)0.5;
 		T da = (A - p).len();
 		T db = (A+X - p).len();
 		T dc = (A+Y - p).len();
diff --git a/stim/math/vec3.h b/stim/math/vec3.h
index 892a370..f5cedd2 100644
--- a/stim/math/vec3.h
+++ b/stim/math/vec3.h
@@ -242,4 +242,11 @@ stim::vec3<T> operator*(T lhs, stim::vec3<T> rhs){
     return rhs * lhs;
 }
 
+//stream operator
+template<typename T>
+std::ostream& operator<<(std::ostream& os, stim::vec3<T> const& rhs){
+	os<<rhs.str();
+	return os;
+}
+
 #endif
\ No newline at end of file
diff --git a/stim/math/vector.h b/stim/math/vector.h
index d8edf87..d5dbf87 100644
--- a/stim/math/vector.h
+++ b/stim/math/vector.h
@@ -5,8 +5,9 @@
 #include <cmath>
 #include <sstream>
 #include <vector>
-
+
 #include <stim/cuda/cudatools/callable.h>
+#include <stim/math/vec3.h>
 
 namespace stim
 {
@@ -70,8 +71,8 @@ struct vec : public std::vector<T>
 		size_t N = other.size();
 		resize(N);							//resize the current vector to match the copy
 		for(size_t i=0; i<N; i++){	//copy each element
-			at(i) = other[i];
-		}
+			at(i) = other[i];
+		}
 	}
 
 	//I'm not sure what these were doing here.
@@ -318,8 +319,8 @@ struct vec : public std::vector<T>
 	}
 
 	/// Cast to a vec3
-	operator vec3<T>(){
-		vec3<T> r;
+	operator stim::vec3<T>(){
+		stim::vec3<T> r;
 		size_t N = std::min<size_t>(size(), 3);
 		for(size_t i = 0; i < N; i++)
 			r[i] = at(i);
diff --git a/stim/optics/mie.h b/stim/optics/mie.h
index 9955205..12d208e 100644
--- a/stim/optics/mie.h
+++ b/stim/optics/mie.h
@@ -1,5 +1,6 @@
 #ifndef STIM_MIE_H
 #define STIM_MIE_H
+#include <boost/math/special_functions/bessel.hpp>
 
 #include "scalarwave.h"
 #include "../math/bessel.h"
@@ -43,7 +44,6 @@ void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){
 		numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l];
 		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
 		B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
-		std::cout<<B[l]<<std::endl;
 	}
 }
 
@@ -84,7 +84,7 @@ void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){
 
 #define LOCAL_NL	16
 template<typename T>
-__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, int Nl){
+__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){
 	extern __shared__ stim::complex<T> shared_hB[];		//declare the list of waves in shared memory
 
 	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
@@ -96,14 +96,11 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* 
 	
 	T r = p.len();														//calculate the distance from the sphere
 	if(r < a) return;													//exit if the point is inside the sphere (we only calculate the internal field)
-	T k = W[0].kmag();
-	size_t NC = Nl + 1;													//calculate the number of coefficients to be used
-	T kr = r * k;														//calculate the thread value for k*r
-	T fij = (kr - kr_min)/dkr;											//FP index into the spherical bessel LUT
+	T fij = (r - r_min)/dr;											//FP index into the spherical bessel LUT
 	size_t ij = (size_t) fij;											//convert to an integral index
 	T alpha = fij - ij;													//calculate the fractional portion of the index
-	size_t n0j = ij * (NC);												//start of the first entry in the LUT
-	size_t n1j = (ij+1) * (NC);											//start of the second entry in the LUT
+	size_t n0j = ij * (Nl + 1);												//start of the first entry in the LUT
+	size_t n1j = (ij+1) * (Nl + 1);											//start of the second entry in the LUT
 
 	T cos_phi;	
 	T Pl_2, Pl_1, Pl;														//declare registers to store the previous two Legendre polynomials
@@ -112,37 +109,36 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* 
 	stim::complex<T> Ei = 0;											//create a register to store the result
 	int l;
 
-	stim::complex<T> hlBl[LOCAL_NL+1];
-	int shared_start = threadIdx.x * (Nl - LOCAL_NL);
+	stim::complex<T> hlBl[LOCAL_NL+1];									//the first LOCAL_NL components are stored in registers for speed
+	int shared_start = threadIdx.x * (Nl - LOCAL_NL);					//wrap up some operations so that they aren't done in the main loops
 
-	#pragma unroll LOCAL_NL+1
+	#pragma unroll LOCAL_NL+1											//copy the first LOCAL_NL+1 h_l * B_l components to registers
 	for(l = 0; l <= LOCAL_NL; l++)
 		hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
 	
-	for(l = LOCAL_NL+1; l <= Nl; l++)
+	for(l = LOCAL_NL+1; l <= Nl; l++)									//copy any additional h_l * B_l components to shared memory
 		shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
 
-	for(size_t w = 0; w < nW; w++){
+	for(size_t w = 0; w < nW; w++){										//for each plane wave
 		cos_phi = p.norm().dot(W[w].kvec().norm());						//calculate the cosine of the angle between the k vector and the direction from the sphere
-		Pl_2 = 1;
+		Pl_2 = 1;														//the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
 		Pl_1 = cos_phi;
-		Ei += W[w].E() * hlBl[0] * Pl_2;
+		Ei += W[w].E() * hlBl[0] * Pl_2;								//unroll the first two orders using the initial steps of the Legendre recursive relation
 		Ei += W[w].E() * hlBl[1] * Pl_1;		
 
-		#pragma unroll LOCAL_NL-1
+		#pragma unroll LOCAL_NL-1										//unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
 		for(l = 2; l <= LOCAL_NL; l++){
-			Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
-			Ei += W[w].E() * hlBl[l] * Pl;
+			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);	//calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
+			Ei += W[w].E() * hlBl[l] * Pl;								//calculate and sum the current field order
 			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
 			Pl_1 = Pl;
 		}
 
-		for(l = LOCAL_NL+1; l <= Nl; l++){
-			Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
-			Ei += W[w].E() * shared_hB[shared_start + (l - (LOCAL_NL+1))] * Pl;
-			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
-			Pl_1 = Pl;
-			
+		for(l = LOCAL_NL+1; l <= Nl; l++){											//do the same as above, except for any additional orders that are stored in shared memory (not registers)
+			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);				//again, this is where most computation in the kernel occurs
+			Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl;
+			Pl_2 = Pl_1;															//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+			Pl_1 = Pl;			
 		}
 	}
 	E[i] += Ei;															//copy the result to device memory
@@ -152,10 +148,10 @@ template<typename T>
 void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
 	
 	size_t max_shared_mem = stim::sharedMemPerBlock();	
-	int hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
+	size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
 	std::cout<<"hl*Bl array size:  "<<hBl_array<<std::endl;
 	std::cout<<"shared memory:     "<<max_shared_mem<<std::endl;
-	int threads = (max_shared_mem / hBl_array) / 32 * 32;
+	int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);
 	std::cout<<"threads per block: "<<threads<<std::endl;
 	dim3 blocks((unsigned)(N / threads + 1));										//calculate the optimal number of blocks
 
@@ -164,7 +160,6 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti
 	else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL);				//amount of shared memory to allocate
 	std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
 	cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl);	//call the kernel
-
 }
 
 template<typename T>
@@ -261,16 +256,19 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std
 		exit(1);
 	}
 
+	i_min--;				//cuBLAS uses 1-based indexing for Fortran compatibility
+	i_max--;
 	T r_min, r_max;											//allocate space to store the minimum and maximum values
 	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
 	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
 
+	r_min = max(r_min, a);									//if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere)
 
 	//size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1);			//number of values in the look-up table based on the user-specified spacing along r
 	size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1);
 
-	T kr_min = k * r_min;
-	T kr_max = k * r_max;
+	//T kr_min = k * r_min;
+	//T kr_max = k * r_max;
 
 	//temporary variables
 	double vm;															//allocate space to store the return values for the bessel function calculation
@@ -281,27 +279,29 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std
 
 	size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;
 	stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes);													//pointer to the look-up table
-	T dkr = (kr_max - kr_min) / (N_hB_lut-1);												//distance between values in the LUT
+	T dr = (r_max - r_min) / (N_hB_lut-1);												//distance between values in the LUT
 	std::cout<<"LUT jl bytes:  "<<hB_bytes<<std::endl;
 	stim::complex<T> hl;
-	for(size_t kri = 0; kri < N_hB_lut; kri++){													//for each value in the LUT
-		stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
+	for(size_t ri = 0; ri < N_hB_lut; ri++){													//for each value in the LUT
+		stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
 		for(size_t l = 0; l <= Nl; l++){													//for each order
 			hl.r = (T)jv[l];
 			hl.i = (T)yv[l];
 
-			hB_lut[kri * (Nl + 1) + l] = hl * B[l];										//store the bessel function result
+			hB_lut[ri * (Nl + 1) + l] = hl * B[l];										//store the bessel function result
+			//std::cout<<hB_lut[ri * (Nl + 1) + l]<<std::endl;
 		}
 	}
-
-	//stim::cpu2image<T>(hankel_lut, "hankel.bmp", Nl+1, Nlut_j, stim::cmBrewer);
+	T* real_lut = (T*) malloc(hB_bytes/2);
+	stim::real(real_lut, hB_lut, N_hB_lut);
+	stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer);
 
 	//Allocate device memory and copy everything to the GPU
 	stim::complex<T>* dev_hB_lut;
 	HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
 	HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
 
-	gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, kr_min, dkr, N_hB_lut, Nl);
+	gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl);
 
 	cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost);			//copy the field from device memory
 
@@ -349,9 +349,90 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std
 }
 
 template<typename T>
-void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
+void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){
 	std::vector< stim::scalarwave<T> > W(1, w);
-	cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n);
+	cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing);
+}
+
+template<typename T>
+__global__ void cuda_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, int Nl){
+	extern __shared__ stim::complex<T> shared_jA[];		//declare the list of waves in shared memory
+
+	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
+	if(i >= N) return;													//exit if this thread is outside the array
+	stim::vec3<T> p;
+	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
+	(y == NULL) ? p[1] = 0 : p[1] = y[i];
+	(z == NULL) ? p[2] = 0 : p[2] = z[i];
+	
+	T r = p.len();														//calculate the distance from the sphere
+	if(r > a) return;													//exit if the point is inside the sphere (we only calculate the internal field)
+	T fij = (r - r_min)/dr;											//FP index into the spherical bessel LUT
+	size_t ij = (size_t) fij;											//convert to an integral index
+	T alpha = fij - ij;													//calculate the fractional portion of the index
+	size_t n0j = ij * (Nl + 1);												//start of the first entry in the LUT
+	size_t n1j = (ij+1) * (Nl + 1);											//start of the second entry in the LUT
+
+	T cos_phi;	
+	T Pl_2, Pl_1, Pl;														//declare registers to store the previous two Legendre polynomials
+	
+	stim::complex<T> jAl;
+	stim::complex<T> Ei = 0;											//create a register to store the result
+	int l;
+
+	stim::complex<T> jlAl[LOCAL_NL+1];									//the first LOCAL_NL components are stored in registers for speed
+	int shared_start = threadIdx.x * (Nl - LOCAL_NL);					//wrap up some operations so that they aren't done in the main loops
+
+	#pragma unroll LOCAL_NL+1											//copy the first LOCAL_NL+1 h_l * B_l components to registers
+	for(l = 0; l <= LOCAL_NL; l++)
+		jlAl[l] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha );
+	
+	for(l = LOCAL_NL+1; l <= Nl; l++)									//copy any additional h_l * B_l components to shared memory
+		shared_jA[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha );
+
+	for(size_t w = 0; w < nW; w++){										//for each plane wave
+		if(r == 0) cos_phi = 0;
+		else
+			cos_phi = p.norm().dot(W[w].kvec().norm());						//calculate the cosine of the angle between the k vector and the direction from the sphere
+		Pl_2 = 1;														//the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
+		Pl_1 = cos_phi;
+		Ei += W[w].E() * jlAl[0] * Pl_2;								//unroll the first two orders using the initial steps of the Legendre recursive relation
+		Ei += W[w].E() * jlAl[1] * Pl_1;		
+
+		#pragma unroll LOCAL_NL-1										//unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
+		for(l = 2; l <= LOCAL_NL; l++){
+			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);	//calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
+			Ei += W[w].E() * jlAl[l] * Pl;								//calculate and sum the current field order
+			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+			Pl_1 = Pl;
+		}
+
+		for(l = LOCAL_NL+1; l <= Nl; l++){											//do the same as above, except for any additional orders that are stored in shared memory (not registers)
+			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);				//again, this is where most computation in the kernel occurs
+			Ei += W[w].E() * shared_jA[shared_start + l - LOCAL_NL - 1] * Pl;
+			Pl_2 = Pl_1;															//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+			Pl_1 = Pl;			
+		}
+	}
+	E[i] = Ei;															//copy the result to device memory
+}
+
+template<typename T>
+void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, size_t Nl){
+	
+	size_t max_shared_mem = stim::sharedMemPerBlock();	
+	size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
+	std::cout<<"hl*Bl array size:  "<<hBl_array<<std::endl;
+	std::cout<<"shared memory:     "<<max_shared_mem<<std::endl;
+	int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);
+	std::cout<<"threads per block: "<<threads<<std::endl;
+	dim3 blocks((unsigned)(N / threads + 1));										//calculate the optimal number of blocks
+
+	size_t shared_mem;
+	if(Nl <= LOCAL_NL) shared_mem = 0;
+	else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL);				//amount of shared memory to allocate
+	std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
+	cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl);	//call the kernel
 }
 
 /// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere
@@ -365,18 +446,122 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti
 /// @param a is the radius of the sphere
 /// @param n is the complex refractive index of the sphere
 template<typename T>
-void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n){
-
-	//calculate the necessary number of orders required to represent the scattered field
+void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, T r_spacing = 0.1){
+//calculate the necessary number of orders required to represent the scattered field
 	T k = W[0].kmag();
 
-	size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
+	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
+	if(Nl < LOCAL_NL) Nl = LOCAL_NL;							//always do at least the minimum number of local operations (kernel optimization)
 	std::cout<<"Nl: "<<Nl<<std::endl;
 
 	//calculate the scattering coefficients for the sphere
 	stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
 	A_coefficients(A, a, k, n, Nl);
 
+#ifdef CUDA_FOUND
+	stim::complex<T>* dev_E;										//allocate space for the field
+	cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
+	cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
+	//cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>));				//set the field to zero (necessary because a sum is used)
+
+	//	COORDINATES
+	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
+	if(x != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+	T* dev_y = NULL;												//allocate space and copy the Y coordinate (if specified)
+	if(y != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+	T* dev_z = NULL;												//allocate space and copy the Z coordinate (if specified)
+	if(z != NULL){
+		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
+	}
+
+	//	PLANE WAVES
+	stim::scalarwave<T>* dev_W;																//allocate space and copy plane waves
+	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
+	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
+
+	// BESSEL FUNCTION LOOK-UP TABLE
+	//calculate the distance from the sphere center
+	T* dev_r;
+	HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
+		
+	int threads = stim::maxThreadsPerBlock();
+	dim3 blocks((unsigned)(N / threads + 1));
+	cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
+
+	//Find the minimum and maximum values of r
+    cublasStatus_t stat;
+    cublasHandle_t handle;
+
+	stat = cublasCreate(&handle);							//create a cuBLAS handle
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS initialization failed\n");
+		exit(1);
+	}
+
+	int i_min, i_max;
+	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS Error: failed to calculate minimum r value.\n");
+		exit(1);
+	}
+	stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+        printf ("CUBLAS Error: failed to calculate maximum r value.\n");
+		exit(1);
+	}
+
+	i_min--;				//cuBLAS uses 1-based indexing for Fortran compatibility
+	i_max--;
+	T r_min, r_max;											//allocate space to store the minimum and maximum values
+	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
+	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
+
+	r_max = min(r_max, a);		//the internal field doesn't exist outside of the sphere
+
+	size_t N_jA_lut = (size_t)((r_max - r_min) / r_spacing + 1);
+
+	//temporary variables
+	double vm;															//allocate space to store the return values for the bessel function calculation
+	stim::complex<double>* jv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* yv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* djv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+	stim::complex<double>* dyv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
+
+	size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut;
+	stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes);													//pointer to the look-up table
+	T dr = (r_max - r_min) / (N_jA_lut-1);												//distance between values in the LUT
+	std::cout<<"LUT jl bytes:  "<<jA_bytes<<std::endl;
+	stim::complex<T> hl;
+	stim::complex<double> nd = (stim::complex<double>)n;
+	for(size_t ri = 0; ri < N_jA_lut; ri++){													//for each value in the LUT
+		stim::cbessjyva_sph<double>(Nl, nd * k * (r_min + ri * dr), vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
+		for(size_t l = 0; l <= Nl; l++){													//for each order
+			jA_lut[ri * (Nl + 1) + l] = (stim::complex<T>)(jv[l] * (stim::complex<double>)A[l]);										//store the bessel function result
+		}
+	}
+
+	//Allocate device memory and copy everything to the GPU
+	stim::complex<T>* dev_jA_lut;
+	HANDLE_ERROR( cudaMalloc(&dev_jA_lut, jA_bytes) );
+	HANDLE_ERROR( cudaMemcpy(dev_jA_lut, jA_lut, jA_bytes, cudaMemcpyHostToDevice) );
+
+	gpu_scalar_mie_internal<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_jA_lut, r_min, dr, N_jA_lut, Nl);
+
+	cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost);			//copy the field from device memory
+
+	if(x != NULL) cudaFree(dev_x);														//free everything
+	if(y != NULL) cudaFree(dev_y);
+	if(z != NULL) cudaFree(dev_z);
+	cudaFree(dev_E);
+#else
+
 	//allocate space to store the bessel function call results
 	double vm;										
 	stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
@@ -414,12 +599,13 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st
 			}
 		}
 	}
+#endif
 }
 
 template<typename T>
-void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
+void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){
 	std::vector< stim::scalarwave<T> > W(1, w);
-	cpu_scalar_mie_internal(E, N, x, y, z, W, a, n);
+	cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, r_spacing);
 }
 
 }
diff --git a/stim/optics/scalarbeam.h b/stim/optics/scalarbeam.h
index 93881eb..800f4a3 100644
--- a/stim/optics/scalarbeam.h
+++ b/stim/optics/scalarbeam.h
@@ -1,5 +1,6 @@
 #ifndef RTS_BEAM
 #define RTS_BEAM
+#include <boost/math/special_functions/bessel.hpp>
 
 #include "../math/vec3.h"
 #include "../optics/scalarwave.h"
@@ -7,150 +8,68 @@
 #include "../math/legendre.h"
 #include "../cuda/cudatools/devices.h"
 #include "../cuda/cudatools/timer.h"
+#include "../optics/scalarfield.h"
 #include <cublas_v2.h>
 #include <math_constants.h>
 #include <vector>
 #include <stdlib.h>
 
-namespace stim{
 
-		/// Function returns the value of the scalar field produced by a beam with the specified parameters
-
-		template<typename T>
-		std::vector< stim::vec3<T> > generate_focusing_vectors(size_t N, stim::vec3<T> d, T NA, T NA_in = 0){
-
-			std::vector< stim::vec3<T> > dirs(N);					//allocate an array to store the focusing vectors
-
-			///compute the rotation operator to transform (0, 0, 1) to k
-			T cos_angle = d.dot(vec3<T>(0, 0, 1));
-			stim::matrix<T, 3> rotation;
-
-			//if the cosine of the angle is -1, the rotation is just a flip across the z axis
-			if(cos_angle == -1){
-				rotation(2, 2) = -1;
-			}
-			else if(cos_angle != 1.0)
-			{
-				vec3<T> r_axis = vec3<T>(0, 0, 1).cross(d).norm();	//compute the axis of rotation
-				T angle = acos(cos_angle);							//compute the angle of rotation
-				quaternion<T> quat;							//create a quaternion describing the rotation
-				quat.CreateRotation(angle, r_axis);
-				rotation = quat.toMatrix3();							//compute the rotation matrix
-			}
-
-			//find the phi values associated with the cassegrain ring
-			T PHI[2];
-			PHI[0] = (T)asin(NA);
-			PHI[1] = (T)asin(NA_in);
-
-			//calculate the z-axis cylinder coordinates associated with these angles
-			T Z[2];
-			Z[0] = cos(PHI[0]);
-			Z[1] = cos(PHI[1]);
-			T range = Z[0] - Z[1];
-
-			//draw a distribution of random phi, z values
-			T z, phi, theta;
-			//T kmag = stim::TAU / lambda;
-			for(int i=0; i<N; i++){								//for each sample
-				z = (T)((double)rand() / (double)RAND_MAX) * range + Z[1];			//find a random position on the surface of a cylinder
-				theta = (T)(((double)rand() / (double)RAND_MAX) * stim::TAU);
-				phi = acos(z);													//project onto the sphere, computing phi in spherical coordinates
-
-				//compute and store cartesian coordinates
-				vec3<T> spherical(1, theta, phi);								//convert from spherical to cartesian coordinates
-				vec3<T> cart = spherical.sph2cart();
-				dirs[i] = rotation * cart;										//create a sample vector
-			}
-			return dirs;
-		}
-		
-/// Class stim::beam represents a beam of light focused at a point and composed of several plane waves
-template<typename T>
-class scalarbeam
-{
-public:
-	//enum beam_type {Uniform, Bartlett, Hamming, Hanning};
-
-private:
-	
-	T NA[2];				//numerical aperature of the focusing optics	
-	vec3<T> f;				//focal point
-	vec3<T> d;				//propagation direction
-	stim::complex<T> A;		//beam amplitude
-	T lambda;				//beam wavelength
-public:
 
-	///constructor: build a default beam (NA=1.0)
-	scalarbeam(T wavelength = 1, stim::complex<T> amplitude = 1, vec3<T> focal_point = vec3<T>(0, 0, 0), vec3<T> direction = vec3<T>(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){
-		lambda = wavelength;
-		A = amplitude;
-		f = focal_point;
-		d = direction.norm();					//make sure that the direction vector is normalized (makes calculations more efficient later on)
-		NA[0] = numerical_aperture;
-		NA[1] = center_obsc;
-	}
+namespace stim{
 
-	///Numerical Aperature functions
-	void setNA(T na)
-	{
-		NA[0] = (T)0;
-		NA[1] = na;
-	}
-	void setNA(T na0, T na1)
-	{
-		NA[0] = na0;
-		NA[1] = na1;
-	}
+/// Function returns the value of the scalar field produced by a beam with the specified parameters
 
-	//Monte-Carlo decomposition into plane waves
-	std::vector< scalarwave<T> > mc(size_t N = 100000) const{
+template<typename T>
+std::vector< stim::vec3<T> > generate_focusing_vectors(size_t N, stim::vec3<T> d, T NA, T NA_in = 0){
 
-		std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]);	//generate a random set of N vectors forming a focus
-		std::vector< scalarwave<T> > samples(N);											//create a vector of plane waves
-		T kmag = (T)stim::TAU / lambda;								//calculate the wavenumber
-		stim::complex<T> apw;										//allocate space for the amplitude at the focal point
-		T a = (T)(stim::TAU * (1 - cos(asin(NA[0]))) / (double)N);
-		stim::vec3<T> kpw;											//declare the new k-vector based on the focused plane wave direction
-		for(size_t i=0; i<N; i++){										//for each sample
-			kpw = dirs[i] * kmag;									//calculate the k-vector for the new plane wave
-			apw = a * exp(stim::complex<T>(0, kpw.dot(-f)));				//calculate the amplitude for the new plane wave
-			samples[i] = scalarwave<T>(kpw, apw);			//create a plane wave based on the direction
-		}
+	std::vector< stim::vec3<T> > dirs(N);					//allocate an array to store the focusing vectors
 
-		return samples;
-	}
+	///compute the rotation operator to transform (0, 0, 1) to k
+	T cos_angle = d.dot(vec3<T>(0, 0, 1));
+	stim::matrix<T, 3> rotation;
 
-	/// Calculate the field at a given point
-	/// @param x is the x-coordinate of the field point
-	/// @O is the approximation accuracy
-	stim::complex<T> field(T x, T y, T z, size_t O){
-		std::vector< scalarwave<T> > W = mc(O);
-		T result = 0;											//initialize the result to zero (0)
-		for(size_t i = 0; i < O; i++){							//for each plane wave
-			result += W[i].pos(x, y, z);
-		}
-		return result;
+	//if the cosine of the angle is -1, the rotation is just a flip across the z axis
+	if(cos_angle == -1){
+		rotation(2, 2) = -1;
 	}
-
-	std::string str()
+	else if(cos_angle != 1.0)
 	{
-		std::stringstream ss;
-		ss<<"Beam:"<<std::endl;
-		//ss<<"	Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl;
-		ss<<"	Beam Direction: "<<d<<std::endl;
-		if(NA[0] == 0)
-			ss<<"	NA: "<<NA[1];
-		else
-			ss<<"	NA: "<<NA[0]<<" -- "<<NA[1];
-
-		return ss.str();
+		vec3<T> r_axis = vec3<T>(0, 0, 1).cross(d).norm();	//compute the axis of rotation
+		T angle = acos(cos_angle);							//compute the angle of rotation
+		quaternion<T> quat;							//create a quaternion describing the rotation
+		quat.CreateRotation(angle, r_axis);
+		rotation = quat.toMatrix3();							//compute the rotation matrix
 	}
 
+	//find the phi values associated with the cassegrain ring
+	T PHI[2];
+	PHI[0] = (T)asin(NA);
+	PHI[1] = (T)asin(NA_in);
+
+	//calculate the z-axis cylinder coordinates associated with these angles
+	T Z[2];
+	Z[0] = cos(PHI[0]);
+	Z[1] = cos(PHI[1]);
+	T range = Z[0] - Z[1];
+
+	//draw a distribution of random phi, z values
+	T z, phi, theta;
+	//T kmag = stim::TAU / lambda;
+	for(int i=0; i<N; i++){								//for each sample
+		z = (T)((double)rand() / (double)RAND_MAX) * range + Z[1];			//find a random position on the surface of a cylinder
+		theta = (T)(((double)rand() / (double)RAND_MAX) * stim::TAU);
+		phi = acos(z);													//project onto the sphere, computing phi in spherical coordinates
+
+		//compute and store cartesian coordinates
+		vec3<T> spherical(1, theta, phi);								//convert from spherical to cartesian coordinates
+		vec3<T> cart = spherical.sph2cart();
+		dirs[i] = rotation * cart;										//create a sample vector
+	}
+	return dirs;
+}
 
-
-};			//end beam
-
+		
 /// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional)
 /// @param C is a pointer to Nl + 1 values where the terms will be stored
 template<typename T>
@@ -210,20 +129,19 @@ CUDA_CALLABLE T lerp(T v0, T v1, T t) {
     return fma(t, v1, fma(-t, v0, v0));
 }
 
-#ifdef __CUDACC__
+#ifdef CUDA_FOUND
 template<typename T>
-__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T k, T A, size_t Nl,
+__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T A, size_t Nl,
 								T* C, 
-								T* lut_j, size_t Nj, T min_kr, T dkr){
+								T* lut_j, size_t Nj, T min_r, T dr){
 	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
 	if(i >= N) return;												//exit if this thread is outside the array
 
 	T cos_phi = cos(phi[i]);									//calculate the thread value for cos(phi)
-	T kr = r[i] * k;											//calculate the thread value for k*r
 	stim::complex<T> Ei = 0;									//initialize the value of the field to zero
 	size_t NC = Nl + 1;										//calculate the number of coefficients to be used
 
-	T fij = (kr - min_kr)/dkr;								//FP index into the spherical bessel LUT
+	T fij = (r[i] - min_r)/dr;								//FP index into the spherical bessel LUT
 	size_t ij = (size_t) fij;								//convert to an integral index
 	T a = fij - ij;											//calculate the fractional portion of the index
 	size_t n0j = ij * (NC);									//start of the first entry in the LUT
@@ -276,6 +194,8 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda,
 		exit(1);
 	}
 
+	i_min--;												//cuBLAS uses 1-based indexing for Fortran compatibility
+	i_max--;
 	T r_min, r_max;											//allocate space to store the minimum and maximum values
 	HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
 	HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
@@ -287,29 +207,19 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda,
 
 	size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1);			//number of values in the look-up table based on the user-specified spacing along r
 
-	T kr_min = k * r_min;
-	T kr_max = k * r_max;
-
-	//temporary variables
-	double vm;															//allocate space to store the return values for the bessel function calculation
-	double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
-	double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
-	double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
-	double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
 
 	size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
-	T* bessel_lut = (T*) malloc(lutj_bytes);													//pointer to the look-up table
-	T delta_kr = (kr_max - kr_min) / (Nlut_j-1);												//distance between values in the LUT
-	std::cout<<"LUT jl bytes:  "<<lutj_bytes<<std::endl;
-	for(size_t kri = 0; kri < Nlut_j; kri++){													//for each value in the LUT
-		stim::bessjyv_sph<double>(Nl, kr_min + kri * delta_kr, vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
+	T* j_lut = (T*) malloc(lutj_bytes);													//pointer to the look-up table
+	T dr = (r_max - r_min) / (Nlut_j-1);												//distance between values in the LUT
+	T jl;
+	for(size_t ri = 0; ri < Nlut_j; ri++){													//for each value in the LUT
 		for(size_t l = 0; l <= Nl; l++){													//for each order
-			bessel_lut[kri * (Nl + 1) + l] = (T)jv[l];										//store the bessel function result
+			jl = boost::math::sph_bessel<T>(l, k*(r_min + ri * dr));					//use boost to calculate the spherical bessel function
+			j_lut[ri * (Nl + 1) + l] = jl;												//store the bessel function result
 		}
 	}
 
-	stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
-
+	stim::cpu2image<T>(j_lut, "j_lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
 	//Allocate device memory and copy everything to the GPU
 
 	T* gpu_C;
@@ -317,12 +227,12 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda,
 	HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) );
 	T* gpu_j_lut;
 	HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) );
-	HANDLE_ERROR( cudaMemcpy(gpu_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) );
+	HANDLE_ERROR( cudaMemcpy(gpu_j_lut, j_lut, lutj_bytes, cudaMemcpyHostToDevice) );
 
 	int threads = stim::maxThreadsPerBlock();			//get the maximum number of threads per block for the CUDA device
 	dim3 blocks( (unsigned)(N / threads + 1));						//calculate the optimal number of blocks
 
-	cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, (T)stim::TAU/lambda, A, Nl, gpu_C, gpu_j_lut, Nlut_j, kr_min, delta_kr);
+	cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, A, Nl, gpu_C, gpu_j_lut, Nlut_j, r_min, dr);
 
 	//free the LUT and condenser tables
 	HANDLE_ERROR( cudaFree(gpu_C) );
@@ -392,7 +302,7 @@ __global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::ve
 	phi[i] = ps[2];										//phi = [0 pi]
 }
 
-#ifdef __CUDACC__
+#ifdef CUDA_FOUND
 /// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates
 template<typename T>
 void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
@@ -419,7 +329,7 @@ template<typename T>
 void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
 
 // If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel
-#ifdef __CUDACC__
+#ifdef CUDA_FOUND
 
 	T* gpu_x = NULL;
 	if(x != NULL){
@@ -470,13 +380,112 @@ void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lamb
 		phi[i] = ps[2];							//phi = [0 pi]
 	}
 
-	cpu_scalar_psf_local(F, N, r, phi, lambda, A, NA, NA_in, Nl);		//call the spherical coordinate CPU function
+	cpu_scalar_psf_local(E, N, r, phi, lambda, A, NA, NA_in, Nl);		//call the spherical coordinate CPU function
 
 	free(r);
 	free(phi);
 #endif
 }
+		
+/// Class stim::beam represents a beam of light focused at a point and composed of several plane waves
+template<typename T>
+class scalarbeam
+{
+public:
+	//enum beam_type {Uniform, Bartlett, Hamming, Hanning};
+
+private:
+	
+	T NA[2];				//numerical aperature of the focusing optics	
+	vec3<T> f;				//focal point
+	vec3<T> d;				//propagation direction
+	T A;		//beam amplitude
+	T lambda;				//beam wavelength
+public:
 
+	///constructor: build a default beam (NA=1.0)
+	scalarbeam(T wavelength = 1, T amplitude = 1, vec3<T> focal_point = vec3<T>(0, 0, 0), vec3<T> direction = vec3<T>(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){
+		lambda = wavelength;
+		A = amplitude;
+		f = focal_point;
+		d = direction.norm();					//make sure that the direction vector is normalized (makes calculations more efficient later on)
+		NA[0] = numerical_aperture;
+		NA[1] = center_obsc;
+	}
+
+	///Numerical Aperature functions
+	void setNA(T na)
+	{
+		NA[0] = (T)0;
+		NA[1] = na;
+	}
+	void setNA(T na0, T na1)
+	{
+		NA[0] = na0;
+		NA[1] = na1;
+	}
+
+	//Monte-Carlo decomposition into plane waves
+	std::vector< scalarwave<T> > mc(size_t N = 100000) const{
+
+		std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]);	//generate a random set of N vectors forming a focus
+		std::vector< scalarwave<T> > samples(N);											//create a vector of plane waves
+		T kmag = (T)stim::TAU / lambda;								//calculate the wavenumber
+		stim::complex<T> apw;										//allocate space for the amplitude at the focal point
+		T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N);			//constant value weights plane waves based on the aperture and number of samples (N)
+		stim::vec3<T> kpw;											//declare the new k-vector based on the focused plane wave direction
+		for(size_t i=0; i<N; i++){										//for each sample
+			kpw = dirs[i] * kmag;									//calculate the k-vector for the new plane wave
+			apw = a * exp(stim::complex<T>(0, kpw.dot(-f)));				//calculate the amplitude for the new plane wave
+			samples[i] = scalarwave<T>(kpw, apw);			//create a plane wave based on the direction
+		}
+		return samples;
+	}
+
+	/// Evaluate the beam to a scalar field using Debye focusing
+	void eval(stim::scalarfield<T>& E, size_t order = 500){
+		size_t array_size = E.grid_bytes();
+		T* X = (T*) malloc( array_size );			//allocate space for the coordinate meshes
+		T* Y = (T*) malloc( array_size );
+		T* Z = (T*) malloc( array_size );
+
+		E.meshgrid(X, Y, Z, stim::CPUmem);			//calculate the coordinate meshes
+		cpu_scalar_psf_cart<T>(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing());
+
+		free(X);									//free the coordinate meshes
+		free(Y);
+		free(Z);
+	}
+
+	/// Calculate the field at a given point
+	/// @param x is the x-coordinate of the field point
+	/// @O is the approximation accuracy
+	stim::complex<T> field(T x, T y, T z, size_t O){
+		std::vector< scalarwave<T> > W = mc(O);
+		T result = 0;											//initialize the result to zero (0)
+		for(size_t i = 0; i < O; i++){							//for each plane wave
+			result += W[i].pos(x, y, z);
+		}
+		return result;
+	}
+
+	std::string str()
+	{
+		std::stringstream ss;
+		ss<<"Beam:"<<std::endl;
+		//ss<<"	Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl;
+		ss<<"	Beam Direction: "<<d<<std::endl;
+		if(NA[0] == 0)
+			ss<<"	NA: "<<NA[1];
+		else
+			ss<<"	NA: "<<NA[0]<<" -- "<<NA[1];
+
+		return ss.str();
+	}
+
+
+
+};			//end beam
 }			//end namespace stim
 
 #endif
--
libgit2 0.21.4