stim/optics/scalarmie.h

#ifndef STIM_MIE_H
#define STIM_MIE_H
#include <boost/math/special_functions/bessel.hpp>
#include "scalarwave.h"
#include "../math/bessel.h"
#include "../cuda/cudatools/devices.h"
#include <cmath>
namespace stim{
/// Calculate the scattering coefficients B_l for a spherical scatterer
/// @param B is the destination array; orders 0 through Nl are written, so it must hold at least (Nl + 1) values
/// @param a is the radius of the sphere
/// @param k is the wavenumber (the Bessel functions are evaluated at k*a and k*n*a)
/// @param n is the complex refractive index of the sphere
/// @param Nl is the highest order to compute
template<typename T>
void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){
	//temporary variables
	double vm;															//receives the highest order computed by the bessel routines
	double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );		//scratch space for the bessel functions (and derivatives) at k*a
	double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );	//scratch space for the complex bessel functions at k*n*a
	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)
	stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
	stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a
	stim::complex<double> h_ka, dh_ka;
	stim::complex<double> numerator, denominator;
	stim::complex<double> i(0, 1);
	for(int l = 0; l <= Nl; l++){
		h_ka.r = j_ka[l];												//assemble the Hankel function h = j + i*y and its derivative
		h_ka.i = y_ka[l];
		dh_ka.r = dj_ka[l];
		dh_ka.i = dy_ka[l];
		numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l];
		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
		B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
	}
	//release the scratch buffers (they were previously leaked on every call)
	free(j_ka);
	free(y_ka);
	free(dj_ka);
	free(dy_ka);
	free(j_kna);
	free(y_kna);
	free(dj_kna);
	free(dy_kna);
}
/// Calculate the coefficients A_l used for the field inside a spherical scatterer
/// @param A is the destination array; orders 0 through Nl are written, so it must hold at least (Nl + 1) values
/// @param a is the radius of the sphere
/// @param k is the wavenumber (the Bessel functions are evaluated at k*a and k*n*a)
/// @param n is the complex refractive index of the sphere
/// @param Nl is the highest order to compute
template<typename T>
void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){
	//temporary variables
	double vm;															//receives the highest order computed by the bessel routines
	double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );		//scratch space for the bessel functions (and derivatives) at k*a
	double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );	//scratch space for the complex bessel functions at k*n*a
	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)
	stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
	stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a
	stim::complex<double> h_ka, dh_ka;
	stim::complex<double> numerator, denominator;
	stim::complex<double> i(0, 1);
	for(int l = 0; l <= Nl; l++){										//signed index, matching B_coefficients (avoids a signed/unsigned comparison with Nl)
		h_ka.r = j_ka[l];												//assemble the Hankel function h = j + i*y and its derivative
		h_ka.i = y_ka[l];
		dh_ka.r = dj_ka[l];
		dh_ka.i = dy_ka[l];
		numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka;
		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
		A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
	}
	//release the scratch buffers (they were previously leaked on every call)
	free(j_ka);
	free(y_ka);
	free(dj_ka);
	free(dy_ka);
	free(j_kna);
	free(y_kna);
	free(dj_kna);
	free(dy_kna);
}
#define LOCAL_NL	16
/// Kernel: accumulate the scattered field at every sample point that lies on or outside the sphere.
/// One thread processes one point; the result is ADDED to E[i].
/// Launch requirements: a 1D grid covering N points, and dynamic shared memory of at least
/// blockDim.x * (Nl - LOCAL_NL) * sizeof(stim::complex<T>) bytes. The register loops read orders
/// 0..LOCAL_NL from the table, so Nl must be >= LOCAL_NL.
/// @param E is the field that the scattered contribution is added to (N values)
/// @param N is the number of sample points
/// @param x, y, z are the point coordinates relative to the sphere (a NULL array is treated as all zeros)
/// @param W is the array of plane waves and nW is the number of waves
/// @param a is the radius of the sphere
/// @param n is the refractive index of the sphere (not used by this kernel)
/// @param hB is a look-up table with N_hB rows of (Nl + 1) values, row j corresponding to r = r_min + j * dr
/// @param r_min is the radius of the first LUT row and dr is the spacing between rows
/// @param N_hB is the number of rows in the LUT
/// @param Nl is the highest order stored in each LUT row
template<typename T>
__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){
	extern __shared__ stim::complex<T> shared_hB[];		//per-thread storage for the orders above LOCAL_NL
	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
	if(i >= N) return;													//exit if this thread is outside the array
	stim::vec3<T> p;
	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
	(y == NULL) ? p[1] = 0 : p[1] = y[i];
	(z == NULL) ? p[2] = 0 : p[2] = z[i];
	
	T r = p.len();														//calculate the distance from the sphere
	if(r < a) return;													//exit if the point is inside the sphere (the scattered field is only calculated outside)

	//Locate the two LUT rows that bracket r. The index is clamped to the table so that a point
	//	at (or beyond) the last row interpolates between the final two rows instead of reading
	//	past the end of the table, and a degenerate spacing never produces a NaN/inf index.
	size_t ij = 0;														//index of the lower LUT row
	T alpha = 0;														//interpolation weight of the upper LUT row
	if(N_hB > 1 && dr > 0){
		T fij = (r - r_min) / dr;										//FP index into the LUT
		if(fij >= (T)(N_hB - 1)){										//at or past the last row
			ij = N_hB - 2;
			alpha = 1;
		}
		else if(fij > 0){												//(also false when fij is NaN)
			ij = (size_t) fij;											//convert to an integral index
			alpha = fij - ij;											//calculate the fractional portion of the index
		}
	}
	size_t n0j = ij * (Nl + 1);											//start of the first entry in the LUT
	size_t n1j = (N_hB > 1) ? (ij + 1) * (Nl + 1) : n0j;				//start of the second entry in the LUT (same row for a single-row table)
	T cos_phi;	
	T Pl_2, Pl_1, Pl;														//declare registers to store the previous two Legendre polynomials
	
	stim::complex<T> Ei = 0;											//create a register to store the result
	int l;
	stim::complex<T> hlBl[LOCAL_NL+1];									//the first LOCAL_NL+1 components are stored in registers for speed
	int shared_start = threadIdx.x * (Nl - LOCAL_NL);					//offset of this thread's slice of shared memory
	#pragma unroll LOCAL_NL+1											//copy the first LOCAL_NL+1 h_l * B_l components to registers
	for(l = 0; l <= LOCAL_NL; l++)
		hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
	
	for(l = LOCAL_NL+1; l <= Nl; l++)									//copy any additional h_l * B_l components to shared memory
		shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
	for(size_t w = 0; w < nW; w++){										//for each plane wave
		cos_phi = p.norm().dot(W[w].kvec().norm());						//calculate the cosine of the angle between the k vector and the direction from the sphere
		Pl_2 = 1;														//the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
		Pl_1 = cos_phi;
		Ei += W[w].E() * hlBl[0] * Pl_2;								//unroll the first two orders using the initial steps of the Legendre recursive relation
		Ei += W[w].E() * hlBl[1] * Pl_1;		
		#pragma unroll LOCAL_NL-1										//unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
		for(l = 2; l <= LOCAL_NL; l++){
			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);	//calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
			Ei += W[w].E() * hlBl[l] * Pl;								//calculate and sum the current field order
			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;
		}
		for(l = LOCAL_NL+1; l <= Nl; l++){											//do the same as above, except for any additional orders that are stored in shared memory (not registers)
			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);				//again, this is where most computation in the kernel occurs
			Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl;
			Pl_2 = Pl_1;															//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;			
		}
	}
	E[i] += Ei;															//add the scattered field to the value already in device memory
}
/// Choose a launch configuration and run the cuda_scalar_mie_scatter kernel.
/// All pointer arguments are passed straight to the kernel, so they must be device pointers.
/// @param E is the device field that the scattered contribution is added to (N values)
/// @param N is the number of sample points
/// @param x, y, z are device coordinate arrays (NULL is treated as all zeros)
/// @param W is a device array of nW plane waves
/// @param a is the radius of the sphere
/// @param n is the complex refractive index of the sphere
/// @param hB is the device look-up table with N_hB rows of (Nl + 1) values
/// @param kr_min and dkr are forwarded to the kernel as the radius of the first LUT row and the row spacing
/// @param Nl is the highest order stored in the LUT (the kernel requires Nl >= LOCAL_NL)
template<typename T>
void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
	
	size_t max_shared_mem = stim::sharedMemPerBlock();	
	size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);							//conservative per-thread footprint used to size the block
	size_t spill = (Nl > LOCAL_NL) ? sizeof(stim::complex<T>) * (Nl - LOCAL_NL) : 0;	//shared memory actually used by each thread

	int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);					//largest multiple of a warp that fits
	if(threads == 0){																//fewer than 32 threads fit under the conservative estimate:
		if(32 * spill > max_shared_mem){											//	fall back to a single warp when its real requirement fits,
			printf ("CUDA Error: not enough shared memory to evaluate the requested number of orders.\n");
			exit(1);																//	otherwise the kernel cannot be launched at all
		}
		threads = 32;
	}
	dim3 blocks((unsigned)(N / threads + 1));										//enough blocks to cover all N points
	size_t shared_mem = threads * spill;											//amount of shared memory to allocate

	cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl);	//call the kernel
	HANDLE_ERROR( cudaGetLastError() );												//report an invalid launch configuration instead of silently skipping the kernel
}
/// Kernel: store the length of the position vector (x[i], y[i], z[i]) in r[i] for each of N points.
/// A NULL coordinate array is treated as all zeros. One thread handles one point.
template<typename T>
__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){
	const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;		//global index of the point handled by this thread
	if(idx < N){													//threads past the end of the array do nothing
		stim::vec3<T> pt;

		//fill in each component, substituting zero for a missing coordinate array
		if(x == NULL) pt[0] = 0;
		else          pt[0] = x[idx];

		if(y == NULL) pt[1] = 0;
		else          pt[1] = y[idx];

		if(z == NULL) pt[2] = 0;
		else          pt[2] = z[idx];

		r[idx] = pt.len();
	}
}
/// Add the scalar Mie scattered field for a set of plane waves to a field stored on the GPU.
/// E, x, y and z are handed directly to kernels, so they must be device pointers. The result is
/// accumulated into E in device memory; nothing is copied back to the host.
/// NOTE: the radial range is found with cublasIsamin/cublasIsamax, which are the single-precision
/// cuBLAS routines, so this function only instantiates for T = float.
/// @param E is the device field that the scattered contribution is added to (N values)
/// @param N is the number of sample points (passed to cuBLAS as an int)
/// @param x, y, z are device coordinate arrays relative to the sphere (NULL is treated as all zeros)
/// @param W is the list of plane waves; the orders and coefficients are derived from W[0].kmag()
/// @param a is the radius of the sphere
/// @param n is the complex refractive index of the sphere
/// @param r_spacing is the requested spacing of the radial look-up table (must be positive)
template<typename T>
void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){
	
	if(N == 0 || W.empty()) return;											//nothing to evaluate (and W[0] / the cuBLAS indices would be invalid)

	//calculate the necessary number of orders required to represent the scattered field
	T k = W[0].kmag();
	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
	if(Nl < LOCAL_NL) Nl = LOCAL_NL;							//always do at least the minimum number of local operations (kernel optimization)

	// RADIAL RANGE
	//calculate the distance of every point from the sphere center
	T* dev_r;
	HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
		
	int threads = stim::maxThreadsPerBlock();
	dim3 blocks((unsigned)(N / threads + 1));
	cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N);
	HANDLE_ERROR( cudaGetLastError() );

	//Find the minimum and maximum values of r
    cublasStatus_t stat;
    cublasHandle_t handle;
	stat = cublasCreate(&handle);							//create a cuBLAS handle
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS initialization failed\n");
		exit(1);
	}
	int i_min, i_max;
	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS Error: failed to calculate minimum r value.\n");
		exit(1);
	}
	stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS Error: failed to calculate maximum r value.\n");
		exit(1);
	}
	cublasDestroy(handle);									//the handle is no longer needed
	i_min--;				//cuBLAS uses 1-based indexing for Fortran compatibility
	i_max--;
	T r_min, r_max;											//allocate space to store the minimum and maximum values
	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
	HANDLE_ERROR( cudaFree(dev_r) );

	r_min = max(r_min, a);									//if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere)
	if(r_max < r_min) return;								//every point is inside the sphere: there is no scattered field to add

	//calculate the scattering coefficients for the sphere
	stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
	B_coefficients(B, a, k, n, Nl);	

	// BESSEL FUNCTION LOOK-UP TABLE
	size_t N_samples = (size_t)((r_max - r_min) / r_spacing + 1);			//number of samples based on the user-specified spacing along r
	if(N_samples < 2) N_samples = 2;										//always keep two samples so that the spacing is well defined
	T dr = (r_max - r_min) / (N_samples - 1);								//distance between values in the LUT
	if(!(dr > 0)) dr = r_spacing;											//all points share one radius: any positive spacing will do
	size_t N_hB_lut = N_samples + 1;										//one extra row past r_max, so interpolating at r == r_max stays inside the table

	//temporary variables
	double vm;															//receives the highest order computed by the bessel routine
	double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
	size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;
	stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes);													//pointer to the look-up table
	stim::complex<T> hl;
	for(size_t ri = 0; ri < N_hB_lut; ri++){													//for each value in the LUT
		stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
		for(size_t l = 0; l <= (size_t)Nl; l++){											//for each order
			hl.r = (T)jv[l];
			hl.i = (T)yv[l];
			hB_lut[ri * (Nl + 1) + l] = hl * B[l];										//store the product h_l * B_l
		}
	}

	//	PLANE WAVES
	stim::scalarwave<T>* dev_W;																//allocate space and copy plane waves
	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );

	//Allocate device memory and copy everything to the GPU
	stim::complex<T>* dev_hB_lut;
	HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
	HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );

	gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl);

	//E is already a device pointer, so the result stays where the kernel wrote it. (The previous
	//	cudaMemcpy(E, E, ..., cudaMemcpyDeviceToHost) used a device pointer as a host destination.)

	//free everything allocated by this function (cudaFree waits for the kernel to finish)
	HANDLE_ERROR( cudaFree(dev_hB_lut) );
	HANDLE_ERROR( cudaFree(dev_W) );
	free(hB_lut);
	free(jv);
	free(yv);
	free(djv);
	free(dyv);
	free(B);
}
/// Calculate the scalar Mie solution for the scattered field produced by one or more plane waves.
/// The scattered field is ADDED to the values already stored in E, and only for points with r >= a.
/// All pointers are host pointers; when CUDA_FOUND is defined the work is done on the GPU and the
/// result is copied back to E.
/// @param E is a pointer to the destination field values
/// @param N is the number of points used to calculate the field
/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
/// @param W is an array of planewaves that will be scattered (the number of orders and the coefficients are derived from W[0])
/// @param a is the radius of the sphere
/// @param n is the complex refractive index of the sphere
/// @param r_spacing is the spacing of the radial look-up table used by the GPU implementation (unused on the CPU path)
template<typename T>
void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){
	
	if(N == 0 || W.empty()) return;									//nothing to evaluate (and W[0] would be invalid)

#ifdef CUDA_FOUND
	stim::complex<T>* dev_E;										//allocate space for the field
	HANDLE_ERROR(cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)));
	HANDLE_ERROR(cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice));	//the kernel sums into the existing field
	//	COORDINATES
	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
	if(x != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_y = NULL;												//allocate space and copy the Y coordinate (if specified)
	if(y != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_z = NULL;												//allocate space and copy the Z coordinate (if specified)
	if(z != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, r_spacing);

	//copy the summed field back to the caller (previously the device result was freed without ever being read)
	HANDLE_ERROR(cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost));

	if(dev_x != NULL) HANDLE_ERROR(cudaFree(dev_x));				//free everything
	if(dev_y != NULL) HANDLE_ERROR(cudaFree(dev_y));
	if(dev_z != NULL) HANDLE_ERROR(cudaFree(dev_z));
	HANDLE_ERROR(cudaFree(dev_E));
#else
	
	//calculate the necessary number of orders required to represent the scattered field
	T k = W[0].kmag();
	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
	if(Nl < LOCAL_NL) Nl = LOCAL_NL;							//use the same minimum number of orders as the GPU implementation
	//calculate the scattering coefficients for the sphere
	stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
	B_coefficients(B, a, k, n, Nl);
	//allocate space to store the bessel function call results
	double vm;										
	double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );					//Legendre polynomials for orders [0 Nl]
	T r, kr, cos_phi;
	stim::complex<T> h;
	for(size_t i = 0; i < N; i++){
		stim::vec3<T> p;															//declare a 3D point
	
		(x == NULL) ? p[0] = 0 : p[0] = x[i];										// test for NULL values and set positions
		(y == NULL) ? p[1] = 0 : p[1] = y[i];
		(z == NULL) ? p[2] = 0 : p[2] = z[i];
		r = p.len();
		if(r >= a){																	//the scattered field only exists outside the sphere
			for(size_t w = 0; w < W.size(); w++){
				kr = r * W[w].kmag();												//calculate k*r
				stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr);
				cos_phi = p.norm().dot(W[w].kvec().norm());							//calculate the cosine of the angle from the propagating direction
				stim::legendre<T>(Nl, cos_phi, P);
				for(size_t l = 0; l <= (size_t)Nl; l++){
					h.r = j_kr[l];													//Hankel function h = j + i*y
					h.i = y_kr[l];
					E[i] += W[w].E() * B[l] * h * P[l];
				}
			}
		}
	}
	//release the scratch buffers (they were previously leaked on every call)
	free(P);
	free(j_kr);
	free(y_kr);
	free(dj_kr);
	free(dy_kr);
	free(B);
#endif
}
/// Convenience overload: scatter a single plane wave by wrapping it in a one-element list
/// and forwarding every other argument unchanged to the multi-wave version.
template<typename T>
void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){
	std::vector< stim::scalarwave<T> > single_wave;			//list holding only the given wave
	single_wave.push_back(w);
	cpu_scalar_mie_scatter(E, N, x, y, z, single_wave, a, n, r_spacing);
}
/// Kernel: evaluate the field inside the sphere at every sample point with r < a.
/// One thread processes one point; the result OVERWRITES E[i]. Points with r >= a are left untouched.
/// Launch requirements: a 1D grid covering N points, and dynamic shared memory of at least
/// blockDim.x * (Nl - LOCAL_NL) * sizeof(stim::complex<T>) bytes. The register loops read orders
/// 0..LOCAL_NL from the table, so Nl must be >= LOCAL_NL.
/// @param E is the field that receives the internal field values (N values)
/// @param N is the number of sample points
/// @param x, y, z are the point coordinates relative to the sphere (a NULL array is treated as all zeros)
/// @param W is the array of plane waves and nW is the number of waves
/// @param a is the radius of the sphere
/// @param n is the refractive index of the sphere (not used by this kernel)
/// @param jA is a look-up table with N_jA rows of (Nl + 1) values, row j corresponding to r = r_min + j * dr
/// @param r_min is the radius of the first LUT row and dr is the spacing between rows
/// @param N_jA is the number of rows in the LUT
/// @param Nl is the highest order stored in each LUT row
template<typename T>
__global__ void cuda_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, int Nl){
	extern __shared__ stim::complex<T> shared_jA[];		//per-thread storage for the orders above LOCAL_NL
	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
	if(i >= N) return;													//exit if this thread is outside the array
	stim::vec3<T> p;
	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
	(y == NULL) ? p[1] = 0 : p[1] = y[i];
	(z == NULL) ? p[2] = 0 : p[2] = z[i];
	
	T r = p.len();														//calculate the distance from the sphere
	if(r >= a) return;													//exit if the point is outside the sphere (we only calculate the internal field)

	//Locate the two LUT rows that bracket r. The index is clamped to the table so that a point
	//	at (or beyond) the last row interpolates between the final two rows instead of reading
	//	past the end of the table, and a degenerate spacing never produces a NaN/inf index.
	size_t ij = 0;														//index of the lower LUT row
	T alpha = 0;														//interpolation weight of the upper LUT row
	if(N_jA > 1 && dr > 0){
		T fij = (r - r_min) / dr;										//FP index into the LUT
		if(fij >= (T)(N_jA - 1)){										//at or past the last row
			ij = N_jA - 2;
			alpha = 1;
		}
		else if(fij > 0){												//(also false when fij is NaN)
			ij = (size_t) fij;											//convert to an integral index
			alpha = fij - ij;											//calculate the fractional portion of the index
		}
	}
	size_t n0j = ij * (Nl + 1);											//start of the first entry in the LUT
	size_t n1j = (N_jA > 1) ? (ij + 1) * (Nl + 1) : n0j;				//start of the second entry in the LUT (same row for a single-row table)
	T cos_phi;	
	T Pl_2, Pl_1, Pl;														//declare registers to store the previous two Legendre polynomials
	
	stim::complex<T> Ei = 0;											//create a register to store the result
	int l;
	stim::complex<T> jlAl[LOCAL_NL+1];									//the first LOCAL_NL+1 components are stored in registers for speed
	int shared_start = threadIdx.x * (Nl - LOCAL_NL);					//offset of this thread's slice of shared memory
	#pragma unroll LOCAL_NL+1											//copy the first LOCAL_NL+1 j_l * A_l components to registers
	for(l = 0; l <= LOCAL_NL; l++)
		jlAl[l] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha );
	
	for(l = LOCAL_NL+1; l <= Nl; l++)									//copy any additional j_l * A_l components to shared memory
		shared_jA[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha );
	for(size_t w = 0; w < nW; w++){										//for each plane wave
		if(r == 0) cos_phi = 0;											//the direction is undefined at the center of the sphere
		else
			cos_phi = p.norm().dot(W[w].kvec().norm());						//calculate the cosine of the angle between the k vector and the direction from the sphere
		Pl_2 = 1;														//the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
		Pl_1 = cos_phi;
		Ei += W[w].E() * jlAl[0] * Pl_2;								//unroll the first two orders using the initial steps of the Legendre recursive relation
		Ei += W[w].E() * jlAl[1] * Pl_1;		
		#pragma unroll LOCAL_NL-1										//unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
		for(l = 2; l <= LOCAL_NL; l++){
			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);	//calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
			Ei += W[w].E() * jlAl[l] * Pl;								//calculate and sum the current field order
			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;
		}
		for(l = LOCAL_NL+1; l <= Nl; l++){											//do the same as above, except for any additional orders that are stored in shared memory (not registers)
			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);				//again, this is where most computation in the kernel occurs
			Ei += W[w].E() * shared_jA[shared_start + l - LOCAL_NL - 1] * Pl;
			Pl_2 = Pl_1;															//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;			
		}
	}
	E[i] = Ei;															//overwrite the value in device memory with the internal field
}
/// Choose a launch configuration and run the cuda_scalar_mie_internal kernel.
/// All pointer arguments are passed straight to the kernel, so they must be device pointers.
/// @param E is the device field that receives the internal field (N values)
/// @param N is the number of sample points
/// @param x, y, z are device coordinate arrays (NULL is treated as all zeros)
/// @param W is a device array of nW plane waves
/// @param a is the radius of the sphere
/// @param n is the complex refractive index of the sphere
/// @param jA is the device look-up table with N_jA rows of (Nl + 1) values
/// @param r_min and dr are the radius of the first LUT row and the row spacing
/// @param Nl is the highest order stored in the LUT (the kernel requires Nl >= LOCAL_NL)
template<typename T>
void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, size_t Nl){
	
	size_t max_shared_mem = stim::sharedMemPerBlock();	
	size_t jAl_array = sizeof(stim::complex<T>) * (Nl + 1);							//conservative per-thread footprint used to size the block
	size_t spill = (Nl > LOCAL_NL) ? sizeof(stim::complex<T>) * (Nl - LOCAL_NL) : 0;	//shared memory actually used by each thread

	int threads = (int)((max_shared_mem / jAl_array) / 32 * 32);					//largest multiple of a warp that fits
	if(threads == 0){																//fewer than 32 threads fit under the conservative estimate:
		if(32 * spill > max_shared_mem){											//	fall back to a single warp when its real requirement fits,
			printf ("CUDA Error: not enough shared memory to evaluate the requested number of orders.\n");
			exit(1);																//	otherwise the kernel cannot be launched at all
		}
		threads = 32;
	}
	dim3 blocks((unsigned)(N / threads + 1));										//enough blocks to cover all N points
	size_t shared_mem = threads * spill;											//amount of shared memory to allocate

	cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl);	//call the kernel
	HANDLE_ERROR( cudaGetLastError() );												//report an invalid launch configuration instead of silently skipping the kernel
}
/// Calculate the scalar Mie solution for the internal field produced by one or more plane waves scattered by a sphere.
/// Field values at points with r < a are OVERWRITTEN with the internal field; all other values in E are left unchanged.
/// All pointers are host pointers; when CUDA_FOUND is defined the work is done on the GPU and the result is copied back to E.
/// NOTE: the GPU path finds the radial range with cublasIsamin/cublasIsamax, which are the single-precision
/// cuBLAS routines, so that path only instantiates for T = float.
/// @param E is a pointer to the destination field values
/// @param N is the number of points used to calculate the field
/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
/// @param W is an array of planewaves that will be scattered (the number of orders and the coefficients are derived from W[0])
/// @param a is the radius of the sphere
/// @param n is the complex refractive index of the sphere
/// @param r_spacing is the requested spacing of the radial look-up table used by the GPU path (must be positive; unused on the CPU path)
template<typename T>
void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, T r_spacing = 0.1){

	if(N == 0 || W.empty()) return;									//nothing to evaluate (and W[0] / the cuBLAS indices would be invalid)

	//calculate the necessary number of orders required to represent the scattered field
	T k = W[0].kmag();
	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
	if(Nl < LOCAL_NL) Nl = LOCAL_NL;							//always do at least the minimum number of local operations (kernel optimization)
	//calculate the scattering coefficients for the sphere
	stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
	A_coefficients(A, a, k, n, Nl);
#ifdef CUDA_FOUND
	//	COORDINATES
	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
	if(x != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_y = NULL;												//allocate space and copy the Y coordinate (if specified)
	if(y != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_z = NULL;												//allocate space and copy the Z coordinate (if specified)
	if(z != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
	}

	// RADIAL RANGE
	//calculate the distance of every point from the sphere center
	T* dev_r;
	HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
		
	int threads = stim::maxThreadsPerBlock();
	dim3 blocks((unsigned)(N / threads + 1));
	cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
	HANDLE_ERROR( cudaGetLastError() );

	//Find the minimum and maximum values of r
    cublasStatus_t stat;
    cublasHandle_t handle;
	stat = cublasCreate(&handle);							//create a cuBLAS handle
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS initialization failed\n");
		exit(1);
	}
	int i_min, i_max;
	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS Error: failed to calculate minimum r value.\n");
		exit(1);
	}
	stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS Error: failed to calculate maximum r value.\n");
		exit(1);
	}
	cublasDestroy(handle);									//the handle is no longer needed
	i_min--;				//cuBLAS uses 1-based indexing for Fortran compatibility
	i_max--;
	T r_min, r_max;											//allocate space to store the minimum and maximum values
	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
	HANDLE_ERROR( cudaFree(dev_r) );

	if(r_min < a){											//at least one point lies inside the sphere (otherwise E is left unchanged)
		r_max = min(r_max, a);		//the internal field doesn't exist outside of the sphere

		// BESSEL FUNCTION LOOK-UP TABLE
		size_t N_samples = (size_t)((r_max - r_min) / r_spacing + 1);		//number of samples based on the user-specified spacing along r
		if(N_samples < 2) N_samples = 2;									//always keep two samples so that the spacing is well defined
		T dr = (r_max - r_min) / (N_samples - 1);							//distance between values in the LUT
		if(!(dr > 0)) dr = r_spacing;										//all interior points share one radius: any positive spacing will do
		size_t N_jA_lut = N_samples + 1;									//one extra row past r_max, so interpolating at r == r_max stays inside the table

		//temporary variables
		double vm;															//receives the highest order computed by the bessel routine
		stim::complex<double>* jv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
		stim::complex<double>* yv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
		stim::complex<double>* djv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
		stim::complex<double>* dyv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
		size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut;
		stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes);													//pointer to the look-up table
		stim::complex<double> nd = (stim::complex<double>)n;
		for(size_t ri = 0; ri < N_jA_lut; ri++){													//for each value in the LUT
			stim::cbessjyva_sph<double>(Nl, nd * k * (r_min + ri * dr), vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
			for(size_t l = 0; l <= (size_t)Nl; l++){												//for each order
				jA_lut[ri * (Nl + 1) + l] = (stim::complex<T>)(jv[l] * (stim::complex<double>)A[l]);	//store the product j_l * A_l
			}
		}

		//	FIELD
		stim::complex<T>* dev_E;										//allocate space for the field
		HANDLE_ERROR( cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)) );
		HANDLE_ERROR( cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice) );	//points outside the sphere keep their current value

		//	PLANE WAVES
		stim::scalarwave<T>* dev_W;																//allocate space and copy plane waves
		HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
		HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );

		//Allocate device memory and copy everything to the GPU
		stim::complex<T>* dev_jA_lut;
		HANDLE_ERROR( cudaMalloc(&dev_jA_lut, jA_bytes) );
		HANDLE_ERROR( cudaMemcpy(dev_jA_lut, jA_lut, jA_bytes, cudaMemcpyHostToDevice) );

		gpu_scalar_mie_internal<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_jA_lut, r_min, dr, N_jA_lut, Nl);
		HANDLE_ERROR( cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost) );		//copy the field from device memory

		//free everything allocated for the evaluation (dev_E is freed exactly once)
		HANDLE_ERROR( cudaFree(dev_jA_lut) );
		HANDLE_ERROR( cudaFree(dev_W) );
		HANDLE_ERROR( cudaFree(dev_E) );
		free(jA_lut);
		free(jv);
		free(yv);
		free(djv);
		free(dyv);
	}
	if(dev_x != NULL) HANDLE_ERROR( cudaFree(dev_x) );					//free the coordinate copies
	if(dev_y != NULL) HANDLE_ERROR( cudaFree(dev_y) );
	if(dev_z != NULL) HANDLE_ERROR( cudaFree(dev_z) );
#else
	//allocate space to store the bessel function call results
	double vm;										
	stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* y_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );				//Legendre polynomials for orders [0 Nl]
	T r, cos_phi;
	stim::complex<double> knr;
	for(size_t i = 0; i < N; i++){
		stim::vec3<T> p;									//declare a 3D point
	
		(x == NULL) ? p[0] = 0 : p[0] = x[i];				// test for NULL values and set positions
		(y == NULL) ? p[1] = 0 : p[1] = y[i];
		(z == NULL) ? p[2] = 0 : p[2] = z[i];
		r = p.len();
		if(r < a){											//the internal field only exists inside the sphere
			E[i] = 0;										//the internal field replaces whatever was stored at this point
			for(size_t w = 0; w < W.size(); w++){
				knr = (stim::complex<double>)n * r * W[w].kmag();							//calculate k*n*r
				stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr);
				if(r == 0)
					cos_phi = 0;							//the direction is undefined at the center of the sphere
				else
					cos_phi = p.norm().dot(W[w].kvec().norm());				//calculate the cosine of the angle from the propagating direction
				stim::legendre<T>(Nl, cos_phi, P);
								
				for(size_t l = 0; l <= (size_t)Nl; l++){
					E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l];
				}
			}
		}
	}
	//release the scratch buffers (they were previously leaked on every call)
	free(P);
	free(j_knr);
	free(y_knr);
	free(dj_knr);
	free(dy_knr);
#endif
	free(A);												//release the coefficients (previously leaked on both paths)
}
/// Convenience overload: evaluate the internal field for a single plane wave by wrapping it in a
/// one-element list and forwarding every other argument unchanged to the multi-wave version.
template<typename T>
void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){
	std::vector< stim::scalarwave<T> > single_wave;			//list holding only the given wave
	single_wave.push_back(w);
	cpu_scalar_mie_internal(E, N, x, y, z, single_wave, a, n, r_spacing);
}
/// Class stim::scalarmie represents a scalar Mie scattering model that can be used to calculate the fields produced by a scattering sphere.
/// The precision T of the class is used throughout (the plane-wave decomposition and the Mie
/// routines were previously hard-coded to float, which only matched T = float).
template<typename T>
class scalarmie
{
private:
	T radius;					//radius of the scattering sphere
	stim::complex<T> n;			//refractive index of the scattering sphere
	
public:
	/// Create a scattering model
	/// @param r is the radius of the sphere
	/// @param ri is the complex refractive index of the sphere
	scalarmie(T r, stim::complex<T> ri){
		radius = r;
		n = ri;
	}

	/// Add the scattered field (outside the sphere) produced by a beam to the field E
	/// @param E is the field that is updated
	/// @param X, Y, Z are the coordinates of each field sample, relative to the sphere (NULL assumes all zeros)
	/// @param b is the incident beam
	/// @param samples is the number of plane waves used to represent the beam
	void sum_scat(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
		std::vector< stim::scalarwave<T> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves
		stim::cpu_scalar_mie_scatter<T>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
	}

	/// Replace the field inside the sphere with the internal field produced by a beam
	/// @param E is the field that is updated
	/// @param X, Y, Z are the coordinates of each field sample, relative to the sphere (NULL assumes all zeros)
	/// @param b is the incident beam
	/// @param samples is the number of plane waves used to represent the beam
	void sum_intern(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
		std::vector< stim::scalarwave<T> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves
		stim::cpu_scalar_mie_internal<T>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
	}

	/// Evaluate the incident, scattered, and internal fields at a set of explicitly specified points
	/// @param order is passed to the beam evaluation of the incident field
	/// @param samples is the number of plane waves used for the scattered and internal fields
	void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int order = 500, int samples = 1000){
		b.eval(E, X, Y, Z, order);													//evaluate the incident field using a plane wave expansion
		//sum_scat and sum_intern each build their own plane-wave decomposition, so none is created here
		//	(the decomposition that used to be computed at this point was never used)
		sum_scat(E, X, Y, Z, b, samples);
		sum_intern(E, X, Y, Z, b, samples);
	}

	/// Evaluate the field on the sample grid owned by E
	/// @param order is passed to the beam evaluation of the incident field
	/// @param samples is the number of plane waves used for the scattered and internal fields
	void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){
		E.meshgrid();																//have the field build its own coordinate meshes
		b.eval(E, order);															//evaluate the incident field
		std::vector< stim::scalarwave<T> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves
		if(E.gpu()){
			//NOTE: only the scattered field is added on this path; the internal field is not evaluated here
			stim::gpu_scalar_mie_scatter<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing());
		}
		else{
			stim::cpu_scalar_mie_scatter<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing());
			stim::cpu_scalar_mie_internal<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing());
		}
	}
};			//end stim::scalarmie
}			//end namespace stim
#endif