stim/optics/scalarmie.h

#ifndef STIM_MIE_H
#define STIM_MIE_H
#include <boost/math/special_functions/bessel.hpp>
#include "scalarwave.h"
#include "../math/bessel.h"
#include "../cuda/cudatools/devices.h"
#include <cmath>
namespace stim{
/// Calculate the scattering coefficients for a spherical scatterer
template<typename T>
void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){
	//temporary variables
	double vm;															//allocate space to store the return values for the bessel function calculation
	double* j_ka = (double*) malloc( (Nl + 2) * sizeof(double) );
	double* y_ka = (double*) malloc( (Nl + 2) * sizeof(double) );
	double* dj_ka= (double*) malloc( (Nl + 2) * sizeof(double) );
	double* dy_ka= (double*) malloc( (Nl + 2) * sizeof(double) );
	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)
	stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
	stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a
	stim::complex<double> h_ka, dh_ka;
	stim::complex<double> numerator, denominator;
	stim::complex<double> i(0, 1);
	for(int l = 0; l <= Nl; l++){
		h_ka.r = j_ka[l];
		h_ka.i = y_ka[l];
		dh_ka.r = dj_ka[l];
		dh_ka.i = dy_ka[l];
		numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l];
		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
		B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
	}
	//free memory
	free(j_ka); free(y_ka);	free(dj_ka); free(dy_ka); free(j_kna); free(y_kna); free(dj_kna); free(dy_kna);
}
template<typename T>
void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){
	//temporary variables
	double vm;															//allocate space to store the return values for the bessel function calculation
	double* j_ka = (double*) malloc( (Nl + 2) * sizeof(double) );
	double* y_ka = (double*) malloc( (Nl + 2) * sizeof(double) );
	double* dj_ka= (double*) malloc( (Nl + 2) * sizeof(double) );
	double* dy_ka= (double*) malloc( (Nl + 2) * sizeof(double) );
	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)
	stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
	stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a
	stim::complex<double> h_ka, dh_ka;
	stim::complex<double> numerator, denominator;
	stim::complex<double> i(0, 1);
	for(size_t l = 0; l <= Nl; l++){
		h_ka.r = j_ka[l];
		h_ka.i = y_ka[l];
		dh_ka.r = dj_ka[l];
		dh_ka.i = dy_ka[l];
		numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka;
		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
		A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
	}
	//free memory
	free(j_ka);	free(y_ka);	free(dj_ka); free(dy_ka); free(j_kna); free(y_kna); free(dj_kna); free(dy_kna);
}
#define LOCAL_NL	16
/// CUDA kernel for calculating the Mie scattering solution given a set of points (x, y, z), a list of plane waves, and a look-up table for Bl*hl
/// @param E (GPU) is the N x N destination scalar field
/// @param N is the number of sample points to evaluate
/// @param x (GPU) is the grid of X coordinates for each point in E
/// @param y (GPU) is the grid of Y coordinates for each point in E
/// @param z (GPU) is the grid of Z coordinates for each point in E
/// @param W (GPU) is an array of coherent scalar plane waves incident on the Mie scatterer
/// @param nW is the number of plane waves to evaluate (sum)
/// @param a is the radius of the Mie scatterer
/// @param n is the complex refractive index of the Mie scatterer
/// @param c is the position of the sphere in (x, y, z)
/// @param hB (GPU) is a look-up table of Hankel functions (equally spaced in distance from the sphere) pre-multiplied with scattering coefficients
/// @param kr_min is the minimum kr value in the hB look-up table (corresponding to the closest point to the sphere)
/// @param dkr is the spacing (in kr) between samples of the hB look-up table
/// @param N_hB is the number of samples in hB
/// @param Nl is the order of the calculation (number of Hankel function orders)
template<typename T>
__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){
	extern __shared__ stim::complex<T> shared_hB[];						//declare the list of waves in shared memory
	size_t i = blockIdx.x * blockDim.x + threadIdx.x;					//get the index into the sample array (sample point associated with this thread)
	if(i >= N) return;													//exit if this thread is outside the array
	stim::vec3<T> p;
	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
	(y == NULL) ? p[1] = 0 : p[1] = y[i];
	(z == NULL) ? p[2] = 0 : p[2] = z[i];
	p = p - c;
	T r = p.len();														//calculate the distance from the sphere
	if(r < a) return;													//exit if the point is inside the sphere (we only calculate the internal field)
	T fij = (r - r_min)/dr;												//FP index into the spherical bessel LUT
	size_t ij = (size_t) fij;											//convert to an integral index
	T alpha = fij - ij;													//calculate the fractional portion of the index
	size_t n0j = ij * (Nl + 1);											//start of the first entry in the LUT
	size_t n1j = (ij+1) * (Nl + 1);										//start of the second entry in the LUT
	T cos_phi;	
	T Pl_2, Pl_1, Pl;													//declare registers to store the previous two Legendre polynomials
	
	stim::complex<T> hBl;
	stim::complex<T> Ei = 0;											//create a register to store the result
	int l;
	stim::complex<T> hlBl[LOCAL_NL+1];									//the first LOCAL_NL components are stored in registers for speed
	int shared_start = threadIdx.x * (Nl - LOCAL_NL);					//wrap up some operations so that they aren't done in the main loops
	//unroll LOCAL_NL + 1
	#pragma unroll 17													//copy the first LOCAL_NL+1 h_l * B_l components to registers
	for(l = 0; l <= LOCAL_NL; l++)
		hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
	
	for(l = LOCAL_NL+1; l <= Nl; l++)									//copy any additional h_l * B_l components to shared memory
		shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
	complex<T> e, Ew;
	for(size_t w = 0; w < nW; w++){										//for each plane wave
		cos_phi = p.norm().dot(W[w].kvec().norm());						//calculate the cosine of the angle between the k vector and the direction from the sphere
		Pl_2 = 1;														//the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
		Pl_1 = cos_phi;
		e = exp(complex<T>(0, W[w].kvec().dot(c)));
		Ew = W[w].E() * e;
		Ei += Ew * hlBl[0] * Pl_2;								//unroll the first two orders using the initial steps of the Legendre recursive relation
		Ei += Ew * hlBl[1] * Pl_1;		
		//LOCAL_NL - 1
		#pragma unroll 15										//unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
		for(l = 2; l <= LOCAL_NL; l++){
			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);	//calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
			Ei += Ew * hlBl[l] * Pl;								//calculate and sum the current field order
			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;
		}
		for(l = LOCAL_NL+1; l <= Nl; l++){											//do the same as above, except for any additional orders that are stored in shared memory (not registers)
			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);				//again, this is where most computation in the kernel occurs
			Ei += Ew * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl;
			Pl_2 = Pl_1;															//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;			
		}
	}
	E[i] += Ei;															//copy the result to device memory
}
///Calculate the scalar Mie scattered field on the GPU when a list of GPU-based pre-multiplied Hankel functions are available
/// @param E (GPU) is the N x N destination scalar field
/// @param N is the number fo elements of the scalar field in each direction
/// @param x (GPU) is the grid of X coordinates for each point in E
/// @param y (GPU) is the grid of Y coordinates for each point in E
/// @param z (GPU) is the grid of Z coordinates for each point in E
/// @param W (GPU) is an array of coherent scalar plane waves incident on the Mie scatterer
/// @param nW is the number of plane waves to evaluate (sum)
/// @param a is the radius of the Mie scatterer
/// @param n is the complex refractive index of the Mie scatterer
/// @param c is the position of the sphere in (x, y, z)
/// @param hB (GPU) is a look-up table of Hankel functions (equally spaced in distance from the sphere) pre-multiplied with scattering coefficients
/// @param kr_min is the minimum kr value in the hB look-up table (corresponding to the closest point to the sphere)
/// @param dkr is the spacing (in kr) between samples of the hB look-up table
/// @param N_hB is the number of samples in hB
/// @param Nl is the order of the calculation (number of Hankel function orders)
template<typename T>
void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
	
	size_t max_shared_mem = stim::sharedMemPerBlock();								//get the amount of shared memory per block
	size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);							//calculate the number of bytes required to store the LUT corresponding to a single sample in shared memory
	int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);					//calculate the optimal number of threads per block (make sure it's divisible by the number of warps - 32)
	dim3 blocks((unsigned)(N / threads + 1));										//calculate the optimal number of blocks
	size_t shared_mem;
	if(Nl <= LOCAL_NL) shared_mem = 0;
	else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL);				//amount of shared memory to allocate
	//std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
	cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, c, hB, kr_min, dkr, N_hB, (int)Nl);	//call the kernel
}
template<typename T>
__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
	if(i >= N) return;													//exit if this thread is outside the array
	stim::vec3<T> p;
	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
	(y == NULL) ? p[1] = 0 : p[1] = y[i];
	(z == NULL) ? p[2] = 0 : p[2] = z[i];
	r[i] = (p - c).len();
}
///Calculate the scalar Mie scattered field on the GPU
/// @param E (GPU) is the N x N destination scalar field
/// @param N is the number of sample points of the scalar field
/// @param x (GPU) is the grid of X coordinates for each point in E
/// @param y (GPU) is the grid of Y coordinates for each point in E
/// @param z (GPU) is the grid of Z coordinates for each point in E
/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer
/// @param a is the radius of the Mie scatterer
/// @param n is the complex refractive index of the Mie scatterer
/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables)
template<typename T>
void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0), T r_spacing = 0.1){
	
	//calculate the necessary number of orders required to represent the scattered field
	T k = W[0].kmag();
	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);			//calculate the number of orders required to represent the sphere
	if(Nl < LOCAL_NL) Nl = LOCAL_NL;							//always do at least the minimum number of local operations (kernel optimization)
	//calculate the scattering coefficients for the sphere
	stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
	B_coefficients(B, a, k, n, Nl);																//calculate the scattering coefficients	
	
	//	PLANE WAVES
	stim::scalarwave<T>* dev_W;																	//allocate space and copy plane waves
	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
	// BESSEL FUNCTION LOOK-UP TABLE
	//calculate the distance from the sphere center at each sample point and store the result in dev_r
	T* dev_r;																					//declare the device pointer to hold the distance from the sphere center
	HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );											//allocate space for the array of distances
		
	int threads = stim::maxThreadsPerBlock();													//query the device to find the maximum number of threads per block
	dim3 blocks((unsigned)(N / threads + 1));													//calculate the number of blocks necessary to evaluate the total number of sample points N
	cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N, c);									//calculate the distance
	//Use the cuBLAS library to find the minimum and maximum distances from the sphere center. This will be used to create a look-up table for the Hankel functions
    cublasStatus_t stat;
    cublasHandle_t handle;
	stat = cublasCreate(&handle);							//create a cuBLAS handle
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS initialization failed\n");
		exit(1);
	}
	int i_min, i_max;
	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS Error: failed to calculate minimum r value.\n");
		exit(1);
	}
	stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS Error: failed to calculate maximum r value.\n");
		exit(1);
	}
	i_min--;				//cuBLAS uses 1-based indexing for Fortran compatibility
	i_max--;
	T r_min, r_max;											//allocate space to store the minimum and maximum values
	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
	r_min = max(r_min, a);												//if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere)
	size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1);		//number of values in the look-up table based on the user-specified spacing along r
	//Declare and evaluate variables used to calculate the spherical Bessel functions and store them temporarily on the CPU
	double vm;																	//allocate space to store the return values for the bessel function calculation
	double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
	size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;						//calculate the number of bytes necessary to store the Hankel function LUT
	stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes);					//pointer to the look-up table
	T dr = (r_max - r_min) / (N_hB_lut-1);												//calculate the optimal distance between values in the LUT
	stim::complex<T> hl;																//declare a complex value for the Hankel function result
	for(size_t ri = 0; ri < N_hB_lut; ri++){											//for each value in the LUT
		stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
		for(size_t l = 0; l <= Nl; l++){												//for each order
			hl.r = (T)jv[l];															//generate the spherical Hankel function from the Bessel functions
			hl.i = (T)yv[l];
			hB_lut[ri * (Nl + 1) + l] = hl * B[l];										//pre-multiply the Hankel function by the scattering coefficients
		}
	}
	//Copy the pre-multiplied Hankel function look-up table to the GPU - this LUT gives a list of uniformly spaced Hankel function values pre-multiplied by scattering coefficients
	stim::complex<T>* dev_hB_lut;
	HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
	HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
	//calculate the Mie scattering solution on the GPU
	gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, c, dev_hB_lut, r_min, dr, N_hB_lut, Nl);
	//HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost));			//copy the field from device memory
	HANDLE_ERROR(cudaFree(dev_hB_lut));
	HANDLE_ERROR(cudaFree(dev_r));
	HANDLE_ERROR(cudaFree(dev_W));
}
/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
/// @param E is a pointer to the destination field values
/// @param N is the number of points used to calculate the field
/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
/// @param W is an array of planewaves that will be scattered
/// @param a is the radius of the sphere
/// @param n is the complex refractive index of the sphere
template<typename T>
void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
	
/*#ifdef CUDA_FOUND
	stim::complex<T>* dev_E;										//allocate space for the field
	cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
	cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
	//cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>));				//set the field to zero (necessary because a sum is used)
	//	COORDINATES
	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
	if(x != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_y = NULL;												//allocate space and copy the Y coordinate (if specified)
	if(y != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_z = NULL;												//allocate space and copy the Z coordinate (if specified)
	if(z != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, c, r_spacing);
	if(x != NULL) cudaFree(dev_x);														//free everything
	if(y != NULL) cudaFree(dev_y);
	if(z != NULL) cudaFree(dev_z);
	cudaFree(dev_E);
#else
*/
	
	//calculate the necessary number of orders required to represent the scattered field
	T k = W[0].kmag();
	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
	if(Nl < LOCAL_NL) Nl = LOCAL_NL;							//always do at least the minimum number of local operations (kernel optimization)
	//std::cout<<"Nl: "<<Nl<<std::endl;
	//calculate the scattering coefficients for the sphere
	stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
	B_coefficients(B, a, k, n, Nl);
	//allocate space to store the bessel function call results
	double vm;										
	double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );
	T r, kr, cos_phi;
	stim::complex<T> h;
	stim::complex<T> Ew;
	for(size_t i = 0; i < N; i++){
		stim::vec3<T> p;															//declare a 3D point
	
		(x == NULL) ? p[0] = 0 : p[0] = x[i];										// test for NULL values and set positions
		(y == NULL) ? p[1] = 0 : p[1] = y[i];
		(z == NULL) ? p[2] = 0 : p[2] = z[i];
		p = p - c;
		r = p.len();
		if(r >= a){
			for(size_t w = 0; w < W.size(); w++){
				Ew = W[w].E() * exp(stim::complex<float>(0, W[w].kvec().dot(c)));
				kr = p.len() * W[w].kmag();											//calculate k*r
				stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr);
				cos_phi = p.norm().dot(W[w].kvec().norm());							//calculate the cosine of the angle from the propagating direction
				stim::legendre<T>(Nl, cos_phi, P);
				for(size_t l = 0; l <= Nl; l++){
					h.r = j_kr[l];
					h.i = y_kr[l];
					E[i] += Ew * B[l] * h * P[l];
				}
			}
		}
	}
//#endif
}
template<typename T>
void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0), T r_spacing = 0.1){
	std::vector< stim::scalarwave<T> > W(1, w);
	cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, c, r_spacing);
}
template<typename T>
__global__ void cuda_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, int Nl){
	extern __shared__ stim::complex<T> shared_jA[];		//declare the list of waves in shared memory
	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
	if(i >= N) return;													//exit if this thread is outside the array
	stim::vec3<T> p;
	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
	(y == NULL) ? p[1] = 0 : p[1] = y[i];
	(z == NULL) ? p[2] = 0 : p[2] = z[i];
	
	T r = p.len();														//calculate the distance from the sphere
	if(r >= a) return;													//exit if the point is inside the sphere (we only calculate the internal field)
	T fij = (r - r_min)/dr;												//FP index into the spherical bessel LUT
	size_t ij = (size_t) fij;											//convert to an integral index
	T alpha = fij - ij;													//calculate the fractional portion of the index
	size_t n0j = ij * (Nl + 1);											//start of the first entry in the LUT
	size_t n1j = (ij+1) * (Nl + 1);										//start of the second entry in the LUT
	T cos_phi;	
	T Pl_2, Pl_1, Pl;													//declare registers to store the previous two Legendre polynomials
	
	stim::complex<T> jAl;
	stim::complex<T> Ei = 0;											//create a register to store the result
	int l;
	stim::complex<T> jlAl[LOCAL_NL+1];									//the first LOCAL_NL components are stored in registers for speed
	int shared_start = threadIdx.x * (Nl - LOCAL_NL);					//wrap up some operations so that they aren't done in the main loops
	#pragma unroll LOCAL_NL+1											//copy the first LOCAL_NL+1 h_l * B_l components to registers
	for(l = 0; l <= LOCAL_NL; l++)
		jlAl[l] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha );
	
	for(l = LOCAL_NL+1; l <= Nl; l++)									//copy any additional h_l * B_l components to shared memory
		shared_jA[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha );
	for(size_t w = 0; w < nW; w++){										//for each plane wave
		if(r == 0) cos_phi = 0;
		else
			cos_phi = p.norm().dot(W[w].kvec().norm());					//calculate the cosine of the angle between the k vector and the direction from the sphere
		Pl_2 = 1;														//the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
		Pl_1 = cos_phi;
		Ei += W[w].E() * jlAl[0] * Pl_2;								//unroll the first two orders using the initial steps of the Legendre recursive relation
		Ei += W[w].E() * jlAl[1] * Pl_1;		
		#pragma unroll LOCAL_NL-1										//unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
		for(l = 2; l <= LOCAL_NL; l++){
			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);	//calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
			Ei += W[w].E() * jlAl[l] * Pl;								//calculate and sum the current field order
			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;
		}
		for(l = LOCAL_NL+1; l <= Nl; l++){											//do the same as above, except for any additional orders that are stored in shared memory (not registers)
			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);				//again, this is where most computation in the kernel occurs
			Ei += W[w].E() * shared_jA[shared_start + l - LOCAL_NL - 1] * Pl;
			Pl_2 = Pl_1;															//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;			
		}
	}
	E[i] = Ei;															//copy the result to device memory
}
template<typename T>
void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, size_t Nl){
	
	size_t max_shared_mem = stim::sharedMemPerBlock();	
	size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
	//std::cout<<"hl*Bl array size:  "<<hBl_array<<std::endl;
	//std::cout<<"shared memory:     "<<max_shared_mem<<std::endl;
	int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);
	//std::cout<<"threads per block: "<<threads<<std::endl;
	dim3 blocks((unsigned)(N / threads + 1));										//calculate the optimal number of blocks
	size_t shared_mem;
	if(Nl <= LOCAL_NL) shared_mem = 0;
	else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL);				//amount of shared memory to allocate
	//std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
	cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl);	//call the kernel
}
/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere
/// @param E is a pointer to the destination field values
/// @param N is the number of points used to calculate the field
/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
/// @param w is a planewave that will be scattered
/// @param a is the radius of the sphere
/// @param n is the complex refractive index of the sphere
template<typename T>
void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
//calculate the necessary number of orders required to represent the scattered field
	T k = W[0].kmag();
	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
	if(Nl < LOCAL_NL) Nl = LOCAL_NL;							//always do at least the minimum number of local operations (kernel optimization)
	//std::cout<<"Nl: "<<Nl<<std::endl;
	//calculate the scattering coefficients for the sphere
	stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
	A_coefficients(A, a, k, n, Nl);
/*#ifdef CUDA_FOUND
	stim::complex<T>* dev_E;										//allocate space for the field
	cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
	cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
	//cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>));				//set the field to zero (necessary because a sum is used)
	//	COORDINATES
	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
	if(x != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_y = NULL;												//allocate space and copy the Y coordinate (if specified)
	if(y != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_z = NULL;												//allocate space and copy the Z coordinate (if specified)
	if(z != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	//	PLANE WAVES
	stim::scalarwave<T>* dev_W;																//allocate space and copy plane waves
	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
	// BESSEL FUNCTION LOOK-UP TABLE
	//calculate the distance from the sphere center
	T* dev_r;
	HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
		
	int threads = stim::maxThreadsPerBlock();
	dim3 blocks((unsigned)(N / threads + 1));
	cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
	//Find the minimum and maximum values of r
    cublasStatus_t stat;
    cublasHandle_t handle;
	stat = cublasCreate(&handle);							//create a cuBLAS handle
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS initialization failed\n");
		exit(1);
	}
	int i_min, i_max;
	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS Error: failed to calculate minimum r value.\n");
		exit(1);
	}
	stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS Error: failed to calculate maximum r value.\n");
		exit(1);
	}
	cublasDestroy(handle);									//destroy the CUBLAS handle
	i_min--;				//cuBLAS uses 1-based indexing for Fortran compatibility
	i_max--;
	T r_min, r_max;											//allocate space to store the minimum and maximum values
	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
	r_max = min(r_max, a);		//the internal field doesn't exist outside of the sphere
	size_t N_jA_lut = (size_t)((r_max - r_min) / r_spacing + 1);
	//temporary variables
	double vm;															//allocate space to store the return values for the bessel function calculation
	stim::complex<double>* jv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* yv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* djv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dyv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut;
	stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes);													//pointer to the look-up table
	T dr = (r_max - r_min) / (N_jA_lut-1);												//distance between values in the LUT
	//std::cout<<"LUT jl bytes:  "<<jA_bytes<<std::endl;
	stim::complex<T> hl;
	stim::complex<double> nd = (stim::complex<double>)n;
	for(size_t ri = 0; ri < N_jA_lut; ri++){													//for each value in the LUT
		stim::cbessjyva_sph<double>(Nl, nd * k * (r_min + ri * dr), vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
		for(size_t l = 0; l <= Nl; l++){													//for each order
			jA_lut[ri * (Nl + 1) + l] = (stim::complex<T>)(jv[l] * (stim::complex<double>)A[l]);										//store the bessel function result
		}
	}
	//Allocate device memory and copy everything to the GPU
	stim::complex<T>* dev_jA_lut;
	HANDLE_ERROR( cudaMalloc(&dev_jA_lut, jA_bytes) );
	HANDLE_ERROR( cudaMemcpy(dev_jA_lut, jA_lut, jA_bytes, cudaMemcpyHostToDevice) );
	gpu_scalar_mie_internal<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_jA_lut, r_min, dr, N_jA_lut, Nl);
	cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost);			//copy the field from device memory
	if(x != NULL) cudaFree(dev_x);														//free everything
	if(y != NULL) cudaFree(dev_y);
	if(z != NULL) cudaFree(dev_z);
	HANDLE_ERROR( cudaFree(dev_jA_lut) );
	HANDLE_ERROR( cudaFree(dev_E) );
	HANDLE_ERROR( cudaFree(dev_W) );
	HANDLE_ERROR( cudaFree(dev_r) );
	HANDLE_ERROR( cudaFree(dev_E) );
#else
*/
	//allocate space to store the bessel function call results
	double vm;										
	stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* y_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );
	T r, cos_phi;
	stim::complex<double> knr;
	stim::complex<T> h;
	stim::complex<T> Ew;
	for(size_t i = 0; i < N; i++){
		stim::vec3<T> p;									//declare a 3D point
	
		(x == NULL) ? p[0] = 0 : p[0] = x[i];				// test for NULL values and set positions
		(y == NULL) ? p[1] = 0 : p[1] = y[i];
		(z == NULL) ? p[2] = 0 : p[2] = z[i];
		p = p - c;
		r = p.len();
		if(r < a){
			E[i] = 0;
			for(size_t w = 0; w < W.size(); w++){
				Ew = W[w].E() * exp(stim::complex<float>(0, W[w].kvec().dot(c)));
				knr = (stim::complex<double>)n * p.len() * W[w].kmag();							//calculate k*n*r
				stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr);
				if(r == 0)
					cos_phi = 0;
				else
					cos_phi = p.norm().dot(W[w].kvec().norm());				//calculate the cosine of the angle from the propagating direction
				stim::legendre<T>(Nl, cos_phi, P);
								
				for(size_t l = 0; l <= Nl; l++){
					E[i] += Ew * A[l] * (stim::complex<T>)j_knr[l] * P[l];
				}
			}
		}
	}
//#endif
}
template<typename T>
void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
	std::vector< stim::scalarwave<T> > W(1, w);
	cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, c);
}
/// Class stim::scalarmie represents a scalar Mie scattering model that can be used to calculate the fields produced by a scattering sphere.
template<typename T>
class scalarmie
{
public:
	T radius;					//radius of the scattering sphere
	stim::complex<T> n;			//refractive index of the scattering sphere
	vec3<T> c;					//position of the sphere in space
	
public:
	scalarmie() {				//default constructor
		radius = 0.5;
		n = stim::complex<T>(1.4, 0.0);
		c = stim::vec3<T>(0, 0, 0);
	}
	scalarmie(T r, stim::complex<T> ri, stim::vec3<T> center = stim::vec3<T>(0, 0, 0)){
		radius = r;
		n = ri;
		c = center;
		//c = stim::vec3<T>(2, 1, 0);
	}
	//void sum_scat(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
	//	std::vector< stim::scalarwave<float> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves
	//	stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
	//}
	//void sum_intern(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
	//	std::vector< stim::scalarwave<float> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves
	//	stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
	//}
	//void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int order = 500, int samples = 1000){
	//	b.eval(E, X, Y, Z, order);													//evaluate the incident field using a plane wave expansion
	//	std::vector< stim::scalarwave<float> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves		
	//	sum_scat(E, X, Y, Z, b, samples);
	//	sum_intern(E, X, Y, Z, b, samples);
	//}
	void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){
		E.meshgrid();
		b.eval(E, order);
		std::vector< stim::scalarwave<float> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves
		if(E.gpu()){
			stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c, E.spacing());
		}
		else{
			stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing());
			stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing());
		}
	}
};			//end stim::scalarmie
template<typename T>
class scalarcluster : public std::vector< scalarmie<T> > {
public:
	void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000) {
		E.meshgrid();
		b.eval(E, order);
		std::vector< stim::scalarwave<float> > wave_array = b.mc(samples);			//decompose the beam into an array of plane waves
		T radius;
		stim::complex<T> n;
		stim::vec3<T> c;
		for (size_t si = 0; si < std::vector< scalarmie<T> >::size(); si++) {									//for each sphere in the cluster
			radius = std::vector< scalarmie<T> >::at(si).radius;
			n = std::vector< scalarmie<T> >::at(si).n;
			c = std::vector< scalarmie<T> >::at(si).c;
			if (E.gpu()) {
				stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c, E.spacing());
			}
			else {
				stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c);
				stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c);
			}
		}
	}
};
}			//end namespace stim
#endif