mie.h 17.6 KB
#ifndef STIM_MIE_H
#define STIM_MIE_H

#include "scalarwave.h"
#include "../math/bessel.h"
#include <cmath>

namespace stim{


/// Calculate the scattering coefficients for a spherical scatterer
template<typename T>
void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){

	//temporary variables
	double vm;															//allocate space to store the return values for the bessel function calculation
	double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );

	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );

	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)

	stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
	stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a

	stim::complex<double> h_ka, dh_ka;
	stim::complex<double> numerator, denominator;
	stim::complex<double> i(0, 1);
	for(size_t l = 0; l <= Nl; l++){
		h_ka.r = j_ka[l];
		h_ka.i = y_ka[l];
		dh_ka.r = dj_ka[l];
		dh_ka.i = dy_ka[l];

		numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l];
		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
		B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
		std::cout<<B[l]<<std::endl;
	}
}

template<typename T>
void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){
	//temporary variables
	double vm;															//allocate space to store the return values for the bessel function calculation
	double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );

	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );

	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)

	stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
	stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a

	stim::complex<double> h_ka, dh_ka;
	stim::complex<double> numerator, denominator;
	stim::complex<double> i(0, 1);
	for(size_t l = 0; l <= Nl; l++){
		h_ka.r = j_ka[l];
		h_ka.i = y_ka[l];
		dh_ka.r = dj_ka[l];
		dh_ka.i = dy_ka[l];

		numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka;
		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
		A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
	}
}

template<typename T>
__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, int Nl){
	extern __shared__ stim::scalarwave<T> shared_W[];		//declare the list of waves in shared memory

	stim::cuda::sharedMemcpy(shared_W, W, nW, threadIdx.x, blockDim.x);	//copy the plane waves into shared memory for faster access
	__syncthreads();															//synchronize threads to insure all data is copied

	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
	if(i >= N) return;												//exit if this thread is outside the array
	stim::vec3<T> p;
	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
	(y == NULL) ? p[1] = 0 : p[1] = y[i];
	(z == NULL) ? p[2] = 0 : p[2] = z[i];
	
	T r = p.len();													//calculate the distance from the sphere
	T k = W[0].kmag();
	if(r < a) return;												//exit if the point is inside the sphere (we only calculate the internal field)

	size_t NC = Nl + 1;										//calculate the number of coefficients to be used
	T kr = r * k;											//calculate the thread value for k*r
	T fij = (kr - kr_min)/dkr;								//FP index into the spherical bessel LUT
	size_t ij = (size_t) fij;								//convert to an integral index
	T alpha = fij - ij;											//calculate the fractional portion of the index
	size_t n0j = ij * (NC);									//start of the first entry in the LUT
	size_t n1j = (ij+1) * (NC);								//start of the second entry in the LUT

	T cos_phi;	
	T Pl_2, Pl_1;									//declare registers to store the previous two Legendre polynomials
	T Pl = 1;										//initialize the current value for the Legendre polynomial
	T jl;
	stim::complex<T> Ei = 0;											//create a register to store the result
	int l;
	for(size_t w = 0; w < nW; w++){
		cos_phi = p.norm().dot(W[w].kvec().norm());						//calculate the cosine of the angle between the k vector and the direction from the sphere
		for(l = 0; l <= Nl; l++){
			Pl_2 = Pl_1;								//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;
			if(l == 0){									//computing Pl is done recursively, where the recursive relation
				Pl = cos_phi;							//	requires the first two orders. This defines the second.
			}
			else{										//if this is not the first iteration, use the recursive relation to calculate Pl
				Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
			}

			jl = lerp<T>( j[n0j + l], j[n1j + l], alpha );	//read jl from the LUT and interpolate the result
			Ei += W[w].E() * B[l] * jl * Pl;
		}
		//Ei += shared_W[w].pos(p);									//evaluate the plane wave
	}
	E[i] += Ei;														//copy the result to device memory
}

template<typename T>
void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, size_t Nl){

	size_t wave_bytes = sizeof(stim::scalarwave<T>);
	size_t shared_bytes = stim::sharedMemPerBlock();									//calculate the maximum amount of shared memory available
	size_t array_bytes = nW * wave_bytes;			//calculate the maximum number of bytes required for the planewave array
	size_t max_batch = shared_bytes / wave_bytes;				//calculate number of plane waves that will fit into shared memory
	size_t num_batches = nW / max_batch + 1;								//calculate the number of batches required to process all plane waves
	size_t batch_bytes = min(nW, max_batch) * wave_bytes;				//initialize the batch size (in bytes) to the maximum batch required

	stim::scalarwave<T>* batch_W;
	HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes));										//allocate memory for a single batch of plane waves

	int threads = stim::maxThreadsPerBlock();							//get the maximum number of threads per block for the CUDA device
	dim3 blocks((unsigned)(N / threads + 1));										//calculate the optimal number of blocks	

	size_t batch_size;																	//declare a variable to store the size of the current batch
	size_t waves_processed = 0;															//initialize the number of waves processed to zero
	while(waves_processed < nW){												//while there are still waves to be processed
		batch_size = min<size_t>(max_batch, nW - waves_processed);			//process either a whole batch, or whatever is left
		batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
		HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice));	//copy the plane waves into global memory
		cuda_scalar_mie_scatter<T><<< blocks, threads, batch_bytes >>>(E, N, x, y, z, batch_W, batch_size, a, n, B, j, kr_min, dkr, (int)Nl);	//call the kernel
		waves_processed += batch_size;													//increment the counter indicating how many waves have been processed
	}
	cudaFree(batch_W);
}
/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave

/// @param E is a pointer to the destination field values
/// @param N is the number of points used to calculate the field
/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
/// @param W is an array of planewaves that will be scattered
/// @param a is the radius of the sphere
/// @param n is the complex refractive index of the sphere
template<typename T>
void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n){
	//calculate the necessary number of orders required to represent the scattered field
	T k = W[0].kmag();

	size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);

	//calculate the scattering coefficients for the sphere
	stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
	B_coefficients(B, a, k, n, Nl);

#ifdef __CUDACC__
	stim::complex<T>* dev_E;										//allocate space for the field
	cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
	cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
	//cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>));				//set the field to zero (necessary because a sum is used)

	//	COORDINATES
	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
	if(x != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_y = NULL;												//allocate space and copy the Y coordinate (if specified)
	if(y != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_z = NULL;												//allocate space and copy the Z coordinate (if specified)
	if(z != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
	}

	//	PLANE WAVES
	stim::scalarwave<T>* dev_W;																//allocate space and copy plane waves
	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );

	// SCATTERING COEFFICIENTS
	stim::complex<T>* dev_B;
	HANDLE_ERROR( cudaMalloc(&dev_B, sizeof(stim::complex<T>) * (Nl+1)) );
	HANDLE_ERROR( cudaMemcpy(dev_B, B, sizeof(stim::complex<T>) * (Nl+1), cudaMemcpyHostToDevice) );

	// BESSEL FUNCTION LOOK-UP TABLE
	//size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1);			//number of values in the look-up table based on the user-specified spacing along r
	size_t Nlut_j = 1024;
	T r_min = 0;
	T r_max = 10;

	T kr_min = k * r_min;
	T kr_max = k * r_max;

	//temporary variables
	double vm;															//allocate space to store the return values for the bessel function calculation
	double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );

	size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
	T* bessel_lut = (T*) malloc(lutj_bytes);													//pointer to the look-up table
	T dkr = (kr_max - kr_min) / (Nlut_j-1);												//distance between values in the LUT
	std::cout<<"LUT jl bytes:  "<<lutj_bytes<<std::endl;
	for(size_t kri = 0; kri < Nlut_j; kri++){													//for each value in the LUT
		stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
		for(size_t l = 0; l <= Nl; l++){													//for each order
			bessel_lut[kri * (Nl + 1) + l] = (T)jv[l];										//store the bessel function result
		}
	}

	stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);

	//Allocate device memory and copy everything to the GPU
	T* dev_j_lut;
	HANDLE_ERROR( cudaMalloc(&dev_j_lut, lutj_bytes) );
	HANDLE_ERROR( cudaMemcpy(dev_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) );

	gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_B, dev_j_lut, kr_min, dkr, Nl);

	cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost);			//copy the field from device memory

	if(x != NULL) cudaFree(dev_x);														//free everything
	if(y != NULL) cudaFree(dev_y);
	if(z != NULL) cudaFree(dev_z);
	cudaFree(dev_E);
#else
	

	//allocate space to store the bessel function call results
	double vm;										
	double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) );

	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );

	T r, kr, cos_phi;
	stim::complex<T> h;
	for(size_t i = 0; i < N; i++){
		stim::vec3<T> p;															//declare a 3D point
	
		(x == NULL) ? p[0] = 0 : p[0] = x[i];										// test for NULL values and set positions
		(y == NULL) ? p[1] = 0 : p[1] = y[i];
		(z == NULL) ? p[2] = 0 : p[2] = z[i];
		r = p.len();
		if(r >= a){
			for(size_t w = 0; w < W.size(); w++){
				kr = p.len() * W[w].kmag();											//calculate k*r
				stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr);
				cos_phi = p.norm().dot(W[w].kvec().norm());							//calculate the cosine of the angle from the propagating direction
				stim::legendre<T>(Nl, cos_phi, P);

				for(size_t l = 0; l <= Nl; l++){
					h.r = j_kr[l];
					h.i = y_kr[l];
					E[i] += W[w].E() * B[l] * h * P[l];
				}
			}
		}
	}
#endif
}

template<typename T>
void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
	std::vector< stim::scalarwave<T> > W(1, w);
	cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n);
}

/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere

/// @param E is a pointer to the destination field values
/// @param N is the number of points used to calculate the field
/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
/// @param w is a planewave that will be scattered
/// @param a is the radius of the sphere
/// @param n is the complex refractive index of the sphere
template<typename T>
void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n){

	//calculate the necessary number of orders required to represent the scattered field
	T k = W[0].kmag();

	size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);

	//calculate the scattering coefficients for the sphere
	stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
	A_coefficients(A, a, k, n, Nl);

	//allocate space to store the bessel function call results
	double vm;										
	stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* y_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );

	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );

	T r, cos_phi;
	stim::complex<double> knr;
	stim::complex<T> h;
	for(size_t i = 0; i < N; i++){
		stim::vec3<T> p;									//declare a 3D point
	
		(x == NULL) ? p[0] = 0 : p[0] = x[i];				// test for NULL values and set positions
		(y == NULL) ? p[1] = 0 : p[1] = y[i];
		(z == NULL) ? p[2] = 0 : p[2] = z[i];
		r = p.len();
		if(r < a){
			E[i] = 0;
			for(size_t w = 0; w < W.size(); w++){
				knr = (stim::complex<double>)n * p.len() * W[w].kmag();							//calculate k*n*r

				stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr);
				if(r == 0)
					cos_phi = 0;
				else
					cos_phi = p.norm().dot(W[w].kvec().norm());				//calculate the cosine of the angle from the propagating direction
				stim::legendre<T>(Nl, cos_phi, P);
								
				for(size_t l = 0; l <= Nl; l++){
					E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l];
				}
			}
		}
	}
}

template<typename T>
void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
	std::vector< stim::scalarwave<T> > W(1, w);
	cpu_scalar_mie_internal(E, N, x, y, z, W, a, n);
}

}

#endif