mie.h 18.5 KB

Edit Raw Blame History

#ifndef STIM_MIE_H
#define STIM_MIE_H
#include "scalarwave.h"
#include "../math/bessel.h"
#include "../cuda/cudatools/devices.h"
#include <cmath>
namespace stim{


/// Calculate the scattering coefficients for a spherical scatterer
template<typename T>
void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){

	//temporary variables
	double vm;															//allocate space to store the return values for the bessel function calculation
	double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );

	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );

	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)
	stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
	stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a
	stim::complex<double> h_ka, dh_ka;
	stim::complex<double> numerator, denominator;
	stim::complex<double> i(0, 1);
	for(int l = 0; l <= Nl; l++){
		h_ka.r = j_ka[l];
		h_ka.i = y_ka[l];
		dh_ka.r = dj_ka[l];
		dh_ka.i = dy_ka[l];

		numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l];
		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
		B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
		std::cout<<B[l]<<std::endl;
	}
}

template<typename T>
void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){
	//temporary variables
	double vm;															//allocate space to store the return values for the bessel function calculation
	double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );

	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );

	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)
	stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
	stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a
	stim::complex<double> h_ka, dh_ka;
	stim::complex<double> numerator, denominator;
	stim::complex<double> i(0, 1);
	for(size_t l = 0; l <= Nl; l++){
		h_ka.r = j_ka[l];
		h_ka.i = y_ka[l];
		dh_ka.r = dj_ka[l];
		dh_ka.i = dy_ka[l];

		numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka;
		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
		A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
	}
}

#define LOCAL_NL	16
template<typename T>
__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, int Nl){
	extern __shared__ stim::complex<T> shared_hB[];		//declare the list of waves in shared memory
	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
	if(i >= N) return;													//exit if this thread is outside the array
	stim::vec3<T> p;
	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
	(y == NULL) ? p[1] = 0 : p[1] = y[i];
	(z == NULL) ? p[2] = 0 : p[2] = z[i];

	T r = p.len();														//calculate the distance from the sphere
	if(r < a) return;													//exit if the point is inside the sphere (we only calculate the internal field)
	T k = W[0].kmag();
	size_t NC = Nl + 1;													//calculate the number of coefficients to be used
	T kr = r * k;														//calculate the thread value for k*r
	T fij = (kr - kr_min)/dkr;											//FP index into the spherical bessel LUT
	size_t ij = (size_t) fij;											//convert to an integral index
	T alpha = fij - ij;													//calculate the fractional portion of the index
	size_t n0j = ij * (NC);												//start of the first entry in the LUT
	size_t n1j = (ij+1) * (NC);											//start of the second entry in the LUT
	T cos_phi;
	T Pl_2, Pl_1, Pl;														//declare registers to store the previous two Legendre polynomials
	stim::complex<T> hBl;
	stim::complex<T> Ei = 0;											//create a register to store the result
	int l;

	stim::complex<T> hlBl[LOCAL_NL+1];
	int shared_start = threadIdx.x * (Nl - LOCAL_NL);

	#pragma unroll LOCAL_NL+1
	for(l = 0; l <= LOCAL_NL; l++)
		hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );

	for(l = LOCAL_NL+1; l <= Nl; l++)
		shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );

	for(size_t w = 0; w < nW; w++){
		cos_phi = p.norm().dot(W[w].kvec().norm());						//calculate the cosine of the angle between the k vector and the direction from the sphere
		Pl_2 = 1;
		Pl_1 = cos_phi;
		Ei += W[w].E() * hlBl[0] * Pl_2;
		Ei += W[w].E() * hlBl[1] * Pl_1;

		#pragma unroll LOCAL_NL-1
		for(l = 2; l <= LOCAL_NL; l++){
			Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
			Ei += W[w].E() * hlBl[l] * Pl;
			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;
		}

		for(l = LOCAL_NL+1; l <= Nl; l++){
			Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
			Ei += W[w].E() * shared_hB[shared_start + (l - (LOCAL_NL+1))] * Pl;
			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;

		}
	}
	E[i] += Ei;															//copy the result to device memory
}

template<typename T>
void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){

	size_t max_shared_mem = stim::sharedMemPerBlock();
	int hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
	std::cout<<"hl*Bl array size:  "<<hBl_array<<std::endl;
	std::cout<<"shared memory:     "<<max_shared_mem<<std::endl;
	int threads = (max_shared_mem / hBl_array) / 32 * 32;
	std::cout<<"threads per block: "<<threads<<std::endl;
	dim3 blocks((unsigned)(N / threads + 1));										//calculate the optimal number of blocks
	size_t shared_mem;
	if(Nl <= LOCAL_NL) shared_mem = 0;
	else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL);				//amount of shared memory to allocate
	std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
	cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl);	//call the kernel
}

template<typename T>
__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){
	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
	if(i >= N) return;													//exit if this thread is outside the array
	stim::vec3<T> p;
	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
	(y == NULL) ? p[1] = 0 : p[1] = y[i];
	(z == NULL) ? p[2] = 0 : p[2] = z[i];

	r[i] = p.len();
}
/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
/// @param E is a pointer to the destination field values
/// @param N is the number of points used to calculate the field
/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
/// @param W is an array of planewaves that will be scattered
/// @param a is the radius of the sphere
/// @param n is the complex refractive index of the sphere
template<typename T>
void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){
	//calculate the necessary number of orders required to represent the scattered field
	T k = W[0].kmag();

	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
	if(Nl < LOCAL_NL) Nl = LOCAL_NL;							//always do at least the minimum number of local operations (kernel optimization)
	std::cout<<"Nl: "<<Nl<<std::endl;

	//calculate the scattering coefficients for the sphere
	stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
	B_coefficients(B, a, k, n, Nl);

#ifdef CUDA_FOUND
	stim::complex<T>* dev_E;										//allocate space for the field
	cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
	cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
	//cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>));				//set the field to zero (necessary because a sum is used)
	//	COORDINATES
	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
	if(x != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_y = NULL;												//allocate space and copy the Y coordinate (if specified)
	if(y != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_z = NULL;												//allocate space and copy the Z coordinate (if specified)
	if(z != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
	}

	//	PLANE WAVES
	stim::scalarwave<T>* dev_W;																//allocate space and copy plane waves
	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );

	// BESSEL FUNCTION LOOK-UP TABLE
	//calculate the distance from the sphere center
	T* dev_r;
	HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );

	int threads = stim::maxThreadsPerBlock();
	dim3 blocks((unsigned)(N / threads + 1));
	cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);

	//Find the minimum and maximum values of r
    cublasStatus_t stat;
    cublasHandle_t handle;

	stat = cublasCreate(&handle);							//create a cuBLAS handle
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS initialization failed\n");
		exit(1);
	}

	int i_min, i_max;
	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS Error: failed to calculate minimum r value.\n");
		exit(1);
	}
	stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS Error: failed to calculate maximum r value.\n");
		exit(1);
	}

	T r_min, r_max;											//allocate space to store the minimum and maximum values
	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );


	//size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1);			//number of values in the look-up table based on the user-specified spacing along r
	size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1);

	T kr_min = k * r_min;
	T kr_max = k * r_max;

	//temporary variables
	double vm;															//allocate space to store the return values for the bessel function calculation
	double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );

	size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;
	stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes);													//pointer to the look-up table
	T dkr = (kr_max - kr_min) / (N_hB_lut-1);												//distance between values in the LUT
	std::cout<<"LUT jl bytes:  "<<hB_bytes<<std::endl;
	stim::complex<T> hl;
	for(size_t kri = 0; kri < N_hB_lut; kri++){													//for each value in the LUT
		stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
		for(size_t l = 0; l <= Nl; l++){													//for each order
			hl.r = (T)jv[l];
			hl.i = (T)yv[l];

			hB_lut[kri * (Nl + 1) + l] = hl * B[l];										//store the bessel function result
		}
	}

	//stim::cpu2image<T>(hankel_lut, "hankel.bmp", Nl+1, Nlut_j, stim::cmBrewer);
	//Allocate device memory and copy everything to the GPU
	stim::complex<T>* dev_hB_lut;
	HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
	HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );

	gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, kr_min, dkr, N_hB_lut, Nl);

	cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost);			//copy the field from device memory
	if(x != NULL) cudaFree(dev_x);														//free everything
	if(y != NULL) cudaFree(dev_y);
	if(z != NULL) cudaFree(dev_z);
	cudaFree(dev_E);
#else

	//allocate space to store the bessel function call results
	double vm;
	double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) );

	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );

	T r, kr, cos_phi;
	stim::complex<T> h;
	for(size_t i = 0; i < N; i++){
		stim::vec3<T> p;															//declare a 3D point
		(x == NULL) ? p[0] = 0 : p[0] = x[i];										// test for NULL values and set positions
		(y == NULL) ? p[1] = 0 : p[1] = y[i];
		(z == NULL) ? p[2] = 0 : p[2] = z[i];
		r = p.len();
		if(r >= a){
			for(size_t w = 0; w < W.size(); w++){
				kr = p.len() * W[w].kmag();											//calculate k*r
				stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr);
				cos_phi = p.norm().dot(W[w].kvec().norm());							//calculate the cosine of the angle from the propagating direction
				stim::legendre<T>(Nl, cos_phi, P);

				for(size_t l = 0; l <= Nl; l++){
					h.r = j_kr[l];
					h.i = y_kr[l];
					E[i] += W[w].E() * B[l] * h * P[l];
				}
			}
		}
	}
#endif
}

template<typename T>
void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
	std::vector< stim::scalarwave<T> > W(1, w);
	cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n);
}

/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere
/// @param E is a pointer to the destination field values
/// @param N is the number of points used to calculate the field
/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
/// @param w is a planewave that will be scattered
/// @param a is the radius of the sphere
/// @param n is the complex refractive index of the sphere
template<typename T>
void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n){

	//calculate the necessary number of orders required to represent the scattered field
	T k = W[0].kmag();

	size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
	std::cout<<"Nl: "<<Nl<<std::endl;

	//calculate the scattering coefficients for the sphere
	stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
	A_coefficients(A, a, k, n, Nl);

	//allocate space to store the bessel function call results
	double vm;
	stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* y_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );

	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );

	T r, cos_phi;
	stim::complex<double> knr;
	stim::complex<T> h;
	for(size_t i = 0; i < N; i++){
		stim::vec3<T> p;									//declare a 3D point
		(x == NULL) ? p[0] = 0 : p[0] = x[i];				// test for NULL values and set positions
		(y == NULL) ? p[1] = 0 : p[1] = y[i];
		(z == NULL) ? p[2] = 0 : p[2] = z[i];
		r = p.len();
		if(r < a){
			E[i] = 0;
			for(size_t w = 0; w < W.size(); w++){
				knr = (stim::complex<double>)n * p.len() * W[w].kmag();							//calculate k*n*r
				stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr);
				if(r == 0)
					cos_phi = 0;
				else
					cos_phi = p.norm().dot(W[w].kvec().norm());				//calculate the cosine of the angle from the propagating direction
				stim::legendre<T>(Nl, cos_phi, P);

				for(size_t l = 0; l <= Nl; l++){
					E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l];
				}
			}
		}
	}
}

template<typename T>
void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
	std::vector< stim::scalarwave<T> > W(1, w);
	cpu_scalar_mie_internal(E, N, x, y, z, W, a, n);
}

}

#endif