stim/optics/scalarbeam.h

#ifndef RTS_BEAM
#define RTS_BEAM
#include <boost/math/special_functions/bessel.hpp>
#include "../math/vec3.h"
#include "../optics/scalarwave.h"
#include "../math/bessel.h"
#include "../math/legendre.h"
#include "../cuda/cudatools/devices.h"
#include "../cuda/cudatools/timer.h"
#include "../optics/scalarfield.h"
#include <cublas_v2.h>
#include <math_constants.h>
#include <vector>
#include <stdlib.h>
namespace stim{
/// Function returns the value of the scalar field produced by a beam with the specified parameters
template<typename T>
std::vector< stim::vec3<T> > generate_focusing_vectors(size_t N, stim::vec3<T> d, T NA, T NA_in = 0){
	std::vector< stim::vec3<T> > dirs(N);					//allocate an array to store the focusing vectors
	///compute the rotation operator to transform (0, 0, 1) to k
	T cos_angle = d.dot(vec3<T>(0, 0, 1));
	stim::matrix<T, 3> rotation;
	//if the cosine of the angle is -1, the rotation is just a flip across the z axis
	if(cos_angle == -1){
		rotation(2, 2) = -1;
	}
	else if(cos_angle != 1.0)
	{
		vec3<T> r_axis = vec3<T>(0, 0, 1).cross(d).norm();	//compute the axis of rotation
		T angle = acos(cos_angle);							//compute the angle of rotation
		quaternion<T> quat;							//create a quaternion describing the rotation
		quat.CreateRotation(angle, r_axis);
		rotation = quat.toMatrix3();							//compute the rotation matrix
	}
	//find the phi values associated with the cassegrain ring
	T PHI[2];
	PHI[0] = (T)asin(NA);
	PHI[1] = (T)asin(NA_in);
	//calculate the z-axis cylinder coordinates associated with these angles
	T Z[2];
	Z[0] = cos(PHI[0]);
	Z[1] = cos(PHI[1]);
	T range = Z[0] - Z[1];
	//draw a distribution of random phi, z values
	T z, phi, theta;
	//T kmag = stim::TAU / lambda;
	for(int i=0; i<N; i++){								//for each sample
		z = (T)((double)rand() / (double)RAND_MAX) * range + Z[1];			//find a random position on the surface of a cylinder
		theta = (T)(((double)rand() / (double)RAND_MAX) * stim::TAU);
		phi = acos(z);													//project onto the sphere, computing phi in spherical coordinates
		//compute and store cartesian coordinates
		vec3<T> spherical(1, theta, phi);								//convert from spherical to cartesian coordinates
		vec3<T> cart = spherical.sph2cart();
		dirs[i] = rotation * cart;										//create a sample vector
	}
	return dirs;
}
		
/// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional)
/// @param C is a pointer to Nl + 1 values where the terms will be stored
template<typename T>
CUDA_CALLABLE void cpu_aperture_integral(T* C, int Nl, T NA, T NA_in = 0){
	size_t table_bytes = (Nl + 1) * sizeof(T);				//calculate the number of bytes required to store the terms
	T cos_alpha_1 = cos(asin(NA_in));						//calculate the cosine of the angle subtended by the central obscuration
	T cos_alpha_2 = cos(asin(NA));							//calculate the cosine of the angle subtended by the aperture
	// the aperture integral is computed using four individual Legendre polynomials, each a function of the angles subtended
	//		by the objective and central obscuration
	T* Pln_a1 = (T*) malloc(table_bytes);
	stim::legendre<T>(Nl-1, cos_alpha_1, &Pln_a1[1]);
	Pln_a1[0] = 1;
	T* Pln_a2 = (T*) malloc(table_bytes);
	stim::legendre<T>(Nl-1, cos_alpha_2, &Pln_a2[1]);
	Pln_a2[0] = 1;
	T* Plp_a1 = (T*) malloc(table_bytes+sizeof(T));
	stim::legendre<T>(Nl+1, cos_alpha_1, Plp_a1);
	T* Plp_a2 = (T*) malloc(table_bytes+sizeof(T));
	stim::legendre<T>(Nl+1, cos_alpha_2, Plp_a2);
	for(size_t l = 0; l <= Nl; l++){
		C[l] = Plp_a1[l+1] - Plp_a2[l+1] - Pln_a1[l] + Pln_a2[l];
	}
	free(Pln_a1);
	free(Pln_a2);
	free(Plp_a1);
	free(Plp_a2);
}
/// performs linear interpolation into a look-up table
template<typename T>
CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val, T delta, size_t n_vals){
	T idx = ((val - min_val) / delta);
	size_t i = (size_t) idx;
	T a1 = idx - i;
	T a0 = 1 - a1;
	size_t n0 = i * n_vals;
	size_t n1 = (i+1) * n_vals;
	for(size_t n = 0; n < n_vals; n++){
		lut_values[n] = lut[n0 + n] * a0 + lut[n1 + n] * a1;
	}
}
template <typename T>
CUDA_CALLABLE stim::complex<T> clerp(stim::complex<T> v0, stim::complex<T> v1, T t) {
    return stim::complex<T>( fma(t, v1.r, fma(-t, v0.r, v0.r)), fma(t, v1.i, fma(-t, v0.i, v0.i)) );
}
template <typename T>
CUDA_CALLABLE T lerp(T v0, T v1, T t) {
    return fma(t, v1, fma(-t, v0, v0));
}
#ifdef CUDA_FOUND
template<typename T>
__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T A, size_t Nl,
								T* C, 
								T* lut_j, size_t Nj, T min_r, T dr){
	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
	if(i >= N) return;												//exit if this thread is outside the array
	T cos_phi = cos(phi[i]);									//calculate the thread value for cos(phi)
	stim::complex<T> Ei = 0;									//initialize the value of the field to zero
	size_t NC = Nl + 1;										//calculate the number of coefficients to be used
	T fij = (r[i] - min_r)/dr;								//FP index into the spherical bessel LUT
	size_t ij = (size_t) fij;								//convert to an integral index
	T a = fij - ij;											//calculate the fractional portion of the index
	size_t n0j = ij * (NC);									//start of the first entry in the LUT
	size_t n1j = (ij+1) * (NC);								//start of the second entry in the LUT
	T jl;											//declare register to store the spherical bessel function
	T Pl_2, Pl_1;									//declare registers to store the previous two Legendre polynomials
	T Pl = 1;										//initialize the current value for the Legendre polynomial
	stim::complex<T> im(0, 1);						//declare i (imaginary 1)
	stim::complex<T> i_pow(1, 0);					//i_pow stores the current value of i^l so it doesn't have to be re-computed every iteration
	for(int l = 0; l <= Nl; l++){					//for each order
		jl = lerp<T>( lut_j[n0j + l], lut_j[n1j + l], a );	//read jl from the LUT and interpolate the result
		Ei += i_pow * jl * Pl * C[l];				//calculate the value for the field and sum
		i_pow *= im;								//multiply i^l * i for the next iteration
		Pl_2 = Pl_1;								//shift Pl_1 -> Pl_2 and Pl -> Pl_1
		Pl_1 = Pl;
		if(l == 0){									//computing Pl is done recursively, where the recursive relation
			Pl = cos_phi;							//	requires the first two orders. This defines the second.
		}
		else{										//if this is not the first iteration, use the recursive relation to calculate Pl
			Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
		}
		
	}
	E[i] = Ei * A * 2 * CUDART_PI_F;						//scale the integral by the amplitude
}
template<typename T>
void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl, T r_spacing){
	//Find the minimum and maximum values of r
    cublasStatus_t stat;
    cublasHandle_t handle;
	stat = cublasCreate(&handle);							//create a cuBLAS handle
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS initialization failed\n");
		exit(1);
	}
	int i_min, i_max;
	stat = cublasIsamin(handle, (int)N, r, 1, &i_min);
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS Error: failed to calculate minimum r value.\n");
		exit(1);
	}
	stat = cublasIsamax(handle, (int)N, r, 1, &i_max);
	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
        printf ("CUBLAS Error: failed to calculate maximum r value.\n");
		exit(1);
	}
	i_min--;												//cuBLAS uses 1-based indexing for Fortran compatibility
	i_max--;
	T r_min, r_max;											//allocate space to store the minimum and maximum values
	HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
	HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
	T k = (T)stim::TAU / lambda;							//calculate the wavenumber from lambda
	size_t C_bytes = (Nl + 1) * sizeof(T);
	T* C = (T*) malloc( C_bytes );							//allocate space for the aperture integral terms
	cpu_aperture_integral(C, Nl, NA, NA_in);				//calculate the aperture integral terms
	size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1);			//number of values in the look-up table based on the user-specified spacing along r
	size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
	T* j_lut = (T*) malloc(lutj_bytes);													//pointer to the look-up table
	T dr = (r_max - r_min) / (Nlut_j-1);												//distance between values in the LUT
	T jl;
	for(size_t ri = 0; ri < Nlut_j; ri++){													//for each value in the LUT
		for(size_t l = 0; l <= Nl; l++){													//for each order
			jl = boost::math::sph_bessel<T>(l, k*(r_min + ri * dr));					//use boost to calculate the spherical bessel function
			j_lut[ri * (Nl + 1) + l] = jl;												//store the bessel function result
		}
	}
	stim::cpu2image<T>(j_lut, "j_lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
	//Allocate device memory and copy everything to the GPU
	T* gpu_C;
	HANDLE_ERROR( cudaMalloc(&gpu_C, C_bytes) );
	HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) );
	T* gpu_j_lut;
	HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) );
	HANDLE_ERROR( cudaMemcpy(gpu_j_lut, j_lut, lutj_bytes, cudaMemcpyHostToDevice) );
	int threads = stim::maxThreadsPerBlock();			//get the maximum number of threads per block for the CUDA device
	dim3 blocks( (unsigned)(N / threads + 1));						//calculate the optimal number of blocks
	cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, A, Nl, gpu_C, gpu_j_lut, Nlut_j, r_min, dr);
	//free the LUT and condenser tables
	HANDLE_ERROR( cudaFree(gpu_C) );
	HANDLE_ERROR( cudaFree(gpu_j_lut) );
}
#endif
/// Calculate the analytical solution to a scalar point spread function given a set of spherical coordinates about the PSF (beam propagation along phi = theta = 0)
template<typename T>
void cpu_scalar_psf_local(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl){
	T k = (T)stim::TAU / lambda;
	size_t C_bytes = (Nl + 1) * sizeof(T);
	T* C = (T*) malloc( C_bytes );					//allocate space for the aperture integral terms
	cpu_aperture_integral(C, Nl, NA, NA_in);			//calculate the aperture integral terms
	memset(F, 0, N * sizeof(stim::complex<T>));
	T jl, Pl, kr, cos_phi;
	double vm;
	double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
	T* Pl_cos_phi = (T*) malloc((Nl + 1) * sizeof(T));
	for(size_t n = 0; n < N; n++){								//for each point in the field
		kr = k * r[n];											//calculate kr (the optical distance between the focal point and p)
		cos_phi = std::cos(phi[n]);								//calculate the cosine of phi
		stim::bessjyv_sph<double>(Nl, kr, vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
		stim::legendre<T>(Nl, cos_phi, Pl_cos_phi);				//calculate the [0 Nl] legendre polynomials for this point
		for(int l = 0; l <= Nl; l++){
			jl = (T)jv[l];
			Pl = Pl_cos_phi[l];
			F[n] += pow(complex<T>(0, 1), l) * jl * Pl * C[l];
		}
		F[n] *= A * stim::TAU;
	}
	free(C);
	free(Pl_cos_phi);
}
/// Converts a set of cartesian points into spherical coordinates surrounding a point spread function (PSF)
/// @param r is the output distance from the PSF
/// @param phi is the non-symmetric direction about the PSF
/// @param x (x, y, z) are the cartesian coordinates in world space
/// @f is the focal point of the PSF in cartesian coordinates
/// @d is the propagation direction of the PSF in cartesian coordinates
template<typename T>
__global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::vec3<T> f, stim::quaternion<T> q){
	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
	if(i >= N) return;												//exit if this thread is outside the array
	stim::vec3<T> p;									//declare a 3D point
	
	(x == NULL) ? p[0] = 0 : p[0] = x[i];				// test for NULL values and set positions
	(y == NULL) ? p[1] = 0 : p[1] = y[i];
	(z == NULL) ? p[2] = 0 : p[2] = z[i];
	p = p - f;											//shift the point to the center of the PSF (focal point)
	p = q.toMatrix3() * p;								//rotate the point to align with the propagation direction
	stim::vec3<T> ps = p.cart2sph();									//convert from cartesian to spherical coordinates
	r[i] = ps[0];										//store r
	phi[i] = ps[2];										//phi = [0 pi]
}
#ifdef CUDA_FOUND
/// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates
template<typename T>
void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
	
	T* gpu_r;															//allocate space for the coordinates in r
	HANDLE_ERROR( cudaMalloc(&gpu_r, sizeof(T) * N) );
	T* gpu_phi;
	HANDLE_ERROR( cudaMalloc(&gpu_phi, sizeof(T) * N) );
	//stim::complex<T>* gpu_E;
	//HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) );
	stim::quaternion<T> q;												//create a quaternion
	q.CreateRotation(d, stim::vec3<T>(0, 0, 1));						//create a mapping from the propagation direction to the PSF space
	int threads = stim::maxThreadsPerBlock();							//get the maximum number of threads per block for the CUDA device
	dim3 blocks( (unsigned)(N / threads + 1));							//calculate the optimal number of blocks
	cuda_cart2psf<T> <<< blocks, threads >>> (gpu_r, gpu_phi, N, x, y, z, f, q);	//call the CUDA kernel to move the cartesian coordinates to PSF space
	gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing);
}
#endif
template<typename T>
void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
// If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel
#ifdef CUDA_FOUND
	T* gpu_x = NULL;
	if(x != NULL){
		HANDLE_ERROR( cudaMalloc(&gpu_x, sizeof(T) * N) );
		HANDLE_ERROR( cudaMemcpy(gpu_x, x, sizeof(T) * N, cudaMemcpyHostToDevice) );
	}
	T* gpu_y = NULL;
	if(y != NULL){
		HANDLE_ERROR( cudaMalloc(&gpu_y, sizeof(T) * N) );
		HANDLE_ERROR( cudaMemcpy(gpu_y, y, sizeof(T) * N, cudaMemcpyHostToDevice) );
	}
	T* gpu_z = NULL;
	if(z != NULL){
		HANDLE_ERROR( cudaMalloc(&gpu_z, sizeof(T) * N) );
		HANDLE_ERROR( cudaMemcpy(gpu_z, z, sizeof(T) * N, cudaMemcpyHostToDevice) );
	}
	stim::complex<T>* gpu_E;
	HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) );
	HANDLE_ERROR( cudaMemcpy(gpu_E, E, sizeof(stim::complex<T>) * N, cudaMemcpyHostToDevice) );
	gpu_scalar_psf_cart<T>(gpu_E, N, gpu_x, gpu_y, gpu_z, lambda, A, f, d, NA, NA_in, Nl, r_spacing);
	HANDLE_ERROR( cudaMemcpy(E, gpu_E, sizeof(stim::complex<T>) * N, cudaMemcpyDeviceToHost) );
	HANDLE_ERROR( cudaFree(gpu_x) );
	HANDLE_ERROR( cudaFree(gpu_y) );
	HANDLE_ERROR( cudaFree(gpu_z) );
	HANDLE_ERROR( cudaFree(gpu_E) );
#else
	T* r = (T*) malloc(N * sizeof(T));					//allocate space for p in spherical coordinates
	T* phi = (T*) malloc(N * sizeof(T));				//	only r and phi are necessary (the scalar PSF is symmetric about theta)
	stim::quaternion<T> q;
	q.CreateRotation(d, stim::vec3<T>(0, 0, 1));
	stim::matrix<T, 3> R = q.toMatrix3();
	stim::vec3<T> p, ps, ds;
	for(size_t i = 0; i < N; i++){
		(x == NULL) ? p[0] = 0 : p[0] = x[i];	// test for NULL values and set positions
		(y == NULL) ? p[1] = 0 : p[1] = y[i];
		(z == NULL) ? p[2] = 0 : p[2] = z[i];
		p = p - f;
		p = R * p;					//rotate the cartesian point
		ps = p.cart2sph();						//convert from cartesian to spherical coordinates
		r[i] = ps[0];							//store r
		phi[i] = ps[2];							//phi = [0 pi]
	}
	cpu_scalar_psf_local(E, N, r, phi, lambda, A, NA, NA_in, Nl);		//call the spherical coordinate CPU function
	free(r);
	free(phi);
#endif
}
		
/// Class stim::beam represents a beam of light focused at a point and composed of several plane waves
template<typename T>
class scalarbeam
{
public:
	//enum beam_type {Uniform, Bartlett, Hamming, Hanning};
private:
	
	T NA[2];				//numerical aperature of the focusing optics	
	vec3<T> f;				//focal point
	vec3<T> d;				//propagation direction
	T A;		//beam amplitude
	T lambda;				//beam wavelength
public:
	///constructor: build a default beam (NA=1.0)
	scalarbeam(T wavelength = 1, T amplitude = 1, vec3<T> focal_point = vec3<T>(0, 0, 0), vec3<T> direction = vec3<T>(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){
		lambda = wavelength;
		A = amplitude;
		f = focal_point;
		d = direction.norm();					//make sure that the direction vector is normalized (makes calculations more efficient later on)
		NA[0] = numerical_aperture;
		NA[1] = center_obsc;
	}
	///Numerical Aperature functions
	void setNA(T na)
	{
		NA[0] = (T)0;
		NA[1] = na;
	}
	void setNA(T na0, T na1)
	{
		NA[0] = na0;
		NA[1] = na1;
	}
	//Monte-Carlo decomposition into plane waves
	std::vector< scalarwave<T> > mc(size_t N = 100000) const{
		std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]);	//generate a random set of N vectors forming a focus
		std::vector< scalarwave<T> > samples(N);											//create a vector of plane waves
		T kmag = (T)stim::TAU / lambda;								//calculate the wavenumber
		stim::complex<T> apw;										//allocate space for the amplitude at the focal point
		T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N);			//constant value weights plane waves based on the aperture and number of samples (N)
		stim::vec3<T> kpw;											//declare the new k-vector based on the focused plane wave direction
		for(size_t i=0; i<N; i++){										//for each sample
			kpw = dirs[i] * kmag;									//calculate the k-vector for the new plane wave
			apw = a * exp(stim::complex<T>(0, kpw.dot(-f)));				//calculate the amplitude for the new plane wave
			samples[i] = scalarwave<T>(kpw, apw);			//create a plane wave based on the direction
		}
		return samples;
	}
	/// Evaluate the beam to a scalar field using Debye focusing
	void eval(stim::scalarfield<T>& E, size_t order = 500){
		size_t array_size = E.grid_bytes();
		T* X = (T*) malloc( array_size );			//allocate space for the coordinate meshes
		T* Y = (T*) malloc( array_size );
		T* Z = (T*) malloc( array_size );
		E.meshgrid(X, Y, Z, stim::CPUmem);			//calculate the coordinate meshes
		cpu_scalar_psf_cart<T>(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing());
		free(X);									//free the coordinate meshes
		free(Y);
		free(Z);
	}
	/// Calculate the field at a given point
	/// @param x is the x-coordinate of the field point
	/// @O is the approximation accuracy
	stim::complex<T> field(T x, T y, T z, size_t O){
		std::vector< scalarwave<T> > W = mc(O);
		T result = 0;											//initialize the result to zero (0)
		for(size_t i = 0; i < O; i++){							//for each plane wave
			result += W[i].pos(x, y, z);
		}
		return result;
	}
	std::string str()
	{
		std::stringstream ss;
		ss<<"Beam:"<<std::endl;
		//ss<<"	Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl;
		ss<<"	Beam Direction: "<<d<<std::endl;
		if(NA[0] == 0)
			ss<<"	NA: "<<NA[1];
		else
			ss<<"	NA: "<<NA[0]<<" -- "<<NA[1];
		return ss.str();
	}
};			//end beam
}			//end namespace stim
#endif