scalarbeam.h 19 KB
``````#ifndef RTS_BEAM
#define RTS_BEAM
#include <boost/math/special_functions/bessel.hpp>

#include "../math/vec3.h"
#include "../optics/scalarwave.h"
#include "../math/bessel.h"
#include "../math/legendre.h"
#include "../cuda/cudatools/devices.h"
#include "../cuda/cudatools/timer.h"
#include "../optics/scalarfield.h"
#include <cublas_v2.h>
#include <math_constants.h>
#include <vector>
#include <stdlib.h>

namespace stim{

/// Function returns the value of the scalar field produced by a beam with the specified parameters

template<typename T>
std::vector< stim::vec3<T> > generate_focusing_vectors(size_t N, stim::vec3<T> d, T NA, T NA_in = 0){

std::vector< stim::vec3<T> > dirs(N);					//allocate an array to store the focusing vectors

///compute the rotation operator to transform (0, 0, 1) to k
T cos_angle = d.dot(vec3<T>(0, 0, 1));
stim::matrix<T, 3> rotation;

//if the cosine of the angle is -1, the rotation is just a flip across the z axis
if(cos_angle == -1){
rotation(2, 2) = -1;
}
else if(cos_angle != 1.0)
{
vec3<T> r_axis = vec3<T>(0, 0, 1).cross(d).norm();	//compute the axis of rotation
T angle = acos(cos_angle);							//compute the angle of rotation
quaternion<T> quat;							//create a quaternion describing the rotation
quat.CreateRotation(angle, r_axis);
rotation = quat.toMatrix3();							//compute the rotation matrix
}

//find the phi values associated with the cassegrain ring
T PHI[2];
PHI[0] = (T)asin(NA);
PHI[1] = (T)asin(NA_in);

//calculate the z-axis cylinder coordinates associated with these angles
T Z[2];
Z[0] = cos(PHI[0]);
Z[1] = cos(PHI[1]);
T range = Z[0] - Z[1];

//draw a distribution of random phi, z values
T z, phi, theta;
//T kmag = stim::TAU / lambda;
for(int i=0; i<N; i++){								//for each sample
z = (T)((double)rand() / (double)RAND_MAX) * range + Z[1];			//find a random position on the surface of a cylinder
theta = (T)(((double)rand() / (double)RAND_MAX) * stim::TAU);
phi = acos(z);													//project onto the sphere, computing phi in spherical coordinates

//compute and store cartesian coordinates
vec3<T> spherical(1, theta, phi);								//convert from spherical to cartesian coordinates
vec3<T> cart = spherical.sph2cart();
dirs[i] = rotation * cart;										//create a sample vector
}
return dirs;
}

/// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional)
/// @param C is a pointer to Nl + 1 values where the terms will be stored
template<typename T>
CUDA_CALLABLE void cpu_aperture_integral(T* C, int Nl, T NA, T NA_in = 0){

size_t table_bytes = (Nl + 1) * sizeof(T);				//calculate the number of bytes required to store the terms
T cos_alpha_1 = cos(asin(NA_in));						//calculate the cosine of the angle subtended by the central obscuration
T cos_alpha_2 = cos(asin(NA));							//calculate the cosine of the angle subtended by the aperture

// the aperture integral is computed using four individual Legendre polynomials, each a function of the angles subtended
//		by the objective and central obscuration
T* Pln_a1 = (T*) malloc(table_bytes);
stim::legendre<T>(Nl-1, cos_alpha_1, &Pln_a1[1]);
Pln_a1[0] = 1;

T* Pln_a2 = (T*) malloc(table_bytes);
stim::legendre<T>(Nl-1, cos_alpha_2, &Pln_a2[1]);
Pln_a2[0] = 1;

T* Plp_a1 = (T*) malloc(table_bytes+sizeof(T));
stim::legendre<T>(Nl+1, cos_alpha_1, Plp_a1);

T* Plp_a2 = (T*) malloc(table_bytes+sizeof(T));
stim::legendre<T>(Nl+1, cos_alpha_2, Plp_a2);

for(size_t l = 0; l <= Nl; l++){
C[l] = Plp_a1[l+1] - Plp_a2[l+1] - Pln_a1[l] + Pln_a2[l];
}

free(Pln_a1);
free(Pln_a2);
free(Plp_a1);
free(Plp_a2);
}

/// performs linear interpolation into a look-up table
template<typename T>
CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val, T delta, size_t n_vals){
T idx = ((val - min_val) / delta);
size_t i = (size_t) idx;
T a1 = idx - i;
T a0 = 1 - a1;
size_t n0 = i * n_vals;
size_t n1 = (i+1) * n_vals;
for(size_t n = 0; n < n_vals; n++){
lut_values[n] = lut[n0 + n] * a0 + lut[n1 + n] * a1;
}
}

template <typename T>
CUDA_CALLABLE stim::complex<T> clerp(stim::complex<T> v0, stim::complex<T> v1, T t) {
return stim::complex<T>( fma(t, v1.r, fma(-t, v0.r, v0.r)), fma(t, v1.i, fma(-t, v0.i, v0.i)) );
}

template <typename T>
CUDA_CALLABLE T lerp(T v0, T v1, T t) {
return fma(t, v1, fma(-t, v0, v0));
}

#ifdef CUDA_FOUND
template<typename T>
__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T A, size_t Nl,
T* C,
T* lut_j, size_t Nj, T min_r, T dr){
size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
if(i >= N) return;												//exit if this thread is outside the array

T cos_phi = cos(phi[i]);									//calculate the thread value for cos(phi)
stim::complex<T> Ei = 0;									//initialize the value of the field to zero
size_t NC = Nl + 1;										//calculate the number of coefficients to be used

T fij = (r[i] - min_r)/dr;								//FP index into the spherical bessel LUT
size_t ij = (size_t) fij;								//convert to an integral index
T a = fij - ij;											//calculate the fractional portion of the index
size_t n0j = ij * (NC);									//start of the first entry in the LUT
size_t n1j = (ij+1) * (NC);								//start of the second entry in the LUT

T jl;											//declare register to store the spherical bessel function
T Pl_2, Pl_1;									//declare registers to store the previous two Legendre polynomials
T Pl = 1;										//initialize the current value for the Legendre polynomial
stim::complex<T> im(0, 1);						//declare i (imaginary 1)
stim::complex<T> i_pow(1, 0);					//i_pow stores the current value of i^l so it doesn't have to be re-computed every iteration
for(int l = 0; l <= Nl; l++){					//for each order
jl = lerp<T>( lut_j[n0j + l], lut_j[n1j + l], a );	//read jl from the LUT and interpolate the result
Ei += i_pow * jl * Pl * C[l];				//calculate the value for the field and sum
i_pow *= im;								//multiply i^l * i for the next iteration
Pl_2 = Pl_1;								//shift Pl_1 -> Pl_2 and Pl -> Pl_1
Pl_1 = Pl;
if(l == 0){									//computing Pl is done recursively, where the recursive relation
Pl = cos_phi;							//	requires the first two orders. This defines the second.
}
else{										//if this is not the first iteration, use the recursive relation to calculate Pl
Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
}

}
E[i] = Ei * A * 2 * CUDART_PI_F;						//scale the integral by the amplitude
}

template<typename T>
void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl, T r_spacing){

//Find the minimum and maximum values of r
cublasStatus_t stat;
cublasHandle_t handle;

stat = cublasCreate(&handle);							//create a cuBLAS handle
if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
printf ("CUBLAS initialization failed\n");
exit(1);
}

int i_min, i_max;
stat = cublasIsamin(handle, (int)N, r, 1, &i_min);
if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
printf ("CUBLAS Error: failed to calculate minimum r value.\n");
exit(1);
}
stat = cublasIsamax(handle, (int)N, r, 1, &i_max);
if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
printf ("CUBLAS Error: failed to calculate maximum r value.\n");
exit(1);
}

i_min--;												//cuBLAS uses 1-based indexing for Fortran compatibility
i_max--;
T r_min, r_max;											//allocate space to store the minimum and maximum values
HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );

T k = (T)stim::TAU / lambda;							//calculate the wavenumber from lambda
size_t C_bytes = (Nl + 1) * sizeof(T);
T* C = (T*) malloc( C_bytes );							//allocate space for the aperture integral terms
cpu_aperture_integral(C, Nl, NA, NA_in);				//calculate the aperture integral terms

size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1);			//number of values in the look-up table based on the user-specified spacing along r

size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
T* j_lut = (T*) malloc(lutj_bytes);													//pointer to the look-up table
T dr = (r_max - r_min) / (Nlut_j-1);												//distance between values in the LUT
T jl;
for(size_t ri = 0; ri < Nlut_j; ri++){													//for each value in the LUT
for(size_t l = 0; l <= Nl; l++){													//for each order
jl = boost::math::sph_bessel<T>(l, k*(r_min + ri * dr));					//use boost to calculate the spherical bessel function
j_lut[ri * (Nl + 1) + l] = jl;												//store the bessel function result
}
}

stim::cpu2image<T>(j_lut, "j_lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
//Allocate device memory and copy everything to the GPU

T* gpu_C;
HANDLE_ERROR( cudaMalloc(&gpu_C, C_bytes) );
HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) );
T* gpu_j_lut;
HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) );
HANDLE_ERROR( cudaMemcpy(gpu_j_lut, j_lut, lutj_bytes, cudaMemcpyHostToDevice) );

dim3 blocks( (unsigned)(N / threads + 1));						//calculate the optimal number of blocks

cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, A, Nl, gpu_C, gpu_j_lut, Nlut_j, r_min, dr);

//free the LUT and condenser tables
HANDLE_ERROR( cudaFree(gpu_C) );
HANDLE_ERROR( cudaFree(gpu_j_lut) );
}
#endif

/// Calculate the analytical solution to a scalar point spread function given a set of spherical coordinates about the PSF (beam propagation along phi = theta = 0)
template<typename T>
void cpu_scalar_psf_local(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl){
T k = (T)stim::TAU / lambda;
size_t C_bytes = (Nl + 1) * sizeof(T);
T* C = (T*) malloc( C_bytes );					//allocate space for the aperture integral terms
cpu_aperture_integral(C, Nl, NA, NA_in);			//calculate the aperture integral terms
memset(F, 0, N * sizeof(stim::complex<T>));
T jl, Pl, kr, cos_phi;

double vm;
double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );

T* Pl_cos_phi = (T*) malloc((Nl + 1) * sizeof(T));

for(size_t n = 0; n < N; n++){								//for each point in the field
kr = k * r[n];											//calculate kr (the optical distance between the focal point and p)
cos_phi = std::cos(phi[n]);								//calculate the cosine of phi
stim::bessjyv_sph<double>(Nl, kr, vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
stim::legendre<T>(Nl, cos_phi, Pl_cos_phi);				//calculate the [0 Nl] legendre polynomials for this point

for(int l = 0; l <= Nl; l++){
jl = (T)jv[l];
Pl = Pl_cos_phi[l];
F[n] += pow(complex<T>(0, 1), l) * jl * Pl * C[l];
}
F[n] *= A * stim::TAU;
}

free(C);
free(Pl_cos_phi);
}

/// Converts a set of cartesian points into spherical coordinates surrounding a point spread function (PSF)
/// @param r is the output distance from the PSF
/// @param phi is the non-symmetric direction about the PSF
/// @param x (x, y, z) are the cartesian coordinates in world space
/// @f is the focal point of the PSF in cartesian coordinates
/// @d is the propagation direction of the PSF in cartesian coordinates
template<typename T>
__global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::vec3<T> f, stim::quaternion<T> q){

size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
if(i >= N) return;												//exit if this thread is outside the array

stim::vec3<T> p;									//declare a 3D point

(x == NULL) ? p[0] = 0 : p[0] = x[i];				// test for NULL values and set positions
(y == NULL) ? p[1] = 0 : p[1] = y[i];
(z == NULL) ? p[2] = 0 : p[2] = z[i];

p = p - f;											//shift the point to the center of the PSF (focal point)
p = q.toMatrix3() * p;								//rotate the point to align with the propagation direction

stim::vec3<T> ps = p.cart2sph();									//convert from cartesian to spherical coordinates
r[i] = ps[0];										//store r
phi[i] = ps[2];										//phi = [0 pi]
}

#ifdef CUDA_FOUND
/// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates
template<typename T>
void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){

T* gpu_r;															//allocate space for the coordinates in r
HANDLE_ERROR( cudaMalloc(&gpu_r, sizeof(T) * N) );
T* gpu_phi;
HANDLE_ERROR( cudaMalloc(&gpu_phi, sizeof(T) * N) );
//stim::complex<T>* gpu_E;
//HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) );

stim::quaternion<T> q;												//create a quaternion
q.CreateRotation(d, stim::vec3<T>(0, 0, 1));						//create a mapping from the propagation direction to the PSF space
dim3 blocks( (unsigned)(N / threads + 1));							//calculate the optimal number of blocks
cuda_cart2psf<T> <<< blocks, threads >>> (gpu_r, gpu_phi, N, x, y, z, f, q);	//call the CUDA kernel to move the cartesian coordinates to PSF space

gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing);

}
#endif

template<typename T>
void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){

// If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel
#ifdef CUDA_FOUND

T* gpu_x = NULL;
if(x != NULL){
HANDLE_ERROR( cudaMalloc(&gpu_x, sizeof(T) * N) );
HANDLE_ERROR( cudaMemcpy(gpu_x, x, sizeof(T) * N, cudaMemcpyHostToDevice) );
}
T* gpu_y = NULL;
if(y != NULL){
HANDLE_ERROR( cudaMalloc(&gpu_y, sizeof(T) * N) );
HANDLE_ERROR( cudaMemcpy(gpu_y, y, sizeof(T) * N, cudaMemcpyHostToDevice) );
}
T* gpu_z = NULL;
if(z != NULL){
HANDLE_ERROR( cudaMalloc(&gpu_z, sizeof(T) * N) );
HANDLE_ERROR( cudaMemcpy(gpu_z, z, sizeof(T) * N, cudaMemcpyHostToDevice) );
}

stim::complex<T>* gpu_E;
HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) );
HANDLE_ERROR( cudaMemcpy(gpu_E, E, sizeof(stim::complex<T>) * N, cudaMemcpyHostToDevice) );
gpu_scalar_psf_cart<T>(gpu_E, N, gpu_x, gpu_y, gpu_z, lambda, A, f, d, NA, NA_in, Nl, r_spacing);
HANDLE_ERROR( cudaMemcpy(E, gpu_E, sizeof(stim::complex<T>) * N, cudaMemcpyDeviceToHost) );

HANDLE_ERROR( cudaFree(gpu_x) );
HANDLE_ERROR( cudaFree(gpu_y) );
HANDLE_ERROR( cudaFree(gpu_z) );
HANDLE_ERROR( cudaFree(gpu_E) );

#else
T* r = (T*) malloc(N * sizeof(T));					//allocate space for p in spherical coordinates
T* phi = (T*) malloc(N * sizeof(T));				//	only r and phi are necessary (the scalar PSF is symmetric about theta)

stim::quaternion<T> q;
q.CreateRotation(d, stim::vec3<T>(0, 0, 1));
stim::matrix<T, 3> R = q.toMatrix3();
stim::vec3<T> p, ps, ds;
for(size_t i = 0; i < N; i++){
(x == NULL) ? p[0] = 0 : p[0] = x[i];	// test for NULL values and set positions
(y == NULL) ? p[1] = 0 : p[1] = y[i];
(z == NULL) ? p[2] = 0 : p[2] = z[i];

p = p - f;

p = R * p;					//rotate the cartesian point

ps = p.cart2sph();						//convert from cartesian to spherical coordinates
r[i] = ps[0];							//store r
phi[i] = ps[2];							//phi = [0 pi]
}

cpu_scalar_psf_local(E, N, r, phi, lambda, A, NA, NA_in, Nl);		//call the spherical coordinate CPU function

free(r);
free(phi);
#endif
}

/// Class stim::beam represents a beam of light focused at a point and composed of several plane waves
template<typename T>
class scalarbeam
{
public:
//enum beam_type {Uniform, Bartlett, Hamming, Hanning};

private:

T NA[2];				//numerical aperature of the focusing optics
vec3<T> f;				//focal point
vec3<T> d;				//propagation direction
T A;		//beam amplitude
T lambda;				//beam wavelength
public:

///constructor: build a default beam (NA=1.0)
scalarbeam(T wavelength = 1, T amplitude = 1, vec3<T> focal_point = vec3<T>(0, 0, 0), vec3<T> direction = vec3<T>(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){
lambda = wavelength;
A = amplitude;
f = focal_point;
d = direction.norm();					//make sure that the direction vector is normalized (makes calculations more efficient later on)
NA[0] = numerical_aperture;
NA[1] = center_obsc;
}

///Numerical Aperature functions
void setNA(T na)
{
NA[0] = (T)0;
NA[1] = na;
}
void setNA(T na0, T na1)
{
NA[0] = na0;
NA[1] = na1;
}

//Monte-Carlo decomposition into plane waves
std::vector< scalarwave<T> > mc(size_t N = 100000) const{

std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]);	//generate a random set of N vectors forming a focus
std::vector< scalarwave<T> > samples(N);											//create a vector of plane waves
T kmag = (T)stim::TAU / lambda;								//calculate the wavenumber
stim::complex<T> apw;										//allocate space for the amplitude at the focal point
T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N);			//constant value weights plane waves based on the aperture and number of samples (N)
stim::vec3<T> kpw;											//declare the new k-vector based on the focused plane wave direction
for(size_t i=0; i<N; i++){										//for each sample
kpw = dirs[i] * kmag;									//calculate the k-vector for the new plane wave
apw = a * exp(stim::complex<T>(0, kpw.dot(-f)));				//calculate the amplitude for the new plane wave
samples[i] = scalarwave<T>(kpw, apw);			//create a plane wave based on the direction
}
return samples;
}

/// Evaluate the beam to a scalar field using Debye focusing
void eval(stim::scalarfield<T>& E, size_t order = 500){
size_t array_size = E.grid_bytes();
T* X = (T*) malloc( array_size );			//allocate space for the coordinate meshes
T* Y = (T*) malloc( array_size );
T* Z = (T*) malloc( array_size );

E.meshgrid(X, Y, Z, stim::CPUmem);			//calculate the coordinate meshes
cpu_scalar_psf_cart<T>(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing());

free(X);									//free the coordinate meshes
free(Y);
free(Z);
}

/// Calculate the field at a given point
/// @param x is the x-coordinate of the field point
/// @O is the approximation accuracy
stim::complex<T> field(T x, T y, T z, size_t O){
std::vector< scalarwave<T> > W = mc(O);
T result = 0;											//initialize the result to zero (0)
for(size_t i = 0; i < O; i++){							//for each plane wave
result += W[i].pos(x, y, z);
}
return result;
}

std::string str()
{
std::stringstream ss;
ss<<"Beam:"<<std::endl;
//ss<<"	Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl;
ss<<"	Beam Direction: "<<d<<std::endl;
if(NA[0] == 0)
ss<<"	NA: "<<NA[1];
else
ss<<"	NA: "<<NA[0]<<" -- "<<NA[1];

return ss.str();
}

};			//end beam
}			//end namespace stim

#endif
``````