#ifndef RTS_BEAM #define RTS_BEAM #include #include "../math/vec3.h" #include "../optics/scalarwave.h" #include "../math/bessel.h" #include "../math/legendre.h" #include "../cuda/cudatools/devices.h" #include "../cuda/cudatools/timer.h" #include "../optics/scalarfield.h" #include #include #include #include namespace stim{ /// Function returns the value of the scalar field produced by a beam with the specified parameters template std::vector< stim::vec3 > generate_focusing_vectors(size_t N, stim::vec3 d, T NA, T NA_in = 0){ std::vector< stim::vec3 > dirs(N); //allocate an array to store the focusing vectors ///compute the rotation operator to transform (0, 0, 1) to k T cos_angle = d.dot(vec3(0, 0, 1)); stim::matrix_sq rotation; //if the cosine of the angle is -1, the rotation is just a flip across the z axis if(cos_angle == -1){ rotation(2, 2) = -1; } else if(cos_angle != 1.0) { vec3 r_axis = vec3(0, 0, 1).cross(d).norm(); //compute the axis of rotation T angle = acos(cos_angle); //compute the angle of rotation quaternion quat; //create a quaternion describing the rotation quat.CreateRotation(angle, r_axis); rotation = quat.toMatrix3(); //compute the rotation matrix } //find the phi values associated with the cassegrain ring T PHI[2]; PHI[0] = (T)asin(NA); PHI[1] = (T)asin(NA_in); //calculate the z-axis cylinder coordinates associated with these angles T Z[2]; Z[0] = cos(PHI[0]); Z[1] = cos(PHI[1]); T range = Z[0] - Z[1]; //draw a distribution of random phi, z values T z, phi, theta; //T kmag = stim::TAU / lambda; for(int i=0; i spherical(1, theta, phi); //convert from spherical to cartesian coordinates vec3 cart = spherical.sph2cart(); dirs[i] = rotation * cart; //create a sample vector } return dirs; } /// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional) /// @param C is a pointer to Nl + 1 values where the terms will be stored template CUDA_CALLABLE void cpu_aperture_integral(T* C, int Nl, T NA, T NA_in = 0){ size_t table_bytes = (Nl + 1) * sizeof(T); //calculate the number of bytes required to store the terms T cos_alpha_1 = cos(asin(NA_in)); //calculate the cosine of the angle subtended by the central obscuration T cos_alpha_2 = cos(asin(NA)); //calculate the cosine of the angle subtended by the aperture // the aperture integral is computed using four individual Legendre polynomials, each a function of the angles subtended // by the objective and central obscuration T* Pln_a1 = (T*) malloc(table_bytes); stim::legendre(Nl-1, cos_alpha_1, &Pln_a1[1]); Pln_a1[0] = 1; T* Pln_a2 = (T*) malloc(table_bytes); stim::legendre(Nl-1, cos_alpha_2, &Pln_a2[1]); Pln_a2[0] = 1; T* Plp_a1 = (T*) malloc(table_bytes+sizeof(T)); stim::legendre(Nl+1, cos_alpha_1, Plp_a1); T* Plp_a2 = (T*) malloc(table_bytes+sizeof(T)); stim::legendre(Nl+1, cos_alpha_2, Plp_a2); for(size_t l = 0; l <= Nl; l++){ C[l] = Plp_a1[l+1] - Plp_a2[l+1] - Pln_a1[l] + Pln_a2[l]; } free(Pln_a1); free(Pln_a2); free(Plp_a1); free(Plp_a2); } /// performs linear interpolation into a look-up table template CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val, T delta, size_t n_vals){ T idx = ((val - min_val) / delta); size_t i = (size_t) idx; T a1 = idx - i; T a0 = 1 - a1; size_t n0 = i * n_vals; size_t n1 = (i+1) * n_vals; for(size_t n = 0; n < n_vals; n++){ lut_values[n] = lut[n0 + n] * a0 + lut[n1 + n] * a1; } } template CUDA_CALLABLE stim::complex clerp(stim::complex v0, stim::complex v1, T t) { return stim::complex( fmaf(t, v1.r, fmaf(-t, v0.r, v0.r)), fmaf(t, v1.i, fmaf(-t, v0.i, v0.i)) ); } template CUDA_CALLABLE T lerp(T v0, T v1, T t) { return fmaf(t, v1, fmaf(-t, v0, v0)); } #ifdef CUDA_FOUND template __global__ void cuda_scalar_psf(stim::complex* E, size_t N, T* r, T* phi, T A, size_t Nl, T* C, T* lut_j, size_t Nj, T min_r, T dr){ size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array if(i >= N) return; //exit if this thread is outside the array T cos_phi = cos(phi[i]); //calculate the thread value for cos(phi) stim::complex Ei = 0; //initialize the value of the field to zero size_t NC = Nl + 1; //calculate the number of coefficients to be used T fij = (r[i] - min_r)/dr; //FP index into the spherical bessel LUT size_t ij = (size_t) fij; //convert to an integral index T a = fij - ij; //calculate the fractional portion of the index size_t n0j = ij * (NC); //start of the first entry in the LUT size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT T jl; //declare register to store the spherical bessel function T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials T Pl = 1; //initialize the current value for the Legendre polynomial stim::complex im(0, 1); //declare i (imaginary 1) stim::complex i_pow(1, 0); //i_pow stores the current value of i^l so it doesn't have to be re-computed every iteration for(int l = 0; l <= Nl; l++){ //for each order jl = lerp( lut_j[n0j + l], lut_j[n1j + l], a ); //read jl from the LUT and interpolate the result Ei += i_pow * jl * Pl * C[l]; //calculate the value for the field and sum i_pow *= im; //multiply i^l * i for the next iteration Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 Pl_1 = Pl; if(l == 0){ //computing Pl is done recursively, where the recursive relation Pl = cos_phi; // requires the first two orders. This defines the second. } else{ //if this is not the first iteration, use the recursive relation to calculate Pl Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1); } } E[i] = Ei * A * 2 * CUDART_PI_F; //scale the integral by the amplitude } template void gpu_scalar_psf_local(stim::complex* E, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl, T r_spacing){ //Find the minimum and maximum values of r cublasStatus_t stat; cublasHandle_t handle; stat = cublasCreate(&handle); //create a cuBLAS handle if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure printf ("CUBLAS initialization failed\n"); exit(1); } int i_min, i_max; stat = cublasIsamin(handle, (int)N, r, 1, &i_min); if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure printf ("CUBLAS Error: failed to calculate minimum r value.\n"); exit(1); } stat = cublasIsamax(handle, (int)N, r, 1, &i_max); if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure printf ("CUBLAS Error: failed to calculate maximum r value.\n"); exit(1); } cublasDestroy(handle); i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility i_max--; T r_min, r_max; //allocate space to store the minimum and maximum values HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); T k = (T)stim::TAU / lambda; //calculate the wavenumber from lambda size_t C_bytes = (Nl + 1) * sizeof(T); T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j; T* j_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT T jl; unsigned l; for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT for(l = 0; l <= (unsigned)Nl; l++){ //for each order jl = boost::math::sph_bessel(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result } } //stim::cpu2image(j_lut, "j_lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); //Allocate device memory and copy everything to the GPU T* gpu_C; HANDLE_ERROR( cudaMalloc(&gpu_C, C_bytes) ); HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) ); T* gpu_j_lut; HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) ); HANDLE_ERROR( cudaMemcpy(gpu_j_lut, j_lut, lutj_bytes, cudaMemcpyHostToDevice) ); int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks cuda_scalar_psf<<< blocks, threads >>>(E, N, r, phi, A, Nl, gpu_C, gpu_j_lut, Nlut_j, r_min, dr); //free the LUT and condenser tables HANDLE_ERROR( cudaFree(gpu_C) ); HANDLE_ERROR( cudaFree(gpu_j_lut) ); } #endif /// Calculate the analytical solution to a scalar point spread function given a set of spherical coordinates about the PSF (beam propagation along phi = theta = 0) template void cpu_scalar_psf_local(stim::complex* F, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl){ T k = (T)stim::TAU / lambda; size_t C_bytes = (Nl + 1) * sizeof(T); T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms memset(F, 0, N * sizeof(stim::complex)); T jl, Pl, kr, cos_phi; double vm; double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); T* Pl_cos_phi = (T*) malloc((Nl + 1) * sizeof(T)); for(size_t n = 0; n < N; n++){ //for each point in the field kr = k * r[n]; //calculate kr (the optical distance between the focal point and p) cos_phi = std::cos(phi[n]); //calculate the cosine of phi stim::bessjyv_sph(Nl, kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] stim::legendre(Nl, cos_phi, Pl_cos_phi); //calculate the [0 Nl] legendre polynomials for this point for(int l = 0; l <= Nl; l++){ jl = (T)jv[l]; Pl = Pl_cos_phi[l]; F[n] += pow(complex(0, 1), l) * jl * Pl * C[l]; } F[n] *= A * stim::TAU; } free(C); free(Pl_cos_phi); } /// Converts a set of cartesian points into spherical coordinates surrounding a point spread function (PSF) /// @param r is the output distance from the PSF /// @param phi is the non-symmetric direction about the PSF /// @param x (x, y, z) are the cartesian coordinates in world space /// @f is the focal point of the PSF in cartesian coordinates /// @d is the propagation direction of the PSF in cartesian coordinates template __global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::vec3 f, stim::quaternion q){ size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array if(i >= N) return; //exit if this thread is outside the array stim::vec3 p; //declare a 3D point (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions (y == NULL) ? p[1] = 0 : p[1] = y[i]; (z == NULL) ? p[2] = 0 : p[2] = z[i]; p = p - f; //shift the point to the center of the PSF (focal point) p = q.toMatrix3() * p; //rotate the point to align with the propagation direction stim::vec3 ps = p.cart2sph(); //convert from cartesian to spherical coordinates r[i] = ps[0]; //store r phi[i] = ps[2]; //phi = [0 pi] } #ifdef CUDA_FOUND /// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates template void gpu_scalar_psf_cart(stim::complex* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3 f, stim::vec3 d, T NA, T NA_in, int Nl, T r_spacing = 1){ T* gpu_r; //allocate space for the coordinates in r HANDLE_ERROR( cudaMalloc(&gpu_r, sizeof(T) * N) ); T* gpu_phi; HANDLE_ERROR( cudaMalloc(&gpu_phi, sizeof(T) * N) ); //stim::complex* gpu_E; //HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex) * N) ); stim::quaternion q; //create a quaternion q.CreateRotation(d, stim::vec3(0, 0, 1)); //create a mapping from the propagation direction to the PSF space stim::matrix_sq rot = q.toMatrix3(); int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks cuda_cart2psf <<< blocks, threads >>> (gpu_r, gpu_phi, N, x, y, z, f, q); //call the CUDA kernel to move the cartesian coordinates to PSF space gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing); HANDLE_ERROR( cudaFree(gpu_r) ); HANDLE_ERROR( cudaFree(gpu_phi) ); } #endif template void cpu_scalar_psf_cart(stim::complex* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3 f, stim::vec3 d, T NA, T NA_in, int Nl, T r_spacing = 1){ // If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel #ifdef CUDA_FOUND T* gpu_x = NULL; if(x != NULL){ HANDLE_ERROR( cudaMalloc(&gpu_x, sizeof(T) * N) ); HANDLE_ERROR( cudaMemcpy(gpu_x, x, sizeof(T) * N, cudaMemcpyHostToDevice) ); } T* gpu_y = NULL; if(y != NULL){ HANDLE_ERROR( cudaMalloc(&gpu_y, sizeof(T) * N) ); HANDLE_ERROR( cudaMemcpy(gpu_y, y, sizeof(T) * N, cudaMemcpyHostToDevice) ); } T* gpu_z = NULL; if(z != NULL){ HANDLE_ERROR( cudaMalloc(&gpu_z, sizeof(T) * N) ); HANDLE_ERROR( cudaMemcpy(gpu_z, z, sizeof(T) * N, cudaMemcpyHostToDevice) ); } stim::complex* gpu_E; HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex) * N) ); HANDLE_ERROR( cudaMemcpy(gpu_E, E, sizeof(stim::complex) * N, cudaMemcpyHostToDevice) ); gpu_scalar_psf_cart(gpu_E, N, gpu_x, gpu_y, gpu_z, lambda, A, f, d, NA, NA_in, Nl, r_spacing); HANDLE_ERROR( cudaMemcpy(E, gpu_E, sizeof(stim::complex) * N, cudaMemcpyDeviceToHost) ); HANDLE_ERROR( cudaFree(gpu_x) ); HANDLE_ERROR( cudaFree(gpu_y) ); HANDLE_ERROR( cudaFree(gpu_z) ); HANDLE_ERROR( cudaFree(gpu_E) ); #else T* r = (T*) malloc(N * sizeof(T)); //allocate space for p in spherical coordinates T* phi = (T*) malloc(N * sizeof(T)); // only r and phi are necessary (the scalar PSF is symmetric about theta) stim::quaternion q; q.CreateRotation(d, stim::vec3(0, 0, 1)); stim::matrix R = q.toMatrix3(); stim::vec3 p, ps, ds; for(size_t i = 0; i < N; i++){ (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions (y == NULL) ? p[1] = 0 : p[1] = y[i]; (z == NULL) ? p[2] = 0 : p[2] = z[i]; p = p - f; p = R * p; //rotate the cartesian point ps = p.cart2sph(); //convert from cartesian to spherical coordinates r[i] = ps[0]; //store r phi[i] = ps[2]; //phi = [0 pi] } cpu_scalar_psf_local(E, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function free(r); free(phi); #endif } /// Class stim::beam represents a beam of light focused at a point and composed of several plane waves template class scalarbeam { public: //enum beam_type {Uniform, Bartlett, Hamming, Hanning}; private: T NA[2]; //numerical aperature of the focusing optics vec3 f; //focal point vec3 d; //propagation direction T A; //beam amplitude T lambda; //beam wavelength public: ///constructor: build a default beam (NA=1.0) scalarbeam(T wavelength = 1, T amplitude = 1, vec3 focal_point = vec3(0, 0, 0), vec3 direction = vec3(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){ lambda = wavelength; A = amplitude; f = focal_point; d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on) NA[0] = center_obsc; NA[1] = numerical_aperture; } ///Numerical Aperature functions void setNA(T na) { NA[0] = (T)0; NA[1] = na; } void setNA(T na_in, T na_out) { NA[0] = na_in; NA[1] = na_out; } //Monte-Carlo decomposition into plane waves std::vector< scalarwave > mc(size_t N = 100000) const{ T kmag = (T)stim::TAU / lambda; //calculate the wavenumber stim::vec3 kpw; //declare the new k-vector based on the focused plane wave direction stim::complex apw; //allocate space for the amplitude at the focal point //deal with the degenerative case where the outer NA is 0, in which case the beam will be specified as a single plane wave if (NA[1] == 0) { std::vector< scalarwave > samples(1); //create a vector containing 1 sample kpw = d * kmag; //calculate the k-vector for the plane wave (beam direction scaled by wavenumber) apw = A * exp(stim::complex(0, kpw.dot(-f))); //calculate the amplitude of the plane wave samples[0] = scalarwave(kpw, apw); //create a plane wave based on the direction return samples; } //otherwise, evaluate the system using N monte-carlo samples std::vector< stim::vec3 > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus std::vector< scalarwave > samples(N); //create a vector of plane waves T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N) for(size_t i=0; i(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave samples[i] = scalarwave(kpw, apw); //create a plane wave based on the direction } return samples; } void eval(stim::scalarfield& E, T* X, T* Y, T* Z, int order = 500){ cpu_scalar_psf_cart(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[1], NA[0], order, E.spacing()); } /// Evaluate the beam to a scalar field using Debye focusing void eval(stim::scalarfield& E, int order = 500){ E.meshgrid(); //calculate a meshgrid if one isn't created if(E.gpu()) gpu_scalar_psf_cart(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[1], NA[0], order, E.spacing()); else cpu_scalar_psf_cart(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[1], NA[0], order, E.spacing()); } /// Calculate the field at a given point /// @param x is the x-coordinate of the field point /// @O is the approximation accuracy stim::complex field(T x, T y, T z, size_t O){ std::vector< scalarwave > W = mc(O); T result = 0; //initialize the result to zero (0) for(size_t i = 0; i < O; i++){ //for each plane wave result += W[i].pos(x, y, z); } return result; } std::string str() { std::stringstream ss; ss<<"Beam:"<