#ifndef STIM_MIE_H
#define STIM_MIE_H

#include <vector>
#include "scalarwave.h"
#include "../math/bessel.h"
#include "../cuda/cudatools/devices.h"
#include <cmath>

namespace stim{

/// Calculate the scattering coefficients for a spherical scatterer
template<typename T>
void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){

	//temporary variables
	double vm;															//allocate space to store the return values for the bessel function calculation
	double* j_ka = (double*) malloc( (Nl + 2) * sizeof(double) );
	double* y_ka = (double*) malloc( (Nl + 2) * sizeof(double) );
	double* dj_ka= (double*) malloc( (Nl + 2) * sizeof(double) );
	double* dy_ka= (double*) malloc( (Nl + 2) * sizeof(double) );

	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );

	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)

	stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
	stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a

	stim::complex<double> h_ka, dh_ka;
	stim::complex<double> numerator, denominator;
	stim::complex<double> i(0, 1);
	for(int l = 0; l <= Nl; l++){
		h_ka.r = j_ka[l];												//the spherical Hankel function h_l = j_l + i*y_l
		h_ka.i = y_ka[l];
		dh_ka.r = dj_ka[l];
		dh_ka.i = dy_ka[l];

		numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l];
		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
		B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
	}

	//free memory
	free(j_ka); free(y_ka); free(dj_ka); free(dy_ka);
	free(j_kna); free(y_kna); free(dj_kna); free(dy_kna);
}

/// Calculate the internal-field coefficients for a spherical scatterer
template<typename T>
void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){

	//temporary variables
	double vm;															//allocate space to store the return values for the bessel function calculation
	double* j_ka = (double*) malloc( (Nl + 2) * sizeof(double) );
	double* y_ka = (double*) malloc( (Nl + 2) * sizeof(double) );
	double* dj_ka= (double*) malloc( (Nl + 2) * sizeof(double) );
	double* dy_ka= (double*) malloc( (Nl + 2) * sizeof(double) );

	stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 2) * sizeof(stim::complex<double>) );

	double ka = k * a;													//store k*a (argument for spherical bessel and Hankel functions)
	stim::complex<double> kna = k * n * a;								//store k*n*a (argument for spherical bessel functions and derivatives)

	stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka);			//calculate bessel functions and derivatives for k*a
	stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna);		//calculate complex bessel functions for k*n*a

	stim::complex<double> h_ka, dh_ka;
	stim::complex<double> numerator, denominator;
	stim::complex<double> i(0, 1);
	for(int l = 0; l <= Nl; l++){
		h_ka.r = j_ka[l];
		h_ka.i = y_ka[l];
		dh_ka.r = dj_ka[l];
		dh_ka.i = dy_ka[l];

		numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka;
		denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
		A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
	}

	//free memory
	free(j_ka); free(y_ka); free(dj_ka); free(dy_ka);
	free(j_kna); free(y_kna); free(dj_kna); free(dy_kna);
}
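// Example usage (illustrative sketch only; the wavelength, radius, and refractive index below are
// arbitrary values, and the order count Nl mirrors the estimate used by the field routines later in
// this file):
//
//		double lambda = 1.0;											//vacuum wavelength (arbitrary units)
//		double k = 2.0 * 3.14159265358979 / lambda;						//wavenumber
//		double a = 0.5;													//sphere radius (same units as lambda)
//		stim::complex<double> n(1.4, 0.01);								//complex refractive index of the sphere
//		int Nl = (int)ceil(k * a + 4 * cbrt(k * a) + 2);				//number of orders
//		stim::complex<double>* B = (stim::complex<double>*) malloc(sizeof(stim::complex<double>) * (Nl + 1));
//		stim::complex<double>* A = (stim::complex<double>*) malloc(sizeof(stim::complex<double>) * (Nl + 1));
//		stim::B_coefficients(B, a, k, n, Nl);							//coefficients for the scattered (external) field
//		stim::A_coefficients(A, a, k, n, Nl);							//coefficients for the internal field
//		free(A); free(B);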
#define LOCAL_NL	16

/// CUDA kernel for calculating the Mie scattering solution given a set of points (x, y, z), a list of plane waves, and a look-up table for Bl*hl
/// @param E (GPU) is the destination scalar field containing N sample points
/// @param N is the number of sample points to evaluate
/// @param x (GPU) is the grid of X coordinates for each point in E
/// @param y (GPU) is the grid of Y coordinates for each point in E
/// @param z (GPU) is the grid of Z coordinates for each point in E
/// @param W (GPU) is an array of coherent scalar plane waves incident on the Mie scatterer
/// @param nW is the number of plane waves to evaluate (sum)
/// @param a is the radius of the Mie scatterer
/// @param n is the complex refractive index of the Mie scatterer
/// @param c is the position of the sphere in (x, y, z)
/// @param hB (GPU) is a look-up table of Hankel functions (equally spaced in distance from the sphere) pre-multiplied with scattering coefficients
/// @param r_min is the minimum r value in the hB look-up table (corresponding to the closest point to the sphere)
/// @param dr is the spacing (in r) between samples of the hB look-up table
/// @param N_hB is the number of samples in hB
/// @param Nl is the order of the calculation (number of Hankel function orders)
template<typename T>
__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){
	extern __shared__ stim::complex<T> shared_hB[];						//declare the shared-memory portion of the h_l * B_l look-up table

	size_t i = blockIdx.x * blockDim.x + threadIdx.x;					//get the index into the sample array (sample point associated with this thread)
	if(i >= N) return;													//exit if this thread is outside the array

	stim::vec3<T> p;
	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
	(y == NULL) ? p[1] = 0 : p[1] = y[i];
	(z == NULL) ? p[2] = 0 : p[2] = z[i];
	p = p - c;
	T r = p.len();														//calculate the distance from the sphere
	if(r < a) return;													//exit if the point is inside the sphere (only the scattered external field is calculated here)

	T fij = (r - r_min)/dr;												//FP index into the spherical bessel LUT
	size_t ij = (size_t) fij;											//convert to an integral index
	T alpha = fij - ij;													//calculate the fractional portion of the index
	size_t n0j = ij * (Nl + 1);											//start of the first entry in the LUT
	size_t n1j = (ij+1) * (Nl + 1);										//start of the second entry in the LUT

	T cos_phi;
	T Pl_2, Pl_1, Pl;													//declare registers to store the previous two Legendre polynomials

	stim::complex<T> hBl;
	stim::complex<T> Ei = 0;											//create a register to store the result
	int l;

	stim::complex<T> hlBl[LOCAL_NL+1];									//the first LOCAL_NL components are stored in registers for speed
	int shared_start = threadIdx.x * (Nl - LOCAL_NL);					//wrap up some operations so that they aren't done in the main loops

	//unroll LOCAL_NL + 1
	#pragma unroll 17													//copy the first LOCAL_NL+1 h_l * B_l components to registers
	for(l = 0; l <= LOCAL_NL; l++)
		hlBl[l] = clerp( hB[n0j + l], hB[n1j + l], alpha );

	for(l = LOCAL_NL+1; l <= Nl; l++)									//copy any additional h_l * B_l components to shared memory
		shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp( hB[n0j + l], hB[n1j + l], alpha );

	stim::complex<T> e, Ew;
	for(size_t w = 0; w < nW; w++){										//for each plane wave
		cos_phi = p.norm().dot(W[w].kvec().norm());						//calculate the cosine of the angle between the k vector and the direction from the sphere
		Pl_2 = 1;														//the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
		Pl_1 = cos_phi;
		e = exp(stim::complex<T>(0, W[w].kvec().dot(c)));				//phase shift of the plane wave at the sphere center
		Ew = W[w].E() * e;
		Ei += Ew * hlBl[0] * Pl_2;										//unroll the first two orders using the initial steps of the Legendre recursive relation
		Ei += Ew * hlBl[1] * Pl_1;

		//LOCAL_NL - 1
		#pragma unroll 15												//unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
		for(l = 2; l <= LOCAL_NL; l++){
			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);	//calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
			Ei += Ew * hlBl[l] * Pl;									//calculate and sum the current field order
			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;
		}

		for(l = LOCAL_NL+1; l <= Nl; l++){								//do the same as above, except for any additional orders that are stored in shared memory (not registers)
			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);	//again, this is where most computation in the kernel occurs
			Ei += Ew * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl;
			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;
		}
	}
	E[i] += Ei;															//accumulate the result in device memory
}
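// In effect, each thread above evaluates the scattered-field partial wave series for its sample point p:
//
//		E_s(p) = sum_w E_w * sum_l  B_l * h_l(k*r) * P_l(cos(phi)),		r = |p - c|
//
// where h_l = j_l + i*y_l is the spherical Hankel function, B_l are the scattering coefficients from
// B_coefficients() (which already include the (2l+1)*i^l prefactor), P_l is generated with the three-term
// Legendre recurrence l*P_l(x) = (2l-1)*x*P_{l-1}(x) - (l-1)*P_{l-2}(x), and E_w is the plane-wave
// amplitude phase-shifted to the sphere center. The hB look-up table already stores the product
// h_l(k*r)*B_l, so the kernel only linearly interpolates the table (clerp) and accumulates the sum.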
///Calculate the scalar Mie scattered field on the GPU when a GPU-based look-up table of pre-multiplied Hankel functions is available
/// @param E (GPU) is the destination scalar field containing N sample points
/// @param N is the number of sample points in the scalar field
/// @param x (GPU) is the grid of X coordinates for each point in E
/// @param y (GPU) is the grid of Y coordinates for each point in E
/// @param z (GPU) is the grid of Z coordinates for each point in E
/// @param W (GPU) is an array of coherent scalar plane waves incident on the Mie scatterer
/// @param nW is the number of plane waves to evaluate (sum)
/// @param a is the radius of the Mie scatterer
/// @param n is the complex refractive index of the Mie scatterer
/// @param c is the position of the sphere in (x, y, z)
/// @param hB (GPU) is a look-up table of Hankel functions (equally spaced in distance from the sphere) pre-multiplied with scattering coefficients
/// @param r_min is the minimum r value in the hB look-up table (corresponding to the closest point to the sphere)
/// @param dr is the spacing (in r) between samples of the hB look-up table
/// @param N_hB is the number of samples in hB
/// @param Nl is the order of the calculation (number of Hankel function orders)
template<typename T>
void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, size_t Nl){

	size_t max_shared_mem = stim::sharedMemPerBlock();					//get the amount of shared memory per block
	size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);				//calculate the number of bytes required to store the LUT corresponding to a single sample in shared memory
	int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);		//calculate the optimal number of threads per block (make sure it's divisible by the warp size, 32)
	dim3 blocks((unsigned)(N / threads + 1));							//calculate the optimal number of blocks

	size_t shared_mem;
	if(Nl <= LOCAL_NL) shared_mem = 0;									//no shared memory is needed if all orders fit in registers
	else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL);	//amount of shared memory to allocate
	//std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
	cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, c, hB, r_min, dr, N_hB, (int)Nl);	//call the kernel
}
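// Worked example of the launch configuration above (illustrative numbers): with 48 KB (49152 bytes) of
// shared memory per block, T = float, and Nl = 32, each sample needs
// hBl_array = sizeof(stim::complex<float>) * 33 = 264 bytes, so
// threads = (49152 / 264) / 32 * 32 = 186 / 32 * 32 = 160 threads per block, and the kernel is launched
// with shared_mem = 160 * 8 * (32 - 16) = 20480 bytes (the first LOCAL_NL + 1 = 17 orders stay in registers).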
template<typename T>
__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
	size_t i = blockIdx.x * blockDim.x + threadIdx.x;					//get the index into the array
	if(i >= N) return;													//exit if this thread is outside the array

	stim::vec3<T> p;
	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
	(y == NULL) ? p[1] = 0 : p[1] = y[i];
	(z == NULL) ? p[2] = 0 : p[2] = z[i];

	r[i] = (p - c).len();												//store the distance from the sphere center
}
///Calculate the scalar Mie scattered field on the GPU
/// @param E (GPU) is the destination scalar field containing N sample points
/// @param N is the number of sample points of the scalar field
/// @param x (GPU) is the grid of X coordinates for each point in E
/// @param y (GPU) is the grid of Y coordinates for each point in E
/// @param z (GPU) is the grid of Z coordinates for each point in E
/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer
/// @param a is the radius of the Mie scatterer
/// @param n is the complex refractive index of the Mie scatterer
/// @param c is the position of the sphere in (x, y, z)
/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables)
template<typename T>
void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0), T r_spacing = 0.1){

	//calculate the necessary number of orders required to represent the scattered field
	T k = W[0].kmag();
	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);					//calculate the number of orders required to represent the sphere
	if(Nl < LOCAL_NL) Nl = LOCAL_NL;									//always do at least the minimum number of local operations (kernel optimization)

	//calculate the scattering coefficients for the sphere
	stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
	B_coefficients(B, a, k, n, Nl);										//calculate the scattering coefficients

	//			PLANE WAVES
	stim::scalarwave<T>* dev_W;											//allocate space and copy plane waves
	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );

	//			BESSEL FUNCTION LOOK-UP TABLE
	//calculate the distance from the sphere center at each sample point and store the result in dev_r
	T* dev_r;															//declare the device pointer to hold the distance from the sphere center
	HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );					//allocate space for the array of distances

	int threads = stim::maxThreadsPerBlock();							//query the device to find the maximum number of threads per block
	dim3 blocks((unsigned)(N / threads + 1));							//calculate the number of blocks necessary to evaluate the total number of sample points N
	cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N, c);			//calculate the distance from the sphere center at each sample point

	//Use the cuBLAS library to find the minimum and maximum distances from the sphere center.
	//	This will be used to create a look-up table for the Hankel functions.
	cublasStatus_t stat;
	cublasHandle_t handle;
	stat = cublasCreate(&handle);										//create a cuBLAS handle
	if (stat != CUBLAS_STATUS_SUCCESS){									//test for failure
		printf ("CUBLAS initialization failed\n");
		exit(1);
	}

	int i_min, i_max;
	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);				//note: Isamin/Isamax assume that T is a single-precision float
	if (stat != CUBLAS_STATUS_SUCCESS){									//test for failure
		printf ("CUBLAS Error: failed to calculate minimum r value.\n");
		exit(1);
	}
	stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
	if (stat != CUBLAS_STATUS_SUCCESS){									//test for failure
		printf ("CUBLAS Error: failed to calculate maximum r value.\n");
		exit(1);
	}
	cublasDestroy(handle);												//destroy the cuBLAS handle

	i_min--;															//cuBLAS uses 1-based indexing for Fortran compatibility
	i_max--;
	T r_min, r_max;														//allocate space to store the minimum and maximum values
	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );

	r_min = max(r_min, a);												//if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere)

	size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1);		//number of values in the look-up table based on the user-specified spacing along r

	//Declare and evaluate variables used to calculate the spherical Bessel functions and store them temporarily on the CPU
	double vm;															//allocate space to store the return values for the bessel function calculation
	double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );

	size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;		//calculate the number of bytes necessary to store the Hankel function LUT
	stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes);	//pointer to the look-up table
	T dr = (r_max - r_min) / (N_hB_lut-1);								//calculate the optimal distance between values in the LUT

	stim::complex<T> hl;												//declare a complex value for the Hankel function result
	for(size_t ri = 0; ri < N_hB_lut; ri++){							//for each value in the LUT
		stim::bessjyv_sph(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
		for(int l = 0; l <= Nl; l++){									//for each order
			hl.r = (T)jv[l];											//generate the spherical Hankel function from the Bessel functions
			hl.i = (T)yv[l];

			hB_lut[ri * (Nl + 1) + l] = hl * B[l];						//pre-multiply the Hankel function by the scattering coefficients
		}
	}

	//Copy the pre-multiplied Hankel function look-up table to the GPU - this LUT gives a list of uniformly spaced Hankel function values pre-multiplied by scattering coefficients
	stim::complex<T>* dev_hB_lut;
	HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
	HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );

	//calculate the Mie scattering solution on the GPU
	gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, c, dev_hB_lut, r_min, dr, N_hB_lut, Nl);

	//HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost));		//copy the field from device memory

	HANDLE_ERROR(cudaFree(dev_hB_lut));									//free GPU memory
	HANDLE_ERROR(cudaFree(dev_r));
	HANDLE_ERROR(cudaFree(dev_W));

	free(hB_lut);														//free the host-side look-up table and Bessel function buffers
	free(jv); free(yv); free(djv); free(dyv);
	free(B);
}
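// Example call (illustrative sketch; dev_E, dev_x, dev_y, and dev_z are assumed to be device pointers with
// N samples each, and W0 is assumed to be a stim::scalarwave<float> built by the caller -- its construction
// is defined in scalarwave.h and not shown here):
//
//		std::vector< stim::scalarwave<float> > waves(1, W0);			//plane-wave decomposition of the incident field
//		HANDLE_ERROR( cudaMemset(dev_E, 0, N * sizeof(stim::complex<float>)) );	//zero the field first -- the kernel accumulates with +=
//		stim::gpu_scalar_mie_scatter<float>(dev_E, N, dev_x, dev_y, dev_z, waves,
//			0.5f, stim::complex<float>(1.4f, 0.01f),					//sphere radius and refractive index
//			stim::vec3<float>(0, 0, 0), 0.1f);							//sphere center and LUT spacing (r_spacing)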
/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
/// @param E is a pointer to the destination field values
/// @param N is the number of points used to calculate the field
/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
/// @param W is an array of plane waves that will be scattered
/// @param a is the radius of the sphere
/// @param n is the complex refractive index of the sphere
template<typename T>
void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0), T r_spacing = 0.1){

/*#ifdef CUDA_FOUND
	stim::complex<T>* dev_E;											//allocate space for the field
	cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
	cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
	//cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>));				//set the field to zero (necessary because a sum is used)

	//			COORDINATES
	T* dev_x = NULL;													//allocate space and copy the X coordinate (if specified)
	if(x != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_y = NULL;													//allocate space and copy the Y coordinate (if specified)
	if(y != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_z = NULL;													//allocate space and copy the Z coordinate (if specified)
	if(z != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
	}

	gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, c, r_spacing);

	if(x != NULL) cudaFree(dev_x);										//free everything
	if(y != NULL) cudaFree(dev_y);
	if(z != NULL) cudaFree(dev_z);
	cudaFree(dev_E);
#else
*/
	//calculate the necessary number of orders required to represent the scattered field
	T k = W[0].kmag();
	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
	if(Nl < LOCAL_NL) Nl = LOCAL_NL;									//always do at least the minimum number of local operations (kernel optimization)
	//std::cout<<"Nl: "<<Nl<<std::endl;

	//calculate the scattering coefficients for the sphere
	stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
	B_coefficients(B, a, k, n, Nl);

	//allocate space to store the bessel function call results
	double vm;
	double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
	double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) );

	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );							//allocate space for the Legendre polynomial results

	T r, kr, cos_phi;
	stim::complex<T> h;
	stim::complex<T> Ew;
	for(size_t i = 0; i < N; i++){
		stim::vec3<T> p;												//declare a 3D point
		(x == NULL) ? p[0] = 0 : p[0] = x[i];							// test for NULL values and set positions
		(y == NULL) ? p[1] = 0 : p[1] = y[i];
		(z == NULL) ? p[2] = 0 : p[2] = z[i];
		p = p - c;
		r = p.len();
		if(r >= a){														//the scattered field is only evaluated outside of the sphere
			for(size_t w = 0; w < W.size(); w++){
				Ew = W[w].E() * exp(stim::complex<T>(0, W[w].kvec().dot(c)));	//apply the phase shift of the plane wave at the sphere center
				kr = p.len() * W[w].kmag();								//calculate k*r
				stim::bessjyv_sph(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr);
				cos_phi = p.norm().dot(W[w].kvec().norm());				//calculate the cosine of the angle from the propagating direction
				stim::legendre(Nl, cos_phi, P);

				for(int l = 0; l <= Nl; l++){
					h.r = j_kr[l];
					h.i = y_kr[l];
					E[i] += Ew * B[l] * h * P[l];						//sum the partial wave for the current order
				}
			}
		}
	}
//#endif

	free(j_kr); free(y_kr); free(dj_kr); free(dy_kr);					//free the Bessel function results
	free(P);
	free(B);
}

template<typename T>
void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0), T r_spacing = 0.1){
	std::vector< stim::scalarwave<T> > W(1, w);
	cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, c, r_spacing);
}
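// Example call (illustrative sketch; w is assumed to be an existing stim::scalarwave<float> describing the
// incident plane wave -- its construction is defined in scalarwave.h and not shown here):
//
//		size_t N = 512 * 512;											//number of sample points
//		float* x = (float*) malloc(N * sizeof(float));					//fill with the X coordinate of each sample...
//		float* z = (float*) malloc(N * sizeof(float));					//...and the Z coordinate (passing y = NULL treats y as 0)
//		stim::complex<float>* E = (stim::complex<float>*) calloc(N, sizeof(stim::complex<float>));	//zeroed, since the field is accumulated
//		stim::cpu_scalar_mie_scatter<float>(E, N, x, NULL, z, w,
//			0.5f, stim::complex<float>(1.4f, 0.01f));					//sphere radius and refractive index (center and r_spacing use their defaults)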
template<typename T>
__global__ void cuda_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, int Nl){
	extern __shared__ stim::complex<T> shared_jA[];						//declare the shared-memory portion of the j_l * A_l look-up table

	size_t i = blockIdx.x * blockDim.x + threadIdx.x;					//get the index into the array
	if(i >= N) return;													//exit if this thread is outside the array

	stim::vec3<T> p;
	(x == NULL) ? p[0] = 0 : p[0] = x[i];								// test for NULL values and set positions
	(y == NULL) ? p[1] = 0 : p[1] = y[i];
	(z == NULL) ? p[2] = 0 : p[2] = z[i];
	T r = p.len();														//calculate the distance from the sphere
	if(r >= a) return;													//exit if the point is outside the sphere (only the internal field is calculated here)

	T fij = (r - r_min)/dr;												//FP index into the spherical bessel LUT
	size_t ij = (size_t) fij;											//convert to an integral index
	T alpha = fij - ij;													//calculate the fractional portion of the index
	size_t n0j = ij * (Nl + 1);											//start of the first entry in the LUT
	size_t n1j = (ij+1) * (Nl + 1);										//start of the second entry in the LUT

	T cos_phi;
	T Pl_2, Pl_1, Pl;													//declare registers to store the previous two Legendre polynomials

	stim::complex<T> jAl;
	stim::complex<T> Ei = 0;											//create a register to store the result
	int l;

	stim::complex<T> jlAl[LOCAL_NL+1];									//the first LOCAL_NL components are stored in registers for speed
	int shared_start = threadIdx.x * (Nl - LOCAL_NL);					//wrap up some operations so that they aren't done in the main loops

	#pragma unroll LOCAL_NL+1											//copy the first LOCAL_NL+1 j_l * A_l components to registers
	for(l = 0; l <= LOCAL_NL; l++)
		jlAl[l] = clerp( jA[n0j + l], jA[n1j + l], alpha );

	for(l = LOCAL_NL+1; l <= Nl; l++)									//copy any additional j_l * A_l components to shared memory
		shared_jA[shared_start + (l - (LOCAL_NL+1))] = clerp( jA[n0j + l], jA[n1j + l], alpha );

	for(size_t w = 0; w < nW; w++){										//for each plane wave
		if(r == 0) cos_phi = 0;
		else cos_phi = p.norm().dot(W[w].kvec().norm());				//calculate the cosine of the angle between the k vector and the direction from the sphere
		Pl_2 = 1;														//the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
		Pl_1 = cos_phi;
		Ei += W[w].E() * jlAl[0] * Pl_2;								//unroll the first two orders using the initial steps of the Legendre recursive relation
		Ei += W[w].E() * jlAl[1] * Pl_1;

		#pragma unroll LOCAL_NL-1										//unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
		for(l = 2; l <= LOCAL_NL; l++){
			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);	//calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
			Ei += W[w].E() * jlAl[l] * Pl;								//calculate and sum the current field order
			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;
		}

		for(l = LOCAL_NL+1; l <= Nl; l++){								//do the same as above, except for any additional orders that are stored in shared memory (not registers)
			Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l);	//again, this is where most computation in the kernel occurs
			Ei += W[w].E() * shared_jA[shared_start + l - LOCAL_NL - 1] * Pl;
			Pl_2 = Pl_1;												//shift Pl_1 -> Pl_2 and Pl -> Pl_1
			Pl_1 = Pl;
		}
	}
	E[i] = Ei;															//copy the result to device memory
}
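// In effect, each thread above evaluates the internal-field partial wave series for its sample point p (r = |p| < a):
//
//		E_int(p) = sum_w E_w * sum_l  A_l * j_l(k*n*r) * P_l(cos(phi))
//
// where j_l is the spherical Bessel function of the first kind evaluated at the complex argument k*n*r, A_l are
// the internal-field coefficients from A_coefficients(), and the jA look-up table stores the pre-multiplied
// product j_l(k*n*r)*A_l. Unlike the scattering kernel, the coordinates here are taken relative to the sphere
// center and the plane-wave amplitudes W[w].E() are used without an additional phase shift.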
template<typename T>
void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, size_t Nl){

	size_t max_shared_mem = stim::sharedMemPerBlock();					//get the amount of shared memory per block
	size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);				//calculate the number of bytes required to store the LUT corresponding to a single sample in shared memory
	//std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
	int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);		//calculate the optimal number of threads per block (make sure it's divisible by the warp size, 32)
	dim3 blocks((unsigned)(N / threads + 1));							//calculate the optimal number of blocks

	size_t shared_mem;
	if(Nl <= LOCAL_NL) shared_mem = 0;									//no shared memory is needed if all orders fit in registers
	else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL);	//amount of shared memory to allocate
	//std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
	cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl);	//call the kernel
}
/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere
/// @param E is a pointer to the destination field values
/// @param N is the number of points used to calculate the field
/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
/// @param W is an array of plane waves that will be scattered
/// @param a is the radius of the sphere
/// @param n is the complex refractive index of the sphere
template<typename T>
void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){

	//calculate the necessary number of orders required to represent the scattered field
	T k = W[0].kmag();
	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
	if(Nl < LOCAL_NL) Nl = LOCAL_NL;									//always do at least the minimum number of local operations (kernel optimization)
	//std::cout<<"Nl: "<<Nl<<std::endl;

	//calculate the internal-field coefficients for the sphere
	stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the coefficients
	A_coefficients(A, a, k, n, Nl);

/*#ifdef CUDA_FOUND
	stim::complex<T>* dev_E;											//allocate space for the field
	cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
	cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
	//cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>));				//set the field to zero (necessary because a sum is used)

	//			COORDINATES
	T* dev_x = NULL;													//allocate space and copy the X coordinate (if specified)
	if(x != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_y = NULL;													//allocate space and copy the Y coordinate (if specified)
	if(y != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_z = NULL;													//allocate space and copy the Z coordinate (if specified)
	if(z != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
	}

	//			PLANE WAVES
	stim::scalarwave<T>* dev_W;											//allocate space and copy plane waves
	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );

	//			BESSEL FUNCTION LOOK-UP TABLE
	//calculate the distance from the sphere center
	T* dev_r;
	HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );

	int threads = stim::maxThreadsPerBlock();
	dim3 blocks((unsigned)(N / threads + 1));
	cuda_dist <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);

	//Find the minimum and maximum values of r
	cublasStatus_t stat;
	cublasHandle_t handle;

	stat = cublasCreate(&handle);										//create a cuBLAS handle
	if (stat != CUBLAS_STATUS_SUCCESS){									//test for failure
		printf ("CUBLAS initialization failed\n");
		exit(1);
	}

	int i_min, i_max;
	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
	if (stat != CUBLAS_STATUS_SUCCESS){									//test for failure
		printf ("CUBLAS Error: failed to calculate minimum r value.\n");
		exit(1);
	}
	stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
	if (stat != CUBLAS_STATUS_SUCCESS){									//test for failure
		printf ("CUBLAS Error: failed to calculate maximum r value.\n");
		exit(1);
	}
	cublasDestroy(handle);												//destroy the cuBLAS handle

	i_min--;															//cuBLAS uses 1-based indexing for Fortran compatibility
	i_max--;
	T r_min, r_max;														//allocate space to store the minimum and maximum values
	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) );		//copy the min and max values from the device to the CPU
	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );

	r_max = min(r_max, a);												//the internal field doesn't exist outside of the sphere

	size_t N_jA_lut = (size_t)((r_max - r_min) / r_spacing + 1);

	//temporary variables
	double vm;															//allocate space to store the return values for the bessel function calculation
	stim::complex<double>* jv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* yv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* djv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dyv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );

	size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut;
	stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes);	//pointer to the look-up table
	T dr = (r_max - r_min) / (N_jA_lut-1);								//distance between values in the LUT
	//std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl;
	stim::complex<double> hl;
	stim::complex<double> nd = (stim::complex<double>)n;
	for(size_t ri = 0; ri < N_jA_lut; ri++){							//for each value in the LUT
		stim::cbessjyva_sph(Nl, nd * k * (r_min + ri * dr), vm, jv, yv, djv, dyv);		//compute the list of spherical bessel functions from [0 Nl]
		for(size_t l = 0; l <= Nl; l++){								//for each order
			jA_lut[ri * (Nl + 1) + l] = (stim::complex<T>)(jv[l] * (stim::complex<double>)A[l]);	//store the bessel function result
		}
	}

	//Allocate device memory and copy everything to the GPU
	stim::complex<T>* dev_jA_lut;
	HANDLE_ERROR( cudaMalloc(&dev_jA_lut, jA_bytes) );
	HANDLE_ERROR( cudaMemcpy(dev_jA_lut, jA_lut, jA_bytes, cudaMemcpyHostToDevice) );

	gpu_scalar_mie_internal(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_jA_lut, r_min, dr, N_jA_lut, Nl);

	cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost);		//copy the field from device memory

	if(x != NULL) cudaFree(dev_x);										//free everything
	if(y != NULL) cudaFree(dev_y);
	if(z != NULL) cudaFree(dev_z);
	HANDLE_ERROR( cudaFree(dev_jA_lut) );
	HANDLE_ERROR( cudaFree(dev_E) );
	HANDLE_ERROR( cudaFree(dev_W) );
	HANDLE_ERROR( cudaFree(dev_r) );
	HANDLE_ERROR( cudaFree(dev_E) );
#else
*/
	//allocate space to store the bessel function call results
	double vm;
	stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* y_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dj_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
	stim::complex<double>* dy_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );

	T* P = (T*) malloc( (Nl + 1) * sizeof(T) );							//allocate space for the Legendre polynomial results

	T r, cos_phi;
	stim::complex<double> knr;
	stim::complex<T> h;
	stim::complex<T> Ew;
	for(size_t i = 0; i < N; i++){
		stim::vec3<T> p;												//declare a 3D point
		(x == NULL) ? p[0] = 0 : p[0] = x[i];							// test for NULL values and set positions
		(y == NULL) ? p[1] = 0 : p[1] = y[i];
		(z == NULL) ? p[2] = 0 : p[2] = z[i];
		p = p - c;
		r = p.len();
		if(r < a){														//the internal field is only evaluated inside of the sphere
			E[i] = 0;
			for(size_t w = 0; w < W.size(); w++){
				Ew = W[w].E() * exp(stim::complex<T>(0, W[w].kvec().dot(c)));	//apply the phase shift of the plane wave at the sphere center
				knr = (stim::complex<double>)n * p.len() * W[w].kmag();			//calculate k*n*r
				stim::cbessjyva_sph(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr);
				if(r == 0) cos_phi = 0;
				else cos_phi = p.norm().dot(W[w].kvec().norm());		//calculate the cosine of the angle from the propagating direction
				stim::legendre(Nl, cos_phi, P);

				for(int l = 0; l <= Nl; l++){
					E[i] += Ew * A[l] * (stim::complex<T>)j_knr[l] * P[l];		//sum the partial wave for the current order
				}
			}
		}
	}
//#endif

	free(j_knr); free(y_knr); free(dj_knr); free(dy_knr);				//free the Bessel function results
	free(P);
	free(A);
}

template<typename T>
void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
	std::vector< stim::scalarwave<T> > W(1, w);
	cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, c);
}

/// Class stim::scalarmie represents a scalar Mie scattering model that can be used to calculate the fields produced by a scattering sphere.
template<typename T>
class scalarmie
{
public:
	T radius;															//radius of the scattering sphere
	stim::complex<T> n;													//refractive index of the scattering sphere
	vec3<T> c;															//position of the sphere in space

public:

	scalarmie() {														//default constructor
		radius = 0.5;
		n = stim::complex<T>(1.4, 0.0);
		c = stim::vec3<T>(0, 0, 0);
	}

	scalarmie(T r, stim::complex<T> ri, stim::vec3<T> center = stim::vec3<T>(0, 0, 0)){
		radius = r;
		n = ri;
		c = center;
		//c = stim::vec3<T>(2, 1, 0);
	}

	//void sum_scat(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
	//	std::vector< stim::scalarwave<T> > wave_array = b.mc(samples);					//decompose the beam into an array of plane waves
	//	stim::cpu_scalar_mie_scatter(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
	//}

	//void sum_intern(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
	//	std::vector< stim::scalarwave<T> > wave_array = b.mc(samples);					//decompose the beam into an array of plane waves
	//	stim::cpu_scalar_mie_internal(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
	//}

	//void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int order = 500, int samples = 1000){
	//	b.eval(E, X, Y, Z, order);														//evaluate the incident field using a plane wave expansion
	//	std::vector< stim::scalarwave<T> > wave_array = b.mc(samples);					//decompose the beam into an array of plane waves
	//	sum_scat(E, X, Y, Z, b, samples);
	//	sum_intern(E, X, Y, Z, b, samples);
	//}

	void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){

		E.meshgrid();													//calculate a meshgrid if one isn't already available
		b.eval(E, order);												//evaluate the incident field using a plane wave expansion

		std::vector< stim::scalarwave<T> > wave_array = b.mc(samples);	//decompose the beam into an array of plane waves

		if(E.gpu()){
			stim::gpu_scalar_mie_scatter<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c, E.spacing());
		}
		else{
			stim::cpu_scalar_mie_scatter<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c, E.spacing());
			stim::cpu_scalar_mie_internal<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c);
		}
	}

};			//end stim::scalarmie
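// Example usage (illustrative sketch; stim::scalarfield and stim::scalarbeam are defined elsewhere in the
// library and their construction is not shown here -- only the scalarmie interface below is taken from this file):
//
//		stim::scalarmie<float> sphere(0.5f, stim::complex<float>(1.4f, 0.01f));	//0.5-unit sphere with n = 1.4 + 0.01i at the origin
//		sphere.eval(E, beam);											//E: stim::scalarfield<float>, beam: stim::scalarbeam<float>
//
// eval() evaluates the incident beam on the field grid (a plane-wave expansion of `order` terms), decomposes
// the beam into `samples` plane waves with b.mc(), and adds the scattered (and, on the CPU path, internal)
// Mie solution for this sphere.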
template<typename T>
class scalarcluster : public std::vector< scalarmie<T> > {
public:

	void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000) {

		E.meshgrid();													//calculate a meshgrid if one isn't already available
		b.eval(E, order);												//evaluate the incident field using a plane wave expansion

		std::vector< stim::scalarwave<T> > wave_array = b.mc(samples);	//decompose the beam into an array of plane waves

		T radius;
		stim::complex<T> n;
		stim::vec3<T> c;
		for (size_t si = 0; si < std::vector< scalarmie<T> >::size(); si++) {		//for each sphere in the cluster
			radius = std::vector< scalarmie<T> >::at(si).radius;
			n = std::vector< scalarmie<T> >::at(si).n;
			c = std::vector< scalarmie<T> >::at(si).c;
			if (E.gpu()) {
				stim::gpu_scalar_mie_scatter<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c, E.spacing());
			}
			else {
				stim::cpu_scalar_mie_scatter<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c);
				stim::cpu_scalar_mie_internal<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c);
			}
		}
	}
};			//end stim::scalarcluster

}			//end namespace stim

#endif