#ifndef STIM_MIE_H #define STIM_MIE_H #include "scalarwave.h" #include "../math/bessel.h" #include namespace stim{ /// Calculate the scattering coefficients for a spherical scatterer template void B_coefficients(stim::complex* B, T a, T k, stim::complex n, int Nl){ //temporary variables double vm; //allocate space to store the return values for the bessel function calculation double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a stim::complex h_ka, dh_ka; stim::complex numerator, denominator; stim::complex i(0, 1); for(size_t l = 0; l <= Nl; l++){ h_ka.r = j_ka[l]; h_ka.i = y_ka[l]; dh_ka.r = dj_ka[l]; dh_ka.i = dy_ka[l]; numerator = j_ka[l] * dj_kna[l] * (stim::complex)n - j_kna[l] * dj_ka[l]; denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; std::cout< void A_coefficients(stim::complex* A, T a, T k, stim::complex n, int Nl){ //temporary variables double vm; //allocate space to store the return values for the bessel function calculation double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a stim::complex h_ka, dh_ka; stim::complex numerator, denominator; stim::complex i(0, 1); for(size_t l = 0; l <= Nl; l++){ h_ka.r = j_ka[l]; h_ka.i = y_ka[l]; dh_ka.r = dj_ka[l]; dh_ka.i = dy_ka[l]; numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka; denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; } } template __global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* B, T* j, T kr_min, T dkr, int Nl){ extern __shared__ stim::scalarwave shared_W[]; //declare the list of waves in shared memory stim::cuda::sharedMemcpy(shared_W, W, nW, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access __syncthreads(); //synchronize threads to insure all data is copied size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array if(i >= N) return; //exit if this thread is outside the array stim::vec3 p; (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions (y == NULL) ? p[1] = 0 : p[1] = y[i]; (z == NULL) ? p[2] = 0 : p[2] = z[i]; T r = p.len(); //calculate the distance from the sphere T k = W[0].kmag(); if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) size_t NC = Nl + 1; //calculate the number of coefficients to be used T kr = r * k; //calculate the thread value for k*r T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT size_t ij = (size_t) fij; //convert to an integral index T alpha = fij - ij; //calculate the fractional portion of the index size_t n0j = ij * (NC); //start of the first entry in the LUT size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT T cos_phi; T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials T Pl = 1; //initialize the current value for the Legendre polynomial T jl; stim::complex Ei = 0; //create a register to store the result int l; for(size_t w = 0; w < nW; w++){ cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere for(l = 0; l <= Nl; l++){ Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 Pl_1 = Pl; if(l == 0){ //computing Pl is done recursively, where the recursive relation Pl = cos_phi; // requires the first two orders. This defines the second. } else{ //if this is not the first iteration, use the recursive relation to calculate Pl Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1); } jl = lerp( j[n0j + l], j[n1j + l], alpha ); //read jl from the LUT and interpolate the result Ei += W[w].E() * B[l] * jl * Pl; } //Ei += shared_W[w].pos(p); //evaluate the plane wave } E[i] += Ei; //copy the result to device memory } template void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* B, T* j, T kr_min, T dkr, size_t Nl){ size_t wave_bytes = sizeof(stim::scalarwave); size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required stim::scalarwave* batch_W; HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks size_t batch_size; //declare a variable to store the size of the current batch size_t waves_processed = 0; //initialize the number of waves processed to zero while(waves_processed < nW){ //while there are still waves to be processed batch_size = min(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left batch_bytes = batch_size * sizeof(stim::scalarwave); HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory cuda_scalar_mie_scatter<<< blocks, threads, batch_bytes >>>(E, N, x, y, z, batch_W, batch_size, a, n, B, j, kr_min, dkr, (int)Nl); //call the kernel waves_processed += batch_size; //increment the counter indicating how many waves have been processed } cudaFree(batch_W); } /// Calculate the scalar Mie solution for the scattered field produced by a single plane wave /// @param E is a pointer to the destination field values /// @param N is the number of points used to calculate the field /// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) /// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) /// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) /// @param W is an array of planewaves that will be scattered /// @param a is the radius of the sphere /// @param n is the complex refractive index of the sphere template void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n){ //calculate the necessary number of orders required to represent the scattered field T k = W[0].kmag(); size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); //calculate the scattering coefficients for the sphere stim::complex* B = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients B_coefficients(B, a, k, n, Nl); #ifdef __CUDACC__ stim::complex* dev_E; //allocate space for the field cudaMalloc(&dev_E, N * sizeof(stim::complex)); cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) // COORDINATES T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) if(x != NULL){ HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); } T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) if(y != NULL){ HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); } T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) if(z != NULL){ HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); } // PLANE WAVES stim::scalarwave* dev_W; //allocate space and copy plane waves HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave) * W.size(), cudaMemcpyHostToDevice) ); // SCATTERING COEFFICIENTS stim::complex* dev_B; HANDLE_ERROR( cudaMalloc(&dev_B, sizeof(stim::complex) * (Nl+1)) ); HANDLE_ERROR( cudaMemcpy(dev_B, B, sizeof(stim::complex) * (Nl+1), cudaMemcpyHostToDevice) ); // BESSEL FUNCTION LOOK-UP TABLE //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r size_t Nlut_j = 1024; T r_min = 0; T r_max = 10; T kr_min = k * r_min; T kr_max = k * r_max; //temporary variables double vm; //allocate space to store the return values for the bessel function calculation double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j; T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table T dkr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT std::cout<<"LUT jl bytes: "<(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] for(size_t l = 0; l <= Nl; l++){ //for each order bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result } } stim::cpu2image(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); //Allocate device memory and copy everything to the GPU T* dev_j_lut; HANDLE_ERROR( cudaMalloc(&dev_j_lut, lutj_bytes) ); HANDLE_ERROR( cudaMemcpy(dev_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) ); gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_B, dev_j_lut, kr_min, dkr, Nl); cudaMemcpy(E, dev_E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory if(x != NULL) cudaFree(dev_x); //free everything if(y != NULL) cudaFree(dev_y); if(z != NULL) cudaFree(dev_z); cudaFree(dev_E); #else //allocate space to store the bessel function call results double vm; double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); T r, kr, cos_phi; stim::complex h; for(size_t i = 0; i < N; i++){ stim::vec3 p; //declare a 3D point (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions (y == NULL) ? p[1] = 0 : p[1] = y[i]; (z == NULL) ? p[2] = 0 : p[2] = z[i]; r = p.len(); if(r >= a){ for(size_t w = 0; w < W.size(); w++){ kr = p.len() * W[w].kmag(); //calculate k*r stim::bessjyv_sph(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr); cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction stim::legendre(Nl, cos_phi, P); for(size_t l = 0; l <= Nl; l++){ h.r = j_kr[l]; h.i = y_kr[l]; E[i] += W[w].E() * B[l] * h * P[l]; } } } } #endif } template void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n){ std::vector< stim::scalarwave > W(1, w); cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n); } /// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere /// @param E is a pointer to the destination field values /// @param N is the number of points used to calculate the field /// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) /// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) /// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) /// @param w is a planewave that will be scattered /// @param a is the radius of the sphere /// @param n is the complex refractive index of the sphere template void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > W, T a, stim::complex n){ //calculate the necessary number of orders required to represent the scattered field T k = W[0].kmag(); size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); //calculate the scattering coefficients for the sphere stim::complex* A = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients A_coefficients(A, a, k, n, Nl); //allocate space to store the bessel function call results double vm; stim::complex* j_knr = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); stim::complex* y_knr = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); stim::complex* dj_knr= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); stim::complex* dy_knr= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); T r, cos_phi; stim::complex knr; stim::complex h; for(size_t i = 0; i < N; i++){ stim::vec3 p; //declare a 3D point (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions (y == NULL) ? p[1] = 0 : p[1] = y[i]; (z == NULL) ? p[2] = 0 : p[2] = z[i]; r = p.len(); if(r < a){ E[i] = 0; for(size_t w = 0; w < W.size(); w++){ knr = (stim::complex)n * p.len() * W[w].kmag(); //calculate k*n*r stim::cbessjyva_sph(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr); if(r == 0) cos_phi = 0; else cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction stim::legendre(Nl, cos_phi, P); for(size_t l = 0; l <= Nl; l++){ E[i] += W[w].E() * A[l] * (stim::complex)j_knr[l] * P[l]; } } } } } template void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n){ std::vector< stim::scalarwave > W(1, w); cpu_scalar_mie_internal(E, N, x, y, z, W, a, n); } } #endif