diff --git a/stim/envi/bsq.h b/stim/envi/bsq.h index b2c116c..5eb5190 100644 --- a/stim/envi/bsq.h +++ b/stim/envi/bsq.h @@ -1021,12 +1021,13 @@ public: //for each band for (unsigned long long z = b0; z < b1; z++) { + //std::cout<(temp), L); //write slice data into target file file.seekg(jumpb, std::ios::cur); diff --git a/stim/math/fft.h b/stim/math/fft.h index 272ace7..d3e9448 100644 --- a/stim/math/fft.h +++ b/stim/math/fft.h @@ -3,7 +3,7 @@ namespace stim{ - template + /*template void circshift(T *out, const T *in, size_t xdim, size_t ydim, size_t xshift, size_t yshift){ size_t i, j, ii, jj; for (i =0; i < xdim; i++) { @@ -13,16 +13,31 @@ namespace stim{ out[ii * ydim + jj] = in[i * ydim + j]; } } + }*/ + + template + void circshift(T *out, const T *in, int xdim, int ydim, int xshift, int yshift) + { + for (int i =0; i < xdim; i++) { + int ii = (i + xshift) % xdim; + if (ii<0) ii = xdim + ii; + for (int j = 0; j < ydim; j++) { + int jj = (j + yshift) % ydim; + if (jj<0) jj = ydim + jj; + //out[ii * ydim + jj] = in[i * ydim + j]; + out[jj * xdim + ii] = in[j * xdim + i]; + } + } } template void cpu_fftshift(T* out, T* in, size_t xdim, size_t ydim){ - circshift(out, in, xdim, ydim, xdim/2, ydim/2); + circshift(out, in, xdim, ydim, std::floor(xdim/2), std::floor(ydim/2)); } template void cpu_ifftshift(T* out, T* in, size_t xdim, size_t ydim){ - circshift(out, in, xdim, ydim, xdim/2, ydim/2); + circshift(out, in, xdim, ydim, std::ceil(xdim/2), std::ceil(ydim/2)); } diff --git a/stim/optics/mie.h b/stim/optics/mie.h deleted file mode 100644 index 12d208e..0000000 --- a/stim/optics/mie.h +++ /dev/null @@ -1,613 +0,0 @@ -#ifndef STIM_MIE_H -#define STIM_MIE_H -#include - -#include "scalarwave.h" -#include "../math/bessel.h" -#include "../cuda/cudatools/devices.h" -#include - -namespace stim{ - - -/// Calculate the scattering coefficients for a spherical scatterer -template -void B_coefficients(stim::complex* B, T a, T k, stim::complex n, int Nl){ - - //temporary variables - double vm; //allocate space to store the return values for the bessel function calculation - double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); - - stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - - double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) - stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) - - stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a - stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a - - stim::complex h_ka, dh_ka; - stim::complex numerator, denominator; - stim::complex i(0, 1); - for(int l = 0; l <= Nl; l++){ - h_ka.r = j_ka[l]; - h_ka.i = y_ka[l]; - dh_ka.r = dj_ka[l]; - dh_ka.i = dy_ka[l]; - - numerator = j_ka[l] * dj_kna[l] * (stim::complex)n - j_kna[l] * dj_ka[l]; - denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; - B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; - } -} - 
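Reviewer note on the fft.h change above: the new circshift correctly treats x as the fast (contiguous) axis and guards against negative shifts, but std::floor(xdim/2) and std::ceil(xdim/2) both act on an already-truncated integer quotient, so std::ceil is a no-op and for odd dimensions fftshift and ifftshift still shift by the same amount. The standalone sketch below (not part of the patch) uses (n + 1) / 2 to express the intended "ceiling" shift and checks that the fftshift/ifftshift pair round-trips on an odd-sized grid.

```cpp
// Minimal round-trip test of the new row-major circshift.
// (xdim + 1) / 2 is the integer form of ceil(xdim / 2) needed for ifftshift on odd sizes.
#include <cstdio>

template<typename T>
void circshift(T* out, const T* in, int xdim, int ydim, int xshift, int yshift){
    for(int i = 0; i < xdim; i++){
        int ii = (i + xshift) % xdim;
        if(ii < 0) ii += xdim;                       // handle negative shifts
        for(int j = 0; j < ydim; j++){
            int jj = (j + yshift) % ydim;
            if(jj < 0) jj += ydim;
            out[jj * xdim + ii] = in[j * xdim + i];  // x is the fast (contiguous) axis
        }
    }
}

int main(){
    const int nx = 5, ny = 3;                        // odd width exposes the floor/ceil asymmetry
    float in[nx * ny], shifted[nx * ny], restored[nx * ny];
    for(int i = 0; i < nx * ny; i++) in[i] = (float)i;

    circshift(shifted, in, nx, ny, nx / 2, ny / 2);                   // fftshift: floor(n/2)
    circshift(restored, shifted, nx, ny, (nx + 1) / 2, (ny + 1) / 2); // ifftshift: ceil(n/2)

    for(int i = 0; i < nx * ny; i++)                 // the pair should reproduce the input exactly
        if(restored[i] != in[i]) printf("mismatch at %d\n", i);
    printf("round trip complete\n");
    return 0;
}
```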
-template -void A_coefficients(stim::complex* A, T a, T k, stim::complex n, int Nl){ - //temporary variables - double vm; //allocate space to store the return values for the bessel function calculation - double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); - - stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - - double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) - stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) - - stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a - stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a - - stim::complex h_ka, dh_ka; - stim::complex numerator, denominator; - stim::complex i(0, 1); - for(size_t l = 0; l <= Nl; l++){ - h_ka.r = j_ka[l]; - h_ka.i = y_ka[l]; - dh_ka.r = dj_ka[l]; - dh_ka.i = dy_ka[l]; - - numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka; - denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; - A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; - } -} - -#define LOCAL_NL 16 -template -__global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* hB, T r_min, T dr, size_t N_hB, int Nl){ - extern __shared__ stim::complex shared_hB[]; //declare the list of waves in shared memory - - size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array - if(i >= N) return; //exit if this thread is outside the array - stim::vec3 p; - (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions - (y == NULL) ? p[1] = 0 : p[1] = y[i]; - (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; - - T r = p.len(); //calculate the distance from the sphere - if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) - T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT - size_t ij = (size_t) fij; //convert to an integral index - T alpha = fij - ij; //calculate the fractional portion of the index - size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT - size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT - - T cos_phi; - T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials - - stim::complex hBl; - stim::complex Ei = 0; //create a register to store the result - int l; - - stim::complex hlBl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed - int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops - - #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers - for(l = 0; l <= LOCAL_NL; l++) - hlBl[l] = clerp( hB[n0j + l], hB[n1j + l], alpha ); - - for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory - shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp( hB[n0j + l], hB[n1j + l], alpha ); - - for(size_t w = 0; w < nW; w++){ //for each plane wave - cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere - Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation - Pl_1 = cos_phi; - Ei += W[w].E() * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation - Ei += W[w].E() * hlBl[1] * Pl_1; - - #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) - for(l = 2; l <= LOCAL_NL; l++){ - Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) - Ei += W[w].E() * hlBl[l] * Pl; //calculate and sum the current field order - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 - Pl_1 = Pl; - } - - for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) - Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs - Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl; - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 - Pl_1 = Pl; - } - } - E[i] += Ei; //copy the result to device memory -} - -template -void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ - - size_t max_shared_mem = stim::sharedMemPerBlock(); - size_t hBl_array = sizeof(stim::complex) * (Nl + 1); - std::cout<<"hl*Bl array size: "<) * (Nl - LOCAL_NL); //amount of shared memory to allocate - std::cout<<"shared memory allocated: "<<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel -} - -template -__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ - size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array - if(i >= N) return; //exit if this thread is outside the array 
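For reference, the recursion the scatter kernel above unrolls is Bonnet's relation for the Legendre polynomials, l*P_l(x) = (2l-1)*x*P_{l-1}(x) - (l-1)*P_{l-2}(x). The host-side sketch below (not in the patch) accumulates a weighted Legendre series the same way; the generic coefficients c[l] stand in for the interpolated h_l*B_l (or j_l*A_l) terms times the wave amplitude, and the check against std::legendre assumes a C++17 <cmath>.

```cpp
// Weighted Legendre series via the same two-term recursion the kernels use.
#include <cmath>
#include <cstdio>

double legendre_sum(const double* c, int Nl, double x){
    double Pl_2 = 1.0;                               // P_0(x)
    double Pl_1 = x;                                 // P_1(x)
    double sum = c[0] * Pl_2 + c[1] * Pl_1;          // first two orders handled explicitly
    for(int l = 2; l <= Nl; l++){
        double Pl = ((2.0 * l - 1.0) * x * Pl_1 - (l - 1.0) * Pl_2) / l;
        sum += c[l] * Pl;                            // accumulate the current order
        Pl_2 = Pl_1;                                 // shift the recurrence window
        Pl_1 = Pl;
    }
    return sum;
}

int main(){
    const int Nl = 16;
    double c[Nl + 1];
    for(int l = 0; l <= Nl; l++) c[l] = 1.0;         // arbitrary test coefficients
    double x = 0.3, ref = 0.0;
    for(int l = 0; l <= Nl; l++) ref += std::legendre(l, x);   // C++17 reference
    printf("recursive: %f  reference: %f\n", legendre_sum(c, Nl, x), ref);
    return 0;
}
```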
- - stim::vec3 p; - (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions - (y == NULL) ? p[1] = 0 : p[1] = y[i]; - (z == NULL) ? p[2] = 0 : p[2] = z[i]; - - r[i] = p.len(); -} -/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave - -/// @param E is a pointer to the destination field values -/// @param N is the number of points used to calculate the field -/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) -/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) -/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) -/// @param W is an array of planewaves that will be scattered -/// @param a is the radius of the sphere -/// @param n is the complex refractive index of the sphere -template -void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n, T r_spacing = 0.1){ - //calculate the necessary number of orders required to represent the scattered field - T k = W[0].kmag(); - - int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); - if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) - std::cout<<"Nl: "<* B = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients - B_coefficients(B, a, k, n, Nl); - -#ifdef CUDA_FOUND - stim::complex* dev_E; //allocate space for the field - cudaMalloc(&dev_E, N * sizeof(stim::complex)); - cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); - //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) - - // COORDINATES - T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) - if(x != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); - } - T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) - if(y != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); - } - T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) - if(z != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); - } - - // PLANE WAVES - stim::scalarwave* dev_W; //allocate space and copy plane waves - HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); - HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave) * W.size(), cudaMemcpyHostToDevice) ); - - // BESSEL FUNCTION LOOK-UP TABLE - //calculate the distance from the sphere center - T* dev_r; - HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); - - int threads = stim::maxThreadsPerBlock(); - dim3 blocks((unsigned)(N / threads + 1)); - cuda_dist <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); - - //Find the minimum and maximum values of r - cublasStatus_t stat; - cublasHandle_t handle; - - stat = cublasCreate(&handle); //create a cuBLAS handle - if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure - printf ("CUBLAS initialization failed\n"); - exit(1); - } - - int i_min, i_max; - stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min); - if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure - printf ("CUBLAS 
Error: failed to calculate minimum r value.\n"); - exit(1); - } - stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max); - if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure - printf ("CUBLAS Error: failed to calculate maximum r value.\n"); - exit(1); - } - - i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility - i_max--; - T r_min, r_max; //allocate space to store the minimum and maximum values - HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU - HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); - - r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere) - - //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r - size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); - - //T kr_min = k * r_min; - //T kr_max = k * r_max; - - //temporary variables - double vm; //allocate space to store the return values for the bessel function calculation - double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); - - size_t hB_bytes = sizeof(stim::complex) * (Nl+1) * N_hB_lut; - stim::complex* hB_lut = (stim::complex*) malloc(hB_bytes); //pointer to the look-up table - T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT - std::cout<<"LUT jl bytes: "< hl; - for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT - stim::bessjyv_sph(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] - for(size_t l = 0; l <= Nl; l++){ //for each order - hl.r = (T)jv[l]; - hl.i = (T)yv[l]; - - hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result - //std::cout<(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); - - //Allocate device memory and copy everything to the GPU - stim::complex* dev_hB_lut; - HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); - HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); - - gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); - - cudaMemcpy(E, dev_E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory - - if(x != NULL) cudaFree(dev_x); //free everything - if(y != NULL) cudaFree(dev_y); - if(z != NULL) cudaFree(dev_z); - cudaFree(dev_E); -#else - - - //allocate space to store the bessel function call results - double vm; - double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); - - T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); - - T r, kr, cos_phi; - stim::complex h; - for(size_t i = 0; i < N; i++){ - stim::vec3 p; //declare a 3D point - - (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions - (y == NULL) ? p[1] = 0 : p[1] = y[i]; - (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; - r = p.len(); - if(r >= a){ - for(size_t w = 0; w < W.size(); w++){ - kr = p.len() * W[w].kmag(); //calculate k*r - stim::bessjyv_sph(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr); - cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction - stim::legendre(Nl, cos_phi, P); - - for(size_t l = 0; l <= Nl; l++){ - h.r = j_kr[l]; - h.i = y_kr[l]; - E[i] += W[w].E() * B[l] * h * P[l]; - } - } - } - } -#endif -} - -template -void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n, T r_spacing = 0.1){ - std::vector< stim::scalarwave > W(1, w); - cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing); -} - -template -__global__ void cuda_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* jA, T r_min, T dr, size_t N_jA, int Nl){ - extern __shared__ stim::complex shared_jA[]; //declare the list of waves in shared memory - - size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array - if(i >= N) return; //exit if this thread is outside the array - stim::vec3 p; - (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions - (y == NULL) ? p[1] = 0 : p[1] = y[i]; - (z == NULL) ? p[2] = 0 : p[2] = z[i]; - - T r = p.len(); //calculate the distance from the sphere - if(r > a) return; //exit if the point is inside the sphere (we only calculate the internal field) - T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT - size_t ij = (size_t) fij; //convert to an integral index - T alpha = fij - ij; //calculate the fractional portion of the index - size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT - size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT - - T cos_phi; - T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials - - stim::complex jAl; - stim::complex Ei = 0; //create a register to store the result - int l; - - stim::complex jlAl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed - int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops - - #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers - for(l = 0; l <= LOCAL_NL; l++) - jlAl[l] = clerp( jA[n0j + l], jA[n1j + l], alpha ); - - for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory - shared_jA[shared_start + (l - (LOCAL_NL+1))] = clerp( jA[n0j + l], jA[n1j + l], alpha ); - - for(size_t w = 0; w < nW; w++){ //for each plane wave - if(r == 0) cos_phi = 0; - else - cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere - Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation - Pl_1 = cos_phi; - Ei += W[w].E() * jlAl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation - Ei += W[w].E() * jlAl[1] * Pl_1; - - #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) - for(l = 2; l <= LOCAL_NL; l++){ - Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) - Ei += W[w].E() * 
jlAl[l] * Pl; //calculate and sum the current field order - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 - Pl_1 = Pl; - } - - for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) - Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs - Ei += W[w].E() * shared_jA[shared_start + l - LOCAL_NL - 1] * Pl; - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 - Pl_1 = Pl; - } - } - E[i] = Ei; //copy the result to device memory -} - -template -void gpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* jA, T r_min, T dr, size_t N_jA, size_t Nl){ - - size_t max_shared_mem = stim::sharedMemPerBlock(); - size_t hBl_array = sizeof(stim::complex) * (Nl + 1); - std::cout<<"hl*Bl array size: "<) * (Nl - LOCAL_NL); //amount of shared memory to allocate - std::cout<<"shared memory allocated: "<<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel -} - -/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere - -/// @param E is a pointer to the destination field values -/// @param N is the number of points used to calculate the field -/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) -/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) -/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) -/// @param w is a planewave that will be scattered -/// @param a is the radius of the sphere -/// @param n is the complex refractive index of the sphere -template -void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > W, T a, stim::complex n, T r_spacing = 0.1){ -//calculate the necessary number of orders required to represent the scattered field - T k = W[0].kmag(); - - int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); - if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) - std::cout<<"Nl: "<* A = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients - A_coefficients(A, a, k, n, Nl); - -#ifdef CUDA_FOUND - stim::complex* dev_E; //allocate space for the field - cudaMalloc(&dev_E, N * sizeof(stim::complex)); - cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); - //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) - - // COORDINATES - T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) - if(x != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); - } - T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) - if(y != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); - } - T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) - if(z != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); - } - - // PLANE WAVES - 
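The shared-memory request made by gpu_scalar_mie_scatter/gpu_scalar_mie_internal grows as threads * sizeof(complex) * (Nl - LOCAL_NL): the first LOCAL_NL+1 orders live in registers and only the overflow orders are staged per thread in dynamic shared memory. The sketch below is illustrative only (the patch just prints the sizes it computes and queries stim::sharedMemPerBlock()); the 48 KB limit, Nl = 40, and the block-shrinking loop are assumptions added here to show how the budget constrains the block size.

```cpp
// How the dynamic shared-memory request scales with block size and order count.
#include <complex>
#include <cstdio>

int main(){
    const int LOCAL_NL = 16;                         // orders kept in registers (matches the #define)
    const size_t max_shared = 48 * 1024;             // assumed per-block limit, in bytes
    int Nl = 40;                                     // example number of Mie orders
    int threads = 1024;                              // candidate block size

    size_t per_thread = (Nl <= LOCAL_NL) ? 0
                      : sizeof(std::complex<float>) * (size_t)(Nl - LOCAL_NL);
    while(threads > 32 && per_thread * threads > max_shared)
        threads /= 2;                                // shrink the block until the request fits

    printf("threads/block: %d, dynamic shared memory: %zu bytes\n",
           threads, per_thread * threads);
    return 0;
}
```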
stim::scalarwave* dev_W; //allocate space and copy plane waves - HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); - HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave) * W.size(), cudaMemcpyHostToDevice) ); - - // BESSEL FUNCTION LOOK-UP TABLE - //calculate the distance from the sphere center - T* dev_r; - HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); - - int threads = stim::maxThreadsPerBlock(); - dim3 blocks((unsigned)(N / threads + 1)); - cuda_dist <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); - - //Find the minimum and maximum values of r - cublasStatus_t stat; - cublasHandle_t handle; - - stat = cublasCreate(&handle); //create a cuBLAS handle - if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure - printf ("CUBLAS initialization failed\n"); - exit(1); - } - - int i_min, i_max; - stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min); - if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure - printf ("CUBLAS Error: failed to calculate minimum r value.\n"); - exit(1); - } - stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max); - if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure - printf ("CUBLAS Error: failed to calculate maximum r value.\n"); - exit(1); - } - - i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility - i_max--; - T r_min, r_max; //allocate space to store the minimum and maximum values - HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU - HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); - - r_max = min(r_max, a); //the internal field doesn't exist outside of the sphere - - size_t N_jA_lut = (size_t)((r_max - r_min) / r_spacing + 1); - - //temporary variables - double vm; //allocate space to store the return values for the bessel function calculation - stim::complex* jv = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* yv = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* djv= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dyv= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - - size_t jA_bytes = sizeof(stim::complex) * (Nl+1) * N_jA_lut; - stim::complex* jA_lut = (stim::complex*) malloc(jA_bytes); //pointer to the look-up table - T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT - std::cout<<"LUT jl bytes: "< hl; - stim::complex nd = (stim::complex)n; - for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT - stim::cbessjyva_sph(Nl, nd * k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] - for(size_t l = 0; l <= Nl; l++){ //for each order - jA_lut[ri * (Nl + 1) + l] = (stim::complex)(jv[l] * (stim::complex)A[l]); //store the bessel function result - } - } - - //Allocate device memory and copy everything to the GPU - stim::complex* dev_jA_lut; - HANDLE_ERROR( cudaMalloc(&dev_jA_lut, jA_bytes) ); - HANDLE_ERROR( cudaMemcpy(dev_jA_lut, jA_lut, jA_bytes, cudaMemcpyHostToDevice) ); - - gpu_scalar_mie_internal(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_jA_lut, r_min, dr, N_jA_lut, Nl); - - cudaMemcpy(E, dev_E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory - - if(x != NULL) cudaFree(dev_x); //free everything - if(y != NULL) cudaFree(dev_y); - if(z != NULL) cudaFree(dev_z); - cudaFree(dev_E); -#else - - //allocate space to store the 
bessel function call results - double vm; - stim::complex* j_knr = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* y_knr = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dj_knr= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - stim::complex* dy_knr= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); - - T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); - - T r, cos_phi; - stim::complex knr; - stim::complex h; - for(size_t i = 0; i < N; i++){ - stim::vec3 p; //declare a 3D point - - (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions - (y == NULL) ? p[1] = 0 : p[1] = y[i]; - (z == NULL) ? p[2] = 0 : p[2] = z[i]; - r = p.len(); - if(r < a){ - E[i] = 0; - for(size_t w = 0; w < W.size(); w++){ - knr = (stim::complex)n * p.len() * W[w].kmag(); //calculate k*n*r - - stim::cbessjyva_sph(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr); - if(r == 0) - cos_phi = 0; - else - cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction - stim::legendre(Nl, cos_phi, P); - - for(size_t l = 0; l <= Nl; l++){ - E[i] += W[w].E() * A[l] * (stim::complex)j_knr[l] * P[l]; - } - } - } - } -#endif -} - -template -void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n, T r_spacing = 0.1){ - std::vector< stim::scalarwave > W(1, w); - cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, r_spacing); -} - -} - -#endif \ No newline at end of file diff --git a/stim/optics/scalarbeam.h b/stim/optics/scalarbeam.h index 800f4a3..77fc289 100644 --- a/stim/optics/scalarbeam.h +++ b/stim/optics/scalarbeam.h @@ -213,13 +213,13 @@ void gpu_scalar_psf_local(stim::complex* E, size_t N, T* r, T* phi, T lambda, T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT T jl; for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT - for(size_t l = 0; l <= Nl; l++){ //for each order + for(unsigned l = 0; l <= (unsigned)Nl; l++){ //for each order jl = boost::math::sph_bessel(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result } } - stim::cpu2image(j_lut, "j_lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); + //stim::cpu2image(j_lut, "j_lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); //Allocate device memory and copy everything to the GPU T* gpu_C; @@ -316,6 +316,7 @@ void gpu_scalar_psf_cart(stim::complex* E, size_t N, T* x, T* y, T* z, T lamb stim::quaternion q; //create a quaternion q.CreateRotation(d, stim::vec3(0, 0, 1)); //create a mapping from the propagation direction to the PSF space + stim::matrix rot = q.toMatrix3(); int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks cuda_cart2psf <<< blocks, threads >>> (gpu_r, gpu_phi, N, x, y, z, f, q); //call the CUDA kernel to move the cartesian coordinates to PSF space @@ -442,15 +443,19 @@ public: return samples; } + void eval(stim::scalarfield& E, T* X, T* Y, T* Z, int order = 500){ + cpu_scalar_psf_cart(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing()); + } + /// Evaluate the beam to a scalar field using Debye focusing - void eval(stim::scalarfield& E, size_t order = 500){ + void eval(stim::scalarfield& E, int order = 500){ size_t array_size = E.grid_bytes(); T* X = (T*) malloc( array_size ); //allocate 
space for the coordinate meshes T* Y = (T*) malloc( array_size ); T* Z = (T*) malloc( array_size ); E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes - cpu_scalar_psf_cart(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing()); + eval(E, X, Y, Z, order); free(X); //free the coordinate meshes free(Y); diff --git a/stim/optics/scalarfield.h b/stim/optics/scalarfield.h index 4d7cf37..397f49e 100644 --- a/stim/optics/scalarfield.h +++ b/stim/optics/scalarfield.h @@ -1,11 +1,211 @@ #ifndef STIM_SCALARFIELD_H #define STIM_SCALARFIELD_H + #include "../math/rect.h" #include "../math/complex.h" +#include "../math/fft.h" namespace stim{ + /// Perform a k-space transform of a scalar field (FFT). The given field has a width of x and the calculated momentum space has a + /// width of kx (in radians). + /// @param K is a pointer to the output array of all plane waves in the field + /// @param kx is the width of the frame in momentum space + /// @param ky is the height of the frame in momentum space + /// @param E is the field to be transformed + /// @param x is the width of the field in the spatial domain + /// @param y is the height of the field in the spatial domain + /// @param nx is the number of pixels representing the field in the x (and kx) direction + /// @param ny is the number of pixels representing the field in the y (and ky) direction + template + void cpu_scalar_to_kspace(stim::complex* K, T& kx, T& ky, stim::complex* E, T x, T y, size_t nx, size_t ny){ + + kx = stim::TAU * nx / x; //calculate the width of the momentum space + ky = stim::TAU * ny / y; + + stim::complex* dev_FFT; + HANDLE_ERROR( cudaMalloc(&dev_FFT, sizeof(stim::complex) * nx * ny) ); //allocate space on the CUDA device for the output array + + stim::complex* dev_E; + HANDLE_ERROR( cudaMalloc(&dev_E, sizeof(stim::complex) * nx * ny) ); //allocate space for the field + HANDLE_ERROR( cudaMemcpy(dev_E, E, sizeof(stim::complex) * nx * ny, cudaMemcpyHostToDevice) ); //copy the field to GPU memory + + cufftResult result; + cufftHandle plan; + result = cufftPlan2d(&plan, nx, ny, CUFFT_C2C); + if(result != CUFFT_SUCCESS){ + std::cout<<"Error creating cuFFT plan."<* fft = (stim::complex*) malloc(sizeof(stim::complex) * nx * ny); + HANDLE_ERROR( cudaMemcpy(fft, dev_FFT, sizeof(stim::complex) * nx * ny, cudaMemcpyDeviceToHost) ); + + stim::cpu_fftshift(K, fft, nx, ny); + //memcpy(K, fft, sizeof(stim::complex) * nx * ny); + } + + template + void cpu_scalar_from_kspace(stim::complex* E, T& x, T& y, stim::complex* K, T kx, T ky, size_t nx, size_t ny){ + + x = stim::TAU * nx / kx; //calculate the width of the momentum space + y = stim::TAU * ny / ky; + + stim::complex* fft = (stim::complex*) malloc(sizeof(stim::complex) * nx * ny); + stim::cpu_ifftshift(fft, K, nx, ny); + //memcpy(fft, K, sizeof(stim::complex) * nx * ny); + + stim::complex* dev_FFT; + HANDLE_ERROR( cudaMalloc(&dev_FFT, sizeof(stim::complex) * nx * ny) ); //allocate space on the CUDA device for the output array + HANDLE_ERROR( cudaMemcpy(dev_FFT, fft, sizeof(stim::complex) * nx * ny, cudaMemcpyHostToDevice) ); //copy the field to GPU memory + + stim::complex* dev_E; + HANDLE_ERROR( cudaMalloc(&dev_E, sizeof(stim::complex) * nx * ny) ); //allocate space for the field + + cufftResult result; + cufftHandle plan; + result = cufftPlan2d(&plan, nx, ny, CUFFT_C2C); + if(result != CUFFT_SUCCESS){ + std::cout<<"Error creating cuFFT plan."<) * nx * ny, cudaMemcpyDeviceToHost) ); + + + } + + /// Propagate a field slice along its orthogonal 
direction by a given distance z + /// @param Enew is the resulting propogated field + /// @param E is the field to be propogated + /// @param sx is the size of the field in the lateral x direction + /// @param sy is the size of the field in the lateral y direction + /// @param z is the distance to be propagated + /// @param k is the wavenumber 2*pi/lambda + /// @param nx is the number of samples in the field along the lateral x direction + /// @param ny is the number of samples in the field along the lateral y direction + template + void cpu_scalar_propagate(stim::complex* Enew, stim::complex* E, T sx, T sy, T z, T k, size_t nx, size_t ny){ + + stim::complex* K = (stim::complex*) malloc( sizeof(stim::complex) * nx * ny ); + + T Kx, Ky; //width and height in k space + cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); + + T* mag = (T*) malloc( sizeof(T) * nx * ny ); + stim::abs(mag, K, nx * ny); + stim::cpu2image(mag, "kspace_pre_shift.bmp", nx, ny, stim::cmBrewer); + + size_t kxi, kyi; + size_t i; + T kx, kx_sq, ky, ky_sq, k_sq; + T kz; + stim::complex shift; + T min_kx = -Kx / 2; + T dkx = Kx / (nx); + + T min_ky = -Ky / 2; + T dky = Ky / (ny); + + for(kyi = 0; kyi < ny; kyi++){ //for each plane wave in the ky direction + for(kxi = 0; kxi < nx; kxi++){ //for each plane wave in the ky direction + i = kyi * nx + kxi; + + kx = min_kx + kxi * dkx; //calculate the position of the current plane wave + ky = min_ky + kyi * dky; + + kx_sq = kx * kx; + ky_sq = ky * ky; + k_sq = k*k; + + if(kx_sq + ky_sq < k_sq){ + kz = sqrt(k_sq - kx_sq - ky_sq); //estimate kz using the Fresnel approximation + shift = -exp(stim::complex(0, kz * z)); + K[i] *= shift; + K[i] /= (nx*ny); //normalize the DFT + } + else{ + K[i] = 0; + } + } + } + + stim::abs(mag, K, nx * ny); + stim::cpu2image(mag, "kspace_post_shift.bmp", nx, ny, stim::cmBrewer); + + cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); + } + + /// Apply a lowpass filter to a field slice + /// @param Enew is the resulting propogated field + /// @param E is the field to be propogated + /// @param sx is the size of the field in the lateral x direction + /// @param sy is the size of the field in the lateral y direction + /// @param highest is the highest spatial frequency that can pass through the filter + /// @param nx is the number of samples in the field along the lateral x direction + /// @param ny is the number of samples in the field along the lateral y direction + template + void cpu_scalar_lowpass(stim::complex* Enew, stim::complex* E, T sx, T sy, T highest, size_t nx, size_t ny){ + + stim::complex* K = (stim::complex*) malloc( sizeof(stim::complex) * nx * ny ); + + T Kx, Ky; //width and height in k space + cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); + + T* mag = (T*) malloc( sizeof(T) * nx * ny ); + stim::abs(mag, K, nx * ny); + stim::cpu2image(mag, "kspace_pre_lowpass.bmp", nx, ny, stim::cmBrewer); + + size_t kxi, kyi; + size_t i; + T kx, kx_sq, ky, ky_sq, k_sq; + T kz; + stim::complex shift; + T min_kx = -Kx / 2; + T dkx = Kx / (nx); + + T min_ky = -Ky / 2; + T dky = Ky / (ny); + + T highest_sq = highest * highest; + + for(kyi = 0; kyi < ny; kyi++){ //for each plane wave in the ky direction + for(kxi = 0; kxi < nx; kxi++){ //for each plane wave in the ky direction + i = kyi * nx + kxi; + + kx = min_kx + kxi * dkx; //calculate the position of the current plane wave + ky = min_ky + kyi * dky; + + kx_sq = kx * kx; + ky_sq = ky * ky; + + if(kx_sq + ky_sq > highest_sq){ + K[i] = 0; + } + else + K[i] /= nx * ny; //normalize the DFT + } + 
} + + stim::abs(mag, K, nx * ny); + stim::cpu2image(mag, "kspace_post_lowpass.bmp", nx, ny, stim::cmBrewer); + + cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); + } + enum locationType {CPUmem, GPUmem}; /// Class represents a scalar optical field. @@ -22,23 +222,20 @@ protected: size_t R[2]; locationType loc; - - -public: - - CUDA_CALLABLE scalarfield(size_t X, size_t Y, T size = 1, T z_pos = 0) : rect::rect(size, z_pos){ - R[0] = X; //set the field resolution - R[1] = Y; - - E = (stim::complex*) malloc(sizeof(stim::complex) * R[0] * R[1]); //allocate in CPU memory - loc = CPUmem; + /// Convert the field to a k-space representation (do an FFT) + void to_kspace(T& kx, T& ky){ + cpu_scalar_to_kspace(E, kx, ky, E, X.len(), Y.len(), R[0], R[1]); } - CUDA_CALLABLE ~scalarfield(){ - if(loc == CPUmem) free(E); - else cudaFree(E); + void from_kspace(){ + kx = stim::TAU * R[0] / X.len(); //calculate the width of the momentum space + ky = stim::TAU * R[1] / Y.len(); + T x, y; + cpu_scalar_from_kspace(E, x, y, E, kx, ky, R[0], R[1]); } +public: + /// Returns the number of values in the field CUDA_CALLABLE size_t size(){ return R[0] * R[1]; @@ -48,6 +245,20 @@ public: return sizeof(stim::complex) * R[0] * R[1]; } + scalarfield(size_t X, size_t Y, T size = 1, T z_pos = 0) : rect::rect(size, z_pos){ + R[0] = X; //set the field resolution + R[1] = Y; + + E = (stim::complex*) malloc(grid_bytes()); //allocate in CPU memory + memset(E, 0, grid_bytes()); + loc = CPUmem; + } + + ~scalarfield(){ + if(loc == CPUmem) free(E); + else cudaFree(E); + } + /// Calculates the distance between points on the grid T spacing(){ T du = rect::X.len() / R[0]; @@ -78,6 +289,16 @@ public: } } + /// Propagate the field along its orthogonal direction by a distance d + void propagate(T d, T k){ + cpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); + } + + /// Propagate the field along its orthogonal direction by a distance d + void lowpass(T highest){ + cpu_scalar_lowpass(E, E, X.len(), Y.len(), highest, R[0], R[1]); + } + std::string str(){ std::stringstream ss; ss<::str()< p; @@ -114,9 +335,9 @@ public: i++; } } - stim::cpu2image(X, "X.bmp", R[0], R[1], stim::cmBrewer); - stim::cpu2image(Y, "Y.bmp", R[0], R[1], stim::cmBrewer); - stim::cpu2image(Z, "Z.bmp", R[0], R[1], stim::cmBrewer); + //stim::cpu2image(X, "X.bmp", R[0], R[1], stim::cmBrewer); + //stim::cpu2image(Y, "Y.bmp", R[0], R[1], stim::cmBrewer); + //stim::cpu2image(Z, "Z.bmp", R[0], R[1], stim::cmBrewer); } else{ std::cout<<"GPU allocation of a meshgrid isn't supported yet. 
You'll have to write kernels to do the calculation."; @@ -124,6 +345,14 @@ public: } } + //clear the field, setting all values to zero + void clear(){ + if(loc == GPUmem) + HANDLE_ERROR(cudaMemset(E, 0, grid_bytes())); + else + memset(E, 0, grid_bytes()); + } + void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer){ if(loc == GPUmem) to_cpu(); //if the field is in the GPU, move it to the CPU diff --git a/stim/optics/scalarmie.h b/stim/optics/scalarmie.h new file mode 100644 index 0000000..3e1fd7a --- /dev/null +++ b/stim/optics/scalarmie.h @@ -0,0 +1,646 @@ +#ifndef STIM_MIE_H +#define STIM_MIE_H +#include + +#include "scalarwave.h" +#include "../math/bessel.h" +#include "../cuda/cudatools/devices.h" +#include + +namespace stim{ + + +/// Calculate the scattering coefficients for a spherical scatterer +template +void B_coefficients(stim::complex* B, T a, T k, stim::complex n, int Nl){ + + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + + stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) + stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) + + stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a + stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a + + stim::complex h_ka, dh_ka; + stim::complex numerator, denominator; + stim::complex i(0, 1); + for(int l = 0; l <= Nl; l++){ + h_ka.r = j_ka[l]; + h_ka.i = y_ka[l]; + dh_ka.r = dj_ka[l]; + dh_ka.i = dy_ka[l]; + + numerator = j_ka[l] * dj_kna[l] * (stim::complex)n - j_kna[l] * dj_ka[l]; + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; + B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; + } +} + +template +void A_coefficients(stim::complex* A, T a, T k, stim::complex n, int Nl){ + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + + stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) + stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) + + 
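The truncation order used throughout this header is Nl = ceil(k*a + 4*(k*a)^(1/3) + 2), clamped up to LOCAL_NL so the register-resident orders in the kernels are always filled. A quick standalone sketch (wavelength and radius values are illustrative, units arbitrary but consistent):

```cpp
// Number of Mie orders retained for a sphere of radius a illuminated at wavelength lambda.
#include <cmath>
#include <cstdio>

int mie_orders(double lambda, double a){
    const int LOCAL_NL = 16;                          // matches the #define in this header
    const double PI = 3.14159265358979323846;
    double ka = (2.0 * PI / lambda) * a;              // size parameter k*a
    int Nl = (int)std::ceil(ka + 4.0 * std::cbrt(ka) + 2.0);
    return (Nl < LOCAL_NL) ? LOCAL_NL : Nl;           // never fewer than the register-resident orders
}

int main(){
    printf("a = 1,  lambda = 0.5  ->  Nl = %d\n", mie_orders(0.5, 1.0));   // k*a ~ 12.6
    printf("a = 5,  lambda = 0.5  ->  Nl = %d\n", mie_orders(0.5, 5.0));   // k*a ~ 62.8
    return 0;
}
```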
stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a + stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a + + stim::complex h_ka, dh_ka; + stim::complex numerator, denominator; + stim::complex i(0, 1); + for(size_t l = 0; l <= Nl; l++){ + h_ka.r = j_ka[l]; + h_ka.i = y_ka[l]; + dh_ka.r = dj_ka[l]; + dh_ka.i = dy_ka[l]; + + numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka; + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; + A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; + } +} + +#define LOCAL_NL 16 +template +__global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* hB, T r_min, T dr, size_t N_hB, int Nl){ + extern __shared__ stim::complex shared_hB[]; //declare the list of waves in shared memory + + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + if(i >= N) return; //exit if this thread is outside the array + stim::vec3 p; + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? p[2] = 0 : p[2] = z[i]; + + T r = p.len(); //calculate the distance from the sphere + if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT + size_t ij = (size_t) fij; //convert to an integral index + T alpha = fij - ij; //calculate the fractional portion of the index + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT + + T cos_phi; + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials + + stim::complex hBl; + stim::complex Ei = 0; //create a register to store the result + int l; + + stim::complex hlBl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed + int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops + + #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers + for(l = 0; l <= LOCAL_NL; l++) + hlBl[l] = clerp( hB[n0j + l], hB[n1j + l], alpha ); + + for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory + shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp( hB[n0j + l], hB[n1j + l], alpha ); + + for(size_t w = 0; w < nW; w++){ //for each plane wave + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere + Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation + Pl_1 = cos_phi; + Ei += W[w].E() * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation + Ei += W[w].E() * hlBl[1] * Pl_1; + + #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) + for(l = 2; l <= LOCAL_NL; l++){ + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) + Ei += W[w].E() * hlBl[l] * Pl; //calculate and sum the current field order + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; 
+ } + + for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs + Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl; + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; + } + } + E[i] += Ei; //copy the result to device memory +} + +template +void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ + + size_t max_shared_mem = stim::sharedMemPerBlock(); + size_t hBl_array = sizeof(stim::complex) * (Nl + 1); + std::cout<<"hl*Bl array size: "<) * (Nl - LOCAL_NL); //amount of shared memory to allocate + std::cout<<"shared memory allocated: "<<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel +} + +template +__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + if(i >= N) return; //exit if this thread is outside the array + + stim::vec3 p; + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? p[2] = 0 : p[2] = z[i]; + + r[i] = p.len(); +} +/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave + +/// @param E is a pointer to the destination field values +/// @param N is the number of points used to calculate the field +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) +/// @param W is an array of planewaves that will be scattered +/// @param a is the radius of the sphere +/// @param n is the complex refractive index of the sphere +template +void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n, T r_spacing = 0.1){ + //calculate the necessary number of orders required to represent the scattered field + T k = W[0].kmag(); + + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) + std::cout<<"Nl: "<* B = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients + B_coefficients(B, a, k, n, Nl); + +#ifdef CUDA_FOUND + stim::complex* dev_E; //allocate space for the field + cudaMalloc(&dev_E, N * sizeof(stim::complex)); + cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); + //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) + + // COORDINATES + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) + if(x != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) + if(y != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), 
cudaMemcpyHostToDevice)); + } + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) + if(z != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); + } + + // PLANE WAVES + stim::scalarwave* dev_W; //allocate space and copy plane waves + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave) * W.size(), cudaMemcpyHostToDevice) ); + + // BESSEL FUNCTION LOOK-UP TABLE + //calculate the distance from the sphere center + T* dev_r; + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); + + int threads = stim::maxThreadsPerBlock(); + dim3 blocks((unsigned)(N / threads + 1)); + cuda_dist <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); + + //Find the minimum and maximum values of r + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); //create a cuBLAS handle + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS initialization failed\n"); + exit(1); + } + + int i_min, i_max; + stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate minimum r value.\n"); + exit(1); + } + stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate maximum r value.\n"); + exit(1); + } + + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility + i_max--; + T r_min, r_max; //allocate space to store the minimum and maximum values + HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU + HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); + + r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere) + + //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r + size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); + + //T kr_min = k * r_min; + //T kr_max = k * r_max; + + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); + + size_t hB_bytes = sizeof(stim::complex) * (Nl+1) * N_hB_lut; + stim::complex* hB_lut = (stim::complex*) malloc(hB_bytes); //pointer to the look-up table + T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT + std::cout<<"LUT jl bytes: "< hl; + for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT + stim::bessjyv_sph(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] + for(size_t l = 0; l <= Nl; l++){ //for each order + hl.r = (T)jv[l]; + hl.i = (T)yv[l]; + + hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result + //std::cout<(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); + + //Allocate device memory and copy everything to the GPU + stim::complex* dev_hB_lut; + HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); + 
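The look-up table built above is sampled in the kernel through a fractional index: fij = (r - r_min)/dr, ij = floor(fij), alpha = fij - ij, followed by the linear blend that clerp() applies to the bracketing h_l*B_l entries. The CPU sketch below (not part of the patch) reproduces that indexing on a real-valued stand-in table; std::cos takes the place of the complex h_l*B_l values purely for illustration.

```cpp
// Build a radial LUT and sample it back with the kernel's fractional-index interpolation.
#include <cmath>
#include <cstdio>
#include <vector>

int main(){
    const double r_min = 1.0, r_max = 10.0, r_spacing = 0.1;
    const size_t N_lut = (size_t)((r_max - r_min) / r_spacing + 1);
    const double dr = (r_max - r_min) / (N_lut - 1);     // actual spacing between samples

    std::vector<double> lut(N_lut);                      // stand-in for the h_l*B_l table
    for(size_t ri = 0; ri < N_lut; ri++)
        lut[ri] = std::cos(r_min + ri * dr);             // any smooth radial function works here

    double r = 4.321;                                    // query point inside [r_min, r_max]
    double fij = (r - r_min) / dr;                       // floating-point index into the LUT
    size_t ij = (size_t)fij;                             // integer part: lower bracket
    double alpha = fij - ij;                             // fractional part: blend weight
    double approx = lut[ij] * (1.0 - alpha) + lut[ij + 1] * alpha;

    printf("interpolated %f, exact %f\n", approx, std::cos(r));
    return 0;
}
```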
HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); + + gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); + + cudaMemcpy(E, dev_E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory + + if(x != NULL) cudaFree(dev_x); //free everything + if(y != NULL) cudaFree(dev_y); + if(z != NULL) cudaFree(dev_z); + cudaFree(dev_E); +#else + + + //allocate space to store the bessel function call results + double vm; + double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); + + T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); + + T r, kr, cos_phi; + stim::complex h; + for(size_t i = 0; i < N; i++){ + stim::vec3 p; //declare a 3D point + + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? p[2] = 0 : p[2] = z[i]; + r = p.len(); + if(r >= a){ + for(size_t w = 0; w < W.size(); w++){ + kr = p.len() * W[w].kmag(); //calculate k*r + stim::bessjyv_sph(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr); + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction + stim::legendre(Nl, cos_phi, P); + + for(size_t l = 0; l <= Nl; l++){ + h.r = j_kr[l]; + h.i = y_kr[l]; + E[i] += W[w].E() * B[l] * h * P[l]; + } + } + } + } +#endif +} + +template +void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n, T r_spacing = 0.1){ + std::vector< stim::scalarwave > W(1, w); + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing); +} + +template +__global__ void cuda_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* jA, T r_min, T dr, size_t N_jA, int Nl){ + extern __shared__ stim::complex shared_jA[]; //declare the list of waves in shared memory + + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + if(i >= N) return; //exit if this thread is outside the array + stim::vec3 p; + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; + + T r = p.len(); //calculate the distance from the sphere + if(r >= a) return; //exit if the point is inside the sphere (we only calculate the internal field) + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT + size_t ij = (size_t) fij; //convert to an integral index + T alpha = fij - ij; //calculate the fractional portion of the index + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT + + T cos_phi; + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials + + stim::complex jAl; + stim::complex Ei = 0; //create a register to store the result + int l; + + stim::complex jlAl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed + int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops + + #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers + for(l = 0; l <= LOCAL_NL; l++) + jlAl[l] = clerp( jA[n0j + l], jA[n1j + l], alpha ); + + for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory + shared_jA[shared_start + (l - (LOCAL_NL+1))] = clerp( jA[n0j + l], jA[n1j + l], alpha ); + + for(size_t w = 0; w < nW; w++){ //for each plane wave + if(r == 0) cos_phi = 0; + else + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere + Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation + Pl_1 = cos_phi; + Ei += W[w].E() * jlAl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation + Ei += W[w].E() * jlAl[1] * Pl_1; + + #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) + for(l = 2; l <= LOCAL_NL; l++){ + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) + Ei += W[w].E() * jlAl[l] * Pl; //calculate and sum the current field order + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; + } + + for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs + Ei += W[w].E() * shared_jA[shared_start + l - LOCAL_NL - 1] * Pl; + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; + } + } + E[i] = Ei; //copy the result to device memory +} + +template +void gpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* jA, T r_min, T dr, size_t N_jA, size_t Nl){ + + size_t max_shared_mem = stim::sharedMemPerBlock(); + size_t hBl_array = sizeof(stim::complex) * (Nl + 1); + std::cout<<"hl*Bl array size: "<) * (Nl - LOCAL_NL); //amount of shared memory to allocate + std::cout<<"shared memory allocated: "<<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel +} + +/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere + +/// @param E is a pointer to the destination field values +/// @param 
N is the number of points used to calculate the field +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) +/// @param W is a list of plane waves that will be scattered +/// @param a is the radius of the sphere +/// @param n is the complex refractive index of the sphere +template<typename T> +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, T r_spacing = 0.1){ +//calculate the necessary number of orders required to represent the scattered field + T k = W[0].kmag(); + + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) + std::cout<<"Nl: "<<Nl<<std::endl; + + stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients + A_coefficients(A, a, k, n, Nl); + +#ifdef CUDA_FOUND + stim::complex<T>* dev_E; //allocate space for the field + cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); + cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used) + + // COORDINATES + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) + if(x != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) + if(y != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) + if(z != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); + } + + // PLANE WAVES + stim::scalarwave<T>* dev_W; //allocate space and copy plane waves + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) ); + + // BESSEL FUNCTION LOOK-UP TABLE + //calculate the distance from the sphere center + T* dev_r; + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); + + int threads = stim::maxThreadsPerBlock(); + dim3 blocks((unsigned)(N / threads + 1)); + cuda_dist <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); + + //Find the minimum and maximum values of r + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); //create a cuBLAS handle + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS initialization failed\n"); + exit(1); + } + + int i_min, i_max; + stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate minimum r value.\n"); + exit(1); + } + stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate maximum r value.\n"); + exit(1); + } + + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility + i_max--; + T r_min, r_max; 
//allocate space to store the minimum and maximum values + HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU + HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); + + r_max = min(r_max, a); //the internal field doesn't exist outside of the sphere + + size_t N_jA_lut = (size_t)((r_max - r_min) / r_spacing + 1); + + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + stim::complex<double>* jv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + stim::complex<double>* yv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + stim::complex<double>* djv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + stim::complex<double>* dyv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + + size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut; + stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes); //pointer to the look-up table + T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT + std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl; + stim::complex<double> hl; + stim::complex<double> nd = (stim::complex<double>)n; + for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT + stim::cbessjyva_sph(Nl, nd * k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] + for(size_t l = 0; l <= Nl; l++){ //for each order + jA_lut[ri * (Nl + 1) + l] = (stim::complex<T>)(jv[l] * (stim::complex<double>)A[l]); //store the product of the spherical bessel function and the A coefficient + } + } + + //Allocate device memory and copy everything to the GPU + stim::complex<T>* dev_jA_lut; + HANDLE_ERROR( cudaMalloc(&dev_jA_lut, jA_bytes) ); + HANDLE_ERROR( cudaMemcpy(dev_jA_lut, jA_lut, jA_bytes, cudaMemcpyHostToDevice) ); + + gpu_scalar_mie_internal(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_jA_lut, r_min, dr, N_jA_lut, Nl); + + cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory + + if(x != NULL) cudaFree(dev_x); //free everything + if(y != NULL) cudaFree(dev_y); + if(z != NULL) cudaFree(dev_z); + cudaFree(dev_E); +#else + + //allocate space to store the bessel function call results + double vm; + stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + stim::complex<double>* y_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + stim::complex<double>* dj_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + stim::complex<double>* dy_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); + + T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); + + T r, cos_phi; + stim::complex<double> knr; + stim::complex<T> h; + for(size_t i = 0; i < N; i++){ + stim::vec3<T> p; //declare a 3D point + + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; + r = p.len(); + if(r < a){ + E[i] = 0; + for(size_t w = 0; w < W.size(); w++){ + knr = (stim::complex<double>)n * p.len() * W[w].kmag(); //calculate k*n*r + + stim::cbessjyva_sph(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr); + if(r == 0) + cos_phi = 0; + else + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction + stim::legendre(Nl, cos_phi, P); + + for(size_t l = 0; l <= Nl; l++){ + E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l]; + } + } + } + } +#endif +} + +template<typename T> +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){ + std::vector< stim::scalarwave<T> > W(1, w); + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, r_spacing); +} + + +/// Class stim::scalarmie represents a scalar Mie scattering model that can be used to calculate the fields produced by a scattering sphere. +template<typename T> +class scalarmie +{ +private: + T radius; //radius of the scattering sphere + stim::complex<T> n; //refractive index of the scattering sphere + +public: + + scalarmie(T r, stim::complex<T> ri){ + radius = r; + n = ri; + } + + void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ + + size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid + float* X = (float*) malloc( array_size ); //allocate space for the coordinate meshes + float* Y = (float*) malloc( array_size ); + float* Z = (float*) malloc( array_size ); + E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes + + b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion + + std::vector< stim::scalarwave<T> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves + stim::cpu_scalar_mie_scatter(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); + stim::cpu_scalar_mie_internal(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); + } + +}; //end stim::scalarmie + +} //end namespace stim + +#endif \ No newline at end of file diff --git a/stim/optics/scalarwave.h b/stim/optics/scalarwave.h index ac620ab..d993567 100644 --- a/stim/optics/scalarwave.h +++ b/stim/optics/scalarwave.h @@ -210,7 +210,7 @@ template<typename T> __global__ void cuda_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t n_waves){ extern __shared__ stim::scalarwave<T> shared_W[]; //declare the list of waves in shared memory - stim::cuda::sharedMemcpy(shared_W, W, n_waves, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access + stim::cuda::threadedMemcpy(shared_W, W, n_waves, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access __syncthreads(); //synchronize threads to ensure all data is copied size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array diff --git a/stim/parser/arguments.h b/stim/parser/arguments.h index 3ca819e..148893f 100644 --- a/stim/parser/arguments.h +++ b/stim/parser/arguments.h @@ -42,11 +42,13 @@ namespace stim{ bool char_numerical(char c){ if( (c >= '0' && c <= '9') || c == '-' || c == '.') return true; + return false; } bool char_integral(char c){ if( (c >= '0' && c <= '9') || c == '-') return true; + return false; } /// Test if a given string contains a numerical (real) value @@ -475,31 +477,31 @@ namespace stim{ ///Determines if a parameter has been set and returns true if it has /// @param _name is the name of the argument - bool 
operator()(std::string _name) - { - size_t i = find(opts.begin(), opts.end(), _name) - opts.begin(); + bool operator()(std::string _name){ + std::vector<cmd_option>::iterator it; + it = std::find(opts.begin(), opts.end(), _name);// - opts.begin(); - if(i < 0){ + if(it == opts.end()){ std::cout<<"ERROR - Unspecified parameter name: "<<_name<<std::endl; exit(1); } - return opts[i].is_set(); + return it->is_set(); } ///Returns the number of parameters for a specified argument /// @param _name is the name of the argument whose parameter number will be returned - size_t nargs(std::string _name) - { - size_t i = find(opts.begin(), opts.end(), _name) - opts.begin(); + size_t nargs(std::string _name){ + std::vector<cmd_option>::iterator it; + it = find(opts.begin(), opts.end(), _name);// - opts.begin(); - if(i < 0){ + if(it == opts.end()){ std::cout<<"ERROR - Unspecified parameter name: "<<_name<<std::endl; exit(1); } - return opts[i].nargs(); + return it->nargs(); } ///Returns the number of arguments that have been set @@ -525,17 +527,16 @@ namespace stim{ ///Returns an object describing the argument /// @param _name is the name of the requested argument - cmd_option operator[](std::string _name) - { - size_t i = find(opts.begin(), opts.end(), _name) - opts.begin(); + cmd_option operator[](std::string _name){ + std::vector<cmd_option>::iterator it; + it = find(opts.begin(), opts.end(), _name);// - opts.begin(); - if(i < 0 || i >= opts.size()) - { + if(it == opts.end()){ std::cout<<"ERROR - Unspecified parameter name: "<<_name<
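Note on the stim/parser/arguments.h hunks above: the old lookups stored the result of find() minus opts.begin() in a size_t, so the "if(i < 0)" guard could never fire and a missing option name indexed past the end of opts. The replacement keeps the iterator returned by std::find and compares it against opts.end(). The sketch below shows the same pattern in isolation; it uses a plain std::vector<std::string> as a stand-in for the library's std::vector<cmd_option>, so the names here are illustrative rather than code from the patch.

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for the option lookup: an unsigned index can never be negative,
// so the not-found case must be detected by comparing the iterator returned
// by std::find against end() rather than testing the index with i < 0.
bool option_exists(const std::vector<std::string>& opts, const std::string& name){
	std::vector<std::string>::const_iterator it = std::find(opts.begin(), opts.end(), name);
	if(it == opts.end()){
		std::cout<<"ERROR - Unspecified parameter name: "<<name<<std::endl;
		return false;
	}
	return true;
}

int main(){
	std::vector<std::string> opts;
	opts.push_back("--input");
	opts.push_back("--help");
	std::cout<<option_exists(opts, "--help")<<std::endl;		//prints 1
	std::cout<<option_exists(opts, "--verbose")<<std::endl;	//prints the error message, then 0
	return 0;
}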
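Note on the internal-field kernel earlier in this patch: the look-up table stores one row of Nl + 1 values of j_l * A_l per radius sample, and the kernel converts a point's radius r into a fractional row index, then linearly interpolates each coefficient between the two bracketing rows (the clerp calls). The standalone CPU sketch below mirrors that indexing; the function name and the use of std::complex<float> are assumptions made for illustration, not code from the patch.

#include <complex>
#include <cstddef>
#include <vector>

// Illustrative CPU version of the kernel's LUT lookup: rows of width (Nl + 1)
// hold j_l * A_l for increasing radius; a sample at radius r is linearly
// interpolated between the two bracketing rows.
std::complex<float> lut_lookup(const std::vector< std::complex<float> >& lut,
                               std::size_t Nl, std::size_t l, float r, float r_min, float dr){
	float fij = (r - r_min) / dr;				//fractional row index
	std::size_t ij = (std::size_t)fij;			//integer row index
	float alpha = fij - (float)ij;				//fractional distance between the two rows
	std::size_t n0 = ij * (Nl + 1);				//offset of the first bracketing row
	std::size_t n1 = (ij + 1) * (Nl + 1);		//offset of the second bracketing row
	return lut[n0 + l] * (1.0f - alpha) + lut[n1 + l] * alpha;	//linear interpolation (the kernel's clerp)
}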