From 9339fbad873457047446ee3a90f52296eda250a5 Mon Sep 17 00:00:00 2001 From: David Date: Mon, 13 Jun 2016 14:29:34 -0500 Subject: [PATCH] implementing mie scattering --- stim/cuda/cudatools/callable.h | 2 +- stim/image/image.h | 32 ++++++++++++-------------------- stim/math/bessel.h | 42 ++++++++++++++++++++++++------------------ stim/math/constants.h | 1 + stim/math/matrix.h | 2 +- stim/math/quaternion.h | 2 ++ stim/optics/mie.h | 373 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ stim/optics/scalarbeam.h | 286 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------------------- stim/optics/scalarwave.h | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------- 9 files changed, 698 insertions(+), 134 deletions(-) create mode 100644 stim/optics/mie.h diff --git a/stim/cuda/cudatools/callable.h b/stim/cuda/cudatools/callable.h index eefc437..52e1c8a 100644 --- a/stim/cuda/cudatools/callable.h +++ b/stim/cuda/cudatools/callable.h @@ -2,7 +2,7 @@ //define the CUDA_CALLABLE macro (will prefix all members) #ifdef __CUDACC__ -#define CUDA_CALLABLE __host__ __device__ +#define CUDA_CALLABLE __host__ __device__ inline #else #define CUDA_CALLABLE #endif diff --git a/stim/image/image.h b/stim/image/image.h index 02f7dc1..674bf9f 100644 --- a/stim/image/image.h +++ b/stim/image/image.h @@ -58,12 +58,12 @@ class image{ int cv_type(){ if(std::is_same::value) return CV_MAKETYPE(CV_8U, (int)C()); - if(std::is_same::value) return CV_MAKETYPE(CV_8S, (int)C()); - if(std::is_same::value) return CV_MAKETYPE(CV_16U, (int)C()); - if(std::is_same::value) return CV_MAKETYPE(CV_16S, (int)C()); - if(std::is_same::value) return CV_MAKETYPE(CV_32S, (int)C()); - if(std::is_same::value) return CV_MAKETYPE(CV_32F, (int)C()); - if(std::is_same::value) return CV_MAKETYPE(CV_64F, (int)C()); + else if(std::is_same::value) return CV_MAKETYPE(CV_8S, (int)C()); + else if(std::is_same::value) return CV_MAKETYPE(CV_16U, (int)C()); + else if(std::is_same::value) return CV_MAKETYPE(CV_16S, (int)C()); + else if(std::is_same::value) return CV_MAKETYPE(CV_32S, (int)C()); + else if(std::is_same::value) return CV_MAKETYPE(CV_32F, (int)C()); + else if(std::is_same::value) return CV_MAKETYPE(CV_64F, (int)C()); std::cout<<"ERROR in stim::image::cv_type - no valid data type found"<::value) return UCHAR_MAX; - if(std::is_same::value) return SHRT_MAX; - if(std::is_same::value) return UINT_MAX; - if(std::is_same::value) return ULONG_MAX; - if(std::is_same::value) return ULLONG_MAX; - if(std::is_same::value) return 1.0f; - if(std::is_same::value) return 1.0; + else if(std::is_same::value) return SHRT_MAX; + else if(std::is_same::value) return UINT_MAX; + else if(std::is_same::value) return ULONG_MAX; + else if(std::is_same::value) return ULLONG_MAX; + else if(std::is_same::value) return 1.0f; + else if(std::is_same::value) return 1.0; std::cout<<"ERROR in stim::image::white - no white value known for this data type"< operator=(const stim::image& I){ - if(&I == this) 
//handle self-assignment - return *this; - allocate(I.X(), I.Y(), I.C()); - memcpy(img, I.img, bytes()); - return *this; - }*/ - stim::image& operator=(const stim::image& I){ init(); if(&I == this) //handle self-assignment diff --git a/stim/math/bessel.h b/stim/math/bessel.h index 6de92a2..6c31c43 100644 --- a/stim/math/bessel.h +++ b/stim/math/bessel.h @@ -1258,7 +1258,7 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, P a0,v0,pv0,pv1,vl,ga,gb,vg,vv,w0,w1,ya0,yak,ya1,wa; int j,n,k,kz,l,lb,lb0,m; - a0 = abs(z); + a0 = ::abs(z); z1 = z; z2 = z*z; n = (int)v; @@ -1286,7 +1286,7 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, vm = v; return 0; } - if (real(z1) < 0.0) z1 = -z; + if (::real(z1) < 0.0) z1 = -z; if (a0 <= 12.0) { for (l=0;l<2;l++) { vl = v0+l; @@ -1295,7 +1295,7 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, for (k=1;k<=40;k++) { cr *= -0.25*z2/(k*(k+vl)); cjvl += cr; - if (abs(cr) < abs(cjvl)*eps) break; + if (::abs(cr) < ::abs(cjvl)*eps) break; } vg = 1.0 + vl; ga = gamma(vg); @@ -1348,7 +1348,7 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, for (k=1;k<=40;k++) { cr *= -0.25*z2/(k*(k-vl)); cjvl += cr; - if (abs(cr) < abs(cjvl)*eps) break; + if (::abs(cr) < ::abs(cjvl)*eps) break; } vg = 1.0-vl; gb = gamma(vg); @@ -1381,16 +1381,16 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, cyv1 = M_2_PI*(cec*cjv1-1.0/z1-0.25*z1*cs1); } } - if (real(z) < 0.0) { + if (::real(z) < 0.0) { cfac0 = exp(pv0*cii); cfac1 = exp(pv1*cii); - if (imag(z) < 0.0) { + if (::imag(z) < 0.0) { cyv0 = cfac0*cyv0-(P)2.0*(complex
<P>
)cii*cos(pv0)*cjv0; cyv1 = cfac1*cyv1-(P)2.0*(complex
<P>
)cii*cos(pv1)*cjv1; cjv0 /= cfac0; cjv1 /= cfac1; } - else if (imag(z) > 0.0) { + else if (::imag(z) > 0.0) { cyv0 = cyv0/cfac0+(P)2.0*(complex
<P>
)cii*cos(pv0)*cjv0; cyv1 = cyv1/cfac1+(P)2.0*(complex
<P>
)cii*cos(pv1)*cjv1; cjv0 *= cfac0; @@ -1421,7 +1421,7 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, cf2 = cf1; cf1 = cf; } - if (abs(cjv0) > abs(cjv1)) cs = cjv0/cf; + if (::abs(cjv0) > ::abs(cjv1)) cs = cjv0/cf; else cs = cjv1/cf2; for (k=0;k<=n;k++) { cjv[k] *= cs; @@ -1433,21 +1433,21 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, } cyv[0] = cyv0; cyv[1] = cyv1; - ya0 = abs(cyv0); + ya0 = ::abs(cyv0); lb = 0; cg0 = cyv0; cg1 = cyv1; for (k=2;k<=n;k++) { cyk = 2.0*(v0+k-1.0)*cg1/z-cg0; - yak = abs(cyk); - ya1 = abs(cg0); + yak = ::abs(cyk); + ya1 = ::abs(cg0); if ((yak < ya0) && (yak< ya1)) lb = k; cyv[k] = cyk; cg0 = cg1; cg1 = cyk; } lb0 = 0; - if ((lb > 4) && (imag(z) != 0.0)) { + if ((lb > 4) && (::imag(z) != 0.0)) { while(lb != lb0) { ch2 = cone; ch1 = czero; @@ -1470,7 +1470,7 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, cp21 = ch2; if (lb == n) cjv[lb+1] = 2.0*(lb+v0)*cjv[lb]/z-cjv[lb-1]; - if (abs(cjv[0]) > abs(cjv[1])) { + if (::abs(cjv[0]) > ::abs(cjv[1])) { cyv[lb+1] = (cjv[lb+1]*cyv0-2.0*cp11/(M_PI*z))/cjv[0]; cyv[lb] = (cjv[lb]*cyv0+2.0*cp12/(M_PI*z))/cjv[0]; } @@ -1495,8 +1495,8 @@ int cbessjyva(P v,complex
<P> z,P &vm,complex<P>
*cjv, cyl2 = cylk; } for (k=2;k<=n;k++) { - wa = abs(cyv[k]); - if (wa < abs(cyv[k-1])) lb = k; + wa = ::abs(cyv[k]); + if (wa < ::abs(cyv[k-1])) lb = k; } } } @@ -1515,12 +1515,18 @@ int cbessjyva_sph(int v,complex
<P> z,P &vm,complex<P>
*cjv, //first, compute the bessel functions of fractional order cbessjyva
<P>
(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp); + if(z == 0){ //handle degenerate case of z = 0 + memset(cjv, 0, sizeof(P) * (v+1)); + cjv[0] = 1; + } + //iterate through each and scale for(int n = 0; n<=v; n++) { - - cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0)); - cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0)); + if(z != 0){ //handle degenerate case of z = 0 + cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0)); + cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0)); + } cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0)); cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0)); diff --git a/stim/math/constants.h b/stim/math/constants.h index 3ea11f9..9d93349 100644 --- a/stim/math/constants.h +++ b/stim/math/constants.h @@ -1,6 +1,7 @@ #ifndef STIM_CONSTANTS_H #define STIM_CONSTANTS_H +#include "stim/cuda/cudatools/callable.h" namespace stim{ const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862; const double TAU = 2 * stim::PI; diff --git a/stim/math/matrix.h b/stim/math/matrix.h index 5d749cf..64bcb28 100644 --- a/stim/math/matrix.h +++ b/stim/math/matrix.h @@ -55,7 +55,7 @@ struct matrix vec operator*(vec rhs){ unsigned int N = rhs.size(); - vec3Y> result; + vec result; result.resize(N); for(int r=0; r from, vec3 to){ + from = from.norm(); + to = to.norm(); vec3 r = from.cross(to); //compute the rotation vector T theta = asin(r.len()); //compute the angle of the rotation about r //deal with a zero vector (both k and kn point in the same direction) diff --git a/stim/optics/mie.h b/stim/optics/mie.h new file mode 100644 index 0000000..59796d8 --- /dev/null +++ b/stim/optics/mie.h @@ -0,0 +1,373 @@ +#ifndef STIM_MIE_H +#define STIM_MIE_H + +#include "scalarwave.h" +#include "../math/bessel.h" +#include + +namespace stim{ + + +/// Calculate the scattering coefficients for a spherical scatterer +template +void B_coefficients(stim::complex* B, T a, T k, stim::complex n, int Nl){ + + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + + stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) + stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) + + stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a + stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a + + stim::complex h_ka, dh_ka; + stim::complex numerator, denominator; + stim::complex i(0, 1); + for(size_t l = 0; l <= Nl; l++){ + h_ka.r = j_ka[l]; + h_ka.i = y_ka[l]; + dh_ka.r = dj_ka[l]; + dh_ka.i = dy_ka[l]; + + numerator = j_ka[l] * dj_kna[l] * (stim::complex)n - j_kna[l] * dj_ka[l]; + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; + B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; + std::cout< +void 
A_coefficients(stim::complex* A, T a, T k, stim::complex n, int Nl){ + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); + + stim::complex* j_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* y_kna = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dj_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dy_kna= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) + stim::complex kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) + + stim::bessjyv_sph(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a + stim::cbessjyva_sph(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a + + stim::complex h_ka, dh_ka; + stim::complex numerator, denominator; + stim::complex i(0, 1); + for(size_t l = 0; l <= Nl; l++){ + h_ka.r = j_ka[l]; + h_ka.i = y_ka[l]; + dh_ka.r = dj_ka[l]; + dh_ka.i = dy_ka[l]; + + numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka; + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; + A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; + } +} + +template +__global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* B, T* j, T kr_min, T dkr, int Nl){ + extern __shared__ stim::scalarwave shared_W[]; //declare the list of waves in shared memory + + stim::cuda::sharedMemcpy(shared_W, W, nW, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access + __syncthreads(); //synchronize threads to insure all data is copied + + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + if(i >= N) return; //exit if this thread is outside the array + stim::vec3 p; + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; + + T r = p.len(); //calculate the distance from the sphere + T k = W[0].kmag(); + if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) + + size_t NC = Nl + 1; //calculate the number of coefficients to be used + T kr = r * k; //calculate the thread value for k*r + T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT + size_t ij = (size_t) fij; //convert to an integral index + T alpha = fij - ij; //calculate the fractional portion of the index + size_t n0j = ij * (NC); //start of the first entry in the LUT + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT + + T cos_phi; + T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials + T Pl = 1; //initialize the current value for the Legendre polynomial + T jl; + stim::complex Ei = 0; //create a register to store the result + int l; + for(size_t w = 0; w < nW; w++){ + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere + for(l = 0; l <= Nl; l++){ + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; + if(l == 0){ //computing Pl is done recursively, where the recursive relation + Pl = cos_phi; // requires the first two orders. This defines the second. + } + else{ //if this is not the first iteration, use the recursive relation to calculate Pl + Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1); + } + + jl = lerp( j[n0j + l], j[n1j + l], alpha ); //read jl from the LUT and interpolate the result + Ei += W[w].E() * B[l] * jl * Pl; + } + //Ei += shared_W[w].pos(p); //evaluate the plane wave + } + E[i] += Ei; //copy the result to device memory +} + +template +void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* B, T* j, T kr_min, T dkr, size_t Nl){ + + size_t wave_bytes = sizeof(stim::scalarwave); + size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available + size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array + size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory + size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves + size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required + + stim::scalarwave* batch_W; + HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves + + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks + + size_t batch_size; //declare a variable to store the size of the current batch + size_t waves_processed = 0; //initialize the number of waves processed to zero + while(waves_processed < nW){ //while there are still waves to be processed + batch_size = min(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left + batch_bytes = batch_size * sizeof(stim::scalarwave); + HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory + cuda_scalar_mie_scatter<<< blocks, threads, batch_bytes >>>(E, N, x, y, z, batch_W, batch_size, a, n, B, j, kr_min, 
dkr, (int)Nl); //call the kernel + waves_processed += batch_size; //increment the counter indicating how many waves have been processed + } + cudaFree(batch_W); +} +/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave + +/// @param E is a pointer to the destination field values +/// @param N is the number of points used to calculate the field +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) +/// @param W is an array of planewaves that will be scattered +/// @param a is the radius of the sphere +/// @param n is the complex refractive index of the sphere +template +void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n){ + //calculate the necessary number of orders required to represent the scattered field + T k = W[0].kmag(); + + size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); + + //calculate the scattering coefficients for the sphere + stim::complex* B = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients + B_coefficients(B, a, k, n, Nl); + +#ifdef __CUDACC__ + stim::complex* dev_E; //allocate space for the field + cudaMalloc(&dev_E, N * sizeof(stim::complex)); + cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); + //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) + + // COORDINATES + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) + if(x != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) + if(y != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) + if(z != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); + } + + // PLANE WAVES + stim::scalarwave* dev_W; //allocate space and copy plane waves + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave) * W.size(), cudaMemcpyHostToDevice) ); + + // SCATTERING COEFFICIENTS + stim::complex* dev_B; + HANDLE_ERROR( cudaMalloc(&dev_B, sizeof(stim::complex) * (Nl+1)) ); + HANDLE_ERROR( cudaMemcpy(dev_B, B, sizeof(stim::complex) * (Nl+1), cudaMemcpyHostToDevice) ); + + // BESSEL FUNCTION LOOK-UP TABLE + //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r + size_t Nlut_j = 1024; + T r_min = 0; + T r_max = 10; + + T kr_min = k * r_min; + T kr_max = k * r_max; + + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dyv= (double*) malloc( (Nl + 1) * 
sizeof(double) ); + + size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j; + T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table + T dkr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT + std::cout<<"LUT jl bytes: "<(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] + for(size_t l = 0; l <= Nl; l++){ //for each order + bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result + } + } + + stim::cpu2image(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); + + //Allocate device memory and copy everything to the GPU + T* dev_j_lut; + HANDLE_ERROR( cudaMalloc(&dev_j_lut, lutj_bytes) ); + HANDLE_ERROR( cudaMemcpy(dev_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) ); + + gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_B, dev_j_lut, kr_min, dkr, Nl); + + cudaMemcpy(E, dev_E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory + + if(x != NULL) cudaFree(dev_x); //free everything + if(y != NULL) cudaFree(dev_y); + if(z != NULL) cudaFree(dev_z); + cudaFree(dev_E); +#else + + + //allocate space to store the bessel function call results + double vm; + double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); + + T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); + + T r, kr, cos_phi; + stim::complex h; + for(size_t i = 0; i < N; i++){ + stim::vec3 p; //declare a 3D point + + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; + r = p.len(); + if(r >= a){ + for(size_t w = 0; w < W.size(); w++){ + kr = p.len() * W[w].kmag(); //calculate k*r + stim::bessjyv_sph(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr); + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction + stim::legendre(Nl, cos_phi, P); + + for(size_t l = 0; l <= Nl; l++){ + h.r = j_kr[l]; + h.i = y_kr[l]; + E[i] += W[w].E() * B[l] * h * P[l]; + } + } + } + } +#endif +} + +template +void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n){ + std::vector< stim::scalarwave > W(1, w); + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n); +} + +/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere + +/// @param E is a pointer to the destination field values +/// @param N is the number of points used to calculate the field +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) +/// @param w is a planewave that will be scattered +/// @param a is the radius of the sphere +/// @param n is the complex refractive index of the sphere +template +void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > W, T a, stim::complex n){ + + //calculate the necessary number of orders required to represent the scattered field + T k = W[0].kmag(); + + size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); + + //calculate the scattering coefficients for the sphere + stim::complex* A = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients + A_coefficients(A, a, k, n, Nl); + + //allocate space to store the bessel function call results + double vm; + stim::complex* j_knr = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* y_knr = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dj_knr= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dy_knr= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + + T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); + + T r, cos_phi; + stim::complex knr; + stim::complex h; + for(size_t i = 0; i < N; i++){ + stim::vec3 p; //declare a 3D point + + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? 
p[2] = 0 : p[2] = z[i]; + r = p.len(); + if(r < a){ + E[i] = 0; + for(size_t w = 0; w < W.size(); w++){ + knr = (stim::complex)n * p.len() * W[w].kmag(); //calculate k*n*r + + stim::cbessjyva_sph(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr); + if(r == 0) + cos_phi = 0; + else + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction + stim::legendre(Nl, cos_phi, P); + + for(size_t l = 0; l <= Nl; l++){ + E[i] += W[w].E() * A[l] * (stim::complex)j_knr[l] * P[l]; + } + } + } + } +} + +template +void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n){ + std::vector< stim::scalarwave > W(1, w); + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n); +} + +} + +#endif \ No newline at end of file diff --git a/stim/optics/scalarbeam.h b/stim/optics/scalarbeam.h index cfb817b..e4dcd06 100644 --- a/stim/optics/scalarbeam.h +++ b/stim/optics/scalarbeam.h @@ -5,7 +5,12 @@ #include "../optics/scalarwave.h" #include "../math/bessel.h" #include "../math/legendre.h" +#include "../cuda/cudatools/devices.h" +#include "../cuda/cudatools/timer.h" +#include +#include #include +#include namespace stim{ @@ -105,10 +110,11 @@ public: std::vector< scalarwave > samples(N); //create a vector of plane waves T kmag = (T)stim::TAU / lambda; //calculate the wavenumber stim::complex apw; //allocate space for the amplitude at the focal point + T a = stim::TAU * (1 - cos(asin(NA[0]))) / (double)N; stim::vec3 kpw; //declare the new k-vector based on the focused plane wave direction for(size_t i=0; i(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave + apw = a * exp(stim::complex(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave samples[i] = scalarwave(kpw, apw); //create a plane wave based on the direction } @@ -148,7 +154,7 @@ public: /// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional) /// @param C is a pointer to Nl + 1 values where the terms will be stored template -CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){ +CUDA_CALLABLE void cpu_aperture_integral(T* C, int Nl, T NA, T NA_in = 0){ size_t table_bytes = (Nl + 1) * sizeof(T); //calculate the number of bytes required to store the terms T cos_alpha_1 = cos(asin(NA_in)); //calculate the cosine of the angle subtended by the central obscuration @@ -182,23 +188,151 @@ CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){ /// performs linear interpolation into a look-up table template -T lut_lookup(T* lut, T val, size_t N, T min_val, T delta, size_t stride = 0){ - size_t idx = (size_t)((val - min_val) / delta); - T alpha = val - idx * delta + min_val; +CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val, T delta, size_t n_vals){ + T idx = ((val - min_val) / delta); + size_t i = (size_t) idx; + T a1 = idx - i; + T a0 = 1 - a1; + size_t n0 = i * n_vals; + size_t n1 = (i+1) * n_vals; + for(size_t n = 0; n < n_vals; n++){ + lut_values[n] = lut[n0 + n] * a0 + lut[n1 + n] * a1; + } +} - if(alpha == 0) return lut[idx]; - else return lut[idx * stride] * (1 - alpha) + lut[ (idx+1) * stride] * alpha; +template +CUDA_CALLABLE T lerp(T v0, T v1, T t) { + return fma(t, v1, fma(-t, v0, v0)); } +#ifdef __CUDACC__ template -void cpu_scalar_psf(stim::complex* F, size_t N, T* r, T* phi, T lambda, T A, stim::vec3 f, T NA, T NA_in, int Nl){ - T k = stim::TAU / lambda; +__global__ void 
cuda_scalar_psf(stim::complex* E, size_t N, T* r, T* phi, T k, T A, size_t Nl, + T* C, + T* lut_j, size_t Nj, T min_kr, T dkr){ + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + if(i >= N) return; //exit if this thread is outside the array + + T cos_phi = cos(phi[i]); //calculate the thread value for cos(phi) + T kr = r[i] * k; //calculate the thread value for k*r + stim::complex Ei = 0; //initialize the value of the field to zero + size_t NC = Nl + 1; //calculate the number of coefficients to be used + + T fij = (kr - min_kr)/dkr; //FP index into the spherical bessel LUT + size_t ij = (size_t) fij; //convert to an integral index + T a = fij - ij; //calculate the fractional portion of the index + size_t n0j = ij * (NC); //start of the first entry in the LUT + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT + + T jl; //declare register to store the spherical bessel function + T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials + T Pl = 1; //initialize the current value for the Legendre polynomial + stim::complex im(0, 1); //declare i (imaginary 1) + stim::complex i_pow(1, 0); //i_pow stores the current value of i^l so it doesn't have to be re-computed every iteration + for(int l = 0; l <= Nl; l++){ //for each order + jl = lerp( lut_j[n0j + l], lut_j[n1j + l], a ); //read jl from the LUT and interpolate the result + Ei += i_pow * jl * Pl * C[l]; //calculate the value for the field and sum + i_pow *= im; //multiply i^l * i for the next iteration + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; + if(l == 0){ //computing Pl is done recursively, where the recursive relation + Pl = cos_phi; // requires the first two orders. This defines the second. + } + else{ //if this is not the first iteration, use the recursive relation to calculate Pl + Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1); + } + + } + E[i] = Ei * A * 2 * CUDART_PI_F; //scale the integral by the amplitude +} + +template +void gpu_scalar_psf_local(stim::complex* E, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl, T r_spacing){ + + //Find the minimum and maximum values of r + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); //create a cuBLAS handle + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS initialization failed\n"); + exit(1); + } + + int i_min, i_max; + stat = cublasIsamin(handle, (int)N, r, 1, &i_min); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate minimum r value.\n"); + exit(1); + } + stat = cublasIsamax(handle, (int)N, r, 1, &i_max); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate maximum r value.\n"); + exit(1); + } + + T r_min, r_max; //allocate space to store the minimum and maximum values + HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU + HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); + + T k = (T)stim::TAU / lambda; //calculate the wavenumber from lambda + size_t C_bytes = (Nl + 1) * sizeof(T); + T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms + cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms + + size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r + + 
T kr_min = k * r_min; + T kr_max = k * r_max; + + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); + + size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j; + T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table + T delta_kr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT + std::cout<<"LUT jl bytes: "<(Nl, kr_min + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] + for(size_t l = 0; l <= Nl; l++){ //for each order + bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result + } + } + + stim::cpu2image(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); + + //Allocate device memory and copy everything to the GPU + + T* gpu_C; + HANDLE_ERROR( cudaMalloc(&gpu_C, C_bytes) ); + HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) ); + T* gpu_j_lut; + HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) ); + HANDLE_ERROR( cudaMemcpy(gpu_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) ); + + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device + dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks - T* C = (T*) malloc( (Nl + 1) * sizeof(T) ); //allocate space for the aperture integral terms + cuda_scalar_psf<<< blocks, threads >>>(E, N, r, phi, (T)stim::TAU/lambda, A, Nl, gpu_C, gpu_j_lut, Nlut_j, kr_min, delta_kr); + + //free the LUT and condenser tables + HANDLE_ERROR( cudaFree(gpu_C) ); + HANDLE_ERROR( cudaFree(gpu_j_lut) ); +} +#endif + +/// Calculate the analytical solution to a scalar point spread function given a set of spherical coordinates about the PSF (beam propagation along phi = theta = 0) +template +void cpu_scalar_psf_local(stim::complex* F, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl){ + T k = (T)stim::TAU / lambda; + size_t C_bytes = (Nl + 1) * sizeof(T); + T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms memset(F, 0, N * sizeof(stim::complex)); -#ifdef NO_CUDA - memset(F, 0, N * sizeof(stim::complex)); T jl, Pl, kr, cos_phi; double vm; @@ -225,71 +359,117 @@ void cpu_scalar_psf(stim::complex* F, size_t N, T* r, T* phi, T lambda, T A, free(C); free(Pl_cos_phi); -#else - T min_r = r[0]; - T max_r = r[0]; - for(size_t i = 0; i < N; i++){ //find the minimum and maximum values of r (min and max distance from the focal point) - if(r[i] < min_r) min_r = r[i]; - if(r[i] > max_r) max_r = r[i]; - } - T min_kr = k * min_r; - T max_kr = k * max_r; +} - //temporary variables - double vm; - double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); +/// Converts a set of cartesian points into spherical coordinates surrounding a point spread function (PSF) +/// @param r is the output distance from the PSF +/// @param phi is the non-symmetric direction about the PSF +/// @param x (x, y, z) are the cartesian coordinates in world space +/// @f is the focal point of the 
PSF in cartesian coordinates +/// @d is the propagation direction of the PSF in cartesian coordinates +template +__global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::vec3 f, stim::quaternion q){ - size_t Nlut = (size_t)sqrt(N) * 2; - T* bessel_lut = (T*) malloc(sizeof(T) * (Nl+1) * Nlut); - T delta_kr = (max_kr - min_kr) / (Nlut-1); - for(size_t kri = 0; kri < Nlut; kri++){ - stim::bessjyv_sph(Nl, min_kr + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] - for(size_t l = 0; l <= Nl; l++){ - bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; - } - } + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + if(i >= N) return; //exit if this thread is outside the array - T* Pl_cos_phi = (T*) malloc((Nl + 1) * sizeof(T)); - T kr, cos_phi, jl, Pl; - for(size_t n = 0; n < N; n++){ //for each point in the field - kr = k * r[n]; //calculate kr (the optical distance between the focal point and p) - cos_phi = std::cos(phi[n]); //calculate the cosine of phi - stim::legendre(Nl, cos_phi, Pl_cos_phi); //calculate the [0 Nl] legendre polynomials for this point + stim::vec3 p; //declare a 3D point + + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? p[2] = 0 : p[2] = z[i]; - for(int l = 0; l <= Nl; l++){ - jl = lut_lookup(&bessel_lut[l], kr, Nlut, min_kr, delta_kr, Nl+1); - Pl = Pl_cos_phi[l]; - F[n] += pow(complex(0, 1), l) * jl * Pl * C[l]; - } - F[n] *= A * stim::TAU; - } -#endif + p = p - f; //shift the point to the center of the PSF (focal point) + p = q.toMatrix3() * p; //rotate the point to align with the propagation direction + + stim::vec3 ps = p.cart2sph(); //convert from cartesian to spherical coordinates + r[i] = ps[0]; //store r + phi[i] = ps[2]; //phi = [0 pi] } +#ifdef __CUDACC__ +/// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates +template +void gpu_scalar_psf_cart(stim::complex* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3 f, stim::vec3 d, T NA, T NA_in, int Nl, T r_spacing = 1){ + + T* gpu_r; //allocate space for the coordinates in r + HANDLE_ERROR( cudaMalloc(&gpu_r, sizeof(T) * N) ); + T* gpu_phi; + HANDLE_ERROR( cudaMalloc(&gpu_phi, sizeof(T) * N) ); + //stim::complex* gpu_E; + //HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex) * N) ); + + stim::quaternion q; //create a quaternion + q.CreateRotation(d, stim::vec3(0, 0, 1)); //create a mapping from the propagation direction to the PSF space + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device + dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks + cuda_cart2psf <<< blocks, threads >>> (gpu_r, gpu_phi, N, x, y, z, f, q); //call the CUDA kernel to move the cartesian coordinates to PSF space + + gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing); + +} +#endif template -void cpu_scalar_psf(stim::complex* F, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3 f, T NA, T NA_in, int Nl){ +void cpu_scalar_psf_cart(stim::complex* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3 f, stim::vec3 d, T NA, T NA_in, int Nl, T r_spacing = 1){ + +// If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel +#ifdef __CUDACC__ + + T* gpu_x = NULL; + if(x != NULL){ + HANDLE_ERROR( cudaMalloc(&gpu_x, sizeof(T) * N) ); + HANDLE_ERROR( 
cudaMemcpy(gpu_x, x, sizeof(T) * N, cudaMemcpyHostToDevice) ); + } + T* gpu_y = NULL; + if(y != NULL){ + HANDLE_ERROR( cudaMalloc(&gpu_y, sizeof(T) * N) ); + HANDLE_ERROR( cudaMemcpy(gpu_y, y, sizeof(T) * N, cudaMemcpyHostToDevice) ); + } + T* gpu_z = NULL; + if(z != NULL){ + HANDLE_ERROR( cudaMalloc(&gpu_z, sizeof(T) * N) ); + HANDLE_ERROR( cudaMemcpy(gpu_z, z, sizeof(T) * N, cudaMemcpyHostToDevice) ); + } + + stim::complex* gpu_E; + HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex) * N) ); + HANDLE_ERROR( cudaMemcpy(gpu_E, E, sizeof(stim::complex) * N, cudaMemcpyHostToDevice) ); + gpu_scalar_psf_cart(gpu_E, N, gpu_x, gpu_y, gpu_z, lambda, A, f, d, NA, NA_in, Nl, r_spacing); + HANDLE_ERROR( cudaMemcpy(E, gpu_E, sizeof(stim::complex) * N, cudaMemcpyDeviceToHost) ); + + HANDLE_ERROR( cudaFree(gpu_x) ); + HANDLE_ERROR( cudaFree(gpu_y) ); + HANDLE_ERROR( cudaFree(gpu_z) ); + HANDLE_ERROR( cudaFree(gpu_E) ); + +#else T* r = (T*) malloc(N * sizeof(T)); //allocate space for p in spherical coordinates T* phi = (T*) malloc(N * sizeof(T)); // only r and phi are necessary (the scalar PSF is symmetric about theta) - stim::vec3 p, ps; + stim::quaternion q; + q.CreateRotation(d, stim::vec3(0, 0, 1)); + stim::matrix R = q.toMatrix3(); + stim::vec3 p, ps, ds; for(size_t i = 0; i < N; i++){ (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions (y == NULL) ? p[1] = 0 : p[1] = y[i]; (z == NULL) ? p[2] = 0 : p[2] = z[i]; + p = p - f; + + p = R * p; //rotate the cartesian point + ps = p.cart2sph(); //convert from cartesian to spherical coordinates r[i] = ps[0]; //store r phi[i] = ps[2]; //phi = [0 pi] } - cpu_scalar_psf(F, N, r, phi, lambda, A, f, NA, NA_in, Nl); //call the spherical coordinate CPU function + cpu_scalar_psf_local(F, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function free(r); free(phi); +#endif } } //end namespace stim diff --git a/stim/optics/scalarwave.h b/stim/optics/scalarwave.h index 54ba253..4e63bed 100644 --- a/stim/optics/scalarwave.h +++ b/stim/optics/scalarwave.h @@ -60,7 +60,7 @@ public: return k.len(); } - CUDA_CALLABLE vec3< complex > E(){ + CUDA_CALLABLE complex E(){ return E0; } @@ -235,6 +235,34 @@ void gpu_scalarwave(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scala cuda_scalarwave<<< blocks, threads >>>(F, N, x, y, z, w); //call the kernel } +template +void gpu_scalarwaves(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW){ + + size_t wave_bytes = sizeof(stim::scalarwave); + size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available + size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array + size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory + size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves + size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required + + stim::scalarwave* batch_W; + HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves + + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks + + size_t batch_size; //declare a variable to store the size of the current batch + size_t waves_processed = 0; 
//initialize the number of waves processed to zero + while(waves_processed < nW){ //while there are still waves to be processed + batch_size = min(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left + batch_bytes = batch_size * sizeof(stim::scalarwave); + HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory + cuda_scalarwave<<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size); //call the kernel + waves_processed += batch_size; //increment the counter indicating how many waves have been processed + } + cudaFree(batch_W); +} + /// Sums a series of coherent plane waves at a specified point /// @param field is the output array of field values corresponding to each input point /// @param x is an array of x coordinates for the field point @@ -245,24 +273,13 @@ void gpu_scalarwave(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scala /// @param A is the list of amplitudes for each wave /// @param S is the list of propagation directions for each wave template -void cpu_sum_scalarwaves(stim::complex* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > w_array){ - size_t S = w_array.size(); //store the number of waves -#ifdef NO_CUDA - memset(F, 0, N * sizeof(stim::complex)); - T px, py, pz; - for(size_t i = 0; i < N; i++){ // for each element in the array - (x == NULL) ? px = 0 : px = x[i]; // test for NULL values - (y == NULL) ? py = 0 : py = y[i]; - (z == NULL) ? pz = 0 : pz = z[i]; - - for(size_t s = 0; s < S; s++){ - F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point - } - } -#else +void cpu_scalarwaves(stim::complex* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > W){ + size_t S = W.size(); //store the number of waves +#ifdef __CUDACC__ stim::complex* dev_F; //allocate space for the field cudaMalloc(&dev_F, N * sizeof(stim::complex)); - cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) + cudaMemcpy(dev_F, F, N * sizeof(stim::complex), cudaMemcpyHostToDevice); + //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) if(x != NULL){ @@ -282,28 +299,11 @@ void cpu_sum_scalarwaves(stim::complex* F, size_t N, T* x, T* y, T* z, std::v HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); } - size_t wave_bytes = sizeof(stim::scalarwave); - size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available - size_t array_bytes = w_array.size() * wave_bytes; //calculate the maximum number of bytes required for the planewave array - size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory - size_t num_batches = w_array.size() / max_batch + 1; //calculate the number of batches required to process all plane waves - size_t batch_bytes = min(w_array.size(), max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required - - stim::scalarwave* dev_w; - HANDLE_ERROR(cudaMalloc(&dev_w, batch_bytes)); //allocate memory for a single batch of plane waves + stim::scalarwave* dev_W; + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave) * W.size(), cudaMemcpyHostToDevice) ); - int threads = stim::maxThreadsPerBlock(); 
//get the maximum number of threads per block for the CUDA device - dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks - - size_t batch_size; //declare a variable to store the size of the current batch - size_t waves_processed = 0; //initialize the number of waves processed to zero - while(waves_processed < w_array.size()){ //while there are still waves to be processed - batch_size = min(max_batch, w_array.size() - waves_processed); //process either a whole batch, or whatever is left - batch_bytes = batch_size * sizeof(stim::scalarwave); - HANDLE_ERROR(cudaMemcpy(dev_w, &w_array[waves_processed], batch_bytes, cudaMemcpyHostToDevice)); //copy the plane waves into global memory - cuda_scalarwave<<< blocks, threads, batch_bytes >>>(dev_F, N, dev_x, dev_y, dev_z, dev_w, batch_size); //call the kernel - waves_processed += batch_size; //increment the counter indicating how many waves have been processed - } + gpu_scalarwaves(dev_F, N, dev_x, dev_y, dev_z, dev_W, W.size()); cudaMemcpy(F, dev_F, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory @@ -311,15 +311,25 @@ void cpu_sum_scalarwaves(stim::complex* F, size_t N, T* x, T* y, T* z, std::v if(y != NULL) cudaFree(dev_y); if(z != NULL) cudaFree(dev_z); cudaFree(dev_F); - cudaFree(dev_w); +#else + memset(F, 0, N * sizeof(stim::complex)); + T px, py, pz; + for(size_t i = 0; i < N; i++){ // for each element in the array + (x == NULL) ? px = 0 : px = x[i]; // test for NULL values + (y == NULL) ? py = 0 : py = y[i]; + (z == NULL) ? pz = 0 : pz = z[i]; + for(size_t s = 0; s < S; s++){ + F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point + } + } #endif } template void cpu_scalarwave(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scalarwave w){ std::vector< stim::scalarwave > w_array(1, w); - cpu_sum_scalarwaves(F, N, x, y, z, w_array); + cpu_scalarwaves(F, N, x, y, z, w_array); } @@ -331,7 +341,7 @@ void cpu_scalarwave(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scala /// @param A is the list of amplitudes for each wave /// @param S is the list of propagation directions for each wave template -CUDA_CALLABLE stim::complex sum_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave > W){ +CUDA_CALLABLE stim::complex cpu_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave > W){ size_t N = W.size(); //get the number of plane wave samples stim::complex field(0, 0); //initialize the field to zero (0) stim::vec3 k; //allocate space for the direction vector -- libgit2 0.21.4
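
Note on the new coefficients: written out, the series terms assembled by B_coefficients() and A_coefficients() in stim/optics/mie.h are the standard scalar Mie coefficients. With j_l the spherical Bessel function of the first kind, h_l = j_l + i*y_l the spherical Hankel function of the first kind, n the (complex) relative refractive index, and primes denoting derivatives with respect to the argument, the code above computes:

    B_l = (2l+1)\,i^l\,\frac{n\,j_l(ka)\,j_l'(kna) - j_l(kna)\,j_l'(ka)}{j_l(kna)\,h_l'(ka) - n\,h_l(ka)\,j_l'(kna)}

    A_l = (2l+1)\,i^l\,\frac{j_l(ka)\,h_l'(ka) - j_l'(ka)\,h_l(ka)}{j_l(kna)\,h_l'(ka) - n\,h_l(ka)\,j_l'(kna)}

Both cpu_scalar_mie_scatter() and cpu_scalar_mie_internal() truncate the series at N_l = \lceil ka + 4\sqrt[3]{ka} + 2 \rceil orders, a Wiscombe-style criterion.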
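Usage: the public entry points added by this patch are the single-wave overloads of cpu_scalar_mie_scatter() and cpu_scalar_mie_internal() at the bottom of stim/optics/mie.h. The driver below is a minimal sketch, not part of the patch: the grid size, extent, wavelength, sphere radius, and refractive index are illustrative assumptions, and the vec3/scalarwave/complex constructors are used as they appear elsewhere in this diff.

    #include <vector>
    #include "stim/optics/mie.h"

    int main(){
        const size_t N = 256 * 256;                        // assumed: sample a 256 x 256 slice through the sphere center
        std::vector<float> x(N), z(N);                     // passing y = NULL evaluates the field on the y = 0 plane
        for(size_t i = 0; i < N; i++){                     // assumed extent: [-10, 10] units along x and z
            x[i] = -10.0f + 20.0f * (i % 256) / 255.0f;
            z[i] = -10.0f + 20.0f * (i / 256) / 255.0f;
        }
        std::vector< stim::complex<float> > E(N, stim::complex<float>(0, 0)); // both functions sum into E, so start it at zero

        float lambda = 1.0f;                               // assumed wavelength
        stim::vec3<float> k(0, 0, (float)stim::TAU / lambda);     // k-vector propagating along +z
        stim::scalarwave<float> w(k, stim::complex<float>(1, 0)); // unit-amplitude incident plane wave
        float a = 2.0f;                                    // assumed sphere radius
        stim::complex<float> n(1.4f, 0.05f);               // assumed complex refractive index

        stim::cpu_scalar_mie_scatter(&E[0], N, &x[0], NULL, &z[0], w, a, n);  // scattered field (evaluated where r >= a)
        stim::cpu_scalar_mie_internal(&E[0], N, &x[0], NULL, &z[0], w, a, n); // internal field (evaluated where r < a)
        return 0;
    }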