Commit 9339fbad873457047446ee3a90f52296eda250a5

Authored by David Mayerich
1 parent 308a743c

implementing mie scattering

stim/cuda/cudatools/callable.h
... ... @@ -2,7 +2,7 @@
2 2  
3 3 //define the CUDA_CALLABLE macro (will prefix all members)
4 4 #ifdef __CUDACC__
5   -#define CUDA_CALLABLE __host__ __device__
  5 +#define CUDA_CALLABLE __host__ __device__ inline
6 6 #else
7 7 #define CUDA_CALLABLE
8 8 #endif
... ...
stim/image/image.h
... ... @@ -58,12 +58,12 @@ class image{
58 58  
59 59 int cv_type(){
60 60 if(std::is_same<T, unsigned char>::value) return CV_MAKETYPE(CV_8U, (int)C());
61   - if(std::is_same<T, char>::value) return CV_MAKETYPE(CV_8S, (int)C());
62   - if(std::is_same<T, unsigned short>::value) return CV_MAKETYPE(CV_16U, (int)C());
63   - if(std::is_same<T, short>::value) return CV_MAKETYPE(CV_16S, (int)C());
64   - if(std::is_same<T, int>::value) return CV_MAKETYPE(CV_32S, (int)C());
65   - if(std::is_same<T, float>::value) return CV_MAKETYPE(CV_32F, (int)C());
66   - if(std::is_same<T, double>::value) return CV_MAKETYPE(CV_64F, (int)C());
  61 + else if(std::is_same<T, char>::value) return CV_MAKETYPE(CV_8S, (int)C());
  62 + else if(std::is_same<T, unsigned short>::value) return CV_MAKETYPE(CV_16U, (int)C());
  63 + else if(std::is_same<T, short>::value) return CV_MAKETYPE(CV_16S, (int)C());
  64 + else if(std::is_same<T, int>::value) return CV_MAKETYPE(CV_32S, (int)C());
  65 + else if(std::is_same<T, float>::value) return CV_MAKETYPE(CV_32F, (int)C());
  66 + else if(std::is_same<T, double>::value) return CV_MAKETYPE(CV_64F, (int)C());
67 67  
68 68 std::cout<<"ERROR in stim::image::cv_type - no valid data type found"<<std::endl;
69 69 exit(1);
... ... @@ -72,12 +72,12 @@ class image{
72 72 /// Returns the value for "white" based on the dynamic range (assumes white is 1.0 for floating point images)
73 73 T white(){
74 74 if(std::is_same<T, unsigned char>::value) return UCHAR_MAX;
75   - if(std::is_same<T, unsigned short>::value) return SHRT_MAX;
76   - if(std::is_same<T, unsigned>::value) return UINT_MAX;
77   - if(std::is_same<T, unsigned long>::value) return ULONG_MAX;
78   - if(std::is_same<T, unsigned long long>::value) return ULLONG_MAX;
79   - if(std::is_same<T, float>::value) return 1.0f;
80   - if(std::is_same<T, double>::value) return 1.0;
  75 + else if(std::is_same<T, unsigned short>::value) return SHRT_MAX;
  76 + else if(std::is_same<T, unsigned>::value) return UINT_MAX;
  77 + else if(std::is_same<T, unsigned long>::value) return ULONG_MAX;
  78 + else if(std::is_same<T, unsigned long long>::value) return ULLONG_MAX;
  79 + else if(std::is_same<T, float>::value) return 1.0f;
  80 + else if(std::is_same<T, double>::value) return 1.0;
81 81  
82 82 std::cout<<"ERROR in stim::image::white - no white value known for this data type"<<std::endl;
83 83  
... ... @@ -120,14 +120,6 @@ public:
120 120 free(img);
121 121 }
122 122  
123   - /*stim::image<T> operator=(const stim::image<T>& I){
124   - if(&I == this) //handle self-assignment
125   - return *this;
126   - allocate(I.X(), I.Y(), I.C());
127   - memcpy(img, I.img, bytes());
128   - return *this;
129   - }*/
130   -
131 123 stim::image<T>& operator=(const stim::image<T>& I){
132 124 init();
133 125 if(&I == this) //handle self-assignment
... ...
stim/math/bessel.h
... ... @@ -1258,7 +1258,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1258 1258 P a0,v0,pv0,pv1,vl,ga,gb,vg,vv,w0,w1,ya0,yak,ya1,wa;
1259 1259 int j,n,k,kz,l,lb,lb0,m;
1260 1260  
1261   - a0 = abs(z);
  1261 + a0 = ::abs(z);
1262 1262 z1 = z;
1263 1263 z2 = z*z;
1264 1264 n = (int)v;
... ... @@ -1286,7 +1286,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1286 1286 vm = v;
1287 1287 return 0;
1288 1288 }
1289   - if (real(z1) < 0.0) z1 = -z;
  1289 + if (::real(z1) < 0.0) z1 = -z;
1290 1290 if (a0 <= 12.0) {
1291 1291 for (l=0;l<2;l++) {
1292 1292 vl = v0+l;
... ... @@ -1295,7 +1295,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1295 1295 for (k=1;k<=40;k++) {
1296 1296 cr *= -0.25*z2/(k*(k+vl));
1297 1297 cjvl += cr;
1298   - if (abs(cr) < abs(cjvl)*eps) break;
  1298 + if (::abs(cr) < ::abs(cjvl)*eps) break;
1299 1299 }
1300 1300 vg = 1.0 + vl;
1301 1301 ga = gamma(vg);
... ... @@ -1348,7 +1348,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1348 1348 for (k=1;k<=40;k++) {
1349 1349 cr *= -0.25*z2/(k*(k-vl));
1350 1350 cjvl += cr;
1351   - if (abs(cr) < abs(cjvl)*eps) break;
  1351 + if (::abs(cr) < ::abs(cjvl)*eps) break;
1352 1352 }
1353 1353 vg = 1.0-vl;
1354 1354 gb = gamma(vg);
... ... @@ -1381,16 +1381,16 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1381 1381 cyv1 = M_2_PI*(cec*cjv1-1.0/z1-0.25*z1*cs1);
1382 1382 }
1383 1383 }
1384   - if (real(z) < 0.0) {
  1384 + if (::real(z) < 0.0) {
1385 1385 cfac0 = exp(pv0*cii);
1386 1386 cfac1 = exp(pv1*cii);
1387   - if (imag(z) < 0.0) {
  1387 + if (::imag(z) < 0.0) {
1388 1388 cyv0 = cfac0*cyv0-(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
1389 1389 cyv1 = cfac1*cyv1-(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
1390 1390 cjv0 /= cfac0;
1391 1391 cjv1 /= cfac1;
1392 1392 }
1393   - else if (imag(z) > 0.0) {
  1393 + else if (::imag(z) > 0.0) {
1394 1394 cyv0 = cyv0/cfac0+(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
1395 1395 cyv1 = cyv1/cfac1+(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
1396 1396 cjv0 *= cfac0;
... ... @@ -1421,7 +1421,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1421 1421 cf2 = cf1;
1422 1422 cf1 = cf;
1423 1423 }
1424   - if (abs(cjv0) > abs(cjv1)) cs = cjv0/cf;
  1424 + if (::abs(cjv0) > ::abs(cjv1)) cs = cjv0/cf;
1425 1425 else cs = cjv1/cf2;
1426 1426 for (k=0;k<=n;k++) {
1427 1427 cjv[k] *= cs;
... ... @@ -1433,21 +1433,21 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1433 1433 }
1434 1434 cyv[0] = cyv0;
1435 1435 cyv[1] = cyv1;
1436   - ya0 = abs(cyv0);
  1436 + ya0 = ::abs(cyv0);
1437 1437 lb = 0;
1438 1438 cg0 = cyv0;
1439 1439 cg1 = cyv1;
1440 1440 for (k=2;k<=n;k++) {
1441 1441 cyk = 2.0*(v0+k-1.0)*cg1/z-cg0;
1442   - yak = abs(cyk);
1443   - ya1 = abs(cg0);
  1442 + yak = ::abs(cyk);
  1443 + ya1 = ::abs(cg0);
1444 1444 if ((yak < ya0) && (yak< ya1)) lb = k;
1445 1445 cyv[k] = cyk;
1446 1446 cg0 = cg1;
1447 1447 cg1 = cyk;
1448 1448 }
1449 1449 lb0 = 0;
1450   - if ((lb > 4) && (imag(z) != 0.0)) {
  1450 + if ((lb > 4) && (::imag(z) != 0.0)) {
1451 1451 while(lb != lb0) {
1452 1452 ch2 = cone;
1453 1453 ch1 = czero;
... ... @@ -1470,7 +1470,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1470 1470 cp21 = ch2;
1471 1471 if (lb == n)
1472 1472 cjv[lb+1] = 2.0*(lb+v0)*cjv[lb]/z-cjv[lb-1];
1473   - if (abs(cjv[0]) > abs(cjv[1])) {
  1473 + if (::abs(cjv[0]) > ::abs(cjv[1])) {
1474 1474 cyv[lb+1] = (cjv[lb+1]*cyv0-2.0*cp11/(M_PI*z))/cjv[0];
1475 1475 cyv[lb] = (cjv[lb]*cyv0+2.0*cp12/(M_PI*z))/cjv[0];
1476 1476 }
... ... @@ -1495,8 +1495,8 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1495 1495 cyl2 = cylk;
1496 1496 }
1497 1497 for (k=2;k<=n;k++) {
1498   - wa = abs(cyv[k]);
1499   - if (wa < abs(cyv[k-1])) lb = k;
  1498 + wa = ::abs(cyv[k]);
  1499 + if (wa < ::abs(cyv[k-1])) lb = k;
1500 1500 }
1501 1501 }
1502 1502 }
... ... @@ -1515,12 +1515,18 @@ int cbessjyva_sph(int v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1515 1515 //first, compute the bessel functions of fractional order
1516 1516 cbessjyva<P>(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
1517 1517  
  1518 + if(z == 0){ //handle degenerate case of z = 0
  1519 + memset(cjv, 0, sizeof(P) * (v+1));
  1520 + cjv[0] = 1;
  1521 + }
  1522 +
1518 1523 //iterate through each and scale
1519 1524 for(int n = 0; n<=v; n++)
1520 1525 {
1521   -
1522   - cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
1523   - cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
  1526 + if(z != 0){ //handle degenerate case of z = 0
  1527 + cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
  1528 + cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
  1529 + }
1524 1530  
1525 1531 cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0));
1526 1532 cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0));
... ...
stim/math/constants.h
1 1 #ifndef STIM_CONSTANTS_H
2 2 #define STIM_CONSTANTS_H
3 3  
  4 +#include "stim/cuda/cudatools/callable.h"
4 5 namespace stim{
5 6 const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862;
6 7 const double TAU = 2 * stim::PI;
... ...
stim/math/matrix.h
... ... @@ -55,7 +55,7 @@ struct matrix
55 55 vec<Y> operator*(vec<Y> rhs){
56 56 unsigned int N = rhs.size();
57 57  
58   - vec3Y> result;
  58 + vec<Y> result;
59 59 result.resize(N);
60 60  
61 61 for(int r=0; r<N; r++)
... ...
stim/math/quaternion.h
... ... @@ -43,6 +43,8 @@ public:
43 43  
44 44 CUDA_CALLABLE void CreateRotation(vec3<T> from, vec3<T> to){
45 45  
  46 + from = from.norm();
  47 + to = to.norm();
46 48 vec3<T> r = from.cross(to); //compute the rotation vector
47 49 T theta = asin(r.len()); //compute the angle of the rotation about r
48 50 //deal with a zero vector (both k and kn point in the same direction)
... ...
stim/optics/mie.h 0 → 100644
  1 +#ifndef STIM_MIE_H
  2 +#define STIM_MIE_H
  3 +
  4 +#include "scalarwave.h"
  5 +#include "../math/bessel.h"
  6 +#include <cmath>
  7 +
  8 +namespace stim{
  9 +
  10 +
  11 +/// Calculate the scattering coefficients for a spherical scatterer
  12 +template<typename T>
  13 +void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){
  14 +
  15 + //temporary variables
  16 + double vm; //allocate space to store the return values for the bessel function calculation
  17 + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
  18 + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
  19 + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
  20 + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
  21 +
  22 + stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  23 + stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  24 + stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  25 + stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  26 +
  27 + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions)
  28 + stim::complex<double> kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives)
  29 +
  30 + stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a
  31 + stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a
  32 +
  33 + stim::complex<double> h_ka, dh_ka;
  34 + stim::complex<double> numerator, denominator;
  35 + stim::complex<double> i(0, 1);
  36 + for(size_t l = 0; l <= Nl; l++){
  37 + h_ka.r = j_ka[l];
  38 + h_ka.i = y_ka[l];
  39 + dh_ka.r = dj_ka[l];
  40 + dh_ka.i = dy_ka[l];
  41 +
  42 + numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l];
  43 + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
  44 + B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
  45 + std::cout<<B[l]<<std::endl;
  46 + }
  47 +}
  48 +
  49 +template<typename T>
  50 +void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){
  51 + //temporary variables
  52 + double vm; //allocate space to store the return values for the bessel function calculation
  53 + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
  54 + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
  55 + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
  56 + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
  57 +
  58 + stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  59 + stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  60 + stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  61 + stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  62 +
  63 + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions)
  64 + stim::complex<double> kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives)
  65 +
  66 + stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a
  67 + stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a
  68 +
  69 + stim::complex<double> h_ka, dh_ka;
  70 + stim::complex<double> numerator, denominator;
  71 + stim::complex<double> i(0, 1);
  72 + for(size_t l = 0; l <= Nl; l++){
  73 + h_ka.r = j_ka[l];
  74 + h_ka.i = y_ka[l];
  75 + dh_ka.r = dj_ka[l];
  76 + dh_ka.i = dy_ka[l];
  77 +
  78 + numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka;
  79 + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
  80 + A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
  81 + }
  82 +}
  83 +
  84 +template<typename T>
  85 +__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, int Nl){
  86 + extern __shared__ stim::scalarwave<T> shared_W[]; //declare the list of waves in shared memory
  87 +
  88 + stim::cuda::sharedMemcpy(shared_W, W, nW, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access
  89 + __syncthreads(); //synchronize threads to insure all data is copied
  90 +
  91 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  92 + if(i >= N) return; //exit if this thread is outside the array
  93 + stim::vec3<T> p;
  94 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  95 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  96 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  97 +
  98 + T r = p.len(); //calculate the distance from the sphere
  99 + T k = W[0].kmag();
  100 + if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field)
  101 +
  102 + size_t NC = Nl + 1; //calculate the number of coefficients to be used
  103 + T kr = r * k; //calculate the thread value for k*r
  104 + T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT
  105 + size_t ij = (size_t) fij; //convert to an integral index
  106 + T alpha = fij - ij; //calculate the fractional portion of the index
  107 + size_t n0j = ij * (NC); //start of the first entry in the LUT
  108 + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT
  109 +
  110 + T cos_phi;
  111 + T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials
  112 + T Pl = 1; //initialize the current value for the Legendre polynomial
  113 + T jl;
  114 + stim::complex<T> Ei = 0; //create a register to store the result
  115 + int l;
  116 + for(size_t w = 0; w < nW; w++){
  117 + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere
  118 + for(l = 0; l <= Nl; l++){
  119 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  120 + Pl_1 = Pl;
  121 + if(l == 0){ //computing Pl is done recursively, where the recursive relation
  122 + Pl = cos_phi; // requires the first two orders. This defines the second.
  123 + }
  124 + else{ //if this is not the first iteration, use the recursive relation to calculate Pl
  125 + Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
  126 + }
  127 +
  128 + jl = lerp<T>( j[n0j + l], j[n1j + l], alpha ); //read jl from the LUT and interpolate the result
  129 + Ei += W[w].E() * B[l] * jl * Pl;
  130 + }
  131 + //Ei += shared_W[w].pos(p); //evaluate the plane wave
  132 + }
  133 + E[i] += Ei; //copy the result to device memory
  134 +}
  135 +
  136 +template<typename T>
  137 +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, size_t Nl){
  138 +
  139 + size_t wave_bytes = sizeof(stim::scalarwave<T>);
  140 + size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available
  141 + size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array
  142 + size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory
  143 + size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves
  144 + size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required
  145 +
  146 + stim::scalarwave<T>* batch_W;
  147 + HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves
  148 +
  149 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  150 + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  151 +
  152 + size_t batch_size; //declare a variable to store the size of the current batch
  153 + size_t waves_processed = 0; //initialize the number of waves processed to zero
  154 + while(waves_processed < nW){ //while there are still waves to be processed
  155 + batch_size = min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left
  156 + batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
  157 + HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory
  158 + cuda_scalar_mie_scatter<T><<< blocks, threads, batch_bytes >>>(E, N, x, y, z, batch_W, batch_size, a, n, B, j, kr_min, dkr, (int)Nl); //call the kernel
  159 + waves_processed += batch_size; //increment the counter indicating how many waves have been processed
  160 + }
  161 + cudaFree(batch_W);
  162 +}
  163 +/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
  164 +
  165 +/// @param E is a pointer to the destination field values
  166 +/// @param N is the number of points used to calculate the field
  167 +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
  168 +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
  169 +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
  170 +/// @param W is an array of planewaves that will be scattered
  171 +/// @param a is the radius of the sphere
  172 +/// @param n is the complex refractive index of the sphere
  173 +template<typename T>
  174 +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n){
  175 + //calculate the necessary number of orders required to represent the scattered field
  176 + T k = W[0].kmag();
  177 +
  178 + size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
  179 +
  180 + //calculate the scattering coefficients for the sphere
  181 + stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
  182 + B_coefficients(B, a, k, n, Nl);
  183 +
  184 +#ifdef __CUDACC__
  185 + stim::complex<T>* dev_E; //allocate space for the field
  186 + cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
  187 + cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
  188 + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
  189 +
  190 + // COORDINATES
  191 + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified)
  192 + if(x != NULL){
  193 + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
  194 + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
  195 + }
  196 + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified)
  197 + if(y != NULL){
  198 + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
  199 + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
  200 + }
  201 + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified)
  202 + if(z != NULL){
  203 + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
  204 + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
  205 + }
  206 +
  207 + // PLANE WAVES
  208 + stim::scalarwave<T>* dev_W; //allocate space and copy plane waves
  209 + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
  210 + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
  211 +
  212 + // SCATTERING COEFFICIENTS
  213 + stim::complex<T>* dev_B;
  214 + HANDLE_ERROR( cudaMalloc(&dev_B, sizeof(stim::complex<T>) * (Nl+1)) );
  215 + HANDLE_ERROR( cudaMemcpy(dev_B, B, sizeof(stim::complex<T>) * (Nl+1), cudaMemcpyHostToDevice) );
  216 +
  217 + // BESSEL FUNCTION LOOK-UP TABLE
  218 + //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r
  219 + size_t Nlut_j = 1024;
  220 + T r_min = 0;
  221 + T r_max = 10;
  222 +
  223 + T kr_min = k * r_min;
  224 + T kr_max = k * r_max;
  225 +
  226 + //temporary variables
  227 + double vm; //allocate space to store the return values for the bessel function calculation
  228 + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
  229 + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
  230 + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
  231 + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
  232 +
  233 + size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
  234 + T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table
  235 + T dkr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT
  236 + std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl;
  237 + for(size_t kri = 0; kri < Nlut_j; kri++){ //for each value in the LUT
  238 + stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
  239 + for(size_t l = 0; l <= Nl; l++){ //for each order
  240 + bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result
  241 + }
  242 + }
  243 +
  244 + stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
  245 +
  246 + //Allocate device memory and copy everything to the GPU
  247 + T* dev_j_lut;
  248 + HANDLE_ERROR( cudaMalloc(&dev_j_lut, lutj_bytes) );
  249 + HANDLE_ERROR( cudaMemcpy(dev_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) );
  250 +
  251 + gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_B, dev_j_lut, kr_min, dkr, Nl);
  252 +
  253 + cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
  254 +
  255 + if(x != NULL) cudaFree(dev_x); //free everything
  256 + if(y != NULL) cudaFree(dev_y);
  257 + if(z != NULL) cudaFree(dev_z);
  258 + cudaFree(dev_E);
  259 +#else
  260 +
  261 +
  262 + //allocate space to store the bessel function call results
  263 + double vm;
  264 + double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
  265 + double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
  266 + double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
  267 + double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
  268 +
  269 + T* P = (T*) malloc( (Nl + 1) * sizeof(T) );
  270 +
  271 + T r, kr, cos_phi;
  272 + stim::complex<T> h;
  273 + for(size_t i = 0; i < N; i++){
  274 + stim::vec3<T> p; //declare a 3D point
  275 +
  276 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  277 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  278 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  279 + r = p.len();
  280 + if(r >= a){
  281 + for(size_t w = 0; w < W.size(); w++){
  282 + kr = p.len() * W[w].kmag(); //calculate k*r
  283 + stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr);
  284 + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction
  285 + stim::legendre<T>(Nl, cos_phi, P);
  286 +
  287 + for(size_t l = 0; l <= Nl; l++){
  288 + h.r = j_kr[l];
  289 + h.i = y_kr[l];
  290 + E[i] += W[w].E() * B[l] * h * P[l];
  291 + }
  292 + }
  293 + }
  294 + }
  295 +#endif
  296 +}
  297 +
  298 +template<typename T>
  299 +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
  300 + std::vector< stim::scalarwave<T> > W(1, w);
  301 + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n);
  302 +}
  303 +
  304 +/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere
  305 +
  306 +/// @param E is a pointer to the destination field values
  307 +/// @param N is the number of points used to calculate the field
  308 +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
  309 +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
  310 +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
  311 +/// @param w is a planewave that will be scattered
  312 +/// @param a is the radius of the sphere
  313 +/// @param n is the complex refractive index of the sphere
  314 +template<typename T>
  315 +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n){
  316 +
  317 + //calculate the necessary number of orders required to represent the scattered field
  318 + T k = W[0].kmag();
  319 +
  320 + size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
  321 +
  322 + //calculate the scattering coefficients for the sphere
  323 + stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
  324 + A_coefficients(A, a, k, n, Nl);
  325 +
  326 + //allocate space to store the bessel function call results
  327 + double vm;
  328 + stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  329 + stim::complex<double>* y_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  330 + stim::complex<double>* dj_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  331 + stim::complex<double>* dy_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  332 +
  333 + T* P = (T*) malloc( (Nl + 1) * sizeof(T) );
  334 +
  335 + T r, cos_phi;
  336 + stim::complex<double> knr;
  337 + stim::complex<T> h;
  338 + for(size_t i = 0; i < N; i++){
  339 + stim::vec3<T> p; //declare a 3D point
  340 +
  341 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  342 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  343 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  344 + r = p.len();
  345 + if(r < a){
  346 + E[i] = 0;
  347 + for(size_t w = 0; w < W.size(); w++){
  348 + knr = (stim::complex<double>)n * p.len() * W[w].kmag(); //calculate k*n*r
  349 +
  350 + stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr);
  351 + if(r == 0)
  352 + cos_phi = 0;
  353 + else
  354 + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction
  355 + stim::legendre<T>(Nl, cos_phi, P);
  356 +
  357 + for(size_t l = 0; l <= Nl; l++){
  358 + E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l];
  359 + }
  360 + }
  361 + }
  362 + }
  363 +}
  364 +
  365 +template<typename T>
  366 +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
  367 + std::vector< stim::scalarwave<T> > W(1, w);
  368 + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n);
  369 +}
  370 +
  371 +}
  372 +
  373 +#endif
0 374 \ No newline at end of file
... ...
stim/optics/scalarbeam.h
... ... @@ -5,7 +5,12 @@
5 5 #include "../optics/scalarwave.h"
6 6 #include "../math/bessel.h"
7 7 #include "../math/legendre.h"
  8 +#include "../cuda/cudatools/devices.h"
  9 +#include "../cuda/cudatools/timer.h"
  10 +#include <cublas_v2.h>
  11 +#include <math_constants.h>
8 12 #include <vector>
  13 +#include <stdlib.h>
9 14  
10 15 namespace stim{
11 16  
... ... @@ -105,10 +110,11 @@ public:
105 110 std::vector< scalarwave<T> > samples(N); //create a vector of plane waves
106 111 T kmag = (T)stim::TAU / lambda; //calculate the wavenumber
107 112 stim::complex<T> apw; //allocate space for the amplitude at the focal point
  113 + T a = stim::TAU * (1 - cos(asin(NA[0]))) / (double)N;
108 114 stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction
109 115 for(size_t i=0; i<N; i++){ //for each sample
110 116 kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave
111   - apw = exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave
  117 + apw = a * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave
112 118 samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction
113 119 }
114 120  
... ... @@ -148,7 +154,7 @@ public:
148 154 /// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional)
149 155 /// @param C is a pointer to Nl + 1 values where the terms will be stored
150 156 template<typename T>
151   -CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){
  157 +CUDA_CALLABLE void cpu_aperture_integral(T* C, int Nl, T NA, T NA_in = 0){
152 158  
153 159 size_t table_bytes = (Nl + 1) * sizeof(T); //calculate the number of bytes required to store the terms
154 160 T cos_alpha_1 = cos(asin(NA_in)); //calculate the cosine of the angle subtended by the central obscuration
... ... @@ -182,23 +188,151 @@ CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){
182 188  
183 189 /// performs linear interpolation into a look-up table
184 190 template<typename T>
185   -T lut_lookup(T* lut, T val, size_t N, T min_val, T delta, size_t stride = 0){
186   - size_t idx = (size_t)((val - min_val) / delta);
187   - T alpha = val - idx * delta + min_val;
  191 +CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val, T delta, size_t n_vals){
  192 + T idx = ((val - min_val) / delta);
  193 + size_t i = (size_t) idx;
  194 + T a1 = idx - i;
  195 + T a0 = 1 - a1;
  196 + size_t n0 = i * n_vals;
  197 + size_t n1 = (i+1) * n_vals;
  198 + for(size_t n = 0; n < n_vals; n++){
  199 + lut_values[n] = lut[n0 + n] * a0 + lut[n1 + n] * a1;
  200 + }
  201 +}
188 202  
189   - if(alpha == 0) return lut[idx];
190   - else return lut[idx * stride] * (1 - alpha) + lut[ (idx+1) * stride] * alpha;
  203 +template <typename T>
  204 +CUDA_CALLABLE T lerp(T v0, T v1, T t) {
  205 + return fma(t, v1, fma(-t, v0, v0));
191 206 }
192 207  
  208 +#ifdef __CUDACC__
193 209 template<typename T>
194   -void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){
195   - T k = stim::TAU / lambda;
  210 +__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T k, T A, size_t Nl,
  211 + T* C,
  212 + T* lut_j, size_t Nj, T min_kr, T dkr){
  213 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  214 + if(i >= N) return; //exit if this thread is outside the array
  215 +
  216 + T cos_phi = cos(phi[i]); //calculate the thread value for cos(phi)
  217 + T kr = r[i] * k; //calculate the thread value for k*r
  218 + stim::complex<T> Ei = 0; //initialize the value of the field to zero
  219 + size_t NC = Nl + 1; //calculate the number of coefficients to be used
  220 +
  221 + T fij = (kr - min_kr)/dkr; //FP index into the spherical bessel LUT
  222 + size_t ij = (size_t) fij; //convert to an integral index
  223 + T a = fij - ij; //calculate the fractional portion of the index
  224 + size_t n0j = ij * (NC); //start of the first entry in the LUT
  225 + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT
  226 +
  227 + T jl; //declare register to store the spherical bessel function
  228 + T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials
  229 + T Pl = 1; //initialize the current value for the Legendre polynomial
  230 + stim::complex<T> im(0, 1); //declare i (imaginary 1)
  231 + stim::complex<T> i_pow(1, 0); //i_pow stores the current value of i^l so it doesn't have to be re-computed every iteration
  232 + for(int l = 0; l <= Nl; l++){ //for each order
  233 + jl = lerp<T>( lut_j[n0j + l], lut_j[n1j + l], a ); //read jl from the LUT and interpolate the result
  234 + Ei += i_pow * jl * Pl * C[l]; //calculate the value for the field and sum
  235 + i_pow *= im; //multiply i^l * i for the next iteration
  236 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  237 + Pl_1 = Pl;
  238 + if(l == 0){ //computing Pl is done recursively, where the recursive relation
  239 + Pl = cos_phi; // requires the first two orders. This defines the second.
  240 + }
  241 + else{ //if this is not the first iteration, use the recursive relation to calculate Pl
  242 + Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
  243 + }
  244 +
  245 + }
  246 + E[i] = Ei * A * 2 * CUDART_PI_F; //scale the integral by the amplitude
  247 +}
  248 +
  249 +template<typename T>
  250 +void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl, T r_spacing){
  251 +
  252 + //Find the minimum and maximum values of r
  253 + cublasStatus_t stat;
  254 + cublasHandle_t handle;
  255 +
  256 + stat = cublasCreate(&handle); //create a cuBLAS handle
  257 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  258 + printf ("CUBLAS initialization failed\n");
  259 + exit(1);
  260 + }
  261 +
  262 + int i_min, i_max;
  263 + stat = cublasIsamin(handle, (int)N, r, 1, &i_min);
  264 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  265 + printf ("CUBLAS Error: failed to calculate minimum r value.\n");
  266 + exit(1);
  267 + }
  268 + stat = cublasIsamax(handle, (int)N, r, 1, &i_max);
  269 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  270 + printf ("CUBLAS Error: failed to calculate maximum r value.\n");
  271 + exit(1);
  272 + }
  273 +
  274 + T r_min, r_max; //allocate space to store the minimum and maximum values
  275 + HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
  276 + HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
  277 +
  278 + T k = (T)stim::TAU / lambda; //calculate the wavenumber from lambda
  279 + size_t C_bytes = (Nl + 1) * sizeof(T);
  280 + T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms
  281 + cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms
  282 +
  283 + size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r
  284 +
  285 + T kr_min = k * r_min;
  286 + T kr_max = k * r_max;
  287 +
  288 + //temporary variables
  289 + double vm; //allocate space to store the return values for the bessel function calculation
  290 + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
  291 + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
  292 + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
  293 + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
  294 +
  295 + size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
  296 + T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table
  297 + T delta_kr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT
  298 + std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl;
  299 + for(size_t kri = 0; kri < Nlut_j; kri++){ //for each value in the LUT
  300 + stim::bessjyv_sph<double>(Nl, kr_min + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
  301 + for(size_t l = 0; l <= Nl; l++){ //for each order
  302 + bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result
  303 + }
  304 + }
  305 +
  306 + stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
  307 +
  308 + //Allocate device memory and copy everything to the GPU
  309 +
  310 + T* gpu_C;
  311 + HANDLE_ERROR( cudaMalloc(&gpu_C, C_bytes) );
  312 + HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) );
  313 + T* gpu_j_lut;
  314 + HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) );
  315 + HANDLE_ERROR( cudaMemcpy(gpu_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) );
  316 +
  317 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  318 + dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks
196 319  
197   - T* C = (T*) malloc( (Nl + 1) * sizeof(T) ); //allocate space for the aperture integral terms
  320 + cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, (T)stim::TAU/lambda, A, Nl, gpu_C, gpu_j_lut, Nlut_j, kr_min, delta_kr);
  321 +
  322 + //free the LUT and condenser tables
  323 + HANDLE_ERROR( cudaFree(gpu_C) );
  324 + HANDLE_ERROR( cudaFree(gpu_j_lut) );
  325 +}
  326 +#endif
  327 +
  328 +/// Calculate the analytical solution to a scalar point spread function given a set of spherical coordinates about the PSF (beam propagation along phi = theta = 0)
  329 +template<typename T>
  330 +void cpu_scalar_psf_local(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl){
  331 + T k = (T)stim::TAU / lambda;
  332 + size_t C_bytes = (Nl + 1) * sizeof(T);
  333 + T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms
198 334 cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms
199 335 memset(F, 0, N * sizeof(stim::complex<T>));
200   -#ifdef NO_CUDA
201   - memset(F, 0, N * sizeof(stim::complex<T>));
202 336 T jl, Pl, kr, cos_phi;
203 337  
204 338 double vm;
... ... @@ -225,71 +359,117 @@ void cpu_scalar_psf(stim::complex&lt;T&gt;* F, size_t N, T* r, T* phi, T lambda, T A,
225 359  
226 360 free(C);
227 361 free(Pl_cos_phi);
228   -#else
229   - T min_r = r[0];
230   - T max_r = r[0];
231   - for(size_t i = 0; i < N; i++){ //find the minimum and maximum values of r (min and max distance from the focal point)
232   - if(r[i] < min_r) min_r = r[i];
233   - if(r[i] > max_r) max_r = r[i];
234   - }
235   - T min_kr = k * min_r;
236   - T max_kr = k * max_r;
  362 +}
237 363  
238   - //temporary variables
239   - double vm;
240   - double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
241   - double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
242   - double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
243   - double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
  364 +/// Converts a set of cartesian points into spherical coordinates surrounding a point spread function (PSF)
  365 +/// @param r is the output distance from the PSF
  366 +/// @param phi is the non-symmetric direction about the PSF
  367 +/// @param x (x, y, z) are the cartesian coordinates in world space
  368 +/// @f is the focal point of the PSF in cartesian coordinates
  369 +/// @d is the propagation direction of the PSF in cartesian coordinates
  370 +template<typename T>
  371 +__global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::vec3<T> f, stim::quaternion<T> q){
244 372  
245   - size_t Nlut = (size_t)sqrt(N) * 2;
246   - T* bessel_lut = (T*) malloc(sizeof(T) * (Nl+1) * Nlut);
247   - T delta_kr = (max_kr - min_kr) / (Nlut-1);
248   - for(size_t kri = 0; kri < Nlut; kri++){
249   - stim::bessjyv_sph<double>(Nl, min_kr + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
250   - for(size_t l = 0; l <= Nl; l++){
251   - bessel_lut[kri * (Nl + 1) + l] = (T)jv[l];
252   - }
253   - }
  373 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  374 + if(i >= N) return; //exit if this thread is outside the array
254 375  
255   - T* Pl_cos_phi = (T*) malloc((Nl + 1) * sizeof(T));
256   - T kr, cos_phi, jl, Pl;
257   - for(size_t n = 0; n < N; n++){ //for each point in the field
258   - kr = k * r[n]; //calculate kr (the optical distance between the focal point and p)
259   - cos_phi = std::cos(phi[n]); //calculate the cosine of phi
260   - stim::legendre<T>(Nl, cos_phi, Pl_cos_phi); //calculate the [0 Nl] legendre polynomials for this point
  376 + stim::vec3<T> p; //declare a 3D point
  377 +
  378 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  379 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  380 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
261 381  
262   - for(int l = 0; l <= Nl; l++){
263   - jl = lut_lookup<T>(&bessel_lut[l], kr, Nlut, min_kr, delta_kr, Nl+1);
264   - Pl = Pl_cos_phi[l];
265   - F[n] += pow(complex<T>(0, 1), l) * jl * Pl * C[l];
266   - }
267   - F[n] *= A * stim::TAU;
268   - }
269   -#endif
  382 + p = p - f; //shift the point to the center of the PSF (focal point)
  383 + p = q.toMatrix3() * p; //rotate the point to align with the propagation direction
  384 +
  385 + stim::vec3<T> ps = p.cart2sph(); //convert from cartesian to spherical coordinates
  386 + r[i] = ps[0]; //store r
  387 + phi[i] = ps[2]; //phi = [0 pi]
270 388 }
271 389  
  390 +#ifdef __CUDACC__
  391 +/// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates
  392 +template<typename T>
  393 +void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
  394 +
  395 + T* gpu_r; //allocate space for the coordinates in r
  396 + HANDLE_ERROR( cudaMalloc(&gpu_r, sizeof(T) * N) );
  397 + T* gpu_phi;
  398 + HANDLE_ERROR( cudaMalloc(&gpu_phi, sizeof(T) * N) );
  399 + //stim::complex<T>* gpu_E;
  400 + //HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) );
  401 +
  402 + stim::quaternion<T> q; //create a quaternion
  403 + q.CreateRotation(d, stim::vec3<T>(0, 0, 1)); //create a mapping from the propagation direction to the PSF space
  404 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  405 + dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  406 + cuda_cart2psf<T> <<< blocks, threads >>> (gpu_r, gpu_phi, N, x, y, z, f, q); //call the CUDA kernel to move the cartesian coordinates to PSF space
  407 +
  408 + gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing);
  409 +
  410 +}
  411 +#endif
272 412  
273 413 template<typename T>
274   -void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){
  414 +void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
  415 +
  416 +// If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel
  417 +#ifdef __CUDACC__
  418 +
  419 + T* gpu_x = NULL;
  420 + if(x != NULL){
  421 + HANDLE_ERROR( cudaMalloc(&gpu_x, sizeof(T) * N) );
  422 + HANDLE_ERROR( cudaMemcpy(gpu_x, x, sizeof(T) * N, cudaMemcpyHostToDevice) );
  423 + }
  424 + T* gpu_y = NULL;
  425 + if(y != NULL){
  426 + HANDLE_ERROR( cudaMalloc(&gpu_y, sizeof(T) * N) );
  427 + HANDLE_ERROR( cudaMemcpy(gpu_y, y, sizeof(T) * N, cudaMemcpyHostToDevice) );
  428 + }
  429 + T* gpu_z = NULL;
  430 + if(z != NULL){
  431 + HANDLE_ERROR( cudaMalloc(&gpu_z, sizeof(T) * N) );
  432 + HANDLE_ERROR( cudaMemcpy(gpu_z, z, sizeof(T) * N, cudaMemcpyHostToDevice) );
  433 + }
  434 +
  435 + stim::complex<T>* gpu_E;
  436 + HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) );
  437 + HANDLE_ERROR( cudaMemcpy(gpu_E, E, sizeof(stim::complex<T>) * N, cudaMemcpyHostToDevice) );
  438 + gpu_scalar_psf_cart<T>(gpu_E, N, gpu_x, gpu_y, gpu_z, lambda, A, f, d, NA, NA_in, Nl, r_spacing);
  439 + HANDLE_ERROR( cudaMemcpy(E, gpu_E, sizeof(stim::complex<T>) * N, cudaMemcpyDeviceToHost) );
  440 +
  441 + HANDLE_ERROR( cudaFree(gpu_x) );
  442 + HANDLE_ERROR( cudaFree(gpu_y) );
  443 + HANDLE_ERROR( cudaFree(gpu_z) );
  444 + HANDLE_ERROR( cudaFree(gpu_E) );
  445 +
  446 +#else
275 447 T* r = (T*) malloc(N * sizeof(T)); //allocate space for p in spherical coordinates
276 448 T* phi = (T*) malloc(N * sizeof(T)); // only r and phi are necessary (the scalar PSF is symmetric about theta)
277 449  
278   - stim::vec3<T> p, ps;
  450 + stim::quaternion<T> q;
  451 + q.CreateRotation(d, stim::vec3<T>(0, 0, 1));
  452 + stim::matrix<T, 3> R = q.toMatrix3();
  453 + stim::vec3<T> p, ps, ds;
279 454 for(size_t i = 0; i < N; i++){
280 455 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
281 456 (y == NULL) ? p[1] = 0 : p[1] = y[i];
282 457 (z == NULL) ? p[2] = 0 : p[2] = z[i];
283 458  
  459 + p = p - f;
  460 +
  461 + p = R * p; //rotate the cartesian point
  462 +
284 463 ps = p.cart2sph(); //convert from cartesian to spherical coordinates
285 464 r[i] = ps[0]; //store r
286 465 phi[i] = ps[2]; //phi = [0 pi]
287 466 }
288 467  
289   - cpu_scalar_psf(F, N, r, phi, lambda, A, f, NA, NA_in, Nl); //call the spherical coordinate CPU function
  468 + cpu_scalar_psf_local(F, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function
290 469  
291 470 free(r);
292 471 free(phi);
  472 +#endif
293 473 }
294 474  
295 475 } //end namespace stim
... ...
stim/optics/scalarwave.h
... ... @@ -60,7 +60,7 @@ public:
60 60 return k.len();
61 61 }
62 62  
63   - CUDA_CALLABLE vec3< complex<T> > E(){
  63 + CUDA_CALLABLE complex<T> E(){
64 64 return E0;
65 65 }
66 66  
... ... @@ -235,6 +235,34 @@ void gpu_scalarwave(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scala
235 235 cuda_scalarwave<T><<< blocks, threads >>>(F, N, x, y, z, w); //call the kernel
236 236 }
237 237  
  238 +template<typename T>
  239 +void gpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW){
  240 +
  241 + size_t wave_bytes = sizeof(stim::scalarwave<T>);
  242 + size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available
  243 + size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array
  244 + size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory
  245 + size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves
  246 + size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required
  247 +
  248 + stim::scalarwave<T>* batch_W;
  249 + HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves
  250 +
  251 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  252 + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  253 +
  254 + size_t batch_size; //declare a variable to store the size of the current batch
  255 + size_t waves_processed = 0; //initialize the number of waves processed to zero
  256 + while(waves_processed < nW){ //while there are still waves to be processed
  257 + batch_size = min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left
  258 + batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
  259 + HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory
  260 + cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size); //call the kernel
  261 + waves_processed += batch_size; //increment the counter indicating how many waves have been processed
  262 + }
  263 + cudaFree(batch_W);
  264 +}
  265 +
238 266 /// Sums a series of coherent plane waves at a specified point
239 267 /// @param field is the output array of field values corresponding to each input point
240 268 /// @param x is an array of x coordinates for the field point
... ... @@ -245,24 +273,13 @@ void gpu_scalarwave(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scala
245 273 /// @param A is the list of amplitudes for each wave
246 274 /// @param S is the list of propagation directions for each wave
247 275 template<typename T>
248   -void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > w_array){
249   - size_t S = w_array.size(); //store the number of waves
250   -#ifdef NO_CUDA
251   - memset(F, 0, N * sizeof(stim::complex<T>));
252   - T px, py, pz;
253   - for(size_t i = 0; i < N; i++){ // for each element in the array
254   - (x == NULL) ? px = 0 : px = x[i]; // test for NULL values
255   - (y == NULL) ? py = 0 : py = y[i];
256   - (z == NULL) ? pz = 0 : pz = z[i];
257   -
258   - for(size_t s = 0; s < S; s++){
259   - F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point
260   - }
261   - }
262   -#else
  276 +void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W){
  277 + size_t S = W.size(); //store the number of waves
  278 +#ifdef __CUDACC__
263 279 stim::complex<T>* dev_F; //allocate space for the field
264 280 cudaMalloc(&dev_F, N * sizeof(stim::complex<T>));
265   - cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
  281 + cudaMemcpy(dev_F, F, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
  282 + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
266 283  
267 284 T* dev_x = NULL; //allocate space and copy the X coordinate (if specified)
268 285 if(x != NULL){
... ... @@ -282,28 +299,11 @@ void cpu_sum_scalarwaves(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, std::v
282 299 HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
283 300 }
284 301  
285   - size_t wave_bytes = sizeof(stim::scalarwave<T>);
286   - size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available
287   - size_t array_bytes = w_array.size() * wave_bytes; //calculate the maximum number of bytes required for the planewave array
288   - size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory
289   - size_t num_batches = w_array.size() / max_batch + 1; //calculate the number of batches required to process all plane waves
290   - size_t batch_bytes = min(w_array.size(), max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required
291   -
292   - stim::scalarwave<T>* dev_w;
293   - HANDLE_ERROR(cudaMalloc(&dev_w, batch_bytes)); //allocate memory for a single batch of plane waves
  302 + stim::scalarwave<T>* dev_W;
  303 + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
  304 + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
294 305  
295   - int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
296   - dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
297   -
298   - size_t batch_size; //declare a variable to store the size of the current batch
299   - size_t waves_processed = 0; //initialize the number of waves processed to zero
300   - while(waves_processed < w_array.size()){ //while there are still waves to be processed
301   - batch_size = min<size_t>(max_batch, w_array.size() - waves_processed); //process either a whole batch, or whatever is left
302   - batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
303   - HANDLE_ERROR(cudaMemcpy(dev_w, &w_array[waves_processed], batch_bytes, cudaMemcpyHostToDevice)); //copy the plane waves into global memory
304   - cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(dev_F, N, dev_x, dev_y, dev_z, dev_w, batch_size); //call the kernel
305   - waves_processed += batch_size; //increment the counter indicating how many waves have been processed
306   - }
  306 + gpu_scalarwaves(dev_F, N, dev_x, dev_y, dev_z, dev_W, W.size());
307 307  
308 308 cudaMemcpy(F, dev_F, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
309 309  
... ... @@ -311,15 +311,25 @@ void cpu_sum_scalarwaves(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, std::v
311 311 if(y != NULL) cudaFree(dev_y);
312 312 if(z != NULL) cudaFree(dev_z);
313 313 cudaFree(dev_F);
314   - cudaFree(dev_w);
  314 +#else
  315 + memset(F, 0, N * sizeof(stim::complex<T>));
  316 + T px, py, pz;
  317 + for(size_t i = 0; i < N; i++){ // for each element in the array
  318 + (x == NULL) ? px = 0 : px = x[i]; // test for NULL values
  319 + (y == NULL) ? py = 0 : py = y[i];
  320 + (z == NULL) ? pz = 0 : pz = z[i];
315 321  
  322 + for(size_t s = 0; s < S; s++){
  323 + F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point
  324 + }
  325 + }
316 326 #endif
317 327 }
318 328  
319 329 template<typename T>
320 330 void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
321 331 std::vector< stim::scalarwave<T> > w_array(1, w);
322   - cpu_sum_scalarwaves(F, N, x, y, z, w_array);
  332 + cpu_scalarwaves(F, N, x, y, z, w_array);
323 333 }
324 334  
325 335  
... ... @@ -331,7 +341,7 @@ void cpu_scalarwave(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scala
331 341 /// @param A is the list of amplitudes for each wave
332 342 /// @param S is the list of propagation directions for each wave
333 343 template<typename T>
334   -CUDA_CALLABLE stim::complex<T> sum_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave<T> > W){
  344 +CUDA_CALLABLE stim::complex<T> cpu_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave<T> > W){
335 345 size_t N = W.size(); //get the number of plane wave samples
336 346 stim::complex<T> field(0, 0); //initialize the field to zero (0)
337 347 stim::vec3<T> k; //allocate space for the direction vector
... ...