Commit 9339fbad873457047446ee3a90f52296eda250a5

Authored by David Mayerich
1 parent 308a743c

implementing Mie scattering

stim/cuda/cudatools/callable.h
@@ -2,7 +2,7 @@ @@ -2,7 +2,7 @@
2 2
3 //define the CUDA_CALLABLE macro (will prefix all members) 3 //define the CUDA_CALLABLE macro (will prefix all members)
4 #ifdef __CUDACC__ 4 #ifdef __CUDACC__
5 -#define CUDA_CALLABLE __host__ __device__ 5 +#define CUDA_CALLABLE __host__ __device__ inline
6 #else 6 #else
7 #define CUDA_CALLABLE 7 #define CUDA_CALLABLE
8 #endif 8 #endif
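The `inline` added to the macro matters because callable.h is pulled into many translation units: when compiling with nvcc (where __CUDACC__ is defined), any function defined in a header and prefixed with CUDA_CALLABLE would otherwise produce multiple-definition linker errors. A minimal sketch of the pattern this enables (the function name is hypothetical, not part of the commit):

    #include "stim/cuda/cudatools/callable.h"

    // safe to define in a header: `inline` lets an identical definition
    // appear in every translation unit that includes this file
    CUDA_CALLABLE float square(float x){ return x * x; }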
stim/image/image.h
@@ -58,12 +58,12 @@ class image{ @@ -58,12 +58,12 @@ class image{
58 58
59 int cv_type(){ 59 int cv_type(){
60 if(std::is_same<T, unsigned char>::value) return CV_MAKETYPE(CV_8U, (int)C()); 60 if(std::is_same<T, unsigned char>::value) return CV_MAKETYPE(CV_8U, (int)C());
61 - if(std::is_same<T, char>::value) return CV_MAKETYPE(CV_8S, (int)C());  
62 - if(std::is_same<T, unsigned short>::value) return CV_MAKETYPE(CV_16U, (int)C());  
63 - if(std::is_same<T, short>::value) return CV_MAKETYPE(CV_16S, (int)C());  
64 - if(std::is_same<T, int>::value) return CV_MAKETYPE(CV_32S, (int)C());  
65 - if(std::is_same<T, float>::value) return CV_MAKETYPE(CV_32F, (int)C());  
66 - if(std::is_same<T, double>::value) return CV_MAKETYPE(CV_64F, (int)C()); 61 + else if(std::is_same<T, char>::value) return CV_MAKETYPE(CV_8S, (int)C());
  62 + else if(std::is_same<T, unsigned short>::value) return CV_MAKETYPE(CV_16U, (int)C());
  63 + else if(std::is_same<T, short>::value) return CV_MAKETYPE(CV_16S, (int)C());
  64 + else if(std::is_same<T, int>::value) return CV_MAKETYPE(CV_32S, (int)C());
  65 + else if(std::is_same<T, float>::value) return CV_MAKETYPE(CV_32F, (int)C());
  66 + else if(std::is_same<T, double>::value) return CV_MAKETYPE(CV_64F, (int)C());
67 67
68 std::cout<<"ERROR in stim::image::cv_type - no valid data type found"<<std::endl; 68 std::cout<<"ERROR in stim::image::cv_type - no valid data type found"<<std::endl;
69 exit(1); 69 exit(1);
@@ -72,12 +72,12 @@ class image{ @@ -72,12 +72,12 @@ class image{
72 /// Returns the value for "white" based on the dynamic range (assumes white is 1.0 for floating point images) 72 /// Returns the value for "white" based on the dynamic range (assumes white is 1.0 for floating point images)
73 T white(){ 73 T white(){
74 if(std::is_same<T, unsigned char>::value) return UCHAR_MAX; 74 if(std::is_same<T, unsigned char>::value) return UCHAR_MAX;
75 - if(std::is_same<T, unsigned short>::value) return SHRT_MAX;  
76 - if(std::is_same<T, unsigned>::value) return UINT_MAX;  
77 - if(std::is_same<T, unsigned long>::value) return ULONG_MAX;  
78 - if(std::is_same<T, unsigned long long>::value) return ULLONG_MAX;  
79 - if(std::is_same<T, float>::value) return 1.0f;  
80 - if(std::is_same<T, double>::value) return 1.0; 75 + else if(std::is_same<T, unsigned short>::value) return SHRT_MAX;
  76 + else if(std::is_same<T, unsigned>::value) return UINT_MAX;
  77 + else if(std::is_same<T, unsigned long>::value) return ULONG_MAX;
  78 + else if(std::is_same<T, unsigned long long>::value) return ULLONG_MAX;
  79 + else if(std::is_same<T, float>::value) return 1.0f;
  80 + else if(std::is_same<T, double>::value) return 1.0;
81 81
82 std::cout<<"ERROR in stim::image::white - no white value known for this data type"<<std::endl; 82 std::cout<<"ERROR in stim::image::white - no white value known for this data type"<<std::endl;
83 83
@@ -120,14 +120,6 @@ public: @@ -120,14 +120,6 @@ public:
120 free(img); 120 free(img);
121 } 121 }
122 122
123 - /*stim::image<T> operator=(const stim::image<T>& I){  
124 - if(&I == this) //handle self-assignment  
125 - return *this;  
126 - allocate(I.X(), I.Y(), I.C());  
127 - memcpy(img, I.img, bytes());  
128 - return *this;  
129 - }*/  
130 -  
131 stim::image<T>& operator=(const stim::image<T>& I){ 123 stim::image<T>& operator=(const stim::image<T>& I){
132 init(); 124 init();
133 if(&I == this) //handle self-assignment 125 if(&I == this) //handle self-assignment
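Because every matching branch returns immediately, the new else-if chains are behaviorally identical to the old sequences of independent tests. For a concrete sense of what these type maps produce, a short sketch (assuming OpenCV's CV_MAKETYPE semantics and a hypothetical 3-channel image):

    stim::image<unsigned char> I;   // assume a 3-channel 8-bit image
    // I.cv_type() evaluates CV_MAKETYPE(CV_8U, 3), i.e. CV_8UC3
    // I.white()   returns UCHAR_MAX (255)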
stim/math/bessel.h
@@ -1258,7 +1258,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, @@ -1258,7 +1258,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
1258 P a0,v0,pv0,pv1,vl,ga,gb,vg,vv,w0,w1,ya0,yak,ya1,wa; 1258 P a0,v0,pv0,pv1,vl,ga,gb,vg,vv,w0,w1,ya0,yak,ya1,wa;
1259 int j,n,k,kz,l,lb,lb0,m; 1259 int j,n,k,kz,l,lb,lb0,m;
1260 1260
1261 - a0 = abs(z); 1261 + a0 = ::abs(z);
1262 z1 = z; 1262 z1 = z;
1263 z2 = z*z; 1263 z2 = z*z;
1264 n = (int)v; 1264 n = (int)v;
@@ -1286,7 +1286,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, @@ -1286,7 +1286,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
1286 vm = v; 1286 vm = v;
1287 return 0; 1287 return 0;
1288 } 1288 }
1289 - if (real(z1) < 0.0) z1 = -z; 1289 + if (::real(z1) < 0.0) z1 = -z;
1290 if (a0 <= 12.0) { 1290 if (a0 <= 12.0) {
1291 for (l=0;l<2;l++) { 1291 for (l=0;l<2;l++) {
1292 vl = v0+l; 1292 vl = v0+l;
@@ -1295,7 +1295,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, @@ -1295,7 +1295,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
1295 for (k=1;k<=40;k++) { 1295 for (k=1;k<=40;k++) {
1296 cr *= -0.25*z2/(k*(k+vl)); 1296 cr *= -0.25*z2/(k*(k+vl));
1297 cjvl += cr; 1297 cjvl += cr;
1298 - if (abs(cr) < abs(cjvl)*eps) break; 1298 + if (::abs(cr) < ::abs(cjvl)*eps) break;
1299 } 1299 }
1300 vg = 1.0 + vl; 1300 vg = 1.0 + vl;
1301 ga = gamma(vg); 1301 ga = gamma(vg);
@@ -1348,7 +1348,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, @@ -1348,7 +1348,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
1348 for (k=1;k<=40;k++) { 1348 for (k=1;k<=40;k++) {
1349 cr *= -0.25*z2/(k*(k-vl)); 1349 cr *= -0.25*z2/(k*(k-vl));
1350 cjvl += cr; 1350 cjvl += cr;
1351 - if (abs(cr) < abs(cjvl)*eps) break; 1351 + if (::abs(cr) < ::abs(cjvl)*eps) break;
1352 } 1352 }
1353 vg = 1.0-vl; 1353 vg = 1.0-vl;
1354 gb = gamma(vg); 1354 gb = gamma(vg);
@@ -1381,16 +1381,16 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, @@ -1381,16 +1381,16 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
1381 cyv1 = M_2_PI*(cec*cjv1-1.0/z1-0.25*z1*cs1); 1381 cyv1 = M_2_PI*(cec*cjv1-1.0/z1-0.25*z1*cs1);
1382 } 1382 }
1383 } 1383 }
1384 - if (real(z) < 0.0) { 1384 + if (::real(z) < 0.0) {
1385 cfac0 = exp(pv0*cii); 1385 cfac0 = exp(pv0*cii);
1386 cfac1 = exp(pv1*cii); 1386 cfac1 = exp(pv1*cii);
1387 - if (imag(z) < 0.0) { 1387 + if (::imag(z) < 0.0) {
1388 cyv0 = cfac0*cyv0-(P)2.0*(complex<P>)cii*cos(pv0)*cjv0; 1388 cyv0 = cfac0*cyv0-(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
1389 cyv1 = cfac1*cyv1-(P)2.0*(complex<P>)cii*cos(pv1)*cjv1; 1389 cyv1 = cfac1*cyv1-(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
1390 cjv0 /= cfac0; 1390 cjv0 /= cfac0;
1391 cjv1 /= cfac1; 1391 cjv1 /= cfac1;
1392 } 1392 }
1393 - else if (imag(z) > 0.0) { 1393 + else if (::imag(z) > 0.0) {
1394 cyv0 = cyv0/cfac0+(P)2.0*(complex<P>)cii*cos(pv0)*cjv0; 1394 cyv0 = cyv0/cfac0+(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
1395 cyv1 = cyv1/cfac1+(P)2.0*(complex<P>)cii*cos(pv1)*cjv1; 1395 cyv1 = cyv1/cfac1+(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
1396 cjv0 *= cfac0; 1396 cjv0 *= cfac0;
@@ -1421,7 +1421,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, @@ -1421,7 +1421,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
1421 cf2 = cf1; 1421 cf2 = cf1;
1422 cf1 = cf; 1422 cf1 = cf;
1423 } 1423 }
1424 - if (abs(cjv0) > abs(cjv1)) cs = cjv0/cf; 1424 + if (::abs(cjv0) > ::abs(cjv1)) cs = cjv0/cf;
1425 else cs = cjv1/cf2; 1425 else cs = cjv1/cf2;
1426 for (k=0;k<=n;k++) { 1426 for (k=0;k<=n;k++) {
1427 cjv[k] *= cs; 1427 cjv[k] *= cs;
@@ -1433,21 +1433,21 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, @@ -1433,21 +1433,21 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
1433 } 1433 }
1434 cyv[0] = cyv0; 1434 cyv[0] = cyv0;
1435 cyv[1] = cyv1; 1435 cyv[1] = cyv1;
1436 - ya0 = abs(cyv0); 1436 + ya0 = ::abs(cyv0);
1437 lb = 0; 1437 lb = 0;
1438 cg0 = cyv0; 1438 cg0 = cyv0;
1439 cg1 = cyv1; 1439 cg1 = cyv1;
1440 for (k=2;k<=n;k++) { 1440 for (k=2;k<=n;k++) {
1441 cyk = 2.0*(v0+k-1.0)*cg1/z-cg0; 1441 cyk = 2.0*(v0+k-1.0)*cg1/z-cg0;
1442 - yak = abs(cyk);  
1443 - ya1 = abs(cg0); 1442 + yak = ::abs(cyk);
  1443 + ya1 = ::abs(cg0);
1444 if ((yak < ya0) && (yak< ya1)) lb = k; 1444 if ((yak < ya0) && (yak< ya1)) lb = k;
1445 cyv[k] = cyk; 1445 cyv[k] = cyk;
1446 cg0 = cg1; 1446 cg0 = cg1;
1447 cg1 = cyk; 1447 cg1 = cyk;
1448 } 1448 }
1449 lb0 = 0; 1449 lb0 = 0;
1450 - if ((lb > 4) && (imag(z) != 0.0)) { 1450 + if ((lb > 4) && (::imag(z) != 0.0)) {
1451 while(lb != lb0) { 1451 while(lb != lb0) {
1452 ch2 = cone; 1452 ch2 = cone;
1453 ch1 = czero; 1453 ch1 = czero;
@@ -1470,7 +1470,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, @@ -1470,7 +1470,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
1470 cp21 = ch2; 1470 cp21 = ch2;
1471 if (lb == n) 1471 if (lb == n)
1472 cjv[lb+1] = 2.0*(lb+v0)*cjv[lb]/z-cjv[lb-1]; 1472 cjv[lb+1] = 2.0*(lb+v0)*cjv[lb]/z-cjv[lb-1];
1473 - if (abs(cjv[0]) > abs(cjv[1])) { 1473 + if (::abs(cjv[0]) > ::abs(cjv[1])) {
1474 cyv[lb+1] = (cjv[lb+1]*cyv0-2.0*cp11/(M_PI*z))/cjv[0]; 1474 cyv[lb+1] = (cjv[lb+1]*cyv0-2.0*cp11/(M_PI*z))/cjv[0];
1475 cyv[lb] = (cjv[lb]*cyv0+2.0*cp12/(M_PI*z))/cjv[0]; 1475 cyv[lb] = (cjv[lb]*cyv0+2.0*cp12/(M_PI*z))/cjv[0];
1476 } 1476 }
@@ -1495,8 +1495,8 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, @@ -1495,8 +1495,8 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
1495 cyl2 = cylk; 1495 cyl2 = cylk;
1496 } 1496 }
1497 for (k=2;k<=n;k++) { 1497 for (k=2;k<=n;k++) {
1498 - wa = abs(cyv[k]);  
1499 - if (wa < abs(cyv[k-1])) lb = k; 1498 + wa = ::abs(cyv[k]);
  1499 + if (wa < ::abs(cyv[k-1])) lb = k;
1500 } 1500 }
1501 } 1501 }
1502 } 1502 }
@@ -1515,12 +1515,18 @@ int cbessjyva_sph(int v,complex<P> z,P &vm,complex<P>*cjv, @@ -1515,12 +1515,18 @@ int cbessjyva_sph(int v,complex<P> z,P &vm,complex<P>*cjv,
1515 //first, compute the bessel functions of fractional order 1515 //first, compute the bessel functions of fractional order
1516 cbessjyva<P>(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp); 1516 cbessjyva<P>(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
1517 1517
  1518 + if(z == 0){ //handle degenerate case of z = 0
  1519 + memset(cjv, 0, sizeof(P) * (v+1));
  1520 + cjv[0] = 1;
  1521 + }
  1522 +
1518 //iterate through each and scale 1523 //iterate through each and scale
1519 for(int n = 0; n<=v; n++) 1524 for(int n = 0; n<=v; n++)
1520 { 1525 {
1521 -  
1522 - cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));  
1523 - cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0)); 1526 + if(z != 0){ //handle degenerate case of z = 0
  1527 + cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
  1528 + cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
  1529 + }
1524 1530
1525 cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0)); 1531 cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0));
1526 cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0)); 1532 cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0));
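The sqrt(stim::PI/(z * 2.0)) scaling applied in cbessjyva_sph follows from the identity relating spherical and cylindrical Bessel functions,

    j_l(z) = sqrt(pi / (2z)) * J_{l+1/2}(z)

which is singular at z = 0; the new branch handles that limit directly, using the fact that j_0(0) = 1 and j_l(0) = 0 for every l > 0.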
stim/math/constants.h
1 #ifndef STIM_CONSTANTS_H 1 #ifndef STIM_CONSTANTS_H
2 #define STIM_CONSTANTS_H 2 #define STIM_CONSTANTS_H
3 3
  4 +#include "stim/cuda/cudatools/callable.h"
4 namespace stim{ 5 namespace stim{
5 const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862; 6 const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862;
6 const double TAU = 2 * stim::PI; 7 const double TAU = 2 * stim::PI;
stim/math/matrix.h
@@ -55,7 +55,7 @@ struct matrix @@ -55,7 +55,7 @@ struct matrix
55 vec<Y> operator*(vec<Y> rhs){ 55 vec<Y> operator*(vec<Y> rhs){
56 unsigned int N = rhs.size(); 56 unsigned int N = rhs.size();
57 57
58 - vec3Y> result; 58 + vec<Y> result;
59 result.resize(N); 59 result.resize(N);
60 60
61 for(int r=0; r<N; r++) 61 for(int r=0; r<N; r++)
stim/math/quaternion.h
@@ -43,6 +43,8 @@ public: @@ -43,6 +43,8 @@ public:
43 43
44 CUDA_CALLABLE void CreateRotation(vec3<T> from, vec3<T> to){ 44 CUDA_CALLABLE void CreateRotation(vec3<T> from, vec3<T> to){
45 45
  46 + from = from.norm();
  47 + to = to.norm();
46 vec3<T> r = from.cross(to); //compute the rotation vector 48 vec3<T> r = from.cross(to); //compute the rotation vector
47 T theta = asin(r.len()); //compute the angle of the rotation about r 49 T theta = asin(r.len()); //compute the angle of the rotation about r
48 //deal with a zero vector (both k and kn point in the same direction) 50 //deal with a zero vector (both k and kn point in the same direction)
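Normalizing both inputs matters because the rotation angle is recovered from the cross product: |from cross to| = |from| |to| sin(theta), so the asin(r.len()) on the following line only equals theta when both vectors are unit length. In sketch form:

    vec3<T> r = from.cross(to); // |r| = |from| |to| sin(theta)
    T theta = asin(r.len());    // correct only when |from| = |to| = 1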
stim/optics/mie.h 0 → 100644
  1 +#ifndef STIM_MIE_H
  2 +#define STIM_MIE_H
  3 +
  4 +#include "scalarwave.h"
  5 +#include "../math/bessel.h"
  6 +#include <cmath>
  7 +
  8 +namespace stim{
  9 +
  10 +
  11 +/// Calculate the scattering coefficients for a spherical scatterer
  12 +template<typename T>
  13 +void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){
  14 +
  15 + //temporary variables
  16 + double vm; //allocate space to store the return values for the bessel function calculation
  17 + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
  18 + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
  19 + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
  20 + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
  21 +
  22 + stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  23 + stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  24 + stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  25 + stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  26 +
  27 + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions)
  28 + stim::complex<double> kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives)
  29 +
  30 + stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a
  31 + stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a
  32 +
  33 + stim::complex<double> h_ka, dh_ka;
  34 + stim::complex<double> numerator, denominator;
  35 + stim::complex<double> i(0, 1);
  36 + for(size_t l = 0; l <= Nl; l++){
  37 + h_ka.r = j_ka[l];
  38 + h_ka.i = y_ka[l];
  39 + dh_ka.r = dj_ka[l];
  40 + dh_ka.i = dy_ka[l];
  41 +
  42 + numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l];
  43 + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
  44 + B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
  45 + std::cout<<B[l]<<std::endl;
  46 + }
  47 +}
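In Mie notation, the loop above evaluates the scattered-field coefficients

    B_l = (2l + 1) i^l * [ n j_l(ka) j_l'(kna) - j_l(kna) j_l'(ka) ]
                       / [ j_l(kna) h_l'(ka) - n h_l(ka) j_l'(kna) ]

where h_l = j_l + i y_l is the spherical Hankel function of the first kind, assembled in the loop from the real and imaginary parts stored in h_ka and dh_ka.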
  48 +
  49 +template<typename T>
  50 +void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){
  51 + //temporary variables
  52 + double vm; //allocate space to store the return values for the bessel function calculation
  53 + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
  54 + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
  55 + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
  56 + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
  57 +
  58 + stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  59 + stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  60 + stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  61 + stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  62 +
  63 + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions)
  64 + stim::complex<double> kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives)
  65 +
  66 + stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a
  67 + stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a
  68 +
  69 + stim::complex<double> h_ka, dh_ka;
  70 + stim::complex<double> numerator, denominator;
  71 + stim::complex<double> i(0, 1);
  72 + for(size_t l = 0; l <= Nl; l++){
  73 + h_ka.r = j_ka[l];
  74 + h_ka.i = y_ka[l];
  75 + dh_ka.r = dj_ka[l];
  76 + dh_ka.i = dy_ka[l];
  77 +
  78 + numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka;
  79 + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
  80 + A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
  81 + }
  82 +}
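The internal-field coefficients A_l share the same denominator as B_l; their numerator, j_l(ka) h_l'(ka) - j_l'(ka) h_l(ka), is i times the Wronskian of the spherical Bessel functions and reduces analytically to

    j_l(x) h_l'(x) - j_l'(x) h_l(x) = i / x^2,   x = ka

so the numerator could in principle be replaced by i/(ka)^2; the explicit form above keeps the symmetry with B_coefficients.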
  83 +
  84 +template<typename T>
  85 +__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, int Nl){
  86 + extern __shared__ stim::scalarwave<T> shared_W[]; //declare the list of waves in shared memory
  87 +
  88 + stim::cuda::sharedMemcpy(shared_W, W, nW, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access
  89 + __syncthreads(); //synchronize threads to insure all data is copied
  90 +
  91 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  92 + if(i >= N) return; //exit if this thread is outside the array
  93 + stim::vec3<T> p;
  94 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  95 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  96 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  97 +
  98 + T r = p.len(); //calculate the distance from the sphere
  99 + T k = W[0].kmag();
  100 + if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field)
  101 +
  102 + size_t NC = Nl + 1; //calculate the number of coefficients to be used
  103 + T kr = r * k; //calculate the thread value for k*r
  104 + T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT
  105 + size_t ij = (size_t) fij; //convert to an integral index
  106 + T alpha = fij - ij; //calculate the fractional portion of the index
  107 + size_t n0j = ij * (NC); //start of the first entry in the LUT
  108 + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT
  109 +
  110 + T cos_phi;
  111 + T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials
  112 + T Pl = 1; //initialize the current value for the Legendre polynomial
  113 + T jl;
  114 + stim::complex<T> Ei = 0; //create a register to store the result
  115 + int l;
  116 + for(size_t w = 0; w < nW; w++){
  117 + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere
  118 + for(l = 0; l <= Nl; l++){
  119 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  120 + Pl_1 = Pl;
  121 + if(l == 0){ //computing Pl is done recursively, where the recursive relation
  122 + Pl = cos_phi; // requires the first two orders. This defines the second.
  123 + }
  124 + else{ //if this is not the first iteration, use the recursive relation to calculate Pl
  125 + Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
  126 + }
  127 +
  128 + jl = lerp<T>( j[n0j + l], j[n1j + l], alpha ); //read jl from the LUT and interpolate the result
  129 + Ei += W[w].E() * B[l] * jl * Pl;
  130 + }
  131 + //Ei += shared_W[w].pos(p); //evaluate the plane wave
  132 + }
  133 + E[i] += Ei; //copy the result to device memory
  134 +}
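The Legendre term inside the kernel is generated with the standard three-term recurrence rather than a per-order evaluation:

    (l + 1) P_{l+1}(x) = (2l + 1) x P_l(x) - l P_{l-1}(x),   P_0(x) = 1,   P_1(x) = x

with x = cos_phi; the registers Pl_1 and Pl_2 carry the two previous orders between iterations, so each order costs only a few multiply-adds.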
  135 +
  136 +template<typename T>
  137 +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, size_t Nl){
  138 +
  139 + size_t wave_bytes = sizeof(stim::scalarwave<T>);
  140 + size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available
  141 + size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array
  142 + size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory
  143 + size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves
  144 + size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required
  145 +
  146 + stim::scalarwave<T>* batch_W;
  147 + HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves
  148 +
  149 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  150 + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  151 +
  152 + size_t batch_size; //declare a variable to store the size of the current batch
  153 + size_t waves_processed = 0; //initialize the number of waves processed to zero
  154 + while(waves_processed < nW){ //while there are still waves to be processed
  155 + batch_size = min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left
  156 + batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
  157 + HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory
  158 + cuda_scalar_mie_scatter<T><<< blocks, threads, batch_bytes >>>(E, N, x, y, z, batch_W, batch_size, a, n, B, j, kr_min, dkr, (int)Nl); //call the kernel
  159 + waves_processed += batch_size; //increment the counter indicating how many waves have been processed
  160 + }
  161 + cudaFree(batch_W);
  162 +}
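Shared memory caps how many plane waves a single launch can see, so the wave list is streamed through the kernel in batches; because the kernel accumulates with E[i] += Ei, the partial sums from successive launches combine correctly. A compact way to reason about the batch arithmetic (a sketch, not the code above):

    size_t max_batch = shared_bytes / wave_bytes;        // waves that fit in shared memory
    size_t batches   = (nW + max_batch - 1) / max_batch; // ceil(nW / max_batch) launches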
  163 +/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
  164 +
  165 +/// @param E is a pointer to the destination field values
  166 +/// @param N is the number of points used to calculate the field
  167 +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
  168 +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
  169 +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
  170 +/// @param W is an array of planewaves that will be scattered
  171 +/// @param a is the radius of the sphere
  172 +/// @param n is the complex refractive index of the sphere
  173 +template<typename T>
  174 +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n){
  175 + //calculate the necessary number of orders required to represent the scattered field
  176 + T k = W[0].kmag();
  177 +
  178 + size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
  179 +
  180 + //calculate the scattering coefficients for the sphere
  181 + stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
  182 + B_coefficients(B, a, k, n, Nl);
  183 +
  184 +#ifdef __CUDACC__
  185 + stim::complex<T>* dev_E; //allocate space for the field
  186 + cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
  187 + cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
  188 + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
  189 +
  190 + // COORDINATES
  191 + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified)
  192 + if(x != NULL){
  193 + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
  194 + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
  195 + }
  196 + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified)
  197 + if(y != NULL){
  198 + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
  199 + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
  200 + }
  201 + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified)
  202 + if(z != NULL){
  203 + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
  204 + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
  205 + }
  206 +
  207 + // PLANE WAVES
  208 + stim::scalarwave<T>* dev_W; //allocate space and copy plane waves
  209 + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
  210 + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
  211 +
  212 + // SCATTERING COEFFICIENTS
  213 + stim::complex<T>* dev_B;
  214 + HANDLE_ERROR( cudaMalloc(&dev_B, sizeof(stim::complex<T>) * (Nl+1)) );
  215 + HANDLE_ERROR( cudaMemcpy(dev_B, B, sizeof(stim::complex<T>) * (Nl+1), cudaMemcpyHostToDevice) );
  216 +
  217 + // BESSEL FUNCTION LOOK-UP TABLE
  218 + //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r
  219 + size_t Nlut_j = 1024;
  220 + T r_min = 0;
  221 + T r_max = 10;
  222 +
  223 + T kr_min = k * r_min;
  224 + T kr_max = k * r_max;
  225 +
  226 + //temporary variables
  227 + double vm; //allocate space to store the return values for the bessel function calculation
  228 + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
  229 + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
  230 + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
  231 + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
  232 +
  233 + size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
  234 + T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table
  235 + T dkr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT
  236 + std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl;
  237 + for(size_t kri = 0; kri < Nlut_j; kri++){ //for each value in the LUT
  238 + stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
  239 + for(size_t l = 0; l <= Nl; l++){ //for each order
  240 + bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result
  241 + }
  242 + }
  243 +
  244 + stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
  245 +
  246 + //Allocate device memory and copy everything to the GPU
  247 + T* dev_j_lut;
  248 + HANDLE_ERROR( cudaMalloc(&dev_j_lut, lutj_bytes) );
  249 + HANDLE_ERROR( cudaMemcpy(dev_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) );
  250 +
  251 + gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_B, dev_j_lut, kr_min, dkr, Nl);
  252 +
  253 + cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
  254 +
  255 + if(x != NULL) cudaFree(dev_x); //free everything
  256 + if(y != NULL) cudaFree(dev_y);
  257 + if(z != NULL) cudaFree(dev_z);
  258 + cudaFree(dev_E);
  259 +#else
  260 +
  261 +
  262 + //allocate space to store the bessel function call results
  263 + double vm;
  264 + double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
  265 + double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
  266 + double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
  267 + double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
  268 +
  269 + T* P = (T*) malloc( (Nl + 1) * sizeof(T) );
  270 +
  271 + T r, kr, cos_phi;
  272 + stim::complex<T> h;
  273 + for(size_t i = 0; i < N; i++){
  274 + stim::vec3<T> p; //declare a 3D point
  275 +
  276 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  277 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  278 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  279 + r = p.len();
  280 + if(r >= a){
  281 + for(size_t w = 0; w < W.size(); w++){
  282 + kr = p.len() * W[w].kmag(); //calculate k*r
  283 + stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr);
  284 + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction
  285 + stim::legendre<T>(Nl, cos_phi, P);
  286 +
  287 + for(size_t l = 0; l <= Nl; l++){
  288 + h.r = j_kr[l];
  289 + h.i = y_kr[l];
  290 + E[i] += W[w].E() * B[l] * h * P[l];
  291 + }
  292 + }
  293 + }
  294 + }
  295 +#endif
  296 +}
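The order cutoff used in both CPU entry points,

    Nl = ceil( ka + 4 (ka)^(1/3) + 2 ),

is the standard Wiscombe criterion for truncating the Mie series at size parameter ka, which keeps enough partial waves for the sum to converge without wasting terms.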
  297 +
  298 +template<typename T>
  299 +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
  300 + std::vector< stim::scalarwave<T> > W(1, w);
  301 + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n);
  302 +}
  303 +
  304 +/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere
  305 +
  306 +/// @param E is a pointer to the destination field values
  307 +/// @param N is the number of points used to calculate the field
  308 +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
  309 +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
  310 +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
  311 +/// @param w is a planewave that will be scattered
  312 +/// @param a is the radius of the sphere
  313 +/// @param n is the complex refractive index of the sphere
  314 +template<typename T>
  315 +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n){
  316 +
  317 + //calculate the necessary number of orders required to represent the scattered field
  318 + T k = W[0].kmag();
  319 +
  320 + size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
  321 +
  322 + //calculate the scattering coefficients for the sphere
  323 + stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
  324 + A_coefficients(A, a, k, n, Nl);
  325 +
  326 + //allocate space to store the bessel function call results
  327 + double vm;
  328 + stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  329 + stim::complex<double>* y_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  330 + stim::complex<double>* dj_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  331 + stim::complex<double>* dy_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  332 +
  333 + T* P = (T*) malloc( (Nl + 1) * sizeof(T) );
  334 +
  335 + T r, cos_phi;
  336 + stim::complex<double> knr;
  337 + stim::complex<T> h;
  338 + for(size_t i = 0; i < N; i++){
  339 + stim::vec3<T> p; //declare a 3D point
  340 +
  341 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  342 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  343 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  344 + r = p.len();
  345 + if(r < a){
  346 + E[i] = 0;
  347 + for(size_t w = 0; w < W.size(); w++){
  348 + knr = (stim::complex<double>)n * p.len() * W[w].kmag(); //calculate k*n*r
  349 +
  350 + stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr);
  351 + if(r == 0)
  352 + cos_phi = 0;
  353 + else
  354 + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction
  355 + stim::legendre<T>(Nl, cos_phi, P);
  356 +
  357 + for(size_t l = 0; l <= Nl; l++){
  358 + E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l];
  359 + }
  360 + }
  361 + }
  362 + }
  363 +}
  364 +
  365 +template<typename T>
  366 +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
  367 + std::vector< stim::scalarwave<T> > W(1, w);
  368 + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n);
  369 +}
  370 +
  371 +}
  372 +
  373 +#endif
0 \ No newline at end of file 374 \ No newline at end of file
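A usage sketch for the single-wave entry point (all values illustrative; assumes the scalarwave(k-vector, amplitude) constructor used elsewhere in this commit, the usual C headers, and a zero-initialized field buffer, since the field is accumulated with +=):

    const size_t N = 256;                                    // number of field samples
    float* x = (float*) malloc(N * sizeof(float));           // sample along x (y = z = 0)
    for(size_t i = 0; i < N; i++) x[i] = -10.0f + 20.0f * i / (N - 1);
    stim::complex<float>* E = (stim::complex<float>*) calloc(N, sizeof(stim::complex<float>));
    float lambda = 0.5f;                                     // illustrative wavelength
    stim::scalarwave<float> w(stim::vec3<float>(0, 0, (float)stim::TAU / lambda),
                              stim::complex<float>(1, 0));   // unit-amplitude wave along +z
    stim::cpu_scalar_mie_scatter<float>(E, N, x, NULL, NULL, w,
                              1.0f, stim::complex<float>(1.4f, 0.01f)); // a = 1, n = 1.4 + 0.01i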
stim/optics/scalarbeam.h
@@ -5,7 +5,12 @@ @@ -5,7 +5,12 @@
5 #include "../optics/scalarwave.h" 5 #include "../optics/scalarwave.h"
6 #include "../math/bessel.h" 6 #include "../math/bessel.h"
7 #include "../math/legendre.h" 7 #include "../math/legendre.h"
  8 +#include "../cuda/cudatools/devices.h"
  9 +#include "../cuda/cudatools/timer.h"
  10 +#include <cublas_v2.h>
  11 +#include <math_constants.h>
8 #include <vector> 12 #include <vector>
  13 +#include <stdlib.h>
9 14
10 namespace stim{ 15 namespace stim{
11 16
@@ -105,10 +110,11 @@ public: @@ -105,10 +110,11 @@ public:
105 std::vector< scalarwave<T> > samples(N); //create a vector of plane waves 110 std::vector< scalarwave<T> > samples(N); //create a vector of plane waves
106 T kmag = (T)stim::TAU / lambda; //calculate the wavenumber 111 T kmag = (T)stim::TAU / lambda; //calculate the wavenumber
107 stim::complex<T> apw; //allocate space for the amplitude at the focal point 112 stim::complex<T> apw; //allocate space for the amplitude at the focal point
  113 + T a = stim::TAU * (1 - cos(asin(NA[0]))) / (double)N;
108 stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction 114 stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction
109 for(size_t i=0; i<N; i++){ //for each sample 115 for(size_t i=0; i<N; i++){ //for each sample
110 kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave 116 kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave
111 - apw = exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave 117 + apw = a * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave
112 samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction 118 samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction
113 } 119 }
114 120
@@ -148,7 +154,7 @@ public: @@ -148,7 +154,7 @@ public:
148 /// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional) 154 /// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional)
149 /// @param C is a pointer to Nl + 1 values where the terms will be stored 155 /// @param C is a pointer to Nl + 1 values where the terms will be stored
150 template<typename T> 156 template<typename T>
151 -CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){ 157 +CUDA_CALLABLE void cpu_aperture_integral(T* C, int Nl, T NA, T NA_in = 0){
152 158
153 size_t table_bytes = (Nl + 1) * sizeof(T); //calculate the number of bytes required to store the terms 159 size_t table_bytes = (Nl + 1) * sizeof(T); //calculate the number of bytes required to store the terms
154 T cos_alpha_1 = cos(asin(NA_in)); //calculate the cosine of the angle subtended by the central obscuration 160 T cos_alpha_1 = cos(asin(NA_in)); //calculate the cosine of the angle subtended by the central obscuration
@@ -182,23 +188,151 @@ CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){ @@ -182,23 +188,151 @@ CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){
182 188
183 /// performs linear interpolation into a look-up table 189 /// performs linear interpolation into a look-up table
184 template<typename T> 190 template<typename T>
185 -T lut_lookup(T* lut, T val, size_t N, T min_val, T delta, size_t stride = 0){  
186 - size_t idx = (size_t)((val - min_val) / delta);  
187 - T alpha = val - idx * delta + min_val; 191 +CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val, T delta, size_t n_vals){
  192 + T idx = ((val - min_val) / delta);
  193 + size_t i = (size_t) idx;
  194 + T a1 = idx - i;
  195 + T a0 = 1 - a1;
  196 + size_t n0 = i * n_vals;
  197 + size_t n1 = (i+1) * n_vals;
  198 + for(size_t n = 0; n < n_vals; n++){
  199 + lut_values[n] = lut[n0 + n] * a0 + lut[n1 + n] * a1;
  200 + }
  201 +}
188 202
189 - if(alpha == 0) return lut[idx];  
190 - else return lut[idx * stride] * (1 - alpha) + lut[ (idx+1) * stride] * alpha; 203 +template <typename T>
  204 +CUDA_CALLABLE T lerp(T v0, T v1, T t) {
  205 + return fma(t, v1, fma(-t, v0, v0));
191 } 206 }
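Written with fused multiply-adds, the interpolation expands to

    lerp(v0, v1, t) = fma(t, v1, fma(-t, v0, v0)) = (1 - t) v0 + t v1

which is exact at the endpoints t = 0 and t = 1, unlike the naive v0 + t*(v1 - v0) form, and can map directly to hardware FMA instructions on the GPU.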
192 207
  208 +#ifdef __CUDACC__
193 template<typename T> 209 template<typename T>
194 -void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){  
195 - T k = stim::TAU / lambda; 210 +__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T k, T A, size_t Nl,
  211 + T* C,
  212 + T* lut_j, size_t Nj, T min_kr, T dkr){
  213 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  214 + if(i >= N) return; //exit if this thread is outside the array
  215 +
  216 + T cos_phi = cos(phi[i]); //calculate the thread value for cos(phi)
  217 + T kr = r[i] * k; //calculate the thread value for k*r
  218 + stim::complex<T> Ei = 0; //initialize the value of the field to zero
  219 + size_t NC = Nl + 1; //calculate the number of coefficients to be used
  220 +
  221 + T fij = (kr - min_kr)/dkr; //FP index into the spherical bessel LUT
  222 + size_t ij = (size_t) fij; //convert to an integral index
  223 + T a = fij - ij; //calculate the fractional portion of the index
  224 + size_t n0j = ij * (NC); //start of the first entry in the LUT
  225 + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT
  226 +
  227 + T jl; //declare register to store the spherical bessel function
  228 + T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials
  229 + T Pl = 1; //initialize the current value for the Legendre polynomial
  230 + stim::complex<T> im(0, 1); //declare i (imaginary 1)
  231 + stim::complex<T> i_pow(1, 0); //i_pow stores the current value of i^l so it doesn't have to be re-computed every iteration
  232 + for(int l = 0; l <= Nl; l++){ //for each order
  233 + jl = lerp<T>( lut_j[n0j + l], lut_j[n1j + l], a ); //read jl from the LUT and interpolate the result
  234 + Ei += i_pow * jl * Pl * C[l]; //calculate the value for the field and sum
  235 + i_pow *= im; //multiply i^l * i for the next iteration
  236 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  237 + Pl_1 = Pl;
  238 + if(l == 0){ //computing Pl is done recursively, where the recursive relation
  239 + Pl = cos_phi; // requires the first two orders. This defines the second.
  240 + }
  241 + else{ //if this is not the first iteration, use the recursive relation to calculate Pl
  242 + Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
  243 + }
  244 +
  245 + }
  246 + E[i] = Ei * A * 2 * CUDART_PI_F; //scale the integral by the amplitude
  247 +}
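Two details keep the per-order cost low here: i_pow carries i^l across iterations (one complex multiply per order instead of a pow call), and the 2*pi scale uses CUDART_PI_F from math_constants.h, the CUDA header added to the includes above. The same incremental-power trick, checked in standard C++ for reference (a host-side sketch, not part of the commit):

    #include <complex>
    #include <cassert>
    int main(){
        std::complex<float> im(0, 1), i_pow(1, 0);  // i and i^0
        for(int l = 0; l < 8; l++){
            assert(std::abs(i_pow - std::pow(im, l)) < 1e-5f);
            i_pow *= im;                            // advance to i^(l+1)
        }
    }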
  248 +
  249 +template<typename T>
  250 +void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl, T r_spacing){
  251 +
  252 + //Find the minimum and maximum values of r
  253 + cublasStatus_t stat;
  254 + cublasHandle_t handle;
  255 +
  256 + stat = cublasCreate(&handle); //create a cuBLAS handle
  257 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  258 + printf ("CUBLAS initialization failed\n");
  259 + exit(1);
  260 + }
  261 +
  262 + int i_min, i_max;
  263 + stat = cublasIsamin(handle, (int)N, r, 1, &i_min);
  264 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  265 + printf ("CUBLAS Error: failed to calculate minimum r value.\n");
  266 + exit(1);
  267 + }
  268 + stat = cublasIsamax(handle, (int)N, r, 1, &i_max);
  269 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  270 + printf ("CUBLAS Error: failed to calculate maximum r value.\n");
  271 + exit(1);
  272 + }
  273 +
  274 + T r_min, r_max; //allocate space to store the minimum and maximum values
  275 + HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
  276 + HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
  277 +
  278 + T k = (T)stim::TAU / lambda; //calculate the wavenumber from lambda
  279 + size_t C_bytes = (Nl + 1) * sizeof(T);
  280 + T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms
  281 + cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms
  282 +
  283 + size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r
  284 +
  285 + T kr_min = k * r_min;
  286 + T kr_max = k * r_max;
  287 +
  288 + //temporary variables
  289 + double vm; //allocate space to store the return values for the bessel function calculation
  290 + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
  291 + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
  292 + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
  293 + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
  294 +
  295 + size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
  296 + T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table
  297 + T delta_kr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT
  298 + std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl;
  299 + for(size_t kri = 0; kri < Nlut_j; kri++){ //for each value in the LUT
  300 + stim::bessjyv_sph<double>(Nl, kr_min + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
  301 + for(size_t l = 0; l <= Nl; l++){ //for each order
  302 + bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result
  303 + }
  304 + }
  305 +
  306 + stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
  307 +
  308 + //Allocate device memory and copy everything to the GPU
  309 +
  310 + T* gpu_C;
  311 + HANDLE_ERROR( cudaMalloc(&gpu_C, C_bytes) );
  312 + HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) );
  313 + T* gpu_j_lut;
  314 + HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) );
  315 + HANDLE_ERROR( cudaMemcpy(gpu_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) );
  316 +
  317 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  318 + dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks
196 319
197 - T* C = (T*) malloc( (Nl + 1) * sizeof(T) ); //allocate space for the aperture integral terms 320 + cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, (T)stim::TAU/lambda, A, Nl, gpu_C, gpu_j_lut, Nlut_j, kr_min, delta_kr);
  321 +
  322 + //free the LUT and condenser tables
  323 + HANDLE_ERROR( cudaFree(gpu_C) );
  324 + HANDLE_ERROR( cudaFree(gpu_j_lut) );
  325 +}
  326 +#endif
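One caveat worth noting with the cuBLAS reduction above: cublasIsamin and cublasIsamax are the single-precision variants (a T = double instantiation would need cublasIdamin/cublasIdamax), they compare absolute values, and they return 1-based indices, so a copy of the extreme element would look like this sketch:

    int i_min; // 1-based index of the element with the smallest |r[i]|
    cublasIsamin(handle, (int)N, r, 1, &i_min);
    HANDLE_ERROR( cudaMemcpy(&r_min, r + (i_min - 1), sizeof(T), cudaMemcpyDeviceToHost) );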
  327 +
  328 +/// Calculate the analytical solution to a scalar point spread function given a set of spherical coordinates about the PSF (beam propagation along phi = theta = 0)
  329 +template<typename T>
  330 +void cpu_scalar_psf_local(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl){
  331 + T k = (T)stim::TAU / lambda;
  332 + size_t C_bytes = (Nl + 1) * sizeof(T);
  333 + T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms
198 cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms 334 cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms
199 memset(F, 0, N * sizeof(stim::complex<T>)); 335 memset(F, 0, N * sizeof(stim::complex<T>));
200 -#ifdef NO_CUDA  
201 - memset(F, 0, N * sizeof(stim::complex<T>));  
202 T jl, Pl, kr, cos_phi; 336 T jl, Pl, kr, cos_phi;
203 337
204 double vm; 338 double vm;
@@ -225,71 +359,117 @@ void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, @@ -225,71 +359,117 @@ void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A,
225 359
226 free(C); 360 free(C);
227 free(Pl_cos_phi); 361 free(Pl_cos_phi);
228 -#else  
229 - T min_r = r[0];  
230 - T max_r = r[0];  
231 - for(size_t i = 0; i < N; i++){ //find the minimum and maximum values of r (min and max distance from the focal point)  
232 - if(r[i] < min_r) min_r = r[i];  
233 - if(r[i] > max_r) max_r = r[i];  
234 - }  
235 - T min_kr = k * min_r;  
236 - T max_kr = k * max_r; 362 +}
237 363
238 - //temporary variables  
239 - double vm;  
240 - double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );  
241 - double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );  
242 - double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );  
243 - double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); 364 +/// Converts a set of cartesian points into spherical coordinates surrounding a point spread function (PSF)
  365 +/// @param r is the output distance from the PSF
  366 +/// @param phi is the non-symmetric direction about the PSF
  367 +/// @param x (x, y, z) are the cartesian coordinates in world space
  368 +/// @f is the focal point of the PSF in cartesian coordinates
  369 +/// @d is the propagation direction of the PSF in cartesian coordinates
  370 +template<typename T>
  371 +__global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::vec3<T> f, stim::quaternion<T> q){
244 372
245 - size_t Nlut = (size_t)sqrt(N) * 2;  
246 - T* bessel_lut = (T*) malloc(sizeof(T) * (Nl+1) * Nlut);  
247 - T delta_kr = (max_kr - min_kr) / (Nlut-1);  
248 - for(size_t kri = 0; kri < Nlut; kri++){  
249 - stim::bessjyv_sph<double>(Nl, min_kr + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]  
250 - for(size_t l = 0; l <= Nl; l++){  
251 - bessel_lut[kri * (Nl + 1) + l] = (T)jv[l];  
252 - }  
253 - } 373 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  374 + if(i >= N) return; //exit if this thread is outside the array
254 375
255 - T* Pl_cos_phi = (T*) malloc((Nl + 1) * sizeof(T));  
256 - T kr, cos_phi, jl, Pl;  
257 - for(size_t n = 0; n < N; n++){ //for each point in the field  
258 - kr = k * r[n]; //calculate kr (the optical distance between the focal point and p)  
259 - cos_phi = std::cos(phi[n]); //calculate the cosine of phi  
260 - stim::legendre<T>(Nl, cos_phi, Pl_cos_phi); //calculate the [0 Nl] legendre polynomials for this point 376 + stim::vec3<T> p; //declare a 3D point
  377 +
  378 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  379 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  380 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
261 381
262 - for(int l = 0; l <= Nl; l++){  
263 - jl = lut_lookup<T>(&bessel_lut[l], kr, Nlut, min_kr, delta_kr, Nl+1);  
264 - Pl = Pl_cos_phi[l];  
265 - F[n] += pow(complex<T>(0, 1), l) * jl * Pl * C[l];  
266 - }  
267 - F[n] *= A * stim::TAU;  
268 - }  
269 -#endif 382 + p = p - f; //shift the point to the center of the PSF (focal point)
  383 + p = q.toMatrix3() * p; //rotate the point to align with the propagation direction
  384 +
  385 + stim::vec3<T> ps = p.cart2sph(); //convert from cartesian to spherical coordinates
  386 + r[i] = ps[0]; //store r
  387 + phi[i] = ps[2]; //phi = [0 pi]
270 } 388 }
271 389
  390 +#ifdef __CUDACC__
  391 +/// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates
  392 +template<typename T>
  393 +void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
  394 +
  395 + T* gpu_r; //allocate space for the coordinates in r
  396 + HANDLE_ERROR( cudaMalloc(&gpu_r, sizeof(T) * N) );
  397 + T* gpu_phi;
  398 + HANDLE_ERROR( cudaMalloc(&gpu_phi, sizeof(T) * N) );
  399 + //stim::complex<T>* gpu_E;
  400 + //HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) );
  401 +
  402 + stim::quaternion<T> q; //create a quaternion
  403 + q.CreateRotation(d, stim::vec3<T>(0, 0, 1)); //create a mapping from the propagation direction to the PSF space
  404 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  405 + dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  406 + cuda_cart2psf<T> <<< blocks, threads >>> (gpu_r, gpu_phi, N, x, y, z, f, q); //call the CUDA kernel to move the cartesian coordinates to PSF space
  407 +
  408 + gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing);
  409 +
  410 +}
  411 +#endif
272 412
273 template<typename T> 413 template<typename T>
274 -void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){ 414 +void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
  415 +
  416 +// If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel
  417 +#ifdef __CUDACC__
  418 +
  419 + T* gpu_x = NULL;
  420 + if(x != NULL){
  421 + HANDLE_ERROR( cudaMalloc(&gpu_x, sizeof(T) * N) );
  422 + HANDLE_ERROR( cudaMemcpy(gpu_x, x, sizeof(T) * N, cudaMemcpyHostToDevice) );
  423 + }
  424 + T* gpu_y = NULL;
  425 + if(y != NULL){
  426 + HANDLE_ERROR( cudaMalloc(&gpu_y, sizeof(T) * N) );
  427 + HANDLE_ERROR( cudaMemcpy(gpu_y, y, sizeof(T) * N, cudaMemcpyHostToDevice) );
  428 + }
  429 + T* gpu_z = NULL;
  430 + if(z != NULL){
  431 + HANDLE_ERROR( cudaMalloc(&gpu_z, sizeof(T) * N) );
  432 + HANDLE_ERROR( cudaMemcpy(gpu_z, z, sizeof(T) * N, cudaMemcpyHostToDevice) );
  433 + }
  434 +
  435 + stim::complex<T>* gpu_E;
  436 + HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) );
  437 + HANDLE_ERROR( cudaMemcpy(gpu_E, E, sizeof(stim::complex<T>) * N, cudaMemcpyHostToDevice) );
  438 + gpu_scalar_psf_cart<T>(gpu_E, N, gpu_x, gpu_y, gpu_z, lambda, A, f, d, NA, NA_in, Nl, r_spacing);
  439 + HANDLE_ERROR( cudaMemcpy(E, gpu_E, sizeof(stim::complex<T>) * N, cudaMemcpyDeviceToHost) );
  440 +
  441 + HANDLE_ERROR( cudaFree(gpu_x) );
  442 + HANDLE_ERROR( cudaFree(gpu_y) );
  443 + HANDLE_ERROR( cudaFree(gpu_z) );
  444 + HANDLE_ERROR( cudaFree(gpu_E) );
  445 +
  446 +#else
275 T* r = (T*) malloc(N * sizeof(T)); //allocate space for p in spherical coordinates 447 T* r = (T*) malloc(N * sizeof(T)); //allocate space for p in spherical coordinates
276 T* phi = (T*) malloc(N * sizeof(T)); // only r and phi are necessary (the scalar PSF is symmetric about theta) 448 T* phi = (T*) malloc(N * sizeof(T)); // only r and phi are necessary (the scalar PSF is symmetric about theta)
277 449
278 - stim::vec3<T> p, ps; 450 + stim::quaternion<T> q;
  451 + q.CreateRotation(d, stim::vec3<T>(0, 0, 1));
  452 + stim::matrix<T, 3> R = q.toMatrix3();
  453 + stim::vec3<T> p, ps, ds;
279 for(size_t i = 0; i < N; i++){ 454 for(size_t i = 0; i < N; i++){
280 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions 455 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
281 (y == NULL) ? p[1] = 0 : p[1] = y[i]; 456 (y == NULL) ? p[1] = 0 : p[1] = y[i];
282 (z == NULL) ? p[2] = 0 : p[2] = z[i]; 457 (z == NULL) ? p[2] = 0 : p[2] = z[i];
283 458
  459 + p = p - f;
  460 +
  461 + p = R * p; //rotate the cartesian point
  462 +
284 ps = p.cart2sph(); //convert from cartesian to spherical coordinates 463 ps = p.cart2sph(); //convert from cartesian to spherical coordinates
285 r[i] = ps[0]; //store r 464 r[i] = ps[0]; //store r
286 phi[i] = ps[2]; //phi = [0 pi] 465 phi[i] = ps[2]; //phi = [0 pi]
287 } 466 }
288 467
289 - cpu_scalar_psf(F, N, r, phi, lambda, A, f, NA, NA_in, Nl); //call the spherical coordinate CPU function 468 + cpu_scalar_psf_local(F, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function
290 469
291 free(r); 470 free(r);
292 free(phi); 471 free(phi);
  472 +#endif
293 } 473 }
294 474
295 } //end namespace stim 475 } //end namespace stim
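The amplitude weight introduced in the beam sampler, a = stim::TAU * (1 - cos(asin(NA[0]))) / N, is the solid angle of the spherical cap subtended by the aperture,

    Omega = 2 pi (1 - cos(alpha)),   alpha = asin(NA)

divided evenly over the N plane-wave samples, so the discrete sum over waves approximates the continuous integral over the focusing aperture.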
stim/optics/scalarwave.h
@@ -60,7 +60,7 @@ public: @@ -60,7 +60,7 @@ public:
60 return k.len(); 60 return k.len();
61 } 61 }
62 62
63 - CUDA_CALLABLE vec3< complex<T> > E(){ 63 + CUDA_CALLABLE complex<T> E(){
64 return E0; 64 return E0;
65 } 65 }
66 66
@@ -235,6 +235,34 @@ void gpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scala @@ -235,6 +235,34 @@ void gpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scala
235 cuda_scalarwave<T><<< blocks, threads >>>(F, N, x, y, z, w); //call the kernel 235 cuda_scalarwave<T><<< blocks, threads >>>(F, N, x, y, z, w); //call the kernel
236 } 236 }
237 237
  238 +template<typename T>
  239 +void gpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW){
  240 +
  241 + size_t wave_bytes = sizeof(stim::scalarwave<T>);
  242 + size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available
  243 + size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array
  244 + size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory
  245 + size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves
  246 + size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required
  247 +
  248 + stim::scalarwave<T>* batch_W;
  249 + HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves
  250 +
  251 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  252 + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  253 +
  254 + size_t batch_size; //declare a variable to store the size of the current batch
  255 + size_t waves_processed = 0; //initialize the number of waves processed to zero
  256 + while(waves_processed < nW){ //while there are still waves to be processed
  257 + batch_size = min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left
  258 + batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
  259 + HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory
  260 + cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size); //call the kernel
  261 + waves_processed += batch_size; //increment the counter indicating how many waves have been processed
  262 + }
  263 + cudaFree(batch_W);
  264 +}
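The third kernel-launch configuration argument in the call above, batch_bytes, supplies the size of the dynamically allocated shared memory at run time; it backs the kernel's unsized declaration (the same pattern as cuda_scalar_mie_scatter in mie.h):

    extern __shared__ stim::scalarwave<T> shared_W[]; // sized by the 3rd launch parameter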
  265 +
238 /// Sums a series of coherent plane waves at a specified point 266 /// Sums a series of coherent plane waves at a specified point
239 /// @param field is the output array of field values corresponding to each input point 267 /// @param field is the output array of field values corresponding to each input point
240 /// @param x is an array of x coordinates for the field point 268 /// @param x is an array of x coordinates for the field point
@@ -245,24 +273,13 @@ void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::v @@ -245,24 +273,13 @@ void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::v
245 /// @param A is the list of amplitudes for each wave 273 /// @param A is the list of amplitudes for each wave
246 /// @param S is the list of propagation directions for each wave 274 /// @param S is the list of propagation directions for each wave
247 template<typename T> 275 template<typename T>
248 -void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > w_array){  
249 - size_t S = w_array.size(); //store the number of waves  
250 -#ifdef NO_CUDA  
251 - memset(F, 0, N * sizeof(stim::complex<T>));  
252 - T px, py, pz;  
253 - for(size_t i = 0; i < N; i++){ // for each element in the array  
254 - (x == NULL) ? px = 0 : px = x[i]; // test for NULL values  
255 - (y == NULL) ? py = 0 : py = y[i];  
256 - (z == NULL) ? pz = 0 : pz = z[i];  
257 -  
258 - for(size_t s = 0; s < S; s++){  
259 - F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point  
260 - }  
261 - }  
262 -#else 276 +void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W){
  277 + size_t S = W.size(); //store the number of waves
  278 +#ifdef __CUDACC__
263 stim::complex<T>* dev_F; //allocate space for the field 279 stim::complex<T>* dev_F; //allocate space for the field
264 cudaMalloc(&dev_F, N * sizeof(stim::complex<T>)); 280 cudaMalloc(&dev_F, N * sizeof(stim::complex<T>));
265 - cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used) 281 + cudaMemcpy(dev_F, F, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
  282 + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
266 283
267 T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) 284 T* dev_x = NULL; //allocate space and copy the X coordinate (if specified)
268 if(x != NULL){ 285 if(x != NULL){
@@ -282,28 +299,11 @@ void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::v @@ -282,28 +299,11 @@ void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::v
282 HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); 299 HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
283 } 300 }
284 301
285 - size_t wave_bytes = sizeof(stim::scalarwave<T>);  
286 - size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available  
287 - size_t array_bytes = w_array.size() * wave_bytes; //calculate the maximum number of bytes required for the planewave array  
288 - size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory  
289 - size_t num_batches = w_array.size() / max_batch + 1; //calculate the number of batches required to process all plane waves  
290 - size_t batch_bytes = min(w_array.size(), max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required  
291 -  
292 - stim::scalarwave<T>* dev_w;  
293 - HANDLE_ERROR(cudaMalloc(&dev_w, batch_bytes)); //allocate memory for a single batch of plane waves 302 + stim::scalarwave<T>* dev_W;
  303 + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
  304 + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
294 305
295 - int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device  
296 - dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks  
297 -  
298 - size_t batch_size; //declare a variable to store the size of the current batch  
299 - size_t waves_processed = 0; //initialize the number of waves processed to zero  
300 - while(waves_processed < w_array.size()){ //while there are still waves to be processed  
301 - batch_size = min<size_t>(max_batch, w_array.size() - waves_processed); //process either a whole batch, or whatever is left  
302 - batch_bytes = batch_size * sizeof(stim::scalarwave<T>);  
303 - HANDLE_ERROR(cudaMemcpy(dev_w, &w_array[waves_processed], batch_bytes, cudaMemcpyHostToDevice)); //copy the plane waves into global memory  
304 - cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(dev_F, N, dev_x, dev_y, dev_z, dev_w, batch_size); //call the kernel  
305 - waves_processed += batch_size; //increment the counter indicating how many waves have been processed  
306 - } 306 + gpu_scalarwaves(dev_F, N, dev_x, dev_y, dev_z, dev_W, W.size());
307 307
308 cudaMemcpy(F, dev_F, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory 308 cudaMemcpy(F, dev_F, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
309 309
@@ -311,15 +311,25 @@ void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::v @@ -311,15 +311,25 @@ void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::v
311 if(y != NULL) cudaFree(dev_y); 311 if(y != NULL) cudaFree(dev_y);
312 if(z != NULL) cudaFree(dev_z); 312 if(z != NULL) cudaFree(dev_z);
313 cudaFree(dev_F); 313 cudaFree(dev_F);
314 - cudaFree(dev_w); 314 +#else
  315 + memset(F, 0, N * sizeof(stim::complex<T>));
  316 + T px, py, pz;
  317 + for(size_t i = 0; i < N; i++){ // for each element in the array
  318 + (x == NULL) ? px = 0 : px = x[i]; // test for NULL values
  319 + (y == NULL) ? py = 0 : py = y[i];
  320 + (z == NULL) ? pz = 0 : pz = z[i];
315 321
  322 + for(size_t s = 0; s < S; s++){
  323 + F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point
  324 + }
  325 + }
316 #endif 326 #endif
317 } 327 }
318 328
319 template<typename T> 329 template<typename T>
320 void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){ 330 void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
321 std::vector< stim::scalarwave<T> > w_array(1, w); 331 std::vector< stim::scalarwave<T> > w_array(1, w);
322 - cpu_sum_scalarwaves(F, N, x, y, z, w_array); 332 + cpu_scalarwaves(F, N, x, y, z, w_array);
323 } 333 }
324 334
325 335
@@ -331,7 +341,7 @@ void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scala @@ -331,7 +341,7 @@ void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scala
331 /// @param A is the list of amplitudes for each wave 341 /// @param A is the list of amplitudes for each wave
332 /// @param S is the list of propagation directions for each wave 342 /// @param S is the list of propagation directions for each wave
333 template<typename T> 343 template<typename T>
334 -CUDA_CALLABLE stim::complex<T> sum_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave<T> > W){ 344 +CUDA_CALLABLE stim::complex<T> cpu_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave<T> > W){
335 size_t N = W.size(); //get the number of plane wave samples 345 size_t N = W.size(); //get the number of plane wave samples
336 stim::complex<T> field(0, 0); //initialize the field to zero (0) 346 stim::complex<T> field(0, 0); //initialize the field to zero (0)
337 stim::vec3<T> k; //allocate space for the direction vector 347 stim::vec3<T> k; //allocate space for the direction vector