Commit d31038b3bf8eff2208bea9e67399009e2eeecec2

Authored by David Mayerich
2 parents 0bb6cf51 0288346a

Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib

stim/cuda/cudatools/devices.h
... ... @@ -15,7 +15,7 @@ int maxThreadsPerBlock()
15 15 }
16 16  
17 17 extern "C"
18   -int sharedMemPerBlock()
  18 +size_t sharedMemPerBlock()
19 19 {
20 20 int device;
21 21 cudaGetDevice(&device); //get the id of the current device
... ... @@ -23,6 +23,16 @@ int sharedMemPerBlock()
23 23 cudaGetDeviceProperties(&props, device);
24 24 return props.sharedMemPerBlock;
25 25 }
  26 +
  27 +extern "C"
  28 +size_t constMem()
  29 +{
  30 + int device;
  31 + cudaGetDevice(&device); //get the id of the current device
  32 + cudaDeviceProp props; //device property structure
  33 + cudaGetDeviceProperties(&props, device);
  34 + return props.totalConstMem;
  35 +}
26 36 } //end namespace rts
27 37  
28 38 #endif
... ...
stim/cuda/sharedmem.cuh
... ... @@ -5,7 +5,7 @@
5 5 namespace stim{
6 6 namespace cuda{
7 7  
8   - // Copies values from global memory to shared memory, optimizing threads
  8 + // Copies values from texture memory to shared memory, optimizing threads
9 9 template<typename T>
10 10 __device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
11 11 unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
... ... @@ -35,6 +35,19 @@ namespace stim{
35 35 }
36 36 }
37 37  
  38 + // Copies values from global memory to shared memory, optimizing threads
  39 + template<typename T>
  40 + __device__ void sharedMemcpy(T* dest, T* src, size_t N, size_t tid, size_t nt){
  41 +
  42 + size_t I = N / nt + 1; //calculate the number of iterations required to make the copy
  43 + size_t xi = tid; //initialize the source and destination index to the thread ID
  44 + for(size_t i = 0; i < I; i++){ //for each iteration
  45 + if(xi < N) //if the index is within the copy region
  46 + dest[xi] = src[xi]; //perform the copy
  47 + xi += nt;
  48 + }
  49 + }
  50 +
38 51  
39 52 }
40 53 }
... ...
stim/math/bessel.h
... ... @@ -17,6 +17,11 @@ static complex&lt;double&gt; czero(0.0,0.0);
17 17 template< typename P >
18 18 P gamma(P x)
19 19 {
  20 + const P EPS = numeric_limits<P>::epsilon();
  21 + const P FPMIN_MAG = numeric_limits<P>::min();
  22 + const P FPMIN = numeric_limits<P>::lowest();
  23 + const P FPMAX = numeric_limits<P>::max();
  24 +
20 25 int i,k,m;
21 26 P ga,gr,r,z;
22 27  
... ... @@ -47,7 +52,7 @@ P gamma(P x)
47 52 -0.54e-14,
48 53 0.14e-14};
49 54  
50   - if (x > 171.0) return 1e308; // This value is an overflow flag.
  55 + if (x > 171.0) return FPMAX; // This value is an overflow flag.
51 56 if (x == (int)x) {
52 57 if (x > 0.0) {
53 58 ga = 1.0; // use factorial
... ... @@ -56,7 +61,7 @@ P gamma(P x)
56 61 }
57 62 }
58 63 else
59   - ga = 1e308;
  64 + ga = FPMAX;
60 65 }
61 66 else {
62 67 if (fabs(x) > 1.0) {
... ... @@ -89,6 +94,11 @@ template&lt;typename P&gt;
89 94 int bessjy01a(P x,P &j0,P &j1,P &y0,P &y1,
90 95 P &j0p,P &j1p,P &y0p,P &y1p)
91 96 {
  97 + const P EPS = numeric_limits<P>::epsilon();
  98 + const P FPMIN_MAG = numeric_limits<P>::min();
  99 + const P FPMIN = numeric_limits<P>::lowest();
  100 + const P FPMAX = numeric_limits<P>::max();
  101 +
92 102 P x2,r,ec,w0,w1,r0,r1,cs0,cs1;
93 103 P cu,p0,q0,p1,q1,t1,t2;
94 104 int k,kz;
... ... @@ -157,12 +167,12 @@ int bessjy01a(P x,P &amp;j0,P &amp;j1,P &amp;y0,P &amp;y1,
157 167 if (x == 0.0) {
158 168 j0 = 1.0;
159 169 j1 = 0.0;
160   - y0 = -1e308;
161   - y1 = -1e308;
  170 + y0 = -FPMIN;
  171 + y1 = -FPMIN;
162 172 j0p = 0.0;
163 173 j1p = 0.5;
164   - y0p = 1e308;
165   - y1p = 1e308;
  174 + y0p = FPMAX;
  175 + y1p = FPMAX;
166 176 return 0;
167 177 }
168 178 x2 = x*x;
... ... @@ -329,7 +339,7 @@ int msta1(P x,int mp)
329 339 for (i=0;i<20;i++) {
330 340 nn = (int)(n1-(n1-n0)/(1.0-f0/f1));
331 341 f = 0.5*log10(6.28*nn)-nn*log10(1.36*a0/nn)-mp;
332   - if (abs(nn-n1) < 1) break;
  342 + if (std::abs(nn-n1) < 1) break;
333 343 n0 = n1;
334 344 f0 = f1;
335 345 n1 = nn;
... ... @@ -361,7 +371,7 @@ int msta2(P x,int n,int mp)
361 371 for (i=0;i<20;i++) {
362 372 nn = (int)(n1-(n1-n0)/(1.0-f0/f1));
363 373 f = 0.5*log10(6.28*nn)-nn*log10(1.36*a0/nn)-obj;
364   - if (abs(nn-n1) < 1) break;
  374 + if (std::abs(nn-n1) < 1) break;
365 375 n0 = n1;
366 376 f0 = f1;
367 377 n1 = nn;
... ... @@ -596,21 +606,26 @@ int bessjyv(P v,P x,P &amp;vm,P *jv,P *yv,
596 606 P b,ec,w0,w1,bju0,bju1,pv0,pv1,byvk;
597 607 int j,k,l,m,n,kz;
598 608  
  609 + const P EPS = numeric_limits<P>::epsilon();
  610 + const P FPMIN_MAG = numeric_limits<P>::min();
  611 + const P FPMIN = numeric_limits<P>::lowest();
  612 + const P FPMAX = numeric_limits<P>::max();
  613 +
599 614 x2 = x*x;
600 615 n = (int)v;
601 616 v0 = v-n;
602 617 if ((x < 0.0) || (v < 0.0)) return 1;
603   - if (x < 1e-15) {
  618 + if (x < EPS) {
604 619 for (k=0;k<=n;k++) {
605 620 jv[k] = 0.0;
606   - yv[k] = -1e308;
  621 + yv[k] = FPMIN;
607 622 djv[k] = 0.0;
608   - dyv[k] = 1e308;
  623 + dyv[k] = FPMAX;
609 624 if (v0 == 0.0) {
610 625 jv[0] = 1.0;
611 626 djv[1] = 0.5;
612 627 }
613   - else djv[0] = 1e308;
  628 + else djv[0] = FPMAX;
614 629 }
615 630 vm = v;
616 631 return 0;
... ... @@ -623,7 +638,7 @@ int bessjyv(P v,P x,P &amp;vm,P *jv,P *yv,
623 638 for (k=1;k<=40;k++) {
624 639 r *= -0.25*x2/(k*(k+vl));
625 640 bjvl += r;
626   - if (fabs(r) < fabs(bjvl)*1e-15) break;
  641 + if (fabs(r) < fabs(bjvl)*EPS) break;
627 642 }
628 643 vg = 1.0 + vl;
629 644 a = pow(0.5*x,vl)/gamma(vg);
... ... @@ -686,7 +701,7 @@ int bessjyv(P v,P x,P &amp;vm,P *jv,P *yv,
686 701 if (m < n) n = m;
687 702 else m = msta2(x,n,15);
688 703 f2 = 0.0;
689   - f1 = 1.0e-100;
  704 + f1 = FPMIN_MAG;
690 705 for (k=m;k>=0;k--) {
691 706 f = 2.0*(v0+k+1.0)*f1/x-f2;
692 707 if (k <= n) jv[k] = f;
... ... @@ -763,20 +778,26 @@ int bessjyv(P v,P x,P &amp;vm,P *jv,P *yv,
763 778  
764 779 template<typename P>
765 780 int bessjyv_sph(int v, P z, P &vm, P* cjv,
766   - P* cyv, P* cjvp, P* cyvp)
767   -{
  781 + P* cyv, P* cjvp, P* cyvp){
  782 +
768 783 //first, compute the bessel functions of fractional order
769   - bessjyv(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
  784 + bessjyv<P>(v + (P)0.5, z, vm, cjv, cyv, cjvp, cyvp);
  785 +
  786 + if(z == 0){ //handle degenerate case of z = 0
  787 + memset(cjv, 0, sizeof(P) * (v+1));
  788 + cjv[0] = 1;
  789 + }
770 790  
771 791 //iterate through each and scale
772   - for(int n = 0; n<=v; n++)
773   - {
  792 + for(int n = 0; n<=v; n++){
774 793  
775   - cjv[n] = cjv[n] * sqrt(rtsPI/(z * 2.0));
776   - cyv[n] = cyv[n] * sqrt(rtsPI/(z * 2.0));
  794 + if(z != 0){ //handle degenerate case of z = 0
  795 + cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
  796 + cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
  797 + }
777 798  
778   - cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(rtsPI / (z * 2.0));
779   - cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(rtsPI / (z * 2.0));
  799 + cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0));
  800 + cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0));
780 801 }
781 802  
782 803 return 0;
... ... @@ -1498,11 +1519,11 @@ int cbessjyva_sph(int v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1498 1519 for(int n = 0; n<=v; n++)
1499 1520 {
1500 1521  
1501   - cjv[n] = cjv[n] * sqrt(rtsPI/(z * 2.0));
1502   - cyv[n] = cyv[n] * sqrt(rtsPI/(z * 2.0));
  1522 + cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
  1523 + cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
1503 1524  
1504   - cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(rtsPI / (z * 2.0));
1505   - cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(rtsPI / (z * 2.0));
  1525 + cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0));
  1526 + cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0));
1506 1527 }
1507 1528  
1508 1529 return 0;
... ...
stim/math/complex.h
1   -/*RTS Complex number class. This class is CUDA compatible,
2   -and can therefore be used in CUDA code and on CUDA devices.
3   -*/
  1 +/// CUDA compatible complex number class
4 2  
5   -#ifndef RTS_COMPLEX
6   -#define RTS_COMPLEX
  3 +#ifndef STIM_COMPLEX
  4 +#define STIM_COMPLEX
7 5  
8   -#include "../cuda/callable.h"
  6 +#include "../cuda/cudatools/callable.h"
9 7 #include <cmath>
10 8 #include <string>
11 9 #include <sstream>
... ... @@ -230,12 +228,6 @@ struct complex
230 228 return result;
231 229 }
232 230  
233   - /*CUDA_CALLABLE complex<T> pow(int y)
234   - {
235   -
236   - return pow((double)y);
237   - }*/
238   -
239 231 CUDA_CALLABLE complex<T> pow(T y)
240 232 {
241 233 complex<T> result;
... ... @@ -328,8 +320,31 @@ struct complex
328 320 return *this;
329 321 }
330 322  
  323 +
  324 +
331 325 };
332 326  
  327 +/// Cast an array of complex values to an array of real values
  328 +template<typename T>
  329 +static void real(T* r, complex<T>* c, size_t n){
  330 + for(size_t i = 0; i < n; i++)
  331 + r[i] = c[i].real();
  332 +}
  333 +
  334 +/// Cast an array of complex values to an array of real values
  335 +template<typename T>
  336 +static void imag(T* r, complex<T>* c, size_t n){
  337 + for(size_t i = 0; i < n; i++)
  338 + r[i] = c[i].imag();
  339 +}
  340 +
  341 +/// Calculate the magnitude of an array of complex values
  342 +template<typename T>
  343 +static void abs(T* m, complex<T>* c, size_t n){
  344 + for(size_t i = 0; i < n; i++)
  345 + m[i] = c[i].abs();
  346 +}
  347 +
333 348 } //end RTS namespace
334 349  
335 350 //addition
... ... @@ -432,17 +447,6 @@ CUDA_CALLABLE static T imag(stim::complex&lt;T&gt; a)
432 447 return a.i;
433 448 }
434 449  
435   -//trigonometric functions
436   -//template<class A>
437   -/*CUDA_CALLABLE static stim::complex<float> sinf(const stim::complex<float> x)
438   -{
439   - stim::complex<float> result;
440   - result.r = sinf(x.r) * coshf(x.i);
441   - result.i = cosf(x.r) * sinhf(x.i);
442   -
443   - return result;
444   -}*/
445   -
446 450 template<class A>
447 451 CUDA_CALLABLE stim::complex<A> sin(const stim::complex<A> x)
448 452 {
... ... @@ -453,17 +457,6 @@ CUDA_CALLABLE stim::complex&lt;A&gt; sin(const stim::complex&lt;A&gt; x)
453 457 return result;
454 458 }
455 459  
456   -//floating point template
457   -//template<class A>
458   -/*CUDA_CALLABLE static stim::complex<float> cosf(const stim::complex<float> x)
459   -{
460   - stim::complex<float> result;
461   - result.r = cosf(x.r) * coshf(x.i);
462   - result.i = -(sinf(x.r) * sinhf(x.i));
463   -
464   - return result;
465   -}*/
466   -
467 460 template<class A>
468 461 CUDA_CALLABLE stim::complex<A> cos(const stim::complex<A> x)
469 462 {
... ... @@ -496,10 +489,4 @@ std::istream&amp; operator&gt;&gt;(std::istream&amp; is, stim::complex&lt;A&gt;&amp; x)
496 489 return is; //return the stream
497 490 }
498 491  
499   -//#if __GNUC__ > 3 && __GNUC_MINOR__ > 7
500   -//template<class T> using rtsComplex = stim::complex<T>;
501   -//#endif
502   -
503   -
504   -
505 492 #endif
... ...
stim/math/constants.h
1   -#ifndef RTS_CONSTANTS_H
2   -#define RTS_CONSTANTS_H
  1 +#ifndef STIM_CONSTANTS_H
  2 +#define STIM_CONSTANTS_H
3 3  
4   -#define stimPI 3.14159
5   -#define stimTAU 2 * rtsPI
  4 +namespace stim{
  5 + const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862;
  6 + const double TAU = 2 * stim::PI;
  7 +}
6 8  
7 9 #endif
... ...
stim/math/legendre.h
1 1 #ifndef RTS_LEGENDRE_H
2 2 #define RTS_LEGENDRE_H
3 3  
4   -#include "rts/cuda/callable.h"
  4 +#include "../cuda/cudatools/callable.h"
5 5  
6 6 namespace stim{
7 7  
... ... @@ -24,9 +24,11 @@ CUDA_CALLABLE void shift_legendre(int n, T x, T&amp; P0, T&amp; P1)
24 24 P1 = Pnew;
25 25 }
26 26  
  27 +/// Iteratively evaluates the Legendre polynomials for orders l = [0 n]
27 28 template <typename T>
28 29 CUDA_CALLABLE void legendre(int n, T x, T* P)
29 30 {
  31 + if(n < 0) return;
30 32 P[0] = 1;
31 33  
32 34 if(n >= 1)
... ...
stim/math/matrix.h
... ... @@ -50,10 +50,8 @@ struct matrix
50 50 return *this;
51 51 }
52 52  
53   -
54 53 template<typename Y>
55   - CUDA_CALLABLE vec<Y> operator*(vec<Y> rhs)
56   - {
  54 + vec<Y> operator*(vec<Y> rhs){
57 55 unsigned int N = rhs.size();
58 56  
59 57 vec<Y> result;
... ... @@ -66,6 +64,16 @@ struct matrix
66 64 return result;
67 65 }
68 66  
  67 + template<typename Y>
  68 + CUDA_CALLABLE vec3<Y> operator*(vec3<Y> rhs){
  69 + vec3<Y> result = 0;
  70 + for(int r=0; r<3; r++)
  71 + for(int c=0; c<3; c++)
  72 + result[r] += (*this)(r, c) * rhs[c];
  73 +
  74 + return result;
  75 + }
  76 +
69 77 std::string toStr()
70 78 {
71 79 std::stringstream ss;
... ... @@ -82,10 +90,6 @@ struct matrix
82 90  
83 91 return ss.str();
84 92 }
85   -
86   -
87   -
88   -
89 93 };
90 94  
91 95 } //end namespace rts
... ...
stim/math/meshgrid.h 0 → 100644
  1 +#ifndef STIM_MESHGRID_H
  2 +#define STIM_MESHGRID_H
  3 +
  4 +namespace stim{
  5 +
  6 + /// Create a 2D grid based on a pair of vectors representing the grid spacing (see Matlab)
  7 + /// @param X is an [nx x ny] array that will store the X coordinates for each 2D point
  8 + /// @param Y is an [nx x ny] array that will store the Y coordinates for each 2D point
  9 + /// @param x is an [nx] array that provides the positions of grid points in the x direction
  10 + /// @param nx is the number of grid points in the x direction
  11 + /// @param y is an [ny] array that provides the positions of grid points in the y direction
  12 + /// @param ny is the number of grid points in the y direction
  13 + template<typename T>
  14 + void meshgrid(T* X, T* Y, T* x, size_t nx, T* y, size_t ny){
  15 + size_t xi, yi; //allocate index variables
  16 + for(yi = 0; yi < ny; yi++){ //iterate through each column
  17 + for(xi = 0; xi < nx; xi++){ //iterate through each row
  18 + X[yi * nx + xi] = x[xi];
  19 + Y[yi * nx + xi] = y[yi];
  20 + }
  21 + }
  22 + }
  23 +
  24 + /// Creates an array of n equally spaced values in the range [xmin xmax]
  25 + /// @param X is an array of length n that stores the values
  26 + /// @param xmin is the start point of the array
  27 + /// @param xmax is the end point of the array
  28 + /// @param n is the number of points in the array
  29 + template<typename T>
  30 + void linspace(T* X, T xmin, T xmax, size_t n){
  31 + T alpha;
  32 + for(size_t i = 0; i < n; i++){
  33 + alpha = (T)i / (T)n;
  34 + X[i] = (1 - alpha) * xmin + alpha * xmax;
  35 + }
  36 + }
  37 +
  38 +
  39 +}
  40 +
  41 +
  42 +#endif
0 43 \ No newline at end of file
... ...
stim/math/quaternion.h
... ... @@ -26,13 +26,13 @@ public:
26 26  
27 27 CUDA_CALLABLE void CreateRotation(T theta, T ux, T uy, T uz){
28 28  
29   - vec<T> u(ux, uy, uz);
  29 + vec3<T> u(ux, uy, uz);
30 30 CreateRotation(theta, u);
31 31 }
32 32  
33   - CUDA_CALLABLE void CreateRotation(T theta, vec<T> u){
  33 + CUDA_CALLABLE void CreateRotation(T theta, vec3<T> u){
34 34  
35   - vec<T> u_hat = u.norm();
  35 + vec3<T> u_hat = u.norm();
36 36  
37 37 //assign the given Euler rotation to this quaternion
38 38 w = (T)cos(theta/2);
... ... @@ -41,9 +41,9 @@ public:
41 41 z = u_hat[2]*(T)sin(theta/2);
42 42 }
43 43  
44   - void CreateRotation(vec<T> from, vec<T> to){
  44 + CUDA_CALLABLE void CreateRotation(vec3<T> from, vec3<T> to){
45 45  
46   - vec<T> r = from.cross(to); //compute the rotation vector
  46 + vec3<T> r = from.cross(to); //compute the rotation vector
47 47 T theta = asin(r.len()); //compute the angle of the rotation about r
48 48 //deal with a zero vector (both k and kn point in the same direction)
49 49 if(theta == (T)0){
... ...
stim/math/vec3.h 0 → 100644
  1 +#ifndef STIM_VEC3_H
  2 +#define STIM_VEC3_H
  3 +
  4 +
  5 +#include <stim/cuda/cudatools/callable.h>
  6 +
  7 +
  8 +namespace stim{
  9 +
  10 +
  11 +/// A class designed to act as a 3D vector with CUDA compatibility
  12 +template<typename T>
  13 +class vec3{
  14 +
  15 +protected:
  16 + T ptr[3];
  17 +
  18 +public:
  19 +
  20 + CUDA_CALLABLE vec3(){}
  21 +
  22 + CUDA_CALLABLE vec3(T v){
  23 + ptr[0] = ptr[1] = ptr[2] = v;
  24 + }
  25 +
  26 + CUDA_CALLABLE vec3(T x, T y, T z){
  27 + ptr[0] = x;
  28 + ptr[1] = y;
  29 + ptr[2] = z;
  30 + }
  31 +
  32 + //copy constructor
  33 + CUDA_CALLABLE vec3( const vec3<T>& other){
  34 + ptr[0] = other.ptr[0];
  35 + ptr[1] = other.ptr[1];
  36 + ptr[2] = other.ptr[2];
  37 + }
  38 +
  39 + //access an element using an index
  40 + CUDA_CALLABLE T& operator[](int idx){
  41 + return ptr[idx];
  42 + }
  43 +
  44 +/// Casting operator. Creates a new vector with a new type U.
  45 + template< typename U >
  46 + CUDA_CALLABLE operator vec3<U>(){
  47 + vec3<U> result;
  48 + result.ptr[0] = (U)ptr[0];
  49 + result.ptr[1] = (U)ptr[1];
  50 + result.ptr[2] = (U)ptr[2];
  51 +
  52 + return result;
  53 + }
  54 +
  55 + // computes the squared Euclidean length (useful for several operations where only >, =, or < matter)
  56 + CUDA_CALLABLE T len_sq() const{
  57 + return ptr[0] * ptr[0] + ptr[1] * ptr[1] + ptr[2] * ptr[2];
  58 + }
  59 +
  60 + /// computes the Euclidean length of the vector
  61 + CUDA_CALLABLE T len() const{
  62 + return sqrt(len_sq());
  63 + }
  64 +
  65 +
  66 + /// Convert the vector from cartesian to spherical coordinates (x, y, z -> r, theta, phi where theta = [0, 2*pi])
  67 + CUDA_CALLABLE vec3<T> cart2sph() const{
  68 + vec3<T> sph;
  69 + sph.ptr[0] = len();
  70 + sph.ptr[1] = std::atan2(ptr[1], ptr[0]);
  71 + if(sph.ptr[0] == 0)
  72 + sph.ptr[2] = 0;
  73 + else
  74 + sph.ptr[2] = std::acos(ptr[2] / sph.ptr[0]);
  75 + return sph;
  76 + }
  77 +
  78 + /// Convert the vector from cartesian to spherical coordinates (r, theta, phi -> x, y, z where theta = [0, 2*pi])
  79 + CUDA_CALLABLE vec3<T> sph2cart() const{
  80 + vec3<T> cart;
  81 + cart.ptr[0] = ptr[0] * std::cos(ptr[1]) * std::sin(ptr[2]);
  82 + cart.ptr[1] = ptr[0] * std::sin(ptr[1]) * std::sin(ptr[2]);
  83 + cart.ptr[2] = ptr[0] * std::cos(ptr[2]);
  84 +
  85 + return cart;
  86 + }
  87 +
  88 + /// Computes the normalized vector (where each coordinate is divided by the L2 norm)
  89 + CUDA_CALLABLE vec3<T> norm() const{
  90 + vec3<T> result;
  91 + T l = len(); //compute the vector length
  92 + return (*this) / l;
  93 + }
  94 +
  95 + /// Computes the cross product of a 3-dimensional vector
  96 + CUDA_CALLABLE vec3<T> cross(const vec3<T> rhs) const{
  97 +
  98 + vec3<T> result;
  99 +
  100 + result[0] = (ptr[1] * rhs.ptr[2] - ptr[2] * rhs.ptr[1]);
  101 + result[1] = (ptr[2] * rhs.ptr[0] - ptr[0] * rhs.ptr[2]);
  102 + result[2] = (ptr[0] * rhs.ptr[1] - ptr[1] * rhs.ptr[0]);
  103 +
  104 + return result;
  105 + }
  106 +
  107 + /// Compute the Euclidean inner (dot) product
  108 + CUDA_CALLABLE T dot(vec3<T> rhs) const{
  109 + return ptr[0] * rhs.ptr[0] + ptr[1] * rhs.ptr[1] + ptr[2] * rhs.ptr[2];
  110 + }
  111 +
  112 + /// Arithmetic addition operator
  113 +
  114 + /// @param rhs is the right-hand-side operator for the addition
  115 + CUDA_CALLABLE vec3<T> operator+(vec3<T> rhs) const{
  116 + vec3<T> result;
  117 + result.ptr[0] = ptr[0] + rhs[0];
  118 + result.ptr[1] = ptr[1] + rhs[1];
  119 + result.ptr[2] = ptr[2] + rhs[2];
  120 + return result;
  121 + }
  122 +
  123 + /// Arithmetic addition to a scalar
  124 +
  125 + /// @param rhs is the right-hand-side operator for the addition
  126 + CUDA_CALLABLE vec3<T> operator+(T rhs) const{
  127 + vec3<T> result;
  128 + result.ptr[0] = ptr[0] + rhs;
  129 + result.ptr[1] = ptr[1] + rhs;
  130 + result.ptr[2] = ptr[2] + rhs;
  131 + return result;
  132 + }
  133 +
  134 + /// Arithmetic subtraction operator
  135 +
  136 + /// @param rhs is the right-hand-side operator for the subtraction
  137 + CUDA_CALLABLE vec3<T> operator-(vec3<T> rhs) const{
  138 + vec3<T> result;
  139 + result.ptr[0] = ptr[0] - rhs[0];
  140 + result.ptr[1] = ptr[1] - rhs[1];
  141 + result.ptr[2] = ptr[2] - rhs[2];
  142 + return result;
  143 + }
  144 + /// Arithmetic subtraction to a scalar
  145 +
  146 + /// @param rhs is the right-hand-side operator for the addition
  147 + CUDA_CALLABLE vec3<T> operator-(T rhs) const{
  148 + vec3<T> result;
  149 + result.ptr[0] = ptr[0] - rhs;
  150 + result.ptr[1] = ptr[1] - rhs;
  151 + result.ptr[2] = ptr[2] - rhs;
  152 + return result;
  153 + }
  154 +
  155 + /// Arithmetic scalar multiplication operator
  156 +
  157 + /// @param rhs is the right-hand-side operator for the subtraction
  158 + CUDA_CALLABLE vec3<T> operator*(T rhs) const{
  159 + vec3<T> result;
  160 + result.ptr[0] = ptr[0] * rhs;
  161 + result.ptr[1] = ptr[1] * rhs;
  162 + result.ptr[2] = ptr[2] * rhs;
  163 + return result;
  164 + }
  165 +
  166 + /// Arithmetic scalar division operator
  167 +
  168 + /// @param rhs is the right-hand-side operator for the subtraction
  169 + CUDA_CALLABLE vec3<T> operator/(T rhs) const{
  170 + return (*this) * ((T)1.0/rhs);
  171 + }
  172 +
  173 + /// Multiplication by a scalar, followed by assignment
  174 + CUDA_CALLABLE vec3<T> operator*=(T rhs){
  175 + ptr[0] = ptr[0] * rhs;
  176 + ptr[1] = ptr[1] * rhs;
  177 + ptr[2] = ptr[2] * rhs;
  178 + return *this;
  179 + }
  180 +
  181 + /// Addition and assignment
  182 + CUDA_CALLABLE vec3<T> operator+=(vec3<T> rhs){
  183 + ptr[0] = ptr[0] + rhs;
  184 + ptr[1] = ptr[1] + rhs;
  185 + ptr[2] = ptr[2] + rhs;
  186 + return *this;
  187 + }
  188 +
  189 + /// Assign a scalar to all values
  190 + CUDA_CALLABLE vec3<T> & operator=(T rhs){
  191 + ptr[0] = ptr[0] = rhs;
  192 + ptr[1] = ptr[1] = rhs;
  193 + ptr[2] = ptr[2] = rhs;
  194 + return *this;
  195 + }
  196 +
  197 + /// Casting and assignment
  198 + template<typename Y>
  199 + CUDA_CALLABLE vec3<T> & operator=(vec3<Y> rhs){
  200 + ptr[0] = (T)rhs.ptr[0];
  201 + ptr[1] = (T)rhs.ptr[1];
  202 + ptr[2] = (T)rhs.ptr[2];
  203 + return *this;
  204 + }
  205 +
  206 + /// Unary minus (returns the negative of the vector)
  207 + CUDA_CALLABLE vec3<T> operator-() const{
  208 + vec3<T> result;
  209 + result.ptr[0] = -ptr[0];
  210 + result.ptr[1] = -ptr[1];
  211 + result.ptr[2] = -ptr[2];
  212 + return result;
  213 + }
  214 +
  215 +
  216 + /// Outputs the vector as a string
  217 + std::string str() const{
  218 + std::stringstream ss;
  219 +
  220 + size_t N = size();
  221 +
  222 + ss<<"[";
  223 + for(size_t i=0; i<N; i++)
  224 + {
  225 + ss<<at(i);
  226 + if(i != N-1)
  227 + ss<<", ";
  228 + }
  229 + ss<<"]";
  230 +
  231 + return ss.str();
  232 + }
  233 + }; //end class triple
  234 +} //end namespace stim
  235 +
  236 +/// Multiply a vector by a constant when the vector is on the right hand side
  237 +template <typename T>
  238 +stim::vec3<T> operator*(T lhs, stim::vec3<T> rhs){
  239 + return rhs * lhs;
  240 +}
  241 +
  242 +#endif
0 243 \ No newline at end of file
... ...
stim/math/vector.h
1   -#ifndef RTS_VECTOR_H
2   -#define RTS_VECTOR_H
  1 +#ifndef STIM_VECTOR_H
  2 +#define STIM_VECTOR_H
3 3  
4 4 #include <iostream>
5 5 #include <cmath>
... ... @@ -11,8 +11,6 @@
11 11 namespace stim
12 12 {
13 13  
14   -
15   -
16 14 template <class T>
17 15 struct vec : public std::vector<T>
18 16 {
... ...
stim/optics/planewave.h
1   -#ifndef RTS_PLANEWAVE
2   -#define RTS_PLANEWAVE
  1 +#ifndef STIM_PLANEWAVE_H
  2 +#define STIM_PLANEWAVE_H
3 3  
4 4 #include <string>
5 5 #include <sstream>
  6 +#include <cmath>
6 7  
7 8 #include "../math/vector.h"
8 9 #include "../math/quaternion.h"
9 10 #include "../math/constants.h"
10 11 #include "../math/plane.h"
11   -#include "../cuda/callable.h"
12   -
13   -/*Basic conversions used here (assuming a vacuum)
14   - lambda =
15   -*/
  12 +#include "../math/complex.h"
16 13  
17 14 namespace stim{
  15 + namespace optics{
  16 +
  17 + /// evaluate the scalar field produced by a plane wave at a point (x, y, z)
  18 +
  19 + /// @param x is the x-coordinate of the point
  20 + /// @param y is the y-coordinate of the point
  21 + /// @param z is the z-coordinate of the point
  22 + /// @param A is the amplitude of the plane wave, specifically the field at (0, 0, 0)
  23 + /// @param kx is the k-vector component in the x direction
  24 + /// @param ky is the k-vector component in the y direction
  25 + /// @param kz is the k-vector component in the z direction
  26 + template<typename T>
  27 + stim::complex<T> planewave_scalar(T x, T y, T z, stim::complex<T> A, T kx, T ky, T kz){
  28 + T d = x * kx + y * ky + z * kz; //calculate the dot product between k and p = (x, y, z) to find the distance p is along the propagation direction
  29 + stim::complex<T> di = stim::complex<T>(0, d); //calculate the phase shift that will have to be applied to propagate the wave distance d
  30 + return A * exp(di); //multiply the phase term by the amplitude at (0, 0, 0) to propagate the wave to p
  31 + }
  32 +
  33 + /// evaluate the scalar field produced by a plane wave at several positions
  34 +
  35 + /// @param field is a pre-allocated block of memory that will store the complex field at all points
  36 + /// @param N is the number of field values to be evaluated
  37 + /// @param x is a set of x coordinates defining positions within the field (NULL implies that all values are zero)
  38 + /// @param y is a set of y coordinates defining positions within the field (NULL implies that all values are zero)
  39 + /// @param z is a set of z coordinates defining positions within the field (NULL implies that all values are zero)
  40 + /// @param A is the amplitude of the plane wave, specifically the field at (0, 0, 0)
  41 + /// @param kx is the k-vector component in the x direction
  42 + /// @param ky is the k-vector component in the y direction
  43 + /// @param kz is the k-vector component in the z direction
  44 + template<typename T>
  45 + void cpu_planewave_scalar(stim::complex<T>* field, size_t N, T* x, T* y = NULL, T* z = NULL, stim::complex<T> A = 1.0, T kx = 0.0, T ky = 0.0, T kz = 0.0){
  46 + T px, py, pz;
  47 + for(size_t i = 0; i < N; i++){ // for each element in the array
  48 + (x == NULL) ? px = 0 : px = x[i]; // test for NULL values
  49 + (y == NULL) ? py = 0 : py = y[i];
  50 + (z == NULL) ? pz = 0 : pz = z[i];
  51 +
  52 + field[i] = planewave_scalar(px, py, pz, A, kx, ky, kz); // call the single-value plane wave function
  53 + }
  54 + }
18 55  
19 56 template<typename T>
20 57 class planewave{
21 58  
22 59 protected:
23 60  
24   - vec<T> k; //k = tau / lambda
25   - vec< complex<T> > E0; //amplitude
26   - //T phi;
27   -
28   - CUDA_CALLABLE planewave<T> bend(rts::vec<T> kn) const{
  61 + stim::vec<T> k; //k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
  62 + stim::vec< stim::complex<T> > E0; //amplitude (for a scalar plane wave, only E0[0] is used)
29 63  
30   - vec<T> kn_hat = kn.norm(); //normalize the new k
31   - vec<T> k_hat = k.norm(); //normalize the current k
  64 + /// Bend a plane wave via refraction, given that the new propagation direction is known
  65 + CUDA_CALLABLE planewave<T> bend(stim::vec<T> kn) const{
32 66  
33   - //std::cout<<"PLANE WAVE BENDING------------------"<<std::endl;
34   - //std::cout<<"kn_hat: "<<kn_hat<<" k_hat: "<<k_hat<<std::endl;
  67 + stim::vec<T> kn_hat = kn.norm(); //normalize the new k
  68 + stim::vec<T> k_hat = k.norm(); //normalize the current k
35 69  
36   - planewave<T> new_p; //create a new plane wave
  70 + planewave<T> new_p; //create a new plane wave
37 71  
38   - //if kn is equal to k or -k, handle the degenerate case
39   - T k_dot_kn = k_hat.dot(kn_hat);
  72 + T k_dot_kn = k_hat.dot(kn_hat); //if kn is equal to k or -k, handle the degenerate case
40 73  
41 74 //if k . n < 0, then the bend is a reflection
42   - //flip k_hat
43   - if(k_dot_kn < 0) k_hat = -k_hat;
  75 + if(k_dot_kn < 0) k_hat = -k_hat; //flip k_hat
44 76  
45   - //std::cout<<"k dot kn: "<<k_dot_kn<<std::endl;
46   -
47   - //std::cout<<"k_dot_kn: "<<k_dot_kn<<std::endl;
48 77 if(k_dot_kn == -1){
49 78 new_p.k = -k;
50 79 new_p.E0 = E0;
... ... @@ -56,28 +85,11 @@ protected:
56 85 return new_p;
57 86 }
58 87  
59   - vec<T> r = k_hat.cross(kn_hat); //compute the rotation vector
60   -
61   - //std::cout<<"r: "<<r<<std::endl;
62   -
63   - T theta = asin(r.len()); //compute the angle of the rotation about r
64   -
65   -
66   -
67   - //deal with a zero vector (both k and kn point in the same direction)
68   - //if(theta == (T)0)
69   - //{
70   - // new_p = *this;
71   - // return new_p;
72   - //}
73   -
74   - //create a quaternion to capture the rotation
75   - quaternion<T> q;
76   - q.CreateRotation(theta, r.norm());
77   -
78   - //apply the rotation to E0
79   - vec< complex<T> > E0n = q.toMatrix3() * E0;
80   -
  88 + vec<T> r = k_hat.cross(kn_hat); //compute the rotation vector
  89 + T theta = asin(r.len()); //compute the angle of the rotation about r
  90 + quaternion<T> q; //create a quaternion to capture the rotation
  91 + q.CreateRotation(theta, r.norm());
  92 + vec< complex<T> > E0n = q.toMatrix3() * E0; //apply the rotation to E0
81 93 new_p.k = kn_hat * kmag();
82 94 new_p.E0 = E0n;
83 95  
... ... @@ -86,16 +98,9 @@ protected:
86 98  
87 99 public:
88 100  
89   -
90   - ///constructor: create a plane wave propagating along z, polarized along x
91   - /*planewave(T lambda = (T)1)
92   - {
93   - k = rts::vec<T>(0, 0, 1) * (TAU/lambda);
94   - E0 = rts::vec<T>(1, 0, 0);
95   - }*/
96   - ///constructor: create a plane wave propagating along k, polarized along _E0, at frequency _omega
97   - CUDA_CALLABLE planewave(vec<T> kvec = rts::vec<T>(0, 0, rtsTAU),
98   - vec< complex<T> > E = rts::vec<T>(1, 0, 0), T phase = 0)
  101 + ///constructor: create a plane wave propagating along k
  102 + CUDA_CALLABLE planewave(vec<T> kvec = stim::vec<T>(0, 0, stim::TAU),
  103 + vec< complex<T> > E = stim::vec<T>(1, 0, 0))
99 104 {
100 105 //phi = phase;
101 106  
... ... @@ -107,27 +112,23 @@ public:
107 112 else{
108 113 vec< complex<T> > s = (k_hat.cross(E)).norm(); //compute an orthogonal side vector
109 114 vec< complex<T> > E_hat = (s.cross(k)).norm(); //compute a normalized E0 direction vector
110   - E0 = E_hat * E_hat.dot(E); //compute the projection of _E0 onto E0_hat
  115 + E0 = E_hat;// * E_hat.dot(E); //compute the projection of _E0 onto E0_hat
111 116 }
112 117  
113 118 E0 = E0 * exp( complex<T>(0, phase) );
114 119 }
115 120  
116 121 ///multiplication operator: scale E0
117   - CUDA_CALLABLE planewave<T> & operator* (const T & rhs)
118   - {
119   -
  122 + CUDA_CALLABLE planewave<T> & operator* (const T & rhs){
120 123 E0 = E0 * rhs;
121 124 return *this;
122 125 }
123 126  
124   - CUDA_CALLABLE T lambda() const
125   - {
126   - return rtsTAU / k.len();
  127 + CUDA_CALLABLE T lambda() const{
  128 + return stim::TAU / k.len();
127 129 }
128 130  
129   - CUDA_CALLABLE T kmag() const
130   - {
  131 + CUDA_CALLABLE T kmag() const{
131 132 return k.len();
132 133 }
133 134  
... ... @@ -139,14 +140,11 @@ public:
139 140 return k;
140 141 }
141 142  
142   - /*CUDA_CALLABLE T phase(){
143   - return phi;
  143 + /// calculate the value of the field produced by the plane wave given a three-dimensional position
  144 + CUDA_CALLABLE vec< complex<T> > pos(T x, T y, T z){
  145 + return pos( stim::vec<T>(x, y, z) );
144 146 }
145 147  
146   - CUDA_CALLABLE void phase(T p){
147   - phi = p;
148   - }*/
149   -
150 148 CUDA_CALLABLE vec< complex<T> > pos(vec<T> p = vec<T>(0, 0, 0)){
151 149 vec< complex<T> > result;
152 150  
... ... @@ -166,18 +164,32 @@ public:
166 164 return planewave<T>(k * (nt / ni), E0);
167 165 }
168 166  
169   - CUDA_CALLABLE planewave<T> refract(rts::vec<T> kn) const
170   - {
  167 + CUDA_CALLABLE planewave<T> refract(stim::vec<T> kn) const{
171 168 return bend(kn);
172 169 }
173 170  
174   - void scatter(rts::plane<T> P, T nr, planewave<T> &r, planewave<T> &t){
  171 + /// Calculate the result of a plane wave hitting an interface between two refractive indices
  172 +
  173 + /// @param P is a plane representing the position and orientation of the surface
  174 + /// @param n0 is the refractive index outside of the surface (in the direction of the normal)
  175 + /// @param n1 is the refractive index inside the surface (in the direction away from the normal)
  176 + /// @param r is the reflected component of the plane wave
  177 + /// @param t is the transmitted component of the plane wave
  178 + void scatter(stim::plane<T> P, T n0, T n1, planewave<T> &r, planewave<T> &t){
  179 + scatter(P, n1/n0, r, t);
  180 + }
  181 +
  182 + /// Calculate the scattering result when nr = n1/n0
  183 +
  184 + /// @param P is a plane representing the position and orientation of the surface
  185 + /// @param r is the ration n1/n0
  186 + /// @param n1 is the refractive index inside the surface (in the direction away from the normal)
  187 + /// @param r is the reflected component of the plane wave
  188 + /// @param t is the transmitted component of the plane wave
  189 + void scatter(stim::plane<T> P, T nr, planewave<T> &r, planewave<T> &t){
175 190  
176 191 int facing = P.face(k); //determine which direction the plane wave is coming in
177 192  
178   - //if(facing == 0) //if the wave is tangent to the plane, return an identical wave
179   - // return *this;
180   - //else
181 193 if(facing == -1){ //if the wave hits the back of the plane, invert the plane and nr
182 194 P = P.flip(); //flip the plane
183 195 nr = 1/nr; //invert the refractive index (now nr = n0/n1)
... ... @@ -192,7 +204,7 @@ public:
192 204 bool tir = false; //flag for total internal reflection
193 205 if(theta_t != theta_t){
194 206 tir = true;
195   - theta_t = rtsPI / (T)2;
  207 + theta_t = stim::PI / (T)2;
196 208 }
197 209  
198 210 //handle the degenerate case where theta_i is 0 (the plane wave hits head-on)
... ... @@ -205,17 +217,10 @@ public:
205 217 vec< complex<T> > Et = E0 * tp;
206 218 T phase_t = P.p().dot(k - kt); //compute the phase offset
207 219 T phase_r = P.p().dot(k - kr);
208   - //std::cout<<"Degeneracy: Head-On"<<std::endl;
209   - //std::cout<<"rs: "<<rp<<" rp: "<<rp<<" ts: "<<tp<<" tp: "<<tp<<std::endl;
210   - //std::cout<<"phase r: "<<phase_r<<" phase t: "<<phase_t<<std::endl;
211 220  
212 221 //create the plane waves
213 222 r = planewave<T>(kr, Er, phase_r);
214 223 t = planewave<T>(kt, Et, phase_t);
215   -
216   - //std::cout<<"i + r: "<<pos()[0] + r.pos()[0]<<pos()[1] + r.pos()[1]<<pos()[2] + r.pos()[2]<<std::endl;
217   - //std::cout<<"t: "<<t.pos()[0]<<t.pos()[1]<<t.pos()[2]<<std::endl;
218   - //std::cout<<"--------------------------------"<<std::endl;
219 224 return;
220 225 }
221 226  
... ... @@ -245,11 +250,9 @@ public:
245 250  
246 251 //compute the magnitude of the p- and s-polarized components of the incident E vector
247 252 complex<T> Ei_s = E0.dot(x_hat);
248   - //int sgn = (0 < E0.dot(y_hat)) - (E0.dot(y_hat) < 0);
249 253 int sgn = E0.dot(y_hat).sgn();
250 254 vec< complex<T> > cx_hat = x_hat;
251 255 complex<T> Ei_p = ( E0 - cx_hat * Ei_s ).len() * sgn;
252   - //T Ei_p = ( E0 - x_hat * Ei_s ).len();
253 256 //compute the magnitude of the p- and s-polarized components of the reflected E vector
254 257 complex<T> Er_s = Ei_s * rs;
255 258 complex<T> Er_p = Ei_p * rp;
... ... @@ -257,14 +260,6 @@ public:
257 260 complex<T> Et_s = Ei_s * ts;
258 261 complex<T> Et_p = Ei_p * tp;
259 262  
260   - //std::cout<<"E0: "<<E0<<std::endl;
261   - //std::cout<<"E0 dot y_hat: "<<E0.dot(y_hat)<<std::endl;
262   - //std::cout<<"theta i: "<<theta_i<<" theta t: "<<theta_t<<std::endl;
263   - //std::cout<<"x_hat: "<<x_hat<<" y_hat: "<<y_hat<<" z_hat: "<<z_hat<<std::endl;
264   - //std::cout<<"Ei_s: "<<Ei_s<<" Ei_p: "<<Ei_p<<" Er_s: "<<Er_s<<" Er_p: "<<Er_p<<" Et_s: "<<Et_s<<" Et_p: "<<Et_p<<std::endl;
265   - //std::cout<<"rs: "<<rs<<" rp: "<<rp<<" ts: "<<ts<<" tp: "<<tp<<std::endl;
266   -
267   -
268 263 //compute the reflected E vector
269 264 vec< complex<T> > Er = vec< complex<T> >(y_hat * cos(theta_i) + z_hat * sin(theta_i)) * Er_p + cx_hat * Er_s;
270 265 //compute the transmitted E vector
... ... @@ -273,29 +268,12 @@ public:
273 268 T phase_t = P.p().dot(k - kt);
274 269 T phase_r = P.p().dot(k - kr);
275 270  
276   - //std::cout<<"phase r: "<<phase_r<<" phase t: "<<phase_t<<std::endl;
277   -
278   - //std::cout<<"phase: "<<phase<<std::endl;
279   -
280 271 //create the plane waves
281 272 r.k = kr;
282 273 r.E0 = Er * exp( complex<T>(0, phase_r) );
283   - //r.phi = phase_r;
284   -
285   - //t = bend(kt);
286   - //t.k = t.k * nr;
287 274  
288 275 t.k = kt;
289 276 t.E0 = Et * exp( complex<T>(0, phase_t) );
290   - //t.phi = phase_t;
291   - //std::cout<<"i: "<<str()<<std::endl;
292   - //std::cout<<"r: "<<r.str()<<std::endl;
293   - //std::cout<<"t: "<<t.str()<<std::endl;
294   -
295   - //std::cout<<"i + r: "<<pos()[0] + r.pos()[0]<<pos()[1] + r.pos()[1]<<pos()[2] + r.pos()[2]<<std::endl;
296   - //std::cout<<"t: "<<t.pos()[0]<<t.pos()[1]<<t.pos()[2]<<std::endl;
297   - //std::cout<<"--------------------------------"<<std::endl;
298   -
299 277 }
300 278  
301 279 std::string str()
... ... @@ -305,14 +283,15 @@ public:
305 283 ss<<" "<<E0<<" e^i ( "<<k<<" . r )";
306 284 return ss.str();
307 285 }
308   -};
309   -}
  286 +}; //end planewave class
  287 +} //end namespace optics
  288 +} //end namespace stim
310 289  
311 290 template <typename T>
312   -std::ostream& operator<<(std::ostream& os, rts::planewave<T> p)
  291 +std::ostream& operator<<(std::ostream& os, stim::optics::planewave<T> p)
313 292 {
314 293 os<<p.str();
315 294 return os;
316 295 }
317 296  
318 297 -#endif
  298 +#endif
319 299 \ No newline at end of file
... ...
stim/optics/scalarbeam.h 0 → 100644
  1 +#ifndef RTS_BEAM
  2 +#define RTS_BEAM
  3 +
  4 +#include "../math/vec3.h"
  5 +#include "../optics/scalarwave.h"
  6 +#include "../math/bessel.h"
  7 +#include "../math/legendre.h"
  8 +#include <vector>
  9 +
  10 +namespace stim{
  11 +
  12 + /// Function returns the value of the scalar field produced by a beam with the specified parameters
  13 +
  14 + template<typename T>
  15 + std::vector< stim::vec3<T> > generate_focusing_vectors(size_t N, stim::vec3<T> d, T NA, T NA_in = 0){
  16 +
  17 + std::vector< stim::vec3<T> > dirs(N); //allocate an array to store the focusing vectors
  18 +
  19 + ///compute the rotation operator to transform (0, 0, 1) to k
  20 + T cos_angle = d.dot(vec3<T>(0, 0, 1));
  21 + stim::matrix<T, 3> rotation;
  22 +
  23 + //if the cosine of the angle is -1, the rotation is just a flip across the z axis
  24 + if(cos_angle == -1){
  25 + rotation(2, 2) = -1;
  26 + }
  27 + else if(cos_angle != 1.0)
  28 + {
  29 + vec3<T> r_axis = vec3<T>(0, 0, 1).cross(d).norm(); //compute the axis of rotation
  30 + T angle = acos(cos_angle); //compute the angle of rotation
  31 + quaternion<T> quat; //create a quaternion describing the rotation
  32 + quat.CreateRotation(angle, r_axis);
  33 + rotation = quat.toMatrix3(); //compute the rotation matrix
  34 + }
  35 +
  36 + //find the phi values associated with the cassegrain ring
  37 + T PHI[2];
  38 + PHI[0] = (T)asin(NA);
  39 + PHI[1] = (T)asin(NA_in);
  40 +
  41 + //calculate the z-axis cylinder coordinates associated with these angles
  42 + T Z[2];
  43 + Z[0] = cos(PHI[0]);
  44 + Z[1] = cos(PHI[1]);
  45 + T range = Z[0] - Z[1];
  46 +
  47 + //draw a distribution of random phi, z values
  48 + T z, phi, theta;
  49 + //T kmag = stim::TAU / lambda;
  50 + for(int i=0; i<N; i++){ //for each sample
  51 + z = (T)((double)rand() / (double)RAND_MAX) * range + Z[1]; //find a random position on the surface of a cylinder
  52 + theta = (T)(((double)rand() / (double)RAND_MAX) * stim::TAU);
  53 + phi = acos(z); //project onto the sphere, computing phi in spherical coordinates
  54 +
  55 + //compute and store cartesian coordinates
  56 + vec3<T> spherical(1, theta, phi); //convert from spherical to cartesian coordinates
  57 + vec3<T> cart = spherical.sph2cart();
  58 + dirs[i] = rotation * cart; //create a sample vector
  59 + }
  60 + return dirs;
  61 + }
  62 +
  63 +/// Class stim::beam represents a beam of light focused at a point and composed of several plane waves
  64 +template<typename T>
  65 +class scalarbeam
  66 +{
  67 +public:
  68 + //enum beam_type {Uniform, Bartlett, Hamming, Hanning};
  69 +
  70 +private:
  71 +
  72 + T NA[2]; //numerical aperature of the focusing optics
  73 + vec3<T> f; //focal point
  74 + vec3<T> d; //propagation direction
  75 + stim::complex<T> A; //beam amplitude
  76 + T lambda; //beam wavelength
  77 +public:
  78 +
  79 + ///constructor: build a default beam (NA=1.0)
  80 + scalarbeam(T wavelength = 1, stim::complex<T> amplitude = 1, vec3<T> focal_point = vec3<T>(0, 0, 0), vec3<T> direction = vec3<T>(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){
  81 + lambda = wavelength;
  82 + A = amplitude;
  83 + f = focal_point;
  84 + d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on)
  85 + NA[0] = numerical_aperture;
  86 + NA[1] = center_obsc;
  87 + }
  88 +
  89 + ///Numerical Aperature functions
  90 + void setNA(T na)
  91 + {
  92 + NA[0] = (T)0;
  93 + NA[1] = na;
  94 + }
  95 + void setNA(T na0, T na1)
  96 + {
  97 + NA[0] = na0;
  98 + NA[1] = na1;
  99 + }
  100 +
  101 + //Monte-Carlo decomposition into plane waves
  102 + std::vector< scalarwave<T> > mc(size_t N = 100000) const{
  103 +
  104 + std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus
  105 + std::vector< scalarwave<T> > samples(N); //create a vector of plane waves
  106 + T kmag = (T)stim::TAU / lambda; //calculate the wavenumber
  107 + stim::complex<T> apw; //allocate space for the amplitude at the focal point
  108 + stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction
  109 + for(size_t i=0; i<N; i++){ //for each sample
  110 + kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave
  111 + apw = exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave
  112 + samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction
  113 + }
  114 +
  115 + return samples;
  116 + }
  117 +
  118 + /// Calculate the field at a given point
  119 + /// @param x is the x-coordinate of the field point
  120 + /// @O is the approximation accuracy
  121 + stim::complex<T> field(T x, T y, T z, size_t O){
  122 + std::vector< scalarwave<T> > W = mc(O);
  123 + T result = 0; //initialize the result to zero (0)
  124 + for(size_t i = 0; i < O; i++){ //for each plane wave
  125 + result += W[i].pos(x, y, z);
  126 + }
  127 + return result;
  128 + }
  129 +
  130 + std::string str()
  131 + {
  132 + std::stringstream ss;
  133 + ss<<"Beam:"<<std::endl;
  134 + //ss<<" Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl;
  135 + ss<<" Beam Direction: "<<d<<std::endl;
  136 + if(NA[0] == 0)
  137 + ss<<" NA: "<<NA[1];
  138 + else
  139 + ss<<" NA: "<<NA[0]<<" -- "<<NA[1];
  140 +
  141 + return ss.str();
  142 + }
  143 +
  144 +
  145 +
  146 +}; //end beam
  147 +
  148 +/// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional)
  149 +/// @param C is a pointer to Nl + 1 values where the terms will be stored
  150 +template<typename T>
  151 +CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){
  152 +
  153 + size_t table_bytes = (Nl + 1) * sizeof(T); //calculate the number of bytes required to store the terms
  154 + T cos_alpha_1 = cos(asin(NA_in)); //calculate the cosine of the angle subtended by the central obscuration
  155 + T cos_alpha_2 = cos(asin(NA)); //calculate the cosine of the angle subtended by the aperture
  156 +
  157 + // the aperture integral is computed using four individual Legendre polynomials, each a function of the angles subtended
  158 + // by the objective and central obscuration
  159 + T* Pln_a1 = (T*) malloc(table_bytes);
  160 + stim::legendre<T>(Nl-1, cos_alpha_1, &Pln_a1[1]);
  161 + Pln_a1[0] = 1;
  162 +
  163 + T* Pln_a2 = (T*) malloc(table_bytes);
  164 + stim::legendre<T>(Nl-1, cos_alpha_2, &Pln_a2[1]);
  165 + Pln_a2[0] = 1;
  166 +
  167 + T* Plp_a1 = (T*) malloc(table_bytes+sizeof(T));
  168 + stim::legendre<T>(Nl+1, cos_alpha_1, Plp_a1);
  169 +
  170 + T* Plp_a2 = (T*) malloc(table_bytes+sizeof(T));
  171 + stim::legendre<T>(Nl+1, cos_alpha_2, Plp_a2);
  172 +
  173 + for(size_t l = 0; l <= Nl; l++){
  174 + C[l] = Plp_a1[l+1] - Plp_a2[l+1] - Pln_a1[l] + Pln_a2[l];
  175 + }
  176 +
  177 + free(Pln_a1);
  178 + free(Pln_a2);
  179 + free(Plp_a1);
  180 + free(Plp_a2);
  181 +}
  182 +
  183 +/// performs linear interpolation into a look-up table
  184 +template<typename T>
  185 +T lut_lookup(T* lut, T val, size_t N, T min_val, T delta, size_t stride = 0){
  186 + size_t idx = (size_t)((val - min_val) / delta);
  187 + T alpha = val - idx * delta + min_val;
  188 +
  189 + if(alpha == 0) return lut[idx];
  190 + else return lut[idx * stride] * (1 - alpha) + lut[ (idx+1) * stride] * alpha;
  191 +}
  192 +
  193 +template<typename T>
  194 +void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){
  195 + T k = stim::TAU / lambda;
  196 +
  197 + T* C = (T*) malloc( (Nl + 1) * sizeof(T) ); //allocate space for the aperture integral terms
  198 + cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms
  199 + memset(F, 0, N * sizeof(stim::complex<T>));
  200 +#ifdef NO_CUDA
  201 + memset(F, 0, N * sizeof(stim::complex<T>));
  202 + T jl, Pl, kr, cos_phi;
  203 +
  204 + double vm;
  205 + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
  206 + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
  207 + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
  208 + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
  209 +
  210 + T* Pl_cos_phi = (T*) malloc((Nl + 1) * sizeof(T));
  211 +
  212 + for(size_t n = 0; n < N; n++){ //for each point in the field
  213 + kr = k * r[n]; //calculate kr (the optical distance between the focal point and p)
  214 + cos_phi = std::cos(phi[n]); //calculate the cosine of phi
  215 + stim::bessjyv_sph<double>(Nl, kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
  216 + stim::legendre<T>(Nl, cos_phi, Pl_cos_phi); //calculate the [0 Nl] legendre polynomials for this point
  217 +
  218 + for(int l = 0; l <= Nl; l++){
  219 + jl = (T)jv[l];
  220 + Pl = Pl_cos_phi[l];
  221 + F[n] += pow(complex<T>(0, 1), l) * jl * Pl * C[l];
  222 + }
  223 + F[n] *= A * stim::TAU;
  224 + }
  225 +
  226 + free(C);
  227 + free(Pl_cos_phi);
  228 +#else
  229 + T min_r = r[0];
  230 + T max_r = r[0];
  231 + for(size_t i = 0; i < N; i++){ //find the minimum and maximum values of r (min and max distance from the focal point)
  232 + if(r[i] < min_r) min_r = r[i];
  233 + if(r[i] > max_r) max_r = r[i];
  234 + }
  235 + T min_kr = k * min_r;
  236 + T max_kr = k * max_r;
  237 +
  238 + //temporary variables
  239 + double vm;
  240 + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
  241 + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
  242 + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
  243 + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
  244 +
  245 + size_t Nlut = (size_t)sqrt(N) * 2;
  246 + T* bessel_lut = (T*) malloc(sizeof(T) * (Nl+1) * Nlut);
  247 + T delta_kr = (max_kr - min_kr) / (Nlut-1);
  248 + for(size_t kri = 0; kri < Nlut; kri++){
  249 + stim::bessjyv_sph<double>(Nl, min_kr + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
  250 + for(size_t l = 0; l <= Nl; l++){
  251 + bessel_lut[kri * (Nl + 1) + l] = (T)jv[l];
  252 + }
  253 + }
  254 +
  255 + T* Pl_cos_phi = (T*) malloc((Nl + 1) * sizeof(T));
  256 + T kr, cos_phi, jl, Pl;
  257 + for(size_t n = 0; n < N; n++){ //for each point in the field
  258 + kr = k * r[n]; //calculate kr (the optical distance between the focal point and p)
  259 + cos_phi = std::cos(phi[n]); //calculate the cosine of phi
  260 + stim::legendre<T>(Nl, cos_phi, Pl_cos_phi); //calculate the [0 Nl] legendre polynomials for this point
  261 +
  262 + for(int l = 0; l <= Nl; l++){
  263 + jl = lut_lookup<T>(&bessel_lut[l], kr, Nlut, min_kr, delta_kr, Nl+1);
  264 + Pl = Pl_cos_phi[l];
  265 + F[n] += pow(complex<T>(0, 1), l) * jl * Pl * C[l];
  266 + }
  267 + F[n] *= A * stim::TAU;
  268 + }
  269 +#endif
  270 +}
  271 +
  272 +
  273 +template<typename T>
  274 +void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){
  275 + T* r = (T*) malloc(N * sizeof(T)); //allocate space for p in spherical coordinates
  276 + T* phi = (T*) malloc(N * sizeof(T)); // only r and phi are necessary (the scalar PSF is symmetric about theta)
  277 +
  278 + stim::vec3<T> p, ps;
  279 + for(size_t i = 0; i < N; i++){
  280 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  281 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  282 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  283 +
  284 + ps = p.cart2sph(); //convert from cartesian to spherical coordinates
  285 + r[i] = ps[0]; //store r
  286 + phi[i] = ps[2]; //phi = [0 pi]
  287 + }
  288 +
  289 + cpu_scalar_psf(F, N, r, phi, lambda, A, f, NA, NA_in, Nl); //call the spherical coordinate CPU function
  290 +
  291 + free(r);
  292 + free(phi);
  293 +}
  294 +
  295 +} //end namespace stim
  296 +
  297 +#endif
... ...
stim/optics/scalarwave.h 0 → 100644
  1 +#ifndef STIM_SCALARWAVE_H
  2 +#define STIM_SCALARWAVE_H
  3 +
  4 +
  5 +#include <string>
  6 +#include <sstream>
  7 +#include <cmath>
  8 +
  9 +//#include "../math/vector.h"
  10 +#include "../math/vec3.h"
  11 +#include "../math/quaternion.h"
  12 +#include "../math/constants.h"
  13 +#include "../math/plane.h"
  14 +#include "../math/complex.h"
  15 +
  16 +//CUDA
  17 +#include "../cuda/cudatools/devices.h"
  18 +#include "../cuda/cudatools/error.h"
  19 +#include "../cuda/sharedmem.cuh"
  20 +
  21 +namespace stim{
  22 +
  23 +template<typename T>
  24 +class scalarwave{
  25 +
  26 +protected:
  27 +
  28 + stim::vec3<T> k; //k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
  29 + stim::complex<T> E0; //amplitude
  30 +
  31 + /// Bend a plane wave via refraction, given that the new propagation direction is known
  32 + CUDA_CALLABLE scalarwave<T> bend(stim::vec3<T> kn) const{
  33 + return scalarwave<T>(kn.norm() * kmag(), E0);
  34 + }
  35 +
  36 +public:
  37 +
  38 + ///constructor: create a plane wave propagating along k
  39 + CUDA_CALLABLE scalarwave(vec3<T> kvec = stim::vec3<T>(0, 0, (T)stim::TAU), complex<T> E = 1){
  40 + k = kvec;
  41 + E0 = E;
  42 + }
  43 +
  44 + CUDA_CALLABLE scalarwave(T kx, T ky, T kz, complex<T> E = 1){
  45 + k = vec3<T>(kx, ky, kz);
  46 + E0 = E;
  47 + }
  48 +
  49 + ///multiplication operator: scale E0
  50 + CUDA_CALLABLE scalarwave<T> & operator* (const T & rhs){
  51 + E0 = E0 * rhs;
  52 + return *this;
  53 + }
  54 +
  55 + CUDA_CALLABLE T lambda() const{
  56 + return stim::TAU / k.len();
  57 + }
  58 +
  59 + CUDA_CALLABLE T kmag() const{
  60 + return k.len();
  61 + }
  62 +
  63 + CUDA_CALLABLE vec3< complex<T> > E(){
  64 + return E0;
  65 + }
  66 +
  67 + CUDA_CALLABLE vec3<T> kvec(){
  68 + return k;
  69 + }
  70 +
  71 + /// calculate the value of the field produced by the plane wave given a three-dimensional position
  72 + CUDA_CALLABLE complex<T> pos(T x, T y, T z){
  73 + return pos( stim::vec3<T>(x, y, z) );
  74 + }
  75 +
  76 + CUDA_CALLABLE complex<T> pos(vec3<T> p = vec3<T>(0, 0, 0)){
  77 + return E0 * exp(complex<T>(0, k.dot(p)));
  78 + }
  79 +
  80 + //scales k based on a transition from material ni to material nt
  81 + CUDA_CALLABLE scalarwave<T> n(T ni, T nt){
  82 + return scalarwave<T>(k * (nt / ni), E0);
  83 + }
  84 +
  85 + CUDA_CALLABLE scalarwave<T> refract(stim::vec3<T> kn) const{
  86 + return bend(kn);
  87 + }
  88 +
  89 + /// Calculate the result of a plane wave hitting an interface between two refractive indices
  90 +
  91 + /// @param P is a plane representing the position and orientation of the surface
  92 + /// @param n0 is the refractive index outside of the surface (in the direction of the normal)
  93 + /// @param n1 is the refractive index inside the surface (in the direction away from the normal)
  94 + /// @param r is the reflected component of the plane wave
  95 + /// @param t is the transmitted component of the plane wave
  96 + void scatter(stim::plane<T> P, T n0, T n1, scalarwave<T> &r, scalarwave<T> &t){
  97 + scatter(P, n1/n0, r, t);
  98 + }
  99 +
  100 + /// Calculate the scattering result when nr = n1/n0
  101 +
  102 + /// @param P is a plane representing the position and orientation of the surface
  103 + /// @param r is the ration n1/n0
  104 + /// @param n1 is the refractive index inside the surface (in the direction away from the normal)
  105 + /// @param r is the reflected component of the plane wave
  106 + /// @param t is the transmitted component of the plane wave
  107 + void scatter(stim::plane<T> P, T nr, scalarwave<T> &r, scalarwave<T> &t){
  108 + /*
  109 + int facing = P.face(k); //determine which direction the plane wave is coming in
  110 +
  111 + if(facing == -1){ //if the wave hits the back of the plane, invert the plane and nr
  112 + P = P.flip(); //flip the plane
  113 + nr = 1/nr; //invert the refractive index (now nr = n0/n1)
  114 + }
  115 +
  116 + //use Snell's Law to calculate the transmitted angle
  117 + T cos_theta_i = k.norm().dot(-P.norm()); //compute the cosine of theta_i
  118 + T theta_i = acos(cos_theta_i); //compute theta_i
  119 + T sin_theta_t = (1/nr) * sin(theta_i); //compute the sine of theta_t using Snell's law
  120 + T theta_t = asin(sin_theta_t); //compute the cosine of theta_t
  121 +
  122 + bool tir = false; //flag for total internal reflection
  123 + if(theta_t != theta_t){
  124 + tir = true;
  125 + theta_t = stim::PI / (T)2;
  126 + }
  127 +
  128 + //handle the degenerate case where theta_i is 0 (the plane wave hits head-on)
  129 + if(theta_i == 0){
  130 + T rp = (1 - nr) / (1 + nr); //compute the Fresnel coefficients
  131 + T tp = 2 / (1 + nr);
  132 + vec3<T> kr = -k;
  133 + vec3<T> kt = k * nr; //set the k vectors for theta_i = 0
  134 + vec3< complex<T> > Er = E0 * rp; //compute the E vectors
  135 + vec3< complex<T> > Et = E0 * tp;
  136 + T phase_t = P.p().dot(k - kt); //compute the phase offset
  137 + T phase_r = P.p().dot(k - kr);
  138 +
  139 + //create the plane waves
  140 + r = planewave<T>(kr, Er, phase_r);
  141 + t = planewave<T>(kt, Et, phase_t);
  142 + return;
  143 + }
  144 +
  145 +
  146 + //compute the Fresnel coefficients
  147 + T rp, rs, tp, ts;
  148 + rp = tan(theta_t - theta_i) / tan(theta_t + theta_i);
  149 + rs = sin(theta_t - theta_i) / sin(theta_t + theta_i);
  150 +
  151 + if(tir){
  152 + tp = ts = 0;
  153 + }
  154 + else{
  155 + tp = ( 2 * sin(theta_t) * cos(theta_i) ) / ( sin(theta_t + theta_i) * cos(theta_t - theta_i) );
  156 + ts = ( 2 * sin(theta_t) * cos(theta_i) ) / sin(theta_t + theta_i);
  157 + }
  158 +
  159 + //compute the coordinate space for the plane of incidence
  160 + vec3<T> z_hat = -P.norm();
  161 + vec3<T> y_hat = P.parallel(k).norm();
  162 + vec3<T> x_hat = y_hat.cross(z_hat).norm();
  163 +
  164 + //compute the k vectors for r and t
  165 + vec3<T> kr, kt;
  166 + kr = ( y_hat * sin(theta_i) - z_hat * cos(theta_i) ) * kmag();
  167 + kt = ( y_hat * sin(theta_t) + z_hat * cos(theta_t) ) * kmag() * nr;
  168 +
  169 + //compute the magnitude of the p- and s-polarized components of the incident E vector
  170 + complex<T> Ei_s = E0.dot(x_hat);
  171 + int sgn = E0.dot(y_hat).sgn();
  172 + vec3< complex<T> > cx_hat = x_hat;
  173 + complex<T> Ei_p = ( E0 - cx_hat * Ei_s ).len() * sgn;
  174 + //compute the magnitude of the p- and s-polarized components of the reflected E vector
  175 + complex<T> Er_s = Ei_s * rs;
  176 + complex<T> Er_p = Ei_p * rp;
  177 + //compute the magnitude of the p- and s-polarized components of the transmitted E vector
  178 + complex<T> Et_s = Ei_s * ts;
  179 + complex<T> Et_p = Ei_p * tp;
  180 +
  181 + //compute the reflected E vector
  182 + vec3< complex<T> > Er = vec3< complex<T> >(y_hat * cos(theta_i) + z_hat * sin(theta_i)) * Er_p + cx_hat * Er_s;
  183 + //compute the transmitted E vector
  184 + vec3< complex<T> > Et = vec3< complex<T> >(y_hat * cos(theta_t) - z_hat * sin(theta_t)) * Et_p + cx_hat * Et_s;
  185 +
  186 + T phase_t = P.p().dot(k - kt);
  187 + T phase_r = P.p().dot(k - kr);
  188 +
  189 + //create the plane waves
  190 + r.k = kr;
  191 + r.E0 = Er * exp( complex<T>(0, phase_r) );
  192 +
  193 + t.k = kt;
  194 + t.E0 = Et * exp( complex<T>(0, phase_t) );
  195 + */
  196 + }
  197 +
  198 + std::string str()
  199 + {
  200 + std::stringstream ss;
  201 + ss<<"Plane Wave:"<<std::endl;
  202 + ss<<" "<<E0<<" e^i ( "<<k<<" . r )";
  203 + return ss.str();
  204 + }
  205 +}; //end planewave class
  206 +
  207 +
  208 +/// CUDA kernel for computing the field produced by a batch of plane waves at an array of locations
  209 +template<typename T>
  210 +__global__ void cuda_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t n_waves){
  211 + extern __shared__ stim::scalarwave<T> shared_W[]; //declare the list of waves in shared memory
  212 +
  213 + stim::cuda::sharedMemcpy(shared_W, W, n_waves, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access
  214 + __syncthreads(); //synchronize threads to insure all data is copied
  215 +
  216 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  217 + if(i >= N) return; //exit if this thread is outside the array
  218 + T px, py, pz;
  219 + (x == NULL) ? px = 0 : px = x[i]; // test for NULL values and set positions
  220 + (y == NULL) ? py = 0 : py = y[i];
  221 + (z == NULL) ? pz = 0 : pz = z[i];
  222 +
  223 + stim::complex<T> f = 0; //create a register to store the result
  224 + for(size_t w = 0; w < n_waves; w++)
  225 + f += shared_W[w].pos(px, py, pz); //evaluate the plane wave
  226 + F[i] += f; //copy the result to device memory
  227 +}
  228 +
  229 +/// evaluate a scalar wave at several points, where all arrays are on the GPU
  230 +template<typename T>
  231 +void gpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
  232 +
  233 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  234 + dim3 blocks(N / threads + 1); //calculate the optimal number of blocks
  235 + cuda_scalarwave<T><<< blocks, threads >>>(F, N, x, y, z, w); //call the kernel
  236 +}
  237 +
  238 +/// Sums a series of coherent plane waves at a specified point
  239 +/// @param field is the output array of field values corresponding to each input point
  240 +/// @param x is an array of x coordinates for the field point
  241 +/// @param y is an array of y coordinates for the field point
  242 +/// @param z is an array of z coordinates for the field point
  243 +/// @param N is the number of points in the input and output arrays
  244 +/// @param lambda is the wavelength (all coherent waves are assumed to have the same wavelength)
  245 +/// @param A is the list of amplitudes for each wave
  246 +/// @param S is the list of propagation directions for each wave
  247 +template<typename T>
  248 +void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > w_array){
  249 + size_t S = w_array.size(); //store the number of waves
  250 +#ifdef NO_CUDA
  251 + memset(F, 0, N * sizeof(stim::complex<T>));
  252 + T px, py, pz;
  253 + for(size_t i = 0; i < N; i++){ // for each element in the array
  254 + (x == NULL) ? px = 0 : px = x[i]; // test for NULL values
  255 + (y == NULL) ? py = 0 : py = y[i];
  256 + (z == NULL) ? pz = 0 : pz = z[i];
  257 +
  258 + for(size_t s = 0; s < S; s++){
  259 + F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point
  260 + }
  261 + }
  262 +#else
  263 + stim::complex<T>* dev_F; //allocate space for the field
  264 + cudaMalloc(&dev_F, N * sizeof(stim::complex<T>));
  265 + cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
  266 +
  267 + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified)
  268 + if(x != NULL){
  269 + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
  270 + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
  271 + }
  272 +
  273 + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified)
  274 + if(y != NULL){
  275 + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
  276 + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
  277 + }
  278 +
  279 + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified)
  280 + if(z != NULL){
  281 + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
  282 + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
  283 + }
  284 +
  285 + size_t wave_bytes = sizeof(stim::scalarwave<T>);
  286 + size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available
  287 + size_t array_bytes = w_array.size() * wave_bytes; //calculate the maximum number of bytes required for the planewave array
  288 + size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory
  289 + size_t num_batches = w_array.size() / max_batch + 1; //calculate the number of batches required to process all plane waves
  290 + size_t batch_bytes = min(w_array.size(), max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required
  291 +
  292 + stim::scalarwave<T>* dev_w;
  293 + HANDLE_ERROR(cudaMalloc(&dev_w, batch_bytes)); //allocate memory for a single batch of plane waves
  294 +
  295 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  296 + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  297 +
  298 + size_t batch_size; //declare a variable to store the size of the current batch
  299 + size_t waves_processed = 0; //initialize the number of waves processed to zero
  300 + while(waves_processed < w_array.size()){ //while there are still waves to be processed
  301 + batch_size = min<size_t>(max_batch, w_array.size() - waves_processed); //process either a whole batch, or whatever is left
  302 + batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
  303 + HANDLE_ERROR(cudaMemcpy(dev_w, &w_array[waves_processed], batch_bytes, cudaMemcpyHostToDevice)); //copy the plane waves into global memory
  304 + cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(dev_F, N, dev_x, dev_y, dev_z, dev_w, batch_size); //call the kernel
  305 + waves_processed += batch_size; //increment the counter indicating how many waves have been processed
  306 + }
  307 +
  308 + cudaMemcpy(F, dev_F, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
  309 +
  310 + if(x != NULL) cudaFree(dev_x); //free everything
  311 + if(y != NULL) cudaFree(dev_y);
  312 + if(z != NULL) cudaFree(dev_z);
  313 + cudaFree(dev_F);
  314 + cudaFree(dev_w);
  315 +
  316 +#endif
  317 +}
  318 +
  319 +template<typename T>
  320 +void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
  321 + std::vector< stim::scalarwave<T> > w_array(1, w);
  322 + cpu_sum_scalarwaves(F, N, x, y, z, w_array);
  323 +}
  324 +
  325 +
  326 +/// Sums a series of coherent plane waves at a specified point
  327 +/// @param x is the x coordinate of the field point
  328 +/// @param y is the y coordinate of the field point
  329 +/// @param z is the z coordinate of the field point
  330 +/// @param lambda is the wavelength (all coherent waves are assumed to have the same wavelength)
  331 +/// @param A is the list of amplitudes for each wave
  332 +/// @param S is the list of propagation directions for each wave
  333 +template<typename T>
  334 +CUDA_CALLABLE stim::complex<T> sum_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave<T> > W){
  335 + size_t N = W.size(); //get the number of plane wave samples
  336 + stim::complex<T> field(0, 0); //initialize the field to zero (0)
  337 + stim::vec3<T> k; //allocate space for the direction vector
  338 + for(size_t i = 0; i < N; i++){
  339 + field += W[i].pos(x, y, z);
  340 + }
  341 + return field;
  342 +}
  343 +
  344 +} //end namespace stim
  345 +
  346 +template <typename T>
  347 +std::ostream& operator<<(std::ostream& os, stim::scalarwave<T> p)
  348 +{
  349 + os<<p.str();
  350 + return os;
  351