Commit 9339fbad873457047446ee3a90f52296eda250a5
1 parent
308a743c
implementing mie scattering
Showing
9 changed files
with
698 additions
and
134 deletions
Show diff stats
stim/cuda/cudatools/callable.h
@@ -2,7 +2,7 @@ | @@ -2,7 +2,7 @@ | ||
2 | 2 | ||
3 | //define the CUDA_CALLABLE macro (will prefix all members) | 3 | //define the CUDA_CALLABLE macro (will prefix all members) |
4 | #ifdef __CUDACC__ | 4 | #ifdef __CUDACC__ |
5 | -#define CUDA_CALLABLE __host__ __device__ | 5 | +#define CUDA_CALLABLE __host__ __device__ inline |
6 | #else | 6 | #else |
7 | #define CUDA_CALLABLE | 7 | #define CUDA_CALLABLE |
8 | #endif | 8 | #endif |
stim/image/image.h
@@ -58,12 +58,12 @@ class image{ | @@ -58,12 +58,12 @@ class image{ | ||
58 | 58 | ||
59 | int cv_type(){ | 59 | int cv_type(){ |
60 | if(std::is_same<T, unsigned char>::value) return CV_MAKETYPE(CV_8U, (int)C()); | 60 | if(std::is_same<T, unsigned char>::value) return CV_MAKETYPE(CV_8U, (int)C()); |
61 | - if(std::is_same<T, char>::value) return CV_MAKETYPE(CV_8S, (int)C()); | ||
62 | - if(std::is_same<T, unsigned short>::value) return CV_MAKETYPE(CV_16U, (int)C()); | ||
63 | - if(std::is_same<T, short>::value) return CV_MAKETYPE(CV_16S, (int)C()); | ||
64 | - if(std::is_same<T, int>::value) return CV_MAKETYPE(CV_32S, (int)C()); | ||
65 | - if(std::is_same<T, float>::value) return CV_MAKETYPE(CV_32F, (int)C()); | ||
66 | - if(std::is_same<T, double>::value) return CV_MAKETYPE(CV_64F, (int)C()); | 61 | + else if(std::is_same<T, char>::value) return CV_MAKETYPE(CV_8S, (int)C()); |
62 | + else if(std::is_same<T, unsigned short>::value) return CV_MAKETYPE(CV_16U, (int)C()); | ||
63 | + else if(std::is_same<T, short>::value) return CV_MAKETYPE(CV_16S, (int)C()); | ||
64 | + else if(std::is_same<T, int>::value) return CV_MAKETYPE(CV_32S, (int)C()); | ||
65 | + else if(std::is_same<T, float>::value) return CV_MAKETYPE(CV_32F, (int)C()); | ||
66 | + else if(std::is_same<T, double>::value) return CV_MAKETYPE(CV_64F, (int)C()); | ||
67 | 67 | ||
68 | std::cout<<"ERROR in stim::image::cv_type - no valid data type found"<<std::endl; | 68 | std::cout<<"ERROR in stim::image::cv_type - no valid data type found"<<std::endl; |
69 | exit(1); | 69 | exit(1); |
@@ -72,12 +72,12 @@ class image{ | @@ -72,12 +72,12 @@ class image{ | ||
72 | /// Returns the value for "white" based on the dynamic range (assumes white is 1.0 for floating point images) | 72 | /// Returns the value for "white" based on the dynamic range (assumes white is 1.0 for floating point images) |
73 | T white(){ | 73 | T white(){ |
74 | if(std::is_same<T, unsigned char>::value) return UCHAR_MAX; | 74 | if(std::is_same<T, unsigned char>::value) return UCHAR_MAX; |
75 | - if(std::is_same<T, unsigned short>::value) return SHRT_MAX; | ||
76 | - if(std::is_same<T, unsigned>::value) return UINT_MAX; | ||
77 | - if(std::is_same<T, unsigned long>::value) return ULONG_MAX; | ||
78 | - if(std::is_same<T, unsigned long long>::value) return ULLONG_MAX; | ||
79 | - if(std::is_same<T, float>::value) return 1.0f; | ||
80 | - if(std::is_same<T, double>::value) return 1.0; | 75 | + else if(std::is_same<T, unsigned short>::value) return SHRT_MAX; |
76 | + else if(std::is_same<T, unsigned>::value) return UINT_MAX; | ||
77 | + else if(std::is_same<T, unsigned long>::value) return ULONG_MAX; | ||
78 | + else if(std::is_same<T, unsigned long long>::value) return ULLONG_MAX; | ||
79 | + else if(std::is_same<T, float>::value) return 1.0f; | ||
80 | + else if(std::is_same<T, double>::value) return 1.0; | ||
81 | 81 | ||
82 | std::cout<<"ERROR in stim::image::white - no white value known for this data type"<<std::endl; | 82 | std::cout<<"ERROR in stim::image::white - no white value known for this data type"<<std::endl; |
83 | 83 | ||
@@ -120,14 +120,6 @@ public: | @@ -120,14 +120,6 @@ public: | ||
120 | free(img); | 120 | free(img); |
121 | } | 121 | } |
122 | 122 | ||
123 | - /*stim::image<T> operator=(const stim::image<T>& I){ | ||
124 | - if(&I == this) //handle self-assignment | ||
125 | - return *this; | ||
126 | - allocate(I.X(), I.Y(), I.C()); | ||
127 | - memcpy(img, I.img, bytes()); | ||
128 | - return *this; | ||
129 | - }*/ | ||
130 | - | ||
131 | stim::image<T>& operator=(const stim::image<T>& I){ | 123 | stim::image<T>& operator=(const stim::image<T>& I){ |
132 | init(); | 124 | init(); |
133 | if(&I == this) //handle self-assignment | 125 | if(&I == this) //handle self-assignment |
stim/math/bessel.h
@@ -1258,7 +1258,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | @@ -1258,7 +1258,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | ||
1258 | P a0,v0,pv0,pv1,vl,ga,gb,vg,vv,w0,w1,ya0,yak,ya1,wa; | 1258 | P a0,v0,pv0,pv1,vl,ga,gb,vg,vv,w0,w1,ya0,yak,ya1,wa; |
1259 | int j,n,k,kz,l,lb,lb0,m; | 1259 | int j,n,k,kz,l,lb,lb0,m; |
1260 | 1260 | ||
1261 | - a0 = abs(z); | 1261 | + a0 = ::abs(z); |
1262 | z1 = z; | 1262 | z1 = z; |
1263 | z2 = z*z; | 1263 | z2 = z*z; |
1264 | n = (int)v; | 1264 | n = (int)v; |
@@ -1286,7 +1286,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | @@ -1286,7 +1286,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | ||
1286 | vm = v; | 1286 | vm = v; |
1287 | return 0; | 1287 | return 0; |
1288 | } | 1288 | } |
1289 | - if (real(z1) < 0.0) z1 = -z; | 1289 | + if (::real(z1) < 0.0) z1 = -z; |
1290 | if (a0 <= 12.0) { | 1290 | if (a0 <= 12.0) { |
1291 | for (l=0;l<2;l++) { | 1291 | for (l=0;l<2;l++) { |
1292 | vl = v0+l; | 1292 | vl = v0+l; |
@@ -1295,7 +1295,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | @@ -1295,7 +1295,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | ||
1295 | for (k=1;k<=40;k++) { | 1295 | for (k=1;k<=40;k++) { |
1296 | cr *= -0.25*z2/(k*(k+vl)); | 1296 | cr *= -0.25*z2/(k*(k+vl)); |
1297 | cjvl += cr; | 1297 | cjvl += cr; |
1298 | - if (abs(cr) < abs(cjvl)*eps) break; | 1298 | + if (::abs(cr) < ::abs(cjvl)*eps) break; |
1299 | } | 1299 | } |
1300 | vg = 1.0 + vl; | 1300 | vg = 1.0 + vl; |
1301 | ga = gamma(vg); | 1301 | ga = gamma(vg); |
@@ -1348,7 +1348,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | @@ -1348,7 +1348,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | ||
1348 | for (k=1;k<=40;k++) { | 1348 | for (k=1;k<=40;k++) { |
1349 | cr *= -0.25*z2/(k*(k-vl)); | 1349 | cr *= -0.25*z2/(k*(k-vl)); |
1350 | cjvl += cr; | 1350 | cjvl += cr; |
1351 | - if (abs(cr) < abs(cjvl)*eps) break; | 1351 | + if (::abs(cr) < ::abs(cjvl)*eps) break; |
1352 | } | 1352 | } |
1353 | vg = 1.0-vl; | 1353 | vg = 1.0-vl; |
1354 | gb = gamma(vg); | 1354 | gb = gamma(vg); |
@@ -1381,16 +1381,16 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | @@ -1381,16 +1381,16 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | ||
1381 | cyv1 = M_2_PI*(cec*cjv1-1.0/z1-0.25*z1*cs1); | 1381 | cyv1 = M_2_PI*(cec*cjv1-1.0/z1-0.25*z1*cs1); |
1382 | } | 1382 | } |
1383 | } | 1383 | } |
1384 | - if (real(z) < 0.0) { | 1384 | + if (::real(z) < 0.0) { |
1385 | cfac0 = exp(pv0*cii); | 1385 | cfac0 = exp(pv0*cii); |
1386 | cfac1 = exp(pv1*cii); | 1386 | cfac1 = exp(pv1*cii); |
1387 | - if (imag(z) < 0.0) { | 1387 | + if (::imag(z) < 0.0) { |
1388 | cyv0 = cfac0*cyv0-(P)2.0*(complex<P>)cii*cos(pv0)*cjv0; | 1388 | cyv0 = cfac0*cyv0-(P)2.0*(complex<P>)cii*cos(pv0)*cjv0; |
1389 | cyv1 = cfac1*cyv1-(P)2.0*(complex<P>)cii*cos(pv1)*cjv1; | 1389 | cyv1 = cfac1*cyv1-(P)2.0*(complex<P>)cii*cos(pv1)*cjv1; |
1390 | cjv0 /= cfac0; | 1390 | cjv0 /= cfac0; |
1391 | cjv1 /= cfac1; | 1391 | cjv1 /= cfac1; |
1392 | } | 1392 | } |
1393 | - else if (imag(z) > 0.0) { | 1393 | + else if (::imag(z) > 0.0) { |
1394 | cyv0 = cyv0/cfac0+(P)2.0*(complex<P>)cii*cos(pv0)*cjv0; | 1394 | cyv0 = cyv0/cfac0+(P)2.0*(complex<P>)cii*cos(pv0)*cjv0; |
1395 | cyv1 = cyv1/cfac1+(P)2.0*(complex<P>)cii*cos(pv1)*cjv1; | 1395 | cyv1 = cyv1/cfac1+(P)2.0*(complex<P>)cii*cos(pv1)*cjv1; |
1396 | cjv0 *= cfac0; | 1396 | cjv0 *= cfac0; |
@@ -1421,7 +1421,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | @@ -1421,7 +1421,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | ||
1421 | cf2 = cf1; | 1421 | cf2 = cf1; |
1422 | cf1 = cf; | 1422 | cf1 = cf; |
1423 | } | 1423 | } |
1424 | - if (abs(cjv0) > abs(cjv1)) cs = cjv0/cf; | 1424 | + if (::abs(cjv0) > ::abs(cjv1)) cs = cjv0/cf; |
1425 | else cs = cjv1/cf2; | 1425 | else cs = cjv1/cf2; |
1426 | for (k=0;k<=n;k++) { | 1426 | for (k=0;k<=n;k++) { |
1427 | cjv[k] *= cs; | 1427 | cjv[k] *= cs; |
@@ -1433,21 +1433,21 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | @@ -1433,21 +1433,21 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | ||
1433 | } | 1433 | } |
1434 | cyv[0] = cyv0; | 1434 | cyv[0] = cyv0; |
1435 | cyv[1] = cyv1; | 1435 | cyv[1] = cyv1; |
1436 | - ya0 = abs(cyv0); | 1436 | + ya0 = ::abs(cyv0); |
1437 | lb = 0; | 1437 | lb = 0; |
1438 | cg0 = cyv0; | 1438 | cg0 = cyv0; |
1439 | cg1 = cyv1; | 1439 | cg1 = cyv1; |
1440 | for (k=2;k<=n;k++) { | 1440 | for (k=2;k<=n;k++) { |
1441 | cyk = 2.0*(v0+k-1.0)*cg1/z-cg0; | 1441 | cyk = 2.0*(v0+k-1.0)*cg1/z-cg0; |
1442 | - yak = abs(cyk); | ||
1443 | - ya1 = abs(cg0); | 1442 | + yak = ::abs(cyk); |
1443 | + ya1 = ::abs(cg0); | ||
1444 | if ((yak < ya0) && (yak< ya1)) lb = k; | 1444 | if ((yak < ya0) && (yak< ya1)) lb = k; |
1445 | cyv[k] = cyk; | 1445 | cyv[k] = cyk; |
1446 | cg0 = cg1; | 1446 | cg0 = cg1; |
1447 | cg1 = cyk; | 1447 | cg1 = cyk; |
1448 | } | 1448 | } |
1449 | lb0 = 0; | 1449 | lb0 = 0; |
1450 | - if ((lb > 4) && (imag(z) != 0.0)) { | 1450 | + if ((lb > 4) && (::imag(z) != 0.0)) { |
1451 | while(lb != lb0) { | 1451 | while(lb != lb0) { |
1452 | ch2 = cone; | 1452 | ch2 = cone; |
1453 | ch1 = czero; | 1453 | ch1 = czero; |
@@ -1470,7 +1470,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | @@ -1470,7 +1470,7 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | ||
1470 | cp21 = ch2; | 1470 | cp21 = ch2; |
1471 | if (lb == n) | 1471 | if (lb == n) |
1472 | cjv[lb+1] = 2.0*(lb+v0)*cjv[lb]/z-cjv[lb-1]; | 1472 | cjv[lb+1] = 2.0*(lb+v0)*cjv[lb]/z-cjv[lb-1]; |
1473 | - if (abs(cjv[0]) > abs(cjv[1])) { | 1473 | + if (::abs(cjv[0]) > ::abs(cjv[1])) { |
1474 | cyv[lb+1] = (cjv[lb+1]*cyv0-2.0*cp11/(M_PI*z))/cjv[0]; | 1474 | cyv[lb+1] = (cjv[lb+1]*cyv0-2.0*cp11/(M_PI*z))/cjv[0]; |
1475 | cyv[lb] = (cjv[lb]*cyv0+2.0*cp12/(M_PI*z))/cjv[0]; | 1475 | cyv[lb] = (cjv[lb]*cyv0+2.0*cp12/(M_PI*z))/cjv[0]; |
1476 | } | 1476 | } |
@@ -1495,8 +1495,8 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | @@ -1495,8 +1495,8 @@ int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv, | ||
1495 | cyl2 = cylk; | 1495 | cyl2 = cylk; |
1496 | } | 1496 | } |
1497 | for (k=2;k<=n;k++) { | 1497 | for (k=2;k<=n;k++) { |
1498 | - wa = abs(cyv[k]); | ||
1499 | - if (wa < abs(cyv[k-1])) lb = k; | 1498 | + wa = ::abs(cyv[k]); |
1499 | + if (wa < ::abs(cyv[k-1])) lb = k; | ||
1500 | } | 1500 | } |
1501 | } | 1501 | } |
1502 | } | 1502 | } |
@@ -1515,12 +1515,18 @@ int cbessjyva_sph(int v,complex<P> z,P &vm,complex<P>*cjv, | @@ -1515,12 +1515,18 @@ int cbessjyva_sph(int v,complex<P> z,P &vm,complex<P>*cjv, | ||
1515 | //first, compute the bessel functions of fractional order | 1515 | //first, compute the bessel functions of fractional order |
1516 | cbessjyva<P>(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp); | 1516 | cbessjyva<P>(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp); |
1517 | 1517 | ||
1518 | + if(z == 0){ //handle degenerate case of z = 0 | ||
1519 | + memset(cjv, 0, sizeof(P) * (v+1)); | ||
1520 | + cjv[0] = 1; | ||
1521 | + } | ||
1522 | + | ||
1518 | //iterate through each and scale | 1523 | //iterate through each and scale |
1519 | for(int n = 0; n<=v; n++) | 1524 | for(int n = 0; n<=v; n++) |
1520 | { | 1525 | { |
1521 | - | ||
1522 | - cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0)); | ||
1523 | - cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0)); | 1526 | + if(z != 0){ //handle degenerate case of z = 0 |
1527 | + cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0)); | ||
1528 | + cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0)); | ||
1529 | + } | ||
1524 | 1530 | ||
1525 | cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0)); | 1531 | cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0)); |
1526 | cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0)); | 1532 | cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0)); |
stim/math/constants.h
1 | #ifndef STIM_CONSTANTS_H | 1 | #ifndef STIM_CONSTANTS_H |
2 | #define STIM_CONSTANTS_H | 2 | #define STIM_CONSTANTS_H |
3 | 3 | ||
4 | +#include "stim/cuda/cudatools/callable.h" | ||
4 | namespace stim{ | 5 | namespace stim{ |
5 | const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862; | 6 | const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862; |
6 | const double TAU = 2 * stim::PI; | 7 | const double TAU = 2 * stim::PI; |
stim/math/matrix.h
@@ -55,7 +55,7 @@ struct matrix | @@ -55,7 +55,7 @@ struct matrix | ||
55 | vec<Y> operator*(vec<Y> rhs){ | 55 | vec<Y> operator*(vec<Y> rhs){ |
56 | unsigned int N = rhs.size(); | 56 | unsigned int N = rhs.size(); |
57 | 57 | ||
58 | - vec3Y> result; | 58 | + vec<Y> result; |
59 | result.resize(N); | 59 | result.resize(N); |
60 | 60 | ||
61 | for(int r=0; r<N; r++) | 61 | for(int r=0; r<N; r++) |
stim/math/quaternion.h
@@ -43,6 +43,8 @@ public: | @@ -43,6 +43,8 @@ public: | ||
43 | 43 | ||
44 | CUDA_CALLABLE void CreateRotation(vec3<T> from, vec3<T> to){ | 44 | CUDA_CALLABLE void CreateRotation(vec3<T> from, vec3<T> to){ |
45 | 45 | ||
46 | + from = from.norm(); | ||
47 | + to = to.norm(); | ||
46 | vec3<T> r = from.cross(to); //compute the rotation vector | 48 | vec3<T> r = from.cross(to); //compute the rotation vector |
47 | T theta = asin(r.len()); //compute the angle of the rotation about r | 49 | T theta = asin(r.len()); //compute the angle of the rotation about r |
48 | //deal with a zero vector (both k and kn point in the same direction) | 50 | //deal with a zero vector (both k and kn point in the same direction) |
1 | +#ifndef STIM_MIE_H | ||
2 | +#define STIM_MIE_H | ||
3 | + | ||
4 | +#include "scalarwave.h" | ||
5 | +#include "../math/bessel.h" | ||
6 | +#include <cmath> | ||
7 | + | ||
8 | +namespace stim{ | ||
9 | + | ||
10 | + | ||
11 | +/// Calculate the scattering coefficients for a spherical scatterer | ||
12 | +template<typename T> | ||
13 | +void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){ | ||
14 | + | ||
15 | + //temporary variables | ||
16 | + double vm; //allocate space to store the return values for the bessel function calculation | ||
17 | + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
18 | + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
19 | + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
20 | + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
21 | + | ||
22 | + stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | ||
23 | + stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | ||
24 | + stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | ||
25 | + stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | ||
26 | + | ||
27 | + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) | ||
28 | + stim::complex<double> kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) | ||
29 | + | ||
30 | + stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a | ||
31 | + stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a | ||
32 | + | ||
33 | + stim::complex<double> h_ka, dh_ka; | ||
34 | + stim::complex<double> numerator, denominator; | ||
35 | + stim::complex<double> i(0, 1); | ||
36 | + for(size_t l = 0; l <= Nl; l++){ | ||
37 | + h_ka.r = j_ka[l]; | ||
38 | + h_ka.i = y_ka[l]; | ||
39 | + dh_ka.r = dj_ka[l]; | ||
40 | + dh_ka.i = dy_ka[l]; | ||
41 | + | ||
42 | + numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l]; | ||
43 | + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n; | ||
44 | + B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; | ||
45 | + std::cout<<B[l]<<std::endl; | ||
46 | + } | ||
47 | +} | ||
48 | + | ||
49 | +template<typename T> | ||
50 | +void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){ | ||
51 | + //temporary variables | ||
52 | + double vm; //allocate space to store the return values for the bessel function calculation | ||
53 | + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
54 | + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
55 | + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
56 | + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
57 | + | ||
58 | + stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | ||
59 | + stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | ||
60 | + stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | ||
61 | + stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | ||
62 | + | ||
63 | + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions) | ||
64 | + stim::complex<double> kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives) | ||
65 | + | ||
66 | + stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a | ||
67 | + stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a | ||
68 | + | ||
69 | + stim::complex<double> h_ka, dh_ka; | ||
70 | + stim::complex<double> numerator, denominator; | ||
71 | + stim::complex<double> i(0, 1); | ||
72 | + for(size_t l = 0; l <= Nl; l++){ | ||
73 | + h_ka.r = j_ka[l]; | ||
74 | + h_ka.i = y_ka[l]; | ||
75 | + dh_ka.r = dj_ka[l]; | ||
76 | + dh_ka.i = dy_ka[l]; | ||
77 | + | ||
78 | + numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka; | ||
79 | + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n; | ||
80 | + A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; | ||
81 | + } | ||
82 | +} | ||
83 | + | ||
84 | +template<typename T> | ||
85 | +__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, int Nl){ | ||
86 | + extern __shared__ stim::scalarwave<T> shared_W[]; //declare the list of waves in shared memory | ||
87 | + | ||
88 | + stim::cuda::sharedMemcpy(shared_W, W, nW, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access | ||
89 | + __syncthreads(); //synchronize threads to insure all data is copied | ||
90 | + | ||
91 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array | ||
92 | + if(i >= N) return; //exit if this thread is outside the array | ||
93 | + stim::vec3<T> p; | ||
94 | + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions | ||
95 | + (y == NULL) ? p[1] = 0 : p[1] = y[i]; | ||
96 | + (z == NULL) ? p[2] = 0 : p[2] = z[i]; | ||
97 | + | ||
98 | + T r = p.len(); //calculate the distance from the sphere | ||
99 | + T k = W[0].kmag(); | ||
100 | + if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) | ||
101 | + | ||
102 | + size_t NC = Nl + 1; //calculate the number of coefficients to be used | ||
103 | + T kr = r * k; //calculate the thread value for k*r | ||
104 | + T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT | ||
105 | + size_t ij = (size_t) fij; //convert to an integral index | ||
106 | + T alpha = fij - ij; //calculate the fractional portion of the index | ||
107 | + size_t n0j = ij * (NC); //start of the first entry in the LUT | ||
108 | + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT | ||
109 | + | ||
110 | + T cos_phi; | ||
111 | + T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials | ||
112 | + T Pl = 1; //initialize the current value for the Legendre polynomial | ||
113 | + T jl; | ||
114 | + stim::complex<T> Ei = 0; //create a register to store the result | ||
115 | + int l; | ||
116 | + for(size_t w = 0; w < nW; w++){ | ||
117 | + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere | ||
118 | + for(l = 0; l <= Nl; l++){ | ||
119 | + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 | ||
120 | + Pl_1 = Pl; | ||
121 | + if(l == 0){ //computing Pl is done recursively, where the recursive relation | ||
122 | + Pl = cos_phi; // requires the first two orders. This defines the second. | ||
123 | + } | ||
124 | + else{ //if this is not the first iteration, use the recursive relation to calculate Pl | ||
125 | + Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1); | ||
126 | + } | ||
127 | + | ||
128 | + jl = lerp<T>( j[n0j + l], j[n1j + l], alpha ); //read jl from the LUT and interpolate the result | ||
129 | + Ei += W[w].E() * B[l] * jl * Pl; | ||
130 | + } | ||
131 | + //Ei += shared_W[w].pos(p); //evaluate the plane wave | ||
132 | + } | ||
133 | + E[i] += Ei; //copy the result to device memory | ||
134 | +} | ||
135 | + | ||
136 | +template<typename T> | ||
137 | +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, size_t Nl){ | ||
138 | + | ||
139 | + size_t wave_bytes = sizeof(stim::scalarwave<T>); | ||
140 | + size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available | ||
141 | + size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array | ||
142 | + size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory | ||
143 | + size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves | ||
144 | + size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required | ||
145 | + | ||
146 | + stim::scalarwave<T>* batch_W; | ||
147 | + HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves | ||
148 | + | ||
149 | + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | ||
150 | + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks | ||
151 | + | ||
152 | + size_t batch_size; //declare a variable to store the size of the current batch | ||
153 | + size_t waves_processed = 0; //initialize the number of waves processed to zero | ||
154 | + while(waves_processed < nW){ //while there are still waves to be processed | ||
155 | + batch_size = min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left | ||
156 | + batch_bytes = batch_size * sizeof(stim::scalarwave<T>); | ||
157 | + HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory | ||
158 | + cuda_scalar_mie_scatter<T><<< blocks, threads, batch_bytes >>>(E, N, x, y, z, batch_W, batch_size, a, n, B, j, kr_min, dkr, (int)Nl); //call the kernel | ||
159 | + waves_processed += batch_size; //increment the counter indicating how many waves have been processed | ||
160 | + } | ||
161 | + cudaFree(batch_W); | ||
162 | +} | ||
163 | +/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave | ||
164 | + | ||
165 | +/// @param E is a pointer to the destination field values | ||
166 | +/// @param N is the number of points used to calculate the field | ||
167 | +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) | ||
168 | +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) | ||
169 | +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) | ||
170 | +/// @param W is an array of planewaves that will be scattered | ||
171 | +/// @param a is the radius of the sphere | ||
172 | +/// @param n is the complex refractive index of the sphere | ||
173 | +template<typename T> | ||
174 | +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n){ | ||
175 | + //calculate the necessary number of orders required to represent the scattered field | ||
176 | + T k = W[0].kmag(); | ||
177 | + | ||
178 | + size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); | ||
179 | + | ||
180 | + //calculate the scattering coefficients for the sphere | ||
181 | + stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients | ||
182 | + B_coefficients(B, a, k, n, Nl); | ||
183 | + | ||
184 | +#ifdef __CUDACC__ | ||
185 | + stim::complex<T>* dev_E; //allocate space for the field | ||
186 | + cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); | ||
187 | + cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); | ||
188 | + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used) | ||
189 | + | ||
190 | + // COORDINATES | ||
191 | + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) | ||
192 | + if(x != NULL){ | ||
193 | + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); | ||
194 | + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
195 | + } | ||
196 | + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) | ||
197 | + if(y != NULL){ | ||
198 | + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); | ||
199 | + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
200 | + } | ||
201 | + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) | ||
202 | + if(z != NULL){ | ||
203 | + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); | ||
204 | + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
205 | + } | ||
206 | + | ||
207 | + // PLANE WAVES | ||
208 | + stim::scalarwave<T>* dev_W; //allocate space and copy plane waves | ||
209 | + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); | ||
210 | + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) ); | ||
211 | + | ||
212 | + // SCATTERING COEFFICIENTS | ||
213 | + stim::complex<T>* dev_B; | ||
214 | + HANDLE_ERROR( cudaMalloc(&dev_B, sizeof(stim::complex<T>) * (Nl+1)) ); | ||
215 | + HANDLE_ERROR( cudaMemcpy(dev_B, B, sizeof(stim::complex<T>) * (Nl+1), cudaMemcpyHostToDevice) ); | ||
216 | + | ||
217 | + // BESSEL FUNCTION LOOK-UP TABLE | ||
218 | + //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r | ||
219 | + size_t Nlut_j = 1024; | ||
220 | + T r_min = 0; | ||
221 | + T r_max = 10; | ||
222 | + | ||
223 | + T kr_min = k * r_min; | ||
224 | + T kr_max = k * r_max; | ||
225 | + | ||
226 | + //temporary variables | ||
227 | + double vm; //allocate space to store the return values for the bessel function calculation | ||
228 | + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
229 | + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
230 | + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
231 | + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
232 | + | ||
233 | + size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j; | ||
234 | + T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table | ||
235 | + T dkr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT | ||
236 | + std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl; | ||
237 | + for(size_t kri = 0; kri < Nlut_j; kri++){ //for each value in the LUT | ||
238 | + stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] | ||
239 | + for(size_t l = 0; l <= Nl; l++){ //for each order | ||
240 | + bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result | ||
241 | + } | ||
242 | + } | ||
243 | + | ||
244 | + stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); | ||
245 | + | ||
246 | + //Allocate device memory and copy everything to the GPU | ||
247 | + T* dev_j_lut; | ||
248 | + HANDLE_ERROR( cudaMalloc(&dev_j_lut, lutj_bytes) ); | ||
249 | + HANDLE_ERROR( cudaMemcpy(dev_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) ); | ||
250 | + | ||
251 | + gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_B, dev_j_lut, kr_min, dkr, Nl); | ||
252 | + | ||
253 | + cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory | ||
254 | + | ||
255 | + if(x != NULL) cudaFree(dev_x); //free everything | ||
256 | + if(y != NULL) cudaFree(dev_y); | ||
257 | + if(z != NULL) cudaFree(dev_z); | ||
258 | + cudaFree(dev_E); | ||
259 | +#else | ||
260 | + | ||
261 | + | ||
262 | + //allocate space to store the bessel function call results | ||
263 | + double vm; | ||
264 | + double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
265 | + double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
266 | + double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
267 | + double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
268 | + | ||
269 | + T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); | ||
270 | + | ||
271 | + T r, kr, cos_phi; | ||
272 | + stim::complex<T> h; | ||
273 | + for(size_t i = 0; i < N; i++){ | ||
274 | + stim::vec3<T> p; //declare a 3D point | ||
275 | + | ||
276 | + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions | ||
277 | + (y == NULL) ? p[1] = 0 : p[1] = y[i]; | ||
278 | + (z == NULL) ? p[2] = 0 : p[2] = z[i]; | ||
279 | + r = p.len(); | ||
280 | + if(r >= a){ | ||
281 | + for(size_t w = 0; w < W.size(); w++){ | ||
282 | + kr = p.len() * W[w].kmag(); //calculate k*r | ||
283 | + stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr); | ||
284 | + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction | ||
285 | + stim::legendre<T>(Nl, cos_phi, P); | ||
286 | + | ||
287 | + for(size_t l = 0; l <= Nl; l++){ | ||
288 | + h.r = j_kr[l]; | ||
289 | + h.i = y_kr[l]; | ||
290 | + E[i] += W[w].E() * B[l] * h * P[l]; | ||
291 | + } | ||
292 | + } | ||
293 | + } | ||
294 | + } | ||
295 | +#endif | ||
296 | +} | ||
297 | + | ||
298 | +template<typename T> | ||
299 | +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){ | ||
300 | + std::vector< stim::scalarwave<T> > W(1, w); | ||
301 | + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n); | ||
302 | +} | ||
303 | + | ||
304 | +/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere | ||
305 | + | ||
306 | +/// @param E is a pointer to the destination field values | ||
307 | +/// @param N is the number of points used to calculate the field | ||
308 | +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) | ||
309 | +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) | ||
310 | +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) | ||
311 | +/// @param w is a planewave that will be scattered | ||
312 | +/// @param a is the radius of the sphere | ||
313 | +/// @param n is the complex refractive index of the sphere | ||
314 | +template<typename T> | ||
315 | +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n){ | ||
316 | + | ||
317 | + //calculate the necessary number of orders required to represent the scattered field | ||
318 | + T k = W[0].kmag(); | ||
319 | + | ||
320 | + size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); | ||
321 | + | ||
322 | + //calculate the scattering coefficients for the sphere | ||
323 | + stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients | ||
324 | + A_coefficients(A, a, k, n, Nl); | ||
325 | + | ||
326 | + //allocate space to store the bessel function call results | ||
327 | + double vm; | ||
328 | + stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | ||
329 | + stim::complex<double>* y_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | ||
330 | + stim::complex<double>* dj_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | ||
331 | + stim::complex<double>* dy_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | ||
332 | + | ||
333 | + T* P = (T*) malloc( (Nl + 1) * sizeof(T) ); | ||
334 | + | ||
335 | + T r, cos_phi; | ||
336 | + stim::complex<double> knr; | ||
337 | + stim::complex<T> h; | ||
338 | + for(size_t i = 0; i < N; i++){ | ||
339 | + stim::vec3<T> p; //declare a 3D point | ||
340 | + | ||
341 | + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions | ||
342 | + (y == NULL) ? p[1] = 0 : p[1] = y[i]; | ||
343 | + (z == NULL) ? p[2] = 0 : p[2] = z[i]; | ||
344 | + r = p.len(); | ||
345 | + if(r < a){ | ||
346 | + E[i] = 0; | ||
347 | + for(size_t w = 0; w < W.size(); w++){ | ||
348 | + knr = (stim::complex<double>)n * p.len() * W[w].kmag(); //calculate k*n*r | ||
349 | + | ||
350 | + stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr); | ||
351 | + if(r == 0) | ||
352 | + cos_phi = 0; | ||
353 | + else | ||
354 | + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction | ||
355 | + stim::legendre<T>(Nl, cos_phi, P); | ||
356 | + | ||
357 | + for(size_t l = 0; l <= Nl; l++){ | ||
358 | + E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l]; | ||
359 | + } | ||
360 | + } | ||
361 | + } | ||
362 | + } | ||
363 | +} | ||
364 | + | ||
365 | +template<typename T> | ||
366 | +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){ | ||
367 | + std::vector< stim::scalarwave<T> > W(1, w); | ||
368 | + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n); | ||
369 | +} | ||
370 | + | ||
371 | +} | ||
372 | + | ||
373 | +#endif | ||
0 | \ No newline at end of file | 374 | \ No newline at end of file |
stim/optics/scalarbeam.h
@@ -5,7 +5,12 @@ | @@ -5,7 +5,12 @@ | ||
5 | #include "../optics/scalarwave.h" | 5 | #include "../optics/scalarwave.h" |
6 | #include "../math/bessel.h" | 6 | #include "../math/bessel.h" |
7 | #include "../math/legendre.h" | 7 | #include "../math/legendre.h" |
8 | +#include "../cuda/cudatools/devices.h" | ||
9 | +#include "../cuda/cudatools/timer.h" | ||
10 | +#include <cublas_v2.h> | ||
11 | +#include <math_constants.h> | ||
8 | #include <vector> | 12 | #include <vector> |
13 | +#include <stdlib.h> | ||
9 | 14 | ||
10 | namespace stim{ | 15 | namespace stim{ |
11 | 16 | ||
@@ -105,10 +110,11 @@ public: | @@ -105,10 +110,11 @@ public: | ||
105 | std::vector< scalarwave<T> > samples(N); //create a vector of plane waves | 110 | std::vector< scalarwave<T> > samples(N); //create a vector of plane waves |
106 | T kmag = (T)stim::TAU / lambda; //calculate the wavenumber | 111 | T kmag = (T)stim::TAU / lambda; //calculate the wavenumber |
107 | stim::complex<T> apw; //allocate space for the amplitude at the focal point | 112 | stim::complex<T> apw; //allocate space for the amplitude at the focal point |
113 | + T a = stim::TAU * (1 - cos(asin(NA[0]))) / (double)N; | ||
108 | stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction | 114 | stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction |
109 | for(size_t i=0; i<N; i++){ //for each sample | 115 | for(size_t i=0; i<N; i++){ //for each sample |
110 | kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave | 116 | kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave |
111 | - apw = exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave | 117 | + apw = a * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave |
112 | samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction | 118 | samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction |
113 | } | 119 | } |
114 | 120 | ||
@@ -148,7 +154,7 @@ public: | @@ -148,7 +154,7 @@ public: | ||
148 | /// Calculate the [0 Nl] terms for the aperture integral based on the given numerical aperture and center obscuration (optional) | 154 | /// Calculate the [0 Nl] terms for the aperture integral based on the given numerical aperture and center obscuration (optional) | |
149 | /// @param C is a pointer to Nl + 1 values where the terms will be stored | 155 | /// @param C is a pointer to Nl + 1 values where the terms will be stored |
150 | template<typename T> | 156 | template<typename T> |
151 | -CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){ | 157 | +CUDA_CALLABLE void cpu_aperture_integral(T* C, int Nl, T NA, T NA_in = 0){ |
152 | 158 | ||
153 | size_t table_bytes = (Nl + 1) * sizeof(T); //calculate the number of bytes required to store the terms | 159 | size_t table_bytes = (Nl + 1) * sizeof(T); //calculate the number of bytes required to store the terms |
154 | T cos_alpha_1 = cos(asin(NA_in)); //calculate the cosine of the angle subtended by the central obscuration | 160 | T cos_alpha_1 = cos(asin(NA_in)); //calculate the cosine of the angle subtended by the central obscuration |
@@ -182,23 +188,151 @@ CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){ | @@ -182,23 +188,151 @@ CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){ | ||
182 | 188 | ||
/// Performs linear interpolation into a look-up table that stores n_vals values per sample position.
/// Row i of the table occupies lut[i * n_vals ... i * n_vals + n_vals - 1] and corresponds to
/// the sample position min_val + i * delta.
/// @param lut_values is the output array (n_vals interpolated values)
/// @param lut is the look-up table
/// @param val is the position to look up
/// @param N is taken to be the number of rows (sample positions) in the table; it is used to clamp the lookup
/// @param min_val is the position of the first row
/// @param delta is the spacing between consecutive rows
/// @param n_vals is the number of values stored in each row
/// Positions before the first row return the first row and positions at or beyond the last row
/// return the last row, so the function never reads outside of the N * n_vals table entries.
template<typename T>
CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val, T delta, size_t n_vals){
	if(N == 0) return;									//an empty table has nothing to interpolate

	T idx = (val - min_val) / delta;					//fractional row index
	if(!(idx > 0)) idx = 0;								//clamp positions before the table (also catches NaN)

	if(idx >= (T)(N - 1)){								//at or past the last row: there is no row i+1 to blend with
		size_t last = (N - 1) * n_vals;
		for(size_t n = 0; n < n_vals; n++){
			lut_values[n] = lut[last + n];
		}
		return;
	}

	size_t i = (size_t) idx;							//integral row index (safe: 0 <= idx < N - 1)
	T a1 = idx - (T)i;									//weight of row i+1
	T a0 = 1 - a1;										//weight of row i
	size_t n0 = i * n_vals;
	size_t n1 = n0 + n_vals;
	for(size_t n = 0; n < n_vals; n++){
		lut_values[n] = lut[n0 + n] * a0 + lut[n1 + n] * a1;
	}
}
188 | 202 | ||
189 | - if(alpha == 0) return lut[idx]; | ||
/// Linearly interpolates between v0 (returned for t = 0) and v1 (returned for t = 1).
/// The result is evaluated as t * v1 + (v0 - t * v0) using two fused multiply-adds.
template <typename T>
CUDA_CALLABLE T lerp(T v0, T v1, T t) {
	T base = fma(-t, v0, v0);		//contribution of v0: v0 * (1 - t)
	return fma(t, v1, base);		//add the contribution of v1
}
192 | 207 | ||
208 | +#ifdef __CUDACC__ | ||
193 | template<typename T> | 209 | template<typename T> |
194 | -void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){ | ||
195 | - T k = stim::TAU / lambda; | 210 | +__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T k, T A, size_t Nl, |
211 | + T* C, | ||
212 | + T* lut_j, size_t Nj, T min_kr, T dkr){ | ||
213 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array | ||
214 | + if(i >= N) return; //exit if this thread is outside the array | ||
215 | + | ||
216 | + T cos_phi = cos(phi[i]); //calculate the thread value for cos(phi) | ||
217 | + T kr = r[i] * k; //calculate the thread value for k*r | ||
218 | + stim::complex<T> Ei = 0; //initialize the value of the field to zero | ||
219 | + size_t NC = Nl + 1; //calculate the number of coefficients to be used | ||
220 | + | ||
221 | + T fij = (kr - min_kr)/dkr; //FP index into the spherical bessel LUT | ||
222 | + size_t ij = (size_t) fij; //convert to an integral index | ||
223 | + T a = fij - ij; //calculate the fractional portion of the index | ||
224 | + size_t n0j = ij * (NC); //start of the first entry in the LUT | ||
225 | + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT | ||
226 | + | ||
227 | + T jl; //declare register to store the spherical bessel function | ||
228 | + T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials | ||
229 | + T Pl = 1; //initialize the current value for the Legendre polynomial | ||
230 | + stim::complex<T> im(0, 1); //declare i (imaginary 1) | ||
231 | + stim::complex<T> i_pow(1, 0); //i_pow stores the current value of i^l so it doesn't have to be re-computed every iteration | ||
232 | + for(int l = 0; l <= Nl; l++){ //for each order | ||
233 | + jl = lerp<T>( lut_j[n0j + l], lut_j[n1j + l], a ); //read jl from the LUT and interpolate the result | ||
234 | + Ei += i_pow * jl * Pl * C[l]; //calculate the value for the field and sum | ||
235 | + i_pow *= im; //multiply i^l * i for the next iteration | ||
236 | + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 | ||
237 | + Pl_1 = Pl; | ||
238 | + if(l == 0){ //computing Pl is done recursively, where the recursive relation | ||
239 | + Pl = cos_phi; // requires the first two orders. This defines the second. | ||
240 | + } | ||
241 | + else{ //if this is not the first iteration, use the recursive relation to calculate Pl | ||
242 | + Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1); | ||
243 | + } | ||
244 | + | ||
245 | + } | ||
246 | + E[i] = Ei * A * 2 * CUDART_PI_F; //scale the integral by the amplitude | ||
247 | +} | ||
248 | + | ||
249 | +template<typename T> | ||
250 | +void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl, T r_spacing){ | ||
251 | + | ||
252 | + //Find the minimum and maximum values of r | ||
253 | + cublasStatus_t stat; | ||
254 | + cublasHandle_t handle; | ||
255 | + | ||
256 | + stat = cublasCreate(&handle); //create a cuBLAS handle | ||
257 | + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure | ||
258 | + printf ("CUBLAS initialization failed\n"); | ||
259 | + exit(1); | ||
260 | + } | ||
261 | + | ||
262 | + int i_min, i_max; | ||
263 | + stat = cublasIsamin(handle, (int)N, r, 1, &i_min); | ||
264 | + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure | ||
265 | + printf ("CUBLAS Error: failed to calculate minimum r value.\n"); | ||
266 | + exit(1); | ||
267 | + } | ||
268 | + stat = cublasIsamax(handle, (int)N, r, 1, &i_max); | ||
269 | + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure | ||
270 | + printf ("CUBLAS Error: failed to calculate maximum r value.\n"); | ||
271 | + exit(1); | ||
272 | + } | ||
273 | + | ||
274 | + T r_min, r_max; //allocate space to store the minimum and maximum values | ||
275 | + HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU | ||
276 | + HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); | ||
277 | + | ||
278 | + T k = (T)stim::TAU / lambda; //calculate the wavenumber from lambda | ||
279 | + size_t C_bytes = (Nl + 1) * sizeof(T); | ||
280 | + T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms | ||
281 | + cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms | ||
282 | + | ||
283 | + size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r | ||
284 | + | ||
285 | + T kr_min = k * r_min; | ||
286 | + T kr_max = k * r_max; | ||
287 | + | ||
288 | + //temporary variables | ||
289 | + double vm; //allocate space to store the return values for the bessel function calculation | ||
290 | + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
291 | + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
292 | + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
293 | + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
294 | + | ||
295 | + size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j; | ||
296 | + T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table | ||
297 | + T delta_kr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT | ||
298 | + std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl; | ||
299 | + for(size_t kri = 0; kri < Nlut_j; kri++){ //for each value in the LUT | ||
300 | + stim::bessjyv_sph<double>(Nl, kr_min + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] | ||
301 | + for(size_t l = 0; l <= Nl; l++){ //for each order | ||
302 | + bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result | ||
303 | + } | ||
304 | + } | ||
305 | + | ||
306 | + stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); | ||
307 | + | ||
308 | + //Allocate device memory and copy everything to the GPU | ||
309 | + | ||
310 | + T* gpu_C; | ||
311 | + HANDLE_ERROR( cudaMalloc(&gpu_C, C_bytes) ); | ||
312 | + HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) ); | ||
313 | + T* gpu_j_lut; | ||
314 | + HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) ); | ||
315 | + HANDLE_ERROR( cudaMemcpy(gpu_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) ); | ||
316 | + | ||
317 | + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | ||
318 | + dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks | ||
196 | 319 | ||
197 | - T* C = (T*) malloc( (Nl + 1) * sizeof(T) ); //allocate space for the aperture integral terms | 320 | + cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, (T)stim::TAU/lambda, A, Nl, gpu_C, gpu_j_lut, Nlut_j, kr_min, delta_kr); |
321 | + | ||
322 | + //free the LUT and condenser tables | ||
323 | + HANDLE_ERROR( cudaFree(gpu_C) ); | ||
324 | + HANDLE_ERROR( cudaFree(gpu_j_lut) ); | ||
325 | +} | ||
326 | +#endif | ||
327 | + | ||
328 | +/// Calculate the analytical solution to a scalar point spread function given a set of spherical coordinates about the PSF (beam propagation along phi = theta = 0) | ||
329 | +template<typename T> | ||
330 | +void cpu_scalar_psf_local(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl){ | ||
331 | + T k = (T)stim::TAU / lambda; | ||
332 | + size_t C_bytes = (Nl + 1) * sizeof(T); | ||
333 | + T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms | ||
198 | cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms | 334 | cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms |
199 | memset(F, 0, N * sizeof(stim::complex<T>)); | 335 | memset(F, 0, N * sizeof(stim::complex<T>)); |
200 | -#ifdef NO_CUDA | ||
201 | - memset(F, 0, N * sizeof(stim::complex<T>)); | ||
202 | T jl, Pl, kr, cos_phi; | 336 | T jl, Pl, kr, cos_phi; |
203 | 337 | ||
204 | double vm; | 338 | double vm; |
@@ -225,71 +359,117 @@ void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, | @@ -225,71 +359,117 @@ void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, | ||
225 | 359 | ||
226 | free(C); | 360 | free(C); |
227 | free(Pl_cos_phi); | 361 | free(Pl_cos_phi); |
228 | -#else | ||
229 | - T min_r = r[0]; | ||
230 | - T max_r = r[0]; | ||
231 | - for(size_t i = 0; i < N; i++){ //find the minimum and maximum values of r (min and max distance from the focal point) | ||
232 | - if(r[i] < min_r) min_r = r[i]; | ||
233 | - if(r[i] > max_r) max_r = r[i]; | ||
234 | - } | ||
235 | - T min_kr = k * min_r; | ||
236 | - T max_kr = k * max_r; | 362 | +} |
237 | 363 | ||
238 | - //temporary variables | ||
239 | - double vm; | ||
240 | - double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
241 | - double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
242 | - double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); | ||
#ifdef __CUDACC__
/// Converts a set of cartesian points into spherical coordinates surrounding a point spread function (PSF)
/// @param r is the output distance from the PSF
/// @param phi is the output angle in the non-symmetric direction about the PSF
/// @param N is the number of points
/// @param x (x, y, z) are the cartesian coordinates in world space; a NULL array is treated as all zeros
/// @param f is the focal point of the PSF in cartesian coordinates
/// @param q is the rotation applied to each point after it is shifted by f
///			(gpu_scalar_psf_cart builds it to rotate the propagation direction onto (0, 0, 1))
/// The kernel is only visible to the CUDA compiler, like the other kernels in this file.
template<typename T>
__global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::vec3<T> f, stim::quaternion<T> q){

	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
	if(i >= N) return;												//exit if this thread is outside the array

	stim::vec3<T> p;												//declare a 3D point

	(x == NULL) ? p[0] = 0 : p[0] = x[i];							// test for NULL values and set positions
	(y == NULL) ? p[1] = 0 : p[1] = y[i];
	(z == NULL) ? p[2] = 0 : p[2] = z[i];

	p = p - f;														//shift the point to the center of the PSF (focal point)
	p = q.toMatrix3() * p;											//rotate the point to align with the propagation direction

	stim::vec3<T> ps = p.cart2sph();								//convert from cartesian to spherical coordinates
	r[i] = ps[0];													//store r
	phi[i] = ps[2];													//phi = [0 pi]
}
#endif
271 | 389 | ||
390 | +#ifdef __CUDACC__ | ||
391 | +/// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates | ||
392 | +template<typename T> | ||
393 | +void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){ | ||
394 | + | ||
395 | + T* gpu_r; //allocate space for the coordinates in r | ||
396 | + HANDLE_ERROR( cudaMalloc(&gpu_r, sizeof(T) * N) ); | ||
397 | + T* gpu_phi; | ||
398 | + HANDLE_ERROR( cudaMalloc(&gpu_phi, sizeof(T) * N) ); | ||
399 | + //stim::complex<T>* gpu_E; | ||
400 | + //HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) ); | ||
401 | + | ||
402 | + stim::quaternion<T> q; //create a quaternion | ||
403 | + q.CreateRotation(d, stim::vec3<T>(0, 0, 1)); //create a mapping from the propagation direction to the PSF space | ||
404 | + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | ||
405 | + dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks | ||
406 | + cuda_cart2psf<T> <<< blocks, threads >>> (gpu_r, gpu_phi, N, x, y, z, f, q); //call the CUDA kernel to move the cartesian coordinates to PSF space | ||
407 | + | ||
408 | + gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing); | ||
409 | + | ||
410 | +} | ||
411 | +#endif | ||
272 | 412 | ||
273 | template<typename T> | 413 | template<typename T> |
274 | -void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){ | 414 | +void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){ |
415 | + | ||
416 | +// If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel | ||
417 | +#ifdef __CUDACC__ | ||
418 | + | ||
419 | + T* gpu_x = NULL; | ||
420 | + if(x != NULL){ | ||
421 | + HANDLE_ERROR( cudaMalloc(&gpu_x, sizeof(T) * N) ); | ||
422 | + HANDLE_ERROR( cudaMemcpy(gpu_x, x, sizeof(T) * N, cudaMemcpyHostToDevice) ); | ||
423 | + } | ||
424 | + T* gpu_y = NULL; | ||
425 | + if(y != NULL){ | ||
426 | + HANDLE_ERROR( cudaMalloc(&gpu_y, sizeof(T) * N) ); | ||
427 | + HANDLE_ERROR( cudaMemcpy(gpu_y, y, sizeof(T) * N, cudaMemcpyHostToDevice) ); | ||
428 | + } | ||
429 | + T* gpu_z = NULL; | ||
430 | + if(z != NULL){ | ||
431 | + HANDLE_ERROR( cudaMalloc(&gpu_z, sizeof(T) * N) ); | ||
432 | + HANDLE_ERROR( cudaMemcpy(gpu_z, z, sizeof(T) * N, cudaMemcpyHostToDevice) ); | ||
433 | + } | ||
434 | + | ||
435 | + stim::complex<T>* gpu_E; | ||
436 | + HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) ); | ||
437 | + HANDLE_ERROR( cudaMemcpy(gpu_E, E, sizeof(stim::complex<T>) * N, cudaMemcpyHostToDevice) ); | ||
438 | + gpu_scalar_psf_cart<T>(gpu_E, N, gpu_x, gpu_y, gpu_z, lambda, A, f, d, NA, NA_in, Nl, r_spacing); | ||
439 | + HANDLE_ERROR( cudaMemcpy(E, gpu_E, sizeof(stim::complex<T>) * N, cudaMemcpyDeviceToHost) ); | ||
440 | + | ||
441 | + HANDLE_ERROR( cudaFree(gpu_x) ); | ||
442 | + HANDLE_ERROR( cudaFree(gpu_y) ); | ||
443 | + HANDLE_ERROR( cudaFree(gpu_z) ); | ||
444 | + HANDLE_ERROR( cudaFree(gpu_E) ); | ||
445 | + | ||
446 | +#else | ||
275 | T* r = (T*) malloc(N * sizeof(T)); //allocate space for p in spherical coordinates | 447 | T* r = (T*) malloc(N * sizeof(T)); //allocate space for p in spherical coordinates |
276 | T* phi = (T*) malloc(N * sizeof(T)); // only r and phi are necessary (the scalar PSF is symmetric about theta) | 448 | T* phi = (T*) malloc(N * sizeof(T)); // only r and phi are necessary (the scalar PSF is symmetric about theta) |
277 | 449 | ||
278 | - stim::vec3<T> p, ps; | 450 | + stim::quaternion<T> q; |
451 | + q.CreateRotation(d, stim::vec3<T>(0, 0, 1)); | ||
452 | + stim::matrix<T, 3> R = q.toMatrix3(); | ||
453 | + stim::vec3<T> p, ps, ds; | ||
279 | for(size_t i = 0; i < N; i++){ | 454 | for(size_t i = 0; i < N; i++){ |
280 | (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions | 455 | (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions |
281 | (y == NULL) ? p[1] = 0 : p[1] = y[i]; | 456 | (y == NULL) ? p[1] = 0 : p[1] = y[i]; |
282 | (z == NULL) ? p[2] = 0 : p[2] = z[i]; | 457 | (z == NULL) ? p[2] = 0 : p[2] = z[i]; |
283 | 458 | ||
459 | + p = p - f; | ||
460 | + | ||
461 | + p = R * p; //rotate the cartesian point | ||
462 | + | ||
284 | ps = p.cart2sph(); //convert from cartesian to spherical coordinates | 463 | ps = p.cart2sph(); //convert from cartesian to spherical coordinates |
285 | r[i] = ps[0]; //store r | 464 | r[i] = ps[0]; //store r |
286 | phi[i] = ps[2]; //phi = [0 pi] | 465 | phi[i] = ps[2]; //phi = [0 pi] |
287 | } | 466 | } |
288 | 467 | ||
289 | - cpu_scalar_psf(F, N, r, phi, lambda, A, f, NA, NA_in, Nl); //call the spherical coordinate CPU function | 468 | + cpu_scalar_psf_local(F, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function |
290 | 469 | ||
291 | free(r); | 470 | free(r); |
292 | free(phi); | 471 | free(phi); |
472 | +#endif | ||
293 | } | 473 | } |
294 | 474 | ||
295 | } //end namespace stim | 475 | } //end namespace stim |
stim/optics/scalarwave.h
@@ -60,7 +60,7 @@ public: | @@ -60,7 +60,7 @@ public: | ||
60 | return k.len(); | 60 | return k.len(); |
61 | } | 61 | } |
62 | 62 | ||
63 | - CUDA_CALLABLE vec3< complex<T> > E(){ | 63 | + CUDA_CALLABLE complex<T> E(){ |
64 | return E0; | 64 | return E0; |
65 | } | 65 | } |
66 | 66 | ||
@@ -235,6 +235,34 @@ void gpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scala | @@ -235,6 +235,34 @@ void gpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scala | ||
235 | cuda_scalarwave<T><<< blocks, threads >>>(F, N, x, y, z, w); //call the kernel | 235 | cuda_scalarwave<T><<< blocks, threads >>>(F, N, x, y, z, w); //call the kernel |
236 | } | 236 | } |
237 | 237 | ||
238 | +template<typename T> | ||
239 | +void gpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW){ | ||
240 | + | ||
241 | + size_t wave_bytes = sizeof(stim::scalarwave<T>); | ||
242 | + size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available | ||
243 | + size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array | ||
244 | + size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory | ||
245 | + size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves | ||
246 | + size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required | ||
247 | + | ||
248 | + stim::scalarwave<T>* batch_W; | ||
249 | + HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves | ||
250 | + | ||
251 | + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | ||
252 | + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks | ||
253 | + | ||
254 | + size_t batch_size; //declare a variable to store the size of the current batch | ||
255 | + size_t waves_processed = 0; //initialize the number of waves processed to zero | ||
256 | + while(waves_processed < nW){ //while there are still waves to be processed | ||
257 | + batch_size = min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left | ||
258 | + batch_bytes = batch_size * sizeof(stim::scalarwave<T>); | ||
259 | + HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory | ||
260 | + cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size); //call the kernel | ||
261 | + waves_processed += batch_size; //increment the counter indicating how many waves have been processed | ||
262 | + } | ||
263 | + cudaFree(batch_W); | ||
264 | +} | ||
265 | + | ||
238 | /// Sums a series of coherent plane waves at a specified point | 266 | /// Sums a series of coherent plane waves at a specified point |
239 | /// @param field is the output array of field values corresponding to each input point | 267 | /// @param field is the output array of field values corresponding to each input point |
240 | /// @param x is an array of x coordinates for the field point | 268 | /// @param x is an array of x coordinates for the field point |
@@ -245,24 +273,13 @@ void gpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scala | @@ -245,24 +273,13 @@ void gpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scala | ||
245 | /// @param A is the list of amplitudes for each wave | 273 | /// @param A is the list of amplitudes for each wave |
246 | /// @param S is the list of propagation directions for each wave | 274 | /// @param S is the list of propagation directions for each wave |
247 | template<typename T> | 275 | template<typename T> |
248 | -void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > w_array){ | ||
249 | - size_t S = w_array.size(); //store the number of waves | ||
250 | -#ifdef NO_CUDA | ||
251 | - memset(F, 0, N * sizeof(stim::complex<T>)); | ||
252 | - T px, py, pz; | ||
253 | - for(size_t i = 0; i < N; i++){ // for each element in the array | ||
254 | - (x == NULL) ? px = 0 : px = x[i]; // test for NULL values | ||
255 | - (y == NULL) ? py = 0 : py = y[i]; | ||
256 | - (z == NULL) ? pz = 0 : pz = z[i]; | ||
257 | - | ||
258 | - for(size_t s = 0; s < S; s++){ | ||
259 | - F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point | ||
260 | - } | ||
261 | - } | ||
262 | -#else | 276 | +void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W){ |
277 | + size_t S = W.size(); //store the number of waves | ||
278 | +#ifdef __CUDACC__ | ||
263 | stim::complex<T>* dev_F; //allocate space for the field | 279 | stim::complex<T>* dev_F; //allocate space for the field |
264 | cudaMalloc(&dev_F, N * sizeof(stim::complex<T>)); | 280 | cudaMalloc(&dev_F, N * sizeof(stim::complex<T>)); |
265 | - cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used) | 281 | + cudaMemcpy(dev_F, F, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); |
282 | + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used) | ||
266 | 283 | ||
267 | T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) | 284 | T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) |
268 | if(x != NULL){ | 285 | if(x != NULL){ |
@@ -282,28 +299,11 @@ void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::v | @@ -282,28 +299,11 @@ void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::v | ||
282 | HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); | 299 | HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); |
283 | } | 300 | } |
284 | 301 | ||
285 | - size_t wave_bytes = sizeof(stim::scalarwave<T>); | ||
286 | - size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available | ||
287 | - size_t array_bytes = w_array.size() * wave_bytes; //calculate the maximum number of bytes required for the planewave array | ||
288 | - size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory | ||
289 | - size_t num_batches = w_array.size() / max_batch + 1; //calculate the number of batches required to process all plane waves | ||
290 | - size_t batch_bytes = min(w_array.size(), max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required | ||
291 | - | ||
292 | - stim::scalarwave<T>* dev_w; | ||
293 | - HANDLE_ERROR(cudaMalloc(&dev_w, batch_bytes)); //allocate memory for a single batch of plane waves | 302 | + stim::scalarwave<T>* dev_W; |
303 | + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); | ||
304 | + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) ); | ||
294 | 305 | ||
295 | - int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | ||
296 | - dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks | ||
297 | - | ||
298 | - size_t batch_size; //declare a variable to store the size of the current batch | ||
299 | - size_t waves_processed = 0; //initialize the number of waves processed to zero | ||
300 | - while(waves_processed < w_array.size()){ //while there are still waves to be processed | ||
301 | - batch_size = min<size_t>(max_batch, w_array.size() - waves_processed); //process either a whole batch, or whatever is left | ||
302 | - batch_bytes = batch_size * sizeof(stim::scalarwave<T>); | ||
303 | - HANDLE_ERROR(cudaMemcpy(dev_w, &w_array[waves_processed], batch_bytes, cudaMemcpyHostToDevice)); //copy the plane waves into global memory | ||
304 | - cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(dev_F, N, dev_x, dev_y, dev_z, dev_w, batch_size); //call the kernel | ||
305 | - waves_processed += batch_size; //increment the counter indicating how many waves have been processed | ||
306 | - } | 306 | + gpu_scalarwaves(dev_F, N, dev_x, dev_y, dev_z, dev_W, W.size()); |
307 | 307 | ||
308 | cudaMemcpy(F, dev_F, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory | 308 | cudaMemcpy(F, dev_F, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory |
309 | 309 | ||
@@ -311,15 +311,25 @@ void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::v | @@ -311,15 +311,25 @@ void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::v | ||
311 | if(y != NULL) cudaFree(dev_y); | 311 | if(y != NULL) cudaFree(dev_y); |
312 | if(z != NULL) cudaFree(dev_z); | 312 | if(z != NULL) cudaFree(dev_z); |
313 | cudaFree(dev_F); | 313 | cudaFree(dev_F); |
314 | - cudaFree(dev_w); | 314 | +#else |
315 | + memset(F, 0, N * sizeof(stim::complex<T>)); | ||
316 | + T px, py, pz; | ||
317 | + for(size_t i = 0; i < N; i++){ // for each element in the array | ||
318 | + (x == NULL) ? px = 0 : px = x[i]; // test for NULL values | ||
319 | + (y == NULL) ? py = 0 : py = y[i]; | ||
320 | + (z == NULL) ? pz = 0 : pz = z[i]; | ||
315 | 321 | ||
322 | + for(size_t s = 0; s < S; s++){ | ||
323 | + F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point | ||
324 | + } | ||
325 | + } | ||
316 | #endif | 326 | #endif |
317 | } | 327 | } |
318 | 328 | ||
319 | template<typename T> | 329 | template<typename T> |
320 | void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){ | 330 | void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){ |
321 | std::vector< stim::scalarwave<T> > w_array(1, w); | 331 | std::vector< stim::scalarwave<T> > w_array(1, w); |
322 | - cpu_sum_scalarwaves(F, N, x, y, z, w_array); | 332 | + cpu_scalarwaves(F, N, x, y, z, w_array); |
323 | } | 333 | } |
324 | 334 | ||
325 | 335 | ||
@@ -331,7 +341,7 @@ void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scala | @@ -331,7 +341,7 @@ void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scala | ||
331 | /// @param A is the list of amplitudes for each wave | 341 | /// @param A is the list of amplitudes for each wave |
332 | /// @param S is the list of propagation directions for each wave | 342 | /// @param S is the list of propagation directions for each wave |
333 | template<typename T> | 343 | template<typename T> |
334 | -CUDA_CALLABLE stim::complex<T> sum_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave<T> > W){ | 344 | +CUDA_CALLABLE stim::complex<T> cpu_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave<T> > W){ |
335 | size_t N = W.size(); //get the number of plane wave samples | 345 | size_t N = W.size(); //get the number of plane wave samples |
336 | stim::complex<T> field(0, 0); //initialize the field to zero (0) | 346 | stim::complex<T> field(0, 0); //initialize the field to zero (0) |
337 | stim::vec3<T> k; //allocate space for the direction vector | 347 | stim::vec3<T> k; //allocate space for the direction vector |