Commit 8309b07a16caaadfd4bc85bfbebdf7041a63f0e0

Authored by David Mayerich
1 parent 31262e83

fixed some vec3 errors

stim/math/complex.h
@@ -11,6 +11,7 @@ @@ -11,6 +11,7 @@
11 11
12 namespace stim 12 namespace stim
13 { 13 {
  14 + enum complexComponentType {complexReal, complexImaginary, complexMag};
14 15
15 template <class T> 16 template <class T>
16 struct complex 17 struct complex
stim/math/plane_old.h deleted
1 -#ifndef RTS_PLANE_H  
2 -#define RTS_PLANE_H  
3 -  
4 -#include <iostream>  
5 -#include <stim/math/vector.h>  
6 -#include "rts/cuda/callable.h"  
7 -  
8 -  
9 -namespace stim{  
10 -template <typename T, int D> class plane;  
11 -}  
12 -  
13 -template <typename T, int D>  
14 -CUDA_CALLABLE stim::plane<T, D> operator-(stim::plane<T, D> v);  
15 -  
16 -namespace stim{  
17 -  
18 -template <class T, int D = 3>  
19 -class plane{  
20 -  
21 - //a plane is defined by a point and a normal  
22 -  
23 -private:  
24 -  
25 - vec<T, D> P; //point on the plane  
26 - vec<T, D> N; //plane normal  
27 -  
28 - CUDA_CALLABLE void init(){  
29 - P = vec<T, D>(0, 0, 0);  
30 - N = vec<T, D>(0, 0, 1);  
31 - }  
32 -  
33 -  
34 -public:  
35 -  
36 - //default constructor  
37 - CUDA_CALLABLE plane(){  
38 - init();  
39 - }  
40 -  
41 - CUDA_CALLABLE plane(vec<T, D> n, vec<T, D> p = vec<T, D>(0, 0, 0)){  
42 - P = p;  
43 - N = n.norm();  
44 - }  
45 -  
46 - CUDA_CALLABLE plane(T z_pos){  
47 - init();  
48 - P[2] = z_pos;  
49 - }  
50 -  
51 - //create a plane from three points (a triangle)  
52 - CUDA_CALLABLE plane(vec<T, D> a, vec<T, D> b, vec<T, D> c){  
53 - P = c;  
54 - N = (c - a).cross(b - a);  
55 - if(N.len() == 0) //handle the degenerate case when two vectors are the same, N = 0  
56 - N = 0;  
57 - else  
58 - N = N.norm();  
59 - }  
60 -  
61 - template< typename U >  
62 - CUDA_CALLABLE operator plane<U, D>(){  
63 -  
64 - plane<U, D> result(N, P);  
65 - return result;  
66 - }  
67 -  
68 - CUDA_CALLABLE vec<T, D> norm(){  
69 - return N;  
70 - }  
71 -  
72 - CUDA_CALLABLE vec<T, D> p(){  
73 - return P;  
74 - }  
75 -  
76 - //flip the plane front-to-back  
77 - CUDA_CALLABLE plane<T, D> flip(){  
78 - plane<T, D> result = *this;  
79 - result.N = -result.N;  
80 - return result;  
81 - }  
82 -  
83 - //determines how a vector v intersects the plane (1 = intersects front, 0 = within plane, -1 = intersects back)  
84 - CUDA_CALLABLE int face(vec<T, D> v){  
85 -  
86 - T dprod = v.dot(N); //get the dot product between v and N  
87 -  
88 - //conditional returns the appropriate value  
89 - if(dprod < 0)  
90 - return 1;  
91 - else if(dprod > 0)  
92 - return -1;  
93 - else  
94 - return 0;  
95 - }  
96 -  
97 - //determine on which side of the plane a point lies (1 = front, 0 = on the plane, -1 = back)  
98 - CUDA_CALLABLE int side(vec<T, D> p){  
99 -  
100 - vec<T, D> v = p - P; //get the vector from P to the query point p  
101 -  
102 - return face(v);  
103 - }  
104 -  
105 - //compute the component of v that is perpendicular to the plane  
106 - CUDA_CALLABLE vec<T, D> perpendicular(vec<T, D> v){  
107 - return N * v.dot(N);  
108 - }  
109 -  
110 - //compute the projection of v in the plane  
111 - CUDA_CALLABLE vec<T, D> parallel(vec<T, D> v){  
112 - return v - perpendicular(v);  
113 - }  
114 -  
115 - CUDA_CALLABLE void decompose(vec<T, D> v, vec<T, D>& para, vec<T, D>& perp){  
116 - perp = N * v.dot(N);  
117 - para = v - perp;  
118 - }  
119 -  
120 - //get both the parallel and perpendicular components of a vector v w.r.t. the plane  
121 - CUDA_CALLABLE void project(vec<T, D> v, vec<T, D> &v_par, vec<T, D> &v_perp){  
122 -  
123 - v_perp = v.dot(N);  
124 - v_par = v - v_perp;  
125 - }  
126 -  
127 - //compute the reflection of v off of the plane  
128 - CUDA_CALLABLE vec<T, D> reflect(vec<T, D> v){  
129 -  
130 - //compute the reflection using N_prime as the plane normal  
131 - vec<T, D> par = parallel(v);  
132 - vec<T, D> r = (-v) + par * 2;  
133 -  
134 - /*std::cout<<"----------------REFLECT-----------------------------"<<std::endl;  
135 - std::cout<<str()<<std::endl;  
136 - std::cout<<"v: "<<v<<std::endl;  
137 - std::cout<<"r: "<<r<<std::endl;  
138 - std::cout<<"Perpendicular: "<<perpendicular(v)<<std::endl;  
139 - std::cout<<"Parallel: "<<par<<std::endl;*/  
140 - return r;  
141 -  
142 - }  
143 -  
144 - CUDA_CALLABLE rts::plane<T, D> operator-()  
145 - {  
146 - rts::plane<T, D> p = *this;  
147 -  
148 - //negate the normal vector  
149 - p.N = -p.N;  
150 -  
151 - return p;  
152 - }  
153 -  
154 - //output a string  
155 - std::string str(){  
156 - std::stringstream ss;  
157 - ss<<"P: "<<P<<std::endl;  
158 - ss<<"N: "<<N;  
159 - return ss.str();  
160 - }  
161 -  
162 - ///////Friendship  
163 - //friend CUDA_CALLABLE rts::plane<T, D> operator- <> (rts::plane<T, D> v);  
164 -  
165 -  
166 -  
167 -};  
168 -  
169 -}  
170 -  
171 -//arithmetic operators  
172 -  
173 -//negative operator flips the plane (front to back)  
174 -//template <typename T, int D>  
175 -  
176 -  
177 -  
178 -  
179 -#endif  
stim/math/quad.h deleted
1 -#ifndef RTS_QUAD_H  
2 -#define RTS_QUAD_H  
3 -  
4 -//enable CUDA_CALLABLE macro  
5 -#include <stim/cuda/callable.h>  
6 -#include <stim/math/vector.h>  
7 -#include <stim/math/triangle.h>  
8 -#include <stim/math/quaternion.h>  
9 -#include <iostream>  
10 -#include <iomanip>  
11 -#include <algorithm>  
12 -  
13 -namespace stim{  
14 -  
15 -//template for a quadangle class in ND space  
16 -template <class T, int N = 3>  
17 -struct quad  
18 -{  
19 - /*  
20 - B------------------>C  
21 - ^ ^  
22 - | |  
23 - Y |  
24 - | |  
25 - | |  
26 - A---------X-------->O  
27 - */  
28 -  
29 - /*T A[N];  
30 - T B[N];  
31 - T C[N];*/  
32 -  
33 - rts::vec<T, N> A;  
34 - rts::vec<T, N> X;  
35 - rts::vec<T, N> Y;  
36 -  
37 -  
38 - CUDA_CALLABLE quad()  
39 - {  
40 -  
41 - }  
42 -  
43 - CUDA_CALLABLE quad(vec<T, N> a, vec<T, N> b, vec<T, N> c)  
44 - {  
45 -  
46 - A = a;  
47 - Y = b - a;  
48 - X = c - a - Y;  
49 -  
50 - }  
51 -  
52 - /*******************************************************************  
53 - Constructor - create a quad from a position, normal, and rotation  
54 - *******************************************************************/  
55 - CUDA_CALLABLE quad(rts::vec<T, N> c, rts::vec<T, N> normal, T width, T height, T theta)  
56 - {  
57 -  
58 - //compute the X direction - start along world-space X  
59 - Y = rts::vec<T, N>(0, 1, 0);  
60 - if(Y == normal)  
61 - Y = rts::vec<T, N>(0, 0, 1);  
62 -  
63 - X = Y.cross(normal).norm();  
64 -  
65 - std::cout<<X<<std::endl;  
66 -  
67 - //rotate the X axis by theta radians  
68 - rts::quaternion<T> q;  
69 - q.CreateRotation(theta, normal);  
70 - X = q.toMatrix3() * X;  
71 - Y = normal.cross(X);  
72 -  
73 - //normalize everything  
74 - X = X.norm();  
75 - Y = Y.norm();  
76 -  
77 - //scale to match the quad width and height  
78 - X = X * width;  
79 - Y = Y * height;  
80 -  
81 - //set the corner of the plane  
82 - A = c - X * 0.5f - Y * 0.5f;  
83 -  
84 - std::cout<<X<<std::endl;  
85 - }  
86 -  
87 - //boolean comparison  
88 - bool operator==(const quad<T, N> & rhs)  
89 - {  
90 - if(A == rhs.A && X == rhs.X && Y == rhs.Y)  
91 - return true;  
92 - else  
93 - return false;  
94 - }  
95 -  
96 - /*******************************************  
97 - Return the normal for the quad  
98 - *******************************************/  
99 - CUDA_CALLABLE rts::vec<T, N> n()  
100 - {  
101 - return (X.cross(Y)).norm();  
102 - }  
103 -  
104 - CUDA_CALLABLE rts::vec<T, N> p(T a, T b)  
105 - {  
106 - rts::vec<T, N> result;  
107 - //given the two parameters a, b = [0 1], returns the position in world space  
108 - result = A + X * a + Y * b;  
109 -  
110 - return result;  
111 - }  
112 -  
113 - CUDA_CALLABLE rts::vec<T, N> operator()(T a, T b)  
114 - {  
115 - return p(a, b);  
116 - }  
117 -  
118 - std::string str()  
119 - {  
120 - std::stringstream ss;  
121 -  
122 - ss<<std::left<<"B="<<setfill('-')<<setw(20)<<A + Y<<">"<<"C="<<A + Y + X<<std::endl;  
123 - ss<<setfill(' ')<<setw(23)<<"|"<<"|"<<std::endl<<setw(23)<<"|"<<"|"<<std::endl;  
124 - ss<<std::left<<"A="<<setfill('-')<<setw(20)<<A<<">"<<"D="<<A + X;  
125 -  
126 - return ss.str();  
127 -  
128 - }  
129 -  
130 - CUDA_CALLABLE quad<T, N> operator*(T rhs)  
131 - {  
132 - //scales the plane by a scalar value  
133 -  
134 - //compute the center point  
135 - rts::vec<T, N> c = A + X*0.5f + Y*0.5f;  
136 -  
137 - //create the new quadangle  
138 - quad<T, N> result;  
139 - result.X = X * rhs;  
140 - result.Y = Y * rhs;  
141 - result.A = c - result.X*0.5f - result.Y*0.5f;  
142 -  
143 - return result;  
144 -  
145 - }  
146 -  
147 - CUDA_CALLABLE T dist(vec<T, N> p)  
148 - {  
149 - //compute the distance between a point and this quad  
150 -  
151 - //first break the quad up into two triangles  
152 - triangle<T, N> T0(A, A+X, A+Y);  
153 - triangle<T, N> T1(A+X+Y, A+X, A+Y);  
154 -  
155 -  
156 - T d0 = T0.dist(p);  
157 - T d1 = T1.dist(p);  
158 -  
159 - if(d0 < d1)  
160 - return d0;  
161 - else  
162 - return d1;  
163 - }  
164 -  
165 - CUDA_CALLABLE T dist_max(vec<T, N> p)  
166 - {  
167 - T da = (A - p).len();  
168 - T db = (A+X - p).len();  
169 - T dc = (A+Y - p).len();  
170 - T dd = (A+X+Y - p).len();  
171 -  
172 - return std::max( da, std::max(db, std::max(dc, dd) ) );  
173 - }  
174 -};  
175 -  
176 -} //end namespace rts  
177 -  
178 -template <typename T, int N>  
179 -std::ostream& operator<<(std::ostream& os, rts::quad<T, N> R)  
180 -{  
181 - os<<R.str();  
182 - return os;  
183 -}  
184 -  
185 -  
186 -#endif  
@@ -28,13 +28,10 @@ class rect : plane <T> @@ -28,13 +28,10 @@ class rect : plane <T>
28 O---------X---------> 28 O---------X--------->
29 */ 29 */
30 30
31 -private:  
32 -  
33 - stim::vec<T> X;  
34 - stim::vec<T> Y;  
35 -  
36 - 31 +protected:
37 32
  33 + stim::vec3<T> X;
  34 + stim::vec3<T> Y;
38 35
39 public: 36 public:
40 37
@@ -65,7 +62,7 @@ public: @@ -65,7 +62,7 @@ public:
65 ///create a rectangle from a center point, normal 62 ///create a rectangle from a center point, normal
66 ///@param c: x,y,z location of the center. 63 ///@param c: x,y,z location of the center.
67 ///@param n: x,y,z direction of the normal. 64 ///@param n: x,y,z direction of the normal.
68 - CUDA_CALLABLE rect(vec<T> c, vec<T> n = vec<T>(0, 0, 1)) 65 + CUDA_CALLABLE rect(vec3<T> c, vec3<T> n = vec3<T>(0, 0, 1))
69 : plane<T>() 66 : plane<T>()
70 { 67 {
71 init(); //start with the default setting 68 init(); //start with the default setting
@@ -76,7 +73,7 @@ public: @@ -76,7 +73,7 @@ public:
76 ///@param c: x,y,z location of the center. 73 ///@param c: x,y,z location of the center.
77 ///@param s: size of the rectangle. 74 ///@param s: size of the rectangle.
78 ///@param n: x,y,z direction of the normal. 75 ///@param n: x,y,z direction of the normal.
79 - CUDA_CALLABLE rect(vec<T> c, T s, vec<T> n = vec<T>(0, 0, 1)) 76 + CUDA_CALLABLE rect(vec3<T> c, T s, vec3<T> n = vec3<T>(0, 0, 1))
80 : plane<T>() 77 : plane<T>()
81 { 78 {
82 init(); //start with the default setting 79 init(); //start with the default setting
@@ -89,7 +86,7 @@ public: @@ -89,7 +86,7 @@ public:
89 ///@param center: x,y,z location of the center. 86 ///@param center: x,y,z location of the center.
90 ///@param directionX: u,v,w direction of the X vector. 87 ///@param directionX: u,v,w direction of the X vector.
91 ///@param directionY: u,v,w direction of the Y vector. 88 ///@param directionY: u,v,w direction of the Y vector.
92 - CUDA_CALLABLE rect(vec<T> center, vec<T> directionX, vec<T> directionY ) 89 + CUDA_CALLABLE rect(vec3<T> center, vec3<T> directionX, vec3<T> directionY )
93 : plane<T>((directionX.cross(directionY)).norm(),center) 90 : plane<T>((directionX.cross(directionY)).norm(),center)
94 { 91 {
95 X = directionX; 92 X = directionX;
@@ -101,7 +98,7 @@ public: @@ -101,7 +98,7 @@ public:
101 ///@param center: x,y,z location of the center. 98 ///@param center: x,y,z location of the center.
102 ///@param directionX: u,v,w direction of the X vector. 99 ///@param directionX: u,v,w direction of the X vector.
103 ///@param directionY: u,v,w direction of the Y vector. 100 ///@param directionY: u,v,w direction of the Y vector.
104 - CUDA_CALLABLE rect(T size, vec<T> center, vec<T> directionX, vec<T> directionY ) 101 + CUDA_CALLABLE rect(T size, vec3<T> center, vec3<T> directionX, vec3<T> directionY )
105 : plane<T>((directionX.cross(directionY)).norm(),center) 102 : plane<T>((directionX.cross(directionY)).norm(),center)
106 { 103 {
107 X = directionX; 104 X = directionX;
@@ -114,7 +111,7 @@ public: @@ -114,7 +111,7 @@ public:
114 ///@param center: x,y,z location of the center. 111 ///@param center: x,y,z location of the center.
115 ///@param directionX: u,v,w direction of the X vector. 112 ///@param directionX: u,v,w direction of the X vector.
116 ///@param directionY: u,v,w direction of the Y vector. 113 ///@param directionY: u,v,w direction of the Y vector.
117 - CUDA_CALLABLE rect(vec<T> size, vec<T> center, vec<T> directionX, vec<T> directionY) 114 + CUDA_CALLABLE rect(vec3<T> size, vec3<T> center, vec3<T> directionX, vec3<T> directionY)
118 : plane<T>((directionX.cross(directionY)).norm(), center) 115 : plane<T>((directionX.cross(directionY)).norm(), center)
119 { 116 {
120 X = directionX; 117 X = directionX;
@@ -138,7 +135,7 @@ public: @@ -138,7 +135,7 @@ public:
138 135
139 ///@param n; vector with the normal. 136 ///@param n; vector with the normal.
140 ///Orients the rectangle along the normal n. 137 ///Orients the rectangle along the normal n.
141 - CUDA_CALLABLE void normal(vec<T> n) 138 + CUDA_CALLABLE void normal(vec3<T> n)
142 { 139 {
143 //orient the rectangle along the specified normal 140 //orient the rectangle along the specified normal
144 rotate(n, X, Y); 141 rotate(n, X, Y);
@@ -147,8 +144,8 @@ public: @@ -147,8 +144,8 @@ public:
147 ///general init method that sets a general rectangle. 144 ///general init method that sets a general rectangle.
148 CUDA_CALLABLE void init() 145 CUDA_CALLABLE void init()
149 { 146 {
150 - X = vec<T>(1, 0, 0);  
151 - Y = vec<T>(0, 1, 0); 147 + X = vec3<T>(1, 0, 0);
  148 + Y = vec3<T>(0, 1, 0);
152 } 149 }
153 150
154 //boolean comparison 151 //boolean comparison
@@ -162,18 +159,18 @@ public: @@ -162,18 +159,18 @@ public:
162 159
163 160
164 //get the world space value given the planar coordinates a, b in [0, 1] 161 //get the world space value given the planar coordinates a, b in [0, 1]
165 - CUDA_CALLABLE stim::vec<T> p(T a, T b) 162 + CUDA_CALLABLE stim::vec3<T> p(T a, T b)
166 { 163 {
167 - stim::vec<T> result; 164 + stim::vec3<T> result;
168 //given the two parameters a, b = [0 1], returns the position in world space 165 //given the two parameters a, b = [0 1], returns the position in world space
169 - vec<T> A = this->P - X * (T)0.5 - Y * (T)0.5; 166 + vec3<T> A = this->P - X * (T)0.5 - Y * (T)0.5;
170 result = A + X * a + Y * b; 167 result = A + X * a + Y * b;
171 168
172 return result; 169 return result;
173 } 170 }
174 171
175 //parenthesis operator returns the world space given rectangular coordinates a and b in [0 1] 172 //parenthesis operator returns the world space given rectangular coordinates a and b in [0 1]
176 - CUDA_CALLABLE stim::vec<T> operator()(T a, T b) 173 + CUDA_CALLABLE stim::vec3<T> operator()(T a, T b)
177 { 174 {
178 return p(a, b); 175 return p(a, b);
179 } 176 }
@@ -181,12 +178,12 @@ public: @@ -181,12 +178,12 @@ public:
181 std::string str() 178 std::string str()
182 { 179 {
183 std::stringstream ss; 180 std::stringstream ss;
184 - vec<T> A = P - X * (T)0.5 - Y * (T)0.5; 181 + vec3<T> A = P - X * (T)0.5 - Y * (T)0.5;
185 ss<<std::left<<"B="<<std::setfill('-')<<std::setw(20)<<A + Y<<">"<<"C="<<A + Y + X<<std::endl; 182 ss<<std::left<<"B="<<std::setfill('-')<<std::setw(20)<<A + Y<<">"<<"C="<<A + Y + X<<std::endl;
186 ss<<std::setfill(' ')<<std::setw(23)<<"|"<<"|"<<std::endl<<std::setw(23)<<"|"<<"|"<<std::endl; 183 ss<<std::setfill(' ')<<std::setw(23)<<"|"<<"|"<<std::endl<<std::setw(23)<<"|"<<"|"<<std::endl;
187 ss<<std::left<<"A="<<std::setfill('-')<<std::setw(20)<<A<<">"<<"D="<<A + X; 184 ss<<std::left<<"A="<<std::setfill('-')<<std::setw(20)<<A<<">"<<"D="<<A + X;
188 185
189 - return ss.str(); 186 + return ss.str();
190 187
191 } 188 }
192 189
@@ -205,11 +202,11 @@ public: @@ -205,11 +202,11 @@ public:
205 202
206 ///computes the distance between the specified point and this rectangle. 203 ///computes the distance between the specified point and this rectangle.
207 ///@param p: x, y, z coordinates of the point to calculate distance to. 204 ///@param p: x, y, z coordinates of the point to calculate distance to.
208 - CUDA_CALLABLE T dist(vec<T> p) 205 + CUDA_CALLABLE T dist(vec3<T> p)
209 { 206 {
210 //compute the distance between a point and this rect 207 //compute the distance between a point and this rect
211 208
212 - vec<T> A = P - X * (T)0.5 - Y * (T)0.5; 209 + vec3<T> A = P - X * (T)0.5 - Y * (T)0.5;
213 210
214 //first break the rect up into two triangles 211 //first break the rect up into two triangles
215 triangle<T> T0(A, A+X, A+Y); 212 triangle<T> T0(A, A+X, A+Y);
@@ -225,16 +222,16 @@ public: @@ -225,16 +222,16 @@ public:
225 return d1; 222 return d1;
226 } 223 }
227 224
228 - CUDA_CALLABLE T center(vec<T> p) 225 + CUDA_CALLABLE T center(vec3<T> p)
229 { 226 {
230 this->P = p; 227 this->P = p;
231 } 228 }
232 229
233 ///Returns the maximum distance of the rectangle from a point p to the sides of the rectangle. 230 ///Returns the maximum distance of the rectangle from a point p to the sides of the rectangle.
234 ///@param p: x, y, z point. 231 ///@param p: x, y, z point.
235 - CUDA_CALLABLE T dist_max(vec<T> p) 232 + CUDA_CALLABLE T dist_max(vec3<T> p)
236 { 233 {
237 - vec<T> A = P - X * (T)0.5 - Y * (T)0.5; 234 + vec3<T> A = P - X * (T)0.5 - Y * (T)0.5;
238 T da = (A - p).len(); 235 T da = (A - p).len();
239 T db = (A+X - p).len(); 236 T db = (A+X - p).len();
240 T dc = (A+Y - p).len(); 237 T dc = (A+Y - p).len();
@@ -242,4 +242,11 @@ stim::vec3<T> operator*(T lhs, stim::vec3<T> rhs){ @@ -242,4 +242,11 @@ stim::vec3<T> operator*(T lhs, stim::vec3<T> rhs){
242 return rhs * lhs; 242 return rhs * lhs;
243 } 243 }
244 244
  245 +//stream operator
  246 +template<typename T>
  247 +std::ostream& operator<<(std::ostream& os, stim::vec3<T> const& rhs){
  248 + os<<rhs.str();
  249 + return os;
  250 +}
  251 +
245 #endif 252 #endif
246 \ No newline at end of file 253 \ No newline at end of file
stim/math/vector.h
@@ -5,8 +5,9 @@ @@ -5,8 +5,9 @@
5 #include <cmath> 5 #include <cmath>
6 #include <sstream> 6 #include <sstream>
7 #include <vector> 7 #include <vector>
8 - 8 +
9 #include <stim/cuda/cudatools/callable.h> 9 #include <stim/cuda/cudatools/callable.h>
  10 +#include <stim/math/vec3.h>
10 11
11 namespace stim 12 namespace stim
12 { 13 {
@@ -70,8 +71,8 @@ struct vec : public std::vector<T> @@ -70,8 +71,8 @@ struct vec : public std::vector<T>
70 size_t N = other.size(); 71 size_t N = other.size();
71 resize(N); //resize the current vector to match the copy 72 resize(N); //resize the current vector to match the copy
72 for(size_t i=0; i<N; i++){ //copy each element 73 for(size_t i=0; i<N; i++){ //copy each element
73 - at(i) = other[i];  
74 - } 74 + at(i) = other[i];
  75 + }
75 } 76 }
76 77
77 //I'm not sure what these were doing here. 78 //I'm not sure what these were doing here.
@@ -318,8 +319,8 @@ struct vec : public std::vector<T> @@ -318,8 +319,8 @@ struct vec : public std::vector<T>
318 } 319 }
319 320
320 /// Cast to a vec3 321 /// Cast to a vec3
321 - operator vec3<T>(){  
322 - vec3<T> r; 322 + operator stim::vec3<T>(){
  323 + stim::vec3<T> r;
323 size_t N = std::min<size_t>(size(), 3); 324 size_t N = std::min<size_t>(size(), 3);
324 for(size_t i = 0; i < N; i++) 325 for(size_t i = 0; i < N; i++)
325 r[i] = at(i); 326 r[i] = at(i);
1 #ifndef STIM_MIE_H 1 #ifndef STIM_MIE_H
2 #define STIM_MIE_H 2 #define STIM_MIE_H
  3 +#include <boost/math/special_functions/bessel.hpp>
3 4
4 #include "scalarwave.h" 5 #include "scalarwave.h"
5 #include "../math/bessel.h" 6 #include "../math/bessel.h"
@@ -43,7 +44,6 @@ void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){ @@ -43,7 +44,6 @@ void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){
43 numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l]; 44 numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l];
44 denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n; 45 denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
45 B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; 46 B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
46 - std::cout<<B[l]<<std::endl;  
47 } 47 }
48 } 48 }
49 49
@@ -84,7 +84,7 @@ void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){ @@ -84,7 +84,7 @@ void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){
84 84
85 #define LOCAL_NL 16 85 #define LOCAL_NL 16
86 template<typename T> 86 template<typename T>
87 -__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, int Nl){ 87 +__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){
88 extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory 88 extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory
89 89
90 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array 90 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
@@ -96,14 +96,11 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* @@ -96,14 +96,11 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T*
96 96
97 T r = p.len(); //calculate the distance from the sphere 97 T r = p.len(); //calculate the distance from the sphere
98 if(r < a) return; //exit if the point is inside the sphere (we only calculate the external, scattered field) 98 if(r < a) return; //exit if the point is inside the sphere (we only calculate the external, scattered field)
99 - T k = W[0].kmag();  
100 - size_t NC = Nl + 1; //calculate the number of coefficients to be used  
101 - T kr = r * k; //calculate the thread value for k*r  
102 - T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT 99 + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT
103 size_t ij = (size_t) fij; //convert to an integral index 100 size_t ij = (size_t) fij; //convert to an integral index
104 T alpha = fij - ij; //calculate the fractional portion of the index 101 T alpha = fij - ij; //calculate the fractional portion of the index
105 - size_t n0j = ij * (NC); //start of the first entry in the LUT  
106 - size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT 102 + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT
  103 + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT
107 104
108 T cos_phi; 105 T cos_phi;
109 T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials 106 T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials
@@ -112,37 +109,36 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* @@ -112,37 +109,36 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T*
112 stim::complex<T> Ei = 0; //create a register to store the result 109 stim::complex<T> Ei = 0; //create a register to store the result
113 int l; 110 int l;
114 111
115 - stim::complex<T> hlBl[LOCAL_NL+1];  
116 - int shared_start = threadIdx.x * (Nl - LOCAL_NL); 112 + stim::complex<T> hlBl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed
  113 + int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops
117 114
118 - #pragma unroll LOCAL_NL+1 115 + #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers
119 for(l = 0; l <= LOCAL_NL; l++) 116 for(l = 0; l <= LOCAL_NL; l++)
120 hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha ); 117 hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
121 118
122 - for(l = LOCAL_NL+1; l <= Nl; l++) 119 + for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory
123 shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha ); 120 shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
124 121
125 - for(size_t w = 0; w < nW; w++){ 122 + for(size_t w = 0; w < nW; w++){ //for each plane wave
126 cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere 123 cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere
127 - Pl_2 = 1; 124 + Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
128 Pl_1 = cos_phi; 125 Pl_1 = cos_phi;
129 - Ei += W[w].E() * hlBl[0] * Pl_2; 126 + Ei += W[w].E() * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation
130 Ei += W[w].E() * hlBl[1] * Pl_1; 127 Ei += W[w].E() * hlBl[1] * Pl_1;
131 128
132 - #pragma unroll LOCAL_NL-1 129 + #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
133 for(l = 2; l <= LOCAL_NL; l++){ 130 for(l = 2; l <= LOCAL_NL; l++){
134 - Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);  
135 - Ei += W[w].E() * hlBl[l] * Pl; 131 + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
  132 + Ei += W[w].E() * hlBl[l] * Pl; //calculate and sum the current field order
136 Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 133 Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
137 Pl_1 = Pl; 134 Pl_1 = Pl;
138 } 135 }
139 136
140 - for(l = LOCAL_NL+1; l <= Nl; l++){  
141 - Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);  
142 - Ei += W[w].E() * shared_hB[shared_start + (l - (LOCAL_NL+1))] * Pl;  
143 - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1  
144 - Pl_1 = Pl;  
145 - 137 + for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers)
  138 + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs
  139 + Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl;
  140 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  141 + Pl_1 = Pl;
146 } 142 }
147 } 143 }
148 E[i] += Ei; //copy the result to device memory 144 E[i] += Ei; //copy the result to device memory
@@ -152,10 +148,10 @@ template<typename T> @@ -152,10 +148,10 @@ template<typename T>
152 void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ 148 void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
153 149
154 size_t max_shared_mem = stim::sharedMemPerBlock(); 150 size_t max_shared_mem = stim::sharedMemPerBlock();
155 - int hBl_array = sizeof(stim::complex<T>) * (Nl + 1); 151 + size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
156 std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; 152 std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
157 std::cout<<"shared memory: "<<max_shared_mem<<std::endl; 153 std::cout<<"shared memory: "<<max_shared_mem<<std::endl;
158 - int threads = (max_shared_mem / hBl_array) / 32 * 32; 154 + int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);
159 std::cout<<"threads per block: "<<threads<<std::endl; 155 std::cout<<"threads per block: "<<threads<<std::endl;
160 dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks 156 dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
161 157
@@ -164,7 +160,6 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti @@ -164,7 +160,6 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti
164 else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate 160 else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate
165 std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; 161 std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
166 cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel 162 cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel
167 -  
168 } 163 }
169 164
170 template<typename T> 165 template<typename T>
@@ -261,16 +256,19 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std @@ -261,16 +256,19 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std
261 exit(1); 256 exit(1);
262 } 257 }
263 258
  259 + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility
  260 + i_max--;
264 T r_min, r_max; //allocate space to store the minimum and maximum values 261 T r_min, r_max; //allocate space to store the minimum and maximum values
265 HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU 262 HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
266 HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); 263 HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
267 264
  265 + r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere)
268 266
269 //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r 267 //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r
270 size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); 268 size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1);
271 269
272 - T kr_min = k * r_min;  
273 - T kr_max = k * r_max; 270 + //T kr_min = k * r_min;
  271 + //T kr_max = k * r_max;
274 272
275 //temporary variables 273 //temporary variables
276 double vm; //allocate space to store the return values for the bessel function calculation 274 double vm; //allocate space to store the return values for the bessel function calculation
@@ -281,27 +279,29 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std @@ -281,27 +279,29 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
281 279
282 size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut; 280 size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;
283 stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table 281 stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table
284 - T dkr = (kr_max - kr_min) / (N_hB_lut-1); //distance between values in the LUT 282 + T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT
285 std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl; 283 std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl;
286 stim::complex<T> hl; 284 stim::complex<T> hl;
287 - for(size_t kri = 0; kri < N_hB_lut; kri++){ //for each value in the LUT  
288 - stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] 285 + for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT
  286 + stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
289 for(size_t l = 0; l <= Nl; l++){ //for each order 287 for(size_t l = 0; l <= Nl; l++){ //for each order
290 hl.r = (T)jv[l]; 288 hl.r = (T)jv[l];
291 hl.i = (T)yv[l]; 289 hl.i = (T)yv[l];
292 290
293 - hB_lut[kri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result 291 + hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result
  292 + //std::cout<<hB_lut[ri * (Nl + 1) + l]<<std::endl;
294 } 293 }
295 } 294 }
296 -  
297 - //stim::cpu2image<T>(hankel_lut, "hankel.bmp", Nl+1, Nlut_j, stim::cmBrewer); 295 + T* real_lut = (T*) malloc(hB_bytes/2);
  296 + stim::real(real_lut, hB_lut, N_hB_lut);
  297 + stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer);
298 298
299 //Allocate device memory and copy everything to the GPU 299 //Allocate device memory and copy everything to the GPU
300 stim::complex<T>* dev_hB_lut; 300 stim::complex<T>* dev_hB_lut;
301 HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); 301 HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
302 HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); 302 HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
303 303
304 - gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, kr_min, dkr, N_hB_lut, Nl); 304 + gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl);
305 305
306 cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory 306 cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
307 307
@@ -349,9 +349,90 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std @@ -349,9 +349,90 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
349 } 349 }
350 350
351 template<typename T> 351 template<typename T>
352 -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){ 352 +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){
353 std::vector< stim::scalarwave<T> > W(1, w); 353 std::vector< stim::scalarwave<T> > W(1, w);
354 - cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n); 354 + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing);
  355 +}
  356 +
  357 +template<typename T>
  358 +__global__ void cuda_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, int Nl){
  359 + extern __shared__ stim::complex<T> shared_jA[]; //declare the list of waves in shared memory
  360 +
  361 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  362 + if(i >= N) return; //exit if this thread is outside the array
  363 + stim::vec3<T> p;
  364 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  365 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  366 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  367 +
  368 + T r = p.len(); //calculate the distance from the sphere
  369 + if(r > a) return; //exit if the point is inside the sphere (we only calculate the internal field)
  370 + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT
  371 + size_t ij = (size_t) fij; //convert to an integral index
  372 + T alpha = fij - ij; //calculate the fractional portion of the index
  373 + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT
  374 + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT
  375 +
  376 + T cos_phi;
  377 + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials
  378 +
  379 + stim::complex<T> jAl;
  380 + stim::complex<T> Ei = 0; //create a register to store the result
  381 + int l;
  382 +
  383 + stim::complex<T> jlAl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed
  384 + int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops
  385 +
  386 + #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers
  387 + for(l = 0; l <= LOCAL_NL; l++)
  388 + jlAl[l] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha );
  389 +
  390 + for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory
  391 + shared_jA[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha );
  392 +
  393 + for(size_t w = 0; w < nW; w++){ //for each plane wave
  394 + if(r == 0) cos_phi = 0;
  395 + else
  396 + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere
  397 + Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
  398 + Pl_1 = cos_phi;
  399 + Ei += W[w].E() * jlAl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation
  400 + Ei += W[w].E() * jlAl[1] * Pl_1;
  401 +
  402 + #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
  403 + for(l = 2; l <= LOCAL_NL; l++){
  404 + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
  405 + Ei += W[w].E() * jlAl[l] * Pl; //calculate and sum the current field order
  406 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  407 + Pl_1 = Pl;
  408 + }
  409 +
  410 + for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers)
  411 + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs
  412 + Ei += W[w].E() * shared_jA[shared_start + l - LOCAL_NL - 1] * Pl;
  413 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  414 + Pl_1 = Pl;
  415 + }
  416 + }
  417 + E[i] = Ei; //copy the result to device memory
  418 +}
  419 +
  420 +template<typename T>
  421 +void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, size_t Nl){
  422 +
  423 + size_t max_shared_mem = stim::sharedMemPerBlock();
  424 + size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
  425 + std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
  426 + std::cout<<"shared memory: "<<max_shared_mem<<std::endl;
  427 + int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);
  428 + std::cout<<"threads per block: "<<threads<<std::endl;
  429 + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  430 +
  431 + size_t shared_mem;
  432 + if(Nl <= LOCAL_NL) shared_mem = 0;
  433 + else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate
  434 + std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
  435 + cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel
355 } 436 }
356 437
357 /// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere 438 /// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere
@@ -365,18 +446,122 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, sti @@ -365,18 +446,122 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, sti
365 /// @param a is the radius of the sphere 446 /// @param a is the radius of the sphere
366 /// @param n is the complex refractive index of the sphere 447 /// @param n is the complex refractive index of the sphere
367 template<typename T> 448 template<typename T>
368 -void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n){  
369 -  
370 - //calculate the necessary number of orders required to represent the scattered field 449 +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, T r_spacing = 0.1){
  450 +//calculate the necessary number of orders required to represent the scattered field
371 T k = W[0].kmag(); 451 T k = W[0].kmag();
372 452
373 - size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); 453 + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
  454 + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization)
374 std::cout<<"Nl: "<<Nl<<std::endl; 455 std::cout<<"Nl: "<<Nl<<std::endl;
375 456
376 //calculate the scattering coefficients for the sphere 457 //calculate the scattering coefficients for the sphere
377 stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients 458 stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
378 A_coefficients(A, a, k, n, Nl); 459 A_coefficients(A, a, k, n, Nl);
379 460
  461 +#ifdef CUDA_FOUND
  462 + stim::complex<T>* dev_E; //allocate space for the field
  463 + cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
  464 + cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
  465 + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
  466 +
  467 + // COORDINATES
  468 + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified)
  469 + if(x != NULL){
  470 + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
  471 + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
  472 + }
  473 + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified)
  474 + if(y != NULL){
  475 + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
  476 + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
  477 + }
  478 + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified)
  479 + if(z != NULL){
  480 + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
  481 + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
  482 + }
  483 +
  484 + // PLANE WAVES
  485 + stim::scalarwave<T>* dev_W; //allocate space and copy plane waves
  486 + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
  487 + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
  488 +
  489 + // BESSEL FUNCTION LOOK-UP TABLE
  490 + //calculate the distance from the sphere center
  491 + T* dev_r;
  492 + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
  493 +
  494 + int threads = stim::maxThreadsPerBlock();
  495 + dim3 blocks((unsigned)(N / threads + 1));
  496 + cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
  497 +
  498 + //Find the minimum and maximum values of r
  499 + cublasStatus_t stat;
  500 + cublasHandle_t handle;
  501 +
  502 + stat = cublasCreate(&handle); //create a cuBLAS handle
  503 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  504 + printf ("CUBLAS initialization failed\n");
  505 + exit(1);
  506 + }
  507 +
  508 + int i_min, i_max;
  509 + stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
  510 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  511 + printf ("CUBLAS Error: failed to calculate minimum r value.\n");
  512 + exit(1);
  513 + }
  514 + stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
  515 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  516 + printf ("CUBLAS Error: failed to calculate maximum r value.\n");
  517 + exit(1);
  518 + }
  519 +
  520 + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility
  521 + i_max--;
  522 + T r_min, r_max; //allocate space to store the minimum and maximum values
  523 + HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
  524 + HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
  525 +
  526 + r_max = min(r_max, a); //the internal field doesn't exist outside of the sphere
  527 +
  528 + size_t N_jA_lut = (size_t)((r_max - r_min) / r_spacing + 1);
  529 +
  530 + //temporary variables
  531 + double vm; //allocate space to store the return values for the bessel function calculation
  532 + stim::complex<double>* jv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  533 + stim::complex<double>* yv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  534 + stim::complex<double>* djv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  535 + stim::complex<double>* dyv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  536 +
  537 + size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut;
  538 + stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes); //pointer to the look-up table
  539 + T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT
  540 + std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl;
  541 + stim::complex<T> hl;
  542 + stim::complex<double> nd = (stim::complex<double>)n;
  543 + for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT
  544 + stim::cbessjyva_sph<double>(Nl, nd * k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
  545 + for(size_t l = 0; l <= Nl; l++){ //for each order
  546 + jA_lut[ri * (Nl + 1) + l] = (stim::complex<T>)(jv[l] * (stim::complex<double>)A[l]); //store the bessel function result
  547 + }
  548 + }
  549 +
  550 + //Allocate device memory and copy everything to the GPU
  551 + stim::complex<T>* dev_jA_lut;
  552 + HANDLE_ERROR( cudaMalloc(&dev_jA_lut, jA_bytes) );
  553 + HANDLE_ERROR( cudaMemcpy(dev_jA_lut, jA_lut, jA_bytes, cudaMemcpyHostToDevice) );
  554 +
  555 + gpu_scalar_mie_internal<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_jA_lut, r_min, dr, N_jA_lut, Nl);
  556 +
  557 + cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
  558 +
  559 + if(x != NULL) cudaFree(dev_x); //free everything
  560 + if(y != NULL) cudaFree(dev_y);
  561 + if(z != NULL) cudaFree(dev_z);
  562 + cudaFree(dev_E);
  563 +#else
  564 +
380 //allocate space to store the bessel function call results 565 //allocate space to store the bessel function call results
381 double vm; 566 double vm;
382 stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); 567 stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
@@ -414,12 +599,13 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st @@ -414,12 +599,13 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
414 } 599 }
415 } 600 }
416 } 601 }
  602 +#endif
417 } 603 }
418 604
419 template<typename T> 605 template<typename T>
420 -void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){ 606 +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){
421 std::vector< stim::scalarwave<T> > W(1, w); 607 std::vector< stim::scalarwave<T> > W(1, w);
422 - cpu_scalar_mie_internal(E, N, x, y, z, W, a, n); 608 + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, r_spacing);
423 } 609 }
424 610
425 } 611 }
stim/optics/scalarbeam.h
1 #ifndef RTS_BEAM 1 #ifndef RTS_BEAM
2 #define RTS_BEAM 2 #define RTS_BEAM
  3 +#include <boost/math/special_functions/bessel.hpp>
3 4
4 #include "../math/vec3.h" 5 #include "../math/vec3.h"
5 #include "../optics/scalarwave.h" 6 #include "../optics/scalarwave.h"
@@ -7,150 +8,68 @@ @@ -7,150 +8,68 @@
7 #include "../math/legendre.h" 8 #include "../math/legendre.h"
8 #include "../cuda/cudatools/devices.h" 9 #include "../cuda/cudatools/devices.h"
9 #include "../cuda/cudatools/timer.h" 10 #include "../cuda/cudatools/timer.h"
  11 +#include "../optics/scalarfield.h"
10 #include <cublas_v2.h> 12 #include <cublas_v2.h>
11 #include <math_constants.h> 13 #include <math_constants.h>
12 #include <vector> 14 #include <vector>
13 #include <stdlib.h> 15 #include <stdlib.h>
14 16
15 -namespace stim{  
16 17
17 - /// Function returns the value of the scalar field produced by a beam with the specified parameters  
18 -  
19 - template<typename T>  
20 - std::vector< stim::vec3<T> > generate_focusing_vectors(size_t N, stim::vec3<T> d, T NA, T NA_in = 0){  
21 -  
22 - std::vector< stim::vec3<T> > dirs(N); //allocate an array to store the focusing vectors  
23 -  
24 - ///compute the rotation operator to transform (0, 0, 1) to k  
25 - T cos_angle = d.dot(vec3<T>(0, 0, 1));  
26 - stim::matrix<T, 3> rotation;  
27 -  
28 - //if the cosine of the angle is -1, the rotation is just a flip across the z axis  
29 - if(cos_angle == -1){  
30 - rotation(2, 2) = -1;  
31 - }  
32 - else if(cos_angle != 1.0)  
33 - {  
34 - vec3<T> r_axis = vec3<T>(0, 0, 1).cross(d).norm(); //compute the axis of rotation  
35 - T angle = acos(cos_angle); //compute the angle of rotation  
36 - quaternion<T> quat; //create a quaternion describing the rotation  
37 - quat.CreateRotation(angle, r_axis);  
38 - rotation = quat.toMatrix3(); //compute the rotation matrix  
39 - }  
40 -  
41 - //find the phi values associated with the cassegrain ring  
42 - T PHI[2];  
43 - PHI[0] = (T)asin(NA);  
44 - PHI[1] = (T)asin(NA_in);  
45 -  
46 - //calculate the z-axis cylinder coordinates associated with these angles  
47 - T Z[2];  
48 - Z[0] = cos(PHI[0]);  
49 - Z[1] = cos(PHI[1]);  
50 - T range = Z[0] - Z[1];  
51 -  
52 - //draw a distribution of random phi, z values  
53 - T z, phi, theta;  
54 - //T kmag = stim::TAU / lambda;  
55 - for(int i=0; i<N; i++){ //for each sample  
56 - z = (T)((double)rand() / (double)RAND_MAX) * range + Z[1]; //find a random position on the surface of a cylinder  
57 - theta = (T)(((double)rand() / (double)RAND_MAX) * stim::TAU);  
58 - phi = acos(z); //project onto the sphere, computing phi in spherical coordinates  
59 -  
60 - //compute and store cartesian coordinates  
61 - vec3<T> spherical(1, theta, phi); //convert from spherical to cartesian coordinates  
62 - vec3<T> cart = spherical.sph2cart();  
63 - dirs[i] = rotation * cart; //create a sample vector  
64 - }  
65 - return dirs;  
66 - }  
67 -  
68 -/// Class stim::beam represents a beam of light focused at a point and composed of several plane waves  
69 -template<typename T>  
70 -class scalarbeam  
71 -{  
72 -public:  
73 - //enum beam_type {Uniform, Bartlett, Hamming, Hanning};  
74 -  
75 -private:  
76 -  
77 - T NA[2]; //numerical aperature of the focusing optics  
78 - vec3<T> f; //focal point  
79 - vec3<T> d; //propagation direction  
80 - stim::complex<T> A; //beam amplitude  
81 - T lambda; //beam wavelength  
82 -public:  
83 18
84 - ///constructor: build a default beam (NA=1.0)  
85 - scalarbeam(T wavelength = 1, stim::complex<T> amplitude = 1, vec3<T> focal_point = vec3<T>(0, 0, 0), vec3<T> direction = vec3<T>(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){  
86 - lambda = wavelength;  
87 - A = amplitude;  
88 - f = focal_point;  
89 - d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on)  
90 - NA[0] = numerical_aperture;  
91 - NA[1] = center_obsc;  
92 - } 19 +namespace stim{
93 20
94 - ///Numerical Aperature functions  
95 - void setNA(T na)  
96 - {  
97 - NA[0] = (T)0;  
98 - NA[1] = na;  
99 - }  
100 - void setNA(T na0, T na1)  
101 - {  
102 - NA[0] = na0;  
103 - NA[1] = na1;  
104 - } 21 +/// Function returns the value of the scalar field produced by a beam with the specified parameters
105 22
106 - //Monte-Carlo decomposition into plane waves  
107 - std::vector< scalarwave<T> > mc(size_t N = 100000) const{ 23 +template<typename T>
  24 +std::vector< stim::vec3<T> > generate_focusing_vectors(size_t N, stim::vec3<T> d, T NA, T NA_in = 0){
108 25
109 - std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus  
110 - std::vector< scalarwave<T> > samples(N); //create a vector of plane waves  
111 - T kmag = (T)stim::TAU / lambda; //calculate the wavenumber  
112 - stim::complex<T> apw; //allocate space for the amplitude at the focal point  
113 - T a = (T)(stim::TAU * (1 - cos(asin(NA[0]))) / (double)N);  
114 - stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction  
115 - for(size_t i=0; i<N; i++){ //for each sample  
116 - kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave  
117 - apw = a * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave  
118 - samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction  
119 - } 26 + std::vector< stim::vec3<T> > dirs(N); //allocate an array to store the focusing vectors
120 27
121 - return samples;  
122 - } 28 + ///compute the rotation operator to transform (0, 0, 1) to k
  29 + T cos_angle = d.dot(vec3<T>(0, 0, 1));
  30 + stim::matrix<T, 3> rotation;
123 31
124 - /// Calculate the field at a given point  
125 - /// @param x is the x-coordinate of the field point  
126 - /// @O is the approximation accuracy  
127 - stim::complex<T> field(T x, T y, T z, size_t O){  
128 - std::vector< scalarwave<T> > W = mc(O);  
129 - T result = 0; //initialize the result to zero (0)  
130 - for(size_t i = 0; i < O; i++){ //for each plane wave  
131 - result += W[i].pos(x, y, z);  
132 - }  
133 - return result; 32 + //if the cosine of the angle is -1, the rotation is just a flip across the z axis
  33 + if(cos_angle == -1){
  34 + rotation(2, 2) = -1;
134 } 35 }
135 -  
136 - std::string str() 36 + else if(cos_angle != 1.0)
137 { 37 {
138 - std::stringstream ss;  
139 - ss<<"Beam:"<<std::endl;  
140 - //ss<<" Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl;  
141 - ss<<" Beam Direction: "<<d<<std::endl;  
142 - if(NA[0] == 0)  
143 - ss<<" NA: "<<NA[1];  
144 - else  
145 - ss<<" NA: "<<NA[0]<<" -- "<<NA[1];  
146 -  
147 - return ss.str(); 38 + vec3<T> r_axis = vec3<T>(0, 0, 1).cross(d).norm(); //compute the axis of rotation
  39 + T angle = acos(cos_angle); //compute the angle of rotation
  40 + quaternion<T> quat; //create a quaternion describing the rotation
  41 + quat.CreateRotation(angle, r_axis);
  42 + rotation = quat.toMatrix3(); //compute the rotation matrix
148 } 43 }
149 44
  45 + //find the phi values associated with the cassegrain ring
  46 + T PHI[2];
  47 + PHI[0] = (T)asin(NA);
  48 + PHI[1] = (T)asin(NA_in);
  49 +
  50 + //calculate the z-axis cylinder coordinates associated with these angles
  51 + T Z[2];
  52 + Z[0] = cos(PHI[0]);
  53 + Z[1] = cos(PHI[1]);
  54 + T range = Z[0] - Z[1];
  55 +
  56 + //draw a distribution of random phi, z values
  57 + T z, phi, theta;
  58 + //T kmag = stim::TAU / lambda;
  59 + for(int i=0; i<N; i++){ //for each sample
  60 + z = (T)((double)rand() / (double)RAND_MAX) * range + Z[1]; //find a random position on the surface of a cylinder
  61 + theta = (T)(((double)rand() / (double)RAND_MAX) * stim::TAU);
  62 + phi = acos(z); //project onto the sphere, computing phi in spherical coordinates
  63 +
  64 + //compute and store cartesian coordinates
  65 + vec3<T> spherical(1, theta, phi); //convert from spherical to cartesian coordinates
  66 + vec3<T> cart = spherical.sph2cart();
  67 + dirs[i] = rotation * cart; //create a sample vector
  68 + }
  69 + return dirs;
  70 +}
150 71
151 -  
152 -}; //end beam  
153 - 72 +
154 /// Calculate the [0 Nl] terms for the aperture integral based on the given numerical aperture and center obscuration (optional) 73 /// Calculate the [0 Nl] terms for the aperture integral based on the given numerical aperture and center obscuration (optional)
155 /// @param C is a pointer to Nl + 1 values where the terms will be stored 74 /// @param C is a pointer to Nl + 1 values where the terms will be stored
156 template<typename T> 75 template<typename T>
@@ -210,20 +129,19 @@ CUDA_CALLABLE T lerp(T v0, T v1, T t) { @@ -210,20 +129,19 @@ CUDA_CALLABLE T lerp(T v0, T v1, T t) {
210 return fma(t, v1, fma(-t, v0, v0)); 129 return fma(t, v1, fma(-t, v0, v0));
211 } 130 }
212 131
213 -#ifdef __CUDACC__ 132 +#ifdef CUDA_FOUND
214 template<typename T> 133 template<typename T>
215 -__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T k, T A, size_t Nl, 134 +__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T A, size_t Nl,
216 T* C, 135 T* C,
217 - T* lut_j, size_t Nj, T min_kr, T dkr){ 136 + T* lut_j, size_t Nj, T min_r, T dr){
218 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array 137 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
219 if(i >= N) return; //exit if this thread is outside the array 138 if(i >= N) return; //exit if this thread is outside the array
220 139
221 T cos_phi = cos(phi[i]); //calculate the thread value for cos(phi) 140 T cos_phi = cos(phi[i]); //calculate the thread value for cos(phi)
222 - T kr = r[i] * k; //calculate the thread value for k*r  
223 stim::complex<T> Ei = 0; //initialize the value of the field to zero 141 stim::complex<T> Ei = 0; //initialize the value of the field to zero
224 size_t NC = Nl + 1; //calculate the number of coefficients to be used 142 size_t NC = Nl + 1; //calculate the number of coefficients to be used
225 143
226 - T fij = (kr - min_kr)/dkr; //FP index into the spherical bessel LUT 144 + T fij = (r[i] - min_r)/dr; //FP index into the spherical bessel LUT
227 size_t ij = (size_t) fij; //convert to an integral index 145 size_t ij = (size_t) fij; //convert to an integral index
228 T a = fij - ij; //calculate the fractional portion of the index 146 T a = fij - ij; //calculate the fractional portion of the index
229 size_t n0j = ij * (NC); //start of the first entry in the LUT 147 size_t n0j = ij * (NC); //start of the first entry in the LUT
@@ -276,6 +194,8 @@ void gpu_scalar_psf_local(stim::complex&lt;T&gt;* E, size_t N, T* r, T* phi, T lambda, @@ -276,6 +194,8 @@ void gpu_scalar_psf_local(stim::complex&lt;T&gt;* E, size_t N, T* r, T* phi, T lambda,
276 exit(1); 194 exit(1);
277 } 195 }
278 196
  197 + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility
  198 + i_max--;
279 T r_min, r_max; //allocate space to store the minimum and maximum values 199 T r_min, r_max; //allocate space to store the minimum and maximum values
280 HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU 200 HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
281 HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); 201 HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
@@ -287,29 +207,19 @@ void gpu_scalar_psf_local(stim::complex&lt;T&gt;* E, size_t N, T* r, T* phi, T lambda, @@ -287,29 +207,19 @@ void gpu_scalar_psf_local(stim::complex&lt;T&gt;* E, size_t N, T* r, T* phi, T lambda,
287 207
288 size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r 208 size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r
289 209
290 - T kr_min = k * r_min;  
291 - T kr_max = k * r_max;  
292 -  
293 - //temporary variables  
294 - double vm; //allocate space to store the return values for the bessel function calculation  
295 - double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );  
296 - double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );  
297 - double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );  
298 - double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );  
299 210
300 size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j; 211 size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
301 - T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table  
302 - T delta_kr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT  
303 - std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl;  
304 - for(size_t kri = 0; kri < Nlut_j; kri++){ //for each value in the LUT  
305 - stim::bessjyv_sph<double>(Nl, kr_min + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] 212 + T* j_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table
  213 + T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT
  214 + T jl;
  215 + for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT
306 for(size_t l = 0; l <= Nl; l++){ //for each order 216 for(size_t l = 0; l <= Nl; l++){ //for each order
307 - bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result 217 + jl = boost::math::sph_bessel<T>(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function
  218 + j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result
308 } 219 }
309 } 220 }
310 221
311 - stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);  
312 - 222 + stim::cpu2image<T>(j_lut, "j_lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
313 //Allocate device memory and copy everything to the GPU 223 //Allocate device memory and copy everything to the GPU
314 224
315 T* gpu_C; 225 T* gpu_C;
@@ -317,12 +227,12 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, @@ -317,12 +227,12 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda,
317 HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) ); 227 HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) );
318 T* gpu_j_lut; 228 T* gpu_j_lut;
319 HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) ); 229 HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) );
320 - HANDLE_ERROR( cudaMemcpy(gpu_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) ); 230 + HANDLE_ERROR( cudaMemcpy(gpu_j_lut, j_lut, lutj_bytes, cudaMemcpyHostToDevice) );
321 231
322 int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device 232 int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
323 dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks 233 dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks
324 234
325 - cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, (T)stim::TAU/lambda, A, Nl, gpu_C, gpu_j_lut, Nlut_j, kr_min, delta_kr); 235 + cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, A, Nl, gpu_C, gpu_j_lut, Nlut_j, r_min, dr);
326 236
327 //free the LUT and condenser tables 237 //free the LUT and condenser tables
328 HANDLE_ERROR( cudaFree(gpu_C) ); 238 HANDLE_ERROR( cudaFree(gpu_C) );
@@ -392,7 +302,7 @@ __global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::ve @@ -392,7 +302,7 @@ __global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::ve
392 phi[i] = ps[2]; //phi = [0 pi] 302 phi[i] = ps[2]; //phi = [0 pi]
393 } 303 }
394 304
395 -#ifdef __CUDACC__ 305 +#ifdef CUDA_FOUND
396 /// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates 306 /// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates
397 template<typename T> 307 template<typename T>
398 void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){ 308 void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
@@ -419,7 +329,7 @@ template<typename T> @@ -419,7 +329,7 @@ template<typename T>
419 void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){ 329 void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
420 330
421 // If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel 331 // If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel
422 -#ifdef __CUDACC__ 332 +#ifdef CUDA_FOUND
423 333
424 T* gpu_x = NULL; 334 T* gpu_x = NULL;
425 if(x != NULL){ 335 if(x != NULL){
@@ -470,13 +380,112 @@ void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lamb @@ -470,13 +380,112 @@ void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lamb
470 phi[i] = ps[2]; //phi = [0 pi] 380 phi[i] = ps[2]; //phi = [0 pi]
471 } 381 }
472 382
473 - cpu_scalar_psf_local(F, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function 383 + cpu_scalar_psf_local(E, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function
474 384
475 free(r); 385 free(r);
476 free(phi); 386 free(phi);
477 #endif 387 #endif
478 } 388 }
  389 +
  390 +/// Class stim::beam represents a beam of light focused at a point and composed of several plane waves
  391 +template<typename T>
  392 +class scalarbeam
  393 +{
  394 +public:
  395 + //enum beam_type {Uniform, Bartlett, Hamming, Hanning};
  396 +
  397 +private:
  398 +
  399 + T NA[2]; //numerical aperature of the focusing optics
  400 + vec3<T> f; //focal point
  401 + vec3<T> d; //propagation direction
  402 + T A; //beam amplitude
  403 + T lambda; //beam wavelength
  404 +public:
479 405
  406 + ///constructor: build a default beam (NA=1.0)
  407 + scalarbeam(T wavelength = 1, T amplitude = 1, vec3<T> focal_point = vec3<T>(0, 0, 0), vec3<T> direction = vec3<T>(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){
  408 + lambda = wavelength;
  409 + A = amplitude;
  410 + f = focal_point;
  411 + d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on)
  412 + NA[0] = numerical_aperture;
  413 + NA[1] = center_obsc;