Commit 8309b07a16caaadfd4bc85bfbebdf7041a63f0e0
1 parent
31262e83
fixed some vec3 errors
Showing
8 changed files
with
429 additions
and
593 deletions
Show diff stats
stim/math/complex.h
stim/math/plane_old.h deleted
1 | -#ifndef RTS_PLANE_H | |
2 | -#define RTS_PLANE_H | |
3 | - | |
4 | -#include <iostream> | |
5 | -#include <stim/math/vector.h> | |
6 | -#include "rts/cuda/callable.h" | |
7 | - | |
8 | - | |
9 | -namespace stim{ | |
10 | -template <typename T, int D> class plane; | |
11 | -} | |
12 | - | |
13 | -template <typename T, int D> | |
14 | -CUDA_CALLABLE stim::plane<T, D> operator-(stim::plane<T, D> v); | |
15 | - | |
16 | -namespace stim{ | |
17 | - | |
18 | -template <class T, int D = 3> | |
19 | -class plane{ | |
20 | - | |
21 | - //a plane is defined by a point and a normal | |
22 | - | |
23 | -private: | |
24 | - | |
25 | - vec<T, D> P; //point on the plane | |
26 | - vec<T, D> N; //plane normal | |
27 | - | |
28 | - CUDA_CALLABLE void init(){ | |
29 | - P = vec<T, D>(0, 0, 0); | |
30 | - N = vec<T, D>(0, 0, 1); | |
31 | - } | |
32 | - | |
33 | - | |
34 | -public: | |
35 | - | |
36 | - //default constructor | |
37 | - CUDA_CALLABLE plane(){ | |
38 | - init(); | |
39 | - } | |
40 | - | |
41 | - CUDA_CALLABLE plane(vec<T, D> n, vec<T, D> p = vec<T, D>(0, 0, 0)){ | |
42 | - P = p; | |
43 | - N = n.norm(); | |
44 | - } | |
45 | - | |
46 | - CUDA_CALLABLE plane(T z_pos){ | |
47 | - init(); | |
48 | - P[2] = z_pos; | |
49 | - } | |
50 | - | |
51 | - //create a plane from three points (a triangle) | |
52 | - CUDA_CALLABLE plane(vec<T, D> a, vec<T, D> b, vec<T, D> c){ | |
53 | - P = c; | |
54 | - N = (c - a).cross(b - a); | |
55 | - if(N.len() == 0) //handle the degenerate case when two vectors are the same, N = 0 | |
56 | - N = 0; | |
57 | - else | |
58 | - N = N.norm(); | |
59 | - } | |
60 | - | |
61 | - template< typename U > | |
62 | - CUDA_CALLABLE operator plane<U, D>(){ | |
63 | - | |
64 | - plane<U, D> result(N, P); | |
65 | - return result; | |
66 | - } | |
67 | - | |
68 | - CUDA_CALLABLE vec<T, D> norm(){ | |
69 | - return N; | |
70 | - } | |
71 | - | |
72 | - CUDA_CALLABLE vec<T, D> p(){ | |
73 | - return P; | |
74 | - } | |
75 | - | |
76 | - //flip the plane front-to-back | |
77 | - CUDA_CALLABLE plane<T, D> flip(){ | |
78 | - plane<T, D> result = *this; | |
79 | - result.N = -result.N; | |
80 | - return result; | |
81 | - } | |
82 | - | |
83 | - //determines how a vector v intersects the plane (1 = intersects front, 0 = within plane, -1 = intersects back) | |
84 | - CUDA_CALLABLE int face(vec<T, D> v){ | |
85 | - | |
86 | - T dprod = v.dot(N); //get the dot product between v and N | |
87 | - | |
88 | - //conditional returns the appropriate value | |
89 | - if(dprod < 0) | |
90 | - return 1; | |
91 | - else if(dprod > 0) | |
92 | - return -1; | |
93 | - else | |
94 | - return 0; | |
95 | - } | |
96 | - | |
97 | - //determine on which side of the plane a point lies (1 = front, 0 = on the plane, -1 = back) | |
98 | - CUDA_CALLABLE int side(vec<T, D> p){ | |
99 | - | |
100 | - vec<T, D> v = p - P; //get the vector from P to the query point p | |
101 | - | |
102 | - return face(v); | |
103 | - } | |
104 | - | |
105 | - //compute the component of v that is perpendicular to the plane | |
106 | - CUDA_CALLABLE vec<T, D> perpendicular(vec<T, D> v){ | |
107 | - return N * v.dot(N); | |
108 | - } | |
109 | - | |
110 | - //compute the projection of v in the plane | |
111 | - CUDA_CALLABLE vec<T, D> parallel(vec<T, D> v){ | |
112 | - return v - perpendicular(v); | |
113 | - } | |
114 | - | |
115 | - CUDA_CALLABLE void decompose(vec<T, D> v, vec<T, D>& para, vec<T, D>& perp){ | |
116 | - perp = N * v.dot(N); | |
117 | - para = v - perp; | |
118 | - } | |
119 | - | |
120 | - //get both the parallel and perpendicular components of a vector v w.r.t. the plane | |
121 | - CUDA_CALLABLE void project(vec<T, D> v, vec<T, D> &v_par, vec<T, D> &v_perp){ | |
122 | - | |
123 | - v_perp = v.dot(N); | |
124 | - v_par = v - v_perp; | |
125 | - } | |
126 | - | |
127 | - //compute the reflection of v off of the plane | |
128 | - CUDA_CALLABLE vec<T, D> reflect(vec<T, D> v){ | |
129 | - | |
130 | - //compute the reflection using N_prime as the plane normal | |
131 | - vec<T, D> par = parallel(v); | |
132 | - vec<T, D> r = (-v) + par * 2; | |
133 | - | |
134 | - /*std::cout<<"----------------REFLECT-----------------------------"<<std::endl; | |
135 | - std::cout<<str()<<std::endl; | |
136 | - std::cout<<"v: "<<v<<std::endl; | |
137 | - std::cout<<"r: "<<r<<std::endl; | |
138 | - std::cout<<"Perpendicular: "<<perpendicular(v)<<std::endl; | |
139 | - std::cout<<"Parallel: "<<par<<std::endl;*/ | |
140 | - return r; | |
141 | - | |
142 | - } | |
143 | - | |
144 | - CUDA_CALLABLE rts::plane<T, D> operator-() | |
145 | - { | |
146 | - rts::plane<T, D> p = *this; | |
147 | - | |
148 | - //negate the normal vector | |
149 | - p.N = -p.N; | |
150 | - | |
151 | - return p; | |
152 | - } | |
153 | - | |
154 | - //output a string | |
155 | - std::string str(){ | |
156 | - std::stringstream ss; | |
157 | - ss<<"P: "<<P<<std::endl; | |
158 | - ss<<"N: "<<N; | |
159 | - return ss.str(); | |
160 | - } | |
161 | - | |
162 | - ///////Friendship | |
163 | - //friend CUDA_CALLABLE rts::plane<T, D> operator- <> (rts::plane<T, D> v); | |
164 | - | |
165 | - | |
166 | - | |
167 | -}; | |
168 | - | |
169 | -} | |
170 | - | |
171 | -//arithmetic operators | |
172 | - | |
173 | -//negative operator flips the plane (front to back) | |
174 | -//template <typename T, int D> | |
175 | - | |
176 | - | |
177 | - | |
178 | - | |
179 | -#endif |
stim/math/quad.h deleted
1 | -#ifndef RTS_QUAD_H | |
2 | -#define RTS_QUAD_H | |
3 | - | |
4 | -//enable CUDA_CALLABLE macro | |
5 | -#include <stim/cuda/callable.h> | |
6 | -#include <stim/math/vector.h> | |
7 | -#include <stim/math/triangle.h> | |
8 | -#include <stim/math/quaternion.h> | |
9 | -#include <iostream> | |
10 | -#include <iomanip> | |
11 | -#include <algorithm> | |
12 | - | |
13 | -namespace stim{ | |
14 | - | |
15 | -//template for a quadangle class in ND space | |
16 | -template <class T, int N = 3> | |
17 | -struct quad | |
18 | -{ | |
19 | - /* | |
20 | - B------------------>C | |
21 | - ^ ^ | |
22 | - | | | |
23 | - Y | | |
24 | - | | | |
25 | - | | | |
26 | - A---------X-------->O | |
27 | - */ | |
28 | - | |
29 | - /*T A[N]; | |
30 | - T B[N]; | |
31 | - T C[N];*/ | |
32 | - | |
33 | - rts::vec<T, N> A; | |
34 | - rts::vec<T, N> X; | |
35 | - rts::vec<T, N> Y; | |
36 | - | |
37 | - | |
38 | - CUDA_CALLABLE quad() | |
39 | - { | |
40 | - | |
41 | - } | |
42 | - | |
43 | - CUDA_CALLABLE quad(vec<T, N> a, vec<T, N> b, vec<T, N> c) | |
44 | - { | |
45 | - | |
46 | - A = a; | |
47 | - Y = b - a; | |
48 | - X = c - a - Y; | |
49 | - | |
50 | - } | |
51 | - | |
52 | - /******************************************************************* | |
53 | - Constructor - create a quad from a position, normal, and rotation | |
54 | - *******************************************************************/ | |
55 | - CUDA_CALLABLE quad(rts::vec<T, N> c, rts::vec<T, N> normal, T width, T height, T theta) | |
56 | - { | |
57 | - | |
58 | - //compute the X direction - start along world-space X | |
59 | - Y = rts::vec<T, N>(0, 1, 0); | |
60 | - if(Y == normal) | |
61 | - Y = rts::vec<T, N>(0, 0, 1); | |
62 | - | |
63 | - X = Y.cross(normal).norm(); | |
64 | - | |
65 | - std::cout<<X<<std::endl; | |
66 | - | |
67 | - //rotate the X axis by theta radians | |
68 | - rts::quaternion<T> q; | |
69 | - q.CreateRotation(theta, normal); | |
70 | - X = q.toMatrix3() * X; | |
71 | - Y = normal.cross(X); | |
72 | - | |
73 | - //normalize everything | |
74 | - X = X.norm(); | |
75 | - Y = Y.norm(); | |
76 | - | |
77 | - //scale to match the quad width and height | |
78 | - X = X * width; | |
79 | - Y = Y * height; | |
80 | - | |
81 | - //set the corner of the plane | |
82 | - A = c - X * 0.5f - Y * 0.5f; | |
83 | - | |
84 | - std::cout<<X<<std::endl; | |
85 | - } | |
86 | - | |
87 | - //boolean comparison | |
88 | - bool operator==(const quad<T, N> & rhs) | |
89 | - { | |
90 | - if(A == rhs.A && X == rhs.X && Y == rhs.Y) | |
91 | - return true; | |
92 | - else | |
93 | - return false; | |
94 | - } | |
95 | - | |
96 | - /******************************************* | |
97 | - Return the normal for the quad | |
98 | - *******************************************/ | |
99 | - CUDA_CALLABLE rts::vec<T, N> n() | |
100 | - { | |
101 | - return (X.cross(Y)).norm(); | |
102 | - } | |
103 | - | |
104 | - CUDA_CALLABLE rts::vec<T, N> p(T a, T b) | |
105 | - { | |
106 | - rts::vec<T, N> result; | |
107 | - //given the two parameters a, b = [0 1], returns the position in world space | |
108 | - result = A + X * a + Y * b; | |
109 | - | |
110 | - return result; | |
111 | - } | |
112 | - | |
113 | - CUDA_CALLABLE rts::vec<T, N> operator()(T a, T b) | |
114 | - { | |
115 | - return p(a, b); | |
116 | - } | |
117 | - | |
118 | - std::string str() | |
119 | - { | |
120 | - std::stringstream ss; | |
121 | - | |
122 | - ss<<std::left<<"B="<<setfill('-')<<setw(20)<<A + Y<<">"<<"C="<<A + Y + X<<std::endl; | |
123 | - ss<<setfill(' ')<<setw(23)<<"|"<<"|"<<std::endl<<setw(23)<<"|"<<"|"<<std::endl; | |
124 | - ss<<std::left<<"A="<<setfill('-')<<setw(20)<<A<<">"<<"D="<<A + X; | |
125 | - | |
126 | - return ss.str(); | |
127 | - | |
128 | - } | |
129 | - | |
130 | - CUDA_CALLABLE quad<T, N> operator*(T rhs) | |
131 | - { | |
132 | - //scales the plane by a scalar value | |
133 | - | |
134 | - //compute the center point | |
135 | - rts::vec<T, N> c = A + X*0.5f + Y*0.5f; | |
136 | - | |
137 | - //create the new quadangle | |
138 | - quad<T, N> result; | |
139 | - result.X = X * rhs; | |
140 | - result.Y = Y * rhs; | |
141 | - result.A = c - result.X*0.5f - result.Y*0.5f; | |
142 | - | |
143 | - return result; | |
144 | - | |
145 | - } | |
146 | - | |
147 | - CUDA_CALLABLE T dist(vec<T, N> p) | |
148 | - { | |
149 | - //compute the distance between a point and this quad | |
150 | - | |
151 | - //first break the quad up into two triangles | |
152 | - triangle<T, N> T0(A, A+X, A+Y); | |
153 | - triangle<T, N> T1(A+X+Y, A+X, A+Y); | |
154 | - | |
155 | - | |
156 | - T d0 = T0.dist(p); | |
157 | - T d1 = T1.dist(p); | |
158 | - | |
159 | - if(d0 < d1) | |
160 | - return d0; | |
161 | - else | |
162 | - return d1; | |
163 | - } | |
164 | - | |
165 | - CUDA_CALLABLE T dist_max(vec<T, N> p) | |
166 | - { | |
167 | - T da = (A - p).len(); | |
168 | - T db = (A+X - p).len(); | |
169 | - T dc = (A+Y - p).len(); | |
170 | - T dd = (A+X+Y - p).len(); | |
171 | - | |
172 | - return std::max( da, std::max(db, std::max(dc, dd) ) ); | |
173 | - } | |
174 | -}; | |
175 | - | |
176 | -} //end namespace rts | |
177 | - | |
178 | -template <typename T, int N> | |
179 | -std::ostream& operator<<(std::ostream& os, rts::quad<T, N> R) | |
180 | -{ | |
181 | - os<<R.str(); | |
182 | - return os; | |
183 | -} | |
184 | - | |
185 | - | |
186 | -#endif |
stim/math/rect.h
... | ... | @@ -28,13 +28,10 @@ class rect : plane <T> |
28 | 28 | O---------X---------> |
29 | 29 | */ |
30 | 30 | |
31 | -private: | |
32 | - | |
33 | - stim::vec<T> X; | |
34 | - stim::vec<T> Y; | |
35 | - | |
36 | - | |
31 | +protected: | |
37 | 32 | |
33 | + stim::vec3<T> X; | |
34 | + stim::vec3<T> Y; | |
38 | 35 | |
39 | 36 | public: |
40 | 37 | |
... | ... | @@ -65,7 +62,7 @@ public: |
65 | 62 | ///create a rectangle from a center point, normal |
66 | 63 | ///@param c: x,y,z location of the center. |
67 | 64 | ///@param n: x,y,z direction of the normal. |
68 | - CUDA_CALLABLE rect(vec<T> c, vec<T> n = vec<T>(0, 0, 1)) | |
65 | + CUDA_CALLABLE rect(vec3<T> c, vec3<T> n = vec3<T>(0, 0, 1)) | |
69 | 66 | : plane<T>() |
70 | 67 | { |
71 | 68 | init(); //start with the default setting |
... | ... | @@ -76,7 +73,7 @@ public: |
76 | 73 | ///@param c: x,y,z location of the center. |
77 | 74 | ///@param s: size of the rectangle. |
78 | 75 | ///@param n: x,y,z direction of the normal. |
79 | - CUDA_CALLABLE rect(vec<T> c, T s, vec<T> n = vec<T>(0, 0, 1)) | |
76 | + CUDA_CALLABLE rect(vec3<T> c, T s, vec3<T> n = vec3<T>(0, 0, 1)) | |
80 | 77 | : plane<T>() |
81 | 78 | { |
82 | 79 | init(); //start with the default setting |
... | ... | @@ -89,7 +86,7 @@ public: |
89 | 86 | ///@param center: x,y,z location of the center. |
90 | 87 | ///@param directionX: u,v,w direction of the X vector. |
91 | 88 | ///@param directionY: u,v,w direction of the Y vector. |
92 | - CUDA_CALLABLE rect(vec<T> center, vec<T> directionX, vec<T> directionY ) | |
89 | + CUDA_CALLABLE rect(vec3<T> center, vec3<T> directionX, vec3<T> directionY ) | |
93 | 90 | : plane<T>((directionX.cross(directionY)).norm(),center) |
94 | 91 | { |
95 | 92 | X = directionX; |
... | ... | @@ -101,7 +98,7 @@ public: |
101 | 98 | ///@param center: x,y,z location of the center. |
102 | 99 | ///@param directionX: u,v,w direction of the X vector. |
103 | 100 | ///@param directionY: u,v,w direction of the Y vector. |
104 | - CUDA_CALLABLE rect(T size, vec<T> center, vec<T> directionX, vec<T> directionY ) | |
101 | + CUDA_CALLABLE rect(T size, vec3<T> center, vec3<T> directionX, vec3<T> directionY ) | |
105 | 102 | : plane<T>((directionX.cross(directionY)).norm(),center) |
106 | 103 | { |
107 | 104 | X = directionX; |
... | ... | @@ -114,7 +111,7 @@ public: |
114 | 111 | ///@param center: x,y,z location of the center. |
115 | 112 | ///@param directionX: u,v,w direction of the X vector. |
116 | 113 | ///@param directionY: u,v,w direction of the Y vector. |
117 | - CUDA_CALLABLE rect(vec<T> size, vec<T> center, vec<T> directionX, vec<T> directionY) | |
114 | + CUDA_CALLABLE rect(vec3<T> size, vec3<T> center, vec3<T> directionX, vec3<T> directionY) | |
118 | 115 | : plane<T>((directionX.cross(directionY)).norm(), center) |
119 | 116 | { |
120 | 117 | X = directionX; |
... | ... | @@ -138,7 +135,7 @@ public: |
138 | 135 | |
139 | 136 | ///@param n; vector with the normal. |
140 | 137 | ///Orients the rectangle along the normal n. |
141 | - CUDA_CALLABLE void normal(vec<T> n) | |
138 | + CUDA_CALLABLE void normal(vec3<T> n) | |
142 | 139 | { |
143 | 140 | //orient the rectangle along the specified normal |
144 | 141 | rotate(n, X, Y); |
... | ... | @@ -147,8 +144,8 @@ public: |
147 | 144 | ///general init method that sets a general rectangle. |
148 | 145 | CUDA_CALLABLE void init() |
149 | 146 | { |
150 | - X = vec<T>(1, 0, 0); | |
151 | - Y = vec<T>(0, 1, 0); | |
147 | + X = vec3<T>(1, 0, 0); | |
148 | + Y = vec3<T>(0, 1, 0); | |
152 | 149 | } |
153 | 150 | |
154 | 151 | //boolean comparison |
... | ... | @@ -162,18 +159,18 @@ public: |
162 | 159 | |
163 | 160 | |
164 | 161 | //get the world space value given the planar coordinates a, b in [0, 1] |
165 | - CUDA_CALLABLE stim::vec<T> p(T a, T b) | |
162 | + CUDA_CALLABLE stim::vec3<T> p(T a, T b) | |
166 | 163 | { |
167 | - stim::vec<T> result; | |
164 | + stim::vec3<T> result; | |
168 | 165 | //given the two parameters a, b = [0 1], returns the position in world space |
169 | - vec<T> A = this->P - X * (T)0.5 - Y * (T)0.5; | |
166 | + vec3<T> A = this->P - X * (T)0.5 - Y * (T)0.5; | |
170 | 167 | result = A + X * a + Y * b; |
171 | 168 | |
172 | 169 | return result; |
173 | 170 | } |
174 | 171 | |
175 | 172 | //parenthesis operator returns the world space given rectangular coordinates a and b in [0 1] |
176 | - CUDA_CALLABLE stim::vec<T> operator()(T a, T b) | |
173 | + CUDA_CALLABLE stim::vec3<T> operator()(T a, T b) | |
177 | 174 | { |
178 | 175 | return p(a, b); |
179 | 176 | } |
... | ... | @@ -181,12 +178,12 @@ public: |
181 | 178 | std::string str() |
182 | 179 | { |
183 | 180 | std::stringstream ss; |
184 | - vec<T> A = P - X * (T)0.5 - Y * (T)0.5; | |
181 | + vec3<T> A = P - X * (T)0.5 - Y * (T)0.5; | |
185 | 182 | ss<<std::left<<"B="<<std::setfill('-')<<std::setw(20)<<A + Y<<">"<<"C="<<A + Y + X<<std::endl; |
186 | 183 | ss<<std::setfill(' ')<<std::setw(23)<<"|"<<"|"<<std::endl<<std::setw(23)<<"|"<<"|"<<std::endl; |
187 | 184 | ss<<std::left<<"A="<<std::setfill('-')<<std::setw(20)<<A<<">"<<"D="<<A + X; |
188 | 185 | |
189 | - return ss.str(); | |
186 | + return ss.str(); | |
190 | 187 | |
191 | 188 | } |
192 | 189 | |
... | ... | @@ -205,11 +202,11 @@ public: |
205 | 202 | |
206 | 203 | ///computes the distance between the specified point and this rectangle. |
207 | 204 | ///@param p: x, y, z coordinates of the point to calculate distance to. |
208 | - CUDA_CALLABLE T dist(vec<T> p) | |
205 | + CUDA_CALLABLE T dist(vec3<T> p) | |
209 | 206 | { |
210 | 207 | //compute the distance between a point and this rect |
211 | 208 | |
212 | - vec<T> A = P - X * (T)0.5 - Y * (T)0.5; | |
209 | + vec3<T> A = P - X * (T)0.5 - Y * (T)0.5; | |
213 | 210 | |
214 | 211 | //first break the rect up into two triangles |
215 | 212 | triangle<T> T0(A, A+X, A+Y); |
... | ... | @@ -225,16 +222,16 @@ public: |
225 | 222 | return d1; |
226 | 223 | } |
227 | 224 | |
228 | - CUDA_CALLABLE T center(vec<T> p) | |
225 | + CUDA_CALLABLE T center(vec3<T> p) | |
229 | 226 | { |
230 | 227 | this->P = p; |
231 | 228 | } |
232 | 229 | |
233 | 230 | ///Returns the maximum distance of the rectangle from a point p to the sides of the rectangle. |
234 | 231 | ///@param p: x, y, z point. |
235 | - CUDA_CALLABLE T dist_max(vec<T> p) | |
232 | + CUDA_CALLABLE T dist_max(vec3<T> p) | |
236 | 233 | { |
237 | - vec<T> A = P - X * (T)0.5 - Y * (T)0.5; | |
234 | + vec3<T> A = P - X * (T)0.5 - Y * (T)0.5; | |
238 | 235 | T da = (A - p).len(); |
239 | 236 | T db = (A+X - p).len(); |
240 | 237 | T dc = (A+Y - p).len(); | ... | ... |
stim/math/vec3.h
... | ... | @@ -242,4 +242,11 @@ stim::vec3<T> operator*(T lhs, stim::vec3<T> rhs){ |
242 | 242 | return rhs * lhs; |
243 | 243 | } |
244 | 244 | |
245 | +//stream operator | |
246 | +template<typename T> | |
247 | +std::ostream& operator<<(std::ostream& os, stim::vec3<T> const& rhs){ | |
248 | + os<<rhs.str(); | |
249 | + return os; | |
250 | +} | |
251 | + | |
245 | 252 | #endif |
246 | 253 | \ No newline at end of file | ... | ... |
stim/math/vector.h
... | ... | @@ -5,8 +5,9 @@ |
5 | 5 | #include <cmath> |
6 | 6 | #include <sstream> |
7 | 7 | #include <vector> |
8 | - | |
8 | + | |
9 | 9 | #include <stim/cuda/cudatools/callable.h> |
10 | +#include <stim/math/vec3.h> | |
10 | 11 | |
11 | 12 | namespace stim |
12 | 13 | { |
... | ... | @@ -70,8 +71,8 @@ struct vec : public std::vector<T> |
70 | 71 | size_t N = other.size(); |
71 | 72 | resize(N); //resize the current vector to match the copy |
72 | 73 | for(size_t i=0; i<N; i++){ //copy each element |
73 | - at(i) = other[i]; | |
74 | - } | |
74 | + at(i) = other[i]; | |
75 | + } | |
75 | 76 | } |
76 | 77 | |
77 | 78 | //I'm not sure what these were doing here. |
... | ... | @@ -318,8 +319,8 @@ struct vec : public std::vector<T> |
318 | 319 | } |
319 | 320 | |
320 | 321 | /// Cast to a vec3 |
321 | - operator vec3<T>(){ | |
322 | - vec3<T> r; | |
322 | + operator stim::vec3<T>(){ | |
323 | + stim::vec3<T> r; | |
323 | 324 | size_t N = std::min<size_t>(size(), 3); |
324 | 325 | for(size_t i = 0; i < N; i++) |
325 | 326 | r[i] = at(i); | ... | ... |
stim/optics/mie.h
1 | 1 | #ifndef STIM_MIE_H |
2 | 2 | #define STIM_MIE_H |
3 | +#include <boost/math/special_functions/bessel.hpp> | |
3 | 4 | |
4 | 5 | #include "scalarwave.h" |
5 | 6 | #include "../math/bessel.h" |
... | ... | @@ -43,7 +44,6 @@ void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){ |
43 | 44 | numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l]; |
44 | 45 | denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n; |
45 | 46 | B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; |
46 | - std::cout<<B[l]<<std::endl; | |
47 | 47 | } |
48 | 48 | } |
49 | 49 | |
... | ... | @@ -84,7 +84,7 @@ void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){ |
84 | 84 | |
85 | 85 | #define LOCAL_NL 16 |
86 | 86 | template<typename T> |
87 | -__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, int Nl){ | |
87 | +__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){ | |
88 | 88 | extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory |
89 | 89 | |
90 | 90 | size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array |
... | ... | @@ -96,14 +96,11 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* |
96 | 96 | |
97 | 97 | T r = p.len(); //calculate the distance from the sphere |
98 | 98 | if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) |
99 | - T k = W[0].kmag(); | |
100 | - size_t NC = Nl + 1; //calculate the number of coefficients to be used | |
101 | - T kr = r * k; //calculate the thread value for k*r | |
102 | - T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT | |
99 | + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT | |
103 | 100 | size_t ij = (size_t) fij; //convert to an integral index |
104 | 101 | T alpha = fij - ij; //calculate the fractional portion of the index |
105 | - size_t n0j = ij * (NC); //start of the first entry in the LUT | |
106 | - size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT | |
102 | + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT | |
103 | + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT | |
107 | 104 | |
108 | 105 | T cos_phi; |
109 | 106 | T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials |
... | ... | @@ -112,37 +109,36 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* |
112 | 109 | stim::complex<T> Ei = 0; //create a register to store the result |
113 | 110 | int l; |
114 | 111 | |
115 | - stim::complex<T> hlBl[LOCAL_NL+1]; | |
116 | - int shared_start = threadIdx.x * (Nl - LOCAL_NL); | |
112 | + stim::complex<T> hlBl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed | |
113 | + int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops | |
117 | 114 | |
118 | - #pragma unroll LOCAL_NL+1 | |
115 | + #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers | |
119 | 116 | for(l = 0; l <= LOCAL_NL; l++) |
120 | 117 | hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha ); |
121 | 118 | |
122 | - for(l = LOCAL_NL+1; l <= Nl; l++) | |
119 | + for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory | |
123 | 120 | shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha ); |
124 | 121 | |
125 | - for(size_t w = 0; w < nW; w++){ | |
122 | + for(size_t w = 0; w < nW; w++){ //for each plane wave | |
126 | 123 | cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere |
127 | - Pl_2 = 1; | |
124 | + Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation | |
128 | 125 | Pl_1 = cos_phi; |
129 | - Ei += W[w].E() * hlBl[0] * Pl_2; | |
126 | + Ei += W[w].E() * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation | |
130 | 127 | Ei += W[w].E() * hlBl[1] * Pl_1; |
131 | 128 | |
132 | - #pragma unroll LOCAL_NL-1 | |
129 | + #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) | |
133 | 130 | for(l = 2; l <= LOCAL_NL; l++){ |
134 | - Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1); | |
135 | - Ei += W[w].E() * hlBl[l] * Pl; | |
131 | + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) | |
132 | + Ei += W[w].E() * hlBl[l] * Pl; //calculate and sum the current field order | |
136 | 133 | Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 |
137 | 134 | Pl_1 = Pl; |
138 | 135 | } |
139 | 136 | |
140 | - for(l = LOCAL_NL+1; l <= Nl; l++){ | |
141 | - Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1); | |
142 | - Ei += W[w].E() * shared_hB[shared_start + (l - (LOCAL_NL+1))] * Pl; | |
143 | - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 | |
144 | - Pl_1 = Pl; | |
145 | - | |
137 | + for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) | |
138 | + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs | |
139 | + Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl; | |
140 | + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 | |
141 | + Pl_1 = Pl; | |
146 | 142 | } |
147 | 143 | } |
148 | 144 | E[i] += Ei; //copy the result to device memory |
... | ... | @@ -152,10 +148,10 @@ template<typename T> |
152 | 148 | void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ |
153 | 149 | |
154 | 150 | size_t max_shared_mem = stim::sharedMemPerBlock(); |
155 | - int hBl_array = sizeof(stim::complex<T>) * (Nl + 1); | |
151 | + size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); | |
156 | 152 | std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; |
157 | 153 | std::cout<<"shared memory: "<<max_shared_mem<<std::endl; |
158 | - int threads = (max_shared_mem / hBl_array) / 32 * 32; | |
154 | + int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); | |
159 | 155 | std::cout<<"threads per block: "<<threads<<std::endl; |
160 | 156 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks |
161 | 157 | |
... | ... | @@ -164,7 +160,6 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti |
164 | 160 | else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate |
165 | 161 | std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; |
166 | 162 | cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel |
167 | - | |
168 | 163 | } |
169 | 164 | |
170 | 165 | template<typename T> |
... | ... | @@ -261,16 +256,19 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std |
261 | 256 | exit(1); |
262 | 257 | } |
263 | 258 | |
259 | + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility | |
260 | + i_max--; | |
264 | 261 | T r_min, r_max; //allocate space to store the minimum and maximum values |
265 | 262 | HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU |
266 | 263 | HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); |
267 | 264 | |
265 | + r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere) | |
268 | 266 | |
269 | 267 | //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r |
270 | 268 | size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); |
271 | 269 | |
272 | - T kr_min = k * r_min; | |
273 | - T kr_max = k * r_max; | |
270 | + //T kr_min = k * r_min; | |
271 | + //T kr_max = k * r_max; | |
274 | 272 | |
275 | 273 | //temporary variables |
276 | 274 | double vm; //allocate space to store the return values for the bessel function calculation |
... | ... | @@ -281,27 +279,29 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std |
281 | 279 | |
282 | 280 | size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut; |
283 | 281 | stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table |
284 | - T dkr = (kr_max - kr_min) / (N_hB_lut-1); //distance between values in the LUT | |
282 | + T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT | |
285 | 283 | std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl; |
286 | 284 | stim::complex<T> hl; |
287 | - for(size_t kri = 0; kri < N_hB_lut; kri++){ //for each value in the LUT | |
288 | - stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] | |
285 | + for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT | |
286 | + stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] | |
289 | 287 | for(size_t l = 0; l <= Nl; l++){ //for each order |
290 | 288 | hl.r = (T)jv[l]; |
291 | 289 | hl.i = (T)yv[l]; |
292 | 290 | |
293 | - hB_lut[kri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result | |
291 | + hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result | |
292 | + //std::cout<<hB_lut[ri * (Nl + 1) + l]<<std::endl; | |
294 | 293 | } |
295 | 294 | } |
296 | - | |
297 | - //stim::cpu2image<T>(hankel_lut, "hankel.bmp", Nl+1, Nlut_j, stim::cmBrewer); | |
295 | + T* real_lut = (T*) malloc(hB_bytes/2); | |
296 | + stim::real(real_lut, hB_lut, N_hB_lut); | |
297 | + stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); | |
298 | 298 | |
299 | 299 | //Allocate device memory and copy everything to the GPU |
300 | 300 | stim::complex<T>* dev_hB_lut; |
301 | 301 | HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); |
302 | 302 | HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); |
303 | 303 | |
304 | - gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, kr_min, dkr, N_hB_lut, Nl); | |
304 | + gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); | |
305 | 305 | |
306 | 306 | cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory |
307 | 307 | |
... | ... | @@ -349,9 +349,90 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std |
349 | 349 | } |
350 | 350 | |
351 | 351 | template<typename T> |
352 | -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){ | |
352 | +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){ | |
353 | 353 | std::vector< stim::scalarwave<T> > W(1, w); |
354 | - cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n); | |
354 | + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing); | |
355 | +} | |
356 | + | |
357 | +template<typename T> | |
358 | +__global__ void cuda_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, int Nl){ | |
359 | + extern __shared__ stim::complex<T> shared_jA[]; //declare the list of waves in shared memory | |
360 | + | |
361 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array | |
362 | + if(i >= N) return; //exit if this thread is outside the array | |
363 | + stim::vec3<T> p; | |
364 | + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions | |
365 | + (y == NULL) ? p[1] = 0 : p[1] = y[i]; | |
366 | + (z == NULL) ? p[2] = 0 : p[2] = z[i]; | |
367 | + | |
368 | + T r = p.len(); //calculate the distance from the sphere | |
369 | + if(r > a) return; //exit if the point is inside the sphere (we only calculate the internal field) | |
370 | + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT | |
371 | + size_t ij = (size_t) fij; //convert to an integral index | |
372 | + T alpha = fij - ij; //calculate the fractional portion of the index | |
373 | + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT | |
374 | + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT | |
375 | + | |
376 | + T cos_phi; | |
377 | + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials | |
378 | + | |
379 | + stim::complex<T> jAl; | |
380 | + stim::complex<T> Ei = 0; //create a register to store the result | |
381 | + int l; | |
382 | + | |
383 | + stim::complex<T> jlAl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed | |
384 | + int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops | |
385 | + | |
386 | + #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers | |
387 | + for(l = 0; l <= LOCAL_NL; l++) | |
388 | + jlAl[l] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha ); | |
389 | + | |
390 | + for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory | |
391 | + shared_jA[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha ); | |
392 | + | |
393 | + for(size_t w = 0; w < nW; w++){ //for each plane wave | |
394 | + if(r == 0) cos_phi = 0; | |
395 | + else | |
396 | + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere | |
397 | + Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation | |
398 | + Pl_1 = cos_phi; | |
399 | + Ei += W[w].E() * jlAl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation | |
400 | + Ei += W[w].E() * jlAl[1] * Pl_1; | |
401 | + | |
402 | + #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) | |
403 | + for(l = 2; l <= LOCAL_NL; l++){ | |
404 | + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) | |
405 | + Ei += W[w].E() * jlAl[l] * Pl; //calculate and sum the current field order | |
406 | + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 | |
407 | + Pl_1 = Pl; | |
408 | + } | |
409 | + | |
410 | + for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) | |
411 | + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs | |
412 | + Ei += W[w].E() * shared_jA[shared_start + l - LOCAL_NL - 1] * Pl; | |
413 | + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 | |
414 | + Pl_1 = Pl; | |
415 | + } | |
416 | + } | |
417 | + E[i] = Ei; //copy the result to device memory | |
418 | +} | |
419 | + | |
420 | +template<typename T> | |
421 | +void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, size_t Nl){ | |
422 | + | |
423 | + size_t max_shared_mem = stim::sharedMemPerBlock(); | |
424 | + size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); | |
425 | + std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; | |
426 | + std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | |
427 | + int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); | |
428 | + std::cout<<"threads per block: "<<threads<<std::endl; | |
429 | + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks | |
430 | + | |
431 | + size_t shared_mem; | |
432 | + if(Nl <= LOCAL_NL) shared_mem = 0; | |
433 | + else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate | |
434 | + std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; | |
435 | + cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel | |
355 | 436 | } |
356 | 437 | |
357 | 438 | /// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere |
... | ... | @@ -365,18 +446,122 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti |
365 | 446 | /// @param a is the radius of the sphere |
366 | 447 | /// @param n is the complex refractive index of the sphere |
367 | 448 | template<typename T> |
368 | -void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n){ | |
369 | - | |
370 | - //calculate the necessary number of orders required to represent the scattered field | |
449 | +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, T r_spacing = 0.1){ | |
450 | +//calculate the necessary number of orders required to represent the scattered field | |
371 | 451 | T k = W[0].kmag(); |
372 | 452 | |
373 | - size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); | |
453 | + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); | |
454 | + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) | |
374 | 455 | std::cout<<"Nl: "<<Nl<<std::endl; |
375 | 456 | |
376 | 457 | //calculate the scattering coefficients for the sphere |
377 | 458 | stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients |
378 | 459 | A_coefficients(A, a, k, n, Nl); |
379 | 460 | |
461 | +#ifdef CUDA_FOUND | |
462 | + stim::complex<T>* dev_E; //allocate space for the field | |
463 | + cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); | |
464 | + cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); | |
465 | + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used) | |
466 | + | |
467 | + // COORDINATES | |
468 | + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) | |
469 | + if(x != NULL){ | |
470 | + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); | |
471 | + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); | |
472 | + } | |
473 | + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) | |
474 | + if(y != NULL){ | |
475 | + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); | |
476 | + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); | |
477 | + } | |
478 | + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) | |
479 | + if(z != NULL){ | |
480 | + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); | |
481 | + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); | |
482 | + } | |
483 | + | |
484 | + // PLANE WAVES | |
485 | + stim::scalarwave<T>* dev_W; //allocate space and copy plane waves | |
486 | + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); | |
487 | + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) ); | |
488 | + | |
489 | + // BESSEL FUNCTION LOOK-UP TABLE | |
490 | + //calculate the distance from the sphere center | |
491 | + T* dev_r; | |
492 | + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); | |
493 | + | |
494 | + int threads = stim::maxThreadsPerBlock(); | |
495 | + dim3 blocks((unsigned)(N / threads + 1)); | |
496 | + cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); | |
497 | + | |
498 | + //Find the minimum and maximum values of r | |
499 | + cublasStatus_t stat; | |
500 | + cublasHandle_t handle; | |
501 | + | |
502 | + stat = cublasCreate(&handle); //create a cuBLAS handle | |
503 | + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure | |
504 | + printf ("CUBLAS initialization failed\n"); | |
505 | + exit(1); | |
506 | + } | |
507 | + | |
508 | + int i_min, i_max; | |
509 | + stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min); | |
510 | + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure | |
511 | + printf ("CUBLAS Error: failed to calculate minimum r value.\n"); | |
512 | + exit(1); | |
513 | + } | |
514 | + stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max); | |
515 | + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure | |
516 | + printf ("CUBLAS Error: failed to calculate maximum r value.\n"); | |
517 | + exit(1); | |
518 | + } | |
519 | + | |
520 | + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility | |
521 | + i_max--; | |
522 | + T r_min, r_max; //allocate space to store the minimum and maximum values | |
523 | + HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU | |
524 | + HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); | |
525 | + | |
526 | + r_max = min(r_max, a); //the internal field doesn't exist outside of the sphere | |
527 | + | |
528 | + size_t N_jA_lut = (size_t)((r_max - r_min) / r_spacing + 1); | |
529 | + | |
530 | + //temporary variables | |
531 | + double vm; //allocate space to store the return values for the bessel function calculation | |
532 | + stim::complex<double>* jv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | |
533 | + stim::complex<double>* yv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | |
534 | + stim::complex<double>* djv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | |
535 | + stim::complex<double>* dyv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | |
536 | + | |
537 | + size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut; | |
538 | + stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes); //pointer to the look-up table | |
539 | + T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT | |
540 | + std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl; | |
541 | + stim::complex<T> hl; | |
542 | + stim::complex<double> nd = (stim::complex<double>)n; | |
543 | + for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT | |
544 | + stim::cbessjyva_sph<double>(Nl, nd * k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] | |
545 | + for(size_t l = 0; l <= Nl; l++){ //for each order | |
546 | + jA_lut[ri * (Nl + 1) + l] = (stim::complex<T>)(jv[l] * (stim::complex<double>)A[l]); //store the bessel function result | |
547 | + } | |
548 | + } | |
549 | + | |
550 | + //Allocate device memory and copy everything to the GPU | |
551 | + stim::complex<T>* dev_jA_lut; | |
552 | + HANDLE_ERROR( cudaMalloc(&dev_jA_lut, jA_bytes) ); | |
553 | + HANDLE_ERROR( cudaMemcpy(dev_jA_lut, jA_lut, jA_bytes, cudaMemcpyHostToDevice) ); | |
554 | + | |
555 | + gpu_scalar_mie_internal<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_jA_lut, r_min, dr, N_jA_lut, Nl); | |
556 | + | |
557 | + cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory | |
558 | + | |
559 | + if(x != NULL) cudaFree(dev_x); //free everything | |
560 | + if(y != NULL) cudaFree(dev_y); | |
561 | + if(z != NULL) cudaFree(dev_z); | |
562 | + cudaFree(dev_E); | |
563 | +#else | |
564 | + | |
380 | 565 | //allocate space to store the bessel function call results |
381 | 566 | double vm; |
382 | 567 | stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); |
... | ... | @@ -414,12 +599,13 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st |
414 | 599 | } |
415 | 600 | } |
416 | 601 | } |
602 | +#endif | |
417 | 603 | } |
418 | 604 | |
419 | 605 | template<typename T> |
420 | -void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){ | |
606 | +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){ | |
421 | 607 | std::vector< stim::scalarwave<T> > W(1, w); |
422 | - cpu_scalar_mie_internal(E, N, x, y, z, W, a, n); | |
608 | + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, r_spacing); | |
423 | 609 | } |
424 | 610 | |
425 | 611 | } | ... | ... |
stim/optics/scalarbeam.h
1 | 1 | #ifndef RTS_BEAM |
2 | 2 | #define RTS_BEAM |
3 | +#include <boost/math/special_functions/bessel.hpp> | |
3 | 4 | |
4 | 5 | #include "../math/vec3.h" |
5 | 6 | #include "../optics/scalarwave.h" |
... | ... | @@ -7,150 +8,68 @@ |
7 | 8 | #include "../math/legendre.h" |
8 | 9 | #include "../cuda/cudatools/devices.h" |
9 | 10 | #include "../cuda/cudatools/timer.h" |
11 | +#include "../optics/scalarfield.h" | |
10 | 12 | #include <cublas_v2.h> |
11 | 13 | #include <math_constants.h> |
12 | 14 | #include <vector> |
13 | 15 | #include <stdlib.h> |
14 | 16 | |
15 | -namespace stim{ | |
16 | 17 | |
17 | - /// Function returns the value of the scalar field produced by a beam with the specified parameters | |
18 | - | |
19 | - template<typename T> | |
20 | - std::vector< stim::vec3<T> > generate_focusing_vectors(size_t N, stim::vec3<T> d, T NA, T NA_in = 0){ | |
21 | - | |
22 | - std::vector< stim::vec3<T> > dirs(N); //allocate an array to store the focusing vectors | |
23 | - | |
24 | - ///compute the rotation operator to transform (0, 0, 1) to k | |
25 | - T cos_angle = d.dot(vec3<T>(0, 0, 1)); | |
26 | - stim::matrix<T, 3> rotation; | |
27 | - | |
28 | - //if the cosine of the angle is -1, the rotation is just a flip across the z axis | |
29 | - if(cos_angle == -1){ | |
30 | - rotation(2, 2) = -1; | |
31 | - } | |
32 | - else if(cos_angle != 1.0) | |
33 | - { | |
34 | - vec3<T> r_axis = vec3<T>(0, 0, 1).cross(d).norm(); //compute the axis of rotation | |
35 | - T angle = acos(cos_angle); //compute the angle of rotation | |
36 | - quaternion<T> quat; //create a quaternion describing the rotation | |
37 | - quat.CreateRotation(angle, r_axis); | |
38 | - rotation = quat.toMatrix3(); //compute the rotation matrix | |
39 | - } | |
40 | - | |
41 | - //find the phi values associated with the cassegrain ring | |
42 | - T PHI[2]; | |
43 | - PHI[0] = (T)asin(NA); | |
44 | - PHI[1] = (T)asin(NA_in); | |
45 | - | |
46 | - //calculate the z-axis cylinder coordinates associated with these angles | |
47 | - T Z[2]; | |
48 | - Z[0] = cos(PHI[0]); | |
49 | - Z[1] = cos(PHI[1]); | |
50 | - T range = Z[0] - Z[1]; | |
51 | - | |
52 | - //draw a distribution of random phi, z values | |
53 | - T z, phi, theta; | |
54 | - //T kmag = stim::TAU / lambda; | |
55 | - for(int i=0; i<N; i++){ //for each sample | |
56 | - z = (T)((double)rand() / (double)RAND_MAX) * range + Z[1]; //find a random position on the surface of a cylinder | |
57 | - theta = (T)(((double)rand() / (double)RAND_MAX) * stim::TAU); | |
58 | - phi = acos(z); //project onto the sphere, computing phi in spherical coordinates | |
59 | - | |
60 | - //compute and store cartesian coordinates | |
61 | - vec3<T> spherical(1, theta, phi); //convert from spherical to cartesian coordinates | |
62 | - vec3<T> cart = spherical.sph2cart(); | |
63 | - dirs[i] = rotation * cart; //create a sample vector | |
64 | - } | |
65 | - return dirs; | |
66 | - } | |
67 | - | |
68 | -/// Class stim::beam represents a beam of light focused at a point and composed of several plane waves | |
69 | -template<typename T> | |
70 | -class scalarbeam | |
71 | -{ | |
72 | -public: | |
73 | - //enum beam_type {Uniform, Bartlett, Hamming, Hanning}; | |
74 | - | |
75 | -private: | |
76 | - | |
77 | - T NA[2]; //numerical aperature of the focusing optics | |
78 | - vec3<T> f; //focal point | |
79 | - vec3<T> d; //propagation direction | |
80 | - stim::complex<T> A; //beam amplitude | |
81 | - T lambda; //beam wavelength | |
82 | -public: | |
83 | 18 | |
84 | - ///constructor: build a default beam (NA=1.0) | |
85 | - scalarbeam(T wavelength = 1, stim::complex<T> amplitude = 1, vec3<T> focal_point = vec3<T>(0, 0, 0), vec3<T> direction = vec3<T>(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){ | |
86 | - lambda = wavelength; | |
87 | - A = amplitude; | |
88 | - f = focal_point; | |
89 | - d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on) | |
90 | - NA[0] = numerical_aperture; | |
91 | - NA[1] = center_obsc; | |
92 | - } | |
19 | +namespace stim{ | |
93 | 20 | |
94 | - ///Numerical Aperature functions | |
95 | - void setNA(T na) | |
96 | - { | |
97 | - NA[0] = (T)0; | |
98 | - NA[1] = na; | |
99 | - } | |
100 | - void setNA(T na0, T na1) | |
101 | - { | |
102 | - NA[0] = na0; | |
103 | - NA[1] = na1; | |
104 | - } | |
21 | +/// Function returns the value of the scalar field produced by a beam with the specified parameters | |
105 | 22 | |
106 | - //Monte-Carlo decomposition into plane waves | |
107 | - std::vector< scalarwave<T> > mc(size_t N = 100000) const{ | |
23 | +template<typename T> | |
24 | +std::vector< stim::vec3<T> > generate_focusing_vectors(size_t N, stim::vec3<T> d, T NA, T NA_in = 0){ | |
108 | 25 | |
109 | - std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus | |
110 | - std::vector< scalarwave<T> > samples(N); //create a vector of plane waves | |
111 | - T kmag = (T)stim::TAU / lambda; //calculate the wavenumber | |
112 | - stim::complex<T> apw; //allocate space for the amplitude at the focal point | |
113 | - T a = (T)(stim::TAU * (1 - cos(asin(NA[0]))) / (double)N); | |
114 | - stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction | |
115 | - for(size_t i=0; i<N; i++){ //for each sample | |
116 | - kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave | |
117 | - apw = a * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave | |
118 | - samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction | |
119 | - } | |
26 | + std::vector< stim::vec3<T> > dirs(N); //allocate an array to store the focusing vectors | |
120 | 27 | |
121 | - return samples; | |
122 | - } | |
28 | + ///compute the rotation operator to transform (0, 0, 1) to k | |
29 | + T cos_angle = d.dot(vec3<T>(0, 0, 1)); | |
30 | + stim::matrix<T, 3> rotation; | |
123 | 31 | |
124 | - /// Calculate the field at a given point | |
125 | - /// @param x is the x-coordinate of the field point | |
126 | - /// @O is the approximation accuracy | |
127 | - stim::complex<T> field(T x, T y, T z, size_t O){ | |
128 | - std::vector< scalarwave<T> > W = mc(O); | |
129 | - T result = 0; //initialize the result to zero (0) | |
130 | - for(size_t i = 0; i < O; i++){ //for each plane wave | |
131 | - result += W[i].pos(x, y, z); | |
132 | - } | |
133 | - return result; | |
32 | + //if the cosine of the angle is -1, the rotation is just a flip across the z axis | |
33 | + if(cos_angle == -1){ | |
34 | + rotation(2, 2) = -1; | |
134 | 35 | } |
135 | - | |
136 | - std::string str() | |
36 | + else if(cos_angle != 1.0) | |
137 | 37 | { |
138 | - std::stringstream ss; | |
139 | - ss<<"Beam:"<<std::endl; | |
140 | - //ss<<" Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl; | |
141 | - ss<<" Beam Direction: "<<d<<std::endl; | |
142 | - if(NA[0] == 0) | |
143 | - ss<<" NA: "<<NA[1]; | |
144 | - else | |
145 | - ss<<" NA: "<<NA[0]<<" -- "<<NA[1]; | |
146 | - | |
147 | - return ss.str(); | |
38 | + vec3<T> r_axis = vec3<T>(0, 0, 1).cross(d).norm(); //compute the axis of rotation | |
39 | + T angle = acos(cos_angle); //compute the angle of rotation | |
40 | + quaternion<T> quat; //create a quaternion describing the rotation | |
41 | + quat.CreateRotation(angle, r_axis); | |
42 | + rotation = quat.toMatrix3(); //compute the rotation matrix | |
148 | 43 | } |
149 | 44 | |
45 | + //find the phi values associated with the cassegrain ring | |
46 | + T PHI[2]; | |
47 | + PHI[0] = (T)asin(NA); | |
48 | + PHI[1] = (T)asin(NA_in); | |
49 | + | |
50 | + //calculate the z-axis cylinder coordinates associated with these angles | |
51 | + T Z[2]; | |
52 | + Z[0] = cos(PHI[0]); | |
53 | + Z[1] = cos(PHI[1]); | |
54 | + T range = Z[0] - Z[1]; | |
55 | + | |
56 | + //draw a distribution of random phi, z values | |
57 | + T z, phi, theta; | |
58 | + //T kmag = stim::TAU / lambda; | |
59 | + for(int i=0; i<N; i++){ //for each sample | |
60 | + z = (T)((double)rand() / (double)RAND_MAX) * range + Z[1]; //find a random position on the surface of a cylinder | |
61 | + theta = (T)(((double)rand() / (double)RAND_MAX) * stim::TAU); | |
62 | + phi = acos(z); //project onto the sphere, computing phi in spherical coordinates | |
63 | + | |
64 | + //compute and store cartesian coordinates | |
65 | + vec3<T> spherical(1, theta, phi); //convert from spherical to cartesian coordinates | |
66 | + vec3<T> cart = spherical.sph2cart(); | |
67 | + dirs[i] = rotation * cart; //create a sample vector | |
68 | + } | |
69 | + return dirs; | |
70 | +} | |
150 | 71 | |
151 | - | |
152 | -}; //end beam | |
153 | - | |
72 | + | |
154 | 73 | /// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional) |
155 | 74 | /// @param C is a pointer to Nl + 1 values where the terms will be stored |
156 | 75 | template<typename T> |
... | ... | @@ -210,20 +129,19 @@ CUDA_CALLABLE T lerp(T v0, T v1, T t) { |
210 | 129 | return fma(t, v1, fma(-t, v0, v0)); |
211 | 130 | } |
212 | 131 | |
213 | -#ifdef __CUDACC__ | |
132 | +#ifdef CUDA_FOUND | |
214 | 133 | template<typename T> |
215 | -__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T k, T A, size_t Nl, | |
134 | +__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T A, size_t Nl, | |
216 | 135 | T* C, |
217 | - T* lut_j, size_t Nj, T min_kr, T dkr){ | |
136 | + T* lut_j, size_t Nj, T min_r, T dr){ | |
218 | 137 | size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array |
219 | 138 | if(i >= N) return; //exit if this thread is outside the array |
220 | 139 | |
221 | 140 | T cos_phi = cos(phi[i]); //calculate the thread value for cos(phi) |
222 | - T kr = r[i] * k; //calculate the thread value for k*r | |
223 | 141 | stim::complex<T> Ei = 0; //initialize the value of the field to zero |
224 | 142 | size_t NC = Nl + 1; //calculate the number of coefficients to be used |
225 | 143 | |
226 | - T fij = (kr - min_kr)/dkr; //FP index into the spherical bessel LUT | |
144 | + T fij = (r[i] - min_r)/dr; //FP index into the spherical bessel LUT | |
227 | 145 | size_t ij = (size_t) fij; //convert to an integral index |
228 | 146 | T a = fij - ij; //calculate the fractional portion of the index |
229 | 147 | size_t n0j = ij * (NC); //start of the first entry in the LUT |
... | ... | @@ -276,6 +194,8 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, |
276 | 194 | exit(1); |
277 | 195 | } |
278 | 196 | |
197 | + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility | |
198 | + i_max--; | |
279 | 199 | T r_min, r_max; //allocate space to store the minimum and maximum values |
280 | 200 | HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU |
281 | 201 | HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); |
... | ... | @@ -287,29 +207,19 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, |
287 | 207 | |
288 | 208 | size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r |
289 | 209 | |
290 | - T kr_min = k * r_min; | |
291 | - T kr_max = k * r_max; | |
292 | - | |
293 | - //temporary variables | |
294 | - double vm; //allocate space to store the return values for the bessel function calculation | |
295 | - double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); | |
296 | - double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); | |
297 | - double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); | |
298 | - double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); | |
299 | 210 | |
300 | 211 | size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j; |
301 | - T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table | |
302 | - T delta_kr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT | |
303 | - std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl; | |
304 | - for(size_t kri = 0; kri < Nlut_j; kri++){ //for each value in the LUT | |
305 | - stim::bessjyv_sph<double>(Nl, kr_min + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] | |
212 | + T* j_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table | |
213 | + T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT | |
214 | + T jl; | |
215 | + for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT | |
306 | 216 | for(size_t l = 0; l <= Nl; l++){ //for each order |
307 | - bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result | |
217 | + jl = boost::math::sph_bessel<T>(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function | |
218 | + j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result | |
308 | 219 | } |
309 | 220 | } |
310 | 221 | |
311 | - stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); | |
312 | - | |
222 | + stim::cpu2image<T>(j_lut, "j_lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); | |
313 | 223 | //Allocate device memory and copy everything to the GPU |
314 | 224 | |
315 | 225 | T* gpu_C; |
... | ... | @@ -317,12 +227,12 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, |
317 | 227 | HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) ); |
318 | 228 | T* gpu_j_lut; |
319 | 229 | HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) ); |
320 | - HANDLE_ERROR( cudaMemcpy(gpu_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) ); | |
230 | + HANDLE_ERROR( cudaMemcpy(gpu_j_lut, j_lut, lutj_bytes, cudaMemcpyHostToDevice) ); | |
321 | 231 | |
322 | 232 | int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device |
323 | 233 | dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks |
324 | 234 | |
325 | - cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, (T)stim::TAU/lambda, A, Nl, gpu_C, gpu_j_lut, Nlut_j, kr_min, delta_kr); | |
235 | + cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, A, Nl, gpu_C, gpu_j_lut, Nlut_j, r_min, dr); | |
326 | 236 | |
327 | 237 | //free the LUT and condenser tables |
328 | 238 | HANDLE_ERROR( cudaFree(gpu_C) ); |
... | ... | @@ -392,7 +302,7 @@ __global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::ve |
392 | 302 | phi[i] = ps[2]; //phi = [0 pi] |
393 | 303 | } |
394 | 304 | |
395 | -#ifdef __CUDACC__ | |
305 | +#ifdef CUDA_FOUND | |
396 | 306 | /// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates |
397 | 307 | template<typename T> |
398 | 308 | void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){ |
... | ... | @@ -419,7 +329,7 @@ template<typename T> |
419 | 329 | void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){ |
420 | 330 | |
421 | 331 | // If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel |
422 | -#ifdef __CUDACC__ | |
332 | +#ifdef CUDA_FOUND | |
423 | 333 | |
424 | 334 | T* gpu_x = NULL; |
425 | 335 | if(x != NULL){ |
... | ... | @@ -470,13 +380,112 @@ void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lamb |
470 | 380 | phi[i] = ps[2]; //phi = [0 pi] |
471 | 381 | } |
472 | 382 | |
473 | - cpu_scalar_psf_local(F, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function | |
383 | + cpu_scalar_psf_local(E, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function | |
474 | 384 | |
475 | 385 | free(r); |
476 | 386 | free(phi); |
477 | 387 | #endif |
478 | 388 | } |
389 | + | |
390 | +/// Class stim::beam represents a beam of light focused at a point and composed of several plane waves | |
391 | +template<typename T> | |
392 | +class scalarbeam | |
393 | +{ | |
394 | +public: | |
395 | + //enum beam_type {Uniform, Bartlett, Hamming, Hanning}; | |
396 | + | |
397 | +private: | |
398 | + | |
399 | + T NA[2]; //numerical aperature of the focusing optics | |
400 | + vec3<T> f; //focal point | |
401 | + vec3<T> d; //propagation direction | |
402 | + T A; //beam amplitude | |
403 | + T lambda; //beam wavelength | |
404 | +public: | |
479 | 405 | |
406 | + ///constructor: build a default beam (NA=1.0) | |
407 | + scalarbeam(T wavelength = 1, T amplitude = 1, vec3<T> focal_point = vec3<T>(0, 0, 0), vec3<T> direction = vec3<T>(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){ | |
408 | + lambda = wavelength; | |
409 | + A = amplitude; | |
410 | + f = focal_point; | |
411 | + d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on) | |
412 | + NA[0] = numerical_aperture; | |
413 | + NA[1] = center_obsc; | |
414 | + } | |
415 | + | |
416 | + ///Numerical Aperature functions | |
417 | + void setNA(T na) | |
418 | + { | |
419 | + NA[0] = (T)0; | |
420 | + NA[1] = na; | |
421 | + } | |
422 | + void setNA(T na0, T na1) | |
423 | + { | |
424 | + NA[0] = na0; | |
425 | + NA[1] = na1; | |
426 | + } | |
427 | + | |
428 | + //Monte-Carlo decomposition into plane waves | |
429 | + std::vector< scalarwave<T> > mc(size_t N = 100000) const{ | |
430 | + | |
431 | + std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus | |
432 | + std::vector< scalarwave<T> > samples(N); //create a vector of plane waves | |
433 | + T kmag = (T)stim::TAU / lambda; //calculate the wavenumber | |
434 | + stim::complex<T> apw; //allocate space for the amplitude at the focal point | |
435 | + T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N) | |
436 | + stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction | |
437 | + for(size_t i=0; i<N; i++){ //for each sample | |
438 | + kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave | |
439 | + apw = a * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave | |
440 | + samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction | |
441 | + } | |
442 | + return samples; | |
443 | + } | |
444 | + | |
445 | + /// Evaluate the beam to a scalar field using Debye focusing | |
446 | + void eval(stim::scalarfield<T>& E, size_t order = 500){ | |
447 | + size_t array_size = E.grid_bytes(); | |
448 | + T* X = (T*) malloc( array_size ); //allocate space for the coordinate meshes | |
449 | + T* Y = (T*) malloc( array_size ); | |
450 | + T* Z = (T*) malloc( array_size ); | |
451 | + | |
452 | + E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes | |
453 | + cpu_scalar_psf_cart<T>(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing()); | |
454 | + | |
455 | + free(X); //free the coordinate meshes | |
456 | + free(Y); | |
457 | + free(Z); | |
458 | + } | |
459 | + | |
460 | + /// Calculate the field at a given point | |
461 | + /// @param x is the x-coordinate of the field point | |
462 | + /// @O is the approximation accuracy | |
463 | + stim::complex<T> field(T x, T y, T z, size_t O){ | |
464 | + std::vector< scalarwave<T> > W = mc(O); | |
465 | + T result = 0; //initialize the result to zero (0) | |
466 | + for(size_t i = 0; i < O; i++){ //for each plane wave | |
467 | + result += W[i].pos(x, y, z); | |
468 | + } | |
469 | + return result; | |
470 | + } | |
471 | + | |
472 | + std::string str() | |
473 | + { | |
474 | + std::stringstream ss; | |
475 | + ss<<"Beam:"<<std::endl; | |
476 | + //ss<<" Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl; | |
477 | + ss<<" Beam Direction: "<<d<<std::endl; | |
478 | + if(NA[0] == 0) | |
479 | + ss<<" NA: "<<NA[1]; | |
480 | + else | |
481 | + ss<<" NA: "<<NA[0]<<" -- "<<NA[1]; | |
482 | + | |
483 | + return ss.str(); | |
484 | + } | |
485 | + | |
486 | + | |
487 | + | |
488 | +}; //end beam | |
480 | 489 | } //end namespace stim |
481 | 490 | |
482 | 491 | #endif | ... | ... |