Commit 8309b07a16caaadfd4bc85bfbebdf7041a63f0e0

Authored by David Mayerich
1 parent 31262e83

fixed some vec3 errors

stim/math/complex.h
... ... @@ -11,6 +11,7 @@
11 11  
12 12 namespace stim
13 13 {
  14 + enum complexComponentType {complexReal, complexImaginary, complexMag};
14 15  
15 16 template <class T>
16 17 struct complex
... ...
stim/math/plane_old.h deleted
1   -#ifndef RTS_PLANE_H
2   -#define RTS_PLANE_H
3   -
4   -#include <iostream>
5   -#include <stim/math/vector.h>
6   -#include "rts/cuda/callable.h"
7   -
8   -
9   -namespace stim{
10   -template <typename T, int D> class plane;
11   -}
12   -
13   -template <typename T, int D>
14   -CUDA_CALLABLE stim::plane<T, D> operator-(stim::plane<T, D> v);
15   -
16   -namespace stim{
17   -
18   -template <class T, int D = 3>
19   -class plane{
20   -
21   - //a plane is defined by a point and a normal
22   -
23   -private:
24   -
25   - vec<T, D> P; //point on the plane
26   - vec<T, D> N; //plane normal
27   -
28   - CUDA_CALLABLE void init(){
29   - P = vec<T, D>(0, 0, 0);
30   - N = vec<T, D>(0, 0, 1);
31   - }
32   -
33   -
34   -public:
35   -
36   - //default constructor
37   - CUDA_CALLABLE plane(){
38   - init();
39   - }
40   -
41   - CUDA_CALLABLE plane(vec<T, D> n, vec<T, D> p = vec<T, D>(0, 0, 0)){
42   - P = p;
43   - N = n.norm();
44   - }
45   -
46   - CUDA_CALLABLE plane(T z_pos){
47   - init();
48   - P[2] = z_pos;
49   - }
50   -
51   - //create a plane from three points (a triangle)
52   - CUDA_CALLABLE plane(vec<T, D> a, vec<T, D> b, vec<T, D> c){
53   - P = c;
54   - N = (c - a).cross(b - a);
55   - if(N.len() == 0) //handle the degenerate case when two vectors are the same, N = 0
56   - N = 0;
57   - else
58   - N = N.norm();
59   - }
60   -
61   - template< typename U >
62   - CUDA_CALLABLE operator plane<U, D>(){
63   -
64   - plane<U, D> result(N, P);
65   - return result;
66   - }
67   -
68   - CUDA_CALLABLE vec<T, D> norm(){
69   - return N;
70   - }
71   -
72   - CUDA_CALLABLE vec<T, D> p(){
73   - return P;
74   - }
75   -
76   - //flip the plane front-to-back
77   - CUDA_CALLABLE plane<T, D> flip(){
78   - plane<T, D> result = *this;
79   - result.N = -result.N;
80   - return result;
81   - }
82   -
83   - //determines how a vector v intersects the plane (1 = intersects front, 0 = within plane, -1 = intersects back)
84   - CUDA_CALLABLE int face(vec<T, D> v){
85   -
86   - T dprod = v.dot(N); //get the dot product between v and N
87   -
88   - //conditional returns the appropriate value
89   - if(dprod < 0)
90   - return 1;
91   - else if(dprod > 0)
92   - return -1;
93   - else
94   - return 0;
95   - }
96   -
97   - //determine on which side of the plane a point lies (1 = front, 0 = on the plane, -1 = back)
98   - CUDA_CALLABLE int side(vec<T, D> p){
99   -
100   - vec<T, D> v = p - P; //get the vector from P to the query point p
101   -
102   - return face(v);
103   - }
104   -
105   - //compute the component of v that is perpendicular to the plane
106   - CUDA_CALLABLE vec<T, D> perpendicular(vec<T, D> v){
107   - return N * v.dot(N);
108   - }
109   -
110   - //compute the projection of v in the plane
111   - CUDA_CALLABLE vec<T, D> parallel(vec<T, D> v){
112   - return v - perpendicular(v);
113   - }
114   -
115   - CUDA_CALLABLE void decompose(vec<T, D> v, vec<T, D>& para, vec<T, D>& perp){
116   - perp = N * v.dot(N);
117   - para = v - perp;
118   - }
119   -
120   - //get both the parallel and perpendicular components of a vector v w.r.t. the plane
121   - CUDA_CALLABLE void project(vec<T, D> v, vec<T, D> &v_par, vec<T, D> &v_perp){
122   -
123   - v_perp = v.dot(N);
124   - v_par = v - v_perp;
125   - }
126   -
127   - //compute the reflection of v off of the plane
128   - CUDA_CALLABLE vec<T, D> reflect(vec<T, D> v){
129   -
130   - //compute the reflection using N_prime as the plane normal
131   - vec<T, D> par = parallel(v);
132   - vec<T, D> r = (-v) + par * 2;
133   -
134   - /*std::cout<<"----------------REFLECT-----------------------------"<<std::endl;
135   - std::cout<<str()<<std::endl;
136   - std::cout<<"v: "<<v<<std::endl;
137   - std::cout<<"r: "<<r<<std::endl;
138   - std::cout<<"Perpendicular: "<<perpendicular(v)<<std::endl;
139   - std::cout<<"Parallel: "<<par<<std::endl;*/
140   - return r;
141   -
142   - }
143   -
144   - CUDA_CALLABLE rts::plane<T, D> operator-()
145   - {
146   - rts::plane<T, D> p = *this;
147   -
148   - //negate the normal vector
149   - p.N = -p.N;
150   -
151   - return p;
152   - }
153   -
154   - //output a string
155   - std::string str(){
156   - std::stringstream ss;
157   - ss<<"P: "<<P<<std::endl;
158   - ss<<"N: "<<N;
159   - return ss.str();
160   - }
161   -
162   - ///////Friendship
163   - //friend CUDA_CALLABLE rts::plane<T, D> operator- <> (rts::plane<T, D> v);
164   -
165   -
166   -
167   -};
168   -
169   -}
170   -
171   -//arithmetic operators
172   -
173   -//negative operator flips the plane (front to back)
174   -//template <typename T, int D>
175   -
176   -
177   -
178   -
179   -#endif
stim/math/quad.h deleted
1   -#ifndef RTS_QUAD_H
2   -#define RTS_QUAD_H
3   -
4   -//enable CUDA_CALLABLE macro
5   -#include <stim/cuda/callable.h>
6   -#include <stim/math/vector.h>
7   -#include <stim/math/triangle.h>
8   -#include <stim/math/quaternion.h>
9   -#include <iostream>
10   -#include <iomanip>
11   -#include <algorithm>
12   -
13   -namespace stim{
14   -
15   -//template for a quadangle class in ND space
16   -template <class T, int N = 3>
17   -struct quad
18   -{
19   - /*
20   - B------------------>C
21   - ^ ^
22   - | |
23   - Y |
24   - | |
25   - | |
26   - A---------X-------->O
27   - */
28   -
29   - /*T A[N];
30   - T B[N];
31   - T C[N];*/
32   -
33   - rts::vec<T, N> A;
34   - rts::vec<T, N> X;
35   - rts::vec<T, N> Y;
36   -
37   -
38   - CUDA_CALLABLE quad()
39   - {
40   -
41   - }
42   -
43   - CUDA_CALLABLE quad(vec<T, N> a, vec<T, N> b, vec<T, N> c)
44   - {
45   -
46   - A = a;
47   - Y = b - a;
48   - X = c - a - Y;
49   -
50   - }
51   -
52   - /*******************************************************************
53   - Constructor - create a quad from a position, normal, and rotation
54   - *******************************************************************/
55   - CUDA_CALLABLE quad(rts::vec<T, N> c, rts::vec<T, N> normal, T width, T height, T theta)
56   - {
57   -
58   - //compute the X direction - start along world-space X
59   - Y = rts::vec<T, N>(0, 1, 0);
60   - if(Y == normal)
61   - Y = rts::vec<T, N>(0, 0, 1);
62   -
63   - X = Y.cross(normal).norm();
64   -
65   - std::cout<<X<<std::endl;
66   -
67   - //rotate the X axis by theta radians
68   - rts::quaternion<T> q;
69   - q.CreateRotation(theta, normal);
70   - X = q.toMatrix3() * X;
71   - Y = normal.cross(X);
72   -
73   - //normalize everything
74   - X = X.norm();
75   - Y = Y.norm();
76   -
77   - //scale to match the quad width and height
78   - X = X * width;
79   - Y = Y * height;
80   -
81   - //set the corner of the plane
82   - A = c - X * 0.5f - Y * 0.5f;
83   -
84   - std::cout<<X<<std::endl;
85   - }
86   -
87   - //boolean comparison
88   - bool operator==(const quad<T, N> & rhs)
89   - {
90   - if(A == rhs.A && X == rhs.X && Y == rhs.Y)
91   - return true;
92   - else
93   - return false;
94   - }
95   -
96   - /*******************************************
97   - Return the normal for the quad
98   - *******************************************/
99   - CUDA_CALLABLE rts::vec<T, N> n()
100   - {
101   - return (X.cross(Y)).norm();
102   - }
103   -
104   - CUDA_CALLABLE rts::vec<T, N> p(T a, T b)
105   - {
106   - rts::vec<T, N> result;
107   - //given the two parameters a, b = [0 1], returns the position in world space
108   - result = A + X * a + Y * b;
109   -
110   - return result;
111   - }
112   -
113   - CUDA_CALLABLE rts::vec<T, N> operator()(T a, T b)
114   - {
115   - return p(a, b);
116   - }
117   -
118   - std::string str()
119   - {
120   - std::stringstream ss;
121   -
122   - ss<<std::left<<"B="<<setfill('-')<<setw(20)<<A + Y<<">"<<"C="<<A + Y + X<<std::endl;
123   - ss<<setfill(' ')<<setw(23)<<"|"<<"|"<<std::endl<<setw(23)<<"|"<<"|"<<std::endl;
124   - ss<<std::left<<"A="<<setfill('-')<<setw(20)<<A<<">"<<"D="<<A + X;
125   -
126   - return ss.str();
127   -
128   - }
129   -
130   - CUDA_CALLABLE quad<T, N> operator*(T rhs)
131   - {
132   - //scales the plane by a scalar value
133   -
134   - //compute the center point
135   - rts::vec<T, N> c = A + X*0.5f + Y*0.5f;
136   -
137   - //create the new quadangle
138   - quad<T, N> result;
139   - result.X = X * rhs;
140   - result.Y = Y * rhs;
141   - result.A = c - result.X*0.5f - result.Y*0.5f;
142   -
143   - return result;
144   -
145   - }
146   -
147   - CUDA_CALLABLE T dist(vec<T, N> p)
148   - {
149   - //compute the distance between a point and this quad
150   -
151   - //first break the quad up into two triangles
152   - triangle<T, N> T0(A, A+X, A+Y);
153   - triangle<T, N> T1(A+X+Y, A+X, A+Y);
154   -
155   -
156   - T d0 = T0.dist(p);
157   - T d1 = T1.dist(p);
158   -
159   - if(d0 < d1)
160   - return d0;
161   - else
162   - return d1;
163   - }
164   -
165   - CUDA_CALLABLE T dist_max(vec<T, N> p)
166   - {
167   - T da = (A - p).len();
168   - T db = (A+X - p).len();
169   - T dc = (A+Y - p).len();
170   - T dd = (A+X+Y - p).len();
171   -
172   - return std::max( da, std::max(db, std::max(dc, dd) ) );
173   - }
174   -};
175   -
176   -} //end namespace rts
177   -
178   -template <typename T, int N>
179   -std::ostream& operator<<(std::ostream& os, rts::quad<T, N> R)
180   -{
181   - os<<R.str();
182   - return os;
183   -}
184   -
185   -
186   -#endif
stim/math/rect.h
... ... @@ -28,13 +28,10 @@ class rect : plane &lt;T&gt;
28 28 O---------X--------->
29 29 */
30 30  
31   -private:
32   -
33   - stim::vec<T> X;
34   - stim::vec<T> Y;
35   -
36   -
  31 +protected:
37 32  
  33 + stim::vec3<T> X;
  34 + stim::vec3<T> Y;
38 35  
39 36 public:
40 37  
... ... @@ -65,7 +62,7 @@ public:
65 62 ///create a rectangle from a center point, normal
66 63 ///@param c: x,y,z location of the center.
67 64 ///@param n: x,y,z direction of the normal.
68   - CUDA_CALLABLE rect(vec<T> c, vec<T> n = vec<T>(0, 0, 1))
  65 + CUDA_CALLABLE rect(vec3<T> c, vec3<T> n = vec3<T>(0, 0, 1))
69 66 : plane<T>()
70 67 {
71 68 init(); //start with the default setting
... ... @@ -76,7 +73,7 @@ public:
76 73 ///@param c: x,y,z location of the center.
77 74 ///@param s: size of the rectangle.
78 75 ///@param n: x,y,z direction of the normal.
79   - CUDA_CALLABLE rect(vec<T> c, T s, vec<T> n = vec<T>(0, 0, 1))
  76 + CUDA_CALLABLE rect(vec3<T> c, T s, vec3<T> n = vec3<T>(0, 0, 1))
80 77 : plane<T>()
81 78 {
82 79 init(); //start with the default setting
... ... @@ -89,7 +86,7 @@ public:
89 86 ///@param center: x,y,z location of the center.
90 87 ///@param directionX: u,v,w direction of the X vector.
91 88 ///@param directionY: u,v,w direction of the Y vector.
92   - CUDA_CALLABLE rect(vec<T> center, vec<T> directionX, vec<T> directionY )
  89 + CUDA_CALLABLE rect(vec3<T> center, vec3<T> directionX, vec3<T> directionY )
93 90 : plane<T>((directionX.cross(directionY)).norm(),center)
94 91 {
95 92 X = directionX;
... ... @@ -101,7 +98,7 @@ public:
101 98 ///@param center: x,y,z location of the center.
102 99 ///@param directionX: u,v,w direction of the X vector.
103 100 ///@param directionY: u,v,w direction of the Y vector.
104   - CUDA_CALLABLE rect(T size, vec<T> center, vec<T> directionX, vec<T> directionY )
  101 + CUDA_CALLABLE rect(T size, vec3<T> center, vec3<T> directionX, vec3<T> directionY )
105 102 : plane<T>((directionX.cross(directionY)).norm(),center)
106 103 {
107 104 X = directionX;
... ... @@ -114,7 +111,7 @@ public:
114 111 ///@param center: x,y,z location of the center.
115 112 ///@param directionX: u,v,w direction of the X vector.
116 113 ///@param directionY: u,v,w direction of the Y vector.
117   - CUDA_CALLABLE rect(vec<T> size, vec<T> center, vec<T> directionX, vec<T> directionY)
  114 + CUDA_CALLABLE rect(vec3<T> size, vec3<T> center, vec3<T> directionX, vec3<T> directionY)
118 115 : plane<T>((directionX.cross(directionY)).norm(), center)
119 116 {
120 117 X = directionX;
... ... @@ -138,7 +135,7 @@ public:
138 135  
139 136 ///@param n; vector with the normal.
140 137 ///Orients the rectangle along the normal n.
141   - CUDA_CALLABLE void normal(vec<T> n)
  138 + CUDA_CALLABLE void normal(vec3<T> n)
142 139 {
143 140 //orient the rectangle along the specified normal
144 141 rotate(n, X, Y);
... ... @@ -147,8 +144,8 @@ public:
147 144 ///general init method that sets a general rectangle.
148 145 CUDA_CALLABLE void init()
149 146 {
150   - X = vec<T>(1, 0, 0);
151   - Y = vec<T>(0, 1, 0);
  147 + X = vec3<T>(1, 0, 0);
  148 + Y = vec3<T>(0, 1, 0);
152 149 }
153 150  
154 151 //boolean comparison
... ... @@ -162,18 +159,18 @@ public:
162 159  
163 160  
164 161 //get the world space value given the planar coordinates a, b in [0, 1]
165   - CUDA_CALLABLE stim::vec<T> p(T a, T b)
  162 + CUDA_CALLABLE stim::vec3<T> p(T a, T b)
166 163 {
167   - stim::vec<T> result;
  164 + stim::vec3<T> result;
168 165 //given the two parameters a, b = [0 1], returns the position in world space
169   - vec<T> A = this->P - X * (T)0.5 - Y * (T)0.5;
  166 + vec3<T> A = this->P - X * (T)0.5 - Y * (T)0.5;
170 167 result = A + X * a + Y * b;
171 168  
172 169 return result;
173 170 }
174 171  
175 172 //parenthesis operator returns the world space given rectangular coordinates a and b in [0 1]
176   - CUDA_CALLABLE stim::vec<T> operator()(T a, T b)
  173 + CUDA_CALLABLE stim::vec3<T> operator()(T a, T b)
177 174 {
178 175 return p(a, b);
179 176 }
... ... @@ -181,12 +178,12 @@ public:
181 178 std::string str()
182 179 {
183 180 std::stringstream ss;
184   - vec<T> A = P - X * (T)0.5 - Y * (T)0.5;
  181 + vec3<T> A = P - X * (T)0.5 - Y * (T)0.5;
185 182 ss<<std::left<<"B="<<std::setfill('-')<<std::setw(20)<<A + Y<<">"<<"C="<<A + Y + X<<std::endl;
186 183 ss<<std::setfill(' ')<<std::setw(23)<<"|"<<"|"<<std::endl<<std::setw(23)<<"|"<<"|"<<std::endl;
187 184 ss<<std::left<<"A="<<std::setfill('-')<<std::setw(20)<<A<<">"<<"D="<<A + X;
188 185  
189   - return ss.str();
  186 + return ss.str();
190 187  
191 188 }
192 189  
... ... @@ -205,11 +202,11 @@ public:
205 202  
206 203 ///computes the distance between the specified point and this rectangle.
207 204 ///@param p: x, y, z coordinates of the point to calculate distance to.
208   - CUDA_CALLABLE T dist(vec<T> p)
  205 + CUDA_CALLABLE T dist(vec3<T> p)
209 206 {
210 207 //compute the distance between a point and this rect
211 208  
212   - vec<T> A = P - X * (T)0.5 - Y * (T)0.5;
  209 + vec3<T> A = P - X * (T)0.5 - Y * (T)0.5;
213 210  
214 211 //first break the rect up into two triangles
215 212 triangle<T> T0(A, A+X, A+Y);
... ... @@ -225,16 +222,16 @@ public:
225 222 return d1;
226 223 }
227 224  
228   - CUDA_CALLABLE T center(vec<T> p)
  225 + CUDA_CALLABLE T center(vec3<T> p)
229 226 {
230 227 this->P = p;
231 228 }
232 229  
233 230 ///Returns the maximum distance of the rectangle from a point p to the sides of the rectangle.
234 231 ///@param p: x, y, z point.
235   - CUDA_CALLABLE T dist_max(vec<T> p)
  232 + CUDA_CALLABLE T dist_max(vec3<T> p)
236 233 {
237   - vec<T> A = P - X * (T)0.5 - Y * (T)0.5;
  234 + vec3<T> A = P - X * (T)0.5 - Y * (T)0.5;
238 235 T da = (A - p).len();
239 236 T db = (A+X - p).len();
240 237 T dc = (A+Y - p).len();
... ...
stim/math/vec3.h
... ... @@ -242,4 +242,11 @@ stim::vec3&lt;T&gt; operator*(T lhs, stim::vec3&lt;T&gt; rhs){
242 242 return rhs * lhs;
243 243 }
244 244  
  245 +//stream operator
  246 +template<typename T>
  247 +std::ostream& operator<<(std::ostream& os, stim::vec3<T> const& rhs){
  248 + os<<rhs.str();
  249 + return os;
  250 +}
  251 +
245 252 #endif
246 253 \ No newline at end of file
... ...
stim/math/vector.h
... ... @@ -5,8 +5,9 @@
5 5 #include <cmath>
6 6 #include <sstream>
7 7 #include <vector>
8   -
  8 +
9 9 #include <stim/cuda/cudatools/callable.h>
  10 +#include <stim/math/vec3.h>
10 11  
11 12 namespace stim
12 13 {
... ... @@ -70,8 +71,8 @@ struct vec : public std::vector&lt;T&gt;
70 71 size_t N = other.size();
71 72 resize(N); //resize the current vector to match the copy
72 73 for(size_t i=0; i<N; i++){ //copy each element
73   - at(i) = other[i];
74   - }
  74 + at(i) = other[i];
  75 + }
75 76 }
76 77  
77 78 //I'm not sure what these were doing here.
... ... @@ -318,8 +319,8 @@ struct vec : public std::vector&lt;T&gt;
318 319 }
319 320  
320 321 /// Cast to a vec3
321   - operator vec3<T>(){
322   - vec3<T> r;
  322 + operator stim::vec3<T>(){
  323 + stim::vec3<T> r;
323 324 size_t N = std::min<size_t>(size(), 3);
324 325 for(size_t i = 0; i < N; i++)
325 326 r[i] = at(i);
... ...
stim/optics/mie.h
1 1 #ifndef STIM_MIE_H
2 2 #define STIM_MIE_H
  3 +#include <boost/math/special_functions/bessel.hpp>
3 4  
4 5 #include "scalarwave.h"
5 6 #include "../math/bessel.h"
... ... @@ -43,7 +44,6 @@ void B_coefficients(stim::complex&lt;T&gt;* B, T a, T k, stim::complex&lt;T&gt; n, int Nl){
43 44 numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l];
44 45 denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
45 46 B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
46   - std::cout<<B[l]<<std::endl;
47 47 }
48 48 }
49 49  
... ... @@ -84,7 +84,7 @@ void A_coefficients(stim::complex&lt;T&gt;* A, T a, T k, stim::complex&lt;T&gt; n, int Nl){
84 84  
85 85 #define LOCAL_NL 16
86 86 template<typename T>
87   -__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, int Nl){
  87 +__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){
88 88 extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory
89 89  
90 90 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
... ... @@ -96,14 +96,11 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T*
96 96  
97 97 T r = p.len(); //calculate the distance from the sphere
98 98 if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field)
99   - T k = W[0].kmag();
100   - size_t NC = Nl + 1; //calculate the number of coefficients to be used
101   - T kr = r * k; //calculate the thread value for k*r
102   - T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT
  99 + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT
103 100 size_t ij = (size_t) fij; //convert to an integral index
104 101 T alpha = fij - ij; //calculate the fractional portion of the index
105   - size_t n0j = ij * (NC); //start of the first entry in the LUT
106   - size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT
  102 + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT
  103 + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT
107 104  
108 105 T cos_phi;
109 106 T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials
... ... @@ -112,37 +109,36 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T*
112 109 stim::complex<T> Ei = 0; //create a register to store the result
113 110 int l;
114 111  
115   - stim::complex<T> hlBl[LOCAL_NL+1];
116   - int shared_start = threadIdx.x * (Nl - LOCAL_NL);
  112 + stim::complex<T> hlBl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed
  113 + int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops
117 114  
118   - #pragma unroll LOCAL_NL+1
  115 + #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers
119 116 for(l = 0; l <= LOCAL_NL; l++)
120 117 hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
121 118  
122   - for(l = LOCAL_NL+1; l <= Nl; l++)
  119 + for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory
123 120 shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
124 121  
125   - for(size_t w = 0; w < nW; w++){
  122 + for(size_t w = 0; w < nW; w++){ //for each plane wave
126 123 cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere
127   - Pl_2 = 1;
  124 + Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
128 125 Pl_1 = cos_phi;
129   - Ei += W[w].E() * hlBl[0] * Pl_2;
  126 + Ei += W[w].E() * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation
130 127 Ei += W[w].E() * hlBl[1] * Pl_1;
131 128  
132   - #pragma unroll LOCAL_NL-1
  129 + #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
133 130 for(l = 2; l <= LOCAL_NL; l++){
134   - Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
135   - Ei += W[w].E() * hlBl[l] * Pl;
  131 + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
  132 + Ei += W[w].E() * hlBl[l] * Pl; //calculate and sum the current field order
136 133 Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
137 134 Pl_1 = Pl;
138 135 }
139 136  
140   - for(l = LOCAL_NL+1; l <= Nl; l++){
141   - Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
142   - Ei += W[w].E() * shared_hB[shared_start + (l - (LOCAL_NL+1))] * Pl;
143   - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
144   - Pl_1 = Pl;
145   -
  137 + for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers)
  138 + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs
  139 + Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl;
  140 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  141 + Pl_1 = Pl;
146 142 }
147 143 }
148 144 E[i] += Ei; //copy the result to device memory
... ... @@ -152,10 +148,10 @@ template&lt;typename T&gt;
152 148 void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
153 149  
154 150 size_t max_shared_mem = stim::sharedMemPerBlock();
155   - int hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
  151 + size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
156 152 std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
157 153 std::cout<<"shared memory: "<<max_shared_mem<<std::endl;
158   - int threads = (max_shared_mem / hBl_array) / 32 * 32;
  154 + int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);
159 155 std::cout<<"threads per block: "<<threads<<std::endl;
160 156 dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
161 157  
... ... @@ -164,7 +160,6 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, sti
164 160 else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate
165 161 std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
166 162 cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel
167   -
168 163 }
169 164  
170 165 template<typename T>
... ... @@ -261,16 +256,19 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
261 256 exit(1);
262 257 }
263 258  
  259 + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility
  260 + i_max--;
264 261 T r_min, r_max; //allocate space to store the minimum and maximum values
265 262 HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
266 263 HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
267 264  
  265 + r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere)
268 266  
269 267 //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r
270 268 size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1);
271 269  
272   - T kr_min = k * r_min;
273   - T kr_max = k * r_max;
  270 + //T kr_min = k * r_min;
  271 + //T kr_max = k * r_max;
274 272  
275 273 //temporary variables
276 274 double vm; //allocate space to store the return values for the bessel function calculation
... ... @@ -281,27 +279,29 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
281 279  
282 280 size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;
283 281 stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table
284   - T dkr = (kr_max - kr_min) / (N_hB_lut-1); //distance between values in the LUT
  282 + T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT
285 283 std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl;
286 284 stim::complex<T> hl;
287   - for(size_t kri = 0; kri < N_hB_lut; kri++){ //for each value in the LUT
288   - stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
  285 + for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT
  286 + stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
289 287 for(size_t l = 0; l <= Nl; l++){ //for each order
290 288 hl.r = (T)jv[l];
291 289 hl.i = (T)yv[l];
292 290  
293   - hB_lut[kri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result
  291 + hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result
  292 + //std::cout<<hB_lut[ri * (Nl + 1) + l]<<std::endl;
294 293 }
295 294 }
296   -
297   - //stim::cpu2image<T>(hankel_lut, "hankel.bmp", Nl+1, Nlut_j, stim::cmBrewer);
  295 + T* real_lut = (T*) malloc(hB_bytes/2);
  296 + stim::real(real_lut, hB_lut, N_hB_lut);
  297 + stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer);
298 298  
299 299 //Allocate device memory and copy everything to the GPU
300 300 stim::complex<T>* dev_hB_lut;
301 301 HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
302 302 HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
303 303  
304   - gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, kr_min, dkr, N_hB_lut, Nl);
  304 + gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl);
305 305  
306 306 cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
307 307  
... ... @@ -349,9 +349,90 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
349 349 }
350 350  
351 351 template<typename T>
352   -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
  352 +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){
353 353 std::vector< stim::scalarwave<T> > W(1, w);
354   - cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n);
  354 + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing);
  355 +}
  356 +
  357 +template<typename T>
  358 +__global__ void cuda_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, int Nl){
  359 + extern __shared__ stim::complex<T> shared_jA[]; //declare the list of waves in shared memory
  360 +
  361 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  362 + if(i >= N) return; //exit if this thread is outside the array
  363 + stim::vec3<T> p;
  364 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  365 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  366 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  367 +
  368 + T r = p.len(); //calculate the distance from the sphere
  369 + if(r > a) return; //exit if the point is inside the sphere (we only calculate the internal field)
  370 + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT
  371 + size_t ij = (size_t) fij; //convert to an integral index
  372 + T alpha = fij - ij; //calculate the fractional portion of the index
  373 + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT
  374 + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT
  375 +
  376 + T cos_phi;
  377 + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials
  378 +
  379 + stim::complex<T> jAl;
  380 + stim::complex<T> Ei = 0; //create a register to store the result
  381 + int l;
  382 +
  383 + stim::complex<T> jlAl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed
  384 + int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops
  385 +
  386 + #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers
  387 + for(l = 0; l <= LOCAL_NL; l++)
  388 + jlAl[l] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha );
  389 +
  390 + for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory
  391 + shared_jA[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( jA[n0j + l], jA[n1j + l], alpha );
  392 +
  393 + for(size_t w = 0; w < nW; w++){ //for each plane wave
  394 + if(r == 0) cos_phi = 0;
  395 + else
  396 + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere
  397 + Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
  398 + Pl_1 = cos_phi;
  399 + Ei += W[w].E() * jlAl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation
  400 + Ei += W[w].E() * jlAl[1] * Pl_1;
  401 +
  402 + #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
  403 + for(l = 2; l <= LOCAL_NL; l++){
  404 + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
  405 + Ei += W[w].E() * jlAl[l] * Pl; //calculate and sum the current field order
  406 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  407 + Pl_1 = Pl;
  408 + }
  409 +
  410 + for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers)
  411 + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs
  412 + Ei += W[w].E() * shared_jA[shared_start + l - LOCAL_NL - 1] * Pl;
  413 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  414 + Pl_1 = Pl;
  415 + }
  416 + }
  417 + E[i] = Ei; //copy the result to device memory
  418 +}
  419 +
  420 +template<typename T>
  421 +void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* jA, T r_min, T dr, size_t N_jA, size_t Nl){
  422 +
  423 + size_t max_shared_mem = stim::sharedMemPerBlock();
  424 + size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
  425 + std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
  426 + std::cout<<"shared memory: "<<max_shared_mem<<std::endl;
  427 + int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);
  428 + std::cout<<"threads per block: "<<threads<<std::endl;
  429 + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  430 +
  431 + size_t shared_mem;
  432 + if(Nl <= LOCAL_NL) shared_mem = 0;
  433 + else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate
  434 + std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
  435 + cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel
355 436 }
356 437  
357 438 /// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere
... ... @@ -365,18 +446,122 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, sti
365 446 /// @param a is the radius of the sphere
366 447 /// @param n is the complex refractive index of the sphere
367 448 template<typename T>
368   -void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n){
369   -
370   - //calculate the necessary number of orders required to represent the scattered field
  449 +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, T r_spacing = 0.1){
  450 +//calculate the necessary number of orders required to represent the scattered field
371 451 T k = W[0].kmag();
372 452  
373   - size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
  453 + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
  454 + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization)
374 455 std::cout<<"Nl: "<<Nl<<std::endl;
375 456  
376 457 //calculate the scattering coefficients for the sphere
377 458 stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
378 459 A_coefficients(A, a, k, n, Nl);
379 460  
  461 +#ifdef CUDA_FOUND
  462 + stim::complex<T>* dev_E; //allocate space for the field
  463 + cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
  464 + cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
  465 + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
  466 +
  467 + // COORDINATES
  468 + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified)
  469 + if(x != NULL){
  470 + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
  471 + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
  472 + }
  473 + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified)
  474 + if(y != NULL){
  475 + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
  476 + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
  477 + }
  478 + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified)
  479 + if(z != NULL){
  480 + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
  481 + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
  482 + }
  483 +
  484 + // PLANE WAVES
  485 + stim::scalarwave<T>* dev_W; //allocate space and copy plane waves
  486 + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
  487 + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
  488 +
  489 + // BESSEL FUNCTION LOOK-UP TABLE
  490 + //calculate the distance from the sphere center
  491 + T* dev_r;
  492 + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
  493 +
  494 + int threads = stim::maxThreadsPerBlock();
  495 + dim3 blocks((unsigned)(N / threads + 1));
  496 + cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
  497 +
  498 + //Find the minimum and maximum values of r
  499 + cublasStatus_t stat;
  500 + cublasHandle_t handle;
  501 +
  502 + stat = cublasCreate(&handle); //create a cuBLAS handle
  503 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  504 + printf ("CUBLAS initialization failed\n");
  505 + exit(1);
  506 + }
  507 +
  508 + int i_min, i_max;
  509 + stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
  510 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  511 + printf ("CUBLAS Error: failed to calculate minimum r value.\n");
  512 + exit(1);
  513 + }
  514 + stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
  515 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  516 + printf ("CUBLAS Error: failed to calculate maximum r value.\n");
  517 + exit(1);
  518 + }
  519 +
  520 + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility
  521 + i_max--;
  522 + T r_min, r_max; //allocate space to store the minimum and maximum values
  523 + HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
  524 + HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
  525 +
  526 + r_max = min(r_max, a); //the internal field doesn't exist outside of the sphere
  527 +
  528 + size_t N_jA_lut = (size_t)((r_max - r_min) / r_spacing + 1);
  529 +
  530 + //temporary variables
  531 + double vm; //allocate space to store the return values for the bessel function calculation
  532 + stim::complex<double>* jv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  533 + stim::complex<double>* yv = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  534 + stim::complex<double>* djv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  535 + stim::complex<double>* dyv= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  536 +
  537 + size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut;
  538 + stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes); //pointer to the look-up table
  539 + T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT
  540 + std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl;
  541 + stim::complex<T> hl;
  542 + stim::complex<double> nd = (stim::complex<double>)n;
  543 + for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT
  544 + stim::cbessjyva_sph<double>(Nl, nd * k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
  545 + for(size_t l = 0; l <= Nl; l++){ //for each order
  546 + jA_lut[ri * (Nl + 1) + l] = (stim::complex<T>)(jv[l] * (stim::complex<double>)A[l]); //store the bessel function result
  547 + }
  548 + }
  549 +
  550 + //Allocate device memory and copy everything to the GPU
  551 + stim::complex<T>* dev_jA_lut;
  552 + HANDLE_ERROR( cudaMalloc(&dev_jA_lut, jA_bytes) );
  553 + HANDLE_ERROR( cudaMemcpy(dev_jA_lut, jA_lut, jA_bytes, cudaMemcpyHostToDevice) );
  554 +
  555 + gpu_scalar_mie_internal<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_jA_lut, r_min, dr, N_jA_lut, Nl);
  556 +
  557 + cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
  558 +
  559 + if(x != NULL) cudaFree(dev_x); //free everything
  560 + if(y != NULL) cudaFree(dev_y);
  561 + if(z != NULL) cudaFree(dev_z);
  562 + cudaFree(dev_E);
  563 +#else
  564 +
380 565 //allocate space to store the bessel function call results
381 566 double vm;
382 567 stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
... ... @@ -414,12 +599,13 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
414 599 }
415 600 }
416 601 }
  602 +#endif
417 603 }
418 604  
419 605 template<typename T>
420   -void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
  606 +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){
421 607 std::vector< stim::scalarwave<T> > W(1, w);
422   - cpu_scalar_mie_internal(E, N, x, y, z, W, a, n);
  608 + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, r_spacing);
423 609 }
424 610  
425 611 }
... ...
stim/optics/scalarbeam.h
1 1 #ifndef RTS_BEAM
2 2 #define RTS_BEAM
  3 +#include <boost/math/special_functions/bessel.hpp>
3 4  
4 5 #include "../math/vec3.h"
5 6 #include "../optics/scalarwave.h"
... ... @@ -7,150 +8,68 @@
7 8 #include "../math/legendre.h"
8 9 #include "../cuda/cudatools/devices.h"
9 10 #include "../cuda/cudatools/timer.h"
  11 +#include "../optics/scalarfield.h"
10 12 #include <cublas_v2.h>
11 13 #include <math_constants.h>
12 14 #include <vector>
13 15 #include <stdlib.h>
14 16  
15   -namespace stim{
16 17  
17   - /// Function returns the value of the scalar field produced by a beam with the specified parameters
18   -
19   - template<typename T>
20   - std::vector< stim::vec3<T> > generate_focusing_vectors(size_t N, stim::vec3<T> d, T NA, T NA_in = 0){
21   -
22   - std::vector< stim::vec3<T> > dirs(N); //allocate an array to store the focusing vectors
23   -
24   - ///compute the rotation operator to transform (0, 0, 1) to k
25   - T cos_angle = d.dot(vec3<T>(0, 0, 1));
26   - stim::matrix<T, 3> rotation;
27   -
28   - //if the cosine of the angle is -1, the rotation is just a flip across the z axis
29   - if(cos_angle == -1){
30   - rotation(2, 2) = -1;
31   - }
32   - else if(cos_angle != 1.0)
33   - {
34   - vec3<T> r_axis = vec3<T>(0, 0, 1).cross(d).norm(); //compute the axis of rotation
35   - T angle = acos(cos_angle); //compute the angle of rotation
36   - quaternion<T> quat; //create a quaternion describing the rotation
37   - quat.CreateRotation(angle, r_axis);
38   - rotation = quat.toMatrix3(); //compute the rotation matrix
39   - }
40   -
41   - //find the phi values associated with the cassegrain ring
42   - T PHI[2];
43   - PHI[0] = (T)asin(NA);
44   - PHI[1] = (T)asin(NA_in);
45   -
46   - //calculate the z-axis cylinder coordinates associated with these angles
47   - T Z[2];
48   - Z[0] = cos(PHI[0]);
49   - Z[1] = cos(PHI[1]);
50   - T range = Z[0] - Z[1];
51   -
52   - //draw a distribution of random phi, z values
53   - T z, phi, theta;
54   - //T kmag = stim::TAU / lambda;
55   - for(int i=0; i<N; i++){ //for each sample
56   - z = (T)((double)rand() / (double)RAND_MAX) * range + Z[1]; //find a random position on the surface of a cylinder
57   - theta = (T)(((double)rand() / (double)RAND_MAX) * stim::TAU);
58   - phi = acos(z); //project onto the sphere, computing phi in spherical coordinates
59   -
60   - //compute and store cartesian coordinates
61   - vec3<T> spherical(1, theta, phi); //convert from spherical to cartesian coordinates
62   - vec3<T> cart = spherical.sph2cart();
63   - dirs[i] = rotation * cart; //create a sample vector
64   - }
65   - return dirs;
66   - }
67   -
68   -/// Class stim::beam represents a beam of light focused at a point and composed of several plane waves
69   -template<typename T>
70   -class scalarbeam
71   -{
72   -public:
73   - //enum beam_type {Uniform, Bartlett, Hamming, Hanning};
74   -
75   -private:
76   -
77   - T NA[2]; //numerical aperature of the focusing optics
78   - vec3<T> f; //focal point
79   - vec3<T> d; //propagation direction
80   - stim::complex<T> A; //beam amplitude
81   - T lambda; //beam wavelength
82   -public:
83 18  
84   - ///constructor: build a default beam (NA=1.0)
85   - scalarbeam(T wavelength = 1, stim::complex<T> amplitude = 1, vec3<T> focal_point = vec3<T>(0, 0, 0), vec3<T> direction = vec3<T>(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){
86   - lambda = wavelength;
87   - A = amplitude;
88   - f = focal_point;
89   - d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on)
90   - NA[0] = numerical_aperture;
91   - NA[1] = center_obsc;
92   - }
  19 +namespace stim{
93 20  
94   - ///Numerical Aperature functions
95   - void setNA(T na)
96   - {
97   - NA[0] = (T)0;
98   - NA[1] = na;
99   - }
100   - void setNA(T na0, T na1)
101   - {
102   - NA[0] = na0;
103   - NA[1] = na1;
104   - }
  21 +/// Function returns the value of the scalar field produced by a beam with the specified parameters
105 22  
106   - //Monte-Carlo decomposition into plane waves
107   - std::vector< scalarwave<T> > mc(size_t N = 100000) const{
  23 +template<typename T>
  24 +std::vector< stim::vec3<T> > generate_focusing_vectors(size_t N, stim::vec3<T> d, T NA, T NA_in = 0){
108 25  
109   - std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus
110   - std::vector< scalarwave<T> > samples(N); //create a vector of plane waves
111   - T kmag = (T)stim::TAU / lambda; //calculate the wavenumber
112   - stim::complex<T> apw; //allocate space for the amplitude at the focal point
113   - T a = (T)(stim::TAU * (1 - cos(asin(NA[0]))) / (double)N);
114   - stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction
115   - for(size_t i=0; i<N; i++){ //for each sample
116   - kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave
117   - apw = a * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave
118   - samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction
119   - }
  26 + std::vector< stim::vec3<T> > dirs(N); //allocate an array to store the focusing vectors
120 27  
121   - return samples;
122   - }
  28 + ///compute the rotation operator to transform (0, 0, 1) to k
  29 + T cos_angle = d.dot(vec3<T>(0, 0, 1));
  30 + stim::matrix<T, 3> rotation;
123 31  
124   - /// Calculate the field at a given point
125   - /// @param x is the x-coordinate of the field point
126   - /// @O is the approximation accuracy
127   - stim::complex<T> field(T x, T y, T z, size_t O){
128   - std::vector< scalarwave<T> > W = mc(O);
129   - T result = 0; //initialize the result to zero (0)
130   - for(size_t i = 0; i < O; i++){ //for each plane wave
131   - result += W[i].pos(x, y, z);
132   - }
133   - return result;
  32 + //if the cosine of the angle is -1, the rotation is just a flip across the z axis
  33 + if(cos_angle == -1){
  34 + rotation(2, 2) = -1;
134 35 }
135   -
136   - std::string str()
  36 + else if(cos_angle != 1.0)
137 37 {
138   - std::stringstream ss;
139   - ss<<"Beam:"<<std::endl;
140   - //ss<<" Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl;
141   - ss<<" Beam Direction: "<<d<<std::endl;
142   - if(NA[0] == 0)
143   - ss<<" NA: "<<NA[1];
144   - else
145   - ss<<" NA: "<<NA[0]<<" -- "<<NA[1];
146   -
147   - return ss.str();
  38 + vec3<T> r_axis = vec3<T>(0, 0, 1).cross(d).norm(); //compute the axis of rotation
  39 + T angle = acos(cos_angle); //compute the angle of rotation
  40 + quaternion<T> quat; //create a quaternion describing the rotation
  41 + quat.CreateRotation(angle, r_axis);
  42 + rotation = quat.toMatrix3(); //compute the rotation matrix
148 43 }
149 44  
  45 + //find the phi values associated with the cassegrain ring
  46 + T PHI[2];
  47 + PHI[0] = (T)asin(NA);
  48 + PHI[1] = (T)asin(NA_in);
  49 +
  50 + //calculate the z-axis cylinder coordinates associated with these angles
  51 + T Z[2];
  52 + Z[0] = cos(PHI[0]);
  53 + Z[1] = cos(PHI[1]);
  54 + T range = Z[0] - Z[1];
  55 +
  56 + //draw a distribution of random phi, z values
  57 + T z, phi, theta;
  58 + //T kmag = stim::TAU / lambda;
  59 + for(int i=0; i<N; i++){ //for each sample
  60 + z = (T)((double)rand() / (double)RAND_MAX) * range + Z[1]; //find a random position on the surface of a cylinder
  61 + theta = (T)(((double)rand() / (double)RAND_MAX) * stim::TAU);
  62 + phi = acos(z); //project onto the sphere, computing phi in spherical coordinates
  63 +
  64 + //compute and store cartesian coordinates
  65 + vec3<T> spherical(1, theta, phi); //convert from spherical to cartesian coordinates
  66 + vec3<T> cart = spherical.sph2cart();
  67 + dirs[i] = rotation * cart; //create a sample vector
  68 + }
  69 + return dirs;
  70 +}
150 71  
151   -
152   -}; //end beam
153   -
  72 +
154 73 /// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional)
155 74 /// @param C is a pointer to Nl + 1 values where the terms will be stored
156 75 template<typename T>
... ... @@ -210,20 +129,19 @@ CUDA_CALLABLE T lerp(T v0, T v1, T t) {
210 129 return fma(t, v1, fma(-t, v0, v0));
211 130 }
212 131  
213   -#ifdef __CUDACC__
  132 +#ifdef CUDA_FOUND
214 133 template<typename T>
215   -__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T k, T A, size_t Nl,
  134 +__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T A, size_t Nl,
216 135 T* C,
217   - T* lut_j, size_t Nj, T min_kr, T dkr){
  136 + T* lut_j, size_t Nj, T min_r, T dr){
218 137 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
219 138 if(i >= N) return; //exit if this thread is outside the array
220 139  
221 140 T cos_phi = cos(phi[i]); //calculate the thread value for cos(phi)
222   - T kr = r[i] * k; //calculate the thread value for k*r
223 141 stim::complex<T> Ei = 0; //initialize the value of the field to zero
224 142 size_t NC = Nl + 1; //calculate the number of coefficients to be used
225 143  
226   - T fij = (kr - min_kr)/dkr; //FP index into the spherical bessel LUT
  144 + T fij = (r[i] - min_r)/dr; //FP index into the spherical bessel LUT
227 145 size_t ij = (size_t) fij; //convert to an integral index
228 146 T a = fij - ij; //calculate the fractional portion of the index
229 147 size_t n0j = ij * (NC); //start of the first entry in the LUT
... ... @@ -276,6 +194,8 @@ void gpu_scalar_psf_local(stim::complex&lt;T&gt;* E, size_t N, T* r, T* phi, T lambda,
276 194 exit(1);
277 195 }
278 196  
  197 + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility
  198 + i_max--;
279 199 T r_min, r_max; //allocate space to store the minimum and maximum values
280 200 HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
281 201 HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
... ... @@ -287,29 +207,19 @@ void gpu_scalar_psf_local(stim::complex&lt;T&gt;* E, size_t N, T* r, T* phi, T lambda,
287 207  
288 208 size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r
289 209  
290   - T kr_min = k * r_min;
291   - T kr_max = k * r_max;
292   -
293   - //temporary variables
294   - double vm; //allocate space to store the return values for the bessel function calculation
295   - double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
296   - double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
297   - double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
298   - double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
299 210  
300 211 size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
301   - T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table
302   - T delta_kr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT
303   - std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl;
304   - for(size_t kri = 0; kri < Nlut_j; kri++){ //for each value in the LUT
305   - stim::bessjyv_sph<double>(Nl, kr_min + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
  212 + T* j_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table
  213 + T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT
  214 + T jl;
  215 + for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT
306 216 for(size_t l = 0; l <= Nl; l++){ //for each order
307   - bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result
  217 + jl = boost::math::sph_bessel<T>(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function
  218 + j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result
308 219 }
309 220 }
310 221  
311   - stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
312   -
  222 + stim::cpu2image<T>(j_lut, "j_lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
313 223 //Allocate device memory and copy everything to the GPU
314 224  
315 225 T* gpu_C;
... ... @@ -317,12 +227,12 @@ void gpu_scalar_psf_local(stim::complex&lt;T&gt;* E, size_t N, T* r, T* phi, T lambda,
317 227 HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) );
318 228 T* gpu_j_lut;
319 229 HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) );
320   - HANDLE_ERROR( cudaMemcpy(gpu_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) );
  230 + HANDLE_ERROR( cudaMemcpy(gpu_j_lut, j_lut, lutj_bytes, cudaMemcpyHostToDevice) );
321 231  
322 232 int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
323 233 dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks
324 234  
325   - cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, (T)stim::TAU/lambda, A, Nl, gpu_C, gpu_j_lut, Nlut_j, kr_min, delta_kr);
  235 + cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, A, Nl, gpu_C, gpu_j_lut, Nlut_j, r_min, dr);
326 236  
327 237 //free the LUT and condenser tables
328 238 HANDLE_ERROR( cudaFree(gpu_C) );
... ... @@ -392,7 +302,7 @@ __global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::ve
392 302 phi[i] = ps[2]; //phi = [0 pi]
393 303 }
394 304  
395   -#ifdef __CUDACC__
  305 +#ifdef CUDA_FOUND
396 306 /// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates
397 307 template<typename T>
398 308 void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
... ... @@ -419,7 +329,7 @@ template&lt;typename T&gt;
419 329 void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
420 330  
421 331 // If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel
422   -#ifdef __CUDACC__
  332 +#ifdef CUDA_FOUND
423 333  
424 334 T* gpu_x = NULL;
425 335 if(x != NULL){
... ... @@ -470,13 +380,112 @@ void cpu_scalar_psf_cart(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, T lamb
470 380 phi[i] = ps[2]; //phi = [0 pi]
471 381 }
472 382  
473   - cpu_scalar_psf_local(F, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function
  383 + cpu_scalar_psf_local(E, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function
474 384  
475 385 free(r);
476 386 free(phi);
477 387 #endif
478 388 }
  389 +
  390 +/// Class stim::beam represents a beam of light focused at a point and composed of several plane waves
  391 +template<typename T>
  392 +class scalarbeam
  393 +{
  394 +public:
  395 + //enum beam_type {Uniform, Bartlett, Hamming, Hanning};
  396 +
  397 +private:
  398 +
  399 + T NA[2]; //numerical aperature of the focusing optics
  400 + vec3<T> f; //focal point
  401 + vec3<T> d; //propagation direction
  402 + T A; //beam amplitude
  403 + T lambda; //beam wavelength
  404 +public:
479 405  
  406 + ///constructor: build a default beam (NA=1.0)
  407 + scalarbeam(T wavelength = 1, T amplitude = 1, vec3<T> focal_point = vec3<T>(0, 0, 0), vec3<T> direction = vec3<T>(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){
  408 + lambda = wavelength;
  409 + A = amplitude;
  410 + f = focal_point;
  411 + d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on)
  412 + NA[0] = numerical_aperture;
  413 + NA[1] = center_obsc;
  414 + }
  415 +
  416 + ///Numerical Aperature functions
  417 + void setNA(T na)
  418 + {
  419 + NA[0] = (T)0;
  420 + NA[1] = na;
  421 + }
  422 + void setNA(T na0, T na1)
  423 + {
  424 + NA[0] = na0;
  425 + NA[1] = na1;
  426 + }
  427 +
  428 + //Monte-Carlo decomposition into plane waves
  429 + std::vector< scalarwave<T> > mc(size_t N = 100000) const{
  430 +
  431 + std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus
  432 + std::vector< scalarwave<T> > samples(N); //create a vector of plane waves
  433 + T kmag = (T)stim::TAU / lambda; //calculate the wavenumber
  434 + stim::complex<T> apw; //allocate space for the amplitude at the focal point
  435 + T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N)
  436 + stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction
  437 + for(size_t i=0; i<N; i++){ //for each sample
  438 + kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave
  439 + apw = a * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave
  440 + samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction
  441 + }
  442 + return samples;
  443 + }
  444 +
  445 + /// Evaluate the beam to a scalar field using Debye focusing
  446 + void eval(stim::scalarfield<T>& E, size_t order = 500){
  447 + size_t array_size = E.grid_bytes();
  448 + T* X = (T*) malloc( array_size ); //allocate space for the coordinate meshes
  449 + T* Y = (T*) malloc( array_size );
  450 + T* Z = (T*) malloc( array_size );
  451 +
  452 + E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes
  453 + cpu_scalar_psf_cart<T>(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing());
  454 +
  455 + free(X); //free the coordinate meshes
  456 + free(Y);
  457 + free(Z);
  458 + }
  459 +
  460 + /// Calculate the field at a given point
  461 + /// @param x is the x-coordinate of the field point
  462 + /// @O is the approximation accuracy
  463 + stim::complex<T> field(T x, T y, T z, size_t O){
  464 + std::vector< scalarwave<T> > W = mc(O);
  465 + T result = 0; //initialize the result to zero (0)
  466 + for(size_t i = 0; i < O; i++){ //for each plane wave
  467 + result += W[i].pos(x, y, z);
  468 + }
  469 + return result;
  470 + }
  471 +
  472 + std::string str()
  473 + {
  474 + std::stringstream ss;
  475 + ss<<"Beam:"<<std::endl;
  476 + //ss<<" Central Plane Wave: "<<beam::E0<<" e^i ( "<<beam::k<<" . r )"<<std::endl;
  477 + ss<<" Beam Direction: "<<d<<std::endl;
  478 + if(NA[0] == 0)
  479 + ss<<" NA: "<<NA[1];
  480 + else
  481 + ss<<" NA: "<<NA[0]<<" -- "<<NA[1];
  482 +
  483 + return ss.str();
  484 + }
  485 +
  486 +
  487 +
  488 +}; //end beam
480 489 } //end namespace stim
481 490  
482 491 #endif
... ...