Commit 2bafc56990f49d0822f03098b9bbd5bc14f22c69

Authored by David Mayerich
2 parents d31038b3 31262e83

Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib

stim/biomodels/network.h
... ... @@ -8,7 +8,7 @@
8 8 #include <algorithm>
9 9 #include <string.h>
10 10 #include <math.h>
11   -#include <stim/math/vector.h>
  11 +#include <stim/math/vec3.h>
12 12 #include <stim/visualization/obj.h>
13 13 #include <stim/visualization/cylinder.h>
14 14 #include <ANN/ANN.h>
... ... @@ -37,7 +37,7 @@ class network{
37 37 /// Constructor - creates an edge from a list of points by calling the stim::fiber constructor
38 38  
39 39 ///@param p is an array of positions in space
40   - edge(std::vector< stim::vec<T> > p) : cylinder<T>(p){}
  40 + edge(std::vector< stim::vec3<T> > p) : cylinder<T>(p){}
41 41  
42 42 /// Copy constructor creates an edge from a fiber
43 43 edge(stim::cylinder<T> f) : cylinder<T>(f) {}
... ... @@ -61,20 +61,20 @@ class network{
61 61 };
62 62  
63 63 ///Node class that stores the physical position of the node as well as the edges it is connected to (edges that connect to it), As well as any additional data necessary.
64   - class vertex : public stim::vec<T>
  64 + class vertex : public stim::vec3<T>
65 65 {
66 66 public:
67 67 //std::vector<unsigned int> edges; //indices of edges connected to this node.
68 68 std::vector<unsigned int> e[2]; //indices of edges going out (e[0]) and coming in (e[1])
69   - //stim::vec<T> p; //position of this node in physical space.
  69 + //stim::vec3<T> p; //position of this node in physical space.
70 70  
71 71 //constructor takes a stim::vec
72   - vertex(stim::vec<T> p) : stim::vec<T>(p){}
  72 + vertex(stim::vec3<T> p) : stim::vec3<T>(p){}
73 73  
74 74 /// Output the vertex information as a string
75 75 std::string str(){
76 76 std::stringstream ss;
77   - ss<<"\t(x, y, z) = "<<stim::vec<T>::str();
  77 + ss<<"\t(x, y, z) = "<<stim::vec3<T>::str();
78 78  
79 79 if(e[0].size() > 0){
80 80 ss<<"\t> ";
... ... @@ -129,7 +129,11 @@ public:
129 129 std::vector< stim::vec<T> > c; //allocate an array of points for the vessel centerline
130 130 O.getLine(l, c); //get the fiber centerline
131 131  
132   - edge new_edge = c; //create an edge from the given centerline
  132 + std::vector< stim::vec3<T> > c3(c.size());
  133 + for(size_t j = 0; j < c.size(); j++)
  134 + c3[j] = c[j];
  135 +
  136 + edge new_edge = c3; //create an edge from the given centerline
133 137 unsigned int I = new_edge.size(); //calculate the number of points on the centerline
134 138  
135 139 //get the first and last vertex IDs for the line
... ... @@ -222,7 +226,7 @@ public:
222 226 float gaussianFunction(float x, float std=25){ return exp(-x/(2*std*std));} // by default std = 25
223 227  
224 228 // stim 3d vector to annpoint of 3 dimensions
225   - void stim2ann(ANNpoint &a, stim::vec<T> b){
  229 + void stim2ann(ANNpoint &a, stim::vec3<T> b){
226 230 a[0] = b[0];
227 231 a[1] = b[1];
228 232 a[2] = b[2];
... ... @@ -278,10 +282,9 @@ public:
278 282 ANNdistArray dists = new ANNdist[1]; // near neighbor distances
279 283 ANNidxArray nnIdx = new ANNidx[1]; // near neighbor indices // allocate near neigh indices
280 284  
281   - stim::vec<T> p0, p1;
282   - float m0, m1;
  285 + stim::vec3<T> p0, p1;
  286 + float m1;
283 287 float M = 0; //stores the total metric value
284   - float l; //stores the segment length
285 288 float L = 0; //stores the total network length
286 289 ANNpoint queryPt = annAllocPt(3);
287 290 for(unsigned e = 0; e < R.E.size(); e++){ //for each edge in A
... ... @@ -292,7 +295,7 @@ public:
292 295 p1 = R.E[e][p]; //get the next point in the edge
293 296 stim2ann(queryPt, p1);
294 297 kdt->annkSearch( queryPt, 1, nnIdx, dists, eps); //find the distance between A and the current network
295   - m1 = 1.0f - gaussianFunction(dists[0], sigma); //calculate the metric value based on the distance
  298 + m1 = 1.0f - gaussianFunction((float)dists[0], sigma); //calculate the metric value based on the distance
296 299 R.E[e].set_mag(m1, p, 1); //set the error for the second point in the segment
297 300  
298 301 }
... ...
stim/cuda/cudatools/callable.h
... ... @@ -2,7 +2,7 @@
2 2  
3 3 //define the CUDA_CALLABLE macro (will prefix all members)
4 4 #ifdef __CUDACC__
5   -#define CUDA_CALLABLE __host__ __device__
  5 +#define CUDA_CALLABLE __host__ __device__ inline
6 6 #else
7 7 #define CUDA_CALLABLE
8 8 #endif
... ...
stim/envi/bil.h
... ... @@ -884,7 +884,7 @@ public:
884 884 /// using the following indexing: i = p*B + b
885 885 /// @param matrix is the destination for the pixel data
886 886 /// @param mask is the mask
887   - bool sift(T* matrix, unsigned char* mask = NULL){
  887 + bool sift(T* matrix, unsigned char* mask = NULL, bool PROGRESS = false){
888 888 size_t Lbytes = sizeof(T) * X();
889 889 T* line = (T*) malloc( Lbytes ); //allocate space for a line
890 890  
... ... @@ -903,6 +903,7 @@ public:
903 903 pl++; //increment the pixel pointer
904 904 }
905 905 }
  906 + if(PROGRESS) progress = (double)( (y+1)*Z() + 1) / (double)(Y() * Z()) * 100;
906 907 }
907 908 p += pl; //add the line increment to the running pixel index
908 909 }
... ...
stim/envi/bip.h
... ... @@ -817,7 +817,7 @@ public:
817 817 /// using the following indexing: i = p*B + b
818 818 /// @param matrix is the destination for the pixel data
819 819 /// @param mask is the mask
820   - bool sift(T* matrix, unsigned char* mask = NULL){
  820 + bool sift(T* matrix, unsigned char* mask = NULL, bool PROGRESS = false){
821 821 size_t Bbytes = sizeof(T) * Z();
822 822 size_t XY = X() * Y();
823 823 T* band = (T*) malloc( Bbytes ); //allocate space for a line
... ... @@ -836,6 +836,7 @@ public:
836 836 }
837 837 else
838 838 file.seekg(Bbytes, std::ios::cur); //otherwise skip this band
  839 + if(PROGRESS) progress = (double)(xy+1) / (double)XY * 100;
839 840 }
840 841 return true;
841 842 }
... ...
stim/envi/bsq.h
... ... @@ -809,7 +809,7 @@ public:
809 809 /// using the following indexing: i = p*B + b
810 810 /// @param matrix is the destination for the pixel data
811 811 /// @param mask is the mask
812   - bool sift(T* matrix, unsigned char* mask = NULL){
  812 + bool sift(T* matrix, unsigned char* mask = NULL, bool PROGRESS = false){
813 813 unsigned long long XY = X() * Y(); //Number of XY pixels
814 814 unsigned long long L = XY * sizeof(T); //size of XY plane (in bytes)
815 815  
... ... @@ -827,9 +827,8 @@ public:
827 827 if(mask == NULL || mask[xy] != 0){ //if the pixel is valid
828 828 matrix[i*Z() + b] = band_image[xy]; //copy it to the appropriate point in the values[] array
829 829 i++;
830   - //std::cout<<i<<std::endl;
831 830 }
832   -
  831 + if(PROGRESS) progress = (double)(xy+1) / (double)XY * 100;
833 832 }
834 833 }
835 834  
... ...
stim/envi/envi.h
... ... @@ -670,13 +670,13 @@ public:
670 670 /// using the following indexing: i = b*P + p
671 671 /// @param matrix is the destination for the pixel data
672 672 /// @param p is the mask
673   - bool sift(void* matrix, unsigned char* p = NULL){
  673 + bool sift(void* matrix, unsigned char* p = NULL, bool PROGRESS = false){
674 674  
675 675 if (header.interleave == envi_header::BSQ){ //if the infile is bsq file
676 676 if (header.data_type == envi_header::float32)
677   - return ((bsq<float>*)file)->sift((float*)matrix, p);
  677 + return ((bsq<float>*)file)->sift((float*)matrix, p, PROGRESS);
678 678 else if (header.data_type == envi_header::float64)
679   - return ((bsq<double>*)file)->sift((double*)matrix, p);
  679 + return ((bsq<double>*)file)->sift((double*)matrix, p, PROGRESS);
680 680 else{
681 681 std::cout << "ERROR: unidentified data type" << std::endl;
682 682 exit(1);
... ... @@ -685,9 +685,9 @@ public:
685 685  
686 686 if (header.interleave == envi_header::BIP){
687 687 if (header.data_type == envi_header::float32)
688   - return ((bip<float>*)file)->sift((float*)matrix, p);
  688 + return ((bip<float>*)file)->sift((float*)matrix, p, PROGRESS);
689 689 else if (header.data_type == envi_header::float64)
690   - return ((bip<double>*)file)->sift((double*)matrix, p);
  690 + return ((bip<double>*)file)->sift((double*)matrix, p, PROGRESS);
691 691 else{
692 692 std::cout << "ERROR: unidentified data type" << std::endl;
693 693 exit(1);
... ... @@ -695,9 +695,9 @@ public:
695 695 }
696 696 if (header.interleave == envi_header::BIL){
697 697 if (header.data_type == envi_header::float32)
698   - return ((bil<float>*)file)->sift((float*)matrix, p);
  698 + return ((bil<float>*)file)->sift((float*)matrix, p, PROGRESS);
699 699 else if (header.data_type == envi_header::float64)
700   - return ((bil<double>*)file)->sift((double*)matrix, p);
  700 + return ((bil<double>*)file)->sift((double*)matrix, p, PROGRESS);
701 701 else{
702 702 std::cout << "ERROR: unidentified data type" << std::endl;
703 703 exit(1);
... ...
stim/image/image.h
... ... @@ -25,8 +25,6 @@ class image{
25 25 size_t Y() const { return R[2]; }
26 26 size_t C() const { return R[0]; }
27 27  
28   - size_t bytes(){ return size() * sizeof(T); }
29   -
30 28 void init(){ //initializes all variables, assumes no memory is allocated
31 29 memset(R, 0, sizeof(size_t) * 3); //set the resolution and number of channels to zero
32 30 img = NULL;
... ... @@ -34,7 +32,6 @@ class image{
34 32  
35 33 void unalloc(){ //frees any resources associated with the image
36 34 if(img) free(img); //if memory has been allocated, free it
37   - img=NULL;
38 35 }
39 36  
40 37  
... ... @@ -45,16 +42,15 @@ class image{
45 42  
46 43 void allocate(){
47 44 unalloc();
48   - img = (T*) malloc( bytes() ); //allocate memory
49   - memset(img, 0, bytes());
  45 + img = (T*) malloc( sizeof(T) * R[0] * R[1] * R[2] ); //allocate memory
50 46 }
51 47  
52 48 void allocate(size_t x, size_t y, size_t c){ //allocate memory based on the resolution
53   - unalloc();
54 49 R[0] = c; R[1] = x; R[2] = y; //set the resolution
55 50 allocate(); //allocate memory
56 51 }
57 52  
  53 + size_t bytes(){ return size() * sizeof(T); }
58 54  
59 55 size_t idx(size_t x, size_t y, size_t c = 0){
60 56 return y * C() * X() + x * C() + c;
... ... @@ -106,15 +102,14 @@ class image{
106 102  
107 103 std::cout<<"ERROR in stim::image::white - no white value known for this data type"<<std::endl;
108 104 exit(1);
  105 +
109 106 }
110 107  
111 108  
112 109 public:
113 110  
114 111 /// Default constructor - creates an empty image object
115   - image(){
116   - init(); //initialize all variables to zero, don't allocate any memory
117   - }
  112 + image(){ init(); } //initialize all variables to zero, don't allocate any memory
118 113  
119 114 /// Constructor with a filename - loads the specified file
120 115 image(std::string filename){ //constructor initialize the image with an image file
... ... @@ -136,7 +131,7 @@ public:
136 131 }
137 132  
138 133 /// Copy constructor - duplicates an image object
139   - image(const stim::image<T> &I){
  134 + image(const stim::image<T>& I){
140 135 init();
141 136 allocate(I.X(), I.Y(), I.C());
142 137 memcpy(img, I.img, bytes());
... ... @@ -148,6 +143,7 @@ public:
148 143 }
149 144  
150 145 stim::image<T>& operator=(const stim::image<T>& I){
  146 + init();
151 147 if(&I == this) //handle self-assignment
152 148 return *this;
153 149 allocate(I.X(), I.Y(), I.C());
... ... @@ -160,22 +156,15 @@ public:
160 156  
161 157 cv::Mat cvImage = cv::imread(filename, CV_LOAD_IMAGE_UNCHANGED); //use OpenCV to open the image file
162 158 if(!cvImage.data){
163   - std::cout<<"ERROR stim::image::load() - unable to find image "<<filename<<" ["<<__FILE__<<" (line "<<__LINE__<<")]"<<std::endl;
  159 + std::cout<<"ERROR stim::image::load() - unable to find image "<<filename<<std::endl;
164 160 exit(1);
165 161 }
166 162 allocate(cvImage.cols, cvImage.rows, cvImage.channels()); //allocate space for the image
167   - T* cv_ptr = (T*) cvImage.data;
168   - if(C() == 1)
169   - {
170   - //if this is a single-color image, just copy the data
171   - memcpy(img, cv_ptr, bytes());
172   - }
173   - if(C() == 3)
174   - { //if this is a 3-color image, OpenCV uses BGR interleaving
  163 + T* cv_ptr = (T*)cvImage.data;
  164 + if(C() == 1) //if this is a single-color image, just copy the data
  165 + memcpy(img, cv_ptr, bytes());
  166 + if(C() == 3) //if this is a 3-color image, OpenCV uses BGR interleaving
175 167 set_interleaved_bgr(cv_ptr, X(), Y());
176   - }
177   -
178   - cvImage.release();
179 168 }
180 169  
181 170 //save a file
... ... @@ -189,18 +178,16 @@ public:
189 178 get_interleaved_bgr(buffer);
190 179 cv::Mat cvImage((int)Y(), (int)X(), cv_type(), buffer);
191 180 cv::imwrite(filename, cvImage);
192   - cvImage.release();
193   - free(buffer);
194 181 }
195 182  
196 183 //create an image from an interleaved buffer
197   - void set_interleaved_rgb(T* buffer, size_t width, size_t height, size_t channels = 3){
198   - allocate(width, height, channels);
  184 + void set_interleaved_rgb(T* buffer, size_t width, size_t height){
  185 + allocate(width, height, 3);
199 186 memcpy(img, buffer, bytes());
200 187 }
201 188  
202   - void set_interleaved_bgr(T* buffer, size_t width, size_t height, size_t channels = 3){
203   - allocate(width, height, channels);
  189 + void set_interleaved_bgr(T* buffer, size_t width, size_t height){
  190 + allocate(width, height, 3);
204 191 for(size_t c = 0; c < C(); c++){ //copy directly
205 192 for(size_t y = 0; y < Y(); y++){
206 193 for(size_t x = 0; x < X(); x++){
... ... @@ -380,34 +367,6 @@ public:
380 367  
381 368 return r; //return the inverted image
382 369 }
383   -
384   - /// Invert an image by calculating I1 = alpha - I0, where alpha is the maximum image value
385   - image<T> invert(){
386   - size_t N = size(); //calculate the total number of values in the image
387   - image<T> r(X(), Y(), C()); //allocate space for the resulting image
388   - T white_val = maxv();
389   - for(size_t n = 0; n < N; n++)
390   - r.img[n] = white_val - img[n]; //perform the inversion
391   -
392   - return r; //return the inverted image
393   - }
394   -
395   - ///crops the image from x1 to x0 and y1 to y0 and returns a new (smaller) image.
396   - image<T> crop(int x0, int x1, int y0, int y1)
397   - {
398   -
399   - image<T> ret(x1-x0, y1-y0, C());
400   - int newWidth = x1-x0;
401   - int destidx, srcidx;
402   - ///for each row, cut what amount of data from the original and put it into the new copy.
403   - for(int i = 0; i < (y1-y0); i++)
404   - {
405   - destidx = i*newWidth*C(); ///destination index one per each row
406   - srcidx = ((i+(y0))*X()+x0)*C(); ///source index, one per each row.
407   - memcpy(&ret.img[destidx], &img[srcidx], sizeof(T)*newWidth*C());
408   - }
409   - return ret;
410   - }
411 370  
412 371 image<T> srgb2lab(){
413 372 std::cout<<"ERROR stim::image::srgb2lab - function has been broken, re-implement."<<std::endl;
... ... @@ -426,7 +385,6 @@ public:
426 385 exit(1);
427 386 }
428 387  
429   -
430 388 // leila's code for non_interleaving data in 3D
431 389 //create an data set from an interleaved buffer
432 390 void set_interleaved3(T* buffer, size_t width, size_t height, size_t depth, size_t channels = 3){
... ...
stim/math/bessel.h
... ... @@ -1258,7 +1258,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1258 1258 P a0,v0,pv0,pv1,vl,ga,gb,vg,vv,w0,w1,ya0,yak,ya1,wa;
1259 1259 int j,n,k,kz,l,lb,lb0,m;
1260 1260  
1261   - a0 = abs(z);
  1261 + a0 = ::abs(z);
1262 1262 z1 = z;
1263 1263 z2 = z*z;
1264 1264 n = (int)v;
... ... @@ -1286,7 +1286,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1286 1286 vm = v;
1287 1287 return 0;
1288 1288 }
1289   - if (real(z1) < 0.0) z1 = -z;
  1289 + if (::real(z1) < 0.0) z1 = -z;
1290 1290 if (a0 <= 12.0) {
1291 1291 for (l=0;l<2;l++) {
1292 1292 vl = v0+l;
... ... @@ -1295,7 +1295,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1295 1295 for (k=1;k<=40;k++) {
1296 1296 cr *= -0.25*z2/(k*(k+vl));
1297 1297 cjvl += cr;
1298   - if (abs(cr) < abs(cjvl)*eps) break;
  1298 + if (::abs(cr) < ::abs(cjvl)*eps) break;
1299 1299 }
1300 1300 vg = 1.0 + vl;
1301 1301 ga = gamma(vg);
... ... @@ -1348,7 +1348,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1348 1348 for (k=1;k<=40;k++) {
1349 1349 cr *= -0.25*z2/(k*(k-vl));
1350 1350 cjvl += cr;
1351   - if (abs(cr) < abs(cjvl)*eps) break;
  1351 + if (::abs(cr) < ::abs(cjvl)*eps) break;
1352 1352 }
1353 1353 vg = 1.0-vl;
1354 1354 gb = gamma(vg);
... ... @@ -1381,16 +1381,16 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1381 1381 cyv1 = M_2_PI*(cec*cjv1-1.0/z1-0.25*z1*cs1);
1382 1382 }
1383 1383 }
1384   - if (real(z) < 0.0) {
  1384 + if (::real(z) < 0.0) {
1385 1385 cfac0 = exp(pv0*cii);
1386 1386 cfac1 = exp(pv1*cii);
1387   - if (imag(z) < 0.0) {
  1387 + if (::imag(z) < 0.0) {
1388 1388 cyv0 = cfac0*cyv0-(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
1389 1389 cyv1 = cfac1*cyv1-(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
1390 1390 cjv0 /= cfac0;
1391 1391 cjv1 /= cfac1;
1392 1392 }
1393   - else if (imag(z) > 0.0) {
  1393 + else if (::imag(z) > 0.0) {
1394 1394 cyv0 = cyv0/cfac0+(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
1395 1395 cyv1 = cyv1/cfac1+(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
1396 1396 cjv0 *= cfac0;
... ... @@ -1421,7 +1421,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1421 1421 cf2 = cf1;
1422 1422 cf1 = cf;
1423 1423 }
1424   - if (abs(cjv0) > abs(cjv1)) cs = cjv0/cf;
  1424 + if (::abs(cjv0) > ::abs(cjv1)) cs = cjv0/cf;
1425 1425 else cs = cjv1/cf2;
1426 1426 for (k=0;k<=n;k++) {
1427 1427 cjv[k] *= cs;
... ... @@ -1433,21 +1433,21 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1433 1433 }
1434 1434 cyv[0] = cyv0;
1435 1435 cyv[1] = cyv1;
1436   - ya0 = abs(cyv0);
  1436 + ya0 = ::abs(cyv0);
1437 1437 lb = 0;
1438 1438 cg0 = cyv0;
1439 1439 cg1 = cyv1;
1440 1440 for (k=2;k<=n;k++) {
1441 1441 cyk = 2.0*(v0+k-1.0)*cg1/z-cg0;
1442   - yak = abs(cyk);
1443   - ya1 = abs(cg0);
  1442 + yak = ::abs(cyk);
  1443 + ya1 = ::abs(cg0);
1444 1444 if ((yak < ya0) && (yak< ya1)) lb = k;
1445 1445 cyv[k] = cyk;
1446 1446 cg0 = cg1;
1447 1447 cg1 = cyk;
1448 1448 }
1449 1449 lb0 = 0;
1450   - if ((lb > 4) && (imag(z) != 0.0)) {
  1450 + if ((lb > 4) && (::imag(z) != 0.0)) {
1451 1451 while(lb != lb0) {
1452 1452 ch2 = cone;
1453 1453 ch1 = czero;
... ... @@ -1470,7 +1470,7 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1470 1470 cp21 = ch2;
1471 1471 if (lb == n)
1472 1472 cjv[lb+1] = 2.0*(lb+v0)*cjv[lb]/z-cjv[lb-1];
1473   - if (abs(cjv[0]) > abs(cjv[1])) {
  1473 + if (::abs(cjv[0]) > ::abs(cjv[1])) {
1474 1474 cyv[lb+1] = (cjv[lb+1]*cyv0-2.0*cp11/(M_PI*z))/cjv[0];
1475 1475 cyv[lb] = (cjv[lb]*cyv0+2.0*cp12/(M_PI*z))/cjv[0];
1476 1476 }
... ... @@ -1495,8 +1495,8 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1495 1495 cyl2 = cylk;
1496 1496 }
1497 1497 for (k=2;k<=n;k++) {
1498   - wa = abs(cyv[k]);
1499   - if (wa < abs(cyv[k-1])) lb = k;
  1498 + wa = ::abs(cyv[k]);
  1499 + if (wa < ::abs(cyv[k-1])) lb = k;
1500 1500 }
1501 1501 }
1502 1502 }
... ... @@ -1515,12 +1515,18 @@ int cbessjyva_sph(int v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
1515 1515 //first, compute the bessel functions of fractional order
1516 1516 cbessjyva<P>(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
1517 1517  
  1518 + if(z == 0){ //handle degenerate case of z = 0
  1519 + memset(cjv, 0, sizeof(P) * (v+1));
  1520 + cjv[0] = 1;
  1521 + }
  1522 +
1518 1523 //iterate through each and scale
1519 1524 for(int n = 0; n<=v; n++)
1520 1525 {
1521   -
1522   - cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
1523   - cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
  1526 + if(z != 0){ //handle degenerate case of z = 0
  1527 + cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
  1528 + cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
  1529 + }
1524 1530  
1525 1531 cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0));
1526 1532 cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0));
... ...
stim/math/circle.h
... ... @@ -17,7 +17,7 @@ class circle : plane&lt;T&gt;
17 17  
18 18 private:
19 19  
20   - stim::vec<T> Y;
  20 + stim::vec3<T> Y;
21 21  
22 22 CUDA_CALLABLE void
23 23 init()
... ... @@ -48,7 +48,7 @@ public:
48 48 circle(T size, T z_pos = (T)0) : plane<T>()
49 49 {
50 50 init();
51   - center(stim::vec<T>(0,0,z_pos));
  51 + center(stim::vec3<T>(0,0,z_pos));
52 52 scale(size);
53 53 }
54 54  
... ... @@ -56,7 +56,7 @@ public:
56 56 ///@param c: x,y,z location of the center.
57 57 ///@param n: x,y,z direction of the normal.
58 58 CUDA_CALLABLE
59   - circle(vec<T> c, vec<T> n = vec<T>(0,0,1)) : plane<T>()
  59 + circle(vec3<T> c, vec3<T> n = vec3<T>(0,0,1)) : plane<T>()
60 60 {
61 61 center(c);
62 62 normal(n);
... ... @@ -68,7 +68,7 @@ public:
68 68 ///@param s: size of the rectangle.
69 69 ///@param n: x,y,z direction of the normal.
70 70 CUDA_CALLABLE
71   - circle(vec<T> c, T s, vec<T> n = vec<T>(0,0,1)) : plane<T>()
  71 + circle(vec3<T> c, T s, vec3<T> n = vec3<T>(0,0,1)) : plane<T>()
72 72 {
73 73 init();
74 74 center(c);
... ... @@ -82,7 +82,7 @@ public:
82 82 ///@param n: x,y,z direction of the normal.
83 83 ///@param u: x,y,z direction for the zero vector (from where the rotation starts)
84 84 CUDA_CALLABLE
85   - circle(vec<T> c, T s, vec<T> n = vec<T>(0,0,1), vec<T> u = vec<T>(1, 0, 0)) : plane<T>()
  85 + circle(vec3<T> c, T s, vec3<T> n = vec3<T>(0,0,1), vec3<T> u = vec3<T>(1, 0, 0)) : plane<T>()
86 86 {
87 87 init();
88 88 setU(u);
... ... @@ -103,16 +103,15 @@ public:
103 103 ///sets the normal for the cirlce
104 104 ///@param n: x,y,z direction of the normal.
105 105 CUDA_CALLABLE void
106   - normal(vec<T> n)
  106 + normal(vec3<T> n)
107 107 {
108 108 rotate(n, Y);
109 109 }
110 110  
111 111 ///sets the center of the circle.
112 112 ///@param n: x,y,z location of the center.
113   - CUDA_CALLABLE T
114   - center(vec<T> p)
115   - {
  113 + CUDA_CALLABLE void
  114 + center(vec3<T> p){
116 115 this->P = p;
117 116 }
118 117  
... ... @@ -127,17 +126,17 @@ public:
127 126 }
128 127  
129 128 ///get the world space value given the planar coordinates a, b in [0, 1]
130   - CUDA_CALLABLE stim::vec<T> p(T a, T b)
  129 + CUDA_CALLABLE stim::vec3<T> p(T a, T b)
131 130 {
132   - stim::vec<T> result;
  131 + stim::vec3<T> result;
133 132  
134   - vec<T> A = this->P - this->U * (T)0.5 - Y * (T)0.5;
  133 + vec3<T> A = this->P - this->U * (T)0.5 - Y * (T)0.5;
135 134 result = A + this->U * a + Y * b;
136 135 return result;
137 136 }
138 137  
139 138 ///parenthesis operator returns the world space given rectangular coordinates a and b in [0 1]
140   - CUDA_CALLABLE stim::vec<T> operator()(T a, T b)
  139 + CUDA_CALLABLE stim::vec3<T> operator()(T a, T b)
141 140 {
142 141 return p(a,b);
143 142 }
... ... @@ -145,11 +144,11 @@ public:
145 144 ///returns a vector with the points on the initialized circle.
146 145 ///connecting the points results in a circle.
147 146 ///@param n: integer for the number of points representing the circle.
148   - std::vector<stim::vec<T> >
  147 + std::vector<stim::vec3<T> >
149 148 getPoints(int n)
150 149 {
151   - std::vector<stim::vec<T> > result;
152   - stim::vec<T> point;
  150 + std::vector<stim::vec3<T> > result;
  151 + stim::vec3<T> point;
153 152 T x,y;
154 153 float step = 360.0/(float) n;
155 154 for(float j = 0; j <= 360.0; j += step)
... ... @@ -164,7 +163,7 @@ public:
164 163 ///returns a vector with the points on the initialized circle.
165 164 ///connecting the points results in a circle.
166 165 ///@param n: integer for the number of points representing the circle.
167   - stim::vec<T>
  166 + stim::vec3<T>
168 167 p(T theta)
169 168 {
170 169 T x,y;
... ...
stim/math/constants.h
1 1 #ifndef STIM_CONSTANTS_H
2 2 #define STIM_CONSTANTS_H
3 3  
  4 +#include "stim/cuda/cudatools/callable.h"
4 5 namespace stim{
5 6 const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862;
6 7 const double TAU = 2 * stim::PI;
... ...
stim/math/matrix.h
... ... @@ -5,6 +5,7 @@
5 5 #include <string.h>
6 6 #include <iostream>
7 7 #include <stim/math/vector.h>
  8 +#include <stim/math/vec3.h>
8 9 #include <stim/cuda/cudatools/callable.h>
9 10  
10 11 namespace stim{
... ...
stim/math/plane.h
... ... @@ -2,7 +2,7 @@
2 2 #define STIM_PLANE_H
3 3  
4 4 #include <iostream>
5   -#include <stim/math/vector.h>
  5 +#include <stim/math/vec3.h>
6 6 #include <stim/cuda/cudatools/callable.h>
7 7 #include <stim/math/quaternion.h>
8 8  
... ... @@ -22,17 +22,17 @@ template &lt;typename T&gt;
22 22 class plane
23 23 {
24 24 protected:
25   - stim::vec<T> P;
26   - stim::vec<T> N;
27   - stim::vec<T> U;
  25 + stim::vec3<T> P;
  26 + stim::vec3<T> N;
  27 + stim::vec3<T> U;
28 28  
29 29 ///Initializes the plane with standard coordinates.
30 30 ///
31 31 CUDA_CALLABLE void init()
32 32 {
33   - P = stim::vec<T>(0, 0, 0);
34   - N = stim::vec<T>(0, 0, 1);
35   - U = stim::vec<T>(1, 0, 0);
  33 + P = stim::vec3<T>(0, 0, 0);
  34 + N = stim::vec3<T>(0, 0, 1);
  35 + U = stim::vec3<T>(1, 0, 0);
36 36 }
37 37  
38 38 public:
... ... @@ -42,7 +42,7 @@ class plane
42 42 init();
43 43 }
44 44  
45   - CUDA_CALLABLE plane(vec<T> n, vec<T> p = vec<T>(0, 0, 0))
  45 + CUDA_CALLABLE plane(vec3<T> n, vec3<T> p = vec3<T>(0, 0, 0))
46 46 {
47 47 init();
48 48 P = p;
... ... @@ -56,11 +56,11 @@ class plane
56 56 }
57 57  
58 58 //create a plane from three points (a triangle)
59   - CUDA_CALLABLE plane(vec<T> a, vec<T> b, vec<T> c)
  59 + CUDA_CALLABLE plane(vec3<T> a, vec3<T> b, vec3<T> c)
60 60 {
61 61 init();
62 62 P = c;
63   - stim::vec<T> n = (c - a).cross(b - a);
  63 + stim::vec3<T> n = (c - a).cross(b - a);
64 64 try
65 65 {
66 66 if(n.len() != 0)
... ... @@ -84,17 +84,17 @@ class plane
84 84  
85 85 }
86 86  
87   - CUDA_CALLABLE vec<T> n()
  87 + CUDA_CALLABLE vec3<T> n()
88 88 {
89 89 return N;
90 90 }
91 91  
92   - CUDA_CALLABLE vec<T> p()
  92 + CUDA_CALLABLE vec3<T> p()
93 93 {
94 94 return P;
95 95 }
96 96  
97   - CUDA_CALLABLE vec<T> u()
  97 + CUDA_CALLABLE vec3<T> u()
98 98 {
99 99 return U;
100 100 }
... ... @@ -107,7 +107,7 @@ class plane
107 107 }
108 108  
109 109 //determines how a vector v intersects the plane (1 = intersects front, 0 = within plane, -1 = intersects back)
110   - CUDA_CALLABLE int face(vec<T> v){
  110 + CUDA_CALLABLE int face(vec3<T> v){
111 111  
112 112 T dprod = v.dot(N); //get the dot product between v and N
113 113  
... ... @@ -121,46 +121,46 @@ class plane
121 121 }
122 122  
123 123 //determine on which side of the plane a point lies (1 = front, 0 = on the plane, -1 = bac k)
124   - CUDA_CALLABLE int side(vec<T> p){
  124 + CUDA_CALLABLE int side(vec3<T> p){
125 125  
126   - vec<T> v = p - P; //get the vector from P to the query point p
  126 + vec3<T> v = p - P; //get the vector from P to the query point p
127 127  
128 128 return face(v);
129 129 }
130 130  
131 131 //compute the component of v that is perpendicular to the plane
132   - CUDA_CALLABLE vec<T> perpendicular(vec<T> v){
  132 + CUDA_CALLABLE vec3<T> perpendicular(vec3<T> v){
133 133 return N * v.dot(N);
134 134 }
135 135  
136 136 //compute the projection of v in the plane
137   - CUDA_CALLABLE vec<T> parallel(vec<T> v){
  137 + CUDA_CALLABLE vec3<T> parallel(vec3<T> v){
138 138 return v - perpendicular(v);
139 139 }
140 140  
141   - CUDA_CALLABLE void setU(vec<T> v)
  141 + CUDA_CALLABLE void setU(vec3<T> v)
142 142 {
143 143 U = (parallel(v.norm())).norm();
144 144 }
145 145  
146   - CUDA_CALLABLE void decompose(vec<T> v, vec<T>& para, vec<T>& perp){
  146 + CUDA_CALLABLE void decompose(vec3<T> v, vec3<T>& para, vec3<T>& perp){
147 147 perp = N * v.dot(N);
148 148 para = v - perp;
149 149 }
150 150  
151 151 //get both the parallel and perpendicular components of a vector v w.r.t. the plane
152   - CUDA_CALLABLE void project(vec<T> v, vec<T> &v_par, vec<T> &v_perp){
  152 + CUDA_CALLABLE void project(vec3<T> v, vec3<T> &v_par, vec3<T> &v_perp){
153 153  
154 154 v_perp = v.dot(N);
155 155 v_par = v - v_perp;
156 156 }
157 157  
158 158 //compute the reflection of v off of the plane
159   - CUDA_CALLABLE vec<T> reflect(vec<T> v){
  159 + CUDA_CALLABLE vec3<T> reflect(vec3<T> v){
160 160  
161 161 //compute the reflection using N_prime as the plane normal
162   - vec<T> par = parallel(v);
163   - vec<T> r = (-v) + par * 2;
  162 + vec3<T> par = parallel(v);
  163 + vec3<T> r = (-v) + par * 2;
164 164 return r;
165 165  
166 166 }
... ... @@ -184,7 +184,7 @@ class plane
184 184 }
185 185  
186 186  
187   - CUDA_CALLABLE void rotate(vec<T> n)
  187 + CUDA_CALLABLE void rotate(vec3<T> n)
188 188 {
189 189 quaternion<T> q;
190 190 q.CreateRotation(N, n);
... ... @@ -194,7 +194,7 @@ class plane
194 194  
195 195 }
196 196  
197   - CUDA_CALLABLE void rotate(vec<T> n, vec<T> &Y)
  197 + CUDA_CALLABLE void rotate(vec3<T> n, vec3<T> &Y)
198 198 {
199 199 quaternion<T> q;
200 200 q.CreateRotation(N, n);
... ... @@ -205,7 +205,7 @@ class plane
205 205  
206 206 }
207 207  
208   - CUDA_CALLABLE void rotate(vec<T> n, vec<T> &X, vec<T> &Y)
  208 + CUDA_CALLABLE void rotate(vec3<T> n, vec3<T> &X, vec3<T> &Y)
209 209 {
210 210 quaternion<T> q;
211 211 q.CreateRotation(N, n);
... ...
stim/math/quaternion.h
... ... @@ -43,6 +43,8 @@ public:
43 43  
44 44 CUDA_CALLABLE void CreateRotation(vec3<T> from, vec3<T> to){
45 45  
  46 + from = from.norm();
  47 + to = to.norm();
46 48 vec3<T> r = from.cross(to); //compute the rotation vector
47 49 T theta = asin(r.len()); //compute the angle of the rotation about r
48 50 //deal with a zero vector (both k and kn point in the same direction)
... ...
stim/math/vec3.h
... ... @@ -217,12 +217,12 @@ public:
217 217 std::string str() const{
218 218 std::stringstream ss;
219 219  
220   - size_t N = size();
  220 + const size_t N = 3;
221 221  
222 222 ss<<"[";
223 223 for(size_t i=0; i<N; i++)
224 224 {
225   - ss<<at(i);
  225 + ss<<ptr[i];
226 226 if(i != N-1)
227 227 ss<<", ";
228 228 }
... ... @@ -230,7 +230,10 @@ public:
230 230  
231 231 return ss.str();
232 232 }
233   - }; //end class triple
  233 +
  234 + size_t size(){ return 3; }
  235 +
  236 + }; //end class vec3
234 237 } //end namespace stim
235 238  
236 239 /// Multiply a vector by a constant when the vector is on the right hand side
... ...
stim/math/vector.h
... ... @@ -317,6 +317,15 @@ struct vec : public std::vector&lt;T&gt;
317 317 return *this;
318 318 }
319 319  
  320 + /// Cast to a vec3
  321 + operator vec3<T>(){
  322 + vec3<T> r;
  323 + size_t N = std::min<size_t>(size(), 3);
  324 + for(size_t i = 0; i < N; i++)
  325 + r[i] = at(i);
  326 + return r;
  327 + }
  328 +
320 329 /// Casting and assignment
321 330 template<typename Y>
322 331 vec<T> & operator=(vec<Y> rhs){
... ...
stim/optics/mie.h 0 → 100644
  1 +#ifndef STIM_MIE_H
  2 +#define STIM_MIE_H
  3 +
  4 +#include "scalarwave.h"
  5 +#include "../math/bessel.h"
  6 +#include "../cuda/cudatools/devices.h"
  7 +#include <cmath>
  8 +
  9 +namespace stim{
  10 +
  11 +
  12 +/// Calculate the scattering coefficients for a spherical scatterer
  13 +template<typename T>
  14 +void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){
  15 +
  16 + //temporary variables
  17 + double vm; //allocate space to store the return values for the bessel function calculation
  18 + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
  19 + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
  20 + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
  21 + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
  22 +
  23 + stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  24 + stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  25 + stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  26 + stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  27 +
  28 + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions)
  29 + stim::complex<double> kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives)
  30 +
  31 + stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a
  32 + stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a
  33 +
  34 + stim::complex<double> h_ka, dh_ka;
  35 + stim::complex<double> numerator, denominator;
  36 + stim::complex<double> i(0, 1);
  37 + for(int l = 0; l <= Nl; l++){
  38 + h_ka.r = j_ka[l];
  39 + h_ka.i = y_ka[l];
  40 + dh_ka.r = dj_ka[l];
  41 + dh_ka.i = dy_ka[l];
  42 +
  43 + numerator = j_ka[l] * dj_kna[l] * (stim::complex<double>)n - j_kna[l] * dj_ka[l];
  44 + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
  45 + B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
  46 + std::cout<<B[l]<<std::endl;
  47 + }
  48 +}
  49 +
  50 +template<typename T>
  51 +void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){
  52 + //temporary variables
  53 + double vm; //allocate space to store the return values for the bessel function calculation
  54 + double* j_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
  55 + double* y_ka = (double*) malloc( (Nl + 1) * sizeof(double) );
  56 + double* dj_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
  57 + double* dy_ka= (double*) malloc( (Nl + 1) * sizeof(double) );
  58 +
  59 + stim::complex<double>* j_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  60 + stim::complex<double>* y_kna = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  61 + stim::complex<double>* dj_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  62 + stim::complex<double>* dy_kna= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  63 +
  64 + double ka = k * a; //store k*a (argument for spherical bessel and Hankel functions)
  65 + stim::complex<double> kna = k * n * a; //store k*n*a (argument for spherical bessel functions and derivatives)
  66 +
  67 + stim::bessjyv_sph<double>(Nl, ka, vm, j_ka, y_ka, dj_ka, dy_ka); //calculate bessel functions and derivatives for k*a
  68 + stim::cbessjyva_sph<double>(Nl, kna, vm, j_kna, y_kna, dj_kna, dy_kna); //calculate complex bessel functions for k*n*a
  69 +
  70 + stim::complex<double> h_ka, dh_ka;
  71 + stim::complex<double> numerator, denominator;
  72 + stim::complex<double> i(0, 1);
  73 + for(size_t l = 0; l <= Nl; l++){
  74 + h_ka.r = j_ka[l];
  75 + h_ka.i = y_ka[l];
  76 + dh_ka.r = dj_ka[l];
  77 + dh_ka.i = dy_ka[l];
  78 +
  79 + numerator = j_ka[l] * dh_ka - dj_ka[l] * h_ka;
  80 + denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex<double>)n;
  81 + A[l] = (2 * l + 1) * pow(i, l) * numerator / denominator;
  82 + }
  83 +}
  84 +
  85 +#define LOCAL_NL 16
  86 +template<typename T>
  87 +__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, int Nl){
  88 + extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory
  89 +
  90 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  91 + if(i >= N) return; //exit if this thread is outside the array
  92 + stim::vec3<T> p;
  93 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  94 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  95 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  96 +
  97 + T r = p.len(); //calculate the distance from the sphere
  98 + if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field)
  99 + T k = W[0].kmag();
  100 + size_t NC = Nl + 1; //calculate the number of coefficients to be used
  101 + T kr = r * k; //calculate the thread value for k*r
  102 + T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT
  103 + size_t ij = (size_t) fij; //convert to an integral index
  104 + T alpha = fij - ij; //calculate the fractional portion of the index
  105 + size_t n0j = ij * (NC); //start of the first entry in the LUT
  106 + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT
  107 +
  108 + T cos_phi;
  109 + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials
  110 +
  111 + stim::complex<T> hBl;
  112 + stim::complex<T> Ei = 0; //create a register to store the result
  113 + int l;
  114 +
  115 + stim::complex<T> hlBl[LOCAL_NL+1];
  116 + int shared_start = threadIdx.x * (Nl - LOCAL_NL);
  117 +
  118 + #pragma unroll LOCAL_NL+1
  119 + for(l = 0; l <= LOCAL_NL; l++)
  120 + hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
  121 +
  122 + for(l = LOCAL_NL+1; l <= Nl; l++)
  123 + shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
  124 +
  125 + for(size_t w = 0; w < nW; w++){
  126 + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere
  127 + Pl_2 = 1;
  128 + Pl_1 = cos_phi;
  129 + Ei += W[w].E() * hlBl[0] * Pl_2;
  130 + Ei += W[w].E() * hlBl[1] * Pl_1;
  131 +
  132 + #pragma unroll LOCAL_NL-1
  133 + for(l = 2; l <= LOCAL_NL; l++){
  134 + Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
  135 + Ei += W[w].E() * hlBl[l] * Pl;
  136 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  137 + Pl_1 = Pl;
  138 + }
  139 +
  140 + for(l = LOCAL_NL+1; l <= Nl; l++){
  141 + Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
  142 + Ei += W[w].E() * shared_hB[shared_start + (l - (LOCAL_NL+1))] * Pl;
  143 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  144 + Pl_1 = Pl;
  145 +
  146 + }
  147 + }
  148 + E[i] += Ei; //copy the result to device memory
  149 +}
  150 +
  151 +template<typename T>
  152 +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
  153 +
  154 + size_t max_shared_mem = stim::sharedMemPerBlock();
  155 + int hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
  156 + std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
  157 + std::cout<<"shared memory: "<<max_shared_mem<<std::endl;
  158 + int threads = (max_shared_mem / hBl_array) / 32 * 32;
  159 + std::cout<<"threads per block: "<<threads<<std::endl;
  160 + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  161 +
  162 + size_t shared_mem;
  163 + if(Nl <= LOCAL_NL) shared_mem = 0;
  164 + else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate
  165 + std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
  166 + cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel
  167 +
  168 +}
  169 +
  170 +template<typename T>
  171 +__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){
  172 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  173 + if(i >= N) return; //exit if this thread is outside the array
  174 +
  175 + stim::vec3<T> p;
  176 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  177 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  178 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  179 +
  180 + r[i] = p.len();
  181 +}
  182 +/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
  183 +
  184 +/// @param E is a pointer to the destination field values
  185 +/// @param N is the number of points used to calculate the field
  186 +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
  187 +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
  188 +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
  189 +/// @param W is an array of planewaves that will be scattered
  190 +/// @param a is the radius of the sphere
  191 +/// @param n is the complex refractive index of the sphere
  192 +template<typename T>
  193 +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){
  194 + //calculate the necessary number of orders required to represent the scattered field
  195 + T k = W[0].kmag();
  196 +
  197 + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
  198 + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization)
  199 + std::cout<<"Nl: "<<Nl<<std::endl;
  200 +
  201 + //calculate the scattering coefficients for the sphere
  202 + stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
  203 + B_coefficients(B, a, k, n, Nl);
  204 +
  205 +#ifdef CUDA_FOUND
  206 + stim::complex<T>* dev_E; //allocate space for the field
  207 + cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
  208 + cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
  209 + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
  210 +
  211 + // COORDINATES
  212 + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified)
  213 + if(x != NULL){
  214 + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
  215 + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
  216 + }
  217 + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified)
  218 + if(y != NULL){
  219 + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
  220 + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
  221 + }
  222 + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified)
  223 + if(z != NULL){
  224 + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
  225 + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
  226 + }
  227 +
  228 + // PLANE WAVES
  229 + stim::scalarwave<T>* dev_W; //allocate space and copy plane waves
  230 + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
  231 + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
  232 +
  233 + // BESSEL FUNCTION LOOK-UP TABLE
  234 + //calculate the distance from the sphere center
  235 + T* dev_r;
  236 + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
  237 +
  238 + int threads = stim::maxThreadsPerBlock();
  239 + dim3 blocks((unsigned)(N / threads + 1));
  240 + cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
  241 +
  242 + //Find the minimum and maximum values of r
  243 + cublasStatus_t stat;
  244 + cublasHandle_t handle;
  245 +
  246 + stat = cublasCreate(&handle); //create a cuBLAS handle
  247 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  248 + printf ("CUBLAS initialization failed\n");
  249 + exit(1);
  250 + }
  251 +
  252 + int i_min, i_max;
  253 + stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
  254 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  255 + printf ("CUBLAS Error: failed to calculate minimum r value.\n");
  256 + exit(1);
  257 + }
  258 + stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
  259 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  260 + printf ("CUBLAS Error: failed to calculate maximum r value.\n");
  261 + exit(1);
  262 + }
  263 +
  264 + T r_min, r_max; //allocate space to store the minimum and maximum values
  265 + HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
  266 + HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
  267 +
  268 +
  269 + //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r
  270 + size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1);
  271 +
  272 + T kr_min = k * r_min;
  273 + T kr_max = k * r_max;
  274 +
  275 + //temporary variables
  276 + double vm; //allocate space to store the return values for the bessel function calculation
  277 + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
  278 + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
  279 + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
  280 + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
  281 +
  282 + size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;
  283 + stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table
  284 + T dkr = (kr_max - kr_min) / (N_hB_lut-1); //distance between values in the LUT
  285 + std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl;
  286 + stim::complex<T> hl;
  287 + for(size_t kri = 0; kri < N_hB_lut; kri++){ //for each value in the LUT
  288 + stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
  289 + for(size_t l = 0; l <= Nl; l++){ //for each order
  290 + hl.r = (T)jv[l];
  291 + hl.i = (T)yv[l];
  292 +
  293 + hB_lut[kri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result
  294 + }
  295 + }
  296 +
  297 + //stim::cpu2image<T>(hankel_lut, "hankel.bmp", Nl+1, Nlut_j, stim::cmBrewer);
  298 +
  299 + //Allocate device memory and copy everything to the GPU
  300 + stim::complex<T>* dev_hB_lut;
  301 + HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
  302 + HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
  303 +
  304 + gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, kr_min, dkr, N_hB_lut, Nl);
  305 +
  306 + cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
  307 +
  308 + if(x != NULL) cudaFree(dev_x); //free everything
  309 + if(y != NULL) cudaFree(dev_y);
  310 + if(z != NULL) cudaFree(dev_z);
  311 + cudaFree(dev_E);
  312 +#else
  313 +
  314 +
  315 + //allocate space to store the bessel function call results
  316 + double vm;
  317 + double* j_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
  318 + double* y_kr = (double*) malloc( (Nl + 1) * sizeof(double) );
  319 + double* dj_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
  320 + double* dy_kr= (double*) malloc( (Nl + 1) * sizeof(double) );
  321 +
  322 + T* P = (T*) malloc( (Nl + 1) * sizeof(T) );
  323 +
  324 + T r, kr, cos_phi;
  325 + stim::complex<T> h;
  326 + for(size_t i = 0; i < N; i++){
  327 + stim::vec3<T> p; //declare a 3D point
  328 +
  329 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  330 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  331 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  332 + r = p.len();
  333 + if(r >= a){
  334 + for(size_t w = 0; w < W.size(); w++){
  335 + kr = p.len() * W[w].kmag(); //calculate k*r
  336 + stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr);
  337 + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction
  338 + stim::legendre<T>(Nl, cos_phi, P);
  339 +
  340 + for(size_t l = 0; l <= Nl; l++){
  341 + h.r = j_kr[l];
  342 + h.i = y_kr[l];
  343 + E[i] += W[w].E() * B[l] * h * P[l];
  344 + }
  345 + }
  346 + }
  347 + }
  348 +#endif
  349 +}
  350 +
  351 +template<typename T>
  352 +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
  353 + std::vector< stim::scalarwave<T> > W(1, w);
  354 + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n);
  355 +}
  356 +
  357 +/// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere
  358 +
  359 +/// @param E is a pointer to the destination field values
  360 +/// @param N is the number of points used to calculate the field
  361 +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
  362 +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
  363 +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
  364 +/// @param w is a planewave that will be scattered
  365 +/// @param a is the radius of the sphere
  366 +/// @param n is the complex refractive index of the sphere
  367 +template<typename T>
  368 +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n){
  369 +
  370 + //calculate the necessary number of orders required to represent the scattered field
  371 + T k = W[0].kmag();
  372 +
  373 + size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
  374 + std::cout<<"Nl: "<<Nl<<std::endl;
  375 +
  376 + //calculate the scattering coefficients for the sphere
  377 + stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
  378 + A_coefficients(A, a, k, n, Nl);
  379 +
  380 + //allocate space to store the bessel function call results
  381 + double vm;
  382 + stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  383 + stim::complex<double>* y_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  384 + stim::complex<double>* dj_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  385 + stim::complex<double>* dy_knr= (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
  386 +
  387 + T* P = (T*) malloc( (Nl + 1) * sizeof(T) );
  388 +
  389 + T r, cos_phi;
  390 + stim::complex<double> knr;
  391 + stim::complex<T> h;
  392 + for(size_t i = 0; i < N; i++){
  393 + stim::vec3<T> p; //declare a 3D point
  394 +
  395 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  396 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  397 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  398 + r = p.len();
  399 + if(r < a){
  400 + E[i] = 0;
  401 + for(size_t w = 0; w < W.size(); w++){
  402 + knr = (stim::complex<double>)n * p.len() * W[w].kmag(); //calculate k*n*r
  403 +
  404 + stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr);
  405 + if(r == 0)
  406 + cos_phi = 0;
  407 + else
  408 + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction
  409 + stim::legendre<T>(Nl, cos_phi, P);
  410 +
  411 + for(size_t l = 0; l <= Nl; l++){
  412 + E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l];
  413 + }
  414 + }
  415 + }
  416 + }
  417 +}
  418 +
  419 +template<typename T>
  420 +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n){
  421 + std::vector< stim::scalarwave<T> > W(1, w);
  422 + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n);
  423 +}
  424 +
  425 +}
  426 +
  427 +#endif
0 428 \ No newline at end of file
... ...
stim/optics/scalarbeam.h
... ... @@ -5,7 +5,12 @@
5 5 #include "../optics/scalarwave.h"
6 6 #include "../math/bessel.h"
7 7 #include "../math/legendre.h"
  8 +#include "../cuda/cudatools/devices.h"
  9 +#include "../cuda/cudatools/timer.h"
  10 +#include <cublas_v2.h>
  11 +#include <math_constants.h>
8 12 #include <vector>
  13 +#include <stdlib.h>
9 14  
10 15 namespace stim{
11 16  
... ... @@ -105,10 +110,11 @@ public:
105 110 std::vector< scalarwave<T> > samples(N); //create a vector of plane waves
106 111 T kmag = (T)stim::TAU / lambda; //calculate the wavenumber
107 112 stim::complex<T> apw; //allocate space for the amplitude at the focal point
  113 + T a = (T)(stim::TAU * (1 - cos(asin(NA[0]))) / (double)N);
108 114 stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction
109 115 for(size_t i=0; i<N; i++){ //for each sample
110 116 kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave
111   - apw = exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave
  117 + apw = a * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave
112 118 samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction
113 119 }
114 120  
... ... @@ -148,7 +154,7 @@ public:
148 154 /// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional)
149 155 /// @param C is a pointer to Nl + 1 values where the terms will be stored
150 156 template<typename T>
151   -CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){
  157 +CUDA_CALLABLE void cpu_aperture_integral(T* C, int Nl, T NA, T NA_in = 0){
152 158  
153 159 size_t table_bytes = (Nl + 1) * sizeof(T); //calculate the number of bytes required to store the terms
154 160 T cos_alpha_1 = cos(asin(NA_in)); //calculate the cosine of the angle subtended by the central obscuration
... ... @@ -182,23 +188,156 @@ CUDA_CALLABLE void cpu_aperture_integral(T* C, size_t Nl, T NA, T NA_in = 0){
182 188  
183 189 /// performs linear interpolation into a look-up table
184 190 template<typename T>
185   -T lut_lookup(T* lut, T val, size_t N, T min_val, T delta, size_t stride = 0){
186   - size_t idx = (size_t)((val - min_val) / delta);
187   - T alpha = val - idx * delta + min_val;
  191 +CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val, T delta, size_t n_vals){
  192 + T idx = ((val - min_val) / delta);
  193 + size_t i = (size_t) idx;
  194 + T a1 = idx - i;
  195 + T a0 = 1 - a1;
  196 + size_t n0 = i * n_vals;
  197 + size_t n1 = (i+1) * n_vals;
  198 + for(size_t n = 0; n < n_vals; n++){
  199 + lut_values[n] = lut[n0 + n] * a0 + lut[n1 + n] * a1;
  200 + }
  201 +}
  202 +
  203 +template <typename T>
  204 +CUDA_CALLABLE stim::complex<T> clerp(stim::complex<T> v0, stim::complex<T> v1, T t) {
  205 + return stim::complex<T>( fma(t, v1.r, fma(-t, v0.r, v0.r)), fma(t, v1.i, fma(-t, v0.i, v0.i)) );
  206 +}
  207 +
  208 +template <typename T>
  209 +CUDA_CALLABLE T lerp(T v0, T v1, T t) {
  210 + return fma(t, v1, fma(-t, v0, v0));
  211 +}
188 212  
189   - if(alpha == 0) return lut[idx];
190   - else return lut[idx * stride] * (1 - alpha) + lut[ (idx+1) * stride] * alpha;
  213 +#ifdef __CUDACC__
  214 +template<typename T>
  215 +__global__ void cuda_scalar_psf(stim::complex<T>* E, size_t N, T* r, T* phi, T k, T A, size_t Nl,
  216 + T* C,
  217 + T* lut_j, size_t Nj, T min_kr, T dkr){
  218 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  219 + if(i >= N) return; //exit if this thread is outside the array
  220 +
  221 + T cos_phi = cos(phi[i]); //calculate the thread value for cos(phi)
  222 + T kr = r[i] * k; //calculate the thread value for k*r
  223 + stim::complex<T> Ei = 0; //initialize the value of the field to zero
  224 + size_t NC = Nl + 1; //calculate the number of coefficients to be used
  225 +
  226 + T fij = (kr - min_kr)/dkr; //FP index into the spherical bessel LUT
  227 + size_t ij = (size_t) fij; //convert to an integral index
  228 + T a = fij - ij; //calculate the fractional portion of the index
  229 + size_t n0j = ij * (NC); //start of the first entry in the LUT
  230 + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT
  231 +
  232 + T jl; //declare register to store the spherical bessel function
  233 + T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials
  234 + T Pl = 1; //initialize the current value for the Legendre polynomial
  235 + stim::complex<T> im(0, 1); //declare i (imaginary 1)
  236 + stim::complex<T> i_pow(1, 0); //i_pow stores the current value of i^l so it doesn't have to be re-computed every iteration
  237 + for(int l = 0; l <= Nl; l++){ //for each order
  238 + jl = lerp<T>( lut_j[n0j + l], lut_j[n1j + l], a ); //read jl from the LUT and interpolate the result
  239 + Ei += i_pow * jl * Pl * C[l]; //calculate the value for the field and sum
  240 + i_pow *= im; //multiply i^l * i for the next iteration
  241 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  242 + Pl_1 = Pl;
  243 + if(l == 0){ //computing Pl is done recursively, where the recursive relation
  244 + Pl = cos_phi; // requires the first two orders. This defines the second.
  245 + }
  246 + else{ //if this is not the first iteration, use the recursive relation to calculate Pl
  247 + Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
  248 + }
  249 +
  250 + }
  251 + E[i] = Ei * A * 2 * CUDART_PI_F; //scale the integral by the amplitude
191 252 }
192 253  
193 254 template<typename T>
194   -void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){
195   - T k = stim::TAU / lambda;
  255 +void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl, T r_spacing){
196 256  
197   - T* C = (T*) malloc( (Nl + 1) * sizeof(T) ); //allocate space for the aperture integral terms
  257 + //Find the minimum and maximum values of r
  258 + cublasStatus_t stat;
  259 + cublasHandle_t handle;
  260 +
  261 + stat = cublasCreate(&handle); //create a cuBLAS handle
  262 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  263 + printf ("CUBLAS initialization failed\n");
  264 + exit(1);
  265 + }
  266 +
  267 + int i_min, i_max;
  268 + stat = cublasIsamin(handle, (int)N, r, 1, &i_min);
  269 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  270 + printf ("CUBLAS Error: failed to calculate minimum r value.\n");
  271 + exit(1);
  272 + }
  273 + stat = cublasIsamax(handle, (int)N, r, 1, &i_max);
  274 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  275 + printf ("CUBLAS Error: failed to calculate maximum r value.\n");
  276 + exit(1);
  277 + }
  278 +
  279 + T r_min, r_max; //allocate space to store the minimum and maximum values
  280 + HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
  281 + HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
  282 +
  283 + T k = (T)stim::TAU / lambda; //calculate the wavenumber from lambda
  284 + size_t C_bytes = (Nl + 1) * sizeof(T);
  285 + T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms
  286 + cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms
  287 +
  288 + size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r
  289 +
  290 + T kr_min = k * r_min;
  291 + T kr_max = k * r_max;
  292 +
  293 + //temporary variables
  294 + double vm; //allocate space to store the return values for the bessel function calculation
  295 + double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
  296 + double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
  297 + double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
  298 + double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
  299 +
  300 + size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
  301 + T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table
  302 + T delta_kr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT
  303 + std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl;
  304 + for(size_t kri = 0; kri < Nlut_j; kri++){ //for each value in the LUT
  305 + stim::bessjyv_sph<double>(Nl, kr_min + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
  306 + for(size_t l = 0; l <= Nl; l++){ //for each order
  307 + bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result
  308 + }
  309 + }
  310 +
  311 + stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
  312 +
  313 + //Allocate device memory and copy everything to the GPU
  314 +
  315 + T* gpu_C;
  316 + HANDLE_ERROR( cudaMalloc(&gpu_C, C_bytes) );
  317 + HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) );
  318 + T* gpu_j_lut;
  319 + HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) );
  320 + HANDLE_ERROR( cudaMemcpy(gpu_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) );
  321 +
  322 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  323 + dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  324 +
  325 + cuda_scalar_psf<T><<< blocks, threads >>>(E, N, r, phi, (T)stim::TAU/lambda, A, Nl, gpu_C, gpu_j_lut, Nlut_j, kr_min, delta_kr);
  326 +
  327 + //free the LUT and condenser tables
  328 + HANDLE_ERROR( cudaFree(gpu_C) );
  329 + HANDLE_ERROR( cudaFree(gpu_j_lut) );
  330 +}
  331 +#endif
  332 +
  333 +/// Calculate the analytical solution to a scalar point spread function given a set of spherical coordinates about the PSF (beam propagation along phi = theta = 0)
  334 +template<typename T>
  335 +void cpu_scalar_psf_local(stim::complex<T>* F, size_t N, T* r, T* phi, T lambda, T A, T NA, T NA_in, int Nl){
  336 + T k = (T)stim::TAU / lambda;
  337 + size_t C_bytes = (Nl + 1) * sizeof(T);
  338 + T* C = (T*) malloc( C_bytes ); //allocate space for the aperture integral terms
198 339 cpu_aperture_integral(C, Nl, NA, NA_in); //calculate the aperture integral terms
199 340 memset(F, 0, N * sizeof(stim::complex<T>));
200   -#ifdef NO_CUDA
201   - memset(F, 0, N * sizeof(stim::complex<T>));
202 341 T jl, Pl, kr, cos_phi;
203 342  
204 343 double vm;
... ... @@ -225,71 +364,117 @@ void cpu_scalar_psf(stim::complex&lt;T&gt;* F, size_t N, T* r, T* phi, T lambda, T A,
225 364  
226 365 free(C);
227 366 free(Pl_cos_phi);
228   -#else
229   - T min_r = r[0];
230   - T max_r = r[0];
231   - for(size_t i = 0; i < N; i++){ //find the minimum and maximum values of r (min and max distance from the focal point)
232   - if(r[i] < min_r) min_r = r[i];
233   - if(r[i] > max_r) max_r = r[i];
234   - }
235   - T min_kr = k * min_r;
236   - T max_kr = k * max_r;
  367 +}
237 368  
238   - //temporary variables
239   - double vm;
240   - double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
241   - double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
242   - double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
243   - double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
  369 +/// Converts a set of cartesian points into spherical coordinates surrounding a point spread function (PSF)
  370 +/// @param r is the output distance from the PSF
  371 +/// @param phi is the non-symmetric direction about the PSF
  372 +/// @param x (x, y, z) are the cartesian coordinates in world space
  373 +/// @f is the focal point of the PSF in cartesian coordinates
  374 +/// @d is the propagation direction of the PSF in cartesian coordinates
  375 +template<typename T>
  376 +__global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::vec3<T> f, stim::quaternion<T> q){
244 377  
245   - size_t Nlut = (size_t)sqrt(N) * 2;
246   - T* bessel_lut = (T*) malloc(sizeof(T) * (Nl+1) * Nlut);
247   - T delta_kr = (max_kr - min_kr) / (Nlut-1);
248   - for(size_t kri = 0; kri < Nlut; kri++){
249   - stim::bessjyv_sph<double>(Nl, min_kr + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
250   - for(size_t l = 0; l <= Nl; l++){
251   - bessel_lut[kri * (Nl + 1) + l] = (T)jv[l];
252   - }
253   - }
  378 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  379 + if(i >= N) return; //exit if this thread is outside the array
254 380  
255   - T* Pl_cos_phi = (T*) malloc((Nl + 1) * sizeof(T));
256   - T kr, cos_phi, jl, Pl;
257   - for(size_t n = 0; n < N; n++){ //for each point in the field
258   - kr = k * r[n]; //calculate kr (the optical distance between the focal point and p)
259   - cos_phi = std::cos(phi[n]); //calculate the cosine of phi
260   - stim::legendre<T>(Nl, cos_phi, Pl_cos_phi); //calculate the [0 Nl] legendre polynomials for this point
  381 + stim::vec3<T> p; //declare a 3D point
  382 +
  383 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  384 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  385 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
261 386  
262   - for(int l = 0; l <= Nl; l++){
263   - jl = lut_lookup<T>(&bessel_lut[l], kr, Nlut, min_kr, delta_kr, Nl+1);
264   - Pl = Pl_cos_phi[l];
265   - F[n] += pow(complex<T>(0, 1), l) * jl * Pl * C[l];
266   - }
267   - F[n] *= A * stim::TAU;
268   - }
269   -#endif
  387 + p = p - f; //shift the point to the center of the PSF (focal point)
  388 + p = q.toMatrix3() * p; //rotate the point to align with the propagation direction
  389 +
  390 + stim::vec3<T> ps = p.cart2sph(); //convert from cartesian to spherical coordinates
  391 + r[i] = ps[0]; //store r
  392 + phi[i] = ps[2]; //phi = [0 pi]
270 393 }
271 394  
  395 +#ifdef __CUDACC__
  396 +/// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates
  397 +template<typename T>
  398 +void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
  399 +
  400 + T* gpu_r; //allocate space for the coordinates in r
  401 + HANDLE_ERROR( cudaMalloc(&gpu_r, sizeof(T) * N) );
  402 + T* gpu_phi;
  403 + HANDLE_ERROR( cudaMalloc(&gpu_phi, sizeof(T) * N) );
  404 + //stim::complex<T>* gpu_E;
  405 + //HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) );
  406 +
  407 + stim::quaternion<T> q; //create a quaternion
  408 + q.CreateRotation(d, stim::vec3<T>(0, 0, 1)); //create a mapping from the propagation direction to the PSF space
  409 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  410 + dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  411 + cuda_cart2psf<T> <<< blocks, threads >>> (gpu_r, gpu_phi, N, x, y, z, f, q); //call the CUDA kernel to move the cartesian coordinates to PSF space
  412 +
  413 + gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing);
  414 +
  415 +}
  416 +#endif
272 417  
273 418 template<typename T>
274   -void cpu_scalar_psf(stim::complex<T>* F, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, T NA, T NA_in, int Nl){
  419 +void cpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3<T> f, stim::vec3<T> d, T NA, T NA_in, int Nl, T r_spacing = 1){
  420 +
  421 +// If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel
  422 +#ifdef __CUDACC__
  423 +
  424 + T* gpu_x = NULL;
  425 + if(x != NULL){
  426 + HANDLE_ERROR( cudaMalloc(&gpu_x, sizeof(T) * N) );
  427 + HANDLE_ERROR( cudaMemcpy(gpu_x, x, sizeof(T) * N, cudaMemcpyHostToDevice) );
  428 + }
  429 + T* gpu_y = NULL;
  430 + if(y != NULL){
  431 + HANDLE_ERROR( cudaMalloc(&gpu_y, sizeof(T) * N) );
  432 + HANDLE_ERROR( cudaMemcpy(gpu_y, y, sizeof(T) * N, cudaMemcpyHostToDevice) );
  433 + }
  434 + T* gpu_z = NULL;
  435 + if(z != NULL){
  436 + HANDLE_ERROR( cudaMalloc(&gpu_z, sizeof(T) * N) );
  437 + HANDLE_ERROR( cudaMemcpy(gpu_z, z, sizeof(T) * N, cudaMemcpyHostToDevice) );
  438 + }
  439 +
  440 + stim::complex<T>* gpu_E;
  441 + HANDLE_ERROR( cudaMalloc(&gpu_E, sizeof(stim::complex<T>) * N) );
  442 + HANDLE_ERROR( cudaMemcpy(gpu_E, E, sizeof(stim::complex<T>) * N, cudaMemcpyHostToDevice) );
  443 + gpu_scalar_psf_cart<T>(gpu_E, N, gpu_x, gpu_y, gpu_z, lambda, A, f, d, NA, NA_in, Nl, r_spacing);
  444 + HANDLE_ERROR( cudaMemcpy(E, gpu_E, sizeof(stim::complex<T>) * N, cudaMemcpyDeviceToHost) );
  445 +
  446 + HANDLE_ERROR( cudaFree(gpu_x) );
  447 + HANDLE_ERROR( cudaFree(gpu_y) );
  448 + HANDLE_ERROR( cudaFree(gpu_z) );
  449 + HANDLE_ERROR( cudaFree(gpu_E) );
  450 +
  451 +#else
275 452 T* r = (T*) malloc(N * sizeof(T)); //allocate space for p in spherical coordinates
276 453 T* phi = (T*) malloc(N * sizeof(T)); // only r and phi are necessary (the scalar PSF is symmetric about theta)
277 454  
278   - stim::vec3<T> p, ps;
  455 + stim::quaternion<T> q;
  456 + q.CreateRotation(d, stim::vec3<T>(0, 0, 1));
  457 + stim::matrix<T, 3> R = q.toMatrix3();
  458 + stim::vec3<T> p, ps, ds;
279 459 for(size_t i = 0; i < N; i++){
280 460 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
281 461 (y == NULL) ? p[1] = 0 : p[1] = y[i];
282 462 (z == NULL) ? p[2] = 0 : p[2] = z[i];
283 463  
  464 + p = p - f;
  465 +
  466 + p = R * p; //rotate the cartesian point
  467 +
284 468 ps = p.cart2sph(); //convert from cartesian to spherical coordinates
285 469 r[i] = ps[0]; //store r
286 470 phi[i] = ps[2]; //phi = [0 pi]
287 471 }
288 472  
289   - cpu_scalar_psf(F, N, r, phi, lambda, A, f, NA, NA_in, Nl); //call the spherical coordinate CPU function
  473 + cpu_scalar_psf_local(F, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function
290 474  
291 475 free(r);
292 476 free(phi);
  477 +#endif
293 478 }
294 479  
295 480 } //end namespace stim
... ...
stim/optics/scalarwave.h
... ... @@ -23,7 +23,7 @@ namespace stim{
23 23 template<typename T>
24 24 class scalarwave{
25 25  
26   -protected:
  26 +public:
27 27  
28 28 stim::vec3<T> k; //k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
29 29 stim::complex<T> E0; //amplitude
... ... @@ -60,7 +60,7 @@ public:
60 60 return k.len();
61 61 }
62 62  
63   - CUDA_CALLABLE vec3< complex<T> > E(){
  63 + CUDA_CALLABLE complex<T> E(){
64 64 return E0;
65 65 }
66 66  
... ... @@ -235,6 +235,32 @@ void gpu_scalarwave(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scala
235 235 cuda_scalarwave<T><<< blocks, threads >>>(F, N, x, y, z, w); //call the kernel
236 236 }
237 237  
  238 +template<typename T>
  239 +void gpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW){
  240 +
  241 + size_t wave_bytes = sizeof(stim::scalarwave<T>);
  242 + size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available
  243 + size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory
  244 + size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required
  245 +
  246 + stim::scalarwave<T>* batch_W;
  247 + HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves
  248 +
  249 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  250 + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  251 +
  252 + size_t batch_size; //declare a variable to store the size of the current batch
  253 + size_t waves_processed = 0; //initialize the number of waves processed to zero
  254 + while(waves_processed < nW){ //while there are still waves to be processed
  255 + batch_size = min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left
  256 + batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
  257 + HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory
  258 + cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size); //call the kernel
  259 + waves_processed += batch_size; //increment the counter indicating how many waves have been processed
  260 + }
  261 + cudaFree(batch_W);
  262 +}
  263 +
238 264 /// Sums a series of coherent plane waves at a specified point
239 265 /// @param field is the output array of field values corresponding to each input point
240 266 /// @param x is an array of x coordinates for the field point
... ... @@ -245,24 +271,13 @@ void gpu_scalarwave(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scala
245 271 /// @param A is the list of amplitudes for each wave
246 272 /// @param S is the list of propagation directions for each wave
247 273 template<typename T>
248   -void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > w_array){
249   - size_t S = w_array.size(); //store the number of waves
250   -#ifdef NO_CUDA
251   - memset(F, 0, N * sizeof(stim::complex<T>));
252   - T px, py, pz;
253   - for(size_t i = 0; i < N; i++){ // for each element in the array
254   - (x == NULL) ? px = 0 : px = x[i]; // test for NULL values
255   - (y == NULL) ? py = 0 : py = y[i];
256   - (z == NULL) ? pz = 0 : pz = z[i];
257   -
258   - for(size_t s = 0; s < S; s++){
259   - F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point
260   - }
261   - }
262   -#else
  274 +void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W){
  275 + size_t S = W.size(); //store the number of waves
  276 +#ifdef __CUDACC__
263 277 stim::complex<T>* dev_F; //allocate space for the field
264 278 cudaMalloc(&dev_F, N * sizeof(stim::complex<T>));
265   - cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
  279 + cudaMemcpy(dev_F, F, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
  280 + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
266 281  
267 282 T* dev_x = NULL; //allocate space and copy the X coordinate (if specified)
268 283 if(x != NULL){
... ... @@ -282,28 +297,11 @@ void cpu_sum_scalarwaves(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, std::v
282 297 HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
283 298 }
284 299  
285