Commit 2a10ecf4739211c4e356bba41fe8c6756b36eb81

Authored by David Mayerich
1 parent 0012d210

updated files and added features to support BIMSim

stim/cuda/cudatools/devices.h
@@ -4,35 +4,56 @@ @@ -4,35 +4,56 @@
4 #include <cuda.h> 4 #include <cuda.h>
5 5
6 namespace stim{ 6 namespace stim{
7 -extern "C"  
8 -int maxThreadsPerBlock()  
9 -{  
10 - int device;  
11 - cudaGetDevice(&device); //get the id of the current device  
12 - cudaDeviceProp props; //device property structure  
13 - cudaGetDeviceProperties(&props, device);  
14 - return props.maxThreadsPerBlock;  
15 -} 7 + extern "C"
  8 + int maxThreadsPerBlock(){
  9 + int device;
  10 + cudaGetDevice(&device); //get the id of the current device
  11 + cudaDeviceProp props; //device property structure
  12 + cudaGetDeviceProperties(&props, device);
  13 + return props.maxThreadsPerBlock;
  14 + }
16 15
17 -extern "C"  
18 -size_t sharedMemPerBlock()  
19 -{  
20 - int device;  
21 - cudaGetDevice(&device); //get the id of the current device  
22 - cudaDeviceProp props; //device property structure  
23 - cudaGetDeviceProperties(&props, device);  
24 - return props.sharedMemPerBlock;  
25 -} 16 + extern "C"
  17 + size_t sharedMemPerBlock(){
  18 + int device;
  19 + cudaGetDevice(&device); //get the id of the current device
  20 + cudaDeviceProp props; //device property structure
  21 + cudaGetDeviceProperties(&props, device);
  22 + return props.sharedMemPerBlock;
  23 + }
26 24
27 -extern "C"  
28 -size_t constMem()  
29 -{  
30 - int device;  
31 - cudaGetDevice(&device); //get the id of the current device  
32 - cudaDeviceProp props; //device property structure  
33 - cudaGetDeviceProperties(&props, device);  
34 - return props.totalConstMem;  
35 -} 25 + extern "C"
  26 + size_t constMem(){
  27 + int device;
  28 + cudaGetDevice(&device); //get the id of the current device
  29 + cudaDeviceProp props; //device property structure
  30 + cudaGetDeviceProperties(&props, device);
  31 + return props.totalConstMem;
  32 + }
  33 +
  34 + //tests that a given device ID is valid and provides at least the specified compute capability
  35 + bool testDevice(int d, int major, int minor){
  36 + int nd;
  37 + cudaGetDeviceCount(&nd); //get the number of CUDA devices
  38 + if(d < nd && d > 0) { //if the given ID has an associated device
  39 + cudaDeviceProp props;
  40 + cudaGetDeviceProperties(&props, d); //get the device properties structure
  41 + if(props.major >= major && props.minor >= minor)
  42 + return true;
  43 + }
  44 + return false;
  45 + }
  46 +
  47 + //tests each device ID in a list and returns the number of devices that fit the desired
  48 + // compute capability
  49 + int testDevices(int* dlist, unsigned n_devices, int major, int minor){
  50 + int valid = 0;
  51 + for(int d = 0; d < n_devices; d++){
  52 + if(testDevice(dlist[d], major, minor))
  53 + valid++;
  54 + }
  55 + return valid;
  56 + }
36 } //end namespace rts 57 } //end namespace rts
37 58
38 #endif 59 #endif
stim/image/image.h
@@ -324,6 +324,12 @@ public: @@ -324,6 +324,12 @@ public:
324 324
325 //save a file 325 //save a file
326 void save(std::string filename){ 326 void save(std::string filename){
  327 + stim::filename file(filename);
  328 + if (file.extension() == "raw" || file.extension() == "") {
  329 + std::ofstream outfile(filename, std::ios::binary);
  330 + outfile.write((char*)img, sizeof(T) * R[0] * R[1] * R[2]);
  331 + outfile.close();
  332 + }
327 #ifdef USING_OPENCV 333 #ifdef USING_OPENCV
328 //OpenCV uses an interleaved format, so convert first and then output 334 //OpenCV uses an interleaved format, so convert first and then output
329 T* buffer = (T*) malloc(bytes()); 335 T* buffer = (T*) malloc(bytes());
stim/math/bessel.h
@@ -17,10 +17,10 @@ static complex&lt;double&gt; czero(0.0,0.0); @@ -17,10 +17,10 @@ static complex&lt;double&gt; czero(0.0,0.0);
17 template< typename P > 17 template< typename P >
18 P gamma(P x) 18 P gamma(P x)
19 { 19 {
20 - const P EPS = numeric_limits<P>::epsilon();  
21 - const P FPMIN_MAG = numeric_limits<P>::min();  
22 - const P FPMIN = numeric_limits<P>::lowest();  
23 - const P FPMAX = numeric_limits<P>::max(); 20 + const P EPS = std::numeric_limits<P>::epsilon();
  21 + const P FPMIN_MAG = std::numeric_limits<P>::min();
  22 + const P FPMIN = std::numeric_limits<P>::lowest();
  23 + const P FPMAX = std::numeric_limits<P>::max();
24 24
25 int i,k,m; 25 int i,k,m;
26 P ga,gr,r,z; 26 P ga,gr,r,z;
@@ -94,10 +94,10 @@ template&lt;typename P&gt; @@ -94,10 +94,10 @@ template&lt;typename P&gt;
94 int bessjy01a(P x,P &j0,P &j1,P &y0,P &y1, 94 int bessjy01a(P x,P &j0,P &j1,P &y0,P &y1,
95 P &j0p,P &j1p,P &y0p,P &y1p) 95 P &j0p,P &j1p,P &y0p,P &y1p)
96 { 96 {
97 - const P EPS = numeric_limits<P>::epsilon();  
98 - const P FPMIN_MAG = numeric_limits<P>::min();  
99 - const P FPMIN = numeric_limits<P>::lowest();  
100 - const P FPMAX = numeric_limits<P>::max(); 97 + const P EPS = std::numeric_limits<P>::epsilon();
  98 + const P FPMIN_MAG = std::numeric_limits<P>::min();
  99 + const P FPMIN = std::numeric_limits<P>::lowest();
  100 + const P FPMAX = std::numeric_limits<P>::max();
101 101
102 P x2,r,ec,w0,w1,r0,r1,cs0,cs1; 102 P x2,r,ec,w0,w1,r0,r1,cs0,cs1;
103 P cu,p0,q0,p1,q1,t1,t2; 103 P cu,p0,q0,p1,q1,t1,t2;
@@ -606,10 +606,10 @@ int bessjyv(P v,P x,P &amp;vm,P *jv,P *yv, @@ -606,10 +606,10 @@ int bessjyv(P v,P x,P &amp;vm,P *jv,P *yv,
606 P b,ec,w0,w1,bju0,bju1,pv0,pv1,byvk; 606 P b,ec,w0,w1,bju0,bju1,pv0,pv1,byvk;
607 int j,k,l,m,n,kz; 607 int j,k,l,m,n,kz;
608 608
609 - const P EPS = numeric_limits<P>::epsilon();  
610 - const P FPMIN_MAG = numeric_limits<P>::min();  
611 - const P FPMIN = numeric_limits<P>::lowest();  
612 - const P FPMAX = numeric_limits<P>::max(); 609 + const P EPS = std::numeric_limits<P>::epsilon();
  610 + const P FPMIN_MAG = std::numeric_limits<P>::min();
  611 + const P FPMIN = std::numeric_limits<P>::lowest();
  612 + const P FPMAX = std::numeric_limits<P>::max();
613 613
614 x2 = x*x; 614 x2 = x*x;
615 n = (int)v; 615 n = (int)v;
stim/optics/scalarbeam.h
@@ -405,7 +405,7 @@ private: @@ -405,7 +405,7 @@ private:
405 T NA[2]; //numerical aperature of the focusing optics 405 T NA[2]; //numerical aperature of the focusing optics
406 vec3<T> f; //focal point 406 vec3<T> f; //focal point
407 vec3<T> d; //propagation direction 407 vec3<T> d; //propagation direction
408 - T A; //beam amplitude 408 + T A; //beam amplitude
409 T lambda; //beam wavelength 409 T lambda; //beam wavelength
410 public: 410 public:
411 411
@@ -440,10 +440,10 @@ public: @@ -440,10 +440,10 @@ public:
440 stim::complex<T> apw; //allocate space for the amplitude at the focal point 440 stim::complex<T> apw; //allocate space for the amplitude at the focal point
441 T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N) 441 T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N)
442 stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction 442 stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction
443 - for(size_t i=0; i<N; i++){ //for each sample 443 + for(size_t i=0; i<N; i++){ //for each sample
444 kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave 444 kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave
445 - apw = a * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave  
446 - samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction 445 + apw = a * A * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave
  446 + samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction
447 } 447 }
448 return samples; 448 return samples;
449 } 449 }
stim/optics/scalarfield.h
@@ -143,8 +143,8 @@ namespace stim{ @@ -143,8 +143,8 @@ namespace stim{
143 143
144 144
145 /// Propagate a field slice along its orthogonal direction by a given distance z 145 /// Propagate a field slice along its orthogonal direction by a given distance z
146 - /// @param Enew is the resulting propogated field  
147 - /// @param E is the field to be propogated 146 + /// @param Enew is the resulting propagated field
  147 + /// @param E is the field to be propagated
148 /// @param sx is the size of the field in the lateral x direction 148 /// @param sx is the size of the field in the lateral x direction
149 /// @param sy is the size of the field in the lateral y direction 149 /// @param sy is the size of the field in the lateral y direction
150 /// @param z is the distance to be propagated 150 /// @param z is the distance to be propagated
@@ -187,12 +187,13 @@ namespace stim{ @@ -187,12 +187,13 @@ namespace stim{
187 187
188 if(kx_sq + ky_sq < k_sq){ 188 if(kx_sq + ky_sq < k_sq){
189 kz = sqrt(k_sq - kx_sq - ky_sq); //estimate kz using the Fresnel approximation 189 kz = sqrt(k_sq - kx_sq - ky_sq); //estimate kz using the Fresnel approximation
190 - shift = -exp(stim::complex<T>(0, kz * z)); 190 + shift = exp(stim::complex<T>(0, kz * z));
  191 + //std::cout << shift << std::endl;
191 K[i] *= shift; 192 K[i] *= shift;
192 K[i] /= (nx*ny); //normalize the DFT 193 K[i] /= (nx*ny); //normalize the DFT
193 } 194 }
194 else{ 195 else{
195 - K[i] = 0; 196 + K[i] /= (nx*ny);
196 } 197 }
197 } 198 }
198 } 199 }
@@ -342,7 +343,7 @@ public: @@ -342,7 +343,7 @@ public:
342 T spacing(){ 343 T spacing(){
343 T du = rect<T>::X.len() / R[0]; 344 T du = rect<T>::X.len() / R[0];
344 T dv = rect<T>::Y.len() / R[1]; 345 T dv = rect<T>::Y.len() / R[1];
345 - return min<T>(du, dv); 346 + return std::min<T>(du, dv);
346 } 347 }
347 348
348 /// Copy the field array to the GPU, if it isn't already there 349 /// Copy the field array to the GPU, if it isn't already there
@@ -537,7 +538,7 @@ public: @@ -537,7 +538,7 @@ public:
537 memset(E, 0, grid_bytes()); 538 memset(E, 0, grid_bytes());
538 } 539 }
539 540
540 - void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer){ 541 + void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer, T minval = 0, T maxval = 0){
541 542
542 if(loc == GPUmem){ 543 if(loc == GPUmem){
543 T* image; 544 T* image;
@@ -559,7 +560,10 @@ public: @@ -559,7 +560,10 @@ public:
559 cuda_intensity<T><<< blocks, threads >>>(image, E, size()); 560 cuda_intensity<T><<< blocks, threads >>>(image, E, size());
560 break; 561 break;
561 } 562 }
562 - stim::gpu2image<T>(image, filename, R[0], R[1], stim::cmBrewer); 563 + if (minval == maxval)
  564 + stim::gpu2image<T>(image, filename, R[0], R[1], cmap);
  565 + else
  566 + stim::gpu2image<T>(image, filename, R[0], R[1], minval, maxval, cmap);
563 HANDLE_ERROR( cudaFree(image) ); 567 HANDLE_ERROR( cudaFree(image) );
564 } 568 }
565 else{ 569 else{
@@ -579,7 +583,10 @@ public: @@ -579,7 +583,10 @@ public:
579 stim::intensity(image, E, size()); 583 stim::intensity(image, E, size());
580 break; 584 break;
581 } 585 }
582 - stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image 586 + if (minval == maxval)
  587 + stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image
  588 + else
  589 + stim::cpu2image<T>(image, filename, R[0], R[1], minval, maxval, cmap);
583 free(image); //free the real image 590 free(image); //free the real image
584 } 591 }
585 } 592 }
stim/optics/scalarmie.h
@@ -89,7 +89,7 @@ void A_coefficients(stim::complex&lt;T&gt;* A, T a, T k, stim::complex&lt;T&gt; n, int Nl){ @@ -89,7 +89,7 @@ void A_coefficients(stim::complex&lt;T&gt;* A, T a, T k, stim::complex&lt;T&gt; n, int Nl){
89 89
90 #define LOCAL_NL 16 90 #define LOCAL_NL 16
91 template<typename T> 91 template<typename T>
92 -__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){ 92 +__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){
93 extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory 93 extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory
94 94
95 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array 95 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
@@ -98,7 +98,7 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* @@ -98,7 +98,7 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T*
98 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions 98 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
99 (y == NULL) ? p[1] = 0 : p[1] = y[i]; 99 (y == NULL) ? p[1] = 0 : p[1] = y[i];
100 (z == NULL) ? p[2] = 0 : p[2] = z[i]; 100 (z == NULL) ? p[2] = 0 : p[2] = z[i];
101 - 101 + p = p - c;
102 T r = p.len(); //calculate the distance from the sphere 102 T r = p.len(); //calculate the distance from the sphere
103 if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) 103 if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field)
104 T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT 104 T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT
@@ -124,24 +124,27 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* @@ -124,24 +124,27 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T*
124 for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory 124 for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory
125 shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha ); 125 shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
126 126
  127 + complex<T> e, Ew;
127 for(size_t w = 0; w < nW; w++){ //for each plane wave 128 for(size_t w = 0; w < nW; w++){ //for each plane wave
128 cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere 129 cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere
129 Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation 130 Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
130 Pl_1 = cos_phi; 131 Pl_1 = cos_phi;
131 - Ei += W[w].E() * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation  
132 - Ei += W[w].E() * hlBl[1] * Pl_1; 132 + e = exp(complex<T>(0, W[w].kvec().dot(c)));
  133 + Ew = W[w].E() * e;
  134 + Ei += Ew * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation
  135 + Ei += Ew * hlBl[1] * Pl_1;
133 136
134 #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) 137 #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file)
135 for(l = 2; l <= LOCAL_NL; l++){ 138 for(l = 2; l <= LOCAL_NL; l++){
136 Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) 139 Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs)
137 - Ei += W[w].E() * hlBl[l] * Pl; //calculate and sum the current field order 140 + Ei += Ew * hlBl[l] * Pl; //calculate and sum the current field order
138 Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 141 Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
139 Pl_1 = Pl; 142 Pl_1 = Pl;
140 } 143 }
141 144
142 for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) 145 for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers)
143 Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs 146 Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs
144 - Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl; 147 + Ei += Ew * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl;
145 Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 148 Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
146 Pl_1 = Pl; 149 Pl_1 = Pl;
147 } 150 }
@@ -149,8 +152,18 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* @@ -149,8 +152,18 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T*
149 E[i] += Ei; //copy the result to device memory 152 E[i] += Ei; //copy the result to device memory
150 } 153 }
151 154
  155 +///Calculate the scalar Mie scattered field on the GPU
  156 +/// @param E (GPU) is the N x N destination scalar field
  157 +/// @param N is the number fo elements of the scalar field in each direction
  158 +/// @param x (GPU) is the grid of X coordinates for each point in E
  159 +/// @param y (GPU) is the grid of Y coordinates for each point in E
  160 +/// @param z (GPU) is the grid of Z coordinates for each point in E
  161 +/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer
  162 +/// @param a is the radius of the Mie scatterer
  163 +/// @param n is the complex refractive index of the Mie scatterer
  164 +/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables)
152 template<typename T> 165 template<typename T>
153 -void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ 166 +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
154 167
155 size_t max_shared_mem = stim::sharedMemPerBlock(); 168 size_t max_shared_mem = stim::sharedMemPerBlock();
156 size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); 169 size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
@@ -164,11 +177,11 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, sti @@ -164,11 +177,11 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, sti
164 if(Nl <= LOCAL_NL) shared_mem = 0; 177 if(Nl <= LOCAL_NL) shared_mem = 0;
165 else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate 178 else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate
166 //std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; 179 //std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
167 - cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel 180 + cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, c, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel
168 } 181 }
169 182
170 template<typename T> 183 template<typename T>
171 -__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ 184 +__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
172 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array 185 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
173 if(i >= N) return; //exit if this thread is outside the array 186 if(i >= N) return; //exit if this thread is outside the array
174 187
@@ -177,10 +190,21 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ @@ -177,10 +190,21 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){
177 (y == NULL) ? p[1] = 0 : p[1] = y[i]; 190 (y == NULL) ? p[1] = 0 : p[1] = y[i];
178 (z == NULL) ? p[2] = 0 : p[2] = z[i]; 191 (z == NULL) ? p[2] = 0 : p[2] = z[i];
179 192
180 - r[i] = p.len(); 193 + r[i] = (p - c).len();
181 } 194 }
  195 +
  196 +///Calculate the scalar Mie scattered field on the GPU
  197 +/// @param E (GPU) is the N x N destination scalar field
  198 +/// @param N is the number fo elements of the scalar field in each direction
  199 +/// @param x (GPU) is the grid of X coordinates for each point in E
  200 +/// @param y (GPU) is the grid of Y coordinates for each point in E
  201 +/// @param z (GPU) is the grid of Z coordinates for each point in E
  202 +/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer
  203 +/// @param a is the radius of the Mie scatterer
  204 +/// @param n is the complex refractive index of the Mie scatterer
  205 +/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables)
182 template<typename T> 206 template<typename T>
183 -void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ 207 +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0), T r_spacing = 0.1){
184 208
185 //calculate the necessary number of orders required to represent the scattered field 209 //calculate the necessary number of orders required to represent the scattered field
186 T k = W[0].kmag(); 210 T k = W[0].kmag();
@@ -205,7 +229,7 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std @@ -205,7 +229,7 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
205 229
206 int threads = stim::maxThreadsPerBlock(); 230 int threads = stim::maxThreadsPerBlock();
207 dim3 blocks((unsigned)(N / threads + 1)); 231 dim3 blocks((unsigned)(N / threads + 1));
208 - cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N); 232 + cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N, c);
209 233
210 //Find the minimum and maximum values of r 234 //Find the minimum and maximum values of r
211 cublasStatus_t stat; 235 cublasStatus_t stat;
@@ -273,10 +297,16 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std @@ -273,10 +297,16 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
273 stim::complex<T>* dev_hB_lut; 297 stim::complex<T>* dev_hB_lut;
274 HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); 298 HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
275 HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); 299 HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
  300 + //std::cout << "r_min: " << r_min << std::endl;
  301 + //std::cout << "dr: " << dr << std::endl;
  302 + gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, c, dev_hB_lut, r_min, dr, N_hB_lut, Nl);
276 303
277 - gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); 304 + HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost)); //copy the field from device memory
  305 +
  306 + HANDLE_ERROR(cudaFree(dev_hB_lut));
  307 + HANDLE_ERROR(cudaFree(dev_r));
  308 + HANDLE_ERROR(cudaFree(dev_W));
278 309
279 - cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory  
280 } 310 }
281 /// Calculate the scalar Mie solution for the scattered field produced by a single plane wave 311 /// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
282 312
@@ -289,10 +319,10 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std @@ -289,10 +319,10 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
289 /// @param a is the radius of the sphere 319 /// @param a is the radius of the sphere
290 /// @param n is the complex refractive index of the sphere 320 /// @param n is the complex refractive index of the sphere
291 template<typename T> 321 template<typename T>
292 -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ 322 +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
293 323
294 324
295 -#ifdef CUDA_FOUND 325 +/*#ifdef CUDA_FOUND
296 stim::complex<T>* dev_E; //allocate space for the field 326 stim::complex<T>* dev_E; //allocate space for the field
297 cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); 327 cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
298 cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); 328 cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
@@ -315,13 +345,14 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std @@ -315,13 +345,14 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
315 HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); 345 HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
316 } 346 }
317 347
318 - gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, r_spacing); 348 + gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, c, r_spacing);
319 349
320 if(x != NULL) cudaFree(dev_x); //free everything 350 if(x != NULL) cudaFree(dev_x); //free everything
321 if(y != NULL) cudaFree(dev_y); 351 if(y != NULL) cudaFree(dev_y);
322 if(z != NULL) cudaFree(dev_z); 352 if(z != NULL) cudaFree(dev_z);
323 cudaFree(dev_E); 353 cudaFree(dev_E);
324 #else 354 #else
  355 +*/
325 356
326 //calculate the necessary number of orders required to represent the scattered field 357 //calculate the necessary number of orders required to represent the scattered field
327 T k = W[0].kmag(); 358 T k = W[0].kmag();
@@ -345,15 +376,18 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std @@ -345,15 +376,18 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
345 376
346 T r, kr, cos_phi; 377 T r, kr, cos_phi;
347 stim::complex<T> h; 378 stim::complex<T> h;
  379 + stim::complex<T> Ew;
348 for(size_t i = 0; i < N; i++){ 380 for(size_t i = 0; i < N; i++){
349 stim::vec3<T> p; //declare a 3D point 381 stim::vec3<T> p; //declare a 3D point
350 382
351 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions 383 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
352 (y == NULL) ? p[1] = 0 : p[1] = y[i]; 384 (y == NULL) ? p[1] = 0 : p[1] = y[i];
353 (z == NULL) ? p[2] = 0 : p[2] = z[i]; 385 (z == NULL) ? p[2] = 0 : p[2] = z[i];
  386 + p = p - c;
354 r = p.len(); 387 r = p.len();
355 if(r >= a){ 388 if(r >= a){
356 for(size_t w = 0; w < W.size(); w++){ 389 for(size_t w = 0; w < W.size(); w++){
  390 + Ew = W[w].E() * exp(stim::complex<double>(0, W[w].kvec().dot(c)));
357 kr = p.len() * W[w].kmag(); //calculate k*r 391 kr = p.len() * W[w].kmag(); //calculate k*r
358 stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr); 392 stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr);
359 cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction 393 cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction
@@ -362,18 +396,18 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std @@ -362,18 +396,18 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
362 for(size_t l = 0; l <= Nl; l++){ 396 for(size_t l = 0; l <= Nl; l++){
363 h.r = j_kr[l]; 397 h.r = j_kr[l];
364 h.i = y_kr[l]; 398 h.i = y_kr[l];
365 - E[i] += W[w].E() * B[l] * h * P[l]; 399 + E[i] += Ew * B[l] * h * P[l];
366 } 400 }
367 } 401 }
368 } 402 }
369 } 403 }
370 -#endif 404 +//#endif
371 } 405 }
372 406
373 template<typename T> 407 template<typename T>
374 -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){ 408 +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0), T r_spacing = 0.1){
375 std::vector< stim::scalarwave<T> > W(1, w); 409 std::vector< stim::scalarwave<T> > W(1, w);
376 - cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing); 410 + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, c, r_spacing);
377 } 411 }
378 412
379 template<typename T> 413 template<typename T>
@@ -389,14 +423,14 @@ __global__ void cuda_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* @@ -389,14 +423,14 @@ __global__ void cuda_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T*
389 423
390 T r = p.len(); //calculate the distance from the sphere 424 T r = p.len(); //calculate the distance from the sphere
391 if(r >= a) return; //exit if the point is inside the sphere (we only calculate the internal field) 425 if(r >= a) return; //exit if the point is inside the sphere (we only calculate the internal field)
392 - T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT 426 + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT
393 size_t ij = (size_t) fij; //convert to an integral index 427 size_t ij = (size_t) fij; //convert to an integral index
394 T alpha = fij - ij; //calculate the fractional portion of the index 428 T alpha = fij - ij; //calculate the fractional portion of the index
395 - size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT  
396 - size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT 429 + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT
  430 + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT
397 431
398 T cos_phi; 432 T cos_phi;
399 - T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials 433 + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials
400 434
401 stim::complex<T> jAl; 435 stim::complex<T> jAl;
402 stim::complex<T> Ei = 0; //create a register to store the result 436 stim::complex<T> Ei = 0; //create a register to store the result
@@ -415,7 +449,7 @@ __global__ void cuda_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* @@ -415,7 +449,7 @@ __global__ void cuda_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T*
415 for(size_t w = 0; w < nW; w++){ //for each plane wave 449 for(size_t w = 0; w < nW; w++){ //for each plane wave
416 if(r == 0) cos_phi = 0; 450 if(r == 0) cos_phi = 0;
417 else 451 else
418 - cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere 452 + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere
419 Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation 453 Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation
420 Pl_1 = cos_phi; 454 Pl_1 = cos_phi;
421 Ei += W[w].E() * jlAl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation 455 Ei += W[w].E() * jlAl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation
@@ -468,7 +502,7 @@ void gpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st @@ -468,7 +502,7 @@ void gpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
468 /// @param a is the radius of the sphere 502 /// @param a is the radius of the sphere
469 /// @param n is the complex refractive index of the sphere 503 /// @param n is the complex refractive index of the sphere
470 template<typename T> 504 template<typename T>
471 -void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, T r_spacing = 0.1){ 505 +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
472 //calculate the necessary number of orders required to represent the scattered field 506 //calculate the necessary number of orders required to represent the scattered field
473 T k = W[0].kmag(); 507 T k = W[0].kmag();
474 508
@@ -480,7 +514,7 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st @@ -480,7 +514,7 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
480 stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients 514 stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
481 A_coefficients(A, a, k, n, Nl); 515 A_coefficients(A, a, k, n, Nl);
482 516
483 -#ifdef CUDA_FOUND 517 +/*#ifdef CUDA_FOUND
484 stim::complex<T>* dev_E; //allocate space for the field 518 stim::complex<T>* dev_E; //allocate space for the field
485 cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); 519 cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
486 cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); 520 cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
@@ -538,6 +572,7 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st @@ -538,6 +572,7 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
538 printf ("CUBLAS Error: failed to calculate maximum r value.\n"); 572 printf ("CUBLAS Error: failed to calculate maximum r value.\n");
539 exit(1); 573 exit(1);
540 } 574 }
  575 + cublasDestroy(handle); //destroy the CUBLAS handle
541 576
542 i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility 577 i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility
543 i_max--; 578 i_max--;
@@ -585,9 +620,9 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st @@ -585,9 +620,9 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
585 HANDLE_ERROR( cudaFree(dev_E) ); 620 HANDLE_ERROR( cudaFree(dev_E) );
586 HANDLE_ERROR( cudaFree(dev_W) ); 621 HANDLE_ERROR( cudaFree(dev_W) );
587 HANDLE_ERROR( cudaFree(dev_r) ); 622 HANDLE_ERROR( cudaFree(dev_r) );
588 - cudaFree(dev_E); 623 + HANDLE_ERROR( cudaFree(dev_E) );
589 #else 624 #else
590 - 625 +*/
591 //allocate space to store the bessel function call results 626 //allocate space to store the bessel function call results
592 double vm; 627 double vm;
593 stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); 628 stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) );
@@ -600,16 +635,19 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st @@ -600,16 +635,19 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
600 T r, cos_phi; 635 T r, cos_phi;
601 stim::complex<double> knr; 636 stim::complex<double> knr;
602 stim::complex<T> h; 637 stim::complex<T> h;
  638 + stim::complex<T> Ew;
603 for(size_t i = 0; i < N; i++){ 639 for(size_t i = 0; i < N; i++){
604 stim::vec3<T> p; //declare a 3D point 640 stim::vec3<T> p; //declare a 3D point
605 641
606 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions 642 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
607 (y == NULL) ? p[1] = 0 : p[1] = y[i]; 643 (y == NULL) ? p[1] = 0 : p[1] = y[i];
608 (z == NULL) ? p[2] = 0 : p[2] = z[i]; 644 (z == NULL) ? p[2] = 0 : p[2] = z[i];
  645 + p = p - c;
609 r = p.len(); 646 r = p.len();
610 if(r < a){ 647 if(r < a){
611 E[i] = 0; 648 E[i] = 0;
612 for(size_t w = 0; w < W.size(); w++){ 649 for(size_t w = 0; w < W.size(); w++){
  650 + Ew = W[w].E() * exp(stim::complex<double>(0, W[w].kvec().dot(c)));
613 knr = (stim::complex<double>)n * p.len() * W[w].kmag(); //calculate k*n*r 651 knr = (stim::complex<double>)n * p.len() * W[w].kmag(); //calculate k*n*r
614 652
615 stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr); 653 stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr);
@@ -620,18 +658,18 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st @@ -620,18 +658,18 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
620 stim::legendre<T>(Nl, cos_phi, P); 658 stim::legendre<T>(Nl, cos_phi, P);
621 659
622 for(size_t l = 0; l <= Nl; l++){ 660 for(size_t l = 0; l <= Nl; l++){
623 - E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l]; 661 + E[i] += Ew * A[l] * (stim::complex<T>)j_knr[l] * P[l];
624 } 662 }
625 } 663 }
626 } 664 }
627 } 665 }
628 -#endif 666 +//#endif
629 } 667 }
630 668
631 template<typename T> 669 template<typename T>
632 -void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){ 670 +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){
633 std::vector< stim::scalarwave<T> > W(1, w); 671 std::vector< stim::scalarwave<T> > W(1, w);
634 - cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, r_spacing); 672 + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, c);
635 } 673 }
636 674
637 675
@@ -639,59 +677,89 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st @@ -639,59 +677,89 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
639 template<typename T> 677 template<typename T>
640 class scalarmie 678 class scalarmie
641 { 679 {
642 -private: 680 +public:
643 T radius; //radius of the scattering sphere 681 T radius; //radius of the scattering sphere
644 stim::complex<T> n; //refractive index of the scattering sphere 682 stim::complex<T> n; //refractive index of the scattering sphere
  683 + vec3<T> c; //position of the sphere in space
645 684
646 public: 685 public:
647 686
648 - scalarmie(T r, stim::complex<T> ri){ 687 + scalarmie() { //default constructor
  688 + radius = 0.5;
  689 + n = stim::complex<T>(1.4, 0.0);
  690 + c = stim::vec3<T>(0, 0, 0);
  691 + }
  692 +
  693 + scalarmie(T r, stim::complex<T> ri, stim::vec3<T> center = stim::vec3<T>(0, 0, 0)){
649 radius = r; 694 radius = r;
650 n = ri; 695 n = ri;
  696 + c = center;
  697 + //c = stim::vec3<T>(2, 1, 0);
651 } 698 }
652 699
653 - void sum_scat(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){  
654 - std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves  
655 - stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());  
656 - } 700 + //void sum_scat(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
  701 + // std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves
  702 + // stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
  703 + //}
657 704
658 - void sum_intern(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){  
659 - std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves  
660 - stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());  
661 - } 705 + //void sum_intern(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
  706 + // std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves
  707 + // stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
  708 + //}
662 709
663 - void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int order = 500, int samples = 1000){  
664 - b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion  
665 - std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves  
666 - sum_scat(E, X, Y, Z, b, samples);  
667 - sum_intern(E, X, Y, Z, b, samples);  
668 - } 710 + //void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int order = 500, int samples = 1000){
  711 + // b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion
  712 + // std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves
  713 + // sum_scat(E, X, Y, Z, b, samples);
  714 + // sum_intern(E, X, Y, Z, b, samples);
  715 + //}
669 716
670 void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ 717 void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){
671 718
672 - /*size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid  
673 - float* X = (float*) malloc( array_size ); //allocate space for the coordinate meshes  
674 - float* Y = (float*) malloc( array_size );  
675 - float* Z = (float*) malloc( array_size );  
676 - E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes  
677 - */  
678 E.meshgrid(); 719 E.meshgrid();
679 b.eval(E, order); 720 b.eval(E, order);
680 721
681 std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves 722 std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves
682 723
683 if(E.gpu()){ 724 if(E.gpu()){
684 - stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); 725 + stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c, E.spacing());
685 } 726 }
686 else{ 727 else{
687 stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); 728 stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing());
688 stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); 729 stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing());
689 } 730 }
690 - //eval(E, X, Y, Z, b, order, samples); //evaluate the field  
691 } 731 }
692 732
693 }; //end stim::scalarmie 733 }; //end stim::scalarmie
694 734
  735 +template<typename T>
  736 +class scalarcluster : public std::vector< scalarmie<T> > {
  737 +public:
  738 +
  739 + void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000) {
  740 + E.meshgrid();
  741 + b.eval(E, order);
  742 +
  743 + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves
  744 +
  745 + T radius;
  746 + stim::complex<T> n;
  747 + stim::vec3<T> c;
  748 + for (size_t si = 0; si < size(); si++) { //for each sphere in the cluster
  749 + radius = at(si).radius;
  750 + n = at(si).n;
  751 + c = at(si).c;
  752 + if (E.gpu()) {
  753 + stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c, E.spacing());
  754 + }
  755 + else {
  756 + stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c);
  757 + stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c);
  758 + }
  759 + }
  760 + }
  761 +};
  762 +
695 } //end namespace stim 763 } //end namespace stim
696 764
697 #endif 765 #endif
698 \ No newline at end of file 766 \ No newline at end of file
stim/optics/scalarwave.h
@@ -252,7 +252,7 @@ void gpu_scalarwaves(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scal @@ -252,7 +252,7 @@ void gpu_scalarwaves(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scal
252 size_t batch_size; //declare a variable to store the size of the current batch 252 size_t batch_size; //declare a variable to store the size of the current batch
253 size_t waves_processed = 0; //initialize the number of waves processed to zero 253 size_t waves_processed = 0; //initialize the number of waves processed to zero
254 while(waves_processed < nW){ //while there are still waves to be processed 254 while(waves_processed < nW){ //while there are still waves to be processed
255 - batch_size = min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left 255 + batch_size = std::min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left
256 batch_bytes = batch_size * sizeof(stim::scalarwave<T>); 256 batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
257 HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory 257 HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory
258 cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size); //call the kernel 258 cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size); //call the kernel
stim/parser/table.h
1 -#ifndef STIM_CSV_H  
2 -#define STIM_CSV_H 1 +#ifndef STIM_TABLE_H
  2 +#define STIM_TABLE_H
3 3
4 #include <type_traits> 4 #include <type_traits>
5 #include <vector> 5 #include <vector>
@@ -102,6 +102,27 @@ namespace stim{ @@ -102,6 +102,27 @@ namespace stim{
102 return result; //return the casted table 102 return result; //return the casted table
103 } 103 }
104 104
  105 + size_t rows(){
  106 + return TABLE.size();
  107 + }
  108 +
  109 + size_t cols(size_t ri){
  110 + return TABLE[ri].size();
  111 + }
  112 +
  113 + //returns the maximum number of columns in the table
  114 + size_t cols(){
  115 + size_t max_c = 0;
  116 + for(size_t ri = 0; ri < rows(); ri++){
  117 + max_c = std::max(max_c, cols(ri));
  118 + }
  119 + return max_c;
  120 + }
  121 +
  122 + std::string operator()(size_t ri, size_t ci){
  123 + return TABLE[ri][ci];
  124 + }
  125 +
105 }; 126 };
106 }; 127 };
107 128