Commit 2a10ecf4739211c4e356bba41fe8c6756b36eb81
1 parent
0012d210
updated files and added features to support BIMSim
Showing
8 changed files
with
235 additions
and
112 deletions
Show diff stats
stim/cuda/cudatools/devices.h
@@ -4,35 +4,56 @@ | @@ -4,35 +4,56 @@ | ||
4 | #include <cuda.h> | 4 | #include <cuda.h> |
5 | 5 | ||
6 | namespace stim{ | 6 | namespace stim{ |
7 | -extern "C" | ||
8 | -int maxThreadsPerBlock() | ||
9 | -{ | ||
10 | - int device; | ||
11 | - cudaGetDevice(&device); //get the id of the current device | ||
12 | - cudaDeviceProp props; //device property structure | ||
13 | - cudaGetDeviceProperties(&props, device); | ||
14 | - return props.maxThreadsPerBlock; | ||
15 | -} | 7 | + extern "C" |
8 | + int maxThreadsPerBlock(){ | ||
9 | + int device; | ||
10 | + cudaGetDevice(&device); //get the id of the current device | ||
11 | + cudaDeviceProp props; //device property structure | ||
12 | + cudaGetDeviceProperties(&props, device); | ||
13 | + return props.maxThreadsPerBlock; | ||
14 | + } | ||
16 | 15 | ||
17 | -extern "C" | ||
18 | -size_t sharedMemPerBlock() | ||
19 | -{ | ||
20 | - int device; | ||
21 | - cudaGetDevice(&device); //get the id of the current device | ||
22 | - cudaDeviceProp props; //device property structure | ||
23 | - cudaGetDeviceProperties(&props, device); | ||
24 | - return props.sharedMemPerBlock; | ||
25 | -} | 16 | + extern "C" |
17 | + size_t sharedMemPerBlock(){ | ||
18 | + int device; | ||
19 | + cudaGetDevice(&device); //get the id of the current device | ||
20 | + cudaDeviceProp props; //device property structure | ||
21 | + cudaGetDeviceProperties(&props, device); | ||
22 | + return props.sharedMemPerBlock; | ||
23 | + } | ||
26 | 24 | ||
27 | -extern "C" | ||
28 | -size_t constMem() | ||
29 | -{ | ||
30 | - int device; | ||
31 | - cudaGetDevice(&device); //get the id of the current device | ||
32 | - cudaDeviceProp props; //device property structure | ||
33 | - cudaGetDeviceProperties(&props, device); | ||
34 | - return props.totalConstMem; | ||
35 | -} | 25 | + extern "C" |
26 | + size_t constMem(){ | ||
27 | + int device; | ||
28 | + cudaGetDevice(&device); //get the id of the current device | ||
29 | + cudaDeviceProp props; //device property structure | ||
30 | + cudaGetDeviceProperties(&props, device); | ||
31 | + return props.totalConstMem; | ||
32 | + } | ||
33 | + | ||
34 | + //tests that a given device ID is valid and provides at least the specified compute capability | ||
35 | + bool testDevice(int d, int major, int minor){ | ||
36 | + int nd; | ||
37 | + cudaGetDeviceCount(&nd); //get the number of CUDA devices | ||
38 | + if(d < nd && d > 0) { //if the given ID has an associated device | ||
39 | + cudaDeviceProp props; | ||
40 | + cudaGetDeviceProperties(&props, d); //get the device properties structure | ||
41 | + if(props.major >= major && props.minor >= minor) | ||
42 | + return true; | ||
43 | + } | ||
44 | + return false; | ||
45 | + } | ||
46 | + | ||
47 | + //tests each device ID in a list and returns the number of devices that fit the desired | ||
48 | + // compute capability | ||
49 | + int testDevices(int* dlist, unsigned n_devices, int major, int minor){ | ||
50 | + int valid = 0; | ||
51 | + for(int d = 0; d < n_devices; d++){ | ||
52 | + if(testDevice(dlist[d], major, minor)) | ||
53 | + valid++; | ||
54 | + } | ||
55 | + return valid; | ||
56 | + } | ||
36 | } //end namespace rts | 57 | } //end namespace rts |
37 | 58 | ||
38 | #endif | 59 | #endif |
stim/image/image.h
@@ -324,6 +324,12 @@ public: | @@ -324,6 +324,12 @@ public: | ||
324 | 324 | ||
325 | //save a file | 325 | //save a file |
326 | void save(std::string filename){ | 326 | void save(std::string filename){ |
327 | + stim::filename file(filename); | ||
328 | + if (file.extension() == "raw" || file.extension() == "") { | ||
329 | + std::ofstream outfile(filename, std::ios::binary); | ||
330 | + outfile.write((char*)img, sizeof(T) * R[0] * R[1] * R[2]); | ||
331 | + outfile.close(); | ||
332 | + } | ||
327 | #ifdef USING_OPENCV | 333 | #ifdef USING_OPENCV |
328 | //OpenCV uses an interleaved format, so convert first and then output | 334 | //OpenCV uses an interleaved format, so convert first and then output |
329 | T* buffer = (T*) malloc(bytes()); | 335 | T* buffer = (T*) malloc(bytes()); |
stim/math/bessel.h
@@ -17,10 +17,10 @@ static complex<double> czero(0.0,0.0); | @@ -17,10 +17,10 @@ static complex<double> czero(0.0,0.0); | ||
17 | template< typename P > | 17 | template< typename P > |
18 | P gamma(P x) | 18 | P gamma(P x) |
19 | { | 19 | { |
20 | - const P EPS = numeric_limits<P>::epsilon(); | ||
21 | - const P FPMIN_MAG = numeric_limits<P>::min(); | ||
22 | - const P FPMIN = numeric_limits<P>::lowest(); | ||
23 | - const P FPMAX = numeric_limits<P>::max(); | 20 | + const P EPS = std::numeric_limits<P>::epsilon(); |
21 | + const P FPMIN_MAG = std::numeric_limits<P>::min(); | ||
22 | + const P FPMIN = std::numeric_limits<P>::lowest(); | ||
23 | + const P FPMAX = std::numeric_limits<P>::max(); | ||
24 | 24 | ||
25 | int i,k,m; | 25 | int i,k,m; |
26 | P ga,gr,r,z; | 26 | P ga,gr,r,z; |
@@ -94,10 +94,10 @@ template<typename P> | @@ -94,10 +94,10 @@ template<typename P> | ||
94 | int bessjy01a(P x,P &j0,P &j1,P &y0,P &y1, | 94 | int bessjy01a(P x,P &j0,P &j1,P &y0,P &y1, |
95 | P &j0p,P &j1p,P &y0p,P &y1p) | 95 | P &j0p,P &j1p,P &y0p,P &y1p) |
96 | { | 96 | { |
97 | - const P EPS = numeric_limits<P>::epsilon(); | ||
98 | - const P FPMIN_MAG = numeric_limits<P>::min(); | ||
99 | - const P FPMIN = numeric_limits<P>::lowest(); | ||
100 | - const P FPMAX = numeric_limits<P>::max(); | 97 | + const P EPS = std::numeric_limits<P>::epsilon(); |
98 | + const P FPMIN_MAG = std::numeric_limits<P>::min(); | ||
99 | + const P FPMIN = std::numeric_limits<P>::lowest(); | ||
100 | + const P FPMAX = std::numeric_limits<P>::max(); | ||
101 | 101 | ||
102 | P x2,r,ec,w0,w1,r0,r1,cs0,cs1; | 102 | P x2,r,ec,w0,w1,r0,r1,cs0,cs1; |
103 | P cu,p0,q0,p1,q1,t1,t2; | 103 | P cu,p0,q0,p1,q1,t1,t2; |
@@ -606,10 +606,10 @@ int bessjyv(P v,P x,P &vm,P *jv,P *yv, | @@ -606,10 +606,10 @@ int bessjyv(P v,P x,P &vm,P *jv,P *yv, | ||
606 | P b,ec,w0,w1,bju0,bju1,pv0,pv1,byvk; | 606 | P b,ec,w0,w1,bju0,bju1,pv0,pv1,byvk; |
607 | int j,k,l,m,n,kz; | 607 | int j,k,l,m,n,kz; |
608 | 608 | ||
609 | - const P EPS = numeric_limits<P>::epsilon(); | ||
610 | - const P FPMIN_MAG = numeric_limits<P>::min(); | ||
611 | - const P FPMIN = numeric_limits<P>::lowest(); | ||
612 | - const P FPMAX = numeric_limits<P>::max(); | 609 | + const P EPS = std::numeric_limits<P>::epsilon(); |
610 | + const P FPMIN_MAG = std::numeric_limits<P>::min(); | ||
611 | + const P FPMIN = std::numeric_limits<P>::lowest(); | ||
612 | + const P FPMAX = std::numeric_limits<P>::max(); | ||
613 | 613 | ||
614 | x2 = x*x; | 614 | x2 = x*x; |
615 | n = (int)v; | 615 | n = (int)v; |
stim/optics/scalarbeam.h
@@ -405,7 +405,7 @@ private: | @@ -405,7 +405,7 @@ private: | ||
405 | T NA[2]; //numerical aperature of the focusing optics | 405 | T NA[2]; //numerical aperature of the focusing optics |
406 | vec3<T> f; //focal point | 406 | vec3<T> f; //focal point |
407 | vec3<T> d; //propagation direction | 407 | vec3<T> d; //propagation direction |
408 | - T A; //beam amplitude | 408 | + T A; //beam amplitude |
409 | T lambda; //beam wavelength | 409 | T lambda; //beam wavelength |
410 | public: | 410 | public: |
411 | 411 | ||
@@ -440,10 +440,10 @@ public: | @@ -440,10 +440,10 @@ public: | ||
440 | stim::complex<T> apw; //allocate space for the amplitude at the focal point | 440 | stim::complex<T> apw; //allocate space for the amplitude at the focal point |
441 | T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N) | 441 | T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N) |
442 | stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction | 442 | stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction |
443 | - for(size_t i=0; i<N; i++){ //for each sample | 443 | + for(size_t i=0; i<N; i++){ //for each sample |
444 | kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave | 444 | kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave |
445 | - apw = a * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave | ||
446 | - samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction | 445 | + apw = a * A * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave |
446 | + samples[i] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction | ||
447 | } | 447 | } |
448 | return samples; | 448 | return samples; |
449 | } | 449 | } |
stim/optics/scalarfield.h
@@ -143,8 +143,8 @@ namespace stim{ | @@ -143,8 +143,8 @@ namespace stim{ | ||
143 | 143 | ||
144 | 144 | ||
145 | /// Propagate a field slice along its orthogonal direction by a given distance z | 145 | /// Propagate a field slice along its orthogonal direction by a given distance z |
146 | - /// @param Enew is the resulting propogated field | ||
147 | - /// @param E is the field to be propogated | 146 | + /// @param Enew is the resulting propagated field |
147 | + /// @param E is the field to be propagated | ||
148 | /// @param sx is the size of the field in the lateral x direction | 148 | /// @param sx is the size of the field in the lateral x direction |
149 | /// @param sy is the size of the field in the lateral y direction | 149 | /// @param sy is the size of the field in the lateral y direction |
150 | /// @param z is the distance to be propagated | 150 | /// @param z is the distance to be propagated |
@@ -187,12 +187,13 @@ namespace stim{ | @@ -187,12 +187,13 @@ namespace stim{ | ||
187 | 187 | ||
188 | if(kx_sq + ky_sq < k_sq){ | 188 | if(kx_sq + ky_sq < k_sq){ |
189 | kz = sqrt(k_sq - kx_sq - ky_sq); //estimate kz using the Fresnel approximation | 189 | kz = sqrt(k_sq - kx_sq - ky_sq); //estimate kz using the Fresnel approximation |
190 | - shift = -exp(stim::complex<T>(0, kz * z)); | 190 | + shift = exp(stim::complex<T>(0, kz * z)); |
191 | + //std::cout << shift << std::endl; | ||
191 | K[i] *= shift; | 192 | K[i] *= shift; |
192 | K[i] /= (nx*ny); //normalize the DFT | 193 | K[i] /= (nx*ny); //normalize the DFT |
193 | } | 194 | } |
194 | else{ | 195 | else{ |
195 | - K[i] = 0; | 196 | + K[i] /= (nx*ny); |
196 | } | 197 | } |
197 | } | 198 | } |
198 | } | 199 | } |
@@ -342,7 +343,7 @@ public: | @@ -342,7 +343,7 @@ public: | ||
342 | T spacing(){ | 343 | T spacing(){ |
343 | T du = rect<T>::X.len() / R[0]; | 344 | T du = rect<T>::X.len() / R[0]; |
344 | T dv = rect<T>::Y.len() / R[1]; | 345 | T dv = rect<T>::Y.len() / R[1]; |
345 | - return min<T>(du, dv); | 346 | + return std::min<T>(du, dv); |
346 | } | 347 | } |
347 | 348 | ||
348 | /// Copy the field array to the GPU, if it isn't already there | 349 | /// Copy the field array to the GPU, if it isn't already there |
@@ -537,7 +538,7 @@ public: | @@ -537,7 +538,7 @@ public: | ||
537 | memset(E, 0, grid_bytes()); | 538 | memset(E, 0, grid_bytes()); |
538 | } | 539 | } |
539 | 540 | ||
540 | - void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer){ | 541 | + void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer, T minval = 0, T maxval = 0){ |
541 | 542 | ||
542 | if(loc == GPUmem){ | 543 | if(loc == GPUmem){ |
543 | T* image; | 544 | T* image; |
@@ -559,7 +560,10 @@ public: | @@ -559,7 +560,10 @@ public: | ||
559 | cuda_intensity<T><<< blocks, threads >>>(image, E, size()); | 560 | cuda_intensity<T><<< blocks, threads >>>(image, E, size()); |
560 | break; | 561 | break; |
561 | } | 562 | } |
562 | - stim::gpu2image<T>(image, filename, R[0], R[1], stim::cmBrewer); | 563 | + if (minval == maxval) |
564 | + stim::gpu2image<T>(image, filename, R[0], R[1], cmap); | ||
565 | + else | ||
566 | + stim::gpu2image<T>(image, filename, R[0], R[1], minval, maxval, cmap); | ||
563 | HANDLE_ERROR( cudaFree(image) ); | 567 | HANDLE_ERROR( cudaFree(image) ); |
564 | } | 568 | } |
565 | else{ | 569 | else{ |
@@ -579,7 +583,10 @@ public: | @@ -579,7 +583,10 @@ public: | ||
579 | stim::intensity(image, E, size()); | 583 | stim::intensity(image, E, size()); |
580 | break; | 584 | break; |
581 | } | 585 | } |
582 | - stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image | 586 | + if (minval == maxval) |
587 | + stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image | ||
588 | + else | ||
589 | + stim::cpu2image<T>(image, filename, R[0], R[1], minval, maxval, cmap); | ||
583 | free(image); //free the real image | 590 | free(image); //free the real image |
584 | } | 591 | } |
585 | } | 592 | } |
stim/optics/scalarmie.h
@@ -89,7 +89,7 @@ void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){ | @@ -89,7 +89,7 @@ void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){ | ||
89 | 89 | ||
90 | #define LOCAL_NL 16 | 90 | #define LOCAL_NL 16 |
91 | template<typename T> | 91 | template<typename T> |
92 | -__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){ | 92 | +__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){ |
93 | extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory | 93 | extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory |
94 | 94 | ||
95 | size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array | 95 | size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array |
@@ -98,7 +98,7 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* | @@ -98,7 +98,7 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* | ||
98 | (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions | 98 | (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions |
99 | (y == NULL) ? p[1] = 0 : p[1] = y[i]; | 99 | (y == NULL) ? p[1] = 0 : p[1] = y[i]; |
100 | (z == NULL) ? p[2] = 0 : p[2] = z[i]; | 100 | (z == NULL) ? p[2] = 0 : p[2] = z[i]; |
101 | - | 101 | + p = p - c; |
102 | T r = p.len(); //calculate the distance from the sphere | 102 | T r = p.len(); //calculate the distance from the sphere |
103 | if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) | 103 | if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) |
104 | T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT | 104 | T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT |
@@ -124,24 +124,27 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* | @@ -124,24 +124,27 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* | ||
124 | for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory | 124 | for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory |
125 | shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha ); | 125 | shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha ); |
126 | 126 | ||
127 | + complex<T> e, Ew; | ||
127 | for(size_t w = 0; w < nW; w++){ //for each plane wave | 128 | for(size_t w = 0; w < nW; w++){ //for each plane wave |
128 | cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere | 129 | cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere |
129 | Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation | 130 | Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation |
130 | Pl_1 = cos_phi; | 131 | Pl_1 = cos_phi; |
131 | - Ei += W[w].E() * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation | ||
132 | - Ei += W[w].E() * hlBl[1] * Pl_1; | 132 | + e = exp(complex<T>(0, W[w].kvec().dot(c))); |
133 | + Ew = W[w].E() * e; | ||
134 | + Ei += Ew * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation | ||
135 | + Ei += Ew * hlBl[1] * Pl_1; | ||
133 | 136 | ||
134 | #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) | 137 | #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) |
135 | for(l = 2; l <= LOCAL_NL; l++){ | 138 | for(l = 2; l <= LOCAL_NL; l++){ |
136 | Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) | 139 | Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) |
137 | - Ei += W[w].E() * hlBl[l] * Pl; //calculate and sum the current field order | 140 | + Ei += Ew * hlBl[l] * Pl; //calculate and sum the current field order |
138 | Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 | 141 | Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 |
139 | Pl_1 = Pl; | 142 | Pl_1 = Pl; |
140 | } | 143 | } |
141 | 144 | ||
142 | for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) | 145 | for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) |
143 | Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs | 146 | Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs |
144 | - Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl; | 147 | + Ei += Ew * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl; |
145 | Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 | 148 | Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 |
146 | Pl_1 = Pl; | 149 | Pl_1 = Pl; |
147 | } | 150 | } |
@@ -149,8 +152,18 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* | @@ -149,8 +152,18 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* | ||
149 | E[i] += Ei; //copy the result to device memory | 152 | E[i] += Ei; //copy the result to device memory |
150 | } | 153 | } |
151 | 154 | ||
155 | +///Calculate the scalar Mie scattered field on the GPU | ||
156 | +/// @param E (GPU) is the N x N destination scalar field | ||
157 | +/// @param N is the number fo elements of the scalar field in each direction | ||
158 | +/// @param x (GPU) is the grid of X coordinates for each point in E | ||
159 | +/// @param y (GPU) is the grid of Y coordinates for each point in E | ||
160 | +/// @param z (GPU) is the grid of Z coordinates for each point in E | ||
161 | +/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer | ||
162 | +/// @param a is the radius of the Mie scatterer | ||
163 | +/// @param n is the complex refractive index of the Mie scatterer | ||
164 | +/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables) | ||
152 | template<typename T> | 165 | template<typename T> |
153 | -void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ | 166 | +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ |
154 | 167 | ||
155 | size_t max_shared_mem = stim::sharedMemPerBlock(); | 168 | size_t max_shared_mem = stim::sharedMemPerBlock(); |
156 | size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); | 169 | size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); |
@@ -164,11 +177,11 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti | @@ -164,11 +177,11 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti | ||
164 | if(Nl <= LOCAL_NL) shared_mem = 0; | 177 | if(Nl <= LOCAL_NL) shared_mem = 0; |
165 | else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate | 178 | else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate |
166 | //std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; | 179 | //std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; |
167 | - cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel | 180 | + cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, c, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel |
168 | } | 181 | } |
169 | 182 | ||
170 | template<typename T> | 183 | template<typename T> |
171 | -__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ | 184 | +__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){ |
172 | size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array | 185 | size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array |
173 | if(i >= N) return; //exit if this thread is outside the array | 186 | if(i >= N) return; //exit if this thread is outside the array |
174 | 187 | ||
@@ -177,10 +190,21 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ | @@ -177,10 +190,21 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ | ||
177 | (y == NULL) ? p[1] = 0 : p[1] = y[i]; | 190 | (y == NULL) ? p[1] = 0 : p[1] = y[i]; |
178 | (z == NULL) ? p[2] = 0 : p[2] = z[i]; | 191 | (z == NULL) ? p[2] = 0 : p[2] = z[i]; |
179 | 192 | ||
180 | - r[i] = p.len(); | 193 | + r[i] = (p - c).len(); |
181 | } | 194 | } |
195 | + | ||
196 | +///Calculate the scalar Mie scattered field on the GPU | ||
197 | +/// @param E (GPU) is the N x N destination scalar field | ||
198 | +/// @param N is the number fo elements of the scalar field in each direction | ||
199 | +/// @param x (GPU) is the grid of X coordinates for each point in E | ||
200 | +/// @param y (GPU) is the grid of Y coordinates for each point in E | ||
201 | +/// @param z (GPU) is the grid of Z coordinates for each point in E | ||
202 | +/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer | ||
203 | +/// @param a is the radius of the Mie scatterer | ||
204 | +/// @param n is the complex refractive index of the Mie scatterer | ||
205 | +/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables) | ||
182 | template<typename T> | 206 | template<typename T> |
183 | -void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ | 207 | +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0), T r_spacing = 0.1){ |
184 | 208 | ||
185 | //calculate the necessary number of orders required to represent the scattered field | 209 | //calculate the necessary number of orders required to represent the scattered field |
186 | T k = W[0].kmag(); | 210 | T k = W[0].kmag(); |
@@ -205,7 +229,7 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -205,7 +229,7 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
205 | 229 | ||
206 | int threads = stim::maxThreadsPerBlock(); | 230 | int threads = stim::maxThreadsPerBlock(); |
207 | dim3 blocks((unsigned)(N / threads + 1)); | 231 | dim3 blocks((unsigned)(N / threads + 1)); |
208 | - cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N); | 232 | + cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N, c); |
209 | 233 | ||
210 | //Find the minimum and maximum values of r | 234 | //Find the minimum and maximum values of r |
211 | cublasStatus_t stat; | 235 | cublasStatus_t stat; |
@@ -273,10 +297,16 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -273,10 +297,16 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
273 | stim::complex<T>* dev_hB_lut; | 297 | stim::complex<T>* dev_hB_lut; |
274 | HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); | 298 | HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); |
275 | HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); | 299 | HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); |
300 | + //std::cout << "r_min: " << r_min << std::endl; | ||
301 | + //std::cout << "dr: " << dr << std::endl; | ||
302 | + gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, c, dev_hB_lut, r_min, dr, N_hB_lut, Nl); | ||
276 | 303 | ||
277 | - gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); | 304 | + HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost)); //copy the field from device memory |
305 | + | ||
306 | + HANDLE_ERROR(cudaFree(dev_hB_lut)); | ||
307 | + HANDLE_ERROR(cudaFree(dev_r)); | ||
308 | + HANDLE_ERROR(cudaFree(dev_W)); | ||
278 | 309 | ||
279 | - cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory | ||
280 | } | 310 | } |
281 | /// Calculate the scalar Mie solution for the scattered field produced by a single plane wave | 311 | /// Calculate the scalar Mie solution for the scattered field produced by a single plane wave |
282 | 312 | ||
@@ -289,10 +319,10 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -289,10 +319,10 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
289 | /// @param a is the radius of the sphere | 319 | /// @param a is the radius of the sphere |
290 | /// @param n is the complex refractive index of the sphere | 320 | /// @param n is the complex refractive index of the sphere |
291 | template<typename T> | 321 | template<typename T> |
292 | -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ | 322 | +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){ |
293 | 323 | ||
294 | 324 | ||
295 | -#ifdef CUDA_FOUND | 325 | +/*#ifdef CUDA_FOUND |
296 | stim::complex<T>* dev_E; //allocate space for the field | 326 | stim::complex<T>* dev_E; //allocate space for the field |
297 | cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); | 327 | cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); |
298 | cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); | 328 | cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); |
@@ -315,13 +345,14 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -315,13 +345,14 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
315 | HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); | 345 | HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); |
316 | } | 346 | } |
317 | 347 | ||
318 | - gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, r_spacing); | 348 | + gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, c, r_spacing); |
319 | 349 | ||
320 | if(x != NULL) cudaFree(dev_x); //free everything | 350 | if(x != NULL) cudaFree(dev_x); //free everything |
321 | if(y != NULL) cudaFree(dev_y); | 351 | if(y != NULL) cudaFree(dev_y); |
322 | if(z != NULL) cudaFree(dev_z); | 352 | if(z != NULL) cudaFree(dev_z); |
323 | cudaFree(dev_E); | 353 | cudaFree(dev_E); |
324 | #else | 354 | #else |
355 | +*/ | ||
325 | 356 | ||
326 | //calculate the necessary number of orders required to represent the scattered field | 357 | //calculate the necessary number of orders required to represent the scattered field |
327 | T k = W[0].kmag(); | 358 | T k = W[0].kmag(); |
@@ -345,15 +376,18 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -345,15 +376,18 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
345 | 376 | ||
346 | T r, kr, cos_phi; | 377 | T r, kr, cos_phi; |
347 | stim::complex<T> h; | 378 | stim::complex<T> h; |
379 | + stim::complex<T> Ew; | ||
348 | for(size_t i = 0; i < N; i++){ | 380 | for(size_t i = 0; i < N; i++){ |
349 | stim::vec3<T> p; //declare a 3D point | 381 | stim::vec3<T> p; //declare a 3D point |
350 | 382 | ||
351 | (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions | 383 | (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions |
352 | (y == NULL) ? p[1] = 0 : p[1] = y[i]; | 384 | (y == NULL) ? p[1] = 0 : p[1] = y[i]; |
353 | (z == NULL) ? p[2] = 0 : p[2] = z[i]; | 385 | (z == NULL) ? p[2] = 0 : p[2] = z[i]; |
386 | + p = p - c; | ||
354 | r = p.len(); | 387 | r = p.len(); |
355 | if(r >= a){ | 388 | if(r >= a){ |
356 | for(size_t w = 0; w < W.size(); w++){ | 389 | for(size_t w = 0; w < W.size(); w++){ |
390 | + Ew = W[w].E() * exp(stim::complex<double>(0, W[w].kvec().dot(c))); | ||
357 | kr = p.len() * W[w].kmag(); //calculate k*r | 391 | kr = p.len() * W[w].kmag(); //calculate k*r |
358 | stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr); | 392 | stim::bessjyv_sph<double>(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr); |
359 | cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction | 393 | cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction |
@@ -362,18 +396,18 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -362,18 +396,18 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
362 | for(size_t l = 0; l <= Nl; l++){ | 396 | for(size_t l = 0; l <= Nl; l++){ |
363 | h.r = j_kr[l]; | 397 | h.r = j_kr[l]; |
364 | h.i = y_kr[l]; | 398 | h.i = y_kr[l]; |
365 | - E[i] += W[w].E() * B[l] * h * P[l]; | 399 | + E[i] += Ew * B[l] * h * P[l]; |
366 | } | 400 | } |
367 | } | 401 | } |
368 | } | 402 | } |
369 | } | 403 | } |
370 | -#endif | 404 | +//#endif |
371 | } | 405 | } |
372 | 406 | ||
373 | template<typename T> | 407 | template<typename T> |
374 | -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){ | 408 | +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0), T r_spacing = 0.1){ |
375 | std::vector< stim::scalarwave<T> > W(1, w); | 409 | std::vector< stim::scalarwave<T> > W(1, w); |
376 | - cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing); | 410 | + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, c, r_spacing); |
377 | } | 411 | } |
378 | 412 | ||
379 | template<typename T> | 413 | template<typename T> |
@@ -389,14 +423,14 @@ __global__ void cuda_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* | @@ -389,14 +423,14 @@ __global__ void cuda_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* | ||
389 | 423 | ||
390 | T r = p.len(); //calculate the distance from the sphere | 424 | T r = p.len(); //calculate the distance from the sphere |
391 | if(r >= a) return; //exit if the point is inside the sphere (we only calculate the internal field) | 425 | if(r >= a) return; //exit if the point is inside the sphere (we only calculate the internal field) |
392 | - T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT | 426 | + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT |
393 | size_t ij = (size_t) fij; //convert to an integral index | 427 | size_t ij = (size_t) fij; //convert to an integral index |
394 | T alpha = fij - ij; //calculate the fractional portion of the index | 428 | T alpha = fij - ij; //calculate the fractional portion of the index |
395 | - size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT | ||
396 | - size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT | 429 | + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT |
430 | + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT | ||
397 | 431 | ||
398 | T cos_phi; | 432 | T cos_phi; |
399 | - T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials | 433 | + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials |
400 | 434 | ||
401 | stim::complex<T> jAl; | 435 | stim::complex<T> jAl; |
402 | stim::complex<T> Ei = 0; //create a register to store the result | 436 | stim::complex<T> Ei = 0; //create a register to store the result |
@@ -415,7 +449,7 @@ __global__ void cuda_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* | @@ -415,7 +449,7 @@ __global__ void cuda_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* | ||
415 | for(size_t w = 0; w < nW; w++){ //for each plane wave | 449 | for(size_t w = 0; w < nW; w++){ //for each plane wave |
416 | if(r == 0) cos_phi = 0; | 450 | if(r == 0) cos_phi = 0; |
417 | else | 451 | else |
418 | - cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere | 452 | + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere |
419 | Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation | 453 | Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation |
420 | Pl_1 = cos_phi; | 454 | Pl_1 = cos_phi; |
421 | Ei += W[w].E() * jlAl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation | 455 | Ei += W[w].E() * jlAl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation |
@@ -468,7 +502,7 @@ void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -468,7 +502,7 @@ void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
468 | /// @param a is the radius of the sphere | 502 | /// @param a is the radius of the sphere |
469 | /// @param n is the complex refractive index of the sphere | 503 | /// @param n is the complex refractive index of the sphere |
470 | template<typename T> | 504 | template<typename T> |
471 | -void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, T r_spacing = 0.1){ | 505 | +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){ |
472 | //calculate the necessary number of orders required to represent the scattered field | 506 | //calculate the necessary number of orders required to represent the scattered field |
473 | T k = W[0].kmag(); | 507 | T k = W[0].kmag(); |
474 | 508 | ||
@@ -480,7 +514,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -480,7 +514,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
480 | stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients | 514 | stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients |
481 | A_coefficients(A, a, k, n, Nl); | 515 | A_coefficients(A, a, k, n, Nl); |
482 | 516 | ||
483 | -#ifdef CUDA_FOUND | 517 | +/*#ifdef CUDA_FOUND |
484 | stim::complex<T>* dev_E; //allocate space for the field | 518 | stim::complex<T>* dev_E; //allocate space for the field |
485 | cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); | 519 | cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); |
486 | cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); | 520 | cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); |
@@ -538,6 +572,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -538,6 +572,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
538 | printf ("CUBLAS Error: failed to calculate maximum r value.\n"); | 572 | printf ("CUBLAS Error: failed to calculate maximum r value.\n"); |
539 | exit(1); | 573 | exit(1); |
540 | } | 574 | } |
575 | + cublasDestroy(handle); //destroy the CUBLAS handle | ||
541 | 576 | ||
542 | i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility | 577 | i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility |
543 | i_max--; | 578 | i_max--; |
@@ -585,9 +620,9 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -585,9 +620,9 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
585 | HANDLE_ERROR( cudaFree(dev_E) ); | 620 | HANDLE_ERROR( cudaFree(dev_E) ); |
586 | HANDLE_ERROR( cudaFree(dev_W) ); | 621 | HANDLE_ERROR( cudaFree(dev_W) ); |
587 | HANDLE_ERROR( cudaFree(dev_r) ); | 622 | HANDLE_ERROR( cudaFree(dev_r) ); |
588 | - cudaFree(dev_E); | 623 | + HANDLE_ERROR( cudaFree(dev_E) ); |
589 | #else | 624 | #else |
590 | - | 625 | +*/ |
591 | //allocate space to store the bessel function call results | 626 | //allocate space to store the bessel function call results |
592 | double vm; | 627 | double vm; |
593 | stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); | 628 | stim::complex<double>* j_knr = (stim::complex<double>*) malloc( (Nl + 1) * sizeof(stim::complex<double>) ); |
@@ -600,16 +635,19 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -600,16 +635,19 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
600 | T r, cos_phi; | 635 | T r, cos_phi; |
601 | stim::complex<double> knr; | 636 | stim::complex<double> knr; |
602 | stim::complex<T> h; | 637 | stim::complex<T> h; |
638 | + stim::complex<T> Ew; | ||
603 | for(size_t i = 0; i < N; i++){ | 639 | for(size_t i = 0; i < N; i++){ |
604 | stim::vec3<T> p; //declare a 3D point | 640 | stim::vec3<T> p; //declare a 3D point |
605 | 641 | ||
606 | (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions | 642 | (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions |
607 | (y == NULL) ? p[1] = 0 : p[1] = y[i]; | 643 | (y == NULL) ? p[1] = 0 : p[1] = y[i]; |
608 | (z == NULL) ? p[2] = 0 : p[2] = z[i]; | 644 | (z == NULL) ? p[2] = 0 : p[2] = z[i]; |
645 | + p = p - c; | ||
609 | r = p.len(); | 646 | r = p.len(); |
610 | if(r < a){ | 647 | if(r < a){ |
611 | E[i] = 0; | 648 | E[i] = 0; |
612 | for(size_t w = 0; w < W.size(); w++){ | 649 | for(size_t w = 0; w < W.size(); w++){ |
650 | + Ew = W[w].E() * exp(stim::complex<double>(0, W[w].kvec().dot(c))); | ||
613 | knr = (stim::complex<double>)n * p.len() * W[w].kmag(); //calculate k*n*r | 651 | knr = (stim::complex<double>)n * p.len() * W[w].kmag(); //calculate k*n*r |
614 | 652 | ||
615 | stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr); | 653 | stim::cbessjyva_sph<double>(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr); |
@@ -620,18 +658,18 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -620,18 +658,18 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
620 | stim::legendre<T>(Nl, cos_phi, P); | 658 | stim::legendre<T>(Nl, cos_phi, P); |
621 | 659 | ||
622 | for(size_t l = 0; l <= Nl; l++){ | 660 | for(size_t l = 0; l <= Nl; l++){ |
623 | - E[i] += W[w].E() * A[l] * (stim::complex<T>)j_knr[l] * P[l]; | 661 | + E[i] += Ew * A[l] * (stim::complex<T>)j_knr[l] * P[l]; |
624 | } | 662 | } |
625 | } | 663 | } |
626 | } | 664 | } |
627 | } | 665 | } |
628 | -#endif | 666 | +//#endif |
629 | } | 667 | } |
630 | 668 | ||
631 | template<typename T> | 669 | template<typename T> |
632 | -void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, T r_spacing = 0.1){ | 670 | +void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w, T a, stim::complex<T> n, stim::vec3<T> c = stim::vec3<T>(0, 0, 0)){ |
633 | std::vector< stim::scalarwave<T> > W(1, w); | 671 | std::vector< stim::scalarwave<T> > W(1, w); |
634 | - cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, r_spacing); | 672 | + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, c); |
635 | } | 673 | } |
636 | 674 | ||
637 | 675 | ||
@@ -639,59 +677,89 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -639,59 +677,89 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
639 | template<typename T> | 677 | template<typename T> |
640 | class scalarmie | 678 | class scalarmie |
641 | { | 679 | { |
642 | -private: | 680 | +public: |
643 | T radius; //radius of the scattering sphere | 681 | T radius; //radius of the scattering sphere |
644 | stim::complex<T> n; //refractive index of the scattering sphere | 682 | stim::complex<T> n; //refractive index of the scattering sphere |
683 | + vec3<T> c; //position of the sphere in space | ||
645 | 684 | ||
646 | public: | 685 | public: |
647 | 686 | ||
648 | - scalarmie(T r, stim::complex<T> ri){ | 687 | + scalarmie() { //default constructor |
688 | + radius = 0.5; | ||
689 | + n = stim::complex<T>(1.4, 0.0); | ||
690 | + c = stim::vec3<T>(0, 0, 0); | ||
691 | + } | ||
692 | + | ||
693 | + scalarmie(T r, stim::complex<T> ri, stim::vec3<T> center = stim::vec3<T>(0, 0, 0)){ | ||
649 | radius = r; | 694 | radius = r; |
650 | n = ri; | 695 | n = ri; |
696 | + c = center; | ||
697 | + //c = stim::vec3<T>(2, 1, 0); | ||
651 | } | 698 | } |
652 | 699 | ||
653 | - void sum_scat(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){ | ||
654 | - std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | ||
655 | - stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | ||
656 | - } | 700 | + //void sum_scat(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){ |
701 | + // std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | ||
702 | + // stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | ||
703 | + //} | ||
657 | 704 | ||
658 | - void sum_intern(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){ | ||
659 | - std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | ||
660 | - stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | ||
661 | - } | 705 | + //void sum_intern(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){ |
706 | + // std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | ||
707 | + // stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | ||
708 | + //} | ||
662 | 709 | ||
663 | - void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ | ||
664 | - b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion | ||
665 | - std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | ||
666 | - sum_scat(E, X, Y, Z, b, samples); | ||
667 | - sum_intern(E, X, Y, Z, b, samples); | ||
668 | - } | 710 | + //void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ |
711 | + // b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion | ||
712 | + // std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | ||
713 | + // sum_scat(E, X, Y, Z, b, samples); | ||
714 | + // sum_intern(E, X, Y, Z, b, samples); | ||
715 | + //} | ||
669 | 716 | ||
670 | void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ | 717 | void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ |
671 | 718 | ||
672 | - /*size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid | ||
673 | - float* X = (float*) malloc( array_size ); //allocate space for the coordinate meshes | ||
674 | - float* Y = (float*) malloc( array_size ); | ||
675 | - float* Z = (float*) malloc( array_size ); | ||
676 | - E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes | ||
677 | - */ | ||
678 | E.meshgrid(); | 719 | E.meshgrid(); |
679 | b.eval(E, order); | 720 | b.eval(E, order); |
680 | 721 | ||
681 | std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | 722 | std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves |
682 | 723 | ||
683 | if(E.gpu()){ | 724 | if(E.gpu()){ |
684 | - stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); | 725 | + stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c, E.spacing()); |
685 | } | 726 | } |
686 | else{ | 727 | else{ |
687 | stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); | 728 | stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); |
688 | stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); | 729 | stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); |
689 | } | 730 | } |
690 | - //eval(E, X, Y, Z, b, order, samples); //evaluate the field | ||
691 | } | 731 | } |
692 | 732 | ||
693 | }; //end stim::scalarmie | 733 | }; //end stim::scalarmie |
694 | 734 | ||
735 | +template<typename T> | ||
736 | +class scalarcluster : public std::vector< scalarmie<T> > { | ||
737 | +public: | ||
738 | + | ||
739 | + void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000) { | ||
740 | + E.meshgrid(); | ||
741 | + b.eval(E, order); | ||
742 | + | ||
743 | + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | ||
744 | + | ||
745 | + T radius; | ||
746 | + stim::complex<T> n; | ||
747 | + stim::vec3<T> c; | ||
748 | + for (size_t si = 0; si < size(); si++) { //for each sphere in the cluster | ||
749 | + radius = at(si).radius; | ||
750 | + n = at(si).n; | ||
751 | + c = at(si).c; | ||
752 | + if (E.gpu()) { | ||
753 | + stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c, E.spacing()); | ||
754 | + } | ||
755 | + else { | ||
756 | + stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c); | ||
757 | + stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c); | ||
758 | + } | ||
759 | + } | ||
760 | + } | ||
761 | +}; | ||
762 | + | ||
695 | } //end namespace stim | 763 | } //end namespace stim |
696 | 764 | ||
697 | #endif | 765 | #endif |
698 | \ No newline at end of file | 766 | \ No newline at end of file |
stim/optics/scalarwave.h
@@ -252,7 +252,7 @@ void gpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scal | @@ -252,7 +252,7 @@ void gpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scal | ||
252 | size_t batch_size; //declare a variable to store the size of the current batch | 252 | size_t batch_size; //declare a variable to store the size of the current batch |
253 | size_t waves_processed = 0; //initialize the number of waves processed to zero | 253 | size_t waves_processed = 0; //initialize the number of waves processed to zero |
254 | while(waves_processed < nW){ //while there are still waves to be processed | 254 | while(waves_processed < nW){ //while there are still waves to be processed |
255 | - batch_size = min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left | 255 | + batch_size = std::min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left |
256 | batch_bytes = batch_size * sizeof(stim::scalarwave<T>); | 256 | batch_bytes = batch_size * sizeof(stim::scalarwave<T>); |
257 | HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory | 257 | HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory |
258 | cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size); //call the kernel | 258 | cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size); //call the kernel |
stim/parser/table.h
1 | -#ifndef STIM_CSV_H | ||
2 | -#define STIM_CSV_H | 1 | +#ifndef STIM_TABLE_H |
2 | +#define STIM_TABLE_H | ||
3 | 3 | ||
4 | #include <type_traits> | 4 | #include <type_traits> |
5 | #include <vector> | 5 | #include <vector> |
@@ -102,6 +102,27 @@ namespace stim{ | @@ -102,6 +102,27 @@ namespace stim{ | ||
102 | return result; //return the casted table | 102 | return result; //return the casted table |
103 | } | 103 | } |
104 | 104 | ||
105 | + size_t rows(){ | ||
106 | + return TABLE.size(); | ||
107 | + } | ||
108 | + | ||
109 | + size_t cols(size_t ri){ | ||
110 | + return TABLE[ri].size(); | ||
111 | + } | ||
112 | + | ||
113 | + //returns the maximum number of columns in the table | ||
114 | + size_t cols(){ | ||
115 | + size_t max_c = 0; | ||
116 | + for(size_t ri = 0; ri < rows(); ri++){ | ||
117 | + max_c = std::max(max_c, cols(ri)); | ||
118 | + } | ||
119 | + return max_c; | ||
120 | + } | ||
121 | + | ||
122 | + std::string operator()(size_t ri, size_t ci){ | ||
123 | + return TABLE[ri][ci]; | ||
124 | + } | ||
125 | + | ||
105 | }; | 126 | }; |
106 | }; | 127 | }; |
107 | 128 |