Commit 6a53ac0c677590fee03b1fc26a0f56f8e07c5055

Authored by David Mayerich
1 parent 59b0e5f5

updated optics files and FindSTIM to support new CMake standards

cmake/FindSTIM.cmake
@@ -3,20 +3,25 @@ @@ -3,20 +3,25 @@
3 3
4 include(FindPackageHandleStandardArgs) 4 include(FindPackageHandleStandardArgs)
5 5
6 -set(STIM_INCLUDE_DIR $ENV{STIMLIB_PATH}) 6 +set(STIM_ROOT $ENV{STIM_ROOT})
7 7
8 -find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR) 8 +IF(NOT UNIX)
  9 + IF(NOT STIM_ROOT)
  10 + MESSAGE("ERROR: STIM_ROOT environment variable must be set!")
  11 + ENDIF(NOT STIM_ROOT)
  12 +
  13 + FIND_PATH(STIM_INCLUDE_DIRS DOC "Path to GLFW include directory."
  14 + NAMES stim/image/image.h
  15 + PATHS ${STIM_ROOT})
  16 +ENDIF(NOT UNIX)
  17 +
  18 +find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIRS)
9 19
10 if(STIM_FOUND) 20 if(STIM_FOUND)
11 - set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIR}) 21 + set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIRS})
12 elseif(STIM_FOUND) 22 elseif(STIM_FOUND)
13 - #if the STIM library isn't found, download it  
14 - #file(REMOVE_RECURSE ${CMAKE_BINARY_DIR}/stimlib) #remove the stimlib directory if it exists  
15 - #set(STIM_GIT "https://git.stim.ee.uh.edu/codebase/stimlib.git")  
16 - #execute_process(COMMAND git clone --depth 1 ${STIM_GIT} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})  
17 - #set(STIM_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/stimlib" CACHE TYPE PATH)  
18 - message("STIM library not found. Set the STIMLIB_PATH environment variable to the STIMLIB location.") 23 + message("STIM library not found. Set the STIM_ROOT environment variable to the STIM location.")
19 message("STIMLIB can be found here: https://git.stim.ee.uh.edu/codebase/stimlib") 24 message("STIMLIB can be found here: https://git.stim.ee.uh.edu/codebase/stimlib")
20 endif(STIM_FOUND) 25 endif(STIM_FOUND)
21 26
22 -find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR) 27 +find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIRS)
@@ -128,7 +128,10 @@ public: @@ -128,7 +128,10 @@ public:
128 return false; 128 return false;
129 size_t targetBytes = H.data_bytes(); //get the number of bytes that SHOULD be in the data file 129 size_t targetBytes = H.data_bytes(); //get the number of bytes that SHOULD be in the data file
130 size_t bytes = stim::file_size(fname); 130 size_t bytes = stim::file_size(fname);
131 - if(bytes != targetBytes) return false; //if the data doesn't match the header, return false 131 + if (bytes != targetBytes) {
  132 + std::cout << "ERROR: File size mismatch. Based on the header, a " << targetBytes << " byte file was expected. The data file contains " << bytes << " bytes." << std::endl;
  133 + return false; //if the data doesn't match the header, return false
  134 + }
132 return true; //otherwise everything looks fine 135 return true; //otherwise everything looks fine
133 136
134 } 137 }
stim/math/complex.h
@@ -11,7 +11,7 @@ @@ -11,7 +11,7 @@
11 11
12 namespace stim 12 namespace stim
13 { 13 {
14 - enum complexComponentType {complexReal, complexImaginary, complexMag, complexIntensity}; 14 + enum complexComponentType {complexFull, complexReal, complexImaginary, complexMag, complexIntensity};
15 15
16 template <class T> 16 template <class T>
17 struct complex 17 struct complex
stim/optics/scalarbeam.h
@@ -415,8 +415,8 @@ public: @@ -415,8 +415,8 @@ public:
415 A = amplitude; 415 A = amplitude;
416 f = focal_point; 416 f = focal_point;
417 d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on) 417 d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on)
418 - NA[0] = numerical_aperture;  
419 - NA[1] = center_obsc; 418 + NA[0] = center_obsc;
  419 + NA[1] = numerical_aperture;
420 } 420 }
421 421
422 ///Numerical Aperature functions 422 ///Numerical Aperature functions
@@ -425,21 +425,32 @@ public: @@ -425,21 +425,32 @@ public:
425 NA[0] = (T)0; 425 NA[0] = (T)0;
426 NA[1] = na; 426 NA[1] = na;
427 } 427 }
428 - void setNA(T na0, T na1) 428 + void setNA(T na_in, T na_out)
429 { 429 {
430 - NA[0] = na0;  
431 - NA[1] = na1; 430 + NA[0] = na_in;
  431 + NA[1] = na_out;
432 } 432 }
433 433
434 //Monte-Carlo decomposition into plane waves 434 //Monte-Carlo decomposition into plane waves
435 std::vector< scalarwave<T> > mc(size_t N = 100000) const{ 435 std::vector< scalarwave<T> > mc(size_t N = 100000) const{
436 436
437 - std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus  
438 - std::vector< scalarwave<T> > samples(N); //create a vector of plane waves  
439 T kmag = (T)stim::TAU / lambda; //calculate the wavenumber 437 T kmag = (T)stim::TAU / lambda; //calculate the wavenumber
  438 + stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction
440 stim::complex<T> apw; //allocate space for the amplitude at the focal point 439 stim::complex<T> apw; //allocate space for the amplitude at the focal point
  440 +
  441 + //deal with the degenerative case where the outer NA is 0, in which case the beam will be specified as a single plane wave
  442 + if (NA[1] == 0) {
  443 + std::vector< scalarwave<T> > samples(1); //create a vector containing 1 sample
  444 + kpw = d * kmag; //calculate the k-vector for the plane wave (beam direction scaled by wavenumber)
  445 + apw = A * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude of the plane wave
  446 + samples[0] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction
  447 + return samples;
  448 + }
  449 +
  450 + //otherwise, evaluate the system using N monte-carlo samples
  451 + std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus
  452 + std::vector< scalarwave<T> > samples(N); //create a vector of plane waves
441 T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N) 453 T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N)
442 - stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction  
443 for(size_t i=0; i<N; i++){ //for each sample 454 for(size_t i=0; i<N; i++){ //for each sample
444 kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave 455 kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave
445 apw = a * A * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave 456 apw = a * A * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave
@@ -449,17 +460,18 @@ public: @@ -449,17 +460,18 @@ public:
449 } 460 }
450 461
451 void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, int order = 500){ 462 void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, int order = 500){
452 - cpu_scalar_psf_cart<T>(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing()); 463 + cpu_scalar_psf_cart<T>(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[1], NA[0], order, E.spacing());
453 } 464 }
454 465
455 /// Evaluate the beam to a scalar field using Debye focusing 466 /// Evaluate the beam to a scalar field using Debye focusing
456 void eval(stim::scalarfield<T>& E, int order = 500){ 467 void eval(stim::scalarfield<T>& E, int order = 500){
457 - E.meshgrid(); //calculate a meshgrid if one isn't created 468 + E.meshgrid(); //calculate a meshgrid if one isn't created
  469 +
458 if(E.gpu()) 470 if(E.gpu())
459 - gpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); 471 + gpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[1], NA[0], order, E.spacing());
460 else 472 else
461 - cpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing());  
462 - //eval(E, E.x(), E.y(), E.z(), order); 473 + cpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[1], NA[0], order, E.spacing());
  474 +
463 } 475 }
464 476
465 /// Calculate the field at a given point 477 /// Calculate the field at a given point
stim/optics/scalarfield.h
@@ -538,14 +538,31 @@ public: @@ -538,14 +538,31 @@ public:
538 memset(E, 0, grid_bytes()); 538 memset(E, 0, grid_bytes());
539 } 539 }
540 540
541 - void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer, T minval = 0, T maxval = 0){ 541 + //write the field as a raw image to disk
  542 + void image_raw(std::string filename) {
  543 + if (loc == GPUmem) {
  544 + T* cpu_field = (T*)malloc(sizeof(T) * 2 * size()); //allocate temporary space on the CPU to store the image
  545 + HANDLE_ERROR(cudaMemcpy(cpu_field, E, sizeof(T) * 2 * size(), cudaMemcpyDeviceToHost)); //copy the field data from the GPU to the CPU
  546 + std::ofstream outfile(filename, std::ios::binary); //open a binary file for writing
  547 + outfile.write((const char*)cpu_field, sizeof(T) * 2 * size()); //write the raw field to disk
  548 + free(cpu_field); //free memory
  549 + }
  550 + //if the data is stored on the CPU, no need to cut it - just save it to disk
  551 + else {
  552 + std::ofstream outfile(filename, std::ios::binary); //open a binary file for writing
  553 + outfile.write((const char*)E, sizeof(T) * 2 * size()); //write the raw field to disk
  554 + }
  555 + }
542 556
  557 + void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer, T minval = 0, T maxval = 0){
  558 +
543 if(loc == GPUmem){ 559 if(loc == GPUmem){
544 T* image; 560 T* image;
545 HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) ); 561 HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) );
546 int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device 562 int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
547 dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks 563 dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks
548 564
  565 + // if the data is located on the GPU, execute a kernel that converts the image to the requested data type
549 switch(type){ 566 switch(type){
550 case complexMag: 567 case complexMag:
551 cuda_abs<T><<< blocks, threads >>>(image, E, size()); 568 cuda_abs<T><<< blocks, threads >>>(image, E, size());
@@ -559,6 +576,9 @@ public: @@ -559,6 +576,9 @@ public:
559 case complexIntensity: 576 case complexIntensity:
560 cuda_intensity<T><<< blocks, threads >>>(image, E, size()); 577 cuda_intensity<T><<< blocks, threads >>>(image, E, size());
561 break; 578 break;
  579 + default:
  580 + std::cout << "ERROR: invalid complex component specified." << std::endl;
  581 + exit(1);
562 } 582 }
563 if (minval == maxval) 583 if (minval == maxval)
564 stim::gpu2image<T>(image, filename, R[0], R[1], cmap); 584 stim::gpu2image<T>(image, filename, R[0], R[1], cmap);
stim/optics/scalarmie.h
@@ -88,11 +88,28 @@ void A_coefficients(stim::complex&lt;T&gt;* A, T a, T k, stim::complex&lt;T&gt; n, int Nl){ @@ -88,11 +88,28 @@ void A_coefficients(stim::complex&lt;T&gt;* A, T a, T k, stim::complex&lt;T&gt; n, int Nl){
88 } 88 }
89 89
90 #define LOCAL_NL 16 90 #define LOCAL_NL 16
  91 +
  92 +/// CUDA kernel for calculating the Mie scattering solution given a set of points (x, y, z), a list of plane waves, and a look-up table for Bl*hl
  93 +/// @param E (GPU) is the N x N destination scalar field
  94 +/// @param N is the number of sample points to evaluate
  95 +/// @param x (GPU) is the grid of X coordinates for each point in E
  96 +/// @param y (GPU) is the grid of Y coordinates for each point in E
  97 +/// @param z (GPU) is the grid of Z coordinates for each point in E
  98 +/// @param W (GPU) is an array of coherent scalar plane waves incident on the Mie scatterer
  99 +/// @param nW is the number of plane waves to evaluate (sum)
  100 +/// @param a is the radius of the Mie scatterer
  101 +/// @param n is the complex refractive index of the Mie scatterer
  102 +/// @param c is the position of the sphere in (x, y, z)
  103 +/// @param hB (GPU) is a look-up table of Hankel functions (equally spaced in distance from the sphere) pre-multiplied with scattering coefficients
  104 +/// @param kr_min is the minimum kr value in the hB look-up table (corresponding to the closest point to the sphere)
  105 +/// @param dkr is the spacing (in kr) between samples of the hB look-up table
  106 +/// @param N_hB is the number of samples in hB
  107 +/// @param Nl is the order of the calculation (number of Hankel function orders)
91 template<typename T> 108 template<typename T>
92 __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){ 109 __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){
93 - extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory 110 + extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory
94 111
95 - size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array 112 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the sample array (sample point associated with this thread)
96 if(i >= N) return; //exit if this thread is outside the array 113 if(i >= N) return; //exit if this thread is outside the array
97 stim::vec3<T> p; 114 stim::vec3<T> p;
98 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions 115 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
@@ -101,14 +118,14 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* @@ -101,14 +118,14 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T*
101 p = p - c; 118 p = p - c;
102 T r = p.len(); //calculate the distance from the sphere 119 T r = p.len(); //calculate the distance from the sphere
103 if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) 120 if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field)
104 - T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT 121 + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT
105 size_t ij = (size_t) fij; //convert to an integral index 122 size_t ij = (size_t) fij; //convert to an integral index
106 T alpha = fij - ij; //calculate the fractional portion of the index 123 T alpha = fij - ij; //calculate the fractional portion of the index
107 - size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT  
108 - size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT 124 + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT
  125 + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT
109 126
110 T cos_phi; 127 T cos_phi;
111 - T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials 128 + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials
112 129
113 stim::complex<T> hBl; 130 stim::complex<T> hBl;
114 stim::complex<T> Ei = 0; //create a register to store the result 131 stim::complex<T> Ei = 0; //create a register to store the result
@@ -154,25 +171,28 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* @@ -154,25 +171,28 @@ __global__ void cuda_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T*
154 E[i] += Ei; //copy the result to device memory 171 E[i] += Ei; //copy the result to device memory
155 } 172 }
156 173
157 -///Calculate the scalar Mie scattered field on the GPU 174 +///Calculate the scalar Mie scattered field on the GPU when a list of GPU-based pre-multiplied Hankel functions are available
158 /// @param E (GPU) is the N x N destination scalar field 175 /// @param E (GPU) is the N x N destination scalar field
159 /// @param N is the number fo elements of the scalar field in each direction 176 /// @param N is the number fo elements of the scalar field in each direction
160 /// @param x (GPU) is the grid of X coordinates for each point in E 177 /// @param x (GPU) is the grid of X coordinates for each point in E
161 /// @param y (GPU) is the grid of Y coordinates for each point in E 178 /// @param y (GPU) is the grid of Y coordinates for each point in E
162 /// @param z (GPU) is the grid of Z coordinates for each point in E 179 /// @param z (GPU) is the grid of Z coordinates for each point in E
163 -/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer 180 +/// @param W (GPU) is an array of coherent scalar plane waves incident on the Mie scatterer
  181 +/// @param nW is the number of plane waves to evaluate (sum)
164 /// @param a is the radius of the Mie scatterer 182 /// @param a is the radius of the Mie scatterer
165 /// @param n is the complex refractive index of the Mie scatterer 183 /// @param n is the complex refractive index of the Mie scatterer
166 -/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables) 184 +/// @param c is the position of the sphere in (x, y, z)
  185 +/// @param hB (GPU) is a look-up table of Hankel functions (equally spaced in distance from the sphere) pre-multiplied with scattering coefficients
  186 +/// @param kr_min is the minimum kr value in the hB look-up table (corresponding to the closest point to the sphere)
  187 +/// @param dkr is the spacing (in kr) between samples of the hB look-up table
  188 +/// @param N_hB is the number of samples in hB
  189 +/// @param Nl is the order of the calculation (number of Hankel function orders)
167 template<typename T> 190 template<typename T>
168 void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ 191 void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
169 192
170 - size_t max_shared_mem = stim::sharedMemPerBlock();  
171 - size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);  
172 - //std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;  
173 - //std::cout<<"shared memory: "<<max_shared_mem<<std::endl;  
174 - int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);  
175 - //std::cout<<"threads per block: "<<threads<<std::endl; 193 + size_t max_shared_mem = stim::sharedMemPerBlock(); //get the amount of shared memory per block
  194 + size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); //calculate the number of bytes required to store the LUT corresponding to a single sample in shared memory
  195 + int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); //calculate the optimal number of threads per block (make sure it's divisible by the number of warps - 32)
176 dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks 196 dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
177 197
178 size_t shared_mem; 198 size_t shared_mem;
@@ -197,7 +217,7 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N, stim::vec3&lt;T&gt; c = st @@ -197,7 +217,7 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N, stim::vec3&lt;T&gt; c = st
197 217
198 ///Calculate the scalar Mie scattered field on the GPU 218 ///Calculate the scalar Mie scattered field on the GPU
199 /// @param E (GPU) is the N x N destination scalar field 219 /// @param E (GPU) is the N x N destination scalar field
200 -/// @param N is the number fo elements of the scalar field in each direction 220 +/// @param N is the number of sample points of the scalar field
201 /// @param x (GPU) is the grid of X coordinates for each point in E 221 /// @param x (GPU) is the grid of X coordinates for each point in E
202 /// @param y (GPU) is the grid of Y coordinates for each point in E 222 /// @param y (GPU) is the grid of Y coordinates for each point in E
203 /// @param z (GPU) is the grid of Z coordinates for each point in E 223 /// @param z (GPU) is the grid of Z coordinates for each point in E
@@ -211,29 +231,28 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std @@ -211,29 +231,28 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
211 //calculate the necessary number of orders required to represent the scattered field 231 //calculate the necessary number of orders required to represent the scattered field
212 T k = W[0].kmag(); 232 T k = W[0].kmag();
213 233
214 - int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); 234 + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); //calculate the number of orders required to represent the sphere
215 if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) 235 if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization)
216 - //std::cout<<"Nl: "<<Nl<<std::endl;  
217 236
218 //calculate the scattering coefficients for the sphere 237 //calculate the scattering coefficients for the sphere
219 stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients 238 stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
220 - B_coefficients(B, a, k, n, Nl); 239 + B_coefficients(B, a, k, n, Nl); //calculate the scattering coefficients
221 240
222 // PLANE WAVES 241 // PLANE WAVES
223 - stim::scalarwave<T>* dev_W; //allocate space and copy plane waves 242 + stim::scalarwave<T>* dev_W; //allocate space and copy plane waves
224 HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); 243 HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
225 HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) ); 244 HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
226 245
227 // BESSEL FUNCTION LOOK-UP TABLE 246 // BESSEL FUNCTION LOOK-UP TABLE
228 - //calculate the distance from the sphere center  
229 - T* dev_r;  
230 - HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); 247 + //calculate the distance from the sphere center at each sample point and store the result in dev_r
  248 + T* dev_r; //declare the device pointer to hold the distance from the sphere center
  249 + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); //allocate space for the array of distances
231 250
232 - int threads = stim::maxThreadsPerBlock();  
233 - dim3 blocks((unsigned)(N / threads + 1));  
234 - cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N, c); 251 + int threads = stim::maxThreadsPerBlock(); //query the device to find the maximum number of threads per block
  252 + dim3 blocks((unsigned)(N / threads + 1)); //calculate the number of blocks necessary to evaluate the total number of sample points N
  253 + cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N, c); //calculate the distance
235 254
236 - //Find the minimum and maximum values of r 255 + //Use the cuBLAS library to find the minimum and maximum distances from the sphere center. This will be used to create a look-up table for the Hankel functions
237 cublasStatus_t stat; 256 cublasStatus_t stat;
238 cublasHandle_t handle; 257 cublasHandle_t handle;
239 258
@@ -261,49 +280,40 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std @@ -261,49 +280,40 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
261 HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU 280 HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
262 HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); 281 HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
263 282
264 - r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere)  
265 -  
266 - //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r  
267 - size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); 283 + r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere)
268 284
269 - //T kr_min = k * r_min;  
270 - //T kr_max = k * r_max; 285 + size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r
271 286
272 - //temporary variables  
273 - double vm; //allocate space to store the return values for the bessel function calculation 287 + //Declare and evaluate variables used to calculate the spherical Bessel functions and store them temporarily on the CPU
  288 + double vm; //allocate space to store the return values for the bessel function calculation
274 double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); 289 double* jv = (double*) malloc( (Nl + 1) * sizeof(double) );
275 double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); 290 double* yv = (double*) malloc( (Nl + 1) * sizeof(double) );
276 double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); 291 double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
277 double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); 292 double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
278 293
279 - size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;  
280 - stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table  
281 - T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT  
282 - //std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl;  
283 - stim::complex<T> hl;  
284 - for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT 294 + size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut; //calculate the number of bytes necessary to store the Hankel function LUT
  295 + stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table
  296 + T dr = (r_max - r_min) / (N_hB_lut-1); //calculate the optimal distance between values in the LUT
  297 + stim::complex<T> hl; //declare a complex value for the Hankel function result
  298 + for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT
285 stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] 299 stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
286 - for(size_t l = 0; l <= Nl; l++){ //for each order  
287 - hl.r = (T)jv[l]; 300 + for(size_t l = 0; l <= Nl; l++){ //for each order
  301 + hl.r = (T)jv[l]; //generate the spherical Hankel function from the Bessel functions
288 hl.i = (T)yv[l]; 302 hl.i = (T)yv[l];
289 303
290 - hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result  
291 - //std::cout<<hB_lut[ri * (Nl + 1) + l]<<std::endl; 304 + hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //pre-multiply the Hankel function by the scattering coefficients
292 } 305 }
293 } 306 }
294 - //T* real_lut = (T*) malloc(hB_bytes/2);  
295 - //stim::real(real_lut, hB_lut, N_hB_lut);  
296 - //stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer);  
297 307
298 - //Allocate device memory and copy everything to the GPU 308 + //Copy the pre-multiplied Hankel function look-up table to the GPU - this LUT gives a list of uniformly spaced Hankel function values pre-multiplied by scattering coefficients
299 stim::complex<T>* dev_hB_lut; 309 stim::complex<T>* dev_hB_lut;
300 HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); 310 HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
301 HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); 311 HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
302 - //std::cout << "r_min: " << r_min << std::endl;  
303 - //std::cout << "dr: " << dr << std::endl; 312 +
  313 + //calculate the Mie scattering solution on the GPU
304 gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, c, dev_hB_lut, r_min, dr, N_hB_lut, Nl); 314 gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, c, dev_hB_lut, r_min, dr, N_hB_lut, Nl);
305 315
306 - HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost)); //copy the field from device memory 316 + //HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost)); //copy the field from device memory
307 317
308 HANDLE_ERROR(cudaFree(dev_hB_lut)); 318 HANDLE_ERROR(cudaFree(dev_hB_lut));
309 HANDLE_ERROR(cudaFree(dev_r)); 319 HANDLE_ERROR(cudaFree(dev_r));
stim/parser/filename.h
@@ -238,7 +238,7 @@ public: @@ -238,7 +238,7 @@ public:
238 } 238 }
239 239
240 /// Create a matching file locator with a prefix s 240 /// Create a matching file locator with a prefix s
241 - stim::filename prefix(std::string s){ 241 + stim::filename with_prefix(std::string s){
242 stim::filename result = *this; 242 stim::filename result = *this;
243 result._prefix = s; 243 result._prefix = s;
244 return result; 244 return result;