Commit 6a53ac0c677590fee03b1fc26a0f56f8e07c5055
1 parent
59b0e5f5
updated optics files and FindSTIM to support new CMake standards
Showing
7 changed files
with
129 additions
and
79 deletions
Show diff stats
cmake/FindSTIM.cmake
... | ... | @@ -3,20 +3,25 @@ |
3 | 3 | |
4 | 4 | include(FindPackageHandleStandardArgs) |
5 | 5 | |
6 | -set(STIM_INCLUDE_DIR $ENV{STIMLIB_PATH}) | |
6 | +set(STIM_ROOT $ENV{STIM_ROOT}) | |
7 | 7 | |
8 | -find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR) | |
8 | +IF(NOT UNIX) | |
9 | + IF(NOT STIM_ROOT) | |
10 | + MESSAGE("ERROR: STIM_ROOT environment variable must be set!") | |
11 | + ENDIF(NOT STIM_ROOT) | |
12 | + | |
13 | + FIND_PATH(STIM_INCLUDE_DIRS DOC "Path to GLFW include directory." | |
14 | + NAMES stim/image/image.h | |
15 | + PATHS ${STIM_ROOT}) | |
16 | +ENDIF(NOT UNIX) | |
17 | + | |
18 | +find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIRS) | |
9 | 19 | |
10 | 20 | if(STIM_FOUND) |
11 | - set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIR}) | |
21 | + set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIRS}) | |
12 | 22 | elseif(STIM_FOUND) |
13 | - #if the STIM library isn't found, download it | |
14 | - #file(REMOVE_RECURSE ${CMAKE_BINARY_DIR}/stimlib) #remove the stimlib directory if it exists | |
15 | - #set(STIM_GIT "https://git.stim.ee.uh.edu/codebase/stimlib.git") | |
16 | - #execute_process(COMMAND git clone --depth 1 ${STIM_GIT} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) | |
17 | - #set(STIM_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/stimlib" CACHE TYPE PATH) | |
18 | - message("STIM library not found. Set the STIMLIB_PATH environment variable to the STIMLIB location.") | |
23 | + message("STIM library not found. Set the STIM_ROOT environment variable to the STIM location.") | |
19 | 24 | message("STIMLIB can be found here: https://git.stim.ee.uh.edu/codebase/stimlib") |
20 | 25 | endif(STIM_FOUND) |
21 | 26 | |
22 | -find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR) | |
27 | +find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIRS) | ... | ... |
stim/envi/envi.h
... | ... | @@ -128,7 +128,10 @@ public: |
128 | 128 | return false; |
129 | 129 | size_t targetBytes = H.data_bytes(); //get the number of bytes that SHOULD be in the data file |
130 | 130 | size_t bytes = stim::file_size(fname); |
131 | - if(bytes != targetBytes) return false; //if the data doesn't match the header, return false | |
131 | + if (bytes != targetBytes) { | |
132 | + std::cout << "ERROR: File size mismatch. Based on the header, a " << targetBytes << " byte file was expected. The data file contains " << bytes << " bytes." << std::endl; | |
133 | + return false; //if the data doesn't match the header, return false | |
134 | + } | |
132 | 135 | return true; //otherwise everything looks fine |
133 | 136 | |
134 | 137 | } | ... | ... |
stim/math/complex.h
... | ... | @@ -11,7 +11,7 @@ |
11 | 11 | |
12 | 12 | namespace stim |
13 | 13 | { |
14 | - enum complexComponentType {complexReal, complexImaginary, complexMag, complexIntensity}; | |
14 | + enum complexComponentType {complexFull, complexReal, complexImaginary, complexMag, complexIntensity}; | |
15 | 15 | |
16 | 16 | template <class T> |
17 | 17 | struct complex | ... | ... |
stim/optics/scalarbeam.h
... | ... | @@ -415,8 +415,8 @@ public: |
415 | 415 | A = amplitude; |
416 | 416 | f = focal_point; |
417 | 417 | d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on) |
418 | - NA[0] = numerical_aperture; | |
419 | - NA[1] = center_obsc; | |
418 | + NA[0] = center_obsc; | |
419 | + NA[1] = numerical_aperture; | |
420 | 420 | } |
421 | 421 | |
422 | 422 | ///Numerical Aperture functions |
... | ... | @@ -425,21 +425,32 @@ public: |
425 | 425 | NA[0] = (T)0; |
426 | 426 | NA[1] = na; |
427 | 427 | } |
428 | - void setNA(T na0, T na1) | |
428 | + void setNA(T na_in, T na_out) | |
429 | 429 | { |
430 | - NA[0] = na0; | |
431 | - NA[1] = na1; | |
430 | + NA[0] = na_in; | |
431 | + NA[1] = na_out; | |
432 | 432 | } |
433 | 433 | |
434 | 434 | //Monte-Carlo decomposition into plane waves |
435 | 435 | std::vector< scalarwave<T> > mc(size_t N = 100000) const{ |
436 | 436 | |
437 | - std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus | |
438 | - std::vector< scalarwave<T> > samples(N); //create a vector of plane waves | |
439 | 437 | T kmag = (T)stim::TAU / lambda; //calculate the wavenumber |
438 | + stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction | |
440 | 439 | stim::complex<T> apw; //allocate space for the amplitude at the focal point |
440 | + | |
441 | + //deal with the degenerate case where the outer NA is 0, in which case the beam will be specified as a single plane wave | |
442 | + if (NA[1] == 0) { | |
443 | + std::vector< scalarwave<T> > samples(1); //create a vector containing 1 sample | |
444 | + kpw = d * kmag; //calculate the k-vector for the plane wave (beam direction scaled by wavenumber) | |
445 | + apw = A * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude of the plane wave | |
446 | + samples[0] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction | |
447 | + return samples; | |
448 | + } | |
449 | + | |
450 | + //otherwise, evaluate the system using N monte-carlo samples | |
451 | + std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus | |
452 | + std::vector< scalarwave<T> > samples(N); //create a vector of plane waves | |
441 | 453 | T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N) |
442 | - stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction | |
443 | 454 | for(size_t i=0; i<N; i++){ //for each sample |
444 | 455 | kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave |
445 | 456 | apw = a * A * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave |
... | ... | @@ -449,17 +460,18 @@ public: |
449 | 460 | } |
450 | 461 | |
451 | 462 | void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, int order = 500){ |
452 | - cpu_scalar_psf_cart<T>(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing()); | |
463 | + cpu_scalar_psf_cart<T>(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[1], NA[0], order, E.spacing()); | |
453 | 464 | } |
454 | 465 | |
455 | 466 | /// Evaluate the beam to a scalar field using Debye focusing |
456 | 467 | void eval(stim::scalarfield<T>& E, int order = 500){ |
457 | - E.meshgrid(); //calculate a meshgrid if one isn't created | |
468 | + E.meshgrid(); //calculate a meshgrid if one isn't created | |
469 | + | |
458 | 470 | if(E.gpu()) |
459 | - gpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); | |
471 | + gpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[1], NA[0], order, E.spacing()); | |
460 | 472 | else |
461 | - cpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); | |
462 | - //eval(E, E.x(), E.y(), E.z(), order); | |
473 | + cpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[1], NA[0], order, E.spacing()); | |
474 | + | |
463 | 475 | } |
464 | 476 | |
465 | 477 | /// Calculate the field at a given point | ... | ... |
stim/optics/scalarfield.h
... | ... | @@ -538,14 +538,31 @@ public: |
538 | 538 | memset(E, 0, grid_bytes()); |
539 | 539 | } |
540 | 540 | |
541 | - void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer, T minval = 0, T maxval = 0){ | |
541 | + //write the field as a raw image to disk | |
542 | + void image_raw(std::string filename) { | |
543 | + if (loc == GPUmem) { | |
544 | + T* cpu_field = (T*)malloc(sizeof(T) * 2 * size()); //allocate temporary space on the CPU to store the image | |
545 | + HANDLE_ERROR(cudaMemcpy(cpu_field, E, sizeof(T) * 2 * size(), cudaMemcpyDeviceToHost)); //copy the field data from the GPU to the CPU | |
546 | + std::ofstream outfile(filename, std::ios::binary); //open a binary file for writing | |
547 | + outfile.write((const char*)cpu_field, sizeof(T) * 2 * size()); //write the raw field to disk | |
548 | + free(cpu_field); //free memory | |
549 | + } | |
550 | + //if the data is stored on the CPU, no need to copy it - just save it to disk | |
551 | + else { | |
552 | + std::ofstream outfile(filename, std::ios::binary); //open a binary file for writing | |
553 | + outfile.write((const char*)E, sizeof(T) * 2 * size()); //write the raw field to disk | |
554 | + } | |
555 | + } | |
542 | 556 | |
557 | + void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer, T minval = 0, T maxval = 0){ | |
558 | + | |
543 | 559 | if(loc == GPUmem){ |
544 | 560 | T* image; |
545 | 561 | HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) ); |
546 | 562 | int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device |
547 | 563 | dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks |
548 | 564 | |
565 | + // if the data is located on the GPU, execute a kernel that converts the image to the requested data type | |
549 | 566 | switch(type){ |
550 | 567 | case complexMag: |
551 | 568 | cuda_abs<T><<< blocks, threads >>>(image, E, size()); |
... | ... | @@ -559,6 +576,9 @@ public: |
559 | 576 | case complexIntensity: |
560 | 577 | cuda_intensity<T><<< blocks, threads >>>(image, E, size()); |
561 | 578 | break; |
579 | + default: | |
580 | + std::cout << "ERROR: invalid complex component specified." << std::endl; | |
581 | + exit(1); | |
562 | 582 | } |
563 | 583 | if (minval == maxval) |
564 | 584 | stim::gpu2image<T>(image, filename, R[0], R[1], cmap); | ... | ... |
stim/optics/scalarmie.h
... | ... | @@ -88,11 +88,28 @@ void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){ |
88 | 88 | } |
89 | 89 | |
90 | 90 | #define LOCAL_NL 16 |
91 | + | |
92 | +/// CUDA kernel for calculating the Mie scattering solution given a set of points (x, y, z), a list of plane waves, and a look-up table for Bl*hl | |
93 | +/// @param E (GPU) is the N x N destination scalar field | |
94 | +/// @param N is the number of sample points to evaluate | |
95 | +/// @param x (GPU) is the grid of X coordinates for each point in E | |
96 | +/// @param y (GPU) is the grid of Y coordinates for each point in E | |
97 | +/// @param z (GPU) is the grid of Z coordinates for each point in E | |
98 | +/// @param W (GPU) is an array of coherent scalar plane waves incident on the Mie scatterer | |
99 | +/// @param nW is the number of plane waves to evaluate (sum) | |
100 | +/// @param a is the radius of the Mie scatterer | |
101 | +/// @param n is the complex refractive index of the Mie scatterer | |
102 | +/// @param c is the position of the sphere in (x, y, z) | |
103 | +/// @param hB (GPU) is a look-up table of Hankel functions (equally spaced in distance from the sphere) pre-multiplied with scattering coefficients | |
104 | +/// @param r_min is the minimum r value in the hB look-up table (corresponding to the closest point to the sphere) | |
105 | +/// @param dr is the spacing (in r) between samples of the hB look-up table | |
106 | +/// @param N_hB is the number of samples in hB | |
107 | +/// @param Nl is the order of the calculation (number of Hankel function orders) | |
91 | 108 | template<typename T> |
92 | 109 | __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){ |
93 | - extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory | |
110 | + extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory | |
94 | 111 | |
95 | - size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array | |
112 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the sample array (sample point associated with this thread) | |
96 | 113 | if(i >= N) return; //exit if this thread is outside the array |
97 | 114 | stim::vec3<T> p; |
98 | 115 | (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions |
... | ... | @@ -101,14 +118,14 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* |
101 | 118 | p = p - c; |
102 | 119 | T r = p.len(); //calculate the distance from the sphere |
103 | 120 | if(r < a) return; //exit if the point is inside the sphere (only the field outside the sphere is calculated) |
104 | - T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT | |
121 | + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT | |
105 | 122 | size_t ij = (size_t) fij; //convert to an integral index |
106 | 123 | T alpha = fij - ij; //calculate the fractional portion of the index |
107 | - size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT | |
108 | - size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT | |
124 | + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT | |
125 | + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT | |
109 | 126 | |
110 | 127 | T cos_phi; |
111 | - T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials | |
128 | + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials | |
112 | 129 | |
113 | 130 | stim::complex<T> hBl; |
114 | 131 | stim::complex<T> Ei = 0; //create a register to store the result |
... | ... | @@ -154,25 +171,28 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* |
154 | 171 | E[i] += Ei; //copy the result to device memory |
155 | 172 | } |
156 | 173 | |
157 | -///Calculate the scalar Mie scattered field on the GPU | |
174 | +///Calculate the scalar Mie scattered field on the GPU when a list of GPU-based pre-multiplied Hankel functions are available | |
158 | 175 | /// @param E (GPU) is the N x N destination scalar field |
159 | 176 | /// @param N is the number of elements of the scalar field in each direction |
160 | 177 | /// @param x (GPU) is the grid of X coordinates for each point in E |
161 | 178 | /// @param y (GPU) is the grid of Y coordinates for each point in E |
162 | 179 | /// @param z (GPU) is the grid of Z coordinates for each point in E |
163 | -/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer | |
180 | +/// @param W (GPU) is an array of coherent scalar plane waves incident on the Mie scatterer | |
181 | +/// @param nW is the number of plane waves to evaluate (sum) | |
164 | 182 | /// @param a is the radius of the Mie scatterer |
165 | 183 | /// @param n is the complex refractive index of the Mie scatterer |
166 | -/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables) | |
184 | +/// @param c is the position of the sphere in (x, y, z) | |
185 | +/// @param hB (GPU) is a look-up table of Hankel functions (equally spaced in distance from the sphere) pre-multiplied with scattering coefficients | |
186 | +/// @param kr_min is the minimum kr value in the hB look-up table (corresponding to the closest point to the sphere) | |
187 | +/// @param dkr is the spacing (in kr) between samples of the hB look-up table | |
188 | +/// @param N_hB is the number of samples in hB | |
189 | +/// @param Nl is the order of the calculation (number of Hankel function orders) | |
167 | 190 | template<typename T> |
168 | 191 | void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ |
169 | 192 | |
170 | - size_t max_shared_mem = stim::sharedMemPerBlock(); | |
171 | - size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); | |
172 | - //std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; | |
173 | - //std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | |
174 | - int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); | |
175 | - //std::cout<<"threads per block: "<<threads<<std::endl; | |
193 | + size_t max_shared_mem = stim::sharedMemPerBlock(); //get the amount of shared memory per block | |
194 | + size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); //calculate the number of bytes required to store the LUT corresponding to a single sample in shared memory | |
195 | + int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); //calculate the optimal number of threads per block (rounded down to a multiple of the warp size, 32) | |
176 | 196 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks |
177 | 197 | |
178 | 198 | size_t shared_mem; |
... | ... | @@ -197,7 +217,7 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N, stim::vec3<T> c = st |
197 | 217 | |
198 | 218 | ///Calculate the scalar Mie scattered field on the GPU |
199 | 219 | /// @param E (GPU) is the N x N destination scalar field |
200 | -/// @param N is the number fo elements of the scalar field in each direction | |
220 | +/// @param N is the number of sample points of the scalar field | |
201 | 221 | /// @param x (GPU) is the grid of X coordinates for each point in E |
202 | 222 | /// @param y (GPU) is the grid of Y coordinates for each point in E |
203 | 223 | /// @param z (GPU) is the grid of Z coordinates for each point in E |
... | ... | @@ -211,29 +231,28 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std |
211 | 231 | //calculate the necessary number of orders required to represent the scattered field |
212 | 232 | T k = W[0].kmag(); |
213 | 233 | |
214 | - int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); | |
234 | + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); //calculate the number of orders required to represent the sphere | |
215 | 235 | if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) |
216 | - //std::cout<<"Nl: "<<Nl<<std::endl; | |
217 | 236 | |
218 | 237 | //calculate the scattering coefficients for the sphere |
219 | 238 | stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients |
220 | - B_coefficients(B, a, k, n, Nl); | |
239 | + B_coefficients(B, a, k, n, Nl); //calculate the scattering coefficients | |
221 | 240 | |
222 | 241 | // PLANE WAVES |
223 | - stim::scalarwave<T>* dev_W; //allocate space and copy plane waves | |
242 | + stim::scalarwave<T>* dev_W; //allocate space and copy plane waves | |
224 | 243 | HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); |
225 | 244 | HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) ); |
226 | 245 | |
227 | 246 | // BESSEL FUNCTION LOOK-UP TABLE |
228 | - //calculate the distance from the sphere center | |
229 | - T* dev_r; | |
230 | - HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); | |
247 | + //calculate the distance from the sphere center at each sample point and store the result in dev_r | |
248 | + T* dev_r; //declare the device pointer to hold the distance from the sphere center | |
249 | + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); //allocate space for the array of distances | |
231 | 250 | |
232 | - int threads = stim::maxThreadsPerBlock(); | |
233 | - dim3 blocks((unsigned)(N / threads + 1)); | |
234 | - cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N, c); | |
251 | + int threads = stim::maxThreadsPerBlock(); //query the device to find the maximum number of threads per block | |
252 | + dim3 blocks((unsigned)(N / threads + 1)); //calculate the number of blocks necessary to evaluate the total number of sample points N | |
253 | + cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N, c); //calculate the distance | |
235 | 254 | |
236 | - //Find the minimum and maximum values of r | |
255 | + //Use the cuBLAS library to find the minimum and maximum distances from the sphere center. This will be used to create a look-up table for the Hankel functions | |
237 | 256 | cublasStatus_t stat; |
238 | 257 | cublasHandle_t handle; |
239 | 258 | |
... | ... | @@ -261,49 +280,40 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std |
261 | 280 | HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU |
262 | 281 | HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); |
263 | 282 | |
264 | - r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere) | |
265 | - | |
266 | - //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r | |
267 | - size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); | |
283 | + r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere) | |
268 | 284 | |
269 | - //T kr_min = k * r_min; | |
270 | - //T kr_max = k * r_max; | |
285 | + size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r | |
271 | 286 | |
272 | - //temporary variables | |
273 | - double vm; //allocate space to store the return values for the bessel function calculation | |
287 | + //Declare and evaluate variables used to calculate the spherical Bessel functions and store them temporarily on the CPU | |
288 | + double vm; //allocate space to store the return values for the bessel function calculation | |
274 | 289 | double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); |
275 | 290 | double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); |
276 | 291 | double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); |
277 | 292 | double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); |
278 | 293 | |
279 | - size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut; | |
280 | - stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table | |
281 | - T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT | |
282 | - //std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl; | |
283 | - stim::complex<T> hl; | |
284 | - for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT | |
294 | + size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut; //calculate the number of bytes necessary to store the Hankel function LUT | |
295 | + stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table | |
296 | + T dr = (r_max - r_min) / (N_hB_lut-1); //calculate the optimal distance between values in the LUT | |
297 | + stim::complex<T> hl; //declare a complex value for the Hankel function result | |
298 | + for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT | |
285 | 299 | stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] |
286 | - for(size_t l = 0; l <= Nl; l++){ //for each order | |
287 | - hl.r = (T)jv[l]; | |
300 | + for(size_t l = 0; l <= Nl; l++){ //for each order | |
301 | + hl.r = (T)jv[l]; //generate the spherical Hankel function from the Bessel functions | |
288 | 302 | hl.i = (T)yv[l]; |
289 | 303 | |
290 | - hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result | |
291 | - //std::cout<<hB_lut[ri * (Nl + 1) + l]<<std::endl; | |
304 | + hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //pre-multiply the Hankel function by the scattering coefficients | |
292 | 305 | } |
293 | 306 | } |
294 | - //T* real_lut = (T*) malloc(hB_bytes/2); | |
295 | - //stim::real(real_lut, hB_lut, N_hB_lut); | |
296 | - //stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); | |
297 | 307 | |
298 | - //Allocate device memory and copy everything to the GPU | |
308 | + //Copy the pre-multiplied Hankel function look-up table to the GPU - this LUT gives a list of uniformly spaced Hankel function values pre-multiplied by scattering coefficients | |
299 | 309 | stim::complex<T>* dev_hB_lut; |
300 | 310 | HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); |
301 | 311 | HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); |
302 | - //std::cout << "r_min: " << r_min << std::endl; | |
303 | - //std::cout << "dr: " << dr << std::endl; | |
312 | + | |
313 | + //calculate the Mie scattering solution on the GPU | |
304 | 314 | gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, c, dev_hB_lut, r_min, dr, N_hB_lut, Nl); |
305 | 315 | |
306 | - HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost)); //copy the field from device memory | |
316 | + //HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost)); //copy the field from device memory | |
307 | 317 | |
308 | 318 | HANDLE_ERROR(cudaFree(dev_hB_lut)); |
309 | 319 | HANDLE_ERROR(cudaFree(dev_r)); | ... | ... |
stim/parser/filename.h
... | ... | @@ -238,7 +238,7 @@ public: |
238 | 238 | } |
239 | 239 | |
240 | 240 | /// Create a matching file locator with a prefix s |
241 | - stim::filename prefix(std::string s){ | |
241 | + stim::filename with_prefix(std::string s){ | |
242 | 242 | stim::filename result = *this; |
243 | 243 | result._prefix = s; |
244 | 244 | return result; | ... | ... |