Commit 6a53ac0c677590fee03b1fc26a0f56f8e07c5055
1 parent
59b0e5f5
updated optics files and FindSTIM to support new CMake standards
Showing
7 changed files
with
129 additions
and
79 deletions
Show diff stats
cmake/FindSTIM.cmake
@@ -3,20 +3,25 @@ | @@ -3,20 +3,25 @@ | ||
3 | 3 | ||
4 | include(FindPackageHandleStandardArgs) | 4 | include(FindPackageHandleStandardArgs) |
5 | 5 | ||
6 | -set(STIM_INCLUDE_DIR $ENV{STIMLIB_PATH}) | 6 | +set(STIM_ROOT $ENV{STIM_ROOT}) |
7 | 7 | ||
8 | -find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR) | 8 | +IF(NOT UNIX) |
9 | + IF(NOT STIM_ROOT) | ||
10 | + MESSAGE("ERROR: STIM_ROOT environment variable must be set!") | ||
11 | + ENDIF(NOT STIM_ROOT) | ||
12 | + | ||
13 | + FIND_PATH(STIM_INCLUDE_DIRS DOC "Path to GLFW include directory." | ||
14 | + NAMES stim/image/image.h | ||
15 | + PATHS ${STIM_ROOT}) | ||
16 | +ENDIF(NOT UNIX) | ||
17 | + | ||
18 | +find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIRS) | ||
9 | 19 | ||
10 | if(STIM_FOUND) | 20 | if(STIM_FOUND) |
11 | - set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIR}) | 21 | + set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIRS}) |
12 | elseif(STIM_FOUND) | 22 | elseif(STIM_FOUND) |
13 | - #if the STIM library isn't found, download it | ||
14 | - #file(REMOVE_RECURSE ${CMAKE_BINARY_DIR}/stimlib) #remove the stimlib directory if it exists | ||
15 | - #set(STIM_GIT "https://git.stim.ee.uh.edu/codebase/stimlib.git") | ||
16 | - #execute_process(COMMAND git clone --depth 1 ${STIM_GIT} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) | ||
17 | - #set(STIM_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/stimlib" CACHE TYPE PATH) | ||
18 | - message("STIM library not found. Set the STIMLIB_PATH environment variable to the STIMLIB location.") | 23 | + message("STIM library not found. Set the STIM_ROOT environment variable to the STIM location.") |
19 | message("STIMLIB can be found here: https://git.stim.ee.uh.edu/codebase/stimlib") | 24 | message("STIMLIB can be found here: https://git.stim.ee.uh.edu/codebase/stimlib") |
20 | endif(STIM_FOUND) | 25 | endif(STIM_FOUND) |
21 | 26 | ||
22 | -find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR) | 27 | +find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIRS) |
stim/envi/envi.h
@@ -128,7 +128,10 @@ public: | @@ -128,7 +128,10 @@ public: | ||
128 | return false; | 128 | return false; |
129 | size_t targetBytes = H.data_bytes(); //get the number of bytes that SHOULD be in the data file | 129 | size_t targetBytes = H.data_bytes(); //get the number of bytes that SHOULD be in the data file |
130 | size_t bytes = stim::file_size(fname); | 130 | size_t bytes = stim::file_size(fname); |
131 | - if(bytes != targetBytes) return false; //if the data doesn't match the header, return false | 131 | + if (bytes != targetBytes) { |
132 | + std::cout << "ERROR: File size mismatch. Based on the header, a " << targetBytes << " byte file was expected. The data file contains " << bytes << " bytes." << std::endl; | ||
133 | + return false; //if the data doesn't match the header, return false | ||
134 | + } | ||
132 | return true; //otherwise everything looks fine | 135 | return true; //otherwise everything looks fine |
133 | 136 | ||
134 | } | 137 | } |
stim/math/complex.h
@@ -11,7 +11,7 @@ | @@ -11,7 +11,7 @@ | ||
11 | 11 | ||
12 | namespace stim | 12 | namespace stim |
13 | { | 13 | { |
14 | - enum complexComponentType {complexReal, complexImaginary, complexMag, complexIntensity}; | 14 | + enum complexComponentType {complexFull, complexReal, complexImaginary, complexMag, complexIntensity}; |
15 | 15 | ||
16 | template <class T> | 16 | template <class T> |
17 | struct complex | 17 | struct complex |
stim/optics/scalarbeam.h
@@ -415,8 +415,8 @@ public: | @@ -415,8 +415,8 @@ public: | ||
415 | A = amplitude; | 415 | A = amplitude; |
416 | f = focal_point; | 416 | f = focal_point; |
417 | d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on) | 417 | d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on) |
418 | - NA[0] = numerical_aperture; | ||
419 | - NA[1] = center_obsc; | 418 | + NA[0] = center_obsc; |
419 | + NA[1] = numerical_aperture; | ||
420 | } | 420 | } |
421 | 421 | ||
422 | ///Numerical Aperature functions | 422 | ///Numerical Aperature functions |
@@ -425,21 +425,32 @@ public: | @@ -425,21 +425,32 @@ public: | ||
425 | NA[0] = (T)0; | 425 | NA[0] = (T)0; |
426 | NA[1] = na; | 426 | NA[1] = na; |
427 | } | 427 | } |
428 | - void setNA(T na0, T na1) | 428 | + void setNA(T na_in, T na_out) |
429 | { | 429 | { |
430 | - NA[0] = na0; | ||
431 | - NA[1] = na1; | 430 | + NA[0] = na_in; |
431 | + NA[1] = na_out; | ||
432 | } | 432 | } |
433 | 433 | ||
434 | //Monte-Carlo decomposition into plane waves | 434 | //Monte-Carlo decomposition into plane waves |
435 | std::vector< scalarwave<T> > mc(size_t N = 100000) const{ | 435 | std::vector< scalarwave<T> > mc(size_t N = 100000) const{ |
436 | 436 | ||
437 | - std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus | ||
438 | - std::vector< scalarwave<T> > samples(N); //create a vector of plane waves | ||
439 | T kmag = (T)stim::TAU / lambda; //calculate the wavenumber | 437 | T kmag = (T)stim::TAU / lambda; //calculate the wavenumber |
438 | + stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction | ||
440 | stim::complex<T> apw; //allocate space for the amplitude at the focal point | 439 | stim::complex<T> apw; //allocate space for the amplitude at the focal point |
440 | + | ||
441 | + //deal with the degenerative case where the outer NA is 0, in which case the beam will be specified as a single plane wave | ||
442 | + if (NA[1] == 0) { | ||
443 | + std::vector< scalarwave<T> > samples(1); //create a vector containing 1 sample | ||
444 | + kpw = d * kmag; //calculate the k-vector for the plane wave (beam direction scaled by wavenumber) | ||
445 | + apw = A * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude of the plane wave | ||
446 | + samples[0] = scalarwave<T>(kpw, apw); //create a plane wave based on the direction | ||
447 | + return samples; | ||
448 | + } | ||
449 | + | ||
450 | + //otherwise, evaluate the system using N monte-carlo samples | ||
451 | + std::vector< stim::vec3<T> > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus | ||
452 | + std::vector< scalarwave<T> > samples(N); //create a vector of plane waves | ||
441 | T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N) | 453 | T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N) |
442 | - stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction | ||
443 | for(size_t i=0; i<N; i++){ //for each sample | 454 | for(size_t i=0; i<N; i++){ //for each sample |
444 | kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave | 455 | kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave |
445 | apw = a * A * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave | 456 | apw = a * A * exp(stim::complex<T>(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave |
@@ -449,17 +460,18 @@ public: | @@ -449,17 +460,18 @@ public: | ||
449 | } | 460 | } |
450 | 461 | ||
451 | void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, int order = 500){ | 462 | void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, int order = 500){ |
452 | - cpu_scalar_psf_cart<T>(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing()); | 463 | + cpu_scalar_psf_cart<T>(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[1], NA[0], order, E.spacing()); |
453 | } | 464 | } |
454 | 465 | ||
455 | /// Evaluate the beam to a scalar field using Debye focusing | 466 | /// Evaluate the beam to a scalar field using Debye focusing |
456 | void eval(stim::scalarfield<T>& E, int order = 500){ | 467 | void eval(stim::scalarfield<T>& E, int order = 500){ |
457 | - E.meshgrid(); //calculate a meshgrid if one isn't created | 468 | + E.meshgrid(); //calculate a meshgrid if one isn't created |
469 | + | ||
458 | if(E.gpu()) | 470 | if(E.gpu()) |
459 | - gpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); | 471 | + gpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[1], NA[0], order, E.spacing()); |
460 | else | 472 | else |
461 | - cpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); | ||
462 | - //eval(E, E.x(), E.y(), E.z(), order); | 473 | + cpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[1], NA[0], order, E.spacing()); |
474 | + | ||
463 | } | 475 | } |
464 | 476 | ||
465 | /// Calculate the field at a given point | 477 | /// Calculate the field at a given point |
stim/optics/scalarfield.h
@@ -538,14 +538,31 @@ public: | @@ -538,14 +538,31 @@ public: | ||
538 | memset(E, 0, grid_bytes()); | 538 | memset(E, 0, grid_bytes()); |
539 | } | 539 | } |
540 | 540 | ||
541 | - void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer, T minval = 0, T maxval = 0){ | 541 | + //write the field as a raw image to disk |
542 | + void image_raw(std::string filename) { | ||
543 | + if (loc == GPUmem) { | ||
544 | + T* cpu_field = (T*)malloc(sizeof(T) * 2 * size()); //allocate temporary space on the CPU to store the image | ||
545 | + HANDLE_ERROR(cudaMemcpy(cpu_field, E, sizeof(T) * 2 * size(), cudaMemcpyDeviceToHost)); //copy the field data from the GPU to the CPU | ||
546 | + std::ofstream outfile(filename, std::ios::binary); //open a binary file for writing | ||
547 | + outfile.write((const char*)cpu_field, sizeof(T) * 2 * size()); //write the raw field to disk | ||
548 | + free(cpu_field); //free memory | ||
549 | + } | ||
550 | + //if the data is stored on the CPU, no need to cut it - just save it to disk | ||
551 | + else { | ||
552 | + std::ofstream outfile(filename, std::ios::binary); //open a binary file for writing | ||
553 | + outfile.write((const char*)E, sizeof(T) * 2 * size()); //write the raw field to disk | ||
554 | + } | ||
555 | + } | ||
542 | 556 | ||
557 | + void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer, T minval = 0, T maxval = 0){ | ||
558 | + | ||
543 | if(loc == GPUmem){ | 559 | if(loc == GPUmem){ |
544 | T* image; | 560 | T* image; |
545 | HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) ); | 561 | HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) ); |
546 | int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | 562 | int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device |
547 | dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks | 563 | dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks |
548 | 564 | ||
565 | + // if the data is located on the GPU, execute a kernel that converts the image to the requested data type | ||
549 | switch(type){ | 566 | switch(type){ |
550 | case complexMag: | 567 | case complexMag: |
551 | cuda_abs<T><<< blocks, threads >>>(image, E, size()); | 568 | cuda_abs<T><<< blocks, threads >>>(image, E, size()); |
@@ -559,6 +576,9 @@ public: | @@ -559,6 +576,9 @@ public: | ||
559 | case complexIntensity: | 576 | case complexIntensity: |
560 | cuda_intensity<T><<< blocks, threads >>>(image, E, size()); | 577 | cuda_intensity<T><<< blocks, threads >>>(image, E, size()); |
561 | break; | 578 | break; |
579 | + default: | ||
580 | + std::cout << "ERROR: invalid complex component specified." << std::endl; | ||
581 | + exit(1); | ||
562 | } | 582 | } |
563 | if (minval == maxval) | 583 | if (minval == maxval) |
564 | stim::gpu2image<T>(image, filename, R[0], R[1], cmap); | 584 | stim::gpu2image<T>(image, filename, R[0], R[1], cmap); |
stim/optics/scalarmie.h
@@ -88,11 +88,28 @@ void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){ | @@ -88,11 +88,28 @@ void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){ | ||
88 | } | 88 | } |
89 | 89 | ||
90 | #define LOCAL_NL 16 | 90 | #define LOCAL_NL 16 |
91 | + | ||
92 | +/// CUDA kernel for calculating the Mie scattering solution given a set of points (x, y, z), a list of plane waves, and a look-up table for Bl*hl | ||
93 | +/// @param E (GPU) is the N x N destination scalar field | ||
94 | +/// @param N is the number of sample points to evaluate | ||
95 | +/// @param x (GPU) is the grid of X coordinates for each point in E | ||
96 | +/// @param y (GPU) is the grid of Y coordinates for each point in E | ||
97 | +/// @param z (GPU) is the grid of Z coordinates for each point in E | ||
98 | +/// @param W (GPU) is an array of coherent scalar plane waves incident on the Mie scatterer | ||
99 | +/// @param nW is the number of plane waves to evaluate (sum) | ||
100 | +/// @param a is the radius of the Mie scatterer | ||
101 | +/// @param n is the complex refractive index of the Mie scatterer | ||
102 | +/// @param c is the position of the sphere in (x, y, z) | ||
103 | +/// @param hB (GPU) is a look-up table of Hankel functions (equally spaced in distance from the sphere) pre-multiplied with scattering coefficients | ||
104 | +/// @param kr_min is the minimum kr value in the hB look-up table (corresponding to the closest point to the sphere) | ||
105 | +/// @param dkr is the spacing (in kr) between samples of the hB look-up table | ||
106 | +/// @param N_hB is the number of samples in hB | ||
107 | +/// @param Nl is the order of the calculation (number of Hankel function orders) | ||
91 | template<typename T> | 108 | template<typename T> |
92 | __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){ | 109 | __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T r_min, T dr, size_t N_hB, int Nl){ |
93 | - extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory | 110 | + extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory |
94 | 111 | ||
95 | - size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array | 112 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the sample array (sample point associated with this thread) |
96 | if(i >= N) return; //exit if this thread is outside the array | 113 | if(i >= N) return; //exit if this thread is outside the array |
97 | stim::vec3<T> p; | 114 | stim::vec3<T> p; |
98 | (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions | 115 | (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions |
@@ -101,14 +118,14 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* | @@ -101,14 +118,14 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* | ||
101 | p = p - c; | 118 | p = p - c; |
102 | T r = p.len(); //calculate the distance from the sphere | 119 | T r = p.len(); //calculate the distance from the sphere |
103 | if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) | 120 | if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) |
104 | - T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT | 121 | + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT |
105 | size_t ij = (size_t) fij; //convert to an integral index | 122 | size_t ij = (size_t) fij; //convert to an integral index |
106 | T alpha = fij - ij; //calculate the fractional portion of the index | 123 | T alpha = fij - ij; //calculate the fractional portion of the index |
107 | - size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT | ||
108 | - size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT | 124 | + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT |
125 | + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT | ||
109 | 126 | ||
110 | T cos_phi; | 127 | T cos_phi; |
111 | - T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials | 128 | + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials |
112 | 129 | ||
113 | stim::complex<T> hBl; | 130 | stim::complex<T> hBl; |
114 | stim::complex<T> Ei = 0; //create a register to store the result | 131 | stim::complex<T> Ei = 0; //create a register to store the result |
@@ -154,25 +171,28 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* | @@ -154,25 +171,28 @@ __global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* | ||
154 | E[i] += Ei; //copy the result to device memory | 171 | E[i] += Ei; //copy the result to device memory |
155 | } | 172 | } |
156 | 173 | ||
157 | -///Calculate the scalar Mie scattered field on the GPU | 174 | +///Calculate the scalar Mie scattered field on the GPU when a list of GPU-based pre-multiplied Hankel functions are available |
158 | /// @param E (GPU) is the N x N destination scalar field | 175 | /// @param E (GPU) is the N x N destination scalar field |
159 | /// @param N is the number fo elements of the scalar field in each direction | 176 | /// @param N is the number fo elements of the scalar field in each direction |
160 | /// @param x (GPU) is the grid of X coordinates for each point in E | 177 | /// @param x (GPU) is the grid of X coordinates for each point in E |
161 | /// @param y (GPU) is the grid of Y coordinates for each point in E | 178 | /// @param y (GPU) is the grid of Y coordinates for each point in E |
162 | /// @param z (GPU) is the grid of Z coordinates for each point in E | 179 | /// @param z (GPU) is the grid of Z coordinates for each point in E |
163 | -/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer | 180 | +/// @param W (GPU) is an array of coherent scalar plane waves incident on the Mie scatterer |
181 | +/// @param nW is the number of plane waves to evaluate (sum) | ||
164 | /// @param a is the radius of the Mie scatterer | 182 | /// @param a is the radius of the Mie scatterer |
165 | /// @param n is the complex refractive index of the Mie scatterer | 183 | /// @param n is the complex refractive index of the Mie scatterer |
166 | -/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables) | 184 | +/// @param c is the position of the sphere in (x, y, z) |
185 | +/// @param hB (GPU) is a look-up table of Hankel functions (equally spaced in distance from the sphere) pre-multiplied with scattering coefficients | ||
186 | +/// @param kr_min is the minimum kr value in the hB look-up table (corresponding to the closest point to the sphere) | ||
187 | +/// @param dkr is the spacing (in kr) between samples of the hB look-up table | ||
188 | +/// @param N_hB is the number of samples in hB | ||
189 | +/// @param Nl is the order of the calculation (number of Hankel function orders) | ||
167 | template<typename T> | 190 | template<typename T> |
168 | void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ | 191 | void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::vec3<T> c, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ |
169 | 192 | ||
170 | - size_t max_shared_mem = stim::sharedMemPerBlock(); | ||
171 | - size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); | ||
172 | - //std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; | ||
173 | - //std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | ||
174 | - int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); | ||
175 | - //std::cout<<"threads per block: "<<threads<<std::endl; | 193 | + size_t max_shared_mem = stim::sharedMemPerBlock(); //get the amount of shared memory per block |
194 | + size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); //calculate the number of bytes required to store the LUT corresponding to a single sample in shared memory | ||
195 | + int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); //calculate the optimal number of threads per block (make sure it's divisible by the number of warps - 32) | ||
176 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks | 196 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks |
177 | 197 | ||
178 | size_t shared_mem; | 198 | size_t shared_mem; |
@@ -197,7 +217,7 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N, stim::vec3<T> c = st | @@ -197,7 +217,7 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N, stim::vec3<T> c = st | ||
197 | 217 | ||
198 | ///Calculate the scalar Mie scattered field on the GPU | 218 | ///Calculate the scalar Mie scattered field on the GPU |
199 | /// @param E (GPU) is the N x N destination scalar field | 219 | /// @param E (GPU) is the N x N destination scalar field |
200 | -/// @param N is the number fo elements of the scalar field in each direction | 220 | +/// @param N is the number of sample points of the scalar field |
201 | /// @param x (GPU) is the grid of X coordinates for each point in E | 221 | /// @param x (GPU) is the grid of X coordinates for each point in E |
202 | /// @param y (GPU) is the grid of Y coordinates for each point in E | 222 | /// @param y (GPU) is the grid of Y coordinates for each point in E |
203 | /// @param z (GPU) is the grid of Z coordinates for each point in E | 223 | /// @param z (GPU) is the grid of Z coordinates for each point in E |
@@ -211,29 +231,28 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -211,29 +231,28 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
211 | //calculate the necessary number of orders required to represent the scattered field | 231 | //calculate the necessary number of orders required to represent the scattered field |
212 | T k = W[0].kmag(); | 232 | T k = W[0].kmag(); |
213 | 233 | ||
214 | - int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); | 234 | + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); //calculate the number of orders required to represent the sphere |
215 | if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) | 235 | if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) |
216 | - //std::cout<<"Nl: "<<Nl<<std::endl; | ||
217 | 236 | ||
218 | //calculate the scattering coefficients for the sphere | 237 | //calculate the scattering coefficients for the sphere |
219 | stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients | 238 | stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients |
220 | - B_coefficients(B, a, k, n, Nl); | 239 | + B_coefficients(B, a, k, n, Nl); //calculate the scattering coefficients |
221 | 240 | ||
222 | // PLANE WAVES | 241 | // PLANE WAVES |
223 | - stim::scalarwave<T>* dev_W; //allocate space and copy plane waves | 242 | + stim::scalarwave<T>* dev_W; //allocate space and copy plane waves |
224 | HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); | 243 | HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); |
225 | HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) ); | 244 | HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) ); |
226 | 245 | ||
227 | // BESSEL FUNCTION LOOK-UP TABLE | 246 | // BESSEL FUNCTION LOOK-UP TABLE |
228 | - //calculate the distance from the sphere center | ||
229 | - T* dev_r; | ||
230 | - HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); | 247 | + //calculate the distance from the sphere center at each sample point and store the result in dev_r |
248 | + T* dev_r; //declare the device pointer to hold the distance from the sphere center | ||
249 | + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); //allocate space for the array of distances | ||
231 | 250 | ||
232 | - int threads = stim::maxThreadsPerBlock(); | ||
233 | - dim3 blocks((unsigned)(N / threads + 1)); | ||
234 | - cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N, c); | 251 | + int threads = stim::maxThreadsPerBlock(); //query the device to find the maximum number of threads per block |
252 | + dim3 blocks((unsigned)(N / threads + 1)); //calculate the number of blocks necessary to evaluate the total number of sample points N | ||
253 | + cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N, c); //calculate the distance | ||
235 | 254 | ||
236 | - //Find the minimum and maximum values of r | 255 | + //Use the cuBLAS library to find the minimum and maximum distances from the sphere center. This will be used to create a look-up table for the Hankel functions |
237 | cublasStatus_t stat; | 256 | cublasStatus_t stat; |
238 | cublasHandle_t handle; | 257 | cublasHandle_t handle; |
239 | 258 | ||
@@ -261,49 +280,40 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -261,49 +280,40 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
261 | HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU | 280 | HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU |
262 | HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); | 281 | HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); |
263 | 282 | ||
264 | - r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere) | ||
265 | - | ||
266 | - //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r | ||
267 | - size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); | 283 | + r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere) |
268 | 284 | ||
269 | - //T kr_min = k * r_min; | ||
270 | - //T kr_max = k * r_max; | 285 | + size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r |
271 | 286 | ||
272 | - //temporary variables | ||
273 | - double vm; //allocate space to store the return values for the bessel function calculation | 287 | + //Declare and evaluate variables used to calculate the spherical Bessel functions and store them temporarily on the CPU |
288 | + double vm; //allocate space to store the return values for the bessel function calculation | ||
274 | double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); | 289 | double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); |
275 | double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); | 290 | double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); |
276 | double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); | 291 | double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); |
277 | double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); | 292 | double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); |
278 | 293 | ||
279 | - size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut; | ||
280 | - stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table | ||
281 | - T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT | ||
282 | - //std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl; | ||
283 | - stim::complex<T> hl; | ||
284 | - for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT | 294 | + size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut; //calculate the number of bytes necessary to store the Hankel function LUT |
295 | + stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table | ||
296 | + T dr = (r_max - r_min) / (N_hB_lut-1); //calculate the optimal distance between values in the LUT | ||
297 | + stim::complex<T> hl; //declare a complex value for the Hankel function result | ||
298 | + for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT | ||
285 | stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] | 299 | stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] |
286 | - for(size_t l = 0; l <= Nl; l++){ //for each order | ||
287 | - hl.r = (T)jv[l]; | 300 | + for(size_t l = 0; l <= Nl; l++){ //for each order |
301 | + hl.r = (T)jv[l]; //generate the spherical Hankel function from the Bessel functions | ||
288 | hl.i = (T)yv[l]; | 302 | hl.i = (T)yv[l]; |
289 | 303 | ||
290 | - hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result | ||
291 | - //std::cout<<hB_lut[ri * (Nl + 1) + l]<<std::endl; | 304 | + hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //pre-multiply the Hankel function by the scattering coefficients |
292 | } | 305 | } |
293 | } | 306 | } |
294 | - //T* real_lut = (T*) malloc(hB_bytes/2); | ||
295 | - //stim::real(real_lut, hB_lut, N_hB_lut); | ||
296 | - //stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); | ||
297 | 307 | ||
298 | - //Allocate device memory and copy everything to the GPU | 308 | + //Copy the pre-multiplied Hankel function look-up table to the GPU - this LUT gives a list of uniformly spaced Hankel function values pre-multiplied by scattering coefficients |
299 | stim::complex<T>* dev_hB_lut; | 309 | stim::complex<T>* dev_hB_lut; |
300 | HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); | 310 | HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); |
301 | HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); | 311 | HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); |
302 | - //std::cout << "r_min: " << r_min << std::endl; | ||
303 | - //std::cout << "dr: " << dr << std::endl; | 312 | + |
313 | + //calculate the Mie scattering solution on the GPU | ||
304 | gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, c, dev_hB_lut, r_min, dr, N_hB_lut, Nl); | 314 | gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, c, dev_hB_lut, r_min, dr, N_hB_lut, Nl); |
305 | 315 | ||
306 | - HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost)); //copy the field from device memory | 316 | + //HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost)); //copy the field from device memory |
307 | 317 | ||
308 | HANDLE_ERROR(cudaFree(dev_hB_lut)); | 318 | HANDLE_ERROR(cudaFree(dev_hB_lut)); |
309 | HANDLE_ERROR(cudaFree(dev_r)); | 319 | HANDLE_ERROR(cudaFree(dev_r)); |
stim/parser/filename.h
@@ -238,7 +238,7 @@ public: | @@ -238,7 +238,7 @@ public: | ||
238 | } | 238 | } |
239 | 239 | ||
240 | /// Create a matching file locator with a prefix s | 240 | /// Create a matching file locator with a prefix s |
241 | - stim::filename prefix(std::string s){ | 241 | + stim::filename with_prefix(std::string s){ |
242 | stim::filename result = *this; | 242 | stim::filename result = *this; |
243 | result._prefix = s; | 243 | result._prefix = s; |
244 | return result; | 244 | return result; |