From 6a53ac0c677590fee03b1fc26a0f56f8e07c5055 Mon Sep 17 00:00:00 2001 From: David Mayerich Date: Mon, 25 Feb 2019 11:39:30 -0600 Subject: [PATCH] updated optics files and FindSTIM to support new CMake standards --- cmake/FindSTIM.cmake | 25 +++++++++++++++---------- stim/envi/envi.h | 5 ++++- stim/math/complex.h | 2 +- stim/optics/scalarbeam.h | 38 +++++++++++++++++++++++++------------- stim/optics/scalarfield.h | 22 +++++++++++++++++++++- stim/optics/scalarmie.h | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------- stim/parser/filename.h | 2 +- 7 files changed, 129 insertions(+), 79 deletions(-) diff --git a/cmake/FindSTIM.cmake b/cmake/FindSTIM.cmake index a83481a..c05f9f6 100644 --- a/cmake/FindSTIM.cmake +++ b/cmake/FindSTIM.cmake @@ -3,20 +3,25 @@ include(FindPackageHandleStandardArgs) -set(STIM_INCLUDE_DIR $ENV{STIMLIB_PATH}) +set(STIM_ROOT $ENV{STIM_ROOT}) -find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR) +IF(NOT UNIX) + IF(NOT STIM_ROOT) + MESSAGE("ERROR: STIM_ROOT environment variable must be set!") + ENDIF(NOT STIM_ROOT) + + FIND_PATH(STIM_INCLUDE_DIRS DOC "Path to GLFW include directory." + NAMES stim/image/image.h + PATHS ${STIM_ROOT}) +ENDIF(NOT UNIX) + +find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIRS) if(STIM_FOUND) - set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIR}) + set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIRS}) elseif(STIM_FOUND) - #if the STIM library isn't found, download it - #file(REMOVE_RECURSE ${CMAKE_BINARY_DIR}/stimlib) #remove the stimlib directory if it exists - #set(STIM_GIT "https://git.stim.ee.uh.edu/codebase/stimlib.git") - #execute_process(COMMAND git clone --depth 1 ${STIM_GIT} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) - #set(STIM_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/stimlib" CACHE TYPE PATH) - message("STIM library not found. Set the STIMLIB_PATH environment variable to the STIMLIB location.") + message("STIM library not found. Set the STIM_ROOT environment variable to the STIM location.") message("STIMLIB can be found here: https://git.stim.ee.uh.edu/codebase/stimlib") endif(STIM_FOUND) -find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR) +find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIRS) diff --git a/stim/envi/envi.h b/stim/envi/envi.h index bbff889..aafa3ea 100644 --- a/stim/envi/envi.h +++ b/stim/envi/envi.h @@ -128,7 +128,10 @@ public: return false; size_t targetBytes = H.data_bytes(); //get the number of bytes that SHOULD be in the data file size_t bytes = stim::file_size(fname); - if(bytes != targetBytes) return false; //if the data doesn't match the header, return false + if (bytes != targetBytes) { + std::cout << "ERROR: File size mismatch. Based on the header, a " << targetBytes << " byte file was expected. The data file contains " << bytes << " bytes." << std::endl; + return false; //if the data doesn't match the header, return false + } return true; //otherwise everything looks fine } diff --git a/stim/math/complex.h b/stim/math/complex.h index 3ccc7d0..e8201e6 100644 --- a/stim/math/complex.h +++ b/stim/math/complex.h @@ -11,7 +11,7 @@ namespace stim { - enum complexComponentType {complexReal, complexImaginary, complexMag, complexIntensity}; + enum complexComponentType {complexFull, complexReal, complexImaginary, complexMag, complexIntensity}; template struct complex diff --git a/stim/optics/scalarbeam.h b/stim/optics/scalarbeam.h index 5841f9f..26de61f 100644 --- a/stim/optics/scalarbeam.h +++ b/stim/optics/scalarbeam.h @@ -415,8 +415,8 @@ public: A = amplitude; f = focal_point; d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on) - NA[0] = numerical_aperture; - NA[1] = center_obsc; + NA[0] = center_obsc; + NA[1] = numerical_aperture; } ///Numerical Aperature functions @@ -425,21 +425,32 @@ public: NA[0] = (T)0; NA[1] = na; } - void setNA(T na0, T na1) + void setNA(T na_in, T na_out) { - NA[0] = na0; - NA[1] = na1; + NA[0] = na_in; + NA[1] = na_out; } //Monte-Carlo decomposition into plane waves std::vector< scalarwave > mc(size_t N = 100000) const{ - std::vector< stim::vec3 > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus - std::vector< scalarwave > samples(N); //create a vector of plane waves T kmag = (T)stim::TAU / lambda; //calculate the wavenumber + stim::vec3 kpw; //declare the new k-vector based on the focused plane wave direction stim::complex apw; //allocate space for the amplitude at the focal point + + //deal with the degenerative case where the outer NA is 0, in which case the beam will be specified as a single plane wave + if (NA[1] == 0) { + std::vector< scalarwave > samples(1); //create a vector containing 1 sample + kpw = d * kmag; //calculate the k-vector for the plane wave (beam direction scaled by wavenumber) + apw = A * exp(stim::complex(0, kpw.dot(-f))); //calculate the amplitude of the plane wave + samples[0] = scalarwave(kpw, apw); //create a plane wave based on the direction + return samples; + } + + //otherwise, evaluate the system using N monte-carlo samples + std::vector< stim::vec3 > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus + std::vector< scalarwave > samples(N); //create a vector of plane waves T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N) - stim::vec3 kpw; //declare the new k-vector based on the focused plane wave direction for(size_t i=0; i(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave @@ -449,17 +460,18 @@ public: } void eval(stim::scalarfield& E, T* X, T* Y, T* Z, int order = 500){ - cpu_scalar_psf_cart(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing()); + cpu_scalar_psf_cart(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[1], NA[0], order, E.spacing()); } /// Evaluate the beam to a scalar field using Debye focusing void eval(stim::scalarfield& E, int order = 500){ - E.meshgrid(); //calculate a meshgrid if one isn't created + E.meshgrid(); //calculate a meshgrid if one isn't created + if(E.gpu()) - gpu_scalar_psf_cart(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); + gpu_scalar_psf_cart(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[1], NA[0], order, E.spacing()); else - cpu_scalar_psf_cart(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); - //eval(E, E.x(), E.y(), E.z(), order); + cpu_scalar_psf_cart(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[1], NA[0], order, E.spacing()); + } /// Calculate the field at a given point diff --git a/stim/optics/scalarfield.h b/stim/optics/scalarfield.h index 231342e..5a80ae5 100644 --- a/stim/optics/scalarfield.h +++ b/stim/optics/scalarfield.h @@ -538,14 +538,31 @@ public: memset(E, 0, grid_bytes()); } - void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer, T minval = 0, T maxval = 0){ + //write the field as a raw image to disk + void image_raw(std::string filename) { + if (loc == GPUmem) { + T* cpu_field = (T*)malloc(sizeof(T) * 2 * size()); //allocate temporary space on the CPU to store the image + HANDLE_ERROR(cudaMemcpy(cpu_field, E, sizeof(T) * 2 * size(), cudaMemcpyDeviceToHost)); //copy the field data from the GPU to the CPU + std::ofstream outfile(filename, std::ios::binary); //open a binary file for writing + outfile.write((const char*)cpu_field, sizeof(T) * 2 * size()); //write the raw field to disk + free(cpu_field); //free memory + } + //if the data is stored on the CPU, no need to cut it - just save it to disk + else { + std::ofstream outfile(filename, std::ios::binary); //open a binary file for writing + outfile.write((const char*)E, sizeof(T) * 2 * size()); //write the raw field to disk + } + } + void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer, T minval = 0, T maxval = 0){ + if(loc == GPUmem){ T* image; HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) ); int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks + // if the data is located on the GPU, execute a kernel that converts the image to the requested data type switch(type){ case complexMag: cuda_abs<<< blocks, threads >>>(image, E, size()); @@ -559,6 +576,9 @@ public: case complexIntensity: cuda_intensity<<< blocks, threads >>>(image, E, size()); break; + default: + std::cout << "ERROR: invalid complex component specified." << std::endl; + exit(1); } if (minval == maxval) stim::gpu2image(image, filename, R[0], R[1], cmap); diff --git a/stim/optics/scalarmie.h b/stim/optics/scalarmie.h index 48b6972..4ad740f 100644 --- a/stim/optics/scalarmie.h +++ b/stim/optics/scalarmie.h @@ -88,11 +88,28 @@ void A_coefficients(stim::complex* A, T a, T k, stim::complex n, int Nl){ } #define LOCAL_NL 16 + +/// CUDA kernel for calculating the Mie scattering solution given a set of points (x, y, z), a list of plane waves, and a look-up table for Bl*hl +/// @param E (GPU) is the N x N destination scalar field +/// @param N is the number of sample points to evaluate +/// @param x (GPU) is the grid of X coordinates for each point in E +/// @param y (GPU) is the grid of Y coordinates for each point in E +/// @param z (GPU) is the grid of Z coordinates for each point in E +/// @param W (GPU) is an array of coherent scalar plane waves incident on the Mie scatterer +/// @param nW is the number of plane waves to evaluate (sum) +/// @param a is the radius of the Mie scatterer +/// @param n is the complex refractive index of the Mie scatterer +/// @param c is the position of the sphere in (x, y, z) +/// @param hB (GPU) is a look-up table of Hankel functions (equally spaced in distance from the sphere) pre-multiplied with scattering coefficients +/// @param kr_min is the minimum kr value in the hB look-up table (corresponding to the closest point to the sphere) +/// @param dkr is the spacing (in kr) between samples of the hB look-up table +/// @param N_hB is the number of samples in hB +/// @param Nl is the order of the calculation (number of Hankel function orders) template __global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::vec3 c, stim::complex* hB, T r_min, T dr, size_t N_hB, int Nl){ - extern __shared__ stim::complex shared_hB[]; //declare the list of waves in shared memory + extern __shared__ stim::complex shared_hB[]; //declare the list of waves in shared memory - size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the sample array (sample point associated with this thread) if(i >= N) return; //exit if this thread is outside the array stim::vec3 p; (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions @@ -101,14 +118,14 @@ __global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* p = p - c; T r = p.len(); //calculate the distance from the sphere if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) - T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT size_t ij = (size_t) fij; //convert to an integral index T alpha = fij - ij; //calculate the fractional portion of the index - size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT - size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT T cos_phi; - T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials stim::complex hBl; stim::complex Ei = 0; //create a register to store the result @@ -154,25 +171,28 @@ __global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* E[i] += Ei; //copy the result to device memory } -///Calculate the scalar Mie scattered field on the GPU +///Calculate the scalar Mie scattered field on the GPU when a list of GPU-based pre-multiplied Hankel functions are available /// @param E (GPU) is the N x N destination scalar field /// @param N is the number fo elements of the scalar field in each direction /// @param x (GPU) is the grid of X coordinates for each point in E /// @param y (GPU) is the grid of Y coordinates for each point in E /// @param z (GPU) is the grid of Z coordinates for each point in E -/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer +/// @param W (GPU) is an array of coherent scalar plane waves incident on the Mie scatterer +/// @param nW is the number of plane waves to evaluate (sum) /// @param a is the radius of the Mie scatterer /// @param n is the complex refractive index of the Mie scatterer -/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables) +/// @param c is the position of the sphere in (x, y, z) +/// @param hB (GPU) is a look-up table of Hankel functions (equally spaced in distance from the sphere) pre-multiplied with scattering coefficients +/// @param kr_min is the minimum kr value in the hB look-up table (corresponding to the closest point to the sphere) +/// @param dkr is the spacing (in kr) between samples of the hB look-up table +/// @param N_hB is the number of samples in hB +/// @param Nl is the order of the calculation (number of Hankel function orders) template void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::vec3 c, stim::complex* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ - size_t max_shared_mem = stim::sharedMemPerBlock(); - size_t hBl_array = sizeof(stim::complex) * (Nl + 1); - //std::cout<<"hl*Bl array size: "<) * (Nl + 1); //calculate the number of bytes required to store the LUT corresponding to a single sample in shared memory + int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); //calculate the optimal number of threads per block (make sure it's divisible by the number of warps - 32) dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks size_t shared_mem; @@ -197,7 +217,7 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N, stim::vec3 c = st ///Calculate the scalar Mie scattered field on the GPU /// @param E (GPU) is the N x N destination scalar field -/// @param N is the number fo elements of the scalar field in each direction +/// @param N is the number of sample points of the scalar field /// @param x (GPU) is the grid of X coordinates for each point in E /// @param y (GPU) is the grid of Y coordinates for each point in E /// @param z (GPU) is the grid of Z coordinates for each point in E @@ -211,29 +231,28 @@ void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std //calculate the necessary number of orders required to represent the scattered field T k = W[0].kmag(); - int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); //calculate the number of orders required to represent the sphere if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) - //std::cout<<"Nl: "<* B = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients - B_coefficients(B, a, k, n, Nl); + B_coefficients(B, a, k, n, Nl); //calculate the scattering coefficients // PLANE WAVES - stim::scalarwave* dev_W; //allocate space and copy plane waves + stim::scalarwave* dev_W; //allocate space and copy plane waves HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave) * W.size(), cudaMemcpyHostToDevice) ); // BESSEL FUNCTION LOOK-UP TABLE - //calculate the distance from the sphere center - T* dev_r; - HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); + //calculate the distance from the sphere center at each sample point and store the result in dev_r + T* dev_r; //declare the device pointer to hold the distance from the sphere center + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); //allocate space for the array of distances - int threads = stim::maxThreadsPerBlock(); - dim3 blocks((unsigned)(N / threads + 1)); - cuda_dist <<< blocks, threads >>>(dev_r, x, y, z, N, c); + int threads = stim::maxThreadsPerBlock(); //query the device to find the maximum number of threads per block + dim3 blocks((unsigned)(N / threads + 1)); //calculate the number of blocks necessary to evaluate the total number of sample points N + cuda_dist <<< blocks, threads >>>(dev_r, x, y, z, N, c); //calculate the distance - //Find the minimum and maximum values of r + //Use the cuBLAS library to find the minimum and maximum distances from the sphere center. This will be used to create a look-up table for the Hankel functions cublasStatus_t stat; cublasHandle_t handle; @@ -261,49 +280,40 @@ void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); - r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere) - - //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r - size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); + r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere) - //T kr_min = k * r_min; - //T kr_max = k * r_max; + size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r - //temporary variables - double vm; //allocate space to store the return values for the bessel function calculation + //Declare and evaluate variables used to calculate the spherical Bessel functions and store them temporarily on the CPU + double vm; //allocate space to store the return values for the bessel function calculation double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); - size_t hB_bytes = sizeof(stim::complex) * (Nl+1) * N_hB_lut; - stim::complex* hB_lut = (stim::complex*) malloc(hB_bytes); //pointer to the look-up table - T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT - //std::cout<<"LUT jl bytes: "< hl; - for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT + size_t hB_bytes = sizeof(stim::complex) * (Nl+1) * N_hB_lut; //calculate the number of bytes necessary to store the Hankel function LUT + stim::complex* hB_lut = (stim::complex*) malloc(hB_bytes); //pointer to the look-up table + T dr = (r_max - r_min) / (N_hB_lut-1); //calculate the optimal distance between values in the LUT + stim::complex hl; //declare a complex value for the Hankel function result + for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT stim::bessjyv_sph(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] - for(size_t l = 0; l <= Nl; l++){ //for each order - hl.r = (T)jv[l]; + for(size_t l = 0; l <= Nl; l++){ //for each order + hl.r = (T)jv[l]; //generate the spherical Hankel function from the Bessel functions hl.i = (T)yv[l]; - hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result - //std::cout<(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); - //Allocate device memory and copy everything to the GPU + //Copy the pre-multiplied Hankel function look-up table to the GPU - this LUT gives a list of uniformly spaced Hankel function values pre-multiplied by scattering coefficients stim::complex* dev_hB_lut; HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); - //std::cout << "r_min: " << r_min << std::endl; - //std::cout << "dr: " << dr << std::endl; + + //calculate the Mie scattering solution on the GPU gpu_scalar_mie_scatter(E, N, x, y, z, dev_W, W.size(), a, n, c, dev_hB_lut, r_min, dr, N_hB_lut, Nl); - HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost)); //copy the field from device memory + //HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost)); //copy the field from device memory HANDLE_ERROR(cudaFree(dev_hB_lut)); HANDLE_ERROR(cudaFree(dev_r)); diff --git a/stim/parser/filename.h b/stim/parser/filename.h index b0ce603..4e09a8a 100644 --- a/stim/parser/filename.h +++ b/stim/parser/filename.h @@ -238,7 +238,7 @@ public: } /// Create a matching file locator with a prefix s - stim::filename prefix(std::string s){ + stim::filename with_prefix(std::string s){ stim::filename result = *this; result._prefix = s; return result; -- libgit2 0.21.4