diff --git a/stim/cuda/cudatools/devices.h b/stim/cuda/cudatools/devices.h index 74509ee..9b32b38 100644 --- a/stim/cuda/cudatools/devices.h +++ b/stim/cuda/cudatools/devices.h @@ -4,35 +4,56 @@ #include namespace stim{ -extern "C" -int maxThreadsPerBlock() -{ - int device; - cudaGetDevice(&device); //get the id of the current device - cudaDeviceProp props; //device property structure - cudaGetDeviceProperties(&props, device); - return props.maxThreadsPerBlock; -} + extern "C" + int maxThreadsPerBlock(){ + int device; + cudaGetDevice(&device); //get the id of the current device + cudaDeviceProp props; //device property structure + cudaGetDeviceProperties(&props, device); + return props.maxThreadsPerBlock; + } -extern "C" -size_t sharedMemPerBlock() -{ - int device; - cudaGetDevice(&device); //get the id of the current device - cudaDeviceProp props; //device property structure - cudaGetDeviceProperties(&props, device); - return props.sharedMemPerBlock; -} + extern "C" + size_t sharedMemPerBlock(){ + int device; + cudaGetDevice(&device); //get the id of the current device + cudaDeviceProp props; //device property structure + cudaGetDeviceProperties(&props, device); + return props.sharedMemPerBlock; + } -extern "C" -size_t constMem() -{ - int device; - cudaGetDevice(&device); //get the id of the current device - cudaDeviceProp props; //device property structure - cudaGetDeviceProperties(&props, device); - return props.totalConstMem; -} + extern "C" + size_t constMem(){ + int device; + cudaGetDevice(&device); //get the id of the current device + cudaDeviceProp props; //device property structure + cudaGetDeviceProperties(&props, device); + return props.totalConstMem; + } + + //tests that a given device ID is valid and provides at least the specified compute capability + bool testDevice(int d, int major, int minor){ + int nd; + cudaGetDeviceCount(&nd); //get the number of CUDA devices + if(d < nd && d > 0) { //if the given ID has an associated device + cudaDeviceProp props; + cudaGetDeviceProperties(&props, d); //get the device properties structure + if(props.major >= major && props.minor >= minor) + return true; + } + return false; + } + + //tests each device ID in a list and returns the number of devices that fit the desired + // compute capability + int testDevices(int* dlist, unsigned n_devices, int major, int minor){ + int valid = 0; + for(int d = 0; d < n_devices; d++){ + if(testDevice(dlist[d], major, minor)) + valid++; + } + return valid; + } } //end namespace rts #endif diff --git a/stim/image/image.h b/stim/image/image.h index a471525..c74167e 100644 --- a/stim/image/image.h +++ b/stim/image/image.h @@ -324,6 +324,12 @@ public: //save a file void save(std::string filename){ + stim::filename file(filename); + if (file.extension() == "raw" || file.extension() == "") { + std::ofstream outfile(filename, std::ios::binary); + outfile.write((char*)img, sizeof(T) * R[0] * R[1] * R[2]); + outfile.close(); + } #ifdef USING_OPENCV //OpenCV uses an interleaved format, so convert first and then output T* buffer = (T*) malloc(bytes()); diff --git a/stim/math/bessel.h b/stim/math/bessel.h index f729aee..2ff82ca 100644 --- a/stim/math/bessel.h +++ b/stim/math/bessel.h @@ -17,10 +17,10 @@ static complex czero(0.0,0.0); template< typename P > P gamma(P x) { - const P EPS = numeric_limits

::epsilon(); - const P FPMIN_MAG = numeric_limits

::min(); - const P FPMIN = numeric_limits

::lowest(); - const P FPMAX = numeric_limits

::max(); + const P EPS = std::numeric_limits

::epsilon(); + const P FPMIN_MAG = std::numeric_limits

::min(); + const P FPMIN = std::numeric_limits

::lowest(); + const P FPMAX = std::numeric_limits

::max(); int i,k,m; P ga,gr,r,z; @@ -94,10 +94,10 @@ template int bessjy01a(P x,P &j0,P &j1,P &y0,P &y1, P &j0p,P &j1p,P &y0p,P &y1p) { - const P EPS = numeric_limits

::epsilon(); - const P FPMIN_MAG = numeric_limits

::min(); - const P FPMIN = numeric_limits

::lowest(); - const P FPMAX = numeric_limits

::max(); + const P EPS = std::numeric_limits

::epsilon(); + const P FPMIN_MAG = std::numeric_limits

::min(); + const P FPMIN = std::numeric_limits

::lowest(); + const P FPMAX = std::numeric_limits

::max(); P x2,r,ec,w0,w1,r0,r1,cs0,cs1; P cu,p0,q0,p1,q1,t1,t2; @@ -606,10 +606,10 @@ int bessjyv(P v,P x,P &vm,P *jv,P *yv, P b,ec,w0,w1,bju0,bju1,pv0,pv1,byvk; int j,k,l,m,n,kz; - const P EPS = numeric_limits

::epsilon(); - const P FPMIN_MAG = numeric_limits

::min(); - const P FPMIN = numeric_limits

::lowest(); - const P FPMAX = numeric_limits

::max(); + const P EPS = std::numeric_limits

::epsilon(); + const P FPMIN_MAG = std::numeric_limits

::min(); + const P FPMIN = std::numeric_limits

::lowest(); + const P FPMAX = std::numeric_limits

::max(); x2 = x*x; n = (int)v; diff --git a/stim/optics/scalarbeam.h b/stim/optics/scalarbeam.h index 07d3484..5841f9f 100644 --- a/stim/optics/scalarbeam.h +++ b/stim/optics/scalarbeam.h @@ -405,7 +405,7 @@ private: T NA[2]; //numerical aperature of the focusing optics vec3 f; //focal point vec3 d; //propagation direction - T A; //beam amplitude + T A; //beam amplitude T lambda; //beam wavelength public: @@ -440,10 +440,10 @@ public: stim::complex apw; //allocate space for the amplitude at the focal point T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N) stim::vec3 kpw; //declare the new k-vector based on the focused plane wave direction - for(size_t i=0; i(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave - samples[i] = scalarwave(kpw, apw); //create a plane wave based on the direction + apw = a * A * exp(stim::complex(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave + samples[i] = scalarwave(kpw, apw); //create a plane wave based on the direction } return samples; } diff --git a/stim/optics/scalarfield.h b/stim/optics/scalarfield.h index 6aebc8c..a7a5073 100644 --- a/stim/optics/scalarfield.h +++ b/stim/optics/scalarfield.h @@ -143,8 +143,8 @@ namespace stim{ /// Propagate a field slice along its orthogonal direction by a given distance z - /// @param Enew is the resulting propogated field - /// @param E is the field to be propogated + /// @param Enew is the resulting propagated field + /// @param E is the field to be propagated /// @param sx is the size of the field in the lateral x direction /// @param sy is the size of the field in the lateral y direction /// @param z is the distance to be propagated @@ -187,12 +187,13 @@ namespace stim{ if(kx_sq + ky_sq < k_sq){ kz = sqrt(k_sq - kx_sq - ky_sq); //estimate kz using the Fresnel approximation - shift = -exp(stim::complex(0, kz * z)); + shift = exp(stim::complex(0, kz * z)); + //std::cout << shift << std::endl; K[i] *= shift; K[i] /= (nx*ny); //normalize the DFT } else{ - K[i] = 0; + K[i] /= (nx*ny); } } } @@ -342,7 +343,7 @@ public: T spacing(){ T du = rect::X.len() / R[0]; T dv = rect::Y.len() / R[1]; - return min(du, dv); + return std::min(du, dv); } /// Copy the field array to the GPU, if it isn't already there @@ -537,7 +538,7 @@ public: memset(E, 0, grid_bytes()); } - void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer){ + void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer, T minval = 0, T maxval = 0){ if(loc == GPUmem){ T* image; @@ -559,7 +560,10 @@ public: cuda_intensity<<< blocks, threads >>>(image, E, size()); break; } - stim::gpu2image(image, filename, R[0], R[1], stim::cmBrewer); + if (minval == maxval) + stim::gpu2image(image, filename, R[0], R[1], cmap); + else + stim::gpu2image(image, filename, R[0], R[1], minval, maxval, cmap); HANDLE_ERROR( cudaFree(image) ); } else{ @@ -579,7 +583,10 @@ public: stim::intensity(image, E, size()); break; } - stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image + if (minval == maxval) + stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image + else + stim::cpu2image(image, filename, R[0], R[1], minval, maxval, cmap); free(image); //free the real image } } diff --git a/stim/optics/scalarmie.h b/stim/optics/scalarmie.h index 32386ee..bf56a09 100644 --- a/stim/optics/scalarmie.h +++ b/stim/optics/scalarmie.h @@ -89,7 +89,7 @@ void A_coefficients(stim::complex* A, T a, T k, stim::complex n, int Nl){ #define LOCAL_NL 16 template -__global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* hB, T r_min, T dr, size_t N_hB, int Nl){ +__global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::vec3 c, stim::complex* hB, T r_min, T dr, size_t N_hB, int Nl){ extern __shared__ stim::complex shared_hB[]; //declare the list of waves in shared memory size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array @@ -98,7 +98,7 @@ __global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions (y == NULL) ? p[1] = 0 : p[1] = y[i]; (z == NULL) ? p[2] = 0 : p[2] = z[i]; - + p = p - c; T r = p.len(); //calculate the distance from the sphere if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT @@ -124,24 +124,27 @@ __global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp( hB[n0j + l], hB[n1j + l], alpha ); + complex e, Ew; for(size_t w = 0; w < nW; w++){ //for each plane wave cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation Pl_1 = cos_phi; - Ei += W[w].E() * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation - Ei += W[w].E() * hlBl[1] * Pl_1; + e = exp(complex(0, W[w].kvec().dot(c))); + Ew = W[w].E() * e; + Ei += Ew * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation + Ei += Ew * hlBl[1] * Pl_1; #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) for(l = 2; l <= LOCAL_NL; l++){ Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) - Ei += W[w].E() * hlBl[l] * Pl; //calculate and sum the current field order + Ei += Ew * hlBl[l] * Pl; //calculate and sum the current field order Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 Pl_1 = Pl; } for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs - Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl; + Ei += Ew * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl; Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 Pl_1 = Pl; } @@ -149,8 +152,18 @@ __global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* E[i] += Ei; //copy the result to device memory } +///Calculate the scalar Mie scattered field on the GPU +/// @param E (GPU) is the N x N destination scalar field +/// @param N is the number fo elements of the scalar field in each direction +/// @param x (GPU) is the grid of X coordinates for each point in E +/// @param y (GPU) is the grid of Y coordinates for each point in E +/// @param z (GPU) is the grid of Z coordinates for each point in E +/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer +/// @param a is the radius of the Mie scatterer +/// @param n is the complex refractive index of the Mie scatterer +/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables) template -void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ +void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::vec3 c, stim::complex* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ size_t max_shared_mem = stim::sharedMemPerBlock(); size_t hBl_array = sizeof(stim::complex) * (Nl + 1); @@ -164,11 +177,11 @@ void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, sti if(Nl <= LOCAL_NL) shared_mem = 0; else shared_mem = threads * sizeof(stim::complex) * (Nl - LOCAL_NL); //amount of shared memory to allocate //std::cout<<"shared memory allocated: "<<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel + cuda_scalar_mie_scatter<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, c, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel } template -__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ +__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N, stim::vec3 c = stim::vec3(0, 0, 0)){ size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array if(i >= N) return; //exit if this thread is outside the array @@ -177,10 +190,21 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ (y == NULL) ? p[1] = 0 : p[1] = y[i]; (z == NULL) ? p[2] = 0 : p[2] = z[i]; - r[i] = p.len(); + r[i] = (p - c).len(); } + +///Calculate the scalar Mie scattered field on the GPU +/// @param E (GPU) is the N x N destination scalar field +/// @param N is the number fo elements of the scalar field in each direction +/// @param x (GPU) is the grid of X coordinates for each point in E +/// @param y (GPU) is the grid of Y coordinates for each point in E +/// @param z (GPU) is the grid of Z coordinates for each point in E +/// @param W (CPU) is an array of coherent scalar plane waves incident on the Mie scatterer +/// @param a is the radius of the Mie scatterer +/// @param n is the complex refractive index of the Mie scatterer +/// @param r_spacing is the minimum distance between r values of the sample points in E (used to calculate look-up tables) template -void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n, T r_spacing = 0.1){ +void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n, stim::vec3 c = stim::vec3(0, 0, 0), T r_spacing = 0.1){ //calculate the necessary number of orders required to represent the scattered field T k = W[0].kmag(); @@ -205,7 +229,7 @@ void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std int threads = stim::maxThreadsPerBlock(); dim3 blocks((unsigned)(N / threads + 1)); - cuda_dist <<< blocks, threads >>>(dev_r, x, y, z, N); + cuda_dist <<< blocks, threads >>>(dev_r, x, y, z, N, c); //Find the minimum and maximum values of r cublasStatus_t stat; @@ -273,10 +297,16 @@ void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std stim::complex* dev_hB_lut; HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); + //std::cout << "r_min: " << r_min << std::endl; + //std::cout << "dr: " << dr << std::endl; + gpu_scalar_mie_scatter(E, N, x, y, z, dev_W, W.size(), a, n, c, dev_hB_lut, r_min, dr, N_hB_lut, Nl); - gpu_scalar_mie_scatter(E, N, x, y, z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); + HANDLE_ERROR(cudaMemcpy(E, E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost)); //copy the field from device memory + + HANDLE_ERROR(cudaFree(dev_hB_lut)); + HANDLE_ERROR(cudaFree(dev_r)); + HANDLE_ERROR(cudaFree(dev_W)); - cudaMemcpy(E, E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory } /// Calculate the scalar Mie solution for the scattered field produced by a single plane wave @@ -289,10 +319,10 @@ void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std /// @param a is the radius of the sphere /// @param n is the complex refractive index of the sphere template -void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n, T r_spacing = 0.1){ +void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n, stim::vec3 c = stim::vec3(0, 0, 0)){ -#ifdef CUDA_FOUND +/*#ifdef CUDA_FOUND stim::complex* dev_E; //allocate space for the field cudaMalloc(&dev_E, N * sizeof(stim::complex)); cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); @@ -315,13 +345,14 @@ void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); } - gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, r_spacing); + gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, c, r_spacing); if(x != NULL) cudaFree(dev_x); //free everything if(y != NULL) cudaFree(dev_y); if(z != NULL) cudaFree(dev_z); cudaFree(dev_E); #else +*/ //calculate the necessary number of orders required to represent the scattered field T k = W[0].kmag(); @@ -345,15 +376,18 @@ void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std T r, kr, cos_phi; stim::complex h; + stim::complex Ew; for(size_t i = 0; i < N; i++){ stim::vec3 p; //declare a 3D point (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions (y == NULL) ? p[1] = 0 : p[1] = y[i]; (z == NULL) ? p[2] = 0 : p[2] = z[i]; + p = p - c; r = p.len(); if(r >= a){ for(size_t w = 0; w < W.size(); w++){ + Ew = W[w].E() * exp(stim::complex(0, W[w].kvec().dot(c))); kr = p.len() * W[w].kmag(); //calculate k*r stim::bessjyv_sph(Nl, kr, vm, j_kr, y_kr, dj_kr, dy_kr); cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle from the propagating direction @@ -362,18 +396,18 @@ void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std for(size_t l = 0; l <= Nl; l++){ h.r = j_kr[l]; h.i = y_kr[l]; - E[i] += W[w].E() * B[l] * h * P[l]; + E[i] += Ew * B[l] * h * P[l]; } } } } -#endif +//#endif } template -void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n, T r_spacing = 0.1){ +void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n, stim::vec3 c = stim::vec3(0, 0, 0), T r_spacing = 0.1){ std::vector< stim::scalarwave > W(1, w); - cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing); + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, c, r_spacing); } template @@ -389,14 +423,14 @@ __global__ void cuda_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* T r = p.len(); //calculate the distance from the sphere if(r >= a) return; //exit if the point is inside the sphere (we only calculate the internal field) - T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT size_t ij = (size_t) fij; //convert to an integral index T alpha = fij - ij; //calculate the fractional portion of the index - size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT - size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT T cos_phi; - T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials stim::complex jAl; stim::complex Ei = 0; //create a register to store the result @@ -415,7 +449,7 @@ __global__ void cuda_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* for(size_t w = 0; w < nW; w++){ //for each plane wave if(r == 0) cos_phi = 0; else - cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation Pl_1 = cos_phi; Ei += W[w].E() * jlAl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation @@ -468,7 +502,7 @@ void gpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, st /// @param a is the radius of the sphere /// @param n is the complex refractive index of the sphere template -void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > W, T a, stim::complex n, T r_spacing = 0.1){ +void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > W, T a, stim::complex n, stim::vec3 c = stim::vec3(0, 0, 0)){ //calculate the necessary number of orders required to represent the scattered field T k = W[0].kmag(); @@ -480,7 +514,7 @@ void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, st stim::complex* A = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients A_coefficients(A, a, k, n, Nl); -#ifdef CUDA_FOUND +/*#ifdef CUDA_FOUND stim::complex* dev_E; //allocate space for the field cudaMalloc(&dev_E, N * sizeof(stim::complex)); cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); @@ -538,6 +572,7 @@ void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, st printf ("CUBLAS Error: failed to calculate maximum r value.\n"); exit(1); } + cublasDestroy(handle); //destroy the CUBLAS handle i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility i_max--; @@ -585,9 +620,9 @@ void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, st HANDLE_ERROR( cudaFree(dev_E) ); HANDLE_ERROR( cudaFree(dev_W) ); HANDLE_ERROR( cudaFree(dev_r) ); - cudaFree(dev_E); + HANDLE_ERROR( cudaFree(dev_E) ); #else - +*/ //allocate space to store the bessel function call results double vm; stim::complex* j_knr = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); @@ -600,16 +635,19 @@ void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, st T r, cos_phi; stim::complex knr; stim::complex h; + stim::complex Ew; for(size_t i = 0; i < N; i++){ stim::vec3 p; //declare a 3D point (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions (y == NULL) ? p[1] = 0 : p[1] = y[i]; (z == NULL) ? p[2] = 0 : p[2] = z[i]; + p = p - c; r = p.len(); if(r < a){ E[i] = 0; for(size_t w = 0; w < W.size(); w++){ + Ew = W[w].E() * exp(stim::complex(0, W[w].kvec().dot(c))); knr = (stim::complex)n * p.len() * W[w].kmag(); //calculate k*n*r stim::cbessjyva_sph(Nl, knr, vm, j_knr, y_knr, dj_knr, dy_knr); @@ -620,18 +658,18 @@ void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, st stim::legendre(Nl, cos_phi, P); for(size_t l = 0; l <= Nl; l++){ - E[i] += W[w].E() * A[l] * (stim::complex)j_knr[l] * P[l]; + E[i] += Ew * A[l] * (stim::complex)j_knr[l] * P[l]; } } } } -#endif +//#endif } template -void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n, T r_spacing = 0.1){ +void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n, stim::vec3 c = stim::vec3(0, 0, 0)){ std::vector< stim::scalarwave > W(1, w); - cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, r_spacing); + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, c); } @@ -639,59 +677,89 @@ void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, st template class scalarmie { -private: +public: T radius; //radius of the scattering sphere stim::complex n; //refractive index of the scattering sphere + vec3 c; //position of the sphere in space public: - scalarmie(T r, stim::complex ri){ + scalarmie() { //default constructor + radius = 0.5; + n = stim::complex(1.4, 0.0); + c = stim::vec3(0, 0, 0); + } + + scalarmie(T r, stim::complex ri, stim::vec3 center = stim::vec3(0, 0, 0)){ radius = r; n = ri; + c = center; + //c = stim::vec3(2, 1, 0); } - void sum_scat(stim::scalarfield& E, T* X, T* Y, T* Z, stim::scalarbeam b, int samples = 1000){ - std::vector< stim::scalarwave > wave_array = b.mc(samples); //decompose the beam into an array of plane waves - stim::cpu_scalar_mie_scatter(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); - } + //void sum_scat(stim::scalarfield& E, T* X, T* Y, T* Z, stim::scalarbeam b, int samples = 1000){ + // std::vector< stim::scalarwave > wave_array = b.mc(samples); //decompose the beam into an array of plane waves + // stim::cpu_scalar_mie_scatter(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); + //} - void sum_intern(stim::scalarfield& E, T* X, T* Y, T* Z, stim::scalarbeam b, int samples = 1000){ - std::vector< stim::scalarwave > wave_array = b.mc(samples); //decompose the beam into an array of plane waves - stim::cpu_scalar_mie_internal(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); - } + //void sum_intern(stim::scalarfield& E, T* X, T* Y, T* Z, stim::scalarbeam b, int samples = 1000){ + // std::vector< stim::scalarwave > wave_array = b.mc(samples); //decompose the beam into an array of plane waves + // stim::cpu_scalar_mie_internal(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); + //} - void eval(stim::scalarfield& E, T* X, T* Y, T* Z, stim::scalarbeam b, int order = 500, int samples = 1000){ - b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion - std::vector< stim::scalarwave > wave_array = b.mc(samples); //decompose the beam into an array of plane waves - sum_scat(E, X, Y, Z, b, samples); - sum_intern(E, X, Y, Z, b, samples); - } + //void eval(stim::scalarfield& E, T* X, T* Y, T* Z, stim::scalarbeam b, int order = 500, int samples = 1000){ + // b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion + // std::vector< stim::scalarwave > wave_array = b.mc(samples); //decompose the beam into an array of plane waves + // sum_scat(E, X, Y, Z, b, samples); + // sum_intern(E, X, Y, Z, b, samples); + //} void eval(stim::scalarfield& E, stim::scalarbeam b, int order = 500, int samples = 1000){ - /*size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid - float* X = (float*) malloc( array_size ); //allocate space for the coordinate meshes - float* Y = (float*) malloc( array_size ); - float* Z = (float*) malloc( array_size ); - E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes - */ E.meshgrid(); b.eval(E, order); std::vector< stim::scalarwave > wave_array = b.mc(samples); //decompose the beam into an array of plane waves if(E.gpu()){ - stim::gpu_scalar_mie_scatter(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); + stim::gpu_scalar_mie_scatter(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c, E.spacing()); } else{ stim::cpu_scalar_mie_scatter(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); stim::cpu_scalar_mie_internal(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); } - //eval(E, X, Y, Z, b, order, samples); //evaluate the field } }; //end stim::scalarmie +template +class scalarcluster : public std::vector< scalarmie > { +public: + + void eval(stim::scalarfield& E, stim::scalarbeam b, int order = 500, int samples = 1000) { + E.meshgrid(); + b.eval(E, order); + + std::vector< stim::scalarwave > wave_array = b.mc(samples); //decompose the beam into an array of plane waves + + T radius; + stim::complex n; + stim::vec3 c; + for (size_t si = 0; si < size(); si++) { //for each sphere in the cluster + radius = at(si).radius; + n = at(si).n; + c = at(si).c; + if (E.gpu()) { + stim::gpu_scalar_mie_scatter(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c, E.spacing()); + } + else { + stim::cpu_scalar_mie_scatter(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c); + stim::cpu_scalar_mie_internal(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, c); + } + } + } +}; + } //end namespace stim #endif \ No newline at end of file diff --git a/stim/optics/scalarwave.h b/stim/optics/scalarwave.h index d993567..a6dd7ec 100644 --- a/stim/optics/scalarwave.h +++ b/stim/optics/scalarwave.h @@ -252,7 +252,7 @@ void gpu_scalarwaves(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scal size_t batch_size; //declare a variable to store the size of the current batch size_t waves_processed = 0; //initialize the number of waves processed to zero while(waves_processed < nW){ //while there are still waves to be processed - batch_size = min(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left + batch_size = std::min(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left batch_bytes = batch_size * sizeof(stim::scalarwave); HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory cuda_scalarwave<<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size); //call the kernel diff --git a/stim/parser/table.h b/stim/parser/table.h index 678f854..8d2ec0a 100644 --- a/stim/parser/table.h +++ b/stim/parser/table.h @@ -1,5 +1,5 @@ -#ifndef STIM_CSV_H -#define STIM_CSV_H +#ifndef STIM_TABLE_H +#define STIM_TABLE_H #include #include @@ -102,6 +102,27 @@ namespace stim{ return result; //return the casted table } + size_t rows(){ + return TABLE.size(); + } + + size_t cols(size_t ri){ + return TABLE[ri].size(); + } + + //returns the maximum number of columns in the table + size_t cols(){ + size_t max_c = 0; + for(size_t ri = 0; ri < rows(); ri++){ + max_c = std::max(max_c, cols(ri)); + } + return max_c; + } + + std::string operator()(size_t ri, size_t ci){ + return TABLE[ri][ci]; + } + }; }; -- libgit2 0.21.4