From 8309b07a16caaadfd4bc85bfbebdf7041a63f0e0 Mon Sep 17 00:00:00 2001 From: David Date: Wed, 15 Jun 2016 12:50:08 -0500 Subject: [PATCH] fixed some vec3 errors --- stim/math/complex.h | 1 + stim/math/plane_old.h | 179 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- stim/math/quad.h | 186 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ stim/math/rect.h | 47 ++++++++++++++++++++++------------------------- stim/math/vec3.h | 7 +++++++ stim/math/vector.h | 11 ++++++----- stim/optics/mie.h | 274 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------- stim/optics/scalarbeam.h | 317 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------------------------------------------------------------------------------------------------------------- 8 files changed, 429 insertions(+), 593 deletions(-) delete mode 100644 stim/math/plane_old.h delete mode 100644 stim/math/quad.h diff --git a/stim/math/complex.h b/stim/math/complex.h index e5d0a35..237b3ca 100644 --- a/stim/math/complex.h +++ b/stim/math/complex.h @@ -11,6 +11,7 @@ namespace stim { + enum complexComponentType {complexReal, complexImaginary, complexMag}; template struct complex diff --git a/stim/math/plane_old.h b/stim/math/plane_old.h deleted file mode 100644 index b009cad..0000000 --- a/stim/math/plane_old.h +++ /dev/null @@ -1,179 +0,0 @@ -#ifndef RTS_PLANE_H -#define RTS_PLANE_H - -#include -#include -#include "rts/cuda/callable.h" - - -namespace stim{ -template class plane; -} - -template -CUDA_CALLABLE stim::plane operator-(stim::plane v); - -namespace stim{ - -template -class plane{ - - //a plane is defined by a point and a normal - -private: - - vec P; //point on the plane - vec N; //plane normal - - CUDA_CALLABLE void init(){ - P = vec(0, 0, 0); - N = vec(0, 0, 1); - } - - -public: - - //default constructor - CUDA_CALLABLE plane(){ - init(); - } - - CUDA_CALLABLE plane(vec n, vec p = vec(0, 0, 0)){ - P = p; - N = n.norm(); - } - - CUDA_CALLABLE plane(T z_pos){ - init(); - P[2] = z_pos; - } - - //create a plane from three points (a triangle) - CUDA_CALLABLE plane(vec a, vec b, vec c){ - P = c; - N = (c - a).cross(b - a); - if(N.len() == 0) //handle the degenerate case when two vectors are the same, N = 0 - N = 0; - else - N = N.norm(); - } - - template< typename U > - CUDA_CALLABLE operator plane(){ - - plane result(N, P); - return result; - } - - CUDA_CALLABLE vec norm(){ - return N; - } - - CUDA_CALLABLE vec p(){ - return P; - } - - //flip the plane front-to-back - CUDA_CALLABLE plane flip(){ - plane result = *this; - result.N = -result.N; - return result; - } - - //determines how a vector v intersects the plane (1 = intersects front, 0 = within plane, -1 = intersects back) - CUDA_CALLABLE int face(vec v){ - - T dprod = v.dot(N); //get the dot product between v and N - - //conditional returns the appropriate value - if(dprod < 0) - return 1; - else if(dprod > 0) - return -1; - else - return 0; - } - - //determine on which side of the plane a point lies (1 = front, 0 = on the plane, -1 = back) - CUDA_CALLABLE int side(vec p){ - - vec v = p - P; //get the vector from P to the query point p - - return face(v); - } - - //compute the component of v that is perpendicular to the plane - CUDA_CALLABLE vec perpendicular(vec v){ - return N * v.dot(N); - } - - //compute the projection of v in the plane - CUDA_CALLABLE vec parallel(vec v){ - return v - perpendicular(v); - } - - CUDA_CALLABLE void decompose(vec v, vec& para, vec& perp){ - perp = N * v.dot(N); - para = v - perp; - } - - //get both the parallel and perpendicular components of a vector v w.r.t. the plane - CUDA_CALLABLE void project(vec v, vec &v_par, vec &v_perp){ - - v_perp = v.dot(N); - v_par = v - v_perp; - } - - //compute the reflection of v off of the plane - CUDA_CALLABLE vec reflect(vec v){ - - //compute the reflection using N_prime as the plane normal - vec par = parallel(v); - vec r = (-v) + par * 2; - - /*std::cout<<"----------------REFLECT-----------------------------"< operator-() - { - rts::plane p = *this; - - //negate the normal vector - p.N = -p.N; - - return p; - } - - //output a string - std::string str(){ - std::stringstream ss; - ss<<"P: "< operator- <> (rts::plane v); - - - -}; - -} - -//arithmetic operators - -//negative operator flips the plane (front to back) -//template - - - - -#endif diff --git a/stim/math/quad.h b/stim/math/quad.h deleted file mode 100644 index c983008..0000000 --- a/stim/math/quad.h +++ /dev/null @@ -1,186 +0,0 @@ -#ifndef RTS_QUAD_H -#define RTS_QUAD_H - -//enable CUDA_CALLABLE macro -#include -#include -#include -#include -#include -#include -#include - -namespace stim{ - -//template for a quadangle class in ND space -template -struct quad -{ - /* - B------------------>C - ^ ^ - | | - Y | - | | - | | - A---------X-------->O - */ - - /*T A[N]; - T B[N]; - T C[N];*/ - - rts::vec A; - rts::vec X; - rts::vec Y; - - - CUDA_CALLABLE quad() - { - - } - - CUDA_CALLABLE quad(vec a, vec b, vec c) - { - - A = a; - Y = b - a; - X = c - a - Y; - - } - - /******************************************************************* - Constructor - create a quad from a position, normal, and rotation - *******************************************************************/ - CUDA_CALLABLE quad(rts::vec c, rts::vec normal, T width, T height, T theta) - { - - //compute the X direction - start along world-space X - Y = rts::vec(0, 1, 0); - if(Y == normal) - Y = rts::vec(0, 0, 1); - - X = Y.cross(normal).norm(); - - std::cout< q; - q.CreateRotation(theta, normal); - X = q.toMatrix3() * X; - Y = normal.cross(X); - - //normalize everything - X = X.norm(); - Y = Y.norm(); - - //scale to match the quad width and height - X = X * width; - Y = Y * height; - - //set the corner of the plane - A = c - X * 0.5f - Y * 0.5f; - - std::cout< & rhs) - { - if(A == rhs.A && X == rhs.X && Y == rhs.Y) - return true; - else - return false; - } - - /******************************************* - Return the normal for the quad - *******************************************/ - CUDA_CALLABLE rts::vec n() - { - return (X.cross(Y)).norm(); - } - - CUDA_CALLABLE rts::vec p(T a, T b) - { - rts::vec result; - //given the two parameters a, b = [0 1], returns the position in world space - result = A + X * a + Y * b; - - return result; - } - - CUDA_CALLABLE rts::vec operator()(T a, T b) - { - return p(a, b); - } - - std::string str() - { - std::stringstream ss; - - ss<"<<"C="<"<<"D="< operator*(T rhs) - { - //scales the plane by a scalar value - - //compute the center point - rts::vec c = A + X*0.5f + Y*0.5f; - - //create the new quadangle - quad result; - result.X = X * rhs; - result.Y = Y * rhs; - result.A = c - result.X*0.5f - result.Y*0.5f; - - return result; - - } - - CUDA_CALLABLE T dist(vec p) - { - //compute the distance between a point and this quad - - //first break the quad up into two triangles - triangle T0(A, A+X, A+Y); - triangle T1(A+X+Y, A+X, A+Y); - - - T d0 = T0.dist(p); - T d1 = T1.dist(p); - - if(d0 < d1) - return d0; - else - return d1; - } - - CUDA_CALLABLE T dist_max(vec p) - { - T da = (A - p).len(); - T db = (A+X - p).len(); - T dc = (A+Y - p).len(); - T dd = (A+X+Y - p).len(); - - return std::max( da, std::max(db, std::max(dc, dd) ) ); - } -}; - -} //end namespace rts - -template -std::ostream& operator<<(std::ostream& os, rts::quad R) -{ - os< O---------X---------> */ -private: - - stim::vec X; - stim::vec Y; - - +protected: + stim::vec3 X; + stim::vec3 Y; public: @@ -65,7 +62,7 @@ public: ///create a rectangle from a center point, normal ///@param c: x,y,z location of the center. ///@param n: x,y,z direction of the normal. - CUDA_CALLABLE rect(vec c, vec n = vec(0, 0, 1)) + CUDA_CALLABLE rect(vec3 c, vec3 n = vec3(0, 0, 1)) : plane() { init(); //start with the default setting @@ -76,7 +73,7 @@ public: ///@param c: x,y,z location of the center. ///@param s: size of the rectangle. ///@param n: x,y,z direction of the normal. - CUDA_CALLABLE rect(vec c, T s, vec n = vec(0, 0, 1)) + CUDA_CALLABLE rect(vec3 c, T s, vec3 n = vec3(0, 0, 1)) : plane() { init(); //start with the default setting @@ -89,7 +86,7 @@ public: ///@param center: x,y,z location of the center. ///@param directionX: u,v,w direction of the X vector. ///@param directionY: u,v,w direction of the Y vector. - CUDA_CALLABLE rect(vec center, vec directionX, vec directionY ) + CUDA_CALLABLE rect(vec3 center, vec3 directionX, vec3 directionY ) : plane((directionX.cross(directionY)).norm(),center) { X = directionX; @@ -101,7 +98,7 @@ public: ///@param center: x,y,z location of the center. ///@param directionX: u,v,w direction of the X vector. ///@param directionY: u,v,w direction of the Y vector. - CUDA_CALLABLE rect(T size, vec center, vec directionX, vec directionY ) + CUDA_CALLABLE rect(T size, vec3 center, vec3 directionX, vec3 directionY ) : plane((directionX.cross(directionY)).norm(),center) { X = directionX; @@ -114,7 +111,7 @@ public: ///@param center: x,y,z location of the center. ///@param directionX: u,v,w direction of the X vector. ///@param directionY: u,v,w direction of the Y vector. - CUDA_CALLABLE rect(vec size, vec center, vec directionX, vec directionY) + CUDA_CALLABLE rect(vec3 size, vec3 center, vec3 directionX, vec3 directionY) : plane((directionX.cross(directionY)).norm(), center) { X = directionX; @@ -138,7 +135,7 @@ public: ///@param n; vector with the normal. ///Orients the rectangle along the normal n. - CUDA_CALLABLE void normal(vec n) + CUDA_CALLABLE void normal(vec3 n) { //orient the rectangle along the specified normal rotate(n, X, Y); @@ -147,8 +144,8 @@ public: ///general init method that sets a general rectangle. CUDA_CALLABLE void init() { - X = vec(1, 0, 0); - Y = vec(0, 1, 0); + X = vec3(1, 0, 0); + Y = vec3(0, 1, 0); } //boolean comparison @@ -162,18 +159,18 @@ public: //get the world space value given the planar coordinates a, b in [0, 1] - CUDA_CALLABLE stim::vec p(T a, T b) + CUDA_CALLABLE stim::vec3 p(T a, T b) { - stim::vec result; + stim::vec3 result; //given the two parameters a, b = [0 1], returns the position in world space - vec A = this->P - X * (T)0.5 - Y * (T)0.5; + vec3 A = this->P - X * (T)0.5 - Y * (T)0.5; result = A + X * a + Y * b; return result; } //parenthesis operator returns the world space given rectangular coordinates a and b in [0 1] - CUDA_CALLABLE stim::vec operator()(T a, T b) + CUDA_CALLABLE stim::vec3 operator()(T a, T b) { return p(a, b); } @@ -181,12 +178,12 @@ public: std::string str() { std::stringstream ss; - vec A = P - X * (T)0.5 - Y * (T)0.5; + vec3 A = P - X * (T)0.5 - Y * (T)0.5; ss<"<<"C="<"<<"D="< p) + CUDA_CALLABLE T dist(vec3 p) { //compute the distance between a point and this rect - vec A = P - X * (T)0.5 - Y * (T)0.5; + vec3 A = P - X * (T)0.5 - Y * (T)0.5; //first break the rect up into two triangles triangle T0(A, A+X, A+Y); @@ -225,16 +222,16 @@ public: return d1; } - CUDA_CALLABLE T center(vec p) + CUDA_CALLABLE T center(vec3 p) { this->P = p; } ///Returns the maximum distance of the rectangle from a point p to the sides of the rectangle. ///@param p: x, y, z point. - CUDA_CALLABLE T dist_max(vec p) + CUDA_CALLABLE T dist_max(vec3 p) { - vec A = P - X * (T)0.5 - Y * (T)0.5; + vec3 A = P - X * (T)0.5 - Y * (T)0.5; T da = (A - p).len(); T db = (A+X - p).len(); T dc = (A+Y - p).len(); diff --git a/stim/math/vec3.h b/stim/math/vec3.h index 892a370..f5cedd2 100644 --- a/stim/math/vec3.h +++ b/stim/math/vec3.h @@ -242,4 +242,11 @@ stim::vec3 operator*(T lhs, stim::vec3 rhs){ return rhs * lhs; } +//stream operator +template +std::ostream& operator<<(std::ostream& os, stim::vec3 const& rhs){ + os< #include #include - + #include +#include namespace stim { @@ -70,8 +71,8 @@ struct vec : public std::vector size_t N = other.size(); resize(N); //resize the current vector to match the copy for(size_t i=0; i } /// Cast to a vec3 - operator vec3(){ - vec3 r; + operator stim::vec3(){ + stim::vec3 r; size_t N = std::min(size(), 3); for(size_t i = 0; i < N; i++) r[i] = at(i); diff --git a/stim/optics/mie.h b/stim/optics/mie.h index 9955205..12d208e 100644 --- a/stim/optics/mie.h +++ b/stim/optics/mie.h @@ -1,5 +1,6 @@ #ifndef STIM_MIE_H #define STIM_MIE_H +#include #include "scalarwave.h" #include "../math/bessel.h" @@ -43,7 +44,6 @@ void B_coefficients(stim::complex* B, T a, T k, stim::complex n, int Nl){ numerator = j_ka[l] * dj_kna[l] * (stim::complex)n - j_kna[l] * dj_ka[l]; denominator = j_kna[l] * dh_ka - h_ka * dj_kna[l] * (stim::complex)n; B[l] = (2 * l + 1) * pow(i, l) * numerator / denominator; - std::cout<* A, T a, T k, stim::complex n, int Nl){ #define LOCAL_NL 16 template -__global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* hB, T kr_min, T dkr, size_t N_hB, int Nl){ +__global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* hB, T r_min, T dr, size_t N_hB, int Nl){ extern __shared__ stim::complex shared_hB[]; //declare the list of waves in shared memory size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array @@ -96,14 +96,11 @@ __global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* T r = p.len(); //calculate the distance from the sphere if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) - T k = W[0].kmag(); - size_t NC = Nl + 1; //calculate the number of coefficients to be used - T kr = r * k; //calculate the thread value for k*r - T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT size_t ij = (size_t) fij; //convert to an integral index T alpha = fij - ij; //calculate the fractional portion of the index - size_t n0j = ij * (NC); //start of the first entry in the LUT - size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT T cos_phi; T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials @@ -112,37 +109,36 @@ __global__ void cuda_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* stim::complex Ei = 0; //create a register to store the result int l; - stim::complex hlBl[LOCAL_NL+1]; - int shared_start = threadIdx.x * (Nl - LOCAL_NL); + stim::complex hlBl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed + int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops - #pragma unroll LOCAL_NL+1 + #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers for(l = 0; l <= LOCAL_NL; l++) hlBl[l] = clerp( hB[n0j + l], hB[n1j + l], alpha ); - for(l = LOCAL_NL+1; l <= Nl; l++) + for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp( hB[n0j + l], hB[n1j + l], alpha ); - for(size_t w = 0; w < nW; w++){ + for(size_t w = 0; w < nW; w++){ //for each plane wave cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere - Pl_2 = 1; + Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation Pl_1 = cos_phi; - Ei += W[w].E() * hlBl[0] * Pl_2; + Ei += W[w].E() * hlBl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation Ei += W[w].E() * hlBl[1] * Pl_1; - #pragma unroll LOCAL_NL-1 + #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) for(l = 2; l <= LOCAL_NL; l++){ - Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1); - Ei += W[w].E() * hlBl[l] * Pl; + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) + Ei += W[w].E() * hlBl[l] * Pl; //calculate and sum the current field order Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 Pl_1 = Pl; } - for(l = LOCAL_NL+1; l <= Nl; l++){ - Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1); - Ei += W[w].E() * shared_hB[shared_start + (l - (LOCAL_NL+1))] * Pl; - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 - Pl_1 = Pl; - + for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs + Ei += W[w].E() * shared_hB[shared_start + l - LOCAL_NL - 1] * Pl; + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; } } E[i] += Ei; //copy the result to device memory @@ -152,10 +148,10 @@ template void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ size_t max_shared_mem = stim::sharedMemPerBlock(); - int hBl_array = sizeof(stim::complex) * (Nl + 1); + size_t hBl_array = sizeof(stim::complex) * (Nl + 1); std::cout<<"hl*Bl array size: "<* E, size_t N, T* x, T* y, T* z, sti else shared_mem = threads * sizeof(stim::complex) * (Nl - LOCAL_NL); //amount of shared memory to allocate std::cout<<"shared memory allocated: "<<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel - } template @@ -261,16 +256,19 @@ void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std exit(1); } + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility + i_max--; T r_min, r_max; //allocate space to store the minimum and maximum values HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); + r_min = max(r_min, a); //if the radius of the sphere is larger than r_min, change r_min to a (the scattered field doesn't exist inside the sphere) //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); - T kr_min = k * r_min; - T kr_max = k * r_max; + //T kr_min = k * r_min; + //T kr_max = k * r_max; //temporary variables double vm; //allocate space to store the return values for the bessel function calculation @@ -281,27 +279,29 @@ void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std size_t hB_bytes = sizeof(stim::complex) * (Nl+1) * N_hB_lut; stim::complex* hB_lut = (stim::complex*) malloc(hB_bytes); //pointer to the look-up table - T dkr = (kr_max - kr_min) / (N_hB_lut-1); //distance between values in the LUT + T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT std::cout<<"LUT jl bytes: "< hl; - for(size_t kri = 0; kri < N_hB_lut; kri++){ //for each value in the LUT - stim::bessjyv_sph(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] + for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT + stim::bessjyv_sph(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] for(size_t l = 0; l <= Nl; l++){ //for each order hl.r = (T)jv[l]; hl.i = (T)yv[l]; - hB_lut[kri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result + hB_lut[ri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result + //std::cout<(hankel_lut, "hankel.bmp", Nl+1, Nlut_j, stim::cmBrewer); + T* real_lut = (T*) malloc(hB_bytes/2); + stim::real(real_lut, hB_lut, N_hB_lut); + stim::cpu2image(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); //Allocate device memory and copy everything to the GPU stim::complex* dev_hB_lut; HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); - gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, kr_min, dkr, N_hB_lut, Nl); + gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); cudaMemcpy(E, dev_E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory @@ -349,9 +349,90 @@ void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std } template -void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n){ +void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n, T r_spacing = 0.1){ std::vector< stim::scalarwave > W(1, w); - cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n); + cpu_scalar_mie_scatter(E, N, x, y, z, W, a, n, r_spacing); +} + +template +__global__ void cuda_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* jA, T r_min, T dr, size_t N_jA, int Nl){ + extern __shared__ stim::complex shared_jA[]; //declare the list of waves in shared memory + + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array + if(i >= N) return; //exit if this thread is outside the array + stim::vec3 p; + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions + (y == NULL) ? p[1] = 0 : p[1] = y[i]; + (z == NULL) ? p[2] = 0 : p[2] = z[i]; + + T r = p.len(); //calculate the distance from the sphere + if(r > a) return; //exit if the point is inside the sphere (we only calculate the internal field) + T fij = (r - r_min)/dr; //FP index into the spherical bessel LUT + size_t ij = (size_t) fij; //convert to an integral index + T alpha = fij - ij; //calculate the fractional portion of the index + size_t n0j = ij * (Nl + 1); //start of the first entry in the LUT + size_t n1j = (ij+1) * (Nl + 1); //start of the second entry in the LUT + + T cos_phi; + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials + + stim::complex jAl; + stim::complex Ei = 0; //create a register to store the result + int l; + + stim::complex jlAl[LOCAL_NL+1]; //the first LOCAL_NL components are stored in registers for speed + int shared_start = threadIdx.x * (Nl - LOCAL_NL); //wrap up some operations so that they aren't done in the main loops + + #pragma unroll LOCAL_NL+1 //copy the first LOCAL_NL+1 h_l * B_l components to registers + for(l = 0; l <= LOCAL_NL; l++) + jlAl[l] = clerp( jA[n0j + l], jA[n1j + l], alpha ); + + for(l = LOCAL_NL+1; l <= Nl; l++) //copy any additional h_l * B_l components to shared memory + shared_jA[shared_start + (l - (LOCAL_NL+1))] = clerp( jA[n0j + l], jA[n1j + l], alpha ); + + for(size_t w = 0; w < nW; w++){ //for each plane wave + if(r == 0) cos_phi = 0; + else + cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere + Pl_2 = 1; //the Legendre polynomials will be calculated recursively, initialize the first two steps of the recursive relation + Pl_1 = cos_phi; + Ei += W[w].E() * jlAl[0] * Pl_2; //unroll the first two orders using the initial steps of the Legendre recursive relation + Ei += W[w].E() * jlAl[1] * Pl_1; + + #pragma unroll LOCAL_NL-1 //unroll the next LOCAL_NL-1 loops for speed (iterating through the components in the register file) + for(l = 2; l <= LOCAL_NL; l++){ + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //calculate the next step in the Legendre polynomial recursive relation (this is where most of the computation occurs) + Ei += W[w].E() * jlAl[l] * Pl; //calculate and sum the current field order + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; + } + + for(l = LOCAL_NL+1; l <= Nl; l++){ //do the same as above, except for any additional orders that are stored in shared memory (not registers) + Pl = ( (2 * (l-1) + 1) * cos_phi * Pl_1 - (l-1) * Pl_2 ) / (l); //again, this is where most computation in the kernel occurs + Ei += W[w].E() * shared_jA[shared_start + l - LOCAL_NL - 1] * Pl; + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 + Pl_1 = Pl; + } + } + E[i] = Ei; //copy the result to device memory +} + +template +void gpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t nW, T a, stim::complex n, stim::complex* jA, T r_min, T dr, size_t N_jA, size_t Nl){ + + size_t max_shared_mem = stim::sharedMemPerBlock(); + size_t hBl_array = sizeof(stim::complex) * (Nl + 1); + std::cout<<"hl*Bl array size: "<) * (Nl - LOCAL_NL); //amount of shared memory to allocate + std::cout<<"shared memory allocated: "<<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel } /// Calculate the scalar Mie solution for the internal field produced by a single plane wave scattered by a sphere @@ -365,18 +446,122 @@ void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, sti /// @param a is the radius of the sphere /// @param n is the complex refractive index of the sphere template -void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > W, T a, stim::complex n){ - - //calculate the necessary number of orders required to represent the scattered field +void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > W, T a, stim::complex n, T r_spacing = 0.1){ +//calculate the necessary number of orders required to represent the scattered field T k = W[0].kmag(); - size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) std::cout<<"Nl: "<* A = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients A_coefficients(A, a, k, n, Nl); +#ifdef CUDA_FOUND + stim::complex* dev_E; //allocate space for the field + cudaMalloc(&dev_E, N * sizeof(stim::complex)); + cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); + //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) + + // COORDINATES + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) + if(x != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) + if(y != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) + if(z != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); + } + + // PLANE WAVES + stim::scalarwave* dev_W; //allocate space and copy plane waves + HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); + HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave) * W.size(), cudaMemcpyHostToDevice) ); + + // BESSEL FUNCTION LOOK-UP TABLE + //calculate the distance from the sphere center + T* dev_r; + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); + + int threads = stim::maxThreadsPerBlock(); + dim3 blocks((unsigned)(N / threads + 1)); + cuda_dist <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); + + //Find the minimum and maximum values of r + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); //create a cuBLAS handle + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS initialization failed\n"); + exit(1); + } + + int i_min, i_max; + stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate minimum r value.\n"); + exit(1); + } + stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate maximum r value.\n"); + exit(1); + } + + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility + i_max--; + T r_min, r_max; //allocate space to store the minimum and maximum values + HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU + HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); + + r_max = min(r_max, a); //the internal field doesn't exist outside of the sphere + + size_t N_jA_lut = (size_t)((r_max - r_min) / r_spacing + 1); + + //temporary variables + double vm; //allocate space to store the return values for the bessel function calculation + stim::complex* jv = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* yv = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* djv= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + stim::complex* dyv= (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); + + size_t jA_bytes = sizeof(stim::complex) * (Nl+1) * N_jA_lut; + stim::complex* jA_lut = (stim::complex*) malloc(jA_bytes); //pointer to the look-up table + T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT + std::cout<<"LUT jl bytes: "< hl; + stim::complex nd = (stim::complex)n; + for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT + stim::cbessjyva_sph(Nl, nd * k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] + for(size_t l = 0; l <= Nl; l++){ //for each order + jA_lut[ri * (Nl + 1) + l] = (stim::complex)(jv[l] * (stim::complex)A[l]); //store the bessel function result + } + } + + //Allocate device memory and copy everything to the GPU + stim::complex* dev_jA_lut; + HANDLE_ERROR( cudaMalloc(&dev_jA_lut, jA_bytes) ); + HANDLE_ERROR( cudaMemcpy(dev_jA_lut, jA_lut, jA_bytes, cudaMemcpyHostToDevice) ); + + gpu_scalar_mie_internal(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_jA_lut, r_min, dr, N_jA_lut, Nl); + + cudaMemcpy(E, dev_E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory + + if(x != NULL) cudaFree(dev_x); //free everything + if(y != NULL) cudaFree(dev_y); + if(z != NULL) cudaFree(dev_z); + cudaFree(dev_E); +#else + //allocate space to store the bessel function call results double vm; stim::complex* j_knr = (stim::complex*) malloc( (Nl + 1) * sizeof(stim::complex) ); @@ -414,12 +599,13 @@ void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, st } } } +#endif } template -void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n){ +void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, stim::scalarwave w, T a, stim::complex n, T r_spacing = 0.1){ std::vector< stim::scalarwave > W(1, w); - cpu_scalar_mie_internal(E, N, x, y, z, W, a, n); + cpu_scalar_mie_internal(E, N, x, y, z, W, a, n, r_spacing); } } diff --git a/stim/optics/scalarbeam.h b/stim/optics/scalarbeam.h index 93881eb..800f4a3 100644 --- a/stim/optics/scalarbeam.h +++ b/stim/optics/scalarbeam.h @@ -1,5 +1,6 @@ #ifndef RTS_BEAM #define RTS_BEAM +#include #include "../math/vec3.h" #include "../optics/scalarwave.h" @@ -7,150 +8,68 @@ #include "../math/legendre.h" #include "../cuda/cudatools/devices.h" #include "../cuda/cudatools/timer.h" +#include "../optics/scalarfield.h" #include #include #include #include -namespace stim{ - /// Function returns the value of the scalar field produced by a beam with the specified parameters - - template - std::vector< stim::vec3 > generate_focusing_vectors(size_t N, stim::vec3 d, T NA, T NA_in = 0){ - - std::vector< stim::vec3 > dirs(N); //allocate an array to store the focusing vectors - - ///compute the rotation operator to transform (0, 0, 1) to k - T cos_angle = d.dot(vec3(0, 0, 1)); - stim::matrix rotation; - - //if the cosine of the angle is -1, the rotation is just a flip across the z axis - if(cos_angle == -1){ - rotation(2, 2) = -1; - } - else if(cos_angle != 1.0) - { - vec3 r_axis = vec3(0, 0, 1).cross(d).norm(); //compute the axis of rotation - T angle = acos(cos_angle); //compute the angle of rotation - quaternion quat; //create a quaternion describing the rotation - quat.CreateRotation(angle, r_axis); - rotation = quat.toMatrix3(); //compute the rotation matrix - } - - //find the phi values associated with the cassegrain ring - T PHI[2]; - PHI[0] = (T)asin(NA); - PHI[1] = (T)asin(NA_in); - - //calculate the z-axis cylinder coordinates associated with these angles - T Z[2]; - Z[0] = cos(PHI[0]); - Z[1] = cos(PHI[1]); - T range = Z[0] - Z[1]; - - //draw a distribution of random phi, z values - T z, phi, theta; - //T kmag = stim::TAU / lambda; - for(int i=0; i spherical(1, theta, phi); //convert from spherical to cartesian coordinates - vec3 cart = spherical.sph2cart(); - dirs[i] = rotation * cart; //create a sample vector - } - return dirs; - } - -/// Class stim::beam represents a beam of light focused at a point and composed of several plane waves -template -class scalarbeam -{ -public: - //enum beam_type {Uniform, Bartlett, Hamming, Hanning}; - -private: - - T NA[2]; //numerical aperature of the focusing optics - vec3 f; //focal point - vec3 d; //propagation direction - stim::complex A; //beam amplitude - T lambda; //beam wavelength -public: - ///constructor: build a default beam (NA=1.0) - scalarbeam(T wavelength = 1, stim::complex amplitude = 1, vec3 focal_point = vec3(0, 0, 0), vec3 direction = vec3(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){ - lambda = wavelength; - A = amplitude; - f = focal_point; - d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on) - NA[0] = numerical_aperture; - NA[1] = center_obsc; - } +namespace stim{ - ///Numerical Aperature functions - void setNA(T na) - { - NA[0] = (T)0; - NA[1] = na; - } - void setNA(T na0, T na1) - { - NA[0] = na0; - NA[1] = na1; - } +/// Function returns the value of the scalar field produced by a beam with the specified parameters - //Monte-Carlo decomposition into plane waves - std::vector< scalarwave > mc(size_t N = 100000) const{ +template +std::vector< stim::vec3 > generate_focusing_vectors(size_t N, stim::vec3 d, T NA, T NA_in = 0){ - std::vector< stim::vec3 > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus - std::vector< scalarwave > samples(N); //create a vector of plane waves - T kmag = (T)stim::TAU / lambda; //calculate the wavenumber - stim::complex apw; //allocate space for the amplitude at the focal point - T a = (T)(stim::TAU * (1 - cos(asin(NA[0]))) / (double)N); - stim::vec3 kpw; //declare the new k-vector based on the focused plane wave direction - for(size_t i=0; i(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave - samples[i] = scalarwave(kpw, apw); //create a plane wave based on the direction - } + std::vector< stim::vec3 > dirs(N); //allocate an array to store the focusing vectors - return samples; - } + ///compute the rotation operator to transform (0, 0, 1) to k + T cos_angle = d.dot(vec3(0, 0, 1)); + stim::matrix rotation; - /// Calculate the field at a given point - /// @param x is the x-coordinate of the field point - /// @O is the approximation accuracy - stim::complex field(T x, T y, T z, size_t O){ - std::vector< scalarwave > W = mc(O); - T result = 0; //initialize the result to zero (0) - for(size_t i = 0; i < O; i++){ //for each plane wave - result += W[i].pos(x, y, z); - } - return result; + //if the cosine of the angle is -1, the rotation is just a flip across the z axis + if(cos_angle == -1){ + rotation(2, 2) = -1; } - - std::string str() + else if(cos_angle != 1.0) { - std::stringstream ss; - ss<<"Beam:"< r_axis = vec3(0, 0, 1).cross(d).norm(); //compute the axis of rotation + T angle = acos(cos_angle); //compute the angle of rotation + quaternion quat; //create a quaternion describing the rotation + quat.CreateRotation(angle, r_axis); + rotation = quat.toMatrix3(); //compute the rotation matrix } + //find the phi values associated with the cassegrain ring + T PHI[2]; + PHI[0] = (T)asin(NA); + PHI[1] = (T)asin(NA_in); + + //calculate the z-axis cylinder coordinates associated with these angles + T Z[2]; + Z[0] = cos(PHI[0]); + Z[1] = cos(PHI[1]); + T range = Z[0] - Z[1]; + + //draw a distribution of random phi, z values + T z, phi, theta; + //T kmag = stim::TAU / lambda; + for(int i=0; i spherical(1, theta, phi); //convert from spherical to cartesian coordinates + vec3 cart = spherical.sph2cart(); + dirs[i] = rotation * cart; //create a sample vector + } + return dirs; +} - -}; //end beam - + /// Calculate the [0 Nl] terms for the aperture integral based on the give numerical aperture and center obscuration (optional) /// @param C is a pointer to Nl + 1 values where the terms will be stored template @@ -210,20 +129,19 @@ CUDA_CALLABLE T lerp(T v0, T v1, T t) { return fma(t, v1, fma(-t, v0, v0)); } -#ifdef __CUDACC__ +#ifdef CUDA_FOUND template -__global__ void cuda_scalar_psf(stim::complex* E, size_t N, T* r, T* phi, T k, T A, size_t Nl, +__global__ void cuda_scalar_psf(stim::complex* E, size_t N, T* r, T* phi, T A, size_t Nl, T* C, - T* lut_j, size_t Nj, T min_kr, T dkr){ + T* lut_j, size_t Nj, T min_r, T dr){ size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array if(i >= N) return; //exit if this thread is outside the array T cos_phi = cos(phi[i]); //calculate the thread value for cos(phi) - T kr = r[i] * k; //calculate the thread value for k*r stim::complex Ei = 0; //initialize the value of the field to zero size_t NC = Nl + 1; //calculate the number of coefficients to be used - T fij = (kr - min_kr)/dkr; //FP index into the spherical bessel LUT + T fij = (r[i] - min_r)/dr; //FP index into the spherical bessel LUT size_t ij = (size_t) fij; //convert to an integral index T a = fij - ij; //calculate the fractional portion of the index size_t n0j = ij * (NC); //start of the first entry in the LUT @@ -276,6 +194,8 @@ void gpu_scalar_psf_local(stim::complex* E, size_t N, T* r, T* phi, T lambda, exit(1); } + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility + i_max--; T r_min, r_max; //allocate space to store the minimum and maximum values HANDLE_ERROR( cudaMemcpy(&r_min, r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU HANDLE_ERROR( cudaMemcpy(&r_max, r + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); @@ -287,29 +207,19 @@ void gpu_scalar_psf_local(stim::complex* E, size_t N, T* r, T* phi, T lambda, size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r - T kr_min = k * r_min; - T kr_max = k * r_max; - - //temporary variables - double vm; //allocate space to store the return values for the bessel function calculation - double* jv = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* yv = (double*) malloc( (Nl + 1) * sizeof(double) ); - double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); - double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j; - T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table - T delta_kr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT - std::cout<<"LUT jl bytes: "<(Nl, kr_min + kri * delta_kr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] + T* j_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table + T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT + T jl; + for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT for(size_t l = 0; l <= Nl; l++){ //for each order - bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result + jl = boost::math::sph_bessel(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function + j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result } } - stim::cpu2image(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); - + stim::cpu2image(j_lut, "j_lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); //Allocate device memory and copy everything to the GPU T* gpu_C; @@ -317,12 +227,12 @@ void gpu_scalar_psf_local(stim::complex* E, size_t N, T* r, T* phi, T lambda, HANDLE_ERROR( cudaMemcpy(gpu_C, C, C_bytes, cudaMemcpyHostToDevice) ); T* gpu_j_lut; HANDLE_ERROR( cudaMalloc(&gpu_j_lut, lutj_bytes) ); - HANDLE_ERROR( cudaMemcpy(gpu_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) ); + HANDLE_ERROR( cudaMemcpy(gpu_j_lut, j_lut, lutj_bytes, cudaMemcpyHostToDevice) ); int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device dim3 blocks( (unsigned)(N / threads + 1)); //calculate the optimal number of blocks - cuda_scalar_psf<<< blocks, threads >>>(E, N, r, phi, (T)stim::TAU/lambda, A, Nl, gpu_C, gpu_j_lut, Nlut_j, kr_min, delta_kr); + cuda_scalar_psf<<< blocks, threads >>>(E, N, r, phi, A, Nl, gpu_C, gpu_j_lut, Nlut_j, r_min, dr); //free the LUT and condenser tables HANDLE_ERROR( cudaFree(gpu_C) ); @@ -392,7 +302,7 @@ __global__ void cuda_cart2psf(T* r, T* phi, size_t N, T* x, T* y, T* z, stim::ve phi[i] = ps[2]; //phi = [0 pi] } -#ifdef __CUDACC__ +#ifdef CUDA_FOUND /// Calculate the analytical solution to a point spread function given a set of points in cartesian coordinates template void gpu_scalar_psf_cart(stim::complex* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3 f, stim::vec3 d, T NA, T NA_in, int Nl, T r_spacing = 1){ @@ -419,7 +329,7 @@ template void cpu_scalar_psf_cart(stim::complex* E, size_t N, T* x, T* y, T* z, T lambda, T A, stim::vec3 f, stim::vec3 d, T NA, T NA_in, int Nl, T r_spacing = 1){ // If CUDA is available, copy the cartesian points to the GPU and evaluate them in a kernel -#ifdef __CUDACC__ +#ifdef CUDA_FOUND T* gpu_x = NULL; if(x != NULL){ @@ -470,13 +380,112 @@ void cpu_scalar_psf_cart(stim::complex* E, size_t N, T* x, T* y, T* z, T lamb phi[i] = ps[2]; //phi = [0 pi] } - cpu_scalar_psf_local(F, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function + cpu_scalar_psf_local(E, N, r, phi, lambda, A, NA, NA_in, Nl); //call the spherical coordinate CPU function free(r); free(phi); #endif } + +/// Class stim::beam represents a beam of light focused at a point and composed of several plane waves +template +class scalarbeam +{ +public: + //enum beam_type {Uniform, Bartlett, Hamming, Hanning}; + +private: + + T NA[2]; //numerical aperature of the focusing optics + vec3 f; //focal point + vec3 d; //propagation direction + T A; //beam amplitude + T lambda; //beam wavelength +public: + ///constructor: build a default beam (NA=1.0) + scalarbeam(T wavelength = 1, T amplitude = 1, vec3 focal_point = vec3(0, 0, 0), vec3 direction = vec3(0, 0, 1), T numerical_aperture = 1, T center_obsc = 0){ + lambda = wavelength; + A = amplitude; + f = focal_point; + d = direction.norm(); //make sure that the direction vector is normalized (makes calculations more efficient later on) + NA[0] = numerical_aperture; + NA[1] = center_obsc; + } + + ///Numerical Aperature functions + void setNA(T na) + { + NA[0] = (T)0; + NA[1] = na; + } + void setNA(T na0, T na1) + { + NA[0] = na0; + NA[1] = na1; + } + + //Monte-Carlo decomposition into plane waves + std::vector< scalarwave > mc(size_t N = 100000) const{ + + std::vector< stim::vec3 > dirs = generate_focusing_vectors(N, d, NA[0], NA[1]); //generate a random set of N vectors forming a focus + std::vector< scalarwave > samples(N); //create a vector of plane waves + T kmag = (T)stim::TAU / lambda; //calculate the wavenumber + stim::complex apw; //allocate space for the amplitude at the focal point + T a = (T)(stim::TAU * ( (1 - cos(asin(NA[0]))) - (1 - cos(asin(NA[1])))) / (double)N); //constant value weights plane waves based on the aperture and number of samples (N) + stim::vec3 kpw; //declare the new k-vector based on the focused plane wave direction + for(size_t i=0; i(0, kpw.dot(-f))); //calculate the amplitude for the new plane wave + samples[i] = scalarwave(kpw, apw); //create a plane wave based on the direction + } + return samples; + } + + /// Evaluate the beam to a scalar field using Debye focusing + void eval(stim::scalarfield& E, size_t order = 500){ + size_t array_size = E.grid_bytes(); + T* X = (T*) malloc( array_size ); //allocate space for the coordinate meshes + T* Y = (T*) malloc( array_size ); + T* Z = (T*) malloc( array_size ); + + E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes + cpu_scalar_psf_cart(E.ptr(), E.size(), X, Y, Z, lambda, A, f, d, NA[0], NA[1], order, E.spacing()); + + free(X); //free the coordinate meshes + free(Y); + free(Z); + } + + /// Calculate the field at a given point + /// @param x is the x-coordinate of the field point + /// @O is the approximation accuracy + stim::complex field(T x, T y, T z, size_t O){ + std::vector< scalarwave > W = mc(O); + T result = 0; //initialize the result to zero (0) + for(size_t i = 0; i < O; i++){ //for each plane wave + result += W[i].pos(x, y, z); + } + return result; + } + + std::string str() + { + std::stringstream ss; + ss<<"Beam:"<