nfScalarUs.cu 6.59 KB
``````#include "nearfield.h"
#include "rts/math/spherical_bessel.h"
#include "rts/math/legendre.h"
#include <stdlib.h>
#include "rts/cuda/error.h"
#include "rts/cuda/timer.h"

__device__ bsComplex calc_Us(ptype kd, ptype cos_theta, int Nl, bsComplex* B)
{
//initialize the spherical Bessel functions
ptype j[2];
rts::init_sbesselj<ptype>(kd, j);
ptype y[2];
rts::init_sbessely<ptype>(kd, y);

//initialize the Legendre polynomial
ptype P[2];
rts::init_legendre<ptype>(cos_theta, P[0], P[1]);

//initialize the spherical Hankel function
bsComplex h((ptype)0, (ptype)0);

//initialize the result
bsComplex Us((ptype)0, (ptype)0);

//for each order up to Nl
for(int l=0; l<=Nl; l++)
{
if(l == 0)
{
h.r = j[0];
h.i = y[0];
Us += B[0] * h * P[0];
}
else
{
//shift the bessel functions and legendre polynomials
if(l > 1)
{
rts::shift_legendre<ptype>(l, cos_theta, P[0], P[1]);
rts::shift_sbesselj<ptype>(l, kd, j);
rts::shift_sbessely<ptype>(l, kd, y);
}

h.r = j[1];
h.i = y[1];
Us += B[l] * h * P[1];

}
}
return Us;
}

__device__ bsComplex calc_Ui(bsComplex knd, ptype cos_theta, int Nl, bsComplex* A)
{
//calculate the internal field of a sphere

bsComplex Ui((ptype)0, (ptype)0);

//deal with rtsPoints near zero
if(real(knd) < EPSILON_FLOAT)
{
//for(int l=0; l<Nl; l++)
Ui = A[0];
return Ui;
}

//initialize the spherical Bessel functions
bsComplex j[2];
rts::init_sbesselj<bsComplex>(knd, j);

//initialize the Legendre polynomial
ptype P[2];
rts::init_legendre<ptype>(cos_theta, P[0], P[1]);

//for each order up to Nl
for(int l=0; l<=Nl; l++)
{
if(l == 0)
{
Ui += A[0] * j[0] * P[0];
}
else
{
//shift the bessel functions and legendre polynomials
if(l > 1)
{
rts::shift_legendre<ptype>(l, cos_theta, P[0], P[1]);
rts::shift_sbesselj<bsComplex>(l, knd, j);
}

Ui += A[l] * j[1] * P[1];

}
}
return Ui;
}

__global__ void gpuScalarUsp(bsComplex* Us, bsVector* k, int nk, ptype kmag, bsPoint f, bsPoint ps, ptype a, bsComplex n, bsComplex* Beta, bsComplex* Alpha, int Nl, ptype A, bsRect ABCD, int uR, int vR)
{

//get the current coordinate in the plane slice
int iu = blockIdx.x * blockDim.x + threadIdx.x;
int iv = blockIdx.y * blockDim.y + threadIdx.y;

//make sure that the thread indices are in-bounds
if(iu >= uR || iv >= vR) return;

int i = iv*uR + iu;

//compute the parameters for u and v
ptype u = (ptype)iu / (uR);
ptype v = (ptype)iv / (vR);

//get the rtsPoint in world space and then the r vector
bsPoint p = ABCD(u, v);
bsVector r = p - ps;
ptype d = r.len();

bsComplex sumUs(0, 0);
//for each plane wave in the wave list
for(int iw = 0; iw < nk; iw++)
{
//normalize the direction vectors and find their inner product
r = r.norm();
ptype cos_theta = k[iw].dot(r);

//compute the phase factor for spheres that are not at the origin
bsVector c = ps - f;
bsComplex phase = exp(bsComplex(0, kmag * k[iw].dot(c)));

//compute the internal field if we are inside a sphere
if(d <= a)
{
bsComplex knd = kmag * d * n;
sumUs += (1.0/nk) * A * phase * calc_Ui(knd, cos_theta, Nl, Alpha);
}
//otherwise compute the scattered field
else
{
//compute the argument for the spherical Hankel function
ptype kd = kmag * d;
sumUs += (1.0/nk) * A * phase * calc_Us(kd, cos_theta, Nl, Beta);
}

}

Us[i] += sumUs;

}

void nearfieldStruct::scalarUs()
{
//get the number of spheres
int nSpheres = sVector.size();

//if there are no spheres, nothing to do here
if(nSpheres == 0)
return;

//time the calculation of the focused field
//gpuStartTimer();

//clear the scattered field
U.clear_gpu();

//create one thread for each pixel of the field slice
dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
dim3 dimGrid((U.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (U.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);

//for each sphere
int Nl;
for(int s = 0; s<nSpheres; s++)
{
//get the number of orders for this sphere
Nl = sVector[s].Nl;

//allocate memory for the scattering coefficients
bsComplex* gpuB;
HANDLE_ERROR(cudaMalloc((void**) &gpuB, (Nl+1) * sizeof(bsComplex)));

bsComplex* gpuA;
HANDLE_ERROR(cudaMalloc((void**) &gpuA, (Nl+1) * sizeof(bsComplex)));

//copy the scattering coefficients to the GPU
HANDLE_ERROR(cudaMemcpy(gpuB, &sVector[s].B[0], (Nl+1) * sizeof(bsComplex), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(gpuA, &sVector[s].A[0], (Nl+1) * sizeof(bsComplex), cudaMemcpyHostToDevice));

//if we are computing a plane wave, call the gpuScalarUfp function
sphere S = sVector[s];
bsVector* gpuk;

if(planeWave)
{
//if this is a single plane wave, assume it goes along direction k (copy the k vector to the GPU)
HANDLE_ERROR(cudaMalloc( (void**)&gpuk, sizeof(bsVector) ));
HANDLE_ERROR(cudaMemcpy( gpuk, &k, sizeof(bsVector), cudaMemcpyHostToDevice));
gpuScalarUsp<<<dimGrid, dimBlock>>>(U.x_hat,
gpuk,
1,
2 * PI / lambda,
focus,
sVector[s].p,
sVector[s].a,
sVector[s].n,
gpuB,
gpuA,
Nl,
A,
pos,
U.R[0],
U.R[1]);
HANDLE_ERROR(cudaFree(gpuk));
}
//otherwise copy all of the monte-carlo samples to the GPU and compute
else
{
HANDLE_ERROR(cudaMalloc( (void**)&gpuk, sizeof(bsVector) * inWaves.size() ));
HANDLE_ERROR(cudaMemcpy( gpuk, &inWaves[0], sizeof(bsVector) * inWaves.size(), cudaMemcpyHostToDevice));

//compute the amplitude that makes it through the condenser
ptype subA = 2 * PI * A * ( (1 - cos(asin(condenser[1]))) - (1 - cos(asin(condenser[0]))) );

gpuScalarUsp<<<dimGrid, dimBlock>>>(U.x_hat,
gpuk,
inWaves.size(),
2 * PI / lambda,
focus,
sVector[s].p,
sVector[s].a,
sVector[s].n,
gpuB,
gpuA,
Nl,
subA,
pos,
U.R[0],
U.R[1]);
HANDLE_ERROR(cudaFree(gpuk));

}

//free memory for scattering coefficients
HANDLE_ERROR(cudaFree(gpuA));
HANDLE_ERROR(cudaFree(gpuB));
}

//float t = gpuStopTimer();
//std::cout<<"Scalar Us Time: "<<t<<"ms"<<std::endl;
//std::cout<<focus<<std::endl;

}``````