nfScalarUpLut.cu 7.16 KB
#include "nearfield.h"
#include "rts/math/spherical_bessel.h"
#include "rts/math/legendre.h"
#include <stdlib.h>
#include "rts/cuda/error.h"
#include "rts/cuda/timer.h"

texture<float2, cudaTextureType2D> texUsp;
texture<float2, cudaTextureType2D> texUip;

__global__ void gpuScalarUpLut(bsComplex* Us, bsVector* k, int nk, ptype kmag, ptype a, ptype dmin, ptype dmax, bsPoint f, bsPoint ps, ptype A, bsRect ABCD, int uR, int vR, int dR, int aR, int thetaR)
{
    /*This function uses Monte-Carlo integration to sample a texture-based LUT describing the scattered field
        produced by a plane wave through a sphere.  The MC sampling is used to approximate a focused field.

        Us  =   final scattered field
        k   =   list of incoming plane waves (Monte-Carlo samples)
        nk  =   number of incoming MC samples
        kmag=   magnitude of the incoming field 2pi/lambda
        dmin=   minimum distance of the Usp texture
        dmax=   maximum distance of the Usp texture
        f   =   position of the focus
        ps  =   position of the sphere
        A   =   total amplitude of the incident field arriving at the focal spot
        ABCD=   rectangle representing the field slice
        uR  =   resolution of the field slice in the u direction
        vR  =   resolution of the field slice in the v direction
        dR  =   resolution of the Usp texture in the d direction
        thetaR= resolution of the Usp texture in the theta direction
    */

	//get the current coordinate in the plane slice
	int iu = blockIdx.x * blockDim.x + threadIdx.x;
	int iv = blockIdx.y * blockDim.y + threadIdx.y;

	//make sure that the thread indices are in-bounds
	if(iu >= uR || iv >= vR) return;

	//compute the index (easier access to the scalar field array)
	int i = iv*uR + iu;

	//compute the parameters for u and v
	ptype u = (ptype)iu / (uR);
	ptype v = (ptype)iv / (vR);

	//get the rtsPoint in world space and then the r vector
	bsPoint p = ABCD(u, v);
	bsVector r = p - ps;
	ptype d = r.len();
	float di = ( (d - max(a, dmin))/(dmax - max(a, dmin)) ) * (dR - 1);
	float ai = ( (d - dmin)/(a - dmin)) * (aR - 1);

    bsComplex sumUs(0, 0);
    //for each plane wave in the wave list
    for(int iw = 0; iw < nk; iw++)
    {
        //normalize the direction vectors and find their inner product
        r = r.norm();
        ptype cos_theta = k[iw].dot(r);
        if(cos_theta < -1)
            cos_theta = -1;
        if(cos_theta > 1)
            cos_theta = 1;
        float thetai = ( acos(cos_theta) / PI ) * (thetaR - 1);

        //compute the phase factor for spheres that are not at the origin
        bsVector c = ps - f;
        bsComplex phase = exp(bsComplex(0, kmag * k[iw].dot(c)));

        //compute the internal field if we are inside a sphere
        if(d < a)
        {
			float2 Uip = tex2D(texUip, ai + 0.5f, thetai + 0.5f);
			sumUs += (1.0f/nk) * A * phase * bsComplex(Uip.x, Uip.y);
        }
        //otherwise compute the scattered field
        else
        {
            float2 Usp = tex2D(texUsp, di + 0.5f, thetai + 0.5f);
            sumUs += (1.0f/nk) * A * phase * bsComplex(Usp.x, Usp.y);
        }

    }

    Us[i] += sumUs;
}

void nearfieldStruct::scalarUpLut()
{
    //get the number of spheres
	int nSpheres = sVector.size();

	//if there are no spheres, nothing to do here
	if(nSpheres == 0)
		return;

	//time the calculation of the focused field
	gpuStartTimer();

	//clear the scattered field
	U.clear_gpu();

	//create one thread for each pixel of the field slice
	dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
	dim3 dimGrid((U.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (U.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);

    //copy Monte-Carlo samples to the GPU and determine the incident amplitude (plane-wave specific stuff)
    bsVector* gpuk;
    int nWaves;
    ptype subA;
    if(planeWave)
    {
        nWaves = 1;
        HANDLE_ERROR(cudaMalloc( (void**)&gpuk, sizeof(bsVector) ) );
        HANDLE_ERROR(cudaMemcpy( gpuk, &k, sizeof(bsVector), cudaMemcpyHostToDevice));
        subA = A;
    }
    else
    {
        nWaves = inWaves.size();
        HANDLE_ERROR(cudaMalloc( (void**)&gpuk, sizeof(bsVector) * nWaves ) );
        HANDLE_ERROR(cudaMemcpy( gpuk, &inWaves[0], sizeof(bsVector) * nWaves, cudaMemcpyHostToDevice));
        //compute the amplitude that makes it through the condenser
        subA = 2 * PI * A * ( (1 - cos(asin(condenser[1]))) - (1 - cos(asin(condenser[0]))) );
    }

	//for each sphere
	for(int s = 0; s<nSpheres; s++)
	{
        //get the current sphere
		//sphere S = sVector[s];

        //allocate space for the Usp and Uip textures
        //allocate the cuda array
        cudaArray* arrayUsp;
		cudaArray* arrayUip;
        cudaChannelFormatDesc channelDescUsp =
            cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
		cudaChannelFormatDesc channelDescUip =
            cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
        int dR = sVector[s].Usp.R[0];
        int thetaR = sVector[s].Usp.R[1];
		int aR = sVector[s].Uip.R[0];
        HANDLE_ERROR(cudaMallocArray(&arrayUsp, &channelDescUsp, dR, thetaR));
		HANDLE_ERROR(cudaMallocArray(&arrayUip, &channelDescUip, aR, thetaR));

        texUsp.addressMode[0] = cudaAddressModeMirror;
        texUsp.addressMode[1] = cudaAddressModeMirror;
        texUsp.filterMode     = cudaFilterModeLinear;
        texUsp.normalized     = false;

		texUip.addressMode[0] = cudaAddressModeMirror;
        texUip.addressMode[1] = cudaAddressModeMirror;
        texUip.filterMode     = cudaFilterModeLinear;
        texUip.normalized     = false;
        HANDLE_ERROR(cudaBindTextureToArray(texUsp, arrayUsp, channelDescUsp));
		HANDLE_ERROR(cudaBindTextureToArray(texUip, arrayUip, channelDescUip));

        //copy the LUT to the Usp texture
        HANDLE_ERROR( cudaMemcpy2DToArray(arrayUsp, 0, 0, sVector[s].Usp.x_hat, dR*sizeof(float2), dR*sizeof(float2), thetaR, cudaMemcpyDeviceToDevice));
		HANDLE_ERROR( cudaMemcpy2DToArray(arrayUip, 0, 0, sVector[s].Uip.x_hat, aR*sizeof(float2), aR*sizeof(float2), thetaR, cudaMemcpyDeviceToDevice));

        gpuScalarUpLut<<<dimGrid, dimBlock>>>(U.x_hat,
                                            gpuk,
                                            nWaves,
                                            2 * PI / lambda,
                                            sVector[s].a,
                                            sVector[s].d_min,
                                            sVector[s].d_max,
                                            focus,
                                            sVector[s].p,
                                            subA,
                                            pos,
                                            U.R[0],
                                            U.R[1],
                                            dR,
                                            aR,
											thetaR);

        cudaFreeArray(arrayUsp);
        cudaFreeArray(arrayUip);

	}


    //store the time to compute the scattered field
	t_Us = gpuStopTimer();

	//free monte-carlo samples
	cudaFree(gpuk);

}