#include "sphere.h" #include "rts/math/legendre.h" __global__ void gpuScalarUsp(bsComplex* Usp, bsComplex* h, bsComplex* B, int Nl, int rR, int thetaR) { //get the current coordinate in the plane slice int ir = blockIdx.x * blockDim.x + threadIdx.x; int itheta = blockIdx.y * blockDim.y + threadIdx.y; //make sure that the thread indices are in-bounds if(itheta >= thetaR || ir >= rR) return; int i = itheta * rR + ir; //ptype dr = (rmax - a) / (rR - 1); ptype dtheta = (PI) / (thetaR - 1); //comptue the current angle and distance //ptype r = dr * ir + a; ptype theta = dtheta * itheta; ptype cos_theta = cos(theta); //initialize the Legendre polynomial ptype P[2]; rts::init_legendre(cos_theta, P[0], P[1]); //initialize the result bsComplex Us((ptype)0, (ptype)0); //for each order l for(int l=0; l <= Nl; l++) { if(l == 0) { Us += B[l] * h[ir * (Nl+1) + l] * P[0]; //Us += P[0]; } else { if(l > 1) { rts::shift_legendre(l, cos_theta, P[0], P[1]); } Us += B[l] * h[ir * (Nl+1) + l] * P[1]; //Us += P[1]; } } Usp[i] = Us; //Usp[i] = h[ir * (Nl+1)]; //Usp[i] = ir; } __global__ void gpuScalarUip(bsComplex* Uip, bsComplex* j, bsComplex* A, int Nl, int aR, int thetaR) { //get the current coordinate in the plane slice int ia = blockIdx.x * blockDim.x + threadIdx.x; int itheta = blockIdx.y * blockDim.y + threadIdx.y; //make sure that the thread indices are in-bounds if(itheta >= thetaR || ia >= aR) return; int i = itheta * aR + ia; ptype dtheta = (PI) / (thetaR - 1); //comptue the current angle and distance ptype theta = dtheta * itheta; ptype cos_theta = cos(theta); //initialize the Legendre polynomial ptype P[2]; rts::init_legendre(cos_theta, P[0], P[1]); //initialize the result bsComplex Ui((ptype)0, (ptype)0); //for each order l for(int l=0; l <= Nl; l++) { if(l == 0) { Ui += A[l] * j[ia * (Nl+1) + l] * P[0]; } else { if(l > 1) { rts::shift_legendre(l, cos_theta, P[0], P[1]); } Ui += A[l] * j[ia * (Nl+1) + l] * P[1]; } } Uip[i] = Ui; } void sphere::scalarUsp(bsComplex* h, int rR, int thetaR) { //copy the hankel function to the GPU bsComplex* gpu_h; HANDLE_ERROR( cudaMalloc( (void**)&gpu_h, sizeof(bsComplex) * (Nl + 1) * rR ) ); HANDLE_ERROR( cudaMemcpy( gpu_h, h, sizeof(bsComplex) * (Nl + 1) * rR, cudaMemcpyHostToDevice ) ); //allocate memory for the scattering coefficients bsComplex* gpuB; HANDLE_ERROR(cudaMalloc((void**) &gpuB, (Nl+1) * sizeof(bsComplex))); //copy the scattering coefficients to the GPU HANDLE_ERROR(cudaMemcpy(gpuB, &B[0], (Nl+1) * sizeof(bsComplex), cudaMemcpyHostToDevice)); //create one thread for each pixel of the field slice dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK); dim3 dimGrid((Usp.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (Usp.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK); gpuScalarUsp<<>>(Usp.x_hat, gpu_h, gpuB, Nl, rR, thetaR); //free memory cudaFree(gpu_h); cudaFree(gpuB); } void sphere::scalarUip(bsComplex* j, int rR, int thetaR) { //copy the bessel and hankel LUTs to the GPU bsComplex* gpu_j; HANDLE_ERROR( cudaMalloc( (void**)&gpu_j, sizeof(bsComplex) * (Nl + 1) * rR ) ); HANDLE_ERROR( cudaMemcpy( gpu_j, j, sizeof(bsComplex) * (Nl + 1) * rR, cudaMemcpyHostToDevice ) ); //allocate memory for the scattering coefficients bsComplex* gpuA; HANDLE_ERROR(cudaMalloc((void**) &gpuA, (Nl+1) * sizeof(bsComplex))); //copy the scattering coefficients to the GPU HANDLE_ERROR(cudaMemcpy(gpuA, &A[0], (Nl+1) * sizeof(bsComplex), cudaMemcpyHostToDevice)); //create one thread for each pixel of the field slice dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK); dim3 dimGrid((Uip.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (Uip.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK); gpuScalarUip<<>>(Uip.x_hat, gpu_j, gpuA, Nl, rR, thetaR); //free memory cudaFree(gpu_j); cudaFree(gpuA); }