cudaKK.h

__device__ double g(double v0, double v1)
{
	return (v0 + v1)*log(abs(v0+v1)) + (v0-v1)*log(abs(v0-v1));
}

__device__ double hfin(double v0, double v1, double dv)
{
	double e = 0.001;
	double t0 = g(v0+e, v1-dv)/dv;
	double t1 = 2*g(v0+e, v1)/dv;
	double t2 = g(v0+e, v1+dv)/dv;

	return -1.0/PI * (t0 - t1 + t2);
}

__global__ void devKramersKronig(double* gpuN, double* gpuK, int numVals, double nuStart, double nuEnd, double nOffset)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;

	if(i >= numVals) return;
	double nuDelta = (nuEnd - nuStart)/(numVals - 1);

	double nu = nuStart + i*nuDelta;
	double n = 0.0;
	double jNu;
	double jK;
	for(int j=1; j<numVals-1; j++)
	{
		jNu = nuStart + j*nuDelta;
		jK = gpuK[j];
		n += hfin(nu, jNu, nuDelta) * jK;
	}
	gpuN[i] = n + nOffset;


}

void cudaKramersKronig(double* cpuN, double* cpuK, int nVals, double nuStart, double nuEnd, double nOffset)
{
	double* gpuK;
	HANDLE_ERROR(cudaMalloc(&gpuK, sizeof(double)*nVals));
	HANDLE_ERROR(cudaMemcpy(gpuK, cpuK, sizeof(double)*nVals, cudaMemcpyHostToDevice));
	double* gpuN;
	HANDLE_ERROR(cudaMalloc(&gpuN, sizeof(double)*nVals));

	dim3 block(BLOCK_SIZE*BLOCK_SIZE);
	dim3 grid(nVals/block.x + 1);
	devKramersKronig<<<grid, block>>>(gpuN, gpuK, nVals, nuStart, nuEnd, nOffset);

	HANDLE_ERROR(cudaMemcpy(cpuN, gpuN, sizeof(double)*nVals, cudaMemcpyDeviceToHost));

	//free resources
	HANDLE_ERROR(cudaFree(gpuK));
	HANDLE_ERROR(cudaFree(gpuN));
}

__global__ void devComputeSpectrum(double* I, double2* B, double* alpha, int Nl, 
								   int nSamples, double oThetaI, double oThetaO, double cThetaI, double cThetaO)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;

	//compute the delta-theta value
	double dTheta = (oThetaO - oThetaI)/nSamples;

	//allocate space for the Legendre polynomials
	double Ptheta[2];	

	double cosTheta, theta;
	cuDoubleComplex Us;
	cuDoubleComplex UsSample;
	cuDoubleComplex U;
	//cuComplex Ui;
	//Ui.x = 2*PI;
	//Ui.y = 0.0;
	cuDoubleComplex numer;
	numer.x = 0.0;
	cuDoubleComplex exp_numer;
	cuDoubleComplex iL;
	cuDoubleComplex imag;
	imag.x = 0.0; imag.y = 1.0;
	double realFac;
	cuDoubleComplex complexFac;
	double PlTheta;
	double Isum = 0.0;
	//float maxVal = 0;
	//float val;
	for(int iTheta = 0; iTheta < nSamples; iTheta++)
	{
		//calculate theta
		theta = iTheta * dTheta + oThetaI;
		cosTheta = cos(theta);

		//initialize the theta Legendre polynomial
		Ptheta[0] = 1.0;
		Ptheta[1] = cosTheta;

		//initialize the scattered field
		Us.x = Us.y = 0.0;
		iL.x = 1.0;
		iL.y = 0.0;
		for(int l = 0; l<Nl; l++)
		{
			//compute the theta legendre polynomial
			if(l == 0)
				PlTheta = Ptheta[0];
			else if(l == 1)
				PlTheta = Ptheta[1];
			else
			{
				PlTheta = ((2*l - 1)*cosTheta*Ptheta[1] - (l - 1)*Ptheta[0])/l;
				Ptheta[0] = Ptheta[1];
				Ptheta[1] = PlTheta;
			}

			//compute the real components of the scattered field
			realFac = alpha[l] * PlTheta;

			//compute the complex components of the scattered field
			numer.x = 0.0;
			numer.y = -(l*PI)/2.0;
			exp_numer = cExp(numer);

			complexFac = cMult(B[Nl * i + l], exp_numer);
			complexFac = cMult(complexFac, iL);
			

			//combine the real and complex components
			UsSample = cMult(complexFac, realFac);
			Us = cAdd(Us, UsSample);

			//increment the imaginary exponent i^l
			iL = cMult(iL, imag);

		
		}

		//sum the scattered and incident fields
		if(theta >= cThetaI && theta <= cThetaO)
			U = cAdd(Us, 2*PI);
		else
			U = Us;
		Isum += (U.x*U.x + U.y*U.y) * sin(theta) * 2 * PI * dTheta;
	}

	I[i] = Isum;
}

void cudaComputeSpectrum(double* cpuI, double* cpuB, double* cpuAlpha,
						 int Nl, int nLambda, double oThetaI, double oThetaO, double cThetaI, double cThetaO, int nSamples)
{
	//copy everything to the GPU
	double2* gpuB;
	HANDLE_ERROR(cudaMalloc(&gpuB, sizeof(double2) * nLambda * Nl));
	HANDLE_ERROR(cudaMemcpy(gpuB, cpuB, sizeof(double2) * nLambda * Nl, cudaMemcpyHostToDevice));

	double* gpuAlpha;
	HANDLE_ERROR(cudaMalloc(&gpuAlpha, sizeof(double) * Nl));
	HANDLE_ERROR(cudaMemcpy(gpuAlpha, cpuAlpha, sizeof(double) * Nl, cudaMemcpyHostToDevice));

	double* gpuI;
	HANDLE_ERROR(cudaMalloc(&gpuI, sizeof(double) * nLambda));
	

	//call the kernel to compute the spectrum
	dim3 block(BLOCK_SIZE*BLOCK_SIZE);
	dim3 grid(nLambda/block.x + 1);

	//devComputeSpectrum
	devComputeSpectrum<<<grid, block>>>(gpuI, (double2*)gpuB, gpuAlpha, Nl,
										nSamples, oThetaI, oThetaO, cThetaI, cThetaO);

	HANDLE_ERROR(cudaMemcpy(cpuI, gpuI, sizeof(double) * nLambda, cudaMemcpyDeviceToHost));

	HANDLE_ERROR(cudaFree(gpuB));
	HANDLE_ERROR(cudaFree(gpuAlpha));
	HANDLE_ERROR(cudaFree(gpuI));

	
}