stim/optics/scalarwave.h

#ifndef STIM_SCALARWAVE_H
#define STIM_SCALARWAVE_H
#include <string>
#include <sstream>
#include <cmath>
//#include "../math/vector.h"
#include "../math/vec3.h"
#include "../math/quaternion.h"
#include "../math/constants.h"
#include "../math/plane.h"
#include "../math/complex.h"
//CUDA
#include "../cuda/cudatools/devices.h"
#include "../cuda/cudatools/error.h"
#include "../cuda/sharedmem.cuh"
namespace stim{
template<typename T>
class scalarwave{
protected:
	stim::vec3<T> k;							//k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
	stim::complex<T> E0;						//amplitude
	/// Bend a plane wave via refraction, given that the new propagation direction is known
	CUDA_CALLABLE scalarwave<T> bend(stim::vec3<T> kn) const{
		return scalarwave<T>(kn.norm() * kmag(), E0);
	}
public:
	///constructor: create a plane wave propagating along k
	CUDA_CALLABLE scalarwave(vec3<T> kvec = stim::vec3<T>(0, 0, (T)stim::TAU), complex<T> E = 1){
		k = kvec;
		E0 = E;
	}
	CUDA_CALLABLE scalarwave(T kx, T ky, T kz, complex<T> E = 1){
		k = vec3<T>(kx, ky, kz);
		E0 = E;
	}
	///multiplication operator: scale E0
    CUDA_CALLABLE scalarwave<T> & operator* (const T & rhs){		
		E0 = E0 * rhs;
		return *this;
	}
	CUDA_CALLABLE T lambda() const{
		return stim::TAU / k.len();
	}
	CUDA_CALLABLE T kmag() const{
		return k.len();
	}
	CUDA_CALLABLE vec3< complex<T> > E(){
		return E0;
	}
	CUDA_CALLABLE vec3<T> kvec(){
		return k;
	}
	/// calculate the value of the field produced by the plane wave given a three-dimensional position
	CUDA_CALLABLE complex<T> pos(T x, T y, T z){
		return pos( stim::vec3<T>(x, y, z) );
	}
	CUDA_CALLABLE complex<T> pos(vec3<T> p = vec3<T>(0, 0, 0)){
		return E0 * exp(complex<T>(0, k.dot(p)));
	}
	//scales k based on a transition from material ni to material nt
	CUDA_CALLABLE scalarwave<T> n(T ni, T nt){
		return scalarwave<T>(k * (nt / ni), E0);
	}
	CUDA_CALLABLE scalarwave<T> refract(stim::vec3<T> kn) const{
		return bend(kn);
	}
	/// Calculate the result of a plane wave hitting an interface between two refractive indices
	/// @param P is a plane representing the position and orientation of the surface
	/// @param n0 is the refractive index outside of the surface (in the direction of the normal)
	/// @param n1 is the refractive index inside the surface (in the direction away from the normal)
	/// @param r is the reflected component of the plane wave
	/// @param t is the transmitted component of the plane wave
	void scatter(stim::plane<T> P, T n0, T n1, scalarwave<T> &r, scalarwave<T> &t){
		scatter(P, n1/n0, r, t);
	}
	/// Calculate the scattering result when nr = n1/n0
	/// @param P is a plane representing the position and orientation of the surface
	/// @param r is the ration n1/n0
	/// @param n1 is the refractive index inside the surface (in the direction away from the normal)
	/// @param r is the reflected component of the plane wave
	/// @param t is the transmitted component of the plane wave
	void scatter(stim::plane<T> P, T nr, scalarwave<T> &r, scalarwave<T> &t){
		/*
		int facing = P.face(k);		//determine which direction the plane wave is coming in
		if(facing == -1){		//if the wave hits the back of the plane, invert the plane and nr
			P = P.flip();			//flip the plane
			nr = 1/nr;				//invert the refractive index (now nr = n0/n1)
		}
		//use Snell's Law to calculate the transmitted angle
		T cos_theta_i = k.norm().dot(-P.norm());				//compute the cosine of theta_i
		T theta_i = acos(cos_theta_i);							//compute theta_i
		T sin_theta_t = (1/nr) * sin(theta_i);						//compute the sine of theta_t using Snell's law
		T theta_t = asin(sin_theta_t);							//compute the cosine of theta_t
		bool tir = false;						//flag for total internal reflection
		if(theta_t != theta_t){
			tir = true;
			theta_t = stim::PI / (T)2;
		}
		//handle the degenerate case where theta_i is 0 (the plane wave hits head-on)
		if(theta_i == 0){
			T rp = (1 - nr) / (1 + nr);		//compute the Fresnel coefficients
			T tp = 2 / (1 + nr);
			vec3<T> kr = -k;
			vec3<T> kt = k * nr;			//set the k vectors for theta_i = 0
			vec3< complex<T> > Er = E0 * rp;		//compute the E vectors
			vec3< complex<T> > Et = E0 * tp;
			T phase_t = P.p().dot(k - kt);	//compute the phase offset
			T phase_r = P.p().dot(k - kr);
			//create the plane waves
			r = planewave<T>(kr, Er, phase_r);
			t = planewave<T>(kt, Et, phase_t);
			return;
		}
		//compute the Fresnel coefficients
		T rp, rs, tp, ts;
		rp = tan(theta_t - theta_i) / tan(theta_t + theta_i);
		rs = sin(theta_t - theta_i) / sin(theta_t + theta_i);
		
		if(tir){
			tp = ts = 0;
		}
		else{
			tp = ( 2 * sin(theta_t) * cos(theta_i) ) / ( sin(theta_t + theta_i) * cos(theta_t - theta_i) );
			ts = ( 2 * sin(theta_t) * cos(theta_i) ) / sin(theta_t + theta_i);
		}
		//compute the coordinate space for the plane of incidence
		vec3<T> z_hat = -P.norm();
		vec3<T> y_hat = P.parallel(k).norm();
		vec3<T> x_hat = y_hat.cross(z_hat).norm();
		//compute the k vectors for r and t
		vec3<T> kr, kt;
		kr = ( y_hat * sin(theta_i) - z_hat * cos(theta_i) ) * kmag();
		kt = ( y_hat * sin(theta_t) + z_hat * cos(theta_t) ) * kmag() * nr;
		//compute the magnitude of the p- and s-polarized components of the incident E vector
		complex<T> Ei_s = E0.dot(x_hat);
		int sgn = E0.dot(y_hat).sgn();
		vec3< complex<T> > cx_hat = x_hat;
		complex<T> Ei_p = ( E0 - cx_hat * Ei_s ).len() * sgn;
		//compute the magnitude of the p- and s-polarized components of the reflected E vector
		complex<T> Er_s = Ei_s * rs;
		complex<T> Er_p = Ei_p * rp;
		//compute the magnitude of the p- and s-polarized components of the transmitted E vector
		complex<T> Et_s = Ei_s * ts;
		complex<T> Et_p = Ei_p * tp;
		//compute the reflected E vector
		vec3< complex<T> > Er = vec3< complex<T> >(y_hat * cos(theta_i) + z_hat * sin(theta_i)) * Er_p + cx_hat * Er_s;
		//compute the transmitted E vector
		vec3< complex<T> > Et = vec3< complex<T> >(y_hat * cos(theta_t) - z_hat * sin(theta_t)) * Et_p + cx_hat * Et_s;
		T phase_t = P.p().dot(k - kt);
		T phase_r = P.p().dot(k - kr);
		//create the plane waves
		r.k = kr;
		r.E0 = Er * exp( complex<T>(0, phase_r) );
		t.k = kt;
		t.E0 = Et * exp( complex<T>(0, phase_t) );
		*/
	}
	std::string str()
	{
		std::stringstream ss;
		ss<<"Plane Wave:"<<std::endl;
		ss<<"	"<<E0<<" e^i ( "<<k<<" . r )";
		return ss.str();
	}
};					//end planewave class
/// CUDA kernel for computing the field produced by a batch of plane waves at an array of locations
template<typename T>
__global__ void cuda_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t n_waves){
	extern __shared__ stim::scalarwave<T> shared_W[];		//declare the list of waves in shared memory
	stim::cuda::sharedMemcpy(shared_W, W, n_waves, threadIdx.x, blockDim.x);	//copy the plane waves into shared memory for faster access
	__syncthreads();															//synchronize threads to insure all data is copied
	size_t i = blockIdx.x * blockDim.x + threadIdx.x;				//get the index into the array
	if(i >= N) return;												//exit if this thread is outside the array
	T px, py, pz;
	(x == NULL) ? px = 0 : px = x[i];								// test for NULL values and set positions
	(y == NULL) ? py = 0 : py = y[i];
	(z == NULL) ? pz = 0 : pz = z[i];
	
	stim::complex<T> f = 0;											//create a register to store the result
	for(size_t w = 0; w < n_waves; w++)
		f += shared_W[w].pos(px, py, pz);							//evaluate the plane wave
	F[i] += f;														//copy the result to device memory
}
/// evaluate a scalar wave at several points, where all arrays are on the GPU
template<typename T>
void gpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
	
	int threads = stim::maxThreadsPerBlock();			//get the maximum number of threads per block for the CUDA device
	dim3 blocks(N / threads + 1);						//calculate the optimal number of blocks
	cuda_scalarwave<T><<< blocks, threads >>>(F, N, x, y, z, w);			//call the kernel
}
/// Sums a series of coherent plane waves at a specified point
/// @param field is the output array of field values corresponding to each input point
/// @param x is an array of x coordinates for the field point
/// @param y is an array of y coordinates for the field point
/// @param z is an array of z coordinates for the field point
/// @param N is the number of points in the input and output arrays
/// @param lambda is the wavelength (all coherent waves are assumed to have the same wavelength)
/// @param A is the list of amplitudes for each wave
/// @param S is the list of propagation directions for each wave
template<typename T>
void cpu_sum_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > w_array){
	size_t S = w_array.size();											//store the number of waves
#ifdef NO_CUDA
	memset(F, 0, N * sizeof(stim::complex<T>));
	T px, py, pz;
	for(size_t i = 0; i < N; i++){										// for each element in the array
		(x == NULL) ? px = 0 : px = x[i];								// test for NULL values
		(y == NULL) ? py = 0 : py = y[i];
		(z == NULL) ? pz = 0 : pz = z[i];
		for(size_t s = 0; s < S; s++){
			F[i] += w_array[s].pos(px, py, pz);						//sum all plane waves at this point
		}
	}
#else
	stim::complex<T>* dev_F;										//allocate space for the field
	cudaMalloc(&dev_F, N * sizeof(stim::complex<T>));
	cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>));				//set the field to zero (necessary because a sum is used)
	T* dev_x = NULL;												//allocate space and copy the X coordinate (if specified)
	if(x != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_y = NULL;												//allocate space and copy the Y coordinate (if specified)
	if(y != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	T* dev_z = NULL;												//allocate space and copy the Z coordinate (if specified)
	if(z != NULL){
		HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
	}
	size_t wave_bytes = sizeof(stim::scalarwave<T>);
	size_t shared_bytes = stim::sharedMemPerBlock();									//calculate the maximum amount of shared memory available
	size_t array_bytes = w_array.size() * wave_bytes;			//calculate the maximum number of bytes required for the planewave array
	size_t max_batch = shared_bytes / wave_bytes;				//calculate number of plane waves that will fit into shared memory
	size_t num_batches = w_array.size() / max_batch + 1;								//calculate the number of batches required to process all plane waves
	size_t batch_bytes = min(w_array.size(), max_batch) * wave_bytes;				//initialize the batch size (in bytes) to the maximum batch required
	stim::scalarwave<T>* dev_w;
	HANDLE_ERROR(cudaMalloc(&dev_w, batch_bytes));										//allocate memory for a single batch of plane waves
	int threads = stim::maxThreadsPerBlock();							//get the maximum number of threads per block for the CUDA device
	dim3 blocks((unsigned)(N / threads + 1));										//calculate the optimal number of blocks	
	size_t batch_size;																	//declare a variable to store the size of the current batch
	size_t waves_processed = 0;															//initialize the number of waves processed to zero
	while(waves_processed < w_array.size()){												//while there are still waves to be processed
		batch_size = min<size_t>(max_batch, w_array.size() - waves_processed);			//process either a whole batch, or whatever is left
		batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
		HANDLE_ERROR(cudaMemcpy(dev_w, &w_array[waves_processed], batch_bytes, cudaMemcpyHostToDevice));	//copy the plane waves into global memory
		cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(dev_F, N, dev_x, dev_y, dev_z, dev_w, batch_size);	//call the kernel
		waves_processed += batch_size;													//increment the counter indicating how many waves have been processed
	}
	cudaMemcpy(F, dev_F, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost);			//copy the field from device memory
	if(x != NULL) cudaFree(dev_x);														//free everything
	if(y != NULL) cudaFree(dev_y);
	if(z != NULL) cudaFree(dev_z);
	cudaFree(dev_F);
	cudaFree(dev_w);
#endif
}
template<typename T>
void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
	std::vector< stim::scalarwave<T> > w_array(1, w);
	cpu_sum_scalarwaves(F, N, x, y, z, w_array);	
}
/// Sums a series of coherent plane waves at a specified point
/// @param x is the x coordinate of the field point
/// @param y is the y coordinate of the field point
/// @param z is the z coordinate of the field point
/// @param lambda is the wavelength (all coherent waves are assumed to have the same wavelength)
/// @param A is the list of amplitudes for each wave
/// @param S is the list of propagation directions for each wave
template<typename T>
CUDA_CALLABLE stim::complex<T> sum_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave<T> > W){
	size_t N = W.size();												//get the number of plane wave samples
	stim::complex<T> field(0, 0);										//initialize the field to zero (0)
	stim::vec3<T> k;													//allocate space for the direction vector
	for(size_t i = 0; i < N; i++){
		field += W[i].pos(x, y, z);
	}
	return field;
}
}					//end namespace stim
template <typename T>
std::ostream& operator<<(std::ostream& os, stim::scalarwave<T> p)
{
    os<<p.str();
    return os;
}
#endif