#ifndef STIM_SCALARWAVE_H #define STIM_SCALARWAVE_H #include #include #include //#include "../math/vector.h" #include "../math/vec3.h" #include "../math/quaternion.h" #include "../math/constants.h" #include "../math/plane.h" #include "../math/complex.h" //CUDA #include "../cuda/cudatools/devices.h" #include "../cuda/cudatools/error.h" #include "../cuda/sharedmem.cuh" namespace stim{ template class scalarwave{ protected: stim::vec3 k; //k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda stim::complex E0; //amplitude /// Bend a plane wave via refraction, given that the new propagation direction is known CUDA_CALLABLE scalarwave bend(stim::vec3 kn) const{ return scalarwave(kn.norm() * kmag(), E0); } public: ///constructor: create a plane wave propagating along k CUDA_CALLABLE scalarwave(vec3 kvec = stim::vec3(0, 0, (T)stim::TAU), complex E = 1){ k = kvec; E0 = E; } CUDA_CALLABLE scalarwave(T kx, T ky, T kz, complex E = 1){ k = vec3(kx, ky, kz); E0 = E; } ///multiplication operator: scale E0 CUDA_CALLABLE scalarwave & operator* (const T & rhs){ E0 = E0 * rhs; return *this; } CUDA_CALLABLE T lambda() const{ return stim::TAU / k.len(); } CUDA_CALLABLE T kmag() const{ return k.len(); } CUDA_CALLABLE vec3< complex > E(){ return E0; } CUDA_CALLABLE vec3 kvec(){ return k; } /// calculate the value of the field produced by the plane wave given a three-dimensional position CUDA_CALLABLE complex pos(T x, T y, T z){ return pos( stim::vec3(x, y, z) ); } CUDA_CALLABLE complex pos(vec3 p = vec3(0, 0, 0)){ return E0 * exp(complex(0, k.dot(p))); } //scales k based on a transition from material ni to material nt CUDA_CALLABLE scalarwave n(T ni, T nt){ return scalarwave(k * (nt / ni), E0); } CUDA_CALLABLE scalarwave refract(stim::vec3 kn) const{ return bend(kn); } /// Calculate the result of a plane wave hitting an interface between two refractive indices /// @param P is a plane representing the position and orientation of the surface /// @param n0 is the refractive index outside of the surface (in the direction of the normal) /// @param n1 is the refractive index inside the surface (in the direction away from the normal) /// @param r is the reflected component of the plane wave /// @param t is the transmitted component of the plane wave void scatter(stim::plane P, T n0, T n1, scalarwave &r, scalarwave &t){ scatter(P, n1/n0, r, t); } /// Calculate the scattering result when nr = n1/n0 /// @param P is a plane representing the position and orientation of the surface /// @param r is the ration n1/n0 /// @param n1 is the refractive index inside the surface (in the direction away from the normal) /// @param r is the reflected component of the plane wave /// @param t is the transmitted component of the plane wave void scatter(stim::plane P, T nr, scalarwave &r, scalarwave &t){ /* int facing = P.face(k); //determine which direction the plane wave is coming in if(facing == -1){ //if the wave hits the back of the plane, invert the plane and nr P = P.flip(); //flip the plane nr = 1/nr; //invert the refractive index (now nr = n0/n1) } //use Snell's Law to calculate the transmitted angle T cos_theta_i = k.norm().dot(-P.norm()); //compute the cosine of theta_i T theta_i = acos(cos_theta_i); //compute theta_i T sin_theta_t = (1/nr) * sin(theta_i); //compute the sine of theta_t using Snell's law T theta_t = asin(sin_theta_t); //compute the cosine of theta_t bool tir = false; //flag for total internal reflection if(theta_t != theta_t){ tir = true; theta_t = stim::PI / (T)2; } //handle the degenerate case where theta_i is 0 (the plane wave hits head-on) if(theta_i == 0){ T rp = (1 - nr) / (1 + nr); //compute the Fresnel coefficients T tp = 2 / (1 + nr); vec3 kr = -k; vec3 kt = k * nr; //set the k vectors for theta_i = 0 vec3< complex > Er = E0 * rp; //compute the E vectors vec3< complex > Et = E0 * tp; T phase_t = P.p().dot(k - kt); //compute the phase offset T phase_r = P.p().dot(k - kr); //create the plane waves r = planewave(kr, Er, phase_r); t = planewave(kt, Et, phase_t); return; } //compute the Fresnel coefficients T rp, rs, tp, ts; rp = tan(theta_t - theta_i) / tan(theta_t + theta_i); rs = sin(theta_t - theta_i) / sin(theta_t + theta_i); if(tir){ tp = ts = 0; } else{ tp = ( 2 * sin(theta_t) * cos(theta_i) ) / ( sin(theta_t + theta_i) * cos(theta_t - theta_i) ); ts = ( 2 * sin(theta_t) * cos(theta_i) ) / sin(theta_t + theta_i); } //compute the coordinate space for the plane of incidence vec3 z_hat = -P.norm(); vec3 y_hat = P.parallel(k).norm(); vec3 x_hat = y_hat.cross(z_hat).norm(); //compute the k vectors for r and t vec3 kr, kt; kr = ( y_hat * sin(theta_i) - z_hat * cos(theta_i) ) * kmag(); kt = ( y_hat * sin(theta_t) + z_hat * cos(theta_t) ) * kmag() * nr; //compute the magnitude of the p- and s-polarized components of the incident E vector complex Ei_s = E0.dot(x_hat); int sgn = E0.dot(y_hat).sgn(); vec3< complex > cx_hat = x_hat; complex Ei_p = ( E0 - cx_hat * Ei_s ).len() * sgn; //compute the magnitude of the p- and s-polarized components of the reflected E vector complex Er_s = Ei_s * rs; complex Er_p = Ei_p * rp; //compute the magnitude of the p- and s-polarized components of the transmitted E vector complex Et_s = Ei_s * ts; complex Et_p = Ei_p * tp; //compute the reflected E vector vec3< complex > Er = vec3< complex >(y_hat * cos(theta_i) + z_hat * sin(theta_i)) * Er_p + cx_hat * Er_s; //compute the transmitted E vector vec3< complex > Et = vec3< complex >(y_hat * cos(theta_t) - z_hat * sin(theta_t)) * Et_p + cx_hat * Et_s; T phase_t = P.p().dot(k - kt); T phase_r = P.p().dot(k - kr); //create the plane waves r.k = kr; r.E0 = Er * exp( complex(0, phase_r) ); t.k = kt; t.E0 = Et * exp( complex(0, phase_t) ); */ } std::string str() { std::stringstream ss; ss<<"Plane Wave:"< __global__ void cuda_scalarwave(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scalarwave* W, size_t n_waves){ extern __shared__ stim::scalarwave shared_W[]; //declare the list of waves in shared memory stim::cuda::sharedMemcpy(shared_W, W, n_waves, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access __syncthreads(); //synchronize threads to insure all data is copied size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array if(i >= N) return; //exit if this thread is outside the array T px, py, pz; (x == NULL) ? px = 0 : px = x[i]; // test for NULL values and set positions (y == NULL) ? py = 0 : py = y[i]; (z == NULL) ? pz = 0 : pz = z[i]; stim::complex f = 0; //create a register to store the result for(size_t w = 0; w < n_waves; w++) f += shared_W[w].pos(px, py, pz); //evaluate the plane wave F[i] += f; //copy the result to device memory } /// evaluate a scalar wave at several points, where all arrays are on the GPU template void gpu_scalarwave(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scalarwave w){ int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device dim3 blocks(N / threads + 1); //calculate the optimal number of blocks cuda_scalarwave<<< blocks, threads >>>(F, N, x, y, z, w); //call the kernel } /// Sums a series of coherent plane waves at a specified point /// @param field is the output array of field values corresponding to each input point /// @param x is an array of x coordinates for the field point /// @param y is an array of y coordinates for the field point /// @param z is an array of z coordinates for the field point /// @param N is the number of points in the input and output arrays /// @param lambda is the wavelength (all coherent waves are assumed to have the same wavelength) /// @param A is the list of amplitudes for each wave /// @param S is the list of propagation directions for each wave template void cpu_sum_scalarwaves(stim::complex* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave > w_array){ size_t S = w_array.size(); //store the number of waves #ifdef NO_CUDA memset(F, 0, N * sizeof(stim::complex)); T px, py, pz; for(size_t i = 0; i < N; i++){ // for each element in the array (x == NULL) ? px = 0 : px = x[i]; // test for NULL values (y == NULL) ? py = 0 : py = y[i]; (z == NULL) ? pz = 0 : pz = z[i]; for(size_t s = 0; s < S; s++){ F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point } } #else stim::complex* dev_F; //allocate space for the field cudaMalloc(&dev_F, N * sizeof(stim::complex)); cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) if(x != NULL){ HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); } T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) if(y != NULL){ HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); } T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) if(z != NULL){ HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); } size_t wave_bytes = sizeof(stim::scalarwave); size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available size_t array_bytes = w_array.size() * wave_bytes; //calculate the maximum number of bytes required for the planewave array size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory size_t num_batches = w_array.size() / max_batch + 1; //calculate the number of batches required to process all plane waves size_t batch_bytes = min(w_array.size(), max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required stim::scalarwave* dev_w; HANDLE_ERROR(cudaMalloc(&dev_w, batch_bytes)); //allocate memory for a single batch of plane waves int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks size_t batch_size; //declare a variable to store the size of the current batch size_t waves_processed = 0; //initialize the number of waves processed to zero while(waves_processed < w_array.size()){ //while there are still waves to be processed batch_size = min(max_batch, w_array.size() - waves_processed); //process either a whole batch, or whatever is left batch_bytes = batch_size * sizeof(stim::scalarwave); HANDLE_ERROR(cudaMemcpy(dev_w, &w_array[waves_processed], batch_bytes, cudaMemcpyHostToDevice)); //copy the plane waves into global memory cuda_scalarwave<<< blocks, threads, batch_bytes >>>(dev_F, N, dev_x, dev_y, dev_z, dev_w, batch_size); //call the kernel waves_processed += batch_size; //increment the counter indicating how many waves have been processed } cudaMemcpy(F, dev_F, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory if(x != NULL) cudaFree(dev_x); //free everything if(y != NULL) cudaFree(dev_y); if(z != NULL) cudaFree(dev_z); cudaFree(dev_F); cudaFree(dev_w); #endif } template void cpu_scalarwave(stim::complex* F, size_t N, T* x, T* y, T* z, stim::scalarwave w){ std::vector< stim::scalarwave > w_array(1, w); cpu_sum_scalarwaves(F, N, x, y, z, w_array); } /// Sums a series of coherent plane waves at a specified point /// @param x is the x coordinate of the field point /// @param y is the y coordinate of the field point /// @param z is the z coordinate of the field point /// @param lambda is the wavelength (all coherent waves are assumed to have the same wavelength) /// @param A is the list of amplitudes for each wave /// @param S is the list of propagation directions for each wave template CUDA_CALLABLE stim::complex sum_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave > W){ size_t N = W.size(); //get the number of plane wave samples stim::complex field(0, 0); //initialize the field to zero (0) stim::vec3 k; //allocate space for the direction vector for(size_t i = 0; i < N; i++){ field += W[i].pos(x, y, z); } return field; } } //end namespace stim template std::ostream& operator<<(std::ostream& os, stim::scalarwave p) { os<