8e4f8364
David Mayerich
started a new opt...
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
#ifndef STIM_SCALARWAVE_H
#define STIM_SCALARWAVE_H
#include <string>
#include <sstream>
#include <cmath>
//#include "../math/vector.h"
#include "../math/vec3.h"
#include "../math/quaternion.h"
#include "../math/constants.h"
#include "../math/plane.h"
#include "../math/complex.h"
//CUDA
#include "../cuda/cudatools/devices.h"
#include "../cuda/cudatools/error.h"
#include "../cuda/sharedmem.cuh"
namespace stim{
template<typename T>
class scalarwave{
|
31262e83
David Mayerich
GPU implementatio...
|
26
|
public:
|
8e4f8364
David Mayerich
started a new opt...
|
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
|
stim::vec3<T> k; //k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
stim::complex<T> E0; //amplitude
/// Bend a plane wave via refraction, given that the new propagation direction is known
CUDA_CALLABLE scalarwave<T> bend(stim::vec3<T> kn) const{
return scalarwave<T>(kn.norm() * kmag(), E0);
}
public:
///constructor: create a plane wave propagating along k
CUDA_CALLABLE scalarwave(vec3<T> kvec = stim::vec3<T>(0, 0, (T)stim::TAU), complex<T> E = 1){
k = kvec;
E0 = E;
}
CUDA_CALLABLE scalarwave(T kx, T ky, T kz, complex<T> E = 1){
k = vec3<T>(kx, ky, kz);
E0 = E;
}
///multiplication operator: scale E0
CUDA_CALLABLE scalarwave<T> & operator* (const T & rhs){
E0 = E0 * rhs;
return *this;
}
CUDA_CALLABLE T lambda() const{
return stim::TAU / k.len();
}
CUDA_CALLABLE T kmag() const{
return k.len();
}
|
9339fbad
David Mayerich
implementing mie ...
|
63
|
CUDA_CALLABLE complex<T> E(){
|
8e4f8364
David Mayerich
started a new opt...
|
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
|
return E0;
}
CUDA_CALLABLE vec3<T> kvec(){
return k;
}
/// calculate the value of the field produced by the plane wave given a three-dimensional position
CUDA_CALLABLE complex<T> pos(T x, T y, T z){
return pos( stim::vec3<T>(x, y, z) );
}
CUDA_CALLABLE complex<T> pos(vec3<T> p = vec3<T>(0, 0, 0)){
return E0 * exp(complex<T>(0, k.dot(p)));
}
//scales k based on a transition from material ni to material nt
CUDA_CALLABLE scalarwave<T> n(T ni, T nt){
return scalarwave<T>(k * (nt / ni), E0);
}
CUDA_CALLABLE scalarwave<T> refract(stim::vec3<T> kn) const{
return bend(kn);
}
/// Calculate the result of a plane wave hitting an interface between two refractive indices
/// @param P is a plane representing the position and orientation of the surface
/// @param n0 is the refractive index outside of the surface (in the direction of the normal)
/// @param n1 is the refractive index inside the surface (in the direction away from the normal)
/// @param r is the reflected component of the plane wave
/// @param t is the transmitted component of the plane wave
void scatter(stim::plane<T> P, T n0, T n1, scalarwave<T> &r, scalarwave<T> &t){
scatter(P, n1/n0, r, t);
}
/// Calculate the scattering result when nr = n1/n0
/// @param P is a plane representing the position and orientation of the surface
/// @param r is the ration n1/n0
/// @param n1 is the refractive index inside the surface (in the direction away from the normal)
/// @param r is the reflected component of the plane wave
/// @param t is the transmitted component of the plane wave
void scatter(stim::plane<T> P, T nr, scalarwave<T> &r, scalarwave<T> &t){
/*
int facing = P.face(k); //determine which direction the plane wave is coming in
if(facing == -1){ //if the wave hits the back of the plane, invert the plane and nr
P = P.flip(); //flip the plane
nr = 1/nr; //invert the refractive index (now nr = n0/n1)
}
//use Snell's Law to calculate the transmitted angle
T cos_theta_i = k.norm().dot(-P.norm()); //compute the cosine of theta_i
T theta_i = acos(cos_theta_i); //compute theta_i
T sin_theta_t = (1/nr) * sin(theta_i); //compute the sine of theta_t using Snell's law
T theta_t = asin(sin_theta_t); //compute the cosine of theta_t
bool tir = false; //flag for total internal reflection
if(theta_t != theta_t){
tir = true;
theta_t = stim::PI / (T)2;
}
//handle the degenerate case where theta_i is 0 (the plane wave hits head-on)
if(theta_i == 0){
T rp = (1 - nr) / (1 + nr); //compute the Fresnel coefficients
T tp = 2 / (1 + nr);
vec3<T> kr = -k;
vec3<T> kt = k * nr; //set the k vectors for theta_i = 0
vec3< complex<T> > Er = E0 * rp; //compute the E vectors
vec3< complex<T> > Et = E0 * tp;
T phase_t = P.p().dot(k - kt); //compute the phase offset
T phase_r = P.p().dot(k - kr);
//create the plane waves
r = planewave<T>(kr, Er, phase_r);
t = planewave<T>(kt, Et, phase_t);
return;
}
//compute the Fresnel coefficients
T rp, rs, tp, ts;
rp = tan(theta_t - theta_i) / tan(theta_t + theta_i);
rs = sin(theta_t - theta_i) / sin(theta_t + theta_i);
if(tir){
tp = ts = 0;
}
else{
tp = ( 2 * sin(theta_t) * cos(theta_i) ) / ( sin(theta_t + theta_i) * cos(theta_t - theta_i) );
ts = ( 2 * sin(theta_t) * cos(theta_i) ) / sin(theta_t + theta_i);
}
//compute the coordinate space for the plane of incidence
vec3<T> z_hat = -P.norm();
vec3<T> y_hat = P.parallel(k).norm();
vec3<T> x_hat = y_hat.cross(z_hat).norm();
//compute the k vectors for r and t
vec3<T> kr, kt;
kr = ( y_hat * sin(theta_i) - z_hat * cos(theta_i) ) * kmag();
kt = ( y_hat * sin(theta_t) + z_hat * cos(theta_t) ) * kmag() * nr;
//compute the magnitude of the p- and s-polarized components of the incident E vector
complex<T> Ei_s = E0.dot(x_hat);
int sgn = E0.dot(y_hat).sgn();
vec3< complex<T> > cx_hat = x_hat;
complex<T> Ei_p = ( E0 - cx_hat * Ei_s ).len() * sgn;
//compute the magnitude of the p- and s-polarized components of the reflected E vector
complex<T> Er_s = Ei_s * rs;
complex<T> Er_p = Ei_p * rp;
//compute the magnitude of the p- and s-polarized components of the transmitted E vector
complex<T> Et_s = Ei_s * ts;
complex<T> Et_p = Ei_p * tp;
//compute the reflected E vector
vec3< complex<T> > Er = vec3< complex<T> >(y_hat * cos(theta_i) + z_hat * sin(theta_i)) * Er_p + cx_hat * Er_s;
//compute the transmitted E vector
vec3< complex<T> > Et = vec3< complex<T> >(y_hat * cos(theta_t) - z_hat * sin(theta_t)) * Et_p + cx_hat * Et_s;
T phase_t = P.p().dot(k - kt);
T phase_r = P.p().dot(k - kr);
//create the plane waves
r.k = kr;
r.E0 = Er * exp( complex<T>(0, phase_r) );
t.k = kt;
t.E0 = Et * exp( complex<T>(0, phase_t) );
*/
}
std::string str()
{
std::stringstream ss;
ss<<"Plane Wave:"<<std::endl;
ss<<" "<<E0<<" e^i ( "<<k<<" . r )";
return ss.str();
}
}; //end planewave class
/// CUDA kernel for computing the field produced by a batch of plane waves at an array of locations
template<typename T>
__global__ void cuda_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t n_waves){
extern __shared__ stim::scalarwave<T> shared_W[]; //declare the list of waves in shared memory
|
963d0676
David Mayerich
bug fixes related...
|
213
|
stim::cuda::threadedMemcpy(shared_W, W, n_waves, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access
|
8e4f8364
David Mayerich
started a new opt...
|
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
|
__syncthreads(); //synchronize threads to insure all data is copied
size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
if(i >= N) return; //exit if this thread is outside the array
T px, py, pz;
(x == NULL) ? px = 0 : px = x[i]; // test for NULL values and set positions
(y == NULL) ? py = 0 : py = y[i];
(z == NULL) ? pz = 0 : pz = z[i];
stim::complex<T> f = 0; //create a register to store the result
for(size_t w = 0; w < n_waves; w++)
f += shared_W[w].pos(px, py, pz); //evaluate the plane wave
F[i] += f; //copy the result to device memory
}
/// evaluate a scalar wave at several points, where all arrays are on the GPU
template<typename T>
void gpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
dim3 blocks(N / threads + 1); //calculate the optimal number of blocks
cuda_scalarwave<T><<< blocks, threads >>>(F, N, x, y, z, w); //call the kernel
}
|
9339fbad
David Mayerich
implementing mie ...
|
238
239
240
241
242
|
template<typename T>
void gpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW){
size_t wave_bytes = sizeof(stim::scalarwave<T>);
size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available
|
9339fbad
David Mayerich
implementing mie ...
|
243
|
size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory
|
9339fbad
David Mayerich
implementing mie ...
|
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
|
size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required
stim::scalarwave<T>* batch_W;
HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves
int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
size_t batch_size; //declare a variable to store the size of the current batch
size_t waves_processed = 0; //initialize the number of waves processed to zero
while(waves_processed < nW){ //while there are still waves to be processed
batch_size = min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left
batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory
cuda_scalarwave<T><<< blocks, threads, batch_bytes >>>(F, N, x, y, z, batch_W, batch_size); //call the kernel
waves_processed += batch_size; //increment the counter indicating how many waves have been processed
}
cudaFree(batch_W);
}
|
8e4f8364
David Mayerich
started a new opt...
|
264
265
266
267
268
269
270
271
272
273
|
/// Sums a series of coherent plane waves at a specified point
/// @param field is the output array of field values corresponding to each input point
/// @param x is an array of x coordinates for the field point
/// @param y is an array of y coordinates for the field point
/// @param z is an array of z coordinates for the field point
/// @param N is the number of points in the input and output arrays
/// @param lambda is the wavelength (all coherent waves are assumed to have the same wavelength)
/// @param A is the list of amplitudes for each wave
/// @param S is the list of propagation directions for each wave
template<typename T>
|
9339fbad
David Mayerich
implementing mie ...
|
274
275
276
|
void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W){
size_t S = W.size(); //store the number of waves
#ifdef __CUDACC__
|
8e4f8364
David Mayerich
started a new opt...
|
277
278
|
stim::complex<T>* dev_F; //allocate space for the field
cudaMalloc(&dev_F, N * sizeof(stim::complex<T>));
|
9339fbad
David Mayerich
implementing mie ...
|
279
280
|
cudaMemcpy(dev_F, F, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
//cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
|
8e4f8364
David Mayerich
started a new opt...
|
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
|
T* dev_x = NULL; //allocate space and copy the X coordinate (if specified)
if(x != NULL){
HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
}
T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified)
if(y != NULL){
HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
}
T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified)
if(z != NULL){
HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
}
|
9339fbad
David Mayerich
implementing mie ...
|
300
301
302
|
stim::scalarwave<T>* dev_W;
HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
|
8e4f8364
David Mayerich
started a new opt...
|
303
|
|
9339fbad
David Mayerich
implementing mie ...
|
304
|
gpu_scalarwaves(dev_F, N, dev_x, dev_y, dev_z, dev_W, W.size());
|
8e4f8364
David Mayerich
started a new opt...
|
305
306
307
308
309
310
311
|
cudaMemcpy(F, dev_F, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
if(x != NULL) cudaFree(dev_x); //free everything
if(y != NULL) cudaFree(dev_y);
if(z != NULL) cudaFree(dev_z);
cudaFree(dev_F);
|
9339fbad
David Mayerich
implementing mie ...
|
312
313
314
315
316
317
318
|
#else
memset(F, 0, N * sizeof(stim::complex<T>));
T px, py, pz;
for(size_t i = 0; i < N; i++){ // for each element in the array
(x == NULL) ? px = 0 : px = x[i]; // test for NULL values
(y == NULL) ? py = 0 : py = y[i];
(z == NULL) ? pz = 0 : pz = z[i];
|
8e4f8364
David Mayerich
started a new opt...
|
319
|
|
9339fbad
David Mayerich
implementing mie ...
|
320
321
322
323
|
for(size_t s = 0; s < S; s++){
F[i] += w_array[s].pos(px, py, pz); //sum all plane waves at this point
}
}
|
8e4f8364
David Mayerich
started a new opt...
|
324
325
326
327
328
329
|
#endif
}
template<typename T>
void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
std::vector< stim::scalarwave<T> > w_array(1, w);
|
9339fbad
David Mayerich
implementing mie ...
|
330
|
cpu_scalarwaves(F, N, x, y, z, w_array);
|
31262e83
David Mayerich
GPU implementatio...
|
331
332
333
334
335
336
|
}
template<typename T>
void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
std::vector< stim::scalarwave<T> > w_array(1, w);
cpu_scalarwaves(F, N, x, y, z, w_array);
|
8e4f8364
David Mayerich
started a new opt...
|
337
338
339
340
341
342
343
344
345
346
347
|
}
/// Sums a series of coherent plane waves at a specified point
/// @param x is the x coordinate of the field point
/// @param y is the y coordinate of the field point
/// @param z is the z coordinate of the field point
/// @param lambda is the wavelength (all coherent waves are assumed to have the same wavelength)
/// @param A is the list of amplitudes for each wave
/// @param S is the list of propagation directions for each wave
template<typename T>
|
9339fbad
David Mayerich
implementing mie ...
|
348
|
CUDA_CALLABLE stim::complex<T> cpu_scalarwaves(T x, T y, T z, std::vector< stim::scalarwave<T> > W){
|
8e4f8364
David Mayerich
started a new opt...
|
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
|
size_t N = W.size(); //get the number of plane wave samples
stim::complex<T> field(0, 0); //initialize the field to zero (0)
stim::vec3<T> k; //allocate space for the direction vector
for(size_t i = 0; i < N; i++){
field += W[i].pos(x, y, z);
}
return field;
}
} //end namespace stim
template <typename T>
std::ostream& operator<<(std::ostream& os, stim::scalarwave<T> p)
{
os<<p.str();
return os;
}
#endif
|