Commit 31262e831e976c77dcedd6370811cc9f59bc24fe

Authored by David Mayerich
1 parent e252a6d6

GPU implementation of Mie scattering (external field)

stim/optics/mie.h
... ... @@ -3,6 +3,7 @@
3 3  
4 4 #include "scalarwave.h"
5 5 #include "../math/bessel.h"
  6 +#include "../cuda/cudatools/devices.h"
6 7 #include <cmath>
7 8  
8 9 namespace stim{
... ... @@ -33,7 +34,7 @@ void B_coefficients(stim::complex&lt;T&gt;* B, T a, T k, stim::complex&lt;T&gt; n, int Nl){
33 34 stim::complex<double> h_ka, dh_ka;
34 35 stim::complex<double> numerator, denominator;
35 36 stim::complex<double> i(0, 1);
36   - for(size_t l = 0; l <= Nl; l++){
  37 + for(int l = 0; l <= Nl; l++){
37 38 h_ka.r = j_ka[l];
38 39 h_ka.i = y_ka[l];
39 40 dh_ka.r = dj_ka[l];
... ... @@ -81,84 +82,102 @@ void A_coefficients(stim::complex&lt;T&gt;* A, T a, T k, stim::complex&lt;T&gt; n, int Nl){
81 82 }
82 83 }
83 84  
  85 +#define LOCAL_NL 16
84 86 template<typename T>
85   -__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, int Nl){
86   - extern __shared__ stim::scalarwave<T> shared_W[]; //declare the list of waves in shared memory
87   -
88   - stim::cuda::sharedMemcpy(shared_W, W, nW, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access
89   - __syncthreads(); //synchronize threads to insure all data is copied
  87 +__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, int Nl){
  88 + extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory
90 89  
91 90 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
92   - if(i >= N) return; //exit if this thread is outside the array
  91 + if(i >= N) return; //exit if this thread is outside the array
93 92 stim::vec3<T> p;
94 93 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
95 94 (y == NULL) ? p[1] = 0 : p[1] = y[i];
96 95 (z == NULL) ? p[2] = 0 : p[2] = z[i];
97 96  
98   - T r = p.len(); //calculate the distance from the sphere
  97 + T r = p.len(); //calculate the distance from the sphere
  98 + if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field)
99 99 T k = W[0].kmag();
100   - if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field)
101   -
102   - size_t NC = Nl + 1; //calculate the number of coefficients to be used
103   - T kr = r * k; //calculate the thread value for k*r
104   - T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT
105   - size_t ij = (size_t) fij; //convert to an integral index
106   - T alpha = fij - ij; //calculate the fractional portion of the index
107   - size_t n0j = ij * (NC); //start of the first entry in the LUT
108   - size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT
  100 + size_t NC = Nl + 1; //calculate the number of coefficients to be used
  101 + T kr = r * k; //calculate the thread value for k*r
  102 + T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT
  103 + size_t ij = (size_t) fij; //convert to an integral index
  104 + T alpha = fij - ij; //calculate the fractional portion of the index
  105 + size_t n0j = ij * (NC); //start of the first entry in the LUT
  106 + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT
109 107  
110 108 T cos_phi;
111   - T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials
112   - T Pl = 1; //initialize the current value for the Legendre polynomial
113   - T jl;
  109 + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials
  110 +
  111 + stim::complex<T> hBl;
114 112 stim::complex<T> Ei = 0; //create a register to store the result
115 113 int l;
  114 +
  115 + stim::complex<T> hlBl[LOCAL_NL+1];
  116 + int shared_start = threadIdx.x * (Nl - LOCAL_NL);
  117 +
  118 + #pragma unroll LOCAL_NL+1
  119 + for(l = 0; l <= LOCAL_NL; l++)
  120 + hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
  121 +
  122 + for(l = LOCAL_NL+1; l <= Nl; l++)
  123 + shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
  124 +
116 125 for(size_t w = 0; w < nW; w++){
117 126 cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere
118   - for(l = 0; l <= Nl; l++){
119   - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  127 + Pl_2 = 1;
  128 + Pl_1 = cos_phi;
  129 + Ei += W[w].E() * hlBl[0] * Pl_2;
  130 + Ei += W[w].E() * hlBl[1] * Pl_1;
  131 +
  132 + #pragma unroll LOCAL_NL-1
  133 + for(l = 2; l <= LOCAL_NL; l++){
  134 + Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
  135 + Ei += W[w].E() * hlBl[l] * Pl;
  136 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
120 137 Pl_1 = Pl;
121   - if(l == 0){ //computing Pl is done recursively, where the recursive relation
122   - Pl = cos_phi; // requires the first two orders. This defines the second.
123   - }
124   - else{ //if this is not the first iteration, use the recursive relation to calculate Pl
125   - Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
126   - }
  138 + }
127 139  
128   - jl = lerp<T>( j[n0j + l], j[n1j + l], alpha ); //read jl from the LUT and interpolate the result
129   - Ei += W[w].E() * B[l] * jl * Pl;
  140 + for(l = LOCAL_NL+1; l <= Nl; l++){
  141 + Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
  142 + Ei += W[w].E() * shared_hB[shared_start + (l - (LOCAL_NL+1))] * Pl;
  143 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  144 + Pl_1 = Pl;
  145 +
130 146 }
131   - //Ei += shared_W[w].pos(p); //evaluate the plane wave
132 147 }
133   - E[i] += Ei; //copy the result to device memory
  148 + E[i] += Ei; //copy the result to device memory
134 149 }
135 150  
136 151 template<typename T>
137   -void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, size_t Nl){
138   -
139   - size_t wave_bytes = sizeof(stim::scalarwave<T>);
140   - size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available
141   - size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array
142   - size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory
143   - size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves
144   - size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required
145   -
146   - stim::scalarwave<T>* batch_W;
147   - HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves
148   -
149   - int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
150   - dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
151   -
152   - size_t batch_size; //declare a variable to store the size of the current batch
153   - size_t waves_processed = 0; //initialize the number of waves processed to zero
154   - while(waves_processed < nW){ //while there are still waves to be processed
155   - batch_size = min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left
156   - batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
157   - HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory
158   - cuda_scalar_mie_scatter<T><<< blocks, threads, batch_bytes >>>(E, N, x, y, z, batch_W, batch_size, a, n, B, j, kr_min, dkr, (int)Nl); //call the kernel
159   - waves_processed += batch_size; //increment the counter indicating how many waves have been processed
160   - }
161   - cudaFree(batch_W);
  152 +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
  153 +
  154 + size_t max_shared_mem = stim::sharedMemPerBlock();
  155 + int hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
  156 + std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
  157 + std::cout<<"shared memory: "<<max_shared_mem<<std::endl;
  158 + int threads = (max_shared_mem / hBl_array) / 32 * 32;
  159 + std::cout<<"threads per block: "<<threads<<std::endl;
  160 + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  161 +
  162 + size_t shared_mem;
  163 + if(Nl <= LOCAL_NL) shared_mem = 0;
  164 + else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate
  165 + std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
  166 + cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel
  167 +
  168 +}
  169 +
  170 +template<typename T>
  171 +__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){
  172 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  173 + if(i >= N) return; //exit if this thread is outside the array
  174 +
  175 + stim::vec3<T> p;
  176 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  177 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  178 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  179 +
  180 + r[i] = p.len();
162 181 }
163 182 /// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
164 183  
... ... @@ -171,17 +190,19 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, sti
171 190 /// @param a is the radius of the sphere
172 191 /// @param n is the complex refractive index of the sphere
173 192 template<typename T>
174   -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n){
  193 +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){
175 194 //calculate the necessary number of orders required to represent the scattered field
176 195 T k = W[0].kmag();
177 196  
178   - size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
  197 + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
  198 + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization)
  199 + std::cout<<"Nl: "<<Nl<<std::endl;
179 200  
180 201 //calculate the scattering coefficients for the sphere
181 202 stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
182 203 B_coefficients(B, a, k, n, Nl);
183 204  
184   -#ifdef __CUDACC__
  205 +#ifdef CUDA_FOUND
185 206 stim::complex<T>* dev_E; //allocate space for the field
186 207 cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
187 208 cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
... ... @@ -209,16 +230,44 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
209 230 HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
210 231 HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
211 232  
212   - // SCATTERING COEFFICIENTS
213   - stim::complex<T>* dev_B;
214   - HANDLE_ERROR( cudaMalloc(&dev_B, sizeof(stim::complex<T>) * (Nl+1)) );
215   - HANDLE_ERROR( cudaMemcpy(dev_B, B, sizeof(stim::complex<T>) * (Nl+1), cudaMemcpyHostToDevice) );
216   -
217 233 // BESSEL FUNCTION LOOK-UP TABLE
  234 + //calculate the distance from the sphere center
  235 + T* dev_r;
  236 + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
  237 +
  238 + int threads = stim::maxThreadsPerBlock();
  239 + dim3 blocks((unsigned)(N / threads + 1));
  240 + cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
  241 +
  242 + //Find the minimum and maximum values of r
  243 + cublasStatus_t stat;
  244 + cublasHandle_t handle;
  245 +
  246 + stat = cublasCreate(&handle); //create a cuBLAS handle
  247 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  248 + printf ("CUBLAS initialization failed\n");
  249 + exit(1);
  250 + }
  251 +
  252 + int i_min, i_max;
  253 + stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
  254 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  255 + printf ("CUBLAS Error: failed to calculate minimum r value.\n");
  256 + exit(1);
  257 + }
  258 + stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
  259 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  260 + printf ("CUBLAS Error: failed to calculate maximum r value.\n");
  261 + exit(1);
  262 + }
  263 +
  264 + T r_min, r_max; //allocate space to store the minimum and maximum values
  265 + HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
  266 + HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
  267 +
  268 +
218 269 //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r
219   - size_t Nlut_j = 1024;
220   - T r_min = 0;
221   - T r_max = 10;
  270 + size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1);
222 271  
223 272 T kr_min = k * r_min;
224 273 T kr_max = k * r_max;
... ... @@ -230,25 +279,29 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
230 279 double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
231 280 double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
232 281  
233   - size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
234   - T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table
235   - T dkr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT
236   - std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl;
237   - for(size_t kri = 0; kri < Nlut_j; kri++){ //for each value in the LUT
  282 + size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;
  283 + stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table
  284 + T dkr = (kr_max - kr_min) / (N_hB_lut-1); //distance between values in the LUT
  285 + std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl;
  286 + stim::complex<T> hl;
  287 + for(size_t kri = 0; kri < N_hB_lut; kri++){ //for each value in the LUT
238 288 stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
239 289 for(size_t l = 0; l <= Nl; l++){ //for each order
240   - bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result
  290 + hl.r = (T)jv[l];
  291 + hl.i = (T)yv[l];
  292 +
  293 + hB_lut[kri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result
241 294 }
242 295 }
243 296  
244   - stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
  297 + //stim::cpu2image<T>(hankel_lut, "hankel.bmp", Nl+1, Nlut_j, stim::cmBrewer);
245 298  
246 299 //Allocate device memory and copy everything to the GPU
247   - T* dev_j_lut;
248   - HANDLE_ERROR( cudaMalloc(&dev_j_lut, lutj_bytes) );
249   - HANDLE_ERROR( cudaMemcpy(dev_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) );
  300 + stim::complex<T>* dev_hB_lut;
  301 + HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
  302 + HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
250 303  
251   - gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_B, dev_j_lut, kr_min, dkr, Nl);
  304 + gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, kr_min, dkr, N_hB_lut, Nl);
252 305  
253 306 cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
254 307  
... ... @@ -318,6 +371,7 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
318 371 T k = W[0].kmag();
319 372  
320 373 size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
  374 + std::cout<<"Nl: "<<Nl<<std::endl;
321 375  
322 376 //calculate the scattering coefficients for the sphere
323 377 stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
... ...
stim/optics/scalarbeam.h
... ... @@ -110,7 +110,7 @@ public:
110 110 std::vector< scalarwave<T> > samples(N); //create a vector of plane waves
111 111 T kmag = (T)stim::TAU / lambda; //calculate the wavenumber
112 112 stim::complex<T> apw; //allocate space for the amplitude at the focal point
113   - T a = stim::TAU * (1 - cos(asin(NA[0]))) / (double)N;
  113 + T a = (T)(stim::TAU * (1 - cos(asin(NA[0]))) / (double)N);
114 114 stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction
115 115 for(size_t i=0; i<N; i++){ //for each sample
116 116 kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave
... ... @@ -201,6 +201,11 @@ CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val,
201 201 }
202 202  
203 203 template <typename T>
  204 +CUDA_CALLABLE stim::complex<T> clerp(stim::complex<T> v0, stim::complex<T> v1, T t) {
  205 + return stim::complex<T>( fma(t, v1.r, fma(-t, v0.r, v0.r)), fma(t, v1.i, fma(-t, v0.i, v0.i)) );
  206 +}
  207 +
  208 +template <typename T>
204 209 CUDA_CALLABLE T lerp(T v0, T v1, T t) {
205 210 return fma(t, v1, fma(-t, v0, v0));
206 211 }
... ...
stim/optics/scalarwave.h
... ... @@ -23,7 +23,7 @@ namespace stim{
23 23 template<typename T>
24 24 class scalarwave{
25 25  
26   -protected:
  26 +public:
27 27  
28 28 stim::vec3<T> k; //k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
29 29 stim::complex<T> E0; //amplitude
... ... @@ -240,9 +240,7 @@ void gpu_scalarwaves(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scal
240 240  
241 241 size_t wave_bytes = sizeof(stim::scalarwave<T>);
242 242 size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available
243   - size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array
244 243 size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory
245   - size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves
246 244 size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required
247 245  
248 246 stim::scalarwave<T>* batch_W;
... ... @@ -332,6 +330,12 @@ void cpu_scalarwave(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scala
332 330 cpu_scalarwaves(F, N, x, y, z, w_array);
333 331 }
334 332  
  333 +template<typename T>
  334 +void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
  335 + std::vector< stim::scalarwave<T> > w_array(1, w);
  336 + cpu_scalarwaves(F, N, x, y, z, w_array);
  337 +}
  338 +
335 339  
336 340 /// Sums a series of coherent plane waves at a specified point
337 341 /// @param x is the x coordinate of the field point
... ...