Commit 31262e831e976c77dcedd6370811cc9f59bc24fe (1 parent: e252a6d6)
GPU implementation of Mie scattering (external field)
Showing 3 changed files with 146 additions and 83 deletions
stim/optics/mie.h
... | ... | @@ -3,6 +3,7 @@ |
3 | 3 | |
4 | 4 | #include "scalarwave.h" |
5 | 5 | #include "../math/bessel.h" |
6 | +#include "../cuda/cudatools/devices.h" | |
6 | 7 | #include <cmath> |
7 | 8 | |
8 | 9 | namespace stim{ |
... | ... | @@ -33,7 +34,7 @@ void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){ |
33 | 34 | stim::complex<double> h_ka, dh_ka; |
34 | 35 | stim::complex<double> numerator, denominator; |
35 | 36 | stim::complex<double> i(0, 1); |
36 | - for(size_t l = 0; l <= Nl; l++){ | |
37 | + for(int l = 0; l <= Nl; l++){ | |
37 | 38 | h_ka.r = j_ka[l]; |
38 | 39 | h_ka.i = y_ka[l]; |
39 | 40 | dh_ka.r = dj_ka[l]; |
... | ... | @@ -81,84 +82,102 @@ void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){ |
81 | 82 | } |
82 | 83 | } |
83 | 84 | |
85 | +#define LOCAL_NL 16 | |
84 | 86 | template<typename T> |
85 | -__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, int Nl){ | |
86 | - extern __shared__ stim::scalarwave<T> shared_W[]; //declare the list of waves in shared memory | |
87 | - | |
88 | - stim::cuda::sharedMemcpy(shared_W, W, nW, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access | |
89 | - __syncthreads(); //synchronize threads to insure all data is copied | |
87 | +__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, int Nl){ | |
88 | +	extern __shared__ stim::complex<T> shared_hB[];		//declare shared memory for the h_l*B_l coefficients that don't fit in registers | |
90 | 89 | |
91 | 90 | size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array |
92 | - if(i >= N) return; //exit if this thread is outside the array | |
91 | + if(i >= N) return; //exit if this thread is outside the array | |
93 | 92 | stim::vec3<T> p; |
94 | 93 | (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions |
95 | 94 | (y == NULL) ? p[1] = 0 : p[1] = y[i]; |
96 | 95 | (z == NULL) ? p[2] = 0 : p[2] = z[i]; |
97 | 96 | |
98 | - T r = p.len(); //calculate the distance from the sphere | |
97 | + T r = p.len(); //calculate the distance from the sphere | |
98 | +	if(r < a) return;											//exit if the point is inside the sphere (this kernel only computes the external, scattered field) | |
99 | 99 | T k = W[0].kmag(); |
100 | - if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field) | |
101 | - | |
102 | - size_t NC = Nl + 1; //calculate the number of coefficients to be used | |
103 | - T kr = r * k; //calculate the thread value for k*r | |
104 | - T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT | |
105 | - size_t ij = (size_t) fij; //convert to an integral index | |
106 | - T alpha = fij - ij; //calculate the fractional portion of the index | |
107 | - size_t n0j = ij * (NC); //start of the first entry in the LUT | |
108 | - size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT | |
100 | + size_t NC = Nl + 1; //calculate the number of coefficients to be used | |
101 | + T kr = r * k; //calculate the thread value for k*r | |
102 | + T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT | |
103 | + size_t ij = (size_t) fij; //convert to an integral index | |
104 | + T alpha = fij - ij; //calculate the fractional portion of the index | |
105 | + size_t n0j = ij * (NC); //start of the first entry in the LUT | |
106 | + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT | |
109 | 107 | |
110 | 108 | T cos_phi; |
111 | - T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials | |
112 | - T Pl = 1; //initialize the current value for the Legendre polynomial | |
113 | - T jl; | |
109 | +	T Pl_2, Pl_1, Pl;						//declare registers to store the current and previous two Legendre polynomial values | |
110 | + | |
111 | + stim::complex<T> hBl; | |
114 | 112 | stim::complex<T> Ei = 0; //create a register to store the result |
115 | 113 | int l; |
114 | + | |
115 | + stim::complex<T> hlBl[LOCAL_NL+1]; | |
116 | + int shared_start = threadIdx.x * (Nl - LOCAL_NL); | |
117 | + | |
118 | + #pragma unroll LOCAL_NL+1 | |
119 | + for(l = 0; l <= LOCAL_NL; l++) | |
120 | + hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha ); | |
121 | + | |
122 | + for(l = LOCAL_NL+1; l <= Nl; l++) | |
123 | + shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha ); | |
124 | + | |
116 | 125 | for(size_t w = 0; w < nW; w++){ |
117 | 126 | cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere |
118 | - for(l = 0; l <= Nl; l++){ | |
119 | - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 | |
127 | + Pl_2 = 1; | |
128 | + Pl_1 = cos_phi; | |
129 | + Ei += W[w].E() * hlBl[0] * Pl_2; | |
130 | + Ei += W[w].E() * hlBl[1] * Pl_1; | |
131 | + | |
132 | + #pragma unroll LOCAL_NL-1 | |
133 | + for(l = 2; l <= LOCAL_NL; l++){ | |
134 | +			Pl = ( (2 * (l - 1) + 1) * cos_phi * Pl_1 - (l - 1) * Pl_2 ) / l;	//Legendre recurrence: l*P_l = (2l-1)*x*P_(l-1) - (l-1)*P_(l-2) | |
135 | + Ei += W[w].E() * hlBl[l] * Pl; | |
136 | + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 | |
120 | 137 | Pl_1 = Pl; |
121 | - if(l == 0){ //computing Pl is done recursively, where the recursive relation | |
122 | - Pl = cos_phi; // requires the first two orders. This defines the second. | |
123 | - } | |
124 | - else{ //if this is not the first iteration, use the recursive relation to calculate Pl | |
125 | - Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1); | |
126 | - } | |
138 | + } | |
127 | 139 | |
128 | - jl = lerp<T>( j[n0j + l], j[n1j + l], alpha ); //read jl from the LUT and interpolate the result | |
129 | - Ei += W[w].E() * B[l] * jl * Pl; | |
140 | + for(l = LOCAL_NL+1; l <= Nl; l++){ | |
141 | +			Pl = ( (2 * (l - 1) + 1) * cos_phi * Pl_1 - (l - 1) * Pl_2 ) / l;	//same Legendre recurrence for the shared-memory orders | |
142 | + Ei += W[w].E() * shared_hB[shared_start + (l - (LOCAL_NL+1))] * Pl; | |
143 | + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 | |
144 | + Pl_1 = Pl; | |
145 | + | |
130 | 146 | } |
131 | - //Ei += shared_W[w].pos(p); //evaluate the plane wave | |
132 | 147 | } |
133 | - E[i] += Ei; //copy the result to device memory | |
148 | + E[i] += Ei; //copy the result to device memory | |
134 | 149 | } |
135 | 150 | |
136 | 151 | template<typename T> |
137 | -void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, size_t Nl){ | |
138 | - | |
139 | - size_t wave_bytes = sizeof(stim::scalarwave<T>); | |
140 | - size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available | |
141 | - size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array | |
142 | - size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory | |
143 | - size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves | |
144 | - size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required | |
145 | - | |
146 | - stim::scalarwave<T>* batch_W; | |
147 | - HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves | |
148 | - | |
149 | - int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | |
150 | - dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks | |
151 | - | |
152 | - size_t batch_size; //declare a variable to store the size of the current batch | |
153 | - size_t waves_processed = 0; //initialize the number of waves processed to zero | |
154 | - while(waves_processed < nW){ //while there are still waves to be processed | |
155 | - batch_size = min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left | |
156 | - batch_bytes = batch_size * sizeof(stim::scalarwave<T>); | |
157 | - HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory | |
158 | - cuda_scalar_mie_scatter<T><<< blocks, threads, batch_bytes >>>(E, N, x, y, z, batch_W, batch_size, a, n, B, j, kr_min, dkr, (int)Nl); //call the kernel | |
159 | - waves_processed += batch_size; //increment the counter indicating how many waves have been processed | |
160 | - } | |
161 | - cudaFree(batch_W); | |
152 | +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){ | |
153 | + | |
154 | + size_t max_shared_mem = stim::sharedMemPerBlock(); | |
155 | +	int hBl_array = sizeof(stim::complex<T>) * (Nl + 1);		//bytes required to store all h_l*B_l values for one thread | |
156 | + std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; | |
157 | + std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | |
158 | +	int threads = (max_shared_mem / hBl_array) / 32 * 32;		//round the thread count down to a multiple of the warp size (32); conservative, since only Nl-LOCAL_NL values per thread actually spill to shared memory | |
159 | + std::cout<<"threads per block: "<<threads<<std::endl; | |
160 | +	dim3 blocks((unsigned)(N / threads + 1));							//calculate enough blocks to cover all N points | |
161 | + | |
162 | + size_t shared_mem; | |
163 | + if(Nl <= LOCAL_NL) shared_mem = 0; | |
164 | + else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate | |
165 | + std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; | |
166 | + cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel | |
167 | + | |
168 | +} | |
169 | + | |
170 | +template<typename T> | |
171 | +__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ | |
172 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array | |
173 | + if(i >= N) return; //exit if this thread is outside the array | |
174 | + | |
175 | + stim::vec3<T> p; | |
176 | + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions | |
177 | + (y == NULL) ? p[1] = 0 : p[1] = y[i]; | |
178 | + (z == NULL) ? p[2] = 0 : p[2] = z[i]; | |
179 | + | |
180 | + r[i] = p.len(); | |
162 | 181 | } |
163 | 182 | /// Calculate the scalar Mie solution for the scattered field produced by a single plane wave |
164 | 183 | |
... | ... | @@ -171,17 +190,19 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti |
171 | 190 | /// @param a is the radius of the sphere |
172 | 191 | /// @param n is the complex refractive index of the sphere |
173 | 192 | template<typename T> |
174 | -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n){ | |
193 | +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ | |
175 | 194 | //calculate the necessary number of orders required to represent the scattered field |
176 | 195 | T k = W[0].kmag(); |
177 | 196 | |
178 | - size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); | |
197 | + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); | |
198 | + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) | |
199 | + std::cout<<"Nl: "<<Nl<<std::endl; | |
179 | 200 | |
180 | 201 | //calculate the scattering coefficients for the sphere |
181 | 202 | stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients |
182 | 203 | B_coefficients(B, a, k, n, Nl); |
183 | 204 | |
184 | -#ifdef __CUDACC__ | |
205 | +#ifdef CUDA_FOUND | |
185 | 206 | stim::complex<T>* dev_E; //allocate space for the field |
186 | 207 | cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); |
187 | 208 | cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); |
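Note: the cutoff Nl = ceil(k*a + 4*(k*a)^(1/3) + 2) computed in this hunk is the standard Wiscombe-style truncation for the Mie series (enough orders for the partial waves to decay). A small sketch with a worked value (names illustrative):

    #include <cmath>
    #include <cstdio>

    //Wiscombe-style Mie series truncation, matching the expression above
    int mie_order_cutoff(double k, double a){
        double ka = k * a;
        return (int)std::ceil(ka + 4.0 * std::cbrt(ka) + 2.0);
    }

    int main(){
        double k = 6.283185307;                       //wavenumber for lambda = 1 (any length unit)
        double a = 1.0;                               //sphere radius of one wavelength
        printf("Nl = %d\n", mie_order_cutoff(k, a));  //ka ~ 6.28 -> Nl = 16, which is why LOCAL_NL = 16 is a sensible floor
        return 0;
    }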
... | ... | @@ -209,16 +230,44 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std |
209 | 230 | HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); |
210 | 231 | HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) ); |
211 | 232 | |
212 | - // SCATTERING COEFFICIENTS | |
213 | - stim::complex<T>* dev_B; | |
214 | - HANDLE_ERROR( cudaMalloc(&dev_B, sizeof(stim::complex<T>) * (Nl+1)) ); | |
215 | - HANDLE_ERROR( cudaMemcpy(dev_B, B, sizeof(stim::complex<T>) * (Nl+1), cudaMemcpyHostToDevice) ); | |
216 | - | |
217 | 233 | // BESSEL FUNCTION LOOK-UP TABLE |
234 | + //calculate the distance from the sphere center | |
235 | + T* dev_r; | |
236 | + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) ); | |
237 | + | |
238 | + int threads = stim::maxThreadsPerBlock(); | |
239 | + dim3 blocks((unsigned)(N / threads + 1)); | |
240 | + cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); | |
241 | + | |
242 | + //Find the minimum and maximum values of r | |
243 | + cublasStatus_t stat; | |
244 | + cublasHandle_t handle; | |
245 | + | |
246 | + stat = cublasCreate(&handle); //create a cuBLAS handle | |
247 | + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure | |
248 | + printf ("CUBLAS initialization failed\n"); | |
249 | + exit(1); | |
250 | + } | |
251 | + | |
252 | + int i_min, i_max; | |
253 | +	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);		//note: Isamin/Isamax are single-precision; a T = double build would need cublasIdamin/Idamax | |
254 | + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure | |
255 | + printf ("CUBLAS Error: failed to calculate minimum r value.\n"); | |
256 | + exit(1); | |
257 | + } | |
258 | + stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max); | |
259 | + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure | |
260 | + printf ("CUBLAS Error: failed to calculate maximum r value.\n"); | |
261 | + exit(1); | |
262 | + } | |
263 | + | |
264 | + T r_min, r_max; //allocate space to store the minimum and maximum values | |
265 | +	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + (i_min - 1), sizeof(T), cudaMemcpyDeviceToHost) );	//copy the min and max values from the device to the CPU (cuBLAS indices are 1-based) | |
266 | +	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + (i_max - 1), sizeof(T), cudaMemcpyDeviceToHost) ); | |
267 | + | |
268 | + | |
218 | 269 | //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r |
219 | - size_t Nlut_j = 1024; | |
220 | - T r_min = 0; | |
221 | - T r_max = 10; | |
270 | + size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1); | |
222 | 271 | |
223 | 272 | T kr_min = k * r_min; |
224 | 273 | T kr_max = k * r_max; |
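Note: cublasIsamin/cublasIsamax return 1-based (Fortran-convention) indices, which is why the copies above subtract one. An alternative sketch using thrust::minmax_element, which returns device pointers directly (and works for double as well); it assumes the same dev_r array produced by cuda_dist:

    #include <thrust/extrema.h>
    #include <thrust/execution_policy.h>
    #include <cuda_runtime.h>

    //Sketch: min/max of a device array of N distances, without cuBLAS index conventions.
    void minmax_r(const float* dev_r, size_t N, float& r_min, float& r_max){
        thrust::pair<const float*, const float*> mm =
            thrust::minmax_element(thrust::device, dev_r, dev_r + N);
        cudaMemcpy(&r_min, mm.first,  sizeof(float), cudaMemcpyDeviceToHost);
        cudaMemcpy(&r_max, mm.second, sizeof(float), cudaMemcpyDeviceToHost);
    }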
... | ... | @@ -230,25 +279,29 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std |
230 | 279 | double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); |
231 | 280 | double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); |
232 | 281 | |
233 | - size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j; | |
234 | - T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table | |
235 | - T dkr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT | |
236 | - std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl; | |
237 | - for(size_t kri = 0; kri < Nlut_j; kri++){ //for each value in the LUT | |
282 | + size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut; | |
283 | + stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table | |
284 | + T dkr = (kr_max - kr_min) / (N_hB_lut-1); //distance between values in the LUT | |
285 | +	std::cout<<"h_l*B_l LUT bytes: "<<hB_bytes<<std::endl; | |
286 | + stim::complex<T> hl; | |
287 | + for(size_t kri = 0; kri < N_hB_lut; kri++){ //for each value in the LUT | |
238 | 288 | stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] |
239 | 289 | for(size_t l = 0; l <= Nl; l++){ //for each order |
240 | - bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result | |
290 | + hl.r = (T)jv[l]; | |
291 | + hl.i = (T)yv[l]; | |
292 | + | |
293 | +			hB_lut[kri * (Nl + 1) + l] = hl * B[l];									//store the product h_l(kr) * B_l | |
241 | 294 | } |
242 | 295 | } |
243 | 296 | |
244 | - stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); | |
297 | + //stim::cpu2image<T>(hankel_lut, "hankel.bmp", Nl+1, Nlut_j, stim::cmBrewer); | |
245 | 298 | |
246 | 299 | //Allocate device memory and copy everything to the GPU |
247 | - T* dev_j_lut; | |
248 | - HANDLE_ERROR( cudaMalloc(&dev_j_lut, lutj_bytes) ); | |
249 | - HANDLE_ERROR( cudaMemcpy(dev_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) ); | |
300 | + stim::complex<T>* dev_hB_lut; | |
301 | + HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); | |
302 | + HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); | |
250 | 303 | |
251 | - gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_B, dev_j_lut, kr_min, dkr, Nl); | |
304 | + gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, kr_min, dkr, N_hB_lut, Nl); | |
252 | 305 | |
253 | 306 | cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory |
254 | 307 | |
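Note on the look-up table built in this hunk: each entry premultiplies the spherical Hankel function h_l(kr) = j_l(kr) + i*y_l(kr) by the scattering coefficient B_l, stored row-major with row kri holding all Nl+1 orders for one kr sample. The kernel then needs only one linear interpolation per order, and the per-point sum becomes E_s = sum_l [h_l(kr)*B_l] * P_l(cos(phi)) for each incident wave. A host-side reference for the same indexing and interpolation (useful when validating the kernel; names illustrative):

    #include <complex>
    #include <cstddef>

    //Interpolate h_l(kr)*B_l from a row-major LUT with N_hB_lut rows of (Nl+1)
    //entries, sampled from kr_min with spacing dkr -- mirrors the kernel's indexing.
    template<typename T>
    std::complex<T> lut_hB(const std::complex<T>* hB_lut, size_t N_hB_lut, int Nl,
                           T kr_min, T dkr, T kr, int l){
        T fij = (kr - kr_min) / dkr;                  //floating-point row index
        size_t ij = (size_t)fij;                      //lower row
        if(ij >= N_hB_lut - 1) ij = N_hB_lut - 2;     //clamp so row ij+1 stays in range
        T alpha = fij - (T)ij;                        //fractional part
        std::complex<T> v0 = hB_lut[ij * (Nl + 1) + l];
        std::complex<T> v1 = hB_lut[(ij + 1) * (Nl + 1) + l];
        return v0 + alpha * (v1 - v0);                //linear interpolation, same result as clerp
    }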
... | ... | @@ -318,6 +371,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st |
318 | 371 | T k = W[0].kmag(); |
319 | 372 | |
320 | 373 | size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); |
374 | + std::cout<<"Nl: "<<Nl<<std::endl; | |
321 | 375 | |
322 | 376 | //calculate the scattering coefficients for the sphere |
323 | 377 | stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients | ... | ... |
stim/optics/scalarbeam.h
... | ... | @@ -110,7 +110,7 @@ public: |
110 | 110 | std::vector< scalarwave<T> > samples(N); //create a vector of plane waves |
111 | 111 | T kmag = (T)stim::TAU / lambda; //calculate the wavenumber |
112 | 112 | stim::complex<T> apw; //allocate space for the amplitude at the focal point |
113 | - T a = stim::TAU * (1 - cos(asin(NA[0]))) / (double)N; | |
113 | + T a = (T)(stim::TAU * (1 - cos(asin(NA[0]))) / (double)N); | |
114 | 114 | stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction |
115 | 115 | for(size_t i=0; i<N; i++){ //for each sample |
116 | 116 | kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave |
... | ... | @@ -201,6 +201,11 @@ CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val, |
201 | 201 | } |
202 | 202 | |
203 | 203 | template <typename T> |
204 | +CUDA_CALLABLE stim::complex<T> clerp(stim::complex<T> v0, stim::complex<T> v1, T t) {	//complex lerp: interpolates real and imaginary parts independently | |
205 | + return stim::complex<T>( fma(t, v1.r, fma(-t, v0.r, v0.r)), fma(t, v1.i, fma(-t, v0.i, v0.i)) ); | |
206 | +} | |
207 | + | |
208 | +template <typename T> | |
204 | 209 | CUDA_CALLABLE T lerp(T v0, T v1, T t) { |
205 | 210 | return fma(t, v1, fma(-t, v0, v0)); |
206 | 211 | } | ... | ... |
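Note: the new clerp is just the fma-based lerp applied to the real and imaginary parts independently; fma(t, v1, fma(-t, v0, v0)) evaluates v0 + t*(v1 - v0) and is exact at the endpoints (t = 0 gives v0, t = 1 gives v1). A quick host-side sanity check (std::complex standing in for stim::complex):

    #include <cmath>
    #include <complex>
    #include <cstdio>

    //fma-based lerp, same form as the device version above
    float flerp(float v0, float v1, float t){ return std::fmaf(t, v1, std::fmaf(-t, v0, v0)); }

    int main(){
        std::complex<float> v0(1, 2), v1(3, 6);
        float t = 0.25f;
        std::complex<float> c(flerp(v0.real(), v1.real(), t),
                              flerp(v0.imag(), v1.imag(), t));
        printf("clerp = (%g, %g)\n", c.real(), c.imag()); //expect (1.5, 3)
        return 0;
    }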
stim/optics/scalarwave.h
... | ... | @@ -23,7 +23,7 @@ namespace stim{ |
23 | 23 | template<typename T> |
24 | 24 | class scalarwave{ |
25 | 25 | |
26 | -protected: | |
26 | +public: | |
27 | 27 | |
28 | 28 | stim::vec3<T> k; //k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda |
29 | 29 | stim::complex<T> E0; //amplitude |
... | ... | @@ -240,9 +240,7 @@ void gpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scal |
240 | 240 | |
241 | 241 | size_t wave_bytes = sizeof(stim::scalarwave<T>); |
242 | 242 | size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available |
243 | - size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array | |
244 | 243 | size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory |
245 | - size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves | |
246 | 244 | size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required |
247 | 245 | |
248 | 246 | stim::scalarwave<T>* batch_W; |
... | ... | @@ -332,6 +330,12 @@ void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scala |
332 | 330 | cpu_scalarwaves(F, N, x, y, z, w_array); |
333 | 331 | } |
334 | 332 | |
333 | +template<typename T> | |
334 | +void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){ | |
335 | + std::vector< stim::scalarwave<T> > w_array(1, w); | |
336 | + cpu_scalarwaves(F, N, x, y, z, w_array); | |
337 | +} | |
338 | + | |
335 | 339 | |
336 | 340 | /// Sums a series of coherent plane waves at a specified point |
337 | 341 | /// @param x is the x coordinate of the field point | ... | ... |