Commit 31262e831e976c77dcedd6370811cc9f59bc24fe

Authored by David Mayerich
1 parent e252a6d6

GPU implementation of Mie scattering (external field)

@@ -3,6 +3,7 @@ @@ -3,6 +3,7 @@
3 3
4 #include "scalarwave.h" 4 #include "scalarwave.h"
5 #include "../math/bessel.h" 5 #include "../math/bessel.h"
  6 +#include "../cuda/cudatools/devices.h"
6 #include <cmath> 7 #include <cmath>
7 8
8 namespace stim{ 9 namespace stim{
@@ -33,7 +34,7 @@ void B_coefficients(stim::complex&lt;T&gt;* B, T a, T k, stim::complex&lt;T&gt; n, int Nl){ @@ -33,7 +34,7 @@ void B_coefficients(stim::complex&lt;T&gt;* B, T a, T k, stim::complex&lt;T&gt; n, int Nl){
33 stim::complex<double> h_ka, dh_ka; 34 stim::complex<double> h_ka, dh_ka;
34 stim::complex<double> numerator, denominator; 35 stim::complex<double> numerator, denominator;
35 stim::complex<double> i(0, 1); 36 stim::complex<double> i(0, 1);
36 - for(size_t l = 0; l <= Nl; l++){ 37 + for(int l = 0; l <= Nl; l++){
37 h_ka.r = j_ka[l]; 38 h_ka.r = j_ka[l];
38 h_ka.i = y_ka[l]; 39 h_ka.i = y_ka[l];
39 dh_ka.r = dj_ka[l]; 40 dh_ka.r = dj_ka[l];
@@ -81,84 +82,102 @@ void A_coefficients(stim::complex&lt;T&gt;* A, T a, T k, stim::complex&lt;T&gt; n, int Nl){ @@ -81,84 +82,102 @@ void A_coefficients(stim::complex&lt;T&gt;* A, T a, T k, stim::complex&lt;T&gt; n, int Nl){
81 } 82 }
82 } 83 }
83 84
  85 +#define LOCAL_NL 16
84 template<typename T> 86 template<typename T>
85 -__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, int Nl){  
86 - extern __shared__ stim::scalarwave<T> shared_W[]; //declare the list of waves in shared memory  
87 -  
88 - stim::cuda::sharedMemcpy(shared_W, W, nW, threadIdx.x, blockDim.x); //copy the plane waves into shared memory for faster access  
89 - __syncthreads(); //synchronize threads to insure all data is copied 87 +__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, int Nl){
  88 + extern __shared__ stim::complex<T> shared_hB[]; //declare the list of waves in shared memory
90 89
91 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array 90 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
92 - if(i >= N) return; //exit if this thread is outside the array 91 + if(i >= N) return; //exit if this thread is outside the array
93 stim::vec3<T> p; 92 stim::vec3<T> p;
94 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions 93 (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
95 (y == NULL) ? p[1] = 0 : p[1] = y[i]; 94 (y == NULL) ? p[1] = 0 : p[1] = y[i];
96 (z == NULL) ? p[2] = 0 : p[2] = z[i]; 95 (z == NULL) ? p[2] = 0 : p[2] = z[i];
97 96
98 - T r = p.len(); //calculate the distance from the sphere 97 + T r = p.len(); //calculate the distance from the sphere
  98 + if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field)
99 T k = W[0].kmag(); 99 T k = W[0].kmag();
100 - if(r < a) return; //exit if the point is inside the sphere (we only calculate the internal field)  
101 -  
102 - size_t NC = Nl + 1; //calculate the number of coefficients to be used  
103 - T kr = r * k; //calculate the thread value for k*r  
104 - T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT  
105 - size_t ij = (size_t) fij; //convert to an integral index  
106 - T alpha = fij - ij; //calculate the fractional portion of the index  
107 - size_t n0j = ij * (NC); //start of the first entry in the LUT  
108 - size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT 100 + size_t NC = Nl + 1; //calculate the number of coefficients to be used
  101 + T kr = r * k; //calculate the thread value for k*r
  102 + T fij = (kr - kr_min)/dkr; //FP index into the spherical bessel LUT
  103 + size_t ij = (size_t) fij; //convert to an integral index
  104 + T alpha = fij - ij; //calculate the fractional portion of the index
  105 + size_t n0j = ij * (NC); //start of the first entry in the LUT
  106 + size_t n1j = (ij+1) * (NC); //start of the second entry in the LUT
109 107
110 T cos_phi; 108 T cos_phi;
111 - T Pl_2, Pl_1; //declare registers to store the previous two Legendre polynomials  
112 - T Pl = 1; //initialize the current value for the Legendre polynomial  
113 - T jl; 109 + T Pl_2, Pl_1, Pl; //declare registers to store the previous two Legendre polynomials
  110 +
  111 + stim::complex<T> hBl;
114 stim::complex<T> Ei = 0; //create a register to store the result 112 stim::complex<T> Ei = 0; //create a register to store the result
115 int l; 113 int l;
  114 +
  115 + stim::complex<T> hlBl[LOCAL_NL+1];
  116 + int shared_start = threadIdx.x * (Nl - LOCAL_NL);
  117 +
  118 + #pragma unroll LOCAL_NL+1
  119 + for(l = 0; l <= LOCAL_NL; l++)
  120 + hlBl[l] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
  121 +
  122 + for(l = LOCAL_NL+1; l <= Nl; l++)
  123 + shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp<T>( hB[n0j + l], hB[n1j + l], alpha );
  124 +
116 for(size_t w = 0; w < nW; w++){ 125 for(size_t w = 0; w < nW; w++){
117 cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere 126 cos_phi = p.norm().dot(W[w].kvec().norm()); //calculate the cosine of the angle between the k vector and the direction from the sphere
118 - for(l = 0; l <= Nl; l++){  
119 - Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1 127 + Pl_2 = 1;
  128 + Pl_1 = cos_phi;
  129 + Ei += W[w].E() * hlBl[0] * Pl_2;
  130 + Ei += W[w].E() * hlBl[1] * Pl_1;
  131 +
  132 + #pragma unroll LOCAL_NL-1
  133 + for(l = 2; l <= LOCAL_NL; l++){
  134 + Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
  135 + Ei += W[w].E() * hlBl[l] * Pl;
  136 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
120 Pl_1 = Pl; 137 Pl_1 = Pl;
121 - if(l == 0){ //computing Pl is done recursively, where the recursive relation  
122 - Pl = cos_phi; // requires the first two orders. This defines the second.  
123 - }  
124 - else{ //if this is not the first iteration, use the recursive relation to calculate Pl  
125 - Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);  
126 - } 138 + }
127 139
128 - jl = lerp<T>( j[n0j + l], j[n1j + l], alpha ); //read jl from the LUT and interpolate the result  
129 - Ei += W[w].E() * B[l] * jl * Pl; 140 + for(l = LOCAL_NL+1; l <= Nl; l++){
  141 + Pl = ( (2 * l + 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
  142 + Ei += W[w].E() * shared_hB[shared_start + (l - (LOCAL_NL+1))] * Pl;
  143 + Pl_2 = Pl_1; //shift Pl_1 -> Pl_2 and Pl -> Pl_1
  144 + Pl_1 = Pl;
  145 +
130 } 146 }
131 - //Ei += shared_W[w].pos(p); //evaluate the plane wave  
132 } 147 }
133 - E[i] += Ei; //copy the result to device memory 148 + E[i] += Ei; //copy the result to device memory
134 } 149 }
135 150
136 template<typename T> 151 template<typename T>
137 -void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, size_t Nl){  
138 -  
139 - size_t wave_bytes = sizeof(stim::scalarwave<T>);  
140 - size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available  
141 - size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array  
142 - size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory  
143 - size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves  
144 - size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required  
145 -  
146 - stim::scalarwave<T>* batch_W;  
147 - HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes)); //allocate memory for a single batch of plane waves  
148 -  
149 - int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device  
150 - dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks  
151 -  
152 - size_t batch_size; //declare a variable to store the size of the current batch  
153 - size_t waves_processed = 0; //initialize the number of waves processed to zero  
154 - while(waves_processed < nW){ //while there are still waves to be processed  
155 - batch_size = min<size_t>(max_batch, nW - waves_processed); //process either a whole batch, or whatever is left  
156 - batch_bytes = batch_size * sizeof(stim::scalarwave<T>);  
157 - HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice)); //copy the plane waves into global memory  
158 - cuda_scalar_mie_scatter<T><<< blocks, threads, batch_bytes >>>(E, N, x, y, z, batch_W, batch_size, a, n, B, j, kr_min, dkr, (int)Nl); //call the kernel  
159 - waves_processed += batch_size; //increment the counter indicating how many waves have been processed  
160 - }  
161 - cudaFree(batch_W); 152 +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
  153 +
  154 + size_t max_shared_mem = stim::sharedMemPerBlock();
  155 + int hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
  156 + std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
  157 + std::cout<<"shared memory: "<<max_shared_mem<<std::endl;
  158 + int threads = (max_shared_mem / hBl_array) / 32 * 32;
  159 + std::cout<<"threads per block: "<<threads<<std::endl;
  160 + dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
  161 +
  162 + size_t shared_mem;
  163 + if(Nl <= LOCAL_NL) shared_mem = 0;
  164 + else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate
  165 + std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
  166 + cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel
  167 +
  168 +}
  169 +
  170 +template<typename T>
  171 +__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){
  172 + size_t i = blockIdx.x * blockDim.x + threadIdx.x; //get the index into the array
  173 + if(i >= N) return; //exit if this thread is outside the array
  174 +
  175 + stim::vec3<T> p;
  176 + (x == NULL) ? p[0] = 0 : p[0] = x[i]; // test for NULL values and set positions
  177 + (y == NULL) ? p[1] = 0 : p[1] = y[i];
  178 + (z == NULL) ? p[2] = 0 : p[2] = z[i];
  179 +
  180 + r[i] = p.len();
162 } 181 }
163 /// Calculate the scalar Mie solution for the scattered field produced by a single plane wave 182 /// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
164 183
@@ -171,17 +190,19 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, sti @@ -171,17 +190,19 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, sti
171 /// @param a is the radius of the sphere 190 /// @param a is the radius of the sphere
172 /// @param n is the complex refractive index of the sphere 191 /// @param n is the complex refractive index of the sphere
173 template<typename T> 192 template<typename T>
174 -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n){ 193 +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){
175 //calculate the necessary number of orders required to represent the scattered field 194 //calculate the necessary number of orders required to represent the scattered field
176 T k = W[0].kmag(); 195 T k = W[0].kmag();
177 196
178 - size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); 197 + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
  198 + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization)
  199 + std::cout<<"Nl: "<<Nl<<std::endl;
179 200
180 //calculate the scattering coefficients for the sphere 201 //calculate the scattering coefficients for the sphere
181 stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients 202 stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
182 B_coefficients(B, a, k, n, Nl); 203 B_coefficients(B, a, k, n, Nl);
183 204
184 -#ifdef __CUDACC__ 205 +#ifdef CUDA_FOUND
185 stim::complex<T>* dev_E; //allocate space for the field 206 stim::complex<T>* dev_E; //allocate space for the field
186 cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); 207 cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
187 cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); 208 cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
@@ -209,16 +230,44 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std @@ -209,16 +230,44 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
209 HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); 230 HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
210 HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) ); 231 HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
211 232
212 - // SCATTERING COEFFICIENTS  
213 - stim::complex<T>* dev_B;  
214 - HANDLE_ERROR( cudaMalloc(&dev_B, sizeof(stim::complex<T>) * (Nl+1)) );  
215 - HANDLE_ERROR( cudaMemcpy(dev_B, B, sizeof(stim::complex<T>) * (Nl+1), cudaMemcpyHostToDevice) );  
216 -  
217 // BESSEL FUNCTION LOOK-UP TABLE 233 // BESSEL FUNCTION LOOK-UP TABLE
  234 + //calculate the distance from the sphere center
  235 + T* dev_r;
  236 + HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
  237 +
  238 + int threads = stim::maxThreadsPerBlock();
  239 + dim3 blocks((unsigned)(N / threads + 1));
  240 + cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
  241 +
  242 + //Find the minimum and maximum values of r
  243 + cublasStatus_t stat;
  244 + cublasHandle_t handle;
  245 +
  246 + stat = cublasCreate(&handle); //create a cuBLAS handle
  247 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  248 + printf ("CUBLAS initialization failed\n");
  249 + exit(1);
  250 + }
  251 +
  252 + int i_min, i_max;
  253 + stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
  254 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  255 + printf ("CUBLAS Error: failed to calculate minimum r value.\n");
  256 + exit(1);
  257 + }
  258 + stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
  259 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  260 + printf ("CUBLAS Error: failed to calculate maximum r value.\n");
  261 + exit(1);
  262 + }
  263 +
  264 + T r_min, r_max; //allocate space to store the minimum and maximum values
  265 + HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
  266 + HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
  267 +
  268 +
218 //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r 269 //size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1); //number of values in the look-up table based on the user-specified spacing along r
219 - size_t Nlut_j = 1024;  
220 - T r_min = 0;  
221 - T r_max = 10; 270 + size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1);
222 271
223 T kr_min = k * r_min; 272 T kr_min = k * r_min;
224 T kr_max = k * r_max; 273 T kr_max = k * r_max;
@@ -230,25 +279,29 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std @@ -230,25 +279,29 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
230 double* djv= (double*) malloc( (Nl + 1) * sizeof(double) ); 279 double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
231 double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) ); 280 double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
232 281
233 - size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;  
234 - T* bessel_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table  
235 - T dkr = (kr_max - kr_min) / (Nlut_j-1); //distance between values in the LUT  
236 - std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl;  
237 - for(size_t kri = 0; kri < Nlut_j; kri++){ //for each value in the LUT 282 + size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;
  283 + stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table
  284 + T dkr = (kr_max - kr_min) / (N_hB_lut-1); //distance between values in the LUT
  285 + std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl;
  286 + stim::complex<T> hl;
  287 + for(size_t kri = 0; kri < N_hB_lut; kri++){ //for each value in the LUT
238 stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] 288 stim::bessjyv_sph<double>(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
239 for(size_t l = 0; l <= Nl; l++){ //for each order 289 for(size_t l = 0; l <= Nl; l++){ //for each order
240 - bessel_lut[kri * (Nl + 1) + l] = (T)jv[l]; //store the bessel function result 290 + hl.r = (T)jv[l];
  291 + hl.i = (T)yv[l];
  292 +
  293 + hB_lut[kri * (Nl + 1) + l] = hl * B[l]; //store the bessel function result
241 } 294 }
242 } 295 }
243 296
244 - stim::cpu2image<T>(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer); 297 + //stim::cpu2image<T>(hankel_lut, "hankel.bmp", Nl+1, Nlut_j, stim::cmBrewer);
245 298
246 //Allocate device memory and copy everything to the GPU 299 //Allocate device memory and copy everything to the GPU
247 - T* dev_j_lut;  
248 - HANDLE_ERROR( cudaMalloc(&dev_j_lut, lutj_bytes) );  
249 - HANDLE_ERROR( cudaMemcpy(dev_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) ); 300 + stim::complex<T>* dev_hB_lut;
  301 + HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
  302 + HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
250 303
251 - gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_B, dev_j_lut, kr_min, dkr, Nl); 304 + gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, kr_min, dkr, N_hB_lut, Nl);
252 305
253 cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory 306 cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
254 307
@@ -318,6 +371,7 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st @@ -318,6 +371,7 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
318 T k = W[0].kmag(); 371 T k = W[0].kmag();
319 372
320 size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2); 373 size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
  374 + std::cout<<"Nl: "<<Nl<<std::endl;
321 375
322 //calculate the scattering coefficients for the sphere 376 //calculate the scattering coefficients for the sphere
323 stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients 377 stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
stim/optics/scalarbeam.h
@@ -110,7 +110,7 @@ public: @@ -110,7 +110,7 @@ public:
110 std::vector< scalarwave<T> > samples(N); //create a vector of plane waves 110 std::vector< scalarwave<T> > samples(N); //create a vector of plane waves
111 T kmag = (T)stim::TAU / lambda; //calculate the wavenumber 111 T kmag = (T)stim::TAU / lambda; //calculate the wavenumber
112 stim::complex<T> apw; //allocate space for the amplitude at the focal point 112 stim::complex<T> apw; //allocate space for the amplitude at the focal point
113 - T a = stim::TAU * (1 - cos(asin(NA[0]))) / (double)N; 113 + T a = (T)(stim::TAU * (1 - cos(asin(NA[0]))) / (double)N);
114 stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction 114 stim::vec3<T> kpw; //declare the new k-vector based on the focused plane wave direction
115 for(size_t i=0; i<N; i++){ //for each sample 115 for(size_t i=0; i<N; i++){ //for each sample
116 kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave 116 kpw = dirs[i] * kmag; //calculate the k-vector for the new plane wave
@@ -201,6 +201,11 @@ CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val, @@ -201,6 +201,11 @@ CUDA_CALLABLE void lut_lookup(T* lut_values, T* lut, T val, size_t N, T min_val,
201 } 201 }
202 202
203 template <typename T> 203 template <typename T>
  204 +CUDA_CALLABLE stim::complex<T> clerp(stim::complex<T> v0, stim::complex<T> v1, T t) {
  205 + return stim::complex<T>( fma(t, v1.r, fma(-t, v0.r, v0.r)), fma(t, v1.i, fma(-t, v0.i, v0.i)) );
  206 +}
  207 +
  208 +template <typename T>
204 CUDA_CALLABLE T lerp(T v0, T v1, T t) { 209 CUDA_CALLABLE T lerp(T v0, T v1, T t) {
205 return fma(t, v1, fma(-t, v0, v0)); 210 return fma(t, v1, fma(-t, v0, v0));
206 } 211 }
stim/optics/scalarwave.h
@@ -23,7 +23,7 @@ namespace stim{ @@ -23,7 +23,7 @@ namespace stim{
23 template<typename T> 23 template<typename T>
24 class scalarwave{ 24 class scalarwave{
25 25
26 -protected: 26 +public:
27 27
28 stim::vec3<T> k; //k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda 28 stim::vec3<T> k; //k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
29 stim::complex<T> E0; //amplitude 29 stim::complex<T> E0; //amplitude
@@ -240,9 +240,7 @@ void gpu_scalarwaves(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scal @@ -240,9 +240,7 @@ void gpu_scalarwaves(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scal
240 240
241 size_t wave_bytes = sizeof(stim::scalarwave<T>); 241 size_t wave_bytes = sizeof(stim::scalarwave<T>);
242 size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available 242 size_t shared_bytes = stim::sharedMemPerBlock(); //calculate the maximum amount of shared memory available
243 - size_t array_bytes = nW * wave_bytes; //calculate the maximum number of bytes required for the planewave array  
244 size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory 243 size_t max_batch = shared_bytes / wave_bytes; //calculate number of plane waves that will fit into shared memory
245 - size_t num_batches = nW / max_batch + 1; //calculate the number of batches required to process all plane waves  
246 size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required 244 size_t batch_bytes = min(nW, max_batch) * wave_bytes; //initialize the batch size (in bytes) to the maximum batch required
247 245
248 stim::scalarwave<T>* batch_W; 246 stim::scalarwave<T>* batch_W;
@@ -332,6 +330,12 @@ void cpu_scalarwave(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scala @@ -332,6 +330,12 @@ void cpu_scalarwave(stim::complex&lt;T&gt;* F, size_t N, T* x, T* y, T* z, stim::scala
332 cpu_scalarwaves(F, N, x, y, z, w_array); 330 cpu_scalarwaves(F, N, x, y, z, w_array);
333 } 331 }
334 332
  333 +template<typename T>
  334 +void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
  335 + std::vector< stim::scalarwave<T> > w_array(1, w);
  336 + cpu_scalarwaves(F, N, x, y, z, w_array);
  337 +}
  338 +
335 339
336 /// Sums a series of coherent plane waves at a specified point 340 /// Sums a series of coherent plane waves at a specified point
337 /// @param x is the x coordinate of the field point 341 /// @param x is the x coordinate of the field point