From 31262e831e976c77dcedd6370811cc9f59bc24fe Mon Sep 17 00:00:00 2001
From: David
Date: Tue, 14 Jun 2016 00:42:01 -0500
Subject: [PATCH] GPU implementation of Mie scattering (external field)

---
 stim/optics/mie.h        | 212 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------------------------------------------
 stim/optics/scalarbeam.h |   7 ++++++-
 stim/optics/scalarwave.h |  10 +++++++---
 3 files changed, 146 insertions(+), 83 deletions(-)

diff --git a/stim/optics/mie.h b/stim/optics/mie.h
index 59796d8..9955205 100644
--- a/stim/optics/mie.h
+++ b/stim/optics/mie.h
@@ -3,6 +3,7 @@
 
 #include "scalarwave.h"
 #include "../math/bessel.h"
+#include "../cuda/cudatools/devices.h"
 #include 
 
 namespace stim{
@@ -33,7 +34,7 @@ void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){
 	stim::complex<T> h_ka, dh_ka;
 	stim::complex<T> numerator, denominator;
 	stim::complex<T> i(0, 1);
-	for(size_t l = 0; l <= Nl; l++){
+	for(int l = 0; l <= Nl; l++){
 		h_ka.r = j_ka[l];
 		h_ka.i = y_ka[l];
 		dh_ka.r = dj_ka[l];
@@ -81,84 +82,102 @@ void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){
 	}
 }
 
+#define LOCAL_NL	16
 template<typename T>
-__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, int Nl){
-	extern __shared__ stim::scalarwave<T> shared_W[];		//declare the list of waves in shared memory
-
-	stim::cuda::sharedMemcpy(shared_W, W, nW, threadIdx.x, blockDim.x);	//copy the plane waves into shared memory for faster access
-	__syncthreads();										//synchronize threads to insure all data is copied
+__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, int Nl){
+	extern __shared__ stim::complex<T> shared_hB[];			//declare shared memory for this thread block's portion of the hl * Bl look-up values
 
 	size_t i = blockIdx.x * blockDim.x + threadIdx.x;		//get the index into the array
-	if(i >= N) return;										//exit if this thread is outside the array
+	if(i >= N) return;										//exit if this thread is outside the array
 	stim::vec3<T> p;
 	(x == NULL) ? p[0] = 0 : p[0] = x[i];					// test for NULL values and set positions
 	(y == NULL) ? p[1] = 0 : p[1] = y[i];
 	(z == NULL) ? p[2] = 0 : p[2] = z[i];
-	T r = p.len();											//calculate the distance from the sphere
+	T r = p.len();											//calculate the distance from the sphere
+	if(r < a) return;										//exit if the point is inside the sphere (this kernel only calculates the external field)
 	T k = W[0].kmag();
-	if(r < a) return;										//exit if the point is inside the sphere (we only calculate the internal field)
-
-	size_t NC = Nl + 1;										//calculate the number of coefficients to be used
-	T kr = r * k;											//calculate the thread value for k*r
-	T fij = (kr - kr_min)/dkr;								//FP index into the spherical bessel LUT
-	size_t ij = (size_t) fij;								//convert to an integral index
-	T alpha = fij - ij;										//calculate the fractional portion of the index
-	size_t n0j = ij * (NC);									//start of the first entry in the LUT
-	size_t n1j = (ij+1) * (NC);								//start of the second entry in the LUT
+	size_t NC = Nl + 1;										//calculate the number of coefficients to be used
+	T kr = r * k;											//calculate the thread value for k*r
+	T fij = (kr - kr_min)/dkr;								//FP index into the spherical bessel LUT
+	size_t ij = (size_t) fij;								//convert to an integral index
+	T alpha = fij - ij;										//calculate the fractional portion of the index
+	size_t n0j = ij * (NC);									//start of the first entry in the LUT
+	size_t n1j = (ij+1) * (NC);								//start of the second entry in the LUT
 
 	T cos_phi;
-	T Pl_2, Pl_1;											//declare registers to store the previous two Legendre polynomials
-	T Pl = 1;												//initialize the current value for the Legendre polynomial
-	T jl;
+	T Pl_2, Pl_1, Pl;										//declare registers to store the current and previous two Legendre polynomials
+
+	stim::complex<T> hBl;
 	stim::complex<T> Ei = 0;								//create a register to store the result
 	int l;
+
+	stim::complex<T> hlBl[LOCAL_NL+1];
+	int shared_start = threadIdx.x * (Nl - LOCAL_NL);
+
+	#pragma unroll LOCAL_NL+1
+	for(l = 0; l <= LOCAL_NL; l++)
+		hlBl[l] = clerp( hB[n0j + l], hB[n1j + l], alpha );
+
+	for(l = LOCAL_NL+1; l <= Nl; l++)
+		shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp( hB[n0j + l], hB[n1j + l], alpha );
+
 	for(size_t w = 0; w < nW; w++){
 		cos_phi = p.norm().dot(W[w].kvec().norm());			//calculate the cosine of the angle between the k vector and the direction from the sphere
-		for(l = 0; l <= Nl; l++){
-			Pl_2 = Pl_1;									//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+		Pl_2 = 1;
+		Pl_1 = cos_phi;
+		Ei += W[w].E() * hlBl[0] * Pl_2;
+		Ei += W[w].E() * hlBl[1] * Pl_1;
+
+		#pragma unroll LOCAL_NL-1
+		for(l = 2; l <= LOCAL_NL; l++){
+			Pl = ( (2 * l - 1) * cos_phi * Pl_1 - (l - 1) * Pl_2 ) / l;
+			Ei += W[w].E() * hlBl[l] * Pl;
+			Pl_2 = Pl_1;									//shift Pl_1 -> Pl_2 and Pl -> Pl_1
 			Pl_1 = Pl;
-			if(l == 0){										//computing Pl is done recursively, where the recursive relation
-				Pl = cos_phi;								//	requires the first two orders. This defines the second.
-			}
-			else{											//if this is not the first iteration, use the recursive relation to calculate Pl
-				Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
-			}
+		}
 
-			jl = lerp( j[n0j + l], j[n1j + l], alpha );		//read jl from the LUT and interpolate the result
-			Ei += W[w].E() * B[l] * jl * Pl;
+		for(l = LOCAL_NL+1; l <= Nl; l++){
+			Pl = ( (2 * l - 1) * cos_phi * Pl_1 - (l - 1) * Pl_2 ) / l;
+			Ei += W[w].E() * shared_hB[shared_start + (l - (LOCAL_NL+1))] * Pl;
+			Pl_2 = Pl_1;									//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+			Pl_1 = Pl;
+		}
 
-			//Ei += shared_W[w].pos(p);						//evaluate the plane wave
-		}
 	}
-	E[i] += Ei;												//copy the result to device memory
+	E[i] += Ei;												//copy the result to device memory
 }
 
 template<typename T>
-void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, size_t Nl){
-
-	size_t wave_bytes = sizeof(stim::scalarwave<T>);
-	size_t shared_bytes = stim::sharedMemPerBlock();		//calculate the maximum amount of shared memory available
-	size_t array_bytes = nW * wave_bytes;					//calculate the maximum number of bytes required for the planewave array
-	size_t max_batch = shared_bytes / wave_bytes;			//calculate number of plane waves that will fit into shared memory
-	size_t num_batches = nW / max_batch + 1;				//calculate the number of batches required to process all plane waves
-	size_t batch_bytes = min(nW, max_batch) * wave_bytes;	//initialize the batch size (in bytes) to the maximum batch required
-
-	stim::scalarwave<T>* batch_W;
-	HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes));		//allocate memory for a single batch of plane waves
-
-	int threads = stim::maxThreadsPerBlock();				//get the maximum number of threads per block for the CUDA device
-	dim3 blocks((unsigned)(N / threads + 1));				//calculate the optimal number of blocks
-
-	size_t batch_size;										//declare a variable to store the size of the current batch
-	size_t waves_processed = 0;								//initialize the number of waves processed to zero
-	while(waves_processed < nW){							//while there are still waves to be processed
-		batch_size = min(max_batch, nW - waves_processed);	//process either a whole batch, or whatever is left
-		batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
-		HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice));	//copy the plane waves into global memory
-		cuda_scalar_mie_scatter<<< blocks, threads, batch_bytes >>>(E, N, x, y, z, batch_W, batch_size, a, n, B, j, kr_min, dkr, (int)Nl);	//call the kernel
-		waves_processed += batch_size;						//increment the counter indicating how many waves have been processed
-	}
-	cudaFree(batch_W);
+void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
+
+	size_t max_shared_mem = stim::sharedMemPerBlock();
+	int hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
+	std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
+	int threads = (int)(max_shared_mem / hBl_array);		//number of threads per block given the available shared memory
+	dim3 blocks((unsigned)(N / threads + 1));				//calculate the optimal number of blocks
+	size_t shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL);		//amount of shared memory to allocate
+	std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
+	cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl);	//call the kernel
+
+}
+
+template<typename T>
+__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){
+	size_t i = blockIdx.x * blockDim.x + threadIdx.x;		//get the index into the array
+	if(i >= N) return;										//exit if this thread is outside the array
+
+	stim::vec3<T> p;
+	(x == NULL) ? p[0] = 0 : p[0] = x[i];					// test for NULL values and set positions
+	(y == NULL) ? p[1] = 0 : p[1] = y[i];
+	(z == NULL) ? p[2] = 0 : p[2] = z[i];
+
+	r[i] = p.len();
 }
 
 /// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
@@ -171,17 +190,19 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti
 /// @param a is the radius of the sphere
 /// @param n is the complex refractive index of the sphere
 template<typename T>
-void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n){
+void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, T r_spacing = 0.1){
 
 	//calculate the necessary number of orders required to represent the scattered field
 	T k = W[0].kmag();
-	size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
+	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
+	if(Nl < LOCAL_NL) Nl = LOCAL_NL;						//always do at least the minimum number of local operations (kernel optimization)
+	std::cout<<"Nl: "<<Nl<<std::endl;
 
 	//calculate the scattering coefficients for the sphere
 	stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
 	B_coefficients(B, a, k, n, Nl);
 
-#ifdef __CUDACC__
+#ifdef CUDA_FOUND
 	stim::complex<T>* dev_E;								//allocate space for the field
 	cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
 	cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
@@ -209,16 +230,44 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std
 	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
 	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
 
-	// SCATTERING COEFFICIENTS
-	stim::complex<T>* dev_B;
-	HANDLE_ERROR( cudaMalloc(&dev_B, sizeof(stim::complex<T>) * (Nl+1)) );
-	HANDLE_ERROR( cudaMemcpy(dev_B, B, sizeof(stim::complex<T>) * (Nl+1), cudaMemcpyHostToDevice) );
-
-	// BESSEL FUNCTION LOOK-UP TABLE
+	//calculate the distance from the sphere center
+	T* dev_r;
+	HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
+
+	int threads = stim::maxThreadsPerBlock();
+	dim3 blocks((unsigned)(N / threads + 1));
+	cuda_dist <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
+
+	//Find the minimum and maximum values of r
+	cublasStatus_t stat;
+	cublasHandle_t handle;
+
+	stat = cublasCreate(&handle);							//create a cuBLAS handle
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+		printf ("CUBLAS initialization failed\n");
+		exit(1);
+	}
+
+	int i_min, i_max;
+	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+		printf ("CUBLAS Error: failed to calculate minimum r value.\n");
+		exit(1);
+	}
+	stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+		printf ("CUBLAS Error: failed to calculate maximum r value.\n");
+		exit(1);
+	}
+
+	T r_min, r_max;											//allocate space to store the minimum and maximum values
+	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min - 1, sizeof(T), cudaMemcpyDeviceToHost) );	//copy the min and max values from the device to the CPU (cuBLAS amin/amax indices are 1-based)
+	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max - 1, sizeof(T), cudaMemcpyDeviceToHost) );
+
+	//size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1);	//number of values in the look-up table based on the user-specified spacing along r
 
-	size_t Nlut_j = 1024;
-	T r_min = 0;
-	T r_max = 10;
+	size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1);
 
 	T kr_min = k * r_min;
 	T kr_max = k * r_max;
@@ -230,25 +279,29 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std
 	double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
 	double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
 
-	size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
-	T* bessel_lut = (T*) malloc(lutj_bytes);				//pointer to the look-up table
-	T dkr = (kr_max - kr_min) / (Nlut_j-1);					//distance between values in the LUT
-	std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl;
-	for(size_t kri = 0; kri < Nlut_j; kri++){				//for each value in the LUT
+	size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;
+	stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes);	//pointer to the look-up table
+	T dkr = (kr_max - kr_min) / (N_hB_lut-1);				//distance between values in the LUT
+	std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl;
+	stim::complex<T> hl;
+	for(size_t kri = 0; kri < N_hB_lut; kri++){				//for each value in the LUT
 		stim::bessjyv_sph(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv);	//compute the list of spherical bessel functions from [0 Nl]
 		for(size_t l = 0; l <= Nl; l++){					//for each order
-			bessel_lut[kri * (Nl + 1) + l] = (T)jv[l];		//store the bessel function result
+			hl.r = (T)jv[l];
+			hl.i = (T)yv[l];
+
+			hB_lut[kri * (Nl + 1) + l] = hl * B[l];			//store the product of the spherical Hankel function and the scattering coefficient
 		}
 	}
 
-	stim::cpu2image(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
+	//stim::cpu2image(hankel_lut, "hankel.bmp", Nl+1, Nlut_j, stim::cmBrewer);
 
 	//Allocate device memory and copy everything to the GPU
-	T* dev_j_lut;
-	HANDLE_ERROR( cudaMalloc(&dev_j_lut, lutj_bytes) );
-	HANDLE_ERROR( cudaMemcpy(dev_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) );
+	stim::complex<T>* dev_hB_lut;
+	HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
+	HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
 
-	gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_B, dev_j_lut, kr_min, dkr, Nl);
+	gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, kr_min, dkr, N_hB_lut, Nl);
 
 	cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost);		//copy the field from device memory
 
@@ -318,6 +371,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st
 
 	T k = W[0].kmag();
 	size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
+	std::cout<<"Nl: "<<Nl<<std::endl;
 
 	//calculate the scattering coefficients for the sphere
 	stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
diff --git a/stim/optics/scalarbeam.h b/stim/optics/scalarbeam.h
index e4dcd06..93881eb 100644
--- a/stim/optics/scalarbeam.h
+++ b/stim/optics/scalarbeam.h
@@ -110,7 +110,7 @@ public:
 		std::vector< scalarwave<T> > samples(N);			//create a vector of plane waves
 		T kmag = (T)stim::TAU / lambda;						//calculate the wavenumber
 		stim::complex<T> apw;								//allocate space for the amplitude at the focal point
-		T a = stim::TAU * (1 - cos(asin(NA[0]))) / (double)N;
+		T a = (T)(stim::TAU * (1 - cos(asin(NA[0]))) / (double)N);
 		stim::vec3<T> kpw;									//declare the new k-vector based on the focused plane wave direction
 		for(size_t i=0; i<N; i++){
+template<typename T>
+CUDA_CALLABLE stim::complex<T> clerp(stim::complex<T> v0, stim::complex<T> v1, T t) {
+	return stim::complex<T>( fma(t, v1.r, fma(-t, v0.r, v0.r)), fma(t, v1.i, fma(-t, v0.i, v0.i)) );
+}
+
 template<typename T> CUDA_CALLABLE T lerp(T v0, T v1, T t) {
 	return fma(t, v1, fma(-t, v0, v0));
 }
diff --git a/stim/optics/scalarwave.h b/stim/optics/scalarwave.h
index 4e63bed..ac620ab 100644
--- a/stim/optics/scalarwave.h
+++ b/stim/optics/scalarwave.h
@@ -23,7 +23,7 @@ namespace stim{
 template<typename T>
 class scalarwave{
 
-protected:
+public:
 
 	stim::vec3<T> k;			//k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
 	stim::complex<T> E0;		//amplitude
@@ -240,9 +240,7 @@ void gpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scal
 
 	size_t wave_bytes = sizeof(stim::scalarwave<T>);
 	size_t shared_bytes = stim::sharedMemPerBlock();		//calculate the maximum amount of shared memory available
-	size_t array_bytes = nW * wave_bytes;					//calculate the maximum number of bytes required for the planewave array
 	size_t max_batch = shared_bytes / wave_bytes;			//calculate number of plane waves that will fit into shared memory
-	size_t num_batches = nW / max_batch + 1;				//calculate the number of batches required to process all plane waves
 	size_t batch_bytes = min(nW, max_batch) * wave_bytes;	//initialize the batch size (in bytes) to the maximum batch required
 
 	stim::scalarwave<T>* batch_W;
@@ -332,6 +330,12 @@ void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scala
 	cpu_scalarwaves(F, N, x, y, z, w_array);
 }
 
+template<typename T>
+void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
+	std::vector< stim::scalarwave<T> > w_array(1, w);
+	cpu_scalarwaves(F, N, x, y, z, w_array);
+}
+
 /// Sums a series of coherent plane waves at a specified point
 /// @param x is the x coordinate of the field point
-- 
libgit2 0.21.4
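For reference, the series the kernel accumulates is the scalar Mie scattered field E_s = sum over l of B_l * h_l(kr) * P_l(cos phi), where h_l is the spherical Hankel function stored (pre-multiplied by B_l) in the hB look-up table and P_l is a Legendre polynomial generated by the standard three-term recurrence l*P_l = (2l - 1)*x*P_{l-1} - (l - 1)*P_{l-2}. The sketch below is not part of the patch: it is a minimal CPU reference using std::complex, with a hypothetical name (scattered_field_sum), that can be used to sanity-check the kernel's register/shared-memory loops against a straightforward evaluation.

```cpp
// CPU reference sketch (illustrative only, not part of the patch).
// hlBl[l] is assumed to already hold h_l(kr) * B_l, mirroring the hB look-up table.
#include <complex>
#include <vector>

template<typename T>
std::complex<T> scattered_field_sum(const std::vector< std::complex<T> >& hlBl,	// h_l(kr) * B_l for l = 0..Nl
                                    T cos_phi)										// cosine of the angle between k and the field point
{
	std::complex<T> Ei(0, 0);
	T Pl_2 = 1;			// P_0(x) = 1
	T Pl_1 = cos_phi;	// P_1(x) = x
	if(hlBl.size() > 0) Ei += hlBl[0] * Pl_2;
	if(hlBl.size() > 1) Ei += hlBl[1] * Pl_1;
	for(size_t l = 2; l < hlBl.size(); l++){
		// three-term Legendre recurrence: l * P_l = (2l - 1) * x * P_{l-1} - (l - 1) * P_{l-2}
		T Pl = ( (2 * (T)l - 1) * cos_phi * Pl_1 - ((T)l - 1) * Pl_2 ) / (T)l;
		Ei += hlBl[l] * Pl;
		Pl_2 = Pl_1;	// shift the previous two orders
		Pl_1 = Pl;
	}
	return Ei;
}
```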
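The clerp/lerp helpers added to scalarbeam.h blend two adjacent rows of the hl*Bl look-up table: fma(t, v1, fma(-t, v0, v0)) is algebraically v0 + t*(v1 - v0), evaluated with fused multiply-adds. The standalone snippet below (illustrative only; the names lerp_fma and the sample values are assumptions, not part of the patch) shows that identity together with the index mapping the kernel uses to locate the two rows: fij = (kr - kr_min)/dkr, ij = floor(fij), alpha = fij - ij.

```cpp
// Sketch of the LUT interpolation scheme assumed by the kernel (illustrative only).
#include <cmath>
#include <cstdio>

static float lerp_fma(float v0, float v1, float t){
	return std::fmaf(t, v1, std::fmaf(-t, v0, v0));		// == v0 + t * (v1 - v0)
}

int main(){
	float kr_min = 2.0f, dkr = 0.05f;		// example LUT origin and spacing (assumed values)
	float kr = 3.141f;						// example query point
	float fij = (kr - kr_min) / dkr;		// floating-point index into the LUT
	size_t ij = (size_t)fij;				// integer row index
	float alpha = fij - (float)ij;			// fractional offset between rows ij and ij+1
	printf("row %zu, alpha %f, lerp(1,2) = %f\n", ij, alpha, lerp_fma(1.0f, 2.0f, alpha));
	return 0;
}
```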