From 31262e831e976c77dcedd6370811cc9f59bc24fe Mon Sep 17 00:00:00 2001
From: David
Date: Tue, 14 Jun 2016 00:42:01 -0500
Subject: [PATCH] GPU implementation of Mie scattering (external field)

---
 stim/optics/mie.h        | 212 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------------------------------------------
 stim/optics/scalarbeam.h |   7 ++++++-
 stim/optics/scalarwave.h |  10 +++++++---
 3 files changed, 146 insertions(+), 83 deletions(-)

diff --git a/stim/optics/mie.h b/stim/optics/mie.h
index 59796d8..9955205 100644
--- a/stim/optics/mie.h
+++ b/stim/optics/mie.h
@@ -3,6 +3,7 @@
 
 #include "scalarwave.h"
 #include "../math/bessel.h"
+#include "../cuda/cudatools/devices.h"
 #include 
 
 namespace stim{
@@ -33,7 +34,7 @@ void B_coefficients(stim::complex<T>* B, T a, T k, stim::complex<T> n, int Nl){
 	stim::complex<T> h_ka, dh_ka;
 	stim::complex<T> numerator, denominator;
 	stim::complex<T> i(0, 1);
-	for(size_t l = 0; l <= Nl; l++){
+	for(int l = 0; l <= Nl; l++){
 		h_ka.r = j_ka[l];
 		h_ka.i = y_ka[l];
 		dh_ka.r = dj_ka[l];
@@ -81,84 +82,102 @@ void A_coefficients(stim::complex<T>* A, T a, T k, stim::complex<T> n, int Nl){
 	}
 }
 
+#define LOCAL_NL	16
 template<typename T>
-__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, int Nl){
-	extern __shared__ stim::scalarwave<T> shared_W[];		//declare the list of waves in shared memory
-
-	stim::cuda::sharedMemcpy(shared_W, W, nW, threadIdx.x, blockDim.x);	//copy the plane waves into shared memory for faster access
-	__syncthreads();										//synchronize threads to insure all data is copied
+__global__ void cuda_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, int Nl){
+	extern __shared__ stim::complex<T> shared_hB[];			//declare shared memory for this thread block's portion of the hl * Bl look-up values
 
 	size_t i = blockIdx.x * blockDim.x + threadIdx.x;		//get the index into the array
-	if(i >= N) return;										//exit if this thread is outside the array
+	if(i >= N) return;										//exit if this thread is outside the array
 	stim::vec3<T> p;
 	(x == NULL) ? p[0] = 0 : p[0] = x[i];					// test for NULL values and set positions
 	(y == NULL) ? p[1] = 0 : p[1] = y[i];
 	(z == NULL) ? p[2] = 0 : p[2] = z[i];
-	T r = p.len();											//calculate the distance from the sphere
+	T r = p.len();											//calculate the distance from the sphere
+	if(r < a) return;										//exit if the point is inside the sphere (this kernel only calculates the external field)
 	T k = W[0].kmag();
-	if(r < a) return;										//exit if the point is inside the sphere (we only calculate the internal field)
-
-	size_t NC = Nl + 1;										//calculate the number of coefficients to be used
-	T kr = r * k;											//calculate the thread value for k*r
-	T fij = (kr - kr_min)/dkr;								//FP index into the spherical bessel LUT
-	size_t ij = (size_t) fij;								//convert to an integral index
-	T alpha = fij - ij;										//calculate the fractional portion of the index
-	size_t n0j = ij * (NC);									//start of the first entry in the LUT
-	size_t n1j = (ij+1) * (NC);								//start of the second entry in the LUT
+	size_t NC = Nl + 1;										//calculate the number of coefficients to be used
+	T kr = r * k;											//calculate the thread value for k*r
+	T fij = (kr - kr_min)/dkr;								//FP index into the spherical bessel LUT
+	size_t ij = (size_t) fij;								//convert to an integral index
+	T alpha = fij - ij;										//calculate the fractional portion of the index
+	size_t n0j = ij * (NC);									//start of the first entry in the LUT
+	size_t n1j = (ij+1) * (NC);								//start of the second entry in the LUT
 
 	T cos_phi;
-	T Pl_2, Pl_1;											//declare registers to store the previous two Legendre polynomials
-	T Pl = 1;												//initialize the current value for the Legendre polynomial
-	T jl;
+	T Pl_2, Pl_1, Pl;										//declare registers to store the current and previous two Legendre polynomials
+
+	stim::complex<T> hBl;
 	stim::complex<T> Ei = 0;								//create a register to store the result
 	int l;
+
+	stim::complex<T> hlBl[LOCAL_NL+1];
+	int shared_start = threadIdx.x * (Nl - LOCAL_NL);
+
+	#pragma unroll LOCAL_NL+1
+	for(l = 0; l <= LOCAL_NL; l++)
+		hlBl[l] = clerp( hB[n0j + l], hB[n1j + l], alpha );
+
+	for(l = LOCAL_NL+1; l <= Nl; l++)
+		shared_hB[shared_start + (l - (LOCAL_NL+1))] = clerp( hB[n0j + l], hB[n1j + l], alpha );
+
 	for(size_t w = 0; w < nW; w++){
 		cos_phi = p.norm().dot(W[w].kvec().norm());			//calculate the cosine of the angle between the k vector and the direction from the sphere
-		for(l = 0; l <= Nl; l++){
-			Pl_2 = Pl_1;									//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+		Pl_2 = 1;
+		Pl_1 = cos_phi;
+		Ei += W[w].E() * hlBl[0] * Pl_2;
+		Ei += W[w].E() * hlBl[1] * Pl_1;
+
+		#pragma unroll LOCAL_NL-1
+		for(l = 2; l <= LOCAL_NL; l++){
+			Pl = ( (2 * l - 1) * cos_phi * Pl_1 - (l - 1) * Pl_2 ) / l;
+			Ei += W[w].E() * hlBl[l] * Pl;
+			Pl_2 = Pl_1;									//shift Pl_1 -> Pl_2 and Pl -> Pl_1
 			Pl_1 = Pl;
-			if(l == 0){										//computing Pl is done recursively, where the recursive relation
-				Pl = cos_phi;								//	requires the first two orders. This defines the second.
-			}
-			else{											//if this is not the first iteration, use the recursive relation to calculate Pl
-				Pl = ( (2 * (l+1) - 1) * cos_phi * Pl_1 - (l) * Pl_2 ) / (l+1);
-			}
+		}
 
-			jl = lerp( j[n0j + l], j[n1j + l], alpha );		//read jl from the LUT and interpolate the result
-			Ei += W[w].E() * B[l] * jl * Pl;
+		for(l = LOCAL_NL+1; l <= Nl; l++){
+			Pl = ( (2 * l - 1) * cos_phi * Pl_1 - (l - 1) * Pl_2 ) / l;
+			Ei += W[w].E() * shared_hB[shared_start + (l - (LOCAL_NL+1))] * Pl;
+			Pl_2 = Pl_1;									//shift Pl_1 -> Pl_2 and Pl -> Pl_1
+			Pl_1 = Pl;
+		}
 
-			//Ei += shared_W[w].pos(p);						//evaluate the plane wave
-		}
 	}
-	E[i] += Ei;												//copy the result to device memory
+	E[i] += Ei;												//copy the result to device memory
 }
 
 template<typename T>
-void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* B, T* j, T kr_min, T dkr, size_t Nl){
-
-	size_t wave_bytes = sizeof(stim::scalarwave<T>);
-	size_t shared_bytes = stim::sharedMemPerBlock();		//calculate the maximum amount of shared memory available
-	size_t array_bytes = nW * wave_bytes;					//calculate the maximum number of bytes required for the planewave array
-	size_t max_batch = shared_bytes / wave_bytes;			//calculate number of plane waves that will fit into shared memory
-	size_t num_batches = nW / max_batch + 1;				//calculate the number of batches required to process all plane waves
-	size_t batch_bytes = min(nW, max_batch) * wave_bytes;	//initialize the batch size (in bytes) to the maximum batch required
-
-	stim::scalarwave<T>* batch_W;
-	HANDLE_ERROR(cudaMalloc(&batch_W, batch_bytes));		//allocate memory for a single batch of plane waves
-
-	int threads = stim::maxThreadsPerBlock();				//get the maximum number of threads per block for the CUDA device
-	dim3 blocks((unsigned)(N / threads + 1));				//calculate the optimal number of blocks
-
-	size_t batch_size;										//declare a variable to store the size of the current batch
-	size_t waves_processed = 0;								//initialize the number of waves processed to zero
-	while(waves_processed < nW){							//while there are still waves to be processed
-		batch_size = min(max_batch, nW - waves_processed);	//process either a whole batch, or whatever is left
-		batch_bytes = batch_size * sizeof(stim::scalarwave<T>);
-		HANDLE_ERROR(cudaMemcpy(batch_W, W + waves_processed, batch_bytes, cudaMemcpyDeviceToDevice));	//copy the plane waves into global memory
-		cuda_scalar_mie_scatter<<< blocks, threads, batch_bytes >>>(E, N, x, y, z, batch_W, batch_size, a, n, B, j, kr_min, dkr, (int)Nl);	//call the kernel
-		waves_processed += batch_size;						//increment the counter indicating how many waves have been processed
-	}
-	cudaFree(batch_W);
+void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, stim::scalarwave<T>* W, size_t nW, T a, stim::complex<T> n, stim::complex<T>* hB, T kr_min, T dkr, size_t N_hB, size_t Nl){
+
+	size_t max_shared_mem = stim::sharedMemPerBlock();
+	int hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
+	std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
+	int threads = (int)(max_shared_mem / hBl_array);		//number of threads per block given the available shared memory
+	dim3 blocks((unsigned)(N / threads + 1));				//calculate the optimal number of blocks
+	size_t shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL);		//amount of shared memory to allocate
+	std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
+	cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl);	//call the kernel
+
+}
+
+template<typename T>
+__global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){
+	size_t i = blockIdx.x * blockDim.x + threadIdx.x;		//get the index into the array
+	if(i >= N) return;										//exit if this thread is outside the array
+
+	stim::vec3<T> p;
+	(x == NULL) ? p[0] = 0 : p[0] = x[i];					// test for NULL values and set positions
+	(y == NULL) ? p[1] = 0 : p[1] = y[i];
+	(z == NULL) ? p[2] = 0 : p[2] = z[i];
+
+	r[i] = p.len();
 }
 
 /// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
@@ -171,17 +190,19 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti
 /// @param a is the radius of the sphere
 /// @param n is the complex refractive index of the sphere
 template<typename T>
-void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n){
+void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector< stim::scalarwave<T> > W, T a, stim::complex<T> n, T r_spacing = 0.1){
 
 	//calculate the necessary number of orders required to represent the scattered field
 	T k = W[0].kmag();
-	size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
+	int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
+	if(Nl < LOCAL_NL) Nl = LOCAL_NL;						//always do at least the minimum number of local operations (kernel optimization)
+	std::cout<<"Nl: "<<Nl<<std::endl;
 
 	//calculate the scattering coefficients for the sphere
 	stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
 	B_coefficients(B, a, k, n, Nl);
 
-#ifdef __CUDACC__
+#ifdef CUDA_FOUND
 	stim::complex<T>* dev_E;								//allocate space for the field
 	cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
 	cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
@@ -209,16 +230,44 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std
 	HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
 	HANDLE_ERROR( cudaMemcpy(dev_W, &W[0], sizeof(stim::scalarwave<T>) * W.size(), cudaMemcpyHostToDevice) );
 
-	// SCATTERING COEFFICIENTS
-	stim::complex<T>* dev_B;
-	HANDLE_ERROR( cudaMalloc(&dev_B, sizeof(stim::complex<T>) * (Nl+1)) );
-	HANDLE_ERROR( cudaMemcpy(dev_B, B, sizeof(stim::complex<T>) * (Nl+1), cudaMemcpyHostToDevice) );
-
-	// BESSEL FUNCTION LOOK-UP TABLE
+	//calculate the distance from the sphere center
+	T* dev_r;
+	HANDLE_ERROR( cudaMalloc(&dev_r, sizeof(T) * N) );
+
+	int threads = stim::maxThreadsPerBlock();
+	dim3 blocks((unsigned)(N / threads + 1));
+	cuda_dist <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
+
+	//Find the minimum and maximum values of r
+	cublasStatus_t stat;
+	cublasHandle_t handle;
+
+	stat = cublasCreate(&handle);							//create a cuBLAS handle
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+		printf ("CUBLAS initialization failed\n");
+		exit(1);
+	}
+
+	int i_min, i_max;
+	stat = cublasIsamin(handle, (int)N, dev_r, 1, &i_min);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+		printf ("CUBLAS Error: failed to calculate minimum r value.\n");
+		exit(1);
+	}
+	stat = cublasIsamax(handle, (int)N, dev_r, 1, &i_max);
+	if (stat != CUBLAS_STATUS_SUCCESS){						//test for failure
+		printf ("CUBLAS Error: failed to calculate maximum r value.\n");
+		exit(1);
+	}
+
+	T r_min, r_max;											//allocate space to store the minimum and maximum values
+	HANDLE_ERROR( cudaMemcpy(&r_min, dev_r + i_min - 1, sizeof(T), cudaMemcpyDeviceToHost) );	//copy the min and max values from the device to the CPU (cuBLAS amin/amax indices are 1-based)
+	HANDLE_ERROR( cudaMemcpy(&r_max, dev_r + i_max - 1, sizeof(T), cudaMemcpyDeviceToHost) );
+
+	//size_t Nlut_j = (size_t)((r_max - r_min) / r_spacing + 1);	//number of values in the look-up table based on the user-specified spacing along r
 
-	size_t Nlut_j = 1024;
-	T r_min = 0;
-	T r_max = 10;
+	size_t N_hB_lut = (size_t)((r_max - r_min) / r_spacing + 1);
 
 	T kr_min = k * r_min;
 	T kr_max = k * r_max;
@@ -230,25 +279,29 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std
 	double* djv= (double*) malloc( (Nl + 1) * sizeof(double) );
 	double* dyv= (double*) malloc( (Nl + 1) * sizeof(double) );
 
-	size_t lutj_bytes = sizeof(T) * (Nl+1) * Nlut_j;
-	T* bessel_lut = (T*) malloc(lutj_bytes);				//pointer to the look-up table
-	T dkr = (kr_max - kr_min) / (Nlut_j-1);					//distance between values in the LUT
-	std::cout<<"LUT jl bytes: "<<lutj_bytes<<std::endl;
-	for(size_t kri = 0; kri < Nlut_j; kri++){				//for each value in the LUT
+	size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;
+	stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes);	//pointer to the look-up table
+	T dkr = (kr_max - kr_min) / (N_hB_lut-1);				//distance between values in the LUT
+	std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl;
+	stim::complex<T> hl;
+	for(size_t kri = 0; kri < N_hB_lut; kri++){				//for each value in the LUT
 		stim::bessjyv_sph(Nl, kr_min + kri * dkr, vm, jv, yv, djv, dyv);	//compute the list of spherical bessel functions from [0 Nl]
 		for(size_t l = 0; l <= Nl; l++){					//for each order
-			bessel_lut[kri * (Nl + 1) + l] = (T)jv[l];		//store the bessel function result
+			hl.r = (T)jv[l];
+			hl.i = (T)yv[l];
+
+			hB_lut[kri * (Nl + 1) + l] = hl * B[l];			//store the product of the spherical Hankel function and the scattering coefficient
 		}
 	}
 
-	stim::cpu2image(bessel_lut, "lut.bmp", Nl+1, Nlut_j, stim::cmBrewer);
+	//stim::cpu2image(hankel_lut, "hankel.bmp", Nl+1, Nlut_j, stim::cmBrewer);
 
 	//Allocate device memory and copy everything to the GPU
-	T* dev_j_lut;
-	HANDLE_ERROR( cudaMalloc(&dev_j_lut, lutj_bytes) );
-	HANDLE_ERROR( cudaMemcpy(dev_j_lut, bessel_lut, lutj_bytes, cudaMemcpyHostToDevice) );
+	stim::complex<T>* dev_hB_lut;
+	HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
+	HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
 
-	gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_B, dev_j_lut, kr_min, dkr, Nl);
+	gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, kr_min, dkr, N_hB_lut, Nl);
 
 	cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost);		//copy the field from device memory
 
@@ -318,6 +371,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st
 
 	T k = W[0].kmag();
 	size_t Nl = ceil(k*a + 4 * cbrt( k * a ) + 2);
+	std::cout<<"Nl: "<<Nl<<std::endl;
 
 	//calculate the scattering coefficients for the sphere
 	stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) );	//allocate space for the scattering coefficients
diff --git a/stim/optics/scalarbeam.h b/stim/optics/scalarbeam.h
index e4dcd06..93881eb 100644
--- a/stim/optics/scalarbeam.h
+++ b/stim/optics/scalarbeam.h
@@ -110,7 +110,7 @@ public:
 		std::vector< scalarwave<T> > samples(N);			//create a vector of plane waves
 		T kmag = (T)stim::TAU / lambda;						//calculate the wavenumber
 		stim::complex<T> apw;								//allocate space for the amplitude at the focal point
-		T a = stim::TAU * (1 - cos(asin(NA[0]))) / (double)N;
+		T a = (T)(stim::TAU * (1 - cos(asin(NA[0]))) / (double)N);
 		stim::vec3<T> kpw;									//declare the new k-vector based on the focused plane wave direction
 		for(size_t i=0; i<N; i++){
+template<typename T>
+CUDA_CALLABLE stim::complex<T> clerp(stim::complex<T> v0, stim::complex<T> v1, T t) {
+	return stim::complex<T>( fma(t, v1.r, fma(-t, v0.r, v0.r)), fma(t, v1.i, fma(-t, v0.i, v0.i)) );
+}
+
 template<typename T> CUDA_CALLABLE T lerp(T v0, T v1, T t) {
 	return fma(t, v1, fma(-t, v0, v0));
 }
diff --git a/stim/optics/scalarwave.h b/stim/optics/scalarwave.h
index 4e63bed..ac620ab 100644
--- a/stim/optics/scalarwave.h
+++ b/stim/optics/scalarwave.h
@@ -23,7 +23,7 @@ namespace stim{
 template<typename T>
 class scalarwave{
 
-protected:
+public:
 
 	stim::vec3<T> k;			//k-vector, pointed in propagation direction with magnitude |k| = tau / lambda = 2pi / lambda
 	stim::complex<T> E0;		//amplitude
@@ -240,9 +240,7 @@ void gpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scal
 
 	size_t wave_bytes = sizeof(stim::scalarwave<T>);
 	size_t shared_bytes = stim::sharedMemPerBlock();		//calculate the maximum amount of shared memory available
-	size_t array_bytes = nW * wave_bytes;					//calculate the maximum number of bytes required for the planewave array
 	size_t max_batch = shared_bytes / wave_bytes;			//calculate number of plane waves that will fit into shared memory
-	size_t num_batches = nW / max_batch + 1;				//calculate the number of batches required to process all plane waves
 	size_t batch_bytes = min(nW, max_batch) * wave_bytes;	//initialize the batch size (in bytes) to the maximum batch required
 
 	stim::scalarwave<T>* batch_W;
@@ -332,6 +330,12 @@ void cpu_scalarwave(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scala
 	cpu_scalarwaves(F, N, x, y, z, w_array);
 }
 
+template<typename T>
+void cpu_scalarwaves(stim::complex<T>* F, size_t N, T* x, T* y, T* z, stim::scalarwave<T> w){
+	std::vector< stim::scalarwave<T> > w_array(1, w);
+	cpu_scalarwaves(F, N, x, y, z, w_array);
+}
+
 /// Sums a series of coherent plane waves at a specified point
 /// @param x is the x coordinate of the field point
-- 
libgit2 0.21.4
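For reference, the series the kernel accumulates is the scalar Mie scattered field E_s = sum over l of B_l * h_l(kr) * P_l(cos phi), where h_l is the spherical Hankel function stored (pre-multiplied by B_l) in the hB look-up table and P_l is a Legendre polynomial generated by the standard three-term recurrence l*P_l = (2l - 1)*x*P_{l-1} - (l - 1)*P_{l-2}. The sketch below is not part of the patch: it is a minimal CPU reference using std::complex, with a hypothetical name (scattered_field_sum), that can be used to sanity-check the kernel's register/shared-memory loops against a straightforward evaluation.

```cpp
// CPU reference sketch (illustrative only, not part of the patch).
// hlBl[l] is assumed to already hold h_l(kr) * B_l, mirroring the hB look-up table.
#include <complex>
#include <vector>

template<typename T>
std::complex<T> scattered_field_sum(const std::vector< std::complex<T> >& hlBl,	// h_l(kr) * B_l for l = 0..Nl
                                    T cos_phi)										// cosine of the angle between k and the field point
{
	std::complex<T> Ei(0, 0);
	T Pl_2 = 1;			// P_0(x) = 1
	T Pl_1 = cos_phi;	// P_1(x) = x
	if(hlBl.size() > 0) Ei += hlBl[0] * Pl_2;
	if(hlBl.size() > 1) Ei += hlBl[1] * Pl_1;
	for(size_t l = 2; l < hlBl.size(); l++){
		// three-term Legendre recurrence: l * P_l = (2l - 1) * x * P_{l-1} - (l - 1) * P_{l-2}
		T Pl = ( (2 * (T)l - 1) * cos_phi * Pl_1 - ((T)l - 1) * Pl_2 ) / (T)l;
		Ei += hlBl[l] * Pl;
		Pl_2 = Pl_1;	// shift the previous two orders
		Pl_1 = Pl;
	}
	return Ei;
}
```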
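The clerp/lerp helpers added to scalarbeam.h blend two adjacent rows of the hl*Bl look-up table: fma(t, v1, fma(-t, v0, v0)) is algebraically v0 + t*(v1 - v0), evaluated with fused multiply-adds. The standalone snippet below (illustrative only; the names lerp_fma and the sample values are assumptions, not part of the patch) shows that identity together with the index mapping the kernel uses to locate the two rows: fij = (kr - kr_min)/dkr, ij = floor(fij), alpha = fij - ij.

```cpp
// Sketch of the LUT interpolation scheme assumed by the kernel (illustrative only).
#include <cmath>
#include <cstdio>

static float lerp_fma(float v0, float v1, float t){
	return std::fmaf(t, v1, std::fmaf(-t, v0, v0));		// == v0 + t * (v1 - v0)
}

int main(){
	float kr_min = 2.0f, dkr = 0.05f;		// example LUT origin and spacing (assumed values)
	float kr = 3.141f;						// example query point
	float fij = (kr - kr_min) / dkr;		// floating-point index into the LUT
	size_t ij = (size_t)fij;				// integer row index
	float alpha = fij - (float)ij;			// fractional offset between rows ij and ij+1
	printf("row %zu, alpha %f, lerp(1,2) = %f\n", ij, alpha, lerp_fma(1.0f, 2.0f, alpha));
	return 0;
}
```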