Commit 4252d8276a60522356cbd52577b4fe5911f7f845
1 parent
b5dd3d3a
ivote3 fixes and new bimsim code
Showing 8 changed files with 419 additions and 119 deletions.
/// Kernel: copy one (dx x dy) sub-region of a (sx x sy) source image into dest.
/// Expected launch: a 2D grid of 2D blocks covering the destination image, one thread per destination pixel.
/// @param dest device pointer to the destination image (dx*dy elements)
/// @param src  device pointer to the source image (sx*sy elements)
/// @param sx   width of the source image
/// @param sy   height of the source image (unused here; kept for interface symmetry)
/// @param x    x-offset of the cropped region within the source image
/// @param y    y-offset of the cropped region within the source image
/// @param dx   width of the destination image
/// @param dy   height of the destination image
template<typename T>
__global__ void cuda_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){
	size_t ix = blockIdx.x * blockDim.x + threadIdx.x;		//destination-image coordinates handled by this thread
	size_t iy = blockIdx.y * blockDim.y + threadIdx.y;
	if(ix >= dx || iy >= dy) return;						//guard the grid tail: threads outside the destination do nothing

	size_t d_idx = iy * dx + ix;							//flat (row-major) index into the destination image
	size_t s_idx = (y + iy) * sx + (x + ix);				//flat index of the corresponding source pixel
	dest[d_idx] = src[s_idx];								//copy the pixel
}
14 | + | ||
/// Crops a 2D image composed of elements of type T (all pointers are device pointers)
/// @param dest is a device pointer to memory of size dx*dy that will store the cropped image
/// @param src is a device pointer to memory of size sx*sy that stores the original image
/// @param sx is the size of the source image along x
/// @param sy is the size of the source image along y
/// @param x is the x-coordinate of the start position of the cropped region within the source image
/// @param y is the y-coordinate of the start position of the cropped region within the source image
/// @param dx is the size of the destination image along x
/// @param dy is the size of the destination image along y
template<typename T>
void gpu_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){
	int max_threads = stim::maxThreadsPerBlock();			//maximum number of threads per block for this CUDA device
	int side = (int)sqrt((double)max_threads);				//use a square 2D block of side x side threads
	dim3 threads(side, side);
	//BUG FIX: the grid must be the ceiling of (image size / block size); the previous code divided
	//by sqrt(threads.x), where threads.x is already sqrt(max_threads), launching a vastly oversized grid
	dim3 blocks( (unsigned)((dx + threads.x - 1) / threads.x),
	             (unsigned)((dy + threads.y - 1) / threads.y) );
	cuda_crop2d<T> <<< blocks, threads >>>(dest, src, sx, sy, x, y, dx, dy);
}
0 | \ No newline at end of file | 31 | \ No newline at end of file |
stim/math/circle.h
@@ -64,7 +64,7 @@ public: | @@ -64,7 +64,7 @@ public: | ||
64 | init(); | 64 | init(); |
65 | } | 65 | } |
66 | 66 | ||
67 | - ///create a rectangle from a center point, normal, and size | 67 | + /*///create a rectangle from a center point, normal, and size |
68 | ///@param c: x,y,z location of the center. | 68 | ///@param c: x,y,z location of the center. |
69 | ///@param s: size of the rectangle. | 69 | ///@param s: size of the rectangle. |
70 | ///@param n: x,y,z direction of the normal. | 70 | ///@param n: x,y,z direction of the normal. |
@@ -76,6 +76,7 @@ public: | @@ -76,6 +76,7 @@ public: | ||
76 | rotate(n, U, Y); | 76 | rotate(n, U, Y); |
77 | scale(s); | 77 | scale(s); |
78 | } | 78 | } |
79 | + */ | ||
79 | 80 | ||
80 | ///create a rectangle from a center point, normal, and size | 81 | ///create a rectangle from a center point, normal, and size |
81 | ///@param c: x,y,z location of the center. | 82 | ///@param c: x,y,z location of the center. |
@@ -164,12 +165,12 @@ public: | @@ -164,12 +165,12 @@ public: | ||
164 | ///returns a vector with the points on the initialized circle. | 165 | ///returns a vector with the points on the initialized circle. |
165 | ///connecting the points results in a circle. | 166 | ///connecting the points results in a circle. |
166 | ///@param n: integer for the number of points representing the circle. | 167 | ///@param n: integer for the number of points representing the circle. |
167 | - stim::vec3<T> | 168 | + CUDA_CALLABLE stim::vec3<T> |
168 | p(T theta) | 169 | p(T theta) |
169 | { | 170 | { |
170 | T x,y; | 171 | T x,y; |
171 | - y = 0.5*cos(theta*stim::TAU/360.0)+0.5; | ||
172 | - x = 0.5*sin(theta*stim::TAU/360.0)+0.5; | 172 | + y = 0.5*cos(theta*STIM_TAU/360.0)+0.5; |
173 | + x = 0.5*sin(theta*STIM_TAU/360.0)+0.5; | ||
173 | return p(x,y); | 174 | return p(x,y); |
174 | } | 175 | } |
175 | 176 |
stim/math/constants.h
1 | #ifndef STIM_CONSTANTS_H | 1 | #ifndef STIM_CONSTANTS_H |
2 | #define STIM_CONSTANTS_H | 2 | #define STIM_CONSTANTS_H |
3 | 3 | ||
#define STIM_PI 3.1415926535897932384626433832795028841971693993751058209749445923078164062862
//BUG FIX: parenthesize so expressions like x / STIM_TAU expand correctly
//(the unparenthesized form expands x / STIM_TAU to x / 2 * STIM_PI, which multiplies by pi)
#define STIM_TAU (2.0 * STIM_PI)
6 | + | ||
4 | #include "stim/cuda/cudatools/callable.h" | 7 | #include "stim/cuda/cudatools/callable.h" |
5 | namespace stim{ | 8 | namespace stim{ |
6 | const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862; | 9 | const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862; |
stim/math/vec3.h
@@ -216,7 +216,7 @@ public: | @@ -216,7 +216,7 @@ public: | ||
216 | return result; | 216 | return result; |
217 | } | 217 | } |
218 | 218 | ||
219 | - | 219 | +#ifndef __NVCC__ |
220 | /// Outputs the vector as a string | 220 | /// Outputs the vector as a string |
221 | std::string str() const{ | 221 | std::string str() const{ |
222 | std::stringstream ss; | 222 | std::stringstream ss; |
@@ -234,6 +234,7 @@ public: | @@ -234,6 +234,7 @@ public: | ||
234 | 234 | ||
235 | return ss.str(); | 235 | return ss.str(); |
236 | } | 236 | } |
237 | +#endif | ||
237 | 238 | ||
238 | size_t size(){ return 3; } | 239 | size_t size(){ return 3; } |
239 | 240 |
stim/optics/scalarbeam.h
@@ -193,6 +193,7 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, | @@ -193,6 +193,7 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, | ||
193 | printf ("CUBLAS Error: failed to calculate maximum r value.\n"); | 193 | printf ("CUBLAS Error: failed to calculate maximum r value.\n"); |
194 | exit(1); | 194 | exit(1); |
195 | } | 195 | } |
196 | + cublasDestroy(handle); | ||
196 | 197 | ||
197 | i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility | 198 | i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility |
198 | i_max--; | 199 | i_max--; |
@@ -212,8 +213,9 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, | @@ -212,8 +213,9 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, | ||
212 | T* j_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table | 213 | T* j_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table |
213 | T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT | 214 | T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT |
214 | T jl; | 215 | T jl; |
216 | + unsigned l; | ||
215 | for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT | 217 | for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT |
216 | - for(unsigned l = 0; l <= (unsigned)Nl; l++){ //for each order | 218 | + for(l = 0; l <= (unsigned)Nl; l++){ //for each order |
217 | jl = boost::math::sph_bessel<T>(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function | 219 | jl = boost::math::sph_bessel<T>(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function |
218 | j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result | 220 | j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result |
219 | } | 221 | } |
@@ -323,6 +325,9 @@ void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lamb | @@ -323,6 +325,9 @@ void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lamb | ||
323 | 325 | ||
324 | gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing); | 326 | gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing); |
325 | 327 | ||
328 | + HANDLE_ERROR( cudaFree(gpu_r) ); | ||
329 | + HANDLE_ERROR( cudaFree(gpu_phi) ); | ||
330 | + | ||
326 | } | 331 | } |
327 | #endif | 332 | #endif |
328 | 333 | ||
@@ -449,17 +454,12 @@ public: | @@ -449,17 +454,12 @@ public: | ||
449 | 454 | ||
450 | /// Evaluate the beam to a scalar field using Debye focusing | 455 | /// Evaluate the beam to a scalar field using Debye focusing |
451 | void eval(stim::scalarfield<T>& E, int order = 500){ | 456 | void eval(stim::scalarfield<T>& E, int order = 500){ |
452 | - size_t array_size = E.grid_bytes(); | ||
453 | - T* X = (T*) malloc( array_size ); //allocate space for the coordinate meshes | ||
454 | - T* Y = (T*) malloc( array_size ); | ||
455 | - T* Z = (T*) malloc( array_size ); | ||
456 | - | ||
457 | - E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes | ||
458 | - eval(E, X, Y, Z, order); | ||
459 | - | ||
460 | - free(X); //free the coordinate meshes | ||
461 | - free(Y); | ||
462 | - free(Z); | 457 | + E.meshgrid(); //calculate a meshgrid if one isn't created |
458 | + if(E.gpu()) | ||
459 | + gpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); | ||
460 | + else | ||
461 | + cpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); | ||
462 | + //eval(E, E.x(), E.y(), E.z(), order); | ||
463 | } | 463 | } |
464 | 464 | ||
465 | /// Calculate the field at a given point | 465 | /// Calculate the field at a given point |
stim/optics/scalarfield.h
@@ -6,8 +6,52 @@ | @@ -6,8 +6,52 @@ | ||
6 | #include "../math/complex.h" | 6 | #include "../math/complex.h" |
7 | #include "../math/fft.h" | 7 | #include "../math/fft.h" |
8 | 8 | ||
9 | +#ifdef CUDA_FOUND | ||
10 | +#include "../cuda/crop.cuh" | ||
11 | +#endif | ||
12 | + | ||
9 | namespace stim{ | 13 | namespace stim{ |
10 | 14 | ||
15 | + template<typename T> | ||
16 | + __global__ void cuda_abs(T* img, stim::complex<T>* field, size_t N){ | ||
17 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | ||
18 | + if(i >= N) return; | ||
19 | + | ||
20 | + img[i] = field[i].abs(); | ||
21 | + } | ||
22 | + | ||
23 | + template<typename T> | ||
24 | + __global__ void cuda_real(T* img, stim::complex<T>* field, size_t N){ | ||
25 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | ||
26 | + if(i >= N) return; | ||
27 | + | ||
28 | + img[i] = field[i].real(); | ||
29 | + } | ||
30 | + | ||
31 | + template<typename T> | ||
32 | + __global__ void cuda_imag(T* img, stim::complex<T>* field, size_t N){ | ||
33 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | ||
34 | + if(i >= N) return; | ||
35 | + | ||
36 | + img[i] = field[i].imag(); | ||
37 | + } | ||
38 | + | ||
39 | + template<typename T> | ||
40 | + __global__ void cuda_intensity(T* img, stim::complex<T>* field, size_t N){ | ||
41 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | ||
42 | + if(i >= N) return; | ||
43 | + | ||
44 | + img[i] = pow(field[i].abs(), 2); | ||
45 | + } | ||
46 | + | ||
47 | + template<typename T> | ||
48 | + __global__ void cuda_sum_intensity(T* img, stim::complex<T>* field, size_t N){ | ||
49 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | ||
50 | + if(i >= N) return; | ||
51 | + | ||
52 | + img[i] += pow(field[i].abs(), 2); | ||
53 | + } | ||
54 | + | ||
11 | /// Perform a k-space transform of a scalar field (FFT). The given field has a width of x and the calculated momentum space has a | 55 | /// Perform a k-space transform of a scalar field (FFT). The given field has a width of x and the calculated momentum space has a |
12 | /// width of kx (in radians). | 56 | /// width of kx (in radians). |
13 | /// @param K is a pointer to the output array of all plane waves in the field | 57 | /// @param K is a pointer to the output array of all plane waves in the field |
@@ -44,12 +88,16 @@ namespace stim{ | @@ -44,12 +88,16 @@ namespace stim{ | ||
44 | std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<<std::endl; | 88 | std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<<std::endl; |
45 | exit(1); | 89 | exit(1); |
46 | } | 90 | } |
91 | + cufftDestroy(plan); | ||
47 | 92 | ||
48 | stim::complex<T>* fft = (stim::complex<T>*) malloc(sizeof(stim::complex<T>) * nx * ny); | 93 | stim::complex<T>* fft = (stim::complex<T>*) malloc(sizeof(stim::complex<T>) * nx * ny); |
49 | HANDLE_ERROR( cudaMemcpy(fft, dev_FFT, sizeof(stim::complex<T>) * nx * ny, cudaMemcpyDeviceToHost) ); | 94 | HANDLE_ERROR( cudaMemcpy(fft, dev_FFT, sizeof(stim::complex<T>) * nx * ny, cudaMemcpyDeviceToHost) ); |
50 | 95 | ||
51 | stim::cpu_fftshift(K, fft, nx, ny); | 96 | stim::cpu_fftshift(K, fft, nx, ny); |
52 | - //memcpy(K, fft, sizeof(stim::complex<T>) * nx * ny); | 97 | + |
98 | + HANDLE_ERROR( cudaFree(dev_FFT) ); //free GPU memory | ||
99 | + HANDLE_ERROR( cudaFree(dev_E) ); | ||
100 | + free(fft); //free CPU memory | ||
53 | } | 101 | } |
54 | 102 | ||
55 | template<typename T> | 103 | template<typename T> |
@@ -82,12 +130,18 @@ namespace stim{ | @@ -82,12 +130,18 @@ namespace stim{ | ||
82 | std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<<std::endl; | 130 | std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<<std::endl; |
83 | exit(1); | 131 | exit(1); |
84 | } | 132 | } |
133 | + cufftDestroy(plan); | ||
85 | 134 | ||
86 | HANDLE_ERROR( cudaMemcpy(E, dev_E, sizeof(stim::complex<T>) * nx * ny, cudaMemcpyDeviceToHost) ); | 135 | HANDLE_ERROR( cudaMemcpy(E, dev_E, sizeof(stim::complex<T>) * nx * ny, cudaMemcpyDeviceToHost) ); |
87 | 136 | ||
137 | + HANDLE_ERROR( cudaFree(dev_FFT) ); //free GPU memory | ||
138 | + HANDLE_ERROR( cudaFree(dev_E) ); | ||
139 | + free(fft); //free CPU memory | ||
88 | 140 | ||
89 | } | 141 | } |
90 | 142 | ||
143 | + | ||
144 | + | ||
91 | /// Propagate a field slice along its orthogonal direction by a given distance z | 145 | /// Propagate a field slice along its orthogonal direction by a given distance z |
92 | /// @param Enew is the resulting propogated field | 146 | /// @param Enew is the resulting propogated field |
93 | /// @param E is the field to be propogated | 147 | /// @param E is the field to be propogated |
@@ -105,9 +159,9 @@ namespace stim{ | @@ -105,9 +159,9 @@ namespace stim{ | ||
105 | T Kx, Ky; //width and height in k space | 159 | T Kx, Ky; //width and height in k space |
106 | cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); | 160 | cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); |
107 | 161 | ||
108 | - T* mag = (T*) malloc( sizeof(T) * nx * ny ); | ||
109 | - stim::abs(mag, K, nx * ny); | ||
110 | - stim::cpu2image<float>(mag, "kspace_pre_shift.bmp", nx, ny, stim::cmBrewer); | 162 | + //T* mag = (T*) malloc( sizeof(T) * nx * ny ); |
163 | + //stim::abs(mag, K, nx * ny); | ||
164 | + //stim::cpu2image<float>(mag, "kspace_pre_shift.bmp", nx, ny, stim::cmBrewer); | ||
111 | 165 | ||
112 | size_t kxi, kyi; | 166 | size_t kxi, kyi; |
113 | size_t i; | 167 | size_t i; |
@@ -143,10 +197,27 @@ namespace stim{ | @@ -143,10 +197,27 @@ namespace stim{ | ||
143 | } | 197 | } |
144 | } | 198 | } |
145 | 199 | ||
146 | - stim::abs(mag, K, nx * ny); | ||
147 | - stim::cpu2image<float>(mag, "kspace_post_shift.bmp", nx, ny, stim::cmBrewer); | 200 | + //stim::abs(mag, K, nx * ny); |
201 | + //stim::cpu2image<float>(mag, "kspace_post_shift.bmp", nx, ny, stim::cmBrewer); | ||
148 | 202 | ||
149 | cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); | 203 | cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); |
204 | + free(K); | ||
205 | + } | ||
206 | + | ||
207 | + template<typename T> | ||
208 | + void gpu_scalar_propagate(stim::complex<T>* Enew, stim::complex<T>* E, T sx, T sy, T z, T k, size_t nx, size_t ny){ | ||
209 | + | ||
210 | + size_t field_bytes = sizeof(stim::complex<T>) * nx * ny; | ||
211 | + stim::complex<T>* host_E = (stim::complex<T>*) malloc( field_bytes); | ||
212 | + HANDLE_ERROR( cudaMemcpy(host_E, E, field_bytes, cudaMemcpyDeviceToHost) ); | ||
213 | + | ||
214 | + stim::complex<T>* host_Enew = (stim::complex<T>*) malloc(field_bytes); | ||
215 | + | ||
216 | + cpu_scalar_propagate(host_Enew, host_E, sx, sy, z, k, nx, ny); | ||
217 | + | ||
218 | + HANDLE_ERROR( cudaMemcpy(Enew, host_Enew, field_bytes, cudaMemcpyHostToDevice) ); | ||
219 | + free(host_E); | ||
220 | + free(host_Enew); | ||
150 | } | 221 | } |
151 | 222 | ||
152 | /// Apply a lowpass filter to a field slice | 223 | /// Apply a lowpass filter to a field slice |
@@ -165,9 +236,9 @@ namespace stim{ | @@ -165,9 +236,9 @@ namespace stim{ | ||
165 | T Kx, Ky; //width and height in k space | 236 | T Kx, Ky; //width and height in k space |
166 | cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); | 237 | cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); |
167 | 238 | ||
168 | - T* mag = (T*) malloc( sizeof(T) * nx * ny ); | ||
169 | - stim::abs(mag, K, nx * ny); | ||
170 | - stim::cpu2image<float>(mag, "kspace_pre_lowpass.bmp", nx, ny, stim::cmBrewer); | 239 | + //T* mag = (T*) malloc( sizeof(T) * nx * ny ); |
240 | + //stim::abs(mag, K, nx * ny); | ||
241 | + //stim::cpu2image<float>(mag, "kspace_pre_lowpass.bmp", nx, ny, stim::cmBrewer); | ||
171 | 242 | ||
172 | size_t kxi, kyi; | 243 | size_t kxi, kyi; |
173 | size_t i; | 244 | size_t i; |
@@ -200,10 +271,11 @@ namespace stim{ | @@ -200,10 +271,11 @@ namespace stim{ | ||
200 | } | 271 | } |
201 | } | 272 | } |
202 | 273 | ||
203 | - stim::abs(mag, K, nx * ny); | ||
204 | - stim::cpu2image<float>(mag, "kspace_post_lowpass.bmp", nx, ny, stim::cmBrewer); | 274 | + //stim::abs(mag, K, nx * ny); |
275 | + //stim::cpu2image<float>(mag, "kspace_post_lowpass.bmp", nx, ny, stim::cmBrewer); | ||
205 | 276 | ||
206 | cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); | 277 | cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); |
278 | + free(K); | ||
207 | } | 279 | } |
208 | 280 | ||
209 | enum locationType {CPUmem, GPUmem}; | 281 | enum locationType {CPUmem, GPUmem}; |
@@ -222,6 +294,8 @@ protected: | @@ -222,6 +294,8 @@ protected: | ||
222 | size_t R[2]; | 294 | size_t R[2]; |
223 | locationType loc; | 295 | locationType loc; |
224 | 296 | ||
297 | + T* p[3]; //scalar position for each point in E | ||
298 | + | ||
225 | /// Convert the field to a k-space representation (do an FFT) | 299 | /// Convert the field to a k-space representation (do an FFT) |
226 | void to_kspace(T& kx, T& ky){ | 300 | void to_kspace(T& kx, T& ky){ |
227 | cpu_scalar_to_kspace(E, kx, ky, E, X.len(), Y.len(), R[0], R[1]); | 301 | cpu_scalar_to_kspace(E, kx, ky, E, X.len(), Y.len(), R[0], R[1]); |
@@ -252,6 +326,9 @@ public: | @@ -252,6 +326,9 @@ public: | ||
252 | E = (stim::complex<T>*) malloc(grid_bytes()); //allocate in CPU memory | 326 | E = (stim::complex<T>*) malloc(grid_bytes()); //allocate in CPU memory |
253 | memset(E, 0, grid_bytes()); | 327 | memset(E, 0, grid_bytes()); |
254 | loc = CPUmem; | 328 | loc = CPUmem; |
329 | + | ||
330 | + p[0] = p[1] = p[2] = NULL; //set the position vector to NULL | ||
331 | + | ||
255 | } | 332 | } |
256 | 333 | ||
257 | ~scalarfield(){ | 334 | ~scalarfield(){ |
@@ -271,11 +348,35 @@ public: | @@ -271,11 +348,35 @@ public: | ||
271 | if(loc == GPUmem) return; | 348 | if(loc == GPUmem) return; |
272 | else{ | 349 | else{ |
273 | stim::complex<T>* dev_E; | 350 | stim::complex<T>* dev_E; |
274 | - HANDLE_ERROR( cudaMalloc(&dev_E, e_bytes()) ); //allocate GPU memory | ||
275 | - HANDLE_ERROR( cudaMemcpy(dev_E, E, e_bytes(), cudaMemcpyHostToDevice) ); //copy the field to the GPU | 351 | + HANDLE_ERROR( cudaMalloc(&dev_E, grid_bytes()) ); //allocate GPU memory |
352 | + HANDLE_ERROR( cudaMemcpy(dev_E, E, grid_bytes(), cudaMemcpyHostToDevice) ); //copy the field to the GPU | ||
276 | free(E); //free the CPU memory | 353 | free(E); //free the CPU memory |
277 | E = dev_E; //swap pointers | 354 | E = dev_E; //swap pointers |
355 | + | ||
356 | + if(p[0]){ | ||
357 | + size_t meshgrid_bytes = size() * sizeof(T); //calculate the number of bytes in each meshgrid | ||
358 | + T* dev_X; //allocate variables to store the device meshgrid | ||
359 | + T* dev_Y; | ||
360 | + T* dev_Z; | ||
361 | + HANDLE_ERROR( cudaMalloc(&dev_X, meshgrid_bytes) ); //allocate space for the meshgrid on the device | ||
362 | + HANDLE_ERROR( cudaMalloc(&dev_Y, meshgrid_bytes) ); | ||
363 | + HANDLE_ERROR( cudaMalloc(&dev_Z, meshgrid_bytes) ); | ||
364 | + | ||
365 | + HANDLE_ERROR( cudaMemcpy(dev_X, p[0], meshgrid_bytes, cudaMemcpyHostToDevice) ); //copy from the host to the device | ||
366 | + HANDLE_ERROR( cudaMemcpy(dev_Y, p[1], meshgrid_bytes, cudaMemcpyHostToDevice) ); | ||
367 | + HANDLE_ERROR( cudaMemcpy(dev_Z, p[2], meshgrid_bytes, cudaMemcpyHostToDevice) ); | ||
368 | + | ||
369 | + free(p[0]); //free device memory | ||
370 | + free(p[1]); | ||
371 | + free(p[2]); | ||
372 | + | ||
373 | + p[0] = dev_X; //swap in the new pointers to device memory | ||
374 | + p[1] = dev_Y; | ||
375 | + p[2] = dev_Z; | ||
376 | + } | ||
377 | + loc = GPUmem; //set the location flag | ||
278 | } | 378 | } |
379 | + | ||
279 | } | 380 | } |
280 | 381 | ||
281 | /// Copy the field array to the CPU, if it isn't already there | 382 | /// Copy the field array to the CPU, if it isn't already there |
@@ -286,14 +387,43 @@ public: | @@ -286,14 +387,43 @@ public: | ||
286 | HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU | 387 | HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU |
287 | HANDLE_ERROR( cudaFree(E) ); //free device memory | 388 | HANDLE_ERROR( cudaFree(E) ); //free device memory |
288 | E = host_E; //swap pointers | 389 | E = host_E; //swap pointers |
390 | + | ||
391 | + //copy a meshgrid has been created | ||
392 | + if(p[0]){ | ||
393 | + size_t meshgrid_bytes = size() * sizeof(T); //move X to the CPU | ||
394 | + T* host_X = (T*) malloc( meshgrid_bytes ); | ||
395 | + T* host_Y = (T*) malloc( meshgrid_bytes ); | ||
396 | + T* host_Z = (T*) malloc( meshgrid_bytes ); | ||
397 | + | ||
398 | + HANDLE_ERROR( cudaMemcpy(host_X, p[0], meshgrid_bytes, cudaMemcpyDeviceToHost) ); | ||
399 | + HANDLE_ERROR( cudaMemcpy(host_Y, p[1], meshgrid_bytes, cudaMemcpyDeviceToHost) ); | ||
400 | + HANDLE_ERROR( cudaMemcpy(host_Z, p[2], meshgrid_bytes, cudaMemcpyDeviceToHost) ); | ||
401 | + | ||
402 | + HANDLE_ERROR( cudaFree(p[0]) ); | ||
403 | + HANDLE_ERROR( cudaFree(p[1]) ); | ||
404 | + HANDLE_ERROR( cudaFree(p[2]) ); | ||
405 | + | ||
406 | + p[0] = host_X; | ||
407 | + p[1] = host_Y; | ||
408 | + p[2] = host_Z; | ||
409 | + } | ||
410 | + loc = CPUmem; | ||
289 | } | 411 | } |
290 | } | 412 | } |
291 | 413 | ||
414 | + bool gpu(){ | ||
415 | + if(loc == GPUmem) return true; | ||
416 | + else return false; | ||
417 | + } | ||
418 | + | ||
292 | /// Propagate the field along its orthogonal direction by a distance d | 419 | /// Propagate the field along its orthogonal direction by a distance d |
293 | void propagate(T d, T k){ | 420 | void propagate(T d, T k){ |
294 | - //X[2] += d; //move the plane position along the propagation direction | ||
295 | - //Y[2] += d; | ||
296 | - cpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); | 421 | + if(loc == CPUmem){ |
422 | + cpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); | ||
423 | + } | ||
424 | + else{ | ||
425 | + gpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); | ||
426 | + } | ||
297 | } | 427 | } |
298 | 428 | ||
299 | /// Propagate the field along its orthogonal direction by a distance d | 429 | /// Propagate the field along its orthogonal direction by a distance d |
@@ -312,6 +442,7 @@ public: | @@ -312,6 +442,7 @@ public: | ||
312 | } | 442 | } |
313 | 443 | ||
314 | if(loc == CPUmem){ | 444 | if(loc == CPUmem){ |
445 | + cropped.to_cpu(); //make sure that the cropped image is on the CPU | ||
315 | size_t x, y; | 446 | size_t x, y; |
316 | size_t sx, sy, si, di; | 447 | size_t sx, sy, si, di; |
317 | for(y = 0; y < Cy; y++){ | 448 | for(y = 0; y < Cy; y++){ |
@@ -325,8 +456,8 @@ public: | @@ -325,8 +456,8 @@ public: | ||
325 | } | 456 | } |
326 | } | 457 | } |
327 | else{ | 458 | else{ |
328 | - std::cout<<"Error: cropping not supported for GPU memory yet."<<std::endl; | ||
329 | - exit(1); | 459 | + cropped.to_gpu(); //make sure that the cropped image is also on the GPU |
460 | + gpu_crop2d<stim::complex<T>>(cropped.E, E, R[0], R[1], Cx * padding, Cy * padding, Cx, Cy); | ||
330 | } | 461 | } |
331 | } | 462 | } |
332 | 463 | ||
@@ -346,6 +477,10 @@ public: | @@ -346,6 +477,10 @@ public: | ||
346 | return E; | 477 | return E; |
347 | } | 478 | } |
348 | 479 | ||
480 | + T* x(){ return p[0]; } | ||
481 | + T* y(){ return p[1]; } | ||
482 | + T* z(){ return p[2]; } | ||
483 | + | ||
349 | /// Evaluate the cartesian coordinates of each point in the field. The resulting arrays are allocated in the same memory where the field is stored. | 484 | /// Evaluate the cartesian coordinates of each point in the field. The resulting arrays are allocated in the same memory where the field is stored. |
350 | void meshgrid(T* X, T* Y, T* Z, locationType location){ | 485 | void meshgrid(T* X, T* Y, T* Z, locationType location){ |
351 | //size_t array_size = sizeof(T) * R[0] * R[1]; | 486 | //size_t array_size = sizeof(T) * R[0] * R[1]; |
@@ -376,6 +511,21 @@ public: | @@ -376,6 +511,21 @@ public: | ||
376 | } | 511 | } |
377 | } | 512 | } |
378 | 513 | ||
514 | + /// Create a local meshgrid | ||
515 | + void meshgrid(){ | ||
516 | + if(p[0]) return; //if the p[0] value is not NULL, a meshgrid has already been created | ||
517 | + if(loc == CPUmem){ | ||
518 | + p[0] = (T*) malloc( size() * sizeof(T) ); | ||
519 | + p[1] = (T*) malloc( size() * sizeof(T) ); | ||
520 | + p[2] = (T*) malloc( size() * sizeof(T) ); | ||
521 | + } | ||
522 | + else{ | ||
523 | + std::cout<<"GPUmem meshgrid isn't implemented yet."<<std::endl; | ||
524 | + exit(1); | ||
525 | + } | ||
526 | + meshgrid(p[0], p[1], p[2], loc); | ||
527 | + } | ||
528 | + | ||
379 | //clear the field, setting all values to zero | 529 | //clear the field, setting all values to zero |
380 | void clear(){ | 530 | void clear(){ |
381 | if(loc == GPUmem) | 531 | if(loc == GPUmem) |
@@ -386,25 +536,49 @@ public: | @@ -386,25 +536,49 @@ public: | ||
386 | 536 | ||
387 | void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer){ | 537 | void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer){ |
388 | 538 | ||
389 | - if(loc == GPUmem) to_cpu(); //if the field is in the GPU, move it to the CPU | ||
390 | - T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image | ||
391 | - | ||
392 | - switch(type){ //get the specified component from the complex value | ||
393 | - case complexMag: | ||
394 | - stim::abs(image, E, size()); | ||
395 | - break; | ||
396 | - case complexReal: | ||
397 | - stim::real(image, E, size()); | ||
398 | - break; | ||
399 | - case complexImaginary: | ||
400 | - stim::imag(image, E, size()); | ||
401 | - break; | ||
402 | - case complexIntensity: | ||
403 | - stim::intensity(image, E, size()); | ||
404 | - break; | 539 | + if(loc == GPUmem){ |
540 | + T* image; | ||
541 | + HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) ); | ||
542 | + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | ||
543 | + dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks | ||
544 | + | ||
545 | + switch(type){ | ||
546 | + case complexMag: | ||
547 | + cuda_abs<T><<< blocks, threads >>>(image, E, size()); | ||
548 | + break; | ||
549 | + case complexReal: | ||
550 | + cuda_real<T><<< blocks, threads >>>(image, E, size()); | ||
551 | + break; | ||
552 | + case complexImaginary: | ||
553 | + cuda_imag<T><<< blocks, threads >>>(image, E, size()); | ||
554 | + break; | ||
555 | + case complexIntensity: | ||
556 | + cuda_intensity<T><<< blocks, threads >>>(image, E, size()); | ||
557 | + break; | ||
558 | + } | ||
559 | + stim::gpu2image<T>(image, filename, R[0], R[1], stim::cmBrewer); | ||
560 | + HANDLE_ERROR( cudaFree(image) ); | ||
561 | + } | ||
562 | + else{ | ||
563 | + T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image | ||
564 | + | ||
565 | + switch(type){ //get the specified component from the complex value | ||
566 | + case complexMag: | ||
567 | + stim::abs(image, E, size()); | ||
568 | + break; | ||
569 | + case complexReal: | ||
570 | + stim::real(image, E, size()); | ||
571 | + break; | ||
572 | + case complexImaginary: | ||
573 | + stim::imag(image, E, size()); | ||
574 | + break; | ||
575 | + case complexIntensity: | ||
576 | + stim::intensity(image, E, size()); | ||
577 | + break; | ||
578 | + } | ||
579 | + stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image | ||
580 | + free(image); //free the real image | ||
405 | } | 581 | } |
406 | - stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image | ||
407 | - free(image); //free the real image | ||
408 | } | 582 | } |
409 | 583 | ||
410 | void image(T* img, stim::complexComponentType type = complexMag){ | 584 | void image(T* img, stim::complexComponentType type = complexMag){ |
@@ -430,15 +604,23 @@ public: | @@ -430,15 +604,23 @@ public: | ||
430 | 604 | ||
431 | //adds the field intensity to the output array (useful for calculating detector response to incoherent fields) | 605 | //adds the field intensity to the output array (useful for calculating detector response to incoherent fields) |
432 | void intensity(T* out){ | 606 | void intensity(T* out){ |
433 | - if(loc == GPUmem) to_cpu(); //if the field is in the GPU, move it to the CPU | ||
434 | - T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image | ||
435 | - stim::intensity(image, E, size()); //calculate the intensity | 607 | + if(loc == GPUmem){ |
608 | + //T* image; | ||
609 | + //HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) ); | ||
610 | + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | ||
611 | + dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks | ||
612 | + cuda_sum_intensity<T><<< blocks, threads >>>(out, E, size()); | ||
613 | + } | ||
614 | + else{ | ||
615 | + T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image | ||
616 | + stim::intensity(image, E, size()); //calculate the intensity | ||
436 | 617 | ||
437 | - size_t N = size(); //calculate the number of elements in the field | ||
438 | - for(size_t n = 0; n < N; n++) //for each point in the field | ||
439 | - out[n] += image[n]; //add the field intensity to the output image | 618 | + size_t N = size(); //calculate the number of elements in the field |
619 | + for(size_t n = 0; n < N; n++) //for each point in the field | ||
620 | + out[n] += image[n]; //add the field intensity to the output image | ||
440 | 621 | ||
441 | - free(image); //free the temporary intensity image | 622 | + free(image); //free the temporary intensity image |
623 | + } | ||
442 | } | 624 | } |
443 | 625 | ||
444 | }; //end class scalarfield | 626 | }; //end class scalarfield |
stim/optics/scalarmie.h
@@ -149,16 +149,16 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti | @@ -149,16 +149,16 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti | ||
149 | 149 | ||
150 | size_t max_shared_mem = stim::sharedMemPerBlock(); | 150 | size_t max_shared_mem = stim::sharedMemPerBlock(); |
151 | size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); | 151 | size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); |
152 | - std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; | ||
153 | - std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | 152 | + //std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; |
153 | + //std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | ||
154 | int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); | 154 | int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); |
155 | - std::cout<<"threads per block: "<<threads<<std::endl; | 155 | + //std::cout<<"threads per block: "<<threads<<std::endl; |
156 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks | 156 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks |
157 | 157 | ||
158 | size_t shared_mem; | 158 | size_t shared_mem; |
159 | if(Nl <= LOCAL_NL) shared_mem = 0; | 159 | if(Nl <= LOCAL_NL) shared_mem = 0; |
160 | else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate | 160 | else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate |
161 | - std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; | 161 | + //std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; |
162 | cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel | 162 | cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel |
163 | } | 163 | } |
164 | 164 | ||
@@ -174,52 +174,20 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ | @@ -174,52 +174,20 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ | ||
174 | 174 | ||
175 | r[i] = p.len(); | 175 | r[i] = p.len(); |
176 | } | 176 | } |
177 | -/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave | ||
178 | - | ||
179 | -/// @param E is a pointer to the destination field values | ||
180 | -/// @param N is the number of points used to calculate the field | ||
181 | -/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) | ||
182 | -/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) | ||
183 | -/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) | ||
184 | -/// @param W is an array of planewaves that will be scattered | ||
185 | -/// @param a is the radius of the sphere | ||
186 | -/// @param n is the complex refractive index of the sphere | ||
187 | template<typename T> | 177 | template<typename T> |
188 | -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ | 178 | +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ |
179 | + | ||
189 | //calculate the necessary number of orders required to represent the scattered field | 180 | //calculate the necessary number of orders required to represent the scattered field |
190 | T k = W[0].kmag(); | 181 | T k = W[0].kmag(); |
191 | 182 | ||
192 | int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); | 183 | int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); |
193 | if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) | 184 | if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) |
194 | - std::cout<<"Nl: "<<Nl<<std::endl; | 185 | + //std::cout<<"Nl: "<<Nl<<std::endl; |
195 | 186 | ||
196 | //calculate the scattering coefficients for the sphere | 187 | //calculate the scattering coefficients for the sphere |
197 | stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients | 188 | stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients |
198 | - B_coefficients(B, a, k, n, Nl); | ||
199 | - | ||
200 | -#ifdef CUDA_FOUND | ||
201 | - stim::complex<T>* dev_E; //allocate space for the field | ||
202 | - cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); | ||
203 | - cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); | ||
204 | - //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used) | ||
205 | - | ||
206 | - // COORDINATES | ||
207 | - T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) | ||
208 | - if(x != NULL){ | ||
209 | - HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); | ||
210 | - HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
211 | - } | ||
212 | - T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) | ||
213 | - if(y != NULL){ | ||
214 | - HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); | ||
215 | - HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
216 | - } | ||
217 | - T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) | ||
218 | - if(z != NULL){ | ||
219 | - HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); | ||
220 | - HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
221 | - } | ||
222 | - | 189 | + B_coefficients(B, a, k, n, Nl); |
190 | + | ||
223 | // PLANE WAVES | 191 | // PLANE WAVES |
224 | stim::scalarwave<T>* dev_W; //allocate space and copy plane waves | 192 | stim::scalarwave<T>* dev_W; //allocate space and copy plane waves |
225 | HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); | 193 | HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); |
@@ -232,7 +200,7 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -232,7 +200,7 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
232 | 200 | ||
233 | int threads = stim::maxThreadsPerBlock(); | 201 | int threads = stim::maxThreadsPerBlock(); |
234 | dim3 blocks((unsigned)(N / threads + 1)); | 202 | dim3 blocks((unsigned)(N / threads + 1)); |
235 | - cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); | 203 | + cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N); |
236 | 204 | ||
237 | //Find the minimum and maximum values of r | 205 | //Find the minimum and maximum values of r |
238 | cublasStatus_t stat; | 206 | cublasStatus_t stat; |
@@ -280,7 +248,7 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -280,7 +248,7 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
280 | size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut; | 248 | size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut; |
281 | stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table | 249 | stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table |
282 | T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT | 250 | T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT |
283 | - std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl; | 251 | + //std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl; |
284 | stim::complex<T> hl; | 252 | stim::complex<T> hl; |
285 | for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT | 253 | for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT |
286 | stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] | 254 | stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] |
@@ -292,18 +260,57 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -292,18 +260,57 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
292 | //std::cout<<hB_lut[ri * (Nl + 1) + l]<<std::endl; | 260 | //std::cout<<hB_lut[ri * (Nl + 1) + l]<<std::endl; |
293 | } | 261 | } |
294 | } | 262 | } |
295 | - T* real_lut = (T*) malloc(hB_bytes/2); | ||
296 | - stim::real(real_lut, hB_lut, N_hB_lut); | ||
297 | - stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); | 263 | + //T* real_lut = (T*) malloc(hB_bytes/2); |
264 | + //stim::real(real_lut, hB_lut, N_hB_lut); | ||
265 | + //stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); | ||
298 | 266 | ||
299 | //Allocate device memory and copy everything to the GPU | 267 | //Allocate device memory and copy everything to the GPU |
300 | stim::complex<T>* dev_hB_lut; | 268 | stim::complex<T>* dev_hB_lut; |
301 | HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); | 269 | HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); |
302 | HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); | 270 | HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); |
303 | 271 | ||
304 | - gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); | 272 | + gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); |
305 | 273 | ||
306 | - cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory | 274 | + cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory |
275 | +} | ||
276 | +/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave | ||
277 | + | ||
278 | +/// @param E is a pointer to the destination field values | ||
279 | +/// @param N is the number of points used to calculate the field | ||
280 | +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) | ||
281 | +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) | ||
282 | +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) | ||
283 | +/// @param W is an array of planewaves that will be scattered | ||
284 | +/// @param a is the radius of the sphere | ||
285 | +/// @param n is the complex refractive index of the sphere | ||
286 | +template<typename T> | ||
287 | +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ | ||
288 | + | ||
289 | + | ||
290 | +#ifdef CUDA_FOUND | ||
291 | + stim::complex<T>* dev_E; //allocate space for the field | ||
292 | + cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); | ||
293 | + cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); | ||
294 | + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used) | ||
295 | + | ||
296 | + // COORDINATES | ||
297 | + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) | ||
298 | + if(x != NULL){ | ||
299 | + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); | ||
300 | + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
301 | + } | ||
302 | + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) | ||
303 | + if(y != NULL){ | ||
304 | + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); | ||
305 | + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
306 | + } | ||
307 | + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) | ||
308 | + if(z != NULL){ | ||
309 | + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); | ||
310 | + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
311 | + } | ||
312 | + | ||
313 | + gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, r_spacing); | ||
307 | 314 | ||
308 | if(x != NULL) cudaFree(dev_x); //free everything | 315 | if(x != NULL) cudaFree(dev_x); //free everything |
309 | if(y != NULL) cudaFree(dev_y); | 316 | if(y != NULL) cudaFree(dev_y); |
@@ -311,6 +318,16 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -311,6 +318,16 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
311 | cudaFree(dev_E); | 318 | cudaFree(dev_E); |
312 | #else | 319 | #else |
313 | 320 | ||
321 | + //calculate the necessary number of orders required to represent the scattered field | ||
322 | + T k = W[0].kmag(); | ||
323 | + | ||
324 | + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); | ||
325 | + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) | ||
326 | + //std::cout<<"Nl: "<<Nl<<std::endl; | ||
327 | + | ||
328 | + //calculate the scattering coefficients for the sphere | ||
329 | + stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients | ||
330 | + B_coefficients(B, a, k, n, Nl); | ||
314 | 331 | ||
315 | //allocate space to store the bessel function call results | 332 | //allocate space to store the bessel function call results |
316 | double vm; | 333 | double vm; |
@@ -422,16 +439,16 @@ void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -422,16 +439,16 @@ void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
422 | 439 | ||
423 | size_t max_shared_mem = stim::sharedMemPerBlock(); | 440 | size_t max_shared_mem = stim::sharedMemPerBlock(); |
424 | size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); | 441 | size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); |
425 | - std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; | ||
426 | - std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | 442 | + //std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; |
443 | + //std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | ||
427 | int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); | 444 | int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); |
428 | - std::cout<<"threads per block: "<<threads<<std::endl; | 445 | + //std::cout<<"threads per block: "<<threads<<std::endl; |
429 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks | 446 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks |
430 | 447 | ||
431 | size_t shared_mem; | 448 | size_t shared_mem; |
432 | if(Nl <= LOCAL_NL) shared_mem = 0; | 449 | if(Nl <= LOCAL_NL) shared_mem = 0; |
433 | else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate | 450 | else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate |
434 | - std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; | 451 | + //std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; |
435 | cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel | 452 | cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel |
436 | } | 453 | } |
437 | 454 | ||
@@ -452,7 +469,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -452,7 +469,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
452 | 469 | ||
453 | int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); | 470 | int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); |
454 | if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) | 471 | if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) |
455 | - std::cout<<"Nl: "<<Nl<<std::endl; | 472 | + //std::cout<<"Nl: "<<Nl<<std::endl; |
456 | 473 | ||
457 | //calculate the scattering coefficients for the sphere | 474 | //calculate the scattering coefficients for the sphere |
458 | stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients | 475 | stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients |
@@ -537,7 +554,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -537,7 +554,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
537 | size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut; | 554 | size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut; |
538 | stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes); //pointer to the look-up table | 555 | stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes); //pointer to the look-up table |
539 | T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT | 556 | T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT |
540 | - std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl; | 557 | + //std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl; |
541 | stim::complex<T> hl; | 558 | stim::complex<T> hl; |
542 | stim::complex<double> nd = (stim::complex<double>)n; | 559 | stim::complex<double> nd = (stim::complex<double>)n; |
543 | for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT | 560 | for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT |
@@ -559,6 +576,10 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -559,6 +576,10 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
559 | if(x != NULL) cudaFree(dev_x); //free everything | 576 | if(x != NULL) cudaFree(dev_x); //free everything |
560 | if(y != NULL) cudaFree(dev_y); | 577 | if(y != NULL) cudaFree(dev_y); |
561 | if(z != NULL) cudaFree(dev_z); | 578 | if(z != NULL) cudaFree(dev_z); |
579 | + HANDLE_ERROR( cudaFree(dev_jA_lut) ); | ||
580 | + HANDLE_ERROR( cudaFree(dev_E) ); | ||
581 | + HANDLE_ERROR( cudaFree(dev_W) ); | ||
582 | + HANDLE_ERROR( cudaFree(dev_r) ); | ||
562 | cudaFree(dev_E); | 583 | cudaFree(dev_E); |
563 | #else | 584 | #else |
564 | 585 | ||
@@ -624,19 +645,44 @@ public: | @@ -624,19 +645,44 @@ public: | ||
624 | n = ri; | 645 | n = ri; |
625 | } | 646 | } |
626 | 647 | ||
648 | + void sum_scat(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){ | ||
649 | + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | ||
650 | + stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | ||
651 | + } | ||
652 | + | ||
653 | + void sum_intern(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){ | ||
654 | + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | ||
655 | + stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | ||
656 | + } | ||
657 | + | ||
658 | + void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ | ||
659 | + b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion | ||
660 | + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | ||
661 | + sum_scat(E, X, Y, Z, b, samples); | ||
662 | + sum_intern(E, X, Y, Z, b, samples); | ||
663 | + } | ||
664 | + | ||
627 | void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ | 665 | void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ |
628 | 666 | ||
629 | - size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid | 667 | + /*size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid |
630 | float* X = (float*) malloc( array_size ); //allocate space for the coordinate meshes | 668 | float* X = (float*) malloc( array_size ); //allocate space for the coordinate meshes |
631 | float* Y = (float*) malloc( array_size ); | 669 | float* Y = (float*) malloc( array_size ); |
632 | float* Z = (float*) malloc( array_size ); | 670 | float* Z = (float*) malloc( array_size ); |
633 | E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes | 671 | E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes |
634 | - | ||
635 | - b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion | 672 | + */ |
673 | + E.meshgrid(); | ||
674 | + b.eval(E, order); | ||
636 | 675 | ||
637 | std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | 676 | std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves |
638 | - stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | ||
639 | - stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | 677 | + |
678 | + if(E.gpu()){ | ||
679 | + stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); | ||
680 | + } | ||
681 | + else{ | ||
682 | + stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); | ||
683 | + stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); | ||
684 | + } | ||
685 | + //eval(E, X, Y, Z, b, order, samples); //evaluate the field | ||
640 | } | 686 | } |
641 | 687 | ||
642 | }; //end stim::scalarmie | 688 | }; //end stim::scalarmie |
stim/visualization/colormap.h
@@ -4,6 +4,7 @@ | @@ -4,6 +4,7 @@ | ||
#include <string>
#include <stdlib.h>
#include <cmath>
#include "cublas_v2.h"
#include <thrust/device_ptr.h>
#include <thrust/extrema.h>
#include <thrust/pair.h>
7 | 8 | ||
8 | #ifdef _WIN32 | 9 | #ifdef _WIN32 |
9 | #include <float.h> | 10 | #include <float.h> |
@@ -219,6 +220,42 @@ static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, u | @@ -219,6 +220,42 @@ static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, u | ||
219 | free(cpuBuffer); | 220 | free(cpuBuffer); |
220 | } | 221 | } |
221 | 222 | ||
/// save a GPU image to a file using automatic scaling
/// The cuBLAS-based version used cublasIsamin/cublasIsamax, which select the
/// element of minimum/maximum *absolute* value (and only support float), so the
/// display range was wrong for data containing negative values. thrust::minmax_element
/// finds the true signed extrema on the device for any element type T.
/// @param gpuSource device pointer to x_size*y_size elements of type T
/// @param fileDest path of the output image file
/// @param x_size width of the image in pixels
/// @param y_size height of the image in pixels
/// @param cm colormap used to map scalar values to colors
template<typename T>
static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, unsigned int y_size, colormapType cm = cmGrayscale){
	size_t N = (size_t)x_size * (size_t)y_size;			//calculate the total number of elements in the image
	if(N == 0) return;									//nothing to save (also avoids dereferencing the end of an empty range below)

	thrust::device_ptr<T> p(gpuSource);					//wrap the raw device pointer for use with thrust algorithms
	thrust::pair< thrust::device_ptr<T>, thrust::device_ptr<T> > mm =
		thrust::minmax_element(p, p + N);				//single pass over the image on the GPU, true signed min/max

	T v_min = *mm.first;								//dereferencing a device_ptr copies the value back to the host
	T v_max = *mm.second;

	gpu2image<T>(gpuSource, fileDest, x_size, y_size, v_min, v_max, cm);	//delegate to the fixed-range overload
}
258 | + | ||
222 | #endif | 259 | #endif |
223 | 260 | ||
224 | template<class T> | 261 | template<class T> |