diff --git a/stim/cuda/cudatools/error.h b/stim/cuda/cudatools/error.h
index 738bbb4..ddce736 100644
--- a/stim/cuda/cudatools/error.h
+++ b/stim/cuda/cudatools/error.h
@@ -1,22 +1,24 @@
+#ifndef STIM_CUDA_ERROR_H
+#define STIM_CUDA_ERROR_H
+
 #include <stdio.h>
 #include <iostream>
 using namespace std;
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
 #include "cufft.h"
-
-#ifndef CUDA_HANDLE_ERROR_H
-#define CUDA_HANDLE_ERROR_H
+#include "cublas_v2.h"
 
 //handle error macro
-static void HandleError( cudaError_t err, const char *file,  int line ) {
+static void cuHandleError( cudaError_t err, const char *file,  int line ) {
    	if (err != cudaSuccess) {
             printf("%s in %s at line %d\n", cudaGetErrorString( err ),  file, line );
+
    	}
 }
-#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
+#define HANDLE_ERROR( err ) (cuHandleError( err, __FILE__, __LINE__ ))
 
-static void CufftError( cufftResult err )
+static void cufftHandleError( cufftResult err, const char*file, int line )
 {
     if (err != CUFFT_SUCCESS)
     {
@@ -37,7 +39,29 @@ static void CufftError( cufftResult err )
 
     }
 }
+#define CUFFT_HANDLE_ERROR( err ) (cufftHandleError( err, __FILE__, __LINE__ ))
 
+static void cublasHandleError( cublasStatus_t err, const char*file, int line ){
+	if(err != CUBLAS_STATUS_SUCCESS){
+		if(err == CUBLAS_STATUS_NOT_INITIALIZED)
+			std::cout<<"CUBLAS_STATUS_NOT_INITIALIZED" <<" in file "<<file<<" line "<<std::endl;
+		else if(err == CUBLAS_STATUS_ALLOC_FAILED)
+			std::cout<<"CUBLAS_STATUS_ALLOC_FAILED" <<" in file "<<file<<" line "<<std::endl;
+		else if(err == CUBLAS_STATUS_INVALID_VALUE)
+			std::cout<<"CUBLAS_STATUS_INVALID_VALUE" <<" in file "<<file<<" line "<<std::endl;
+		else if(err == CUBLAS_STATUS_ARCH_MISMATCH)
+			std::cout<<"CUBLAS_STATUS_ARCH_MISMATCH" <<" in file "<<file<<" line "<<std::endl;
+		else if(err == CUBLAS_STATUS_MAPPING_ERROR)
+			std::cout<<"CUBLAS_STATUS_MAPPING_ERROR" <<" in file "<<file<<" line "<<std::endl;
+		else if(err == CUBLAS_STATUS_EXECUTION_FAILED)
+			std::cout<<"CUBLAS_STATUS_EXECUTION_FAILED" <<" in file "<<file<<" line "<<std::endl;
+		else if(err == CUBLAS_STATUS_INTERNAL_ERROR)
+			std::cout<<"CUBLAS_STATUS_INTERNAL_ERROR" <<" in file "<<file<<" line "<<std::endl;
+		else
+			std::cout<<"Unknown error"<<" in file "<<file<<" line "<<std::endl;
+	}
+}
+#define CUBLAS_HANDLE_ERROR( err ) (cublasHandleError( err, __FILE__, __LINE__ ))
 
 
 #endif
diff --git a/stim/envi/agilent_binary.h b/stim/envi/agilent_binary.h
index 8662329..1dc5cbf 100644
--- a/stim/envi/agilent_binary.h
+++ b/stim/envi/agilent_binary.h
@@ -6,11 +6,11 @@
 #include <fstream>
 
 //CUDA
-#ifdef CUDA_FOUND
-	#include <cuda_runtime.h>
-	#include "cufft.h"
-	#include <stim/cuda/cudatools/error.h>
-#endif
+//#ifdef CUDA_FOUND
+#include <cuda_runtime.h>
+#include "cufft.h"
+#include <stim/cuda/cudatools/error.h>
+//#endif
 
 namespace stim{
 
@@ -42,6 +42,10 @@ public:
 		alloc();
 	}
 
+	size_t dim(size_t i){
+		return R[i];
+	}
+
 	/// Create a deep copy of an agileng_binary object
 	void deep_copy(agilent_binary<T>* dst, const agilent_binary<T>* src){
 		dst->alloc(src->R[0], src->R[1], src->R[2]);			//allocate memory
@@ -169,7 +173,6 @@ public:
 	void zeropad(){
 		size_t newZ = pow(2, ceil(log(R[2])/log(2)));			//find the nearest power-of-two
 		size_t n = newZ - R[2];									//calculate the number of bands to add
-		std::cout<<"band padding: "<<n<<std::endl;
 		zeropad(n);												//add the padding
 	}
 
@@ -184,7 +187,7 @@ public:
 			ptr[i] = -log10(ptr[i] / background->ptr[i]);
 	}
 
-#ifdef CUDA_FOUND
+//#ifdef CUDA_FOUND
 	/// Perform an FFT and return a binary file with bands in the specified range
 	agilent_binary<T> fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2){
 		auto total_start = std::chrono::high_resolution_clock::now();
@@ -271,7 +274,7 @@ public:
 
 		return result;
 	}
-#endif
+//#endif
 
 };
 
diff --git a/stim/envi/bil.h b/stim/envi/bil.h
index d68e6ca..cd6450f 100644
--- a/stim/envi/bil.h
+++ b/stim/envi/bil.h
@@ -4,6 +4,7 @@
 #include "../envi/envi_header.h"
 #include "../envi/hsi.h"
 #include "../math/fd_coefficients.h"
+#include <stim/cuda/cudatools/error.h>
 #include <cstring>
 #include <utility>
 #include <deque>
@@ -224,10 +225,44 @@ public:
 	}
 
 	//given a Y ,return a XZ slice
-	bool read_plane_y(T * p, unsigned long long y){
+	bool read_plane_xz(T * p, size_t y){
 		return binary<T>::read_plane_2(p, y);
 	}
 
+	//given a Y, return ZX slice (transposed such that the spectrum is the leading dimension)
+	int read_plane_zx(T* p, size_t y){
+		T* temp = (T*) malloc(X() * Z() * sizeof(T));	//allocate space to store the temporary xz plane
+		binary<T>::read_plane_2(temp, y);					//load the plane from disk
+		size_t z, x;
+		for(z = 0; z < Z(); z++){
+			for(x = 0; x <= z; x++){
+				p[x * Z() + z] = temp[z * X() + x];		//copy to the destination frame
+			}
+		}
+	}
+
+	//load a frame y into a pre-allocated double-precision array
+	int read_plane_xzd(double* f, size_t y){		
+		size_t XB = X() * B();
+		T* temp = (T*) malloc(XB * sizeof(T));			//create a temporary location to store the plane at current precision
+		if(!read_plane_y(temp, y)) return 1;			//read the plane in its native format, if it fails return a 1
+		for(size_t i = 0; i < XB; i++) f[i] = temp[i];	//convert the plane to a double
+		return 0;
+	}
+
+	//given a Y, return ZX slice (transposed such that the spectrum is the leading dimension)
+	int read_plane_zxd(double* p, size_t y){
+		T* temp = (T*) malloc(X() * Z() * sizeof(T));		//allocate space to store the temporary xz plane
+		binary<T>::read_plane_2(temp, y);					//load the plane from disk
+		size_t z, x;
+		for(z = 0; z < Z(); z++){
+			for(x = 0; x < X(); x++){
+				p[x * Z() + z] = (double)temp[z * X() + x];	//copy to the destination frame
+			}
+		}
+		return 0;
+	}
+
 
 	/// Perform baseline correction given a list of baseline points and stores the result in a new BSQ file.
 
@@ -268,7 +303,7 @@ public:
 		for (unsigned long long k =0; k < Y(); k++)
 		{
 			//get the current y slice
-			read_plane_y(c, k);
+			read_plane_xz(c, k);
 
 			//initialize lownum, highnum, low, high
 			ai = w[0];
@@ -369,7 +404,7 @@ public:
 
 		for(unsigned long long j = 0; j < Y(); j++)
 		{
-			read_plane_y(c, j);
+			read_plane_xz(c, j);
 			for(unsigned long long i = 0; i < B; i++)
 			{
 				for(unsigned long long m = 0; m < X(); m++)
@@ -469,7 +504,7 @@ public:
 
 		for ( unsigned long long i = 0; i < Y(); i++)
 		{
-			read_plane_y(p, i);
+			read_plane_xz(p, i);
 			for ( unsigned long long k = 0; k < Z(); k++)
 			{
 				unsigned long long ks = k * X();
@@ -863,7 +898,7 @@ public:
 
 		for (unsigned long long i = 0; i < Y(); i++)			//for each value in Y() (BIP should be X)
 		{
-			read_plane_y(temp, i);							//retrieve an ZX slice, stored in temp
+			read_plane_xz(temp, i);							//retrieve an ZX slice, stored in temp
 			for ( unsigned long long j = 0; j < Z(); j++)	//for each Z() (Y)
 			{
 				for (unsigned long long k = 0; k < X(); k++) //for each band
@@ -933,7 +968,7 @@ public:
 		//for each slice along the y axis
 		for (unsigned long long y = 0; y < Y(); y++)			//Select a page by choosing Y coordinate, Y()
 		{
-			read_plane_y(slice, y);							//retrieve an ZX page, store in "slice"
+			read_plane_xz(slice, y);							//retrieve an ZX page, store in "slice"
 
 			//for each sample along X
 			for (unsigned long long x = 0; x < X(); x++)		//Select a pixel by choosing X coordinate in the page, X()
@@ -1004,7 +1039,7 @@ public:
 
 		double x;											//create a register to store the pixel value
 		for (unsigned long long k = 0; k < Y(); k++){
-			read_plane_y(temp, k);
+			read_plane_xz(temp, k);
 			unsigned long long kx = k * X();
 			for (unsigned long long i = 0; i < X(); i++){
 				if (mask == NULL || mask[kx + i] != 0){
@@ -1025,6 +1060,68 @@ public:
 		return true;
 	}
 
+	int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
+		cudaError_t cudaStat;
+		cublasStatus_t stat;
+		cublasHandle_t handle;
+
+		progress = 0;														//initialize the progress to zero (0)
+		size_t XY = X() * Y();												//calculate the number of elements in a band image
+		size_t XB = X() * Z();
+		size_t B = Z();														//calculate the number of spectral elements
+
+		double* F = (double*)malloc(sizeof(double) * B * X());				//allocate space for the frame that will be pulled from the file
+		double* F_dev;
+		HANDLE_ERROR(cudaMalloc(&F_dev, X() * B * sizeof(double)));			//allocate space for the frame on the GPU
+		double* s_dev;														//declare a device pointer that will store the spectrum on the GPU
+		double* A_dev;														//declare a device pointer that will store the covariance matrix on the GPU
+		double* avg_dev;													//declare a device pointer that will store the average spectrum
+		HANDLE_ERROR(cudaMalloc(&s_dev, B * sizeof(double)));				//allocate space on the CUDA device for a spectrum
+		HANDLE_ERROR(cudaMalloc(&A_dev, B * B * sizeof(double)));			//allocate space on the CUDA device for the covariance matrix
+		HANDLE_ERROR(cudaMemset(A_dev, 0, B * B * sizeof(double)));			//initialize the covariance matrix to zero (0)
+		HANDLE_ERROR(cudaMalloc(&avg_dev, XB * sizeof(double)));				//allocate space on the CUDA device for the average spectrum
+		for(size_t x = 0; x < X(); x++)											//make multiple copies of the average spectrum in order to build a matrix
+			HANDLE_ERROR(cudaMemcpy(&avg_dev[x * B], avg, B * sizeof(double), cudaMemcpyHostToDevice));	
+		//stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1);	//copy the average spectrum to the CUDA device
+
+		double ger_alpha = 1.0/(double)XY;										//scale the outer product by the inverse of the number of samples (mean outer product)
+		double axpy_alpha = -1;													//multiplication factor for the average spectrum (in order to perform a subtraction)
+
+		CUBLAS_HANDLE_ERROR(stat = cublasCreate(&handle));								//create a cuBLAS instance
+		if (stat != CUBLAS_STATUS_SUCCESS) return 1;									//test the cuBLAS instance to make sure it is valid
+
+		else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
+		double beta = 1.0;
+		size_t x, y;
+		for(y = 0; y < Y(); y++){										//for each line
+			read_plane_zxd(F, y);												//read a frame from the file
+			HANDLE_ERROR(cudaMemcpy(F_dev, F, XB * sizeof(double), cudaMemcpyHostToDevice));	//copy the frame to the GPU
+			CUBLAS_HANDLE_ERROR(cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, (int)B, (int)X(), &axpy_alpha, avg_dev, (int)B, &beta, F_dev, (int)B, F_dev, (int)B));//subtract the mean spectrum
+
+			for(x = 0; x < X(); x++)
+				CUBLAS_HANDLE_ERROR(cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, &F_dev[x*B], 1, A_dev, (int)B));			//perform an outer product
+			if(PROGRESS) progress = (double)(y + 1) / Y() * 100;
+		}
+
+		cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, co, (int)B);			//copy the result from the GPU to the CPU
+
+		cudaFree(A_dev);																	//clean up allocated device memory
+		cudaFree(s_dev);
+		cudaFree(avg_dev);
+
+		for(unsigned long long i = 0; i < B; i++){										//copy the upper triangular portion to the lower triangular portion
+			for(unsigned long long j = i+1; j < B; j++){
+				co[B * i + j] = co[B * j + i];
+			}
+		}
+
+		return 0;
+
+
+
+	}
+
+
 	/// Calculate the covariance matrix for all masked pixels in the image.
 
 	/// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
@@ -1032,6 +1129,16 @@ public:
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
 	bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
 		progress = 0;
+
+		int dev_count;
+		cudaGetDeviceCount(&dev_count);									//get the number of CUDA devices
+		cudaDeviceProp prop;
+		cudaGetDeviceProperties(&prop, 0);								//get the property of the first device
+		if(dev_count > 0 && prop.major != 9999){							//if the first device is not an emulator
+			int status = co_matrix_cublas(co, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
+			if(status == 0) return true;									//if the cuBLAS function returned correctly, we're done
+		}																	//otherwise continue using the CPU
+
 		//memory allocation
 		unsigned long long xy = X() * Y();
 		unsigned long long B = Z();
@@ -1328,7 +1435,7 @@ public:
 		c = (T*)malloc( L );										//allocate space for the slice
 
 		for(unsigned long long j = 0; j < Y(); j++){				//for each line
-			read_plane_y(c, j);										//load the line into memory
+			read_plane_xz(c, j);										//load the line into memory
 			for(unsigned long long i = 0; i < B; i++){				//for each band
 				for(unsigned long long m = 0; m < X(); m++){		//for each sample
 					if( mask == NULL && mask[m + j * X()] )			//if the pixel is masked
@@ -1358,7 +1465,7 @@ public:
 		c = (T*)malloc( L );										//allocate space for the slice
 
 		for(unsigned long long j = 0; j < Y(); j++){				//for each line
-			read_plane_y(c, j);										//load the line into memory
+			read_plane_xz(c, j);										//load the line into memory
 			for(unsigned long long i = 0; i < B; i++){				//for each band
 				for(unsigned long long m = 0; m < X(); m++){		//for each sample
 					if( mask == NULL && mask[m + j * X()] )			//if the pixel is masked
diff --git a/stim/envi/bip.h b/stim/envi/bip.h
index 8c1925f..e88feaf 100644
--- a/stim/envi/bip.h
+++ b/stim/envi/bip.h
@@ -8,10 +8,11 @@
 #include <utility>
 
 //CUDA
-#ifdef CUDA_FOUND
-	#include <cuda_runtime.h>
-	#include "cublas_v2.h"
-#endif
+//#ifdef CUDA_FOUND
+#include <stim/cuda/cudatools/error.h>
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+//#endif
 
 namespace stim{
 
@@ -257,7 +258,7 @@ public:
 	}
 
 	//given a Y ,return a ZX slice
-	bool read_plane_y(T * p, unsigned long long y){
+	bool read_plane_y(T * p, size_t y){
 		return binary<T>::read_plane_2(p, y);
 	}
 
@@ -982,16 +983,15 @@ public:
 		free(temp);
 		return true;
 	}
-#ifdef CUDA_FOUND
+//#ifdef CUDA_FOUND
 	/// Calculate the covariance matrix for masked pixels using cuBLAS
 	/// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
-	bool co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
+	int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
 
 		cudaError_t cudaStat;
 		cublasStatus_t stat;
 		cublasHandle_t handle;
 
-		progress = 0;													//initialize the progress to zero (0)
 		unsigned long long XY = X() * Y();									//calculate the number of elements in a band image
 		unsigned long long B = Z();											//calculate the number of spectral elements
 
@@ -1009,10 +1009,9 @@ public:
 		double axpy_alpha = -1;												//multiplication factor for the average spectrum (in order to perform a subtraction)
 
 		stat = cublasCreate(&handle);										//create a cuBLAS instance
-		if (stat != CUBLAS_STATUS_SUCCESS) {								//test the cuBLAS instance to make sure it is valid
-			printf ("CUBLAS initialization failed\n");
-			return EXIT_FAILURE;
-		}
+		if (stat != CUBLAS_STATUS_SUCCESS) return 1;						//test the cuBLAS instance to make sure it is valid
+
+		else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
 		for (unsigned long long xy = 0; xy < XY; xy++){										//for each pixel
 			if (mask == NULL || mask[xy] != 0){
 				pixeld(s, xy);																	//retreive the spectrum at the current xy pixel location
@@ -1036,9 +1035,9 @@ public:
 			}
 		}
 
-		return true;
+		return 0;
 	}
-#endif
+//#endif
 
 	/// Calculate the covariance matrix for all masked pixels in the image with 64-bit floating point precision.
 
@@ -1046,16 +1045,19 @@ public:
 	/// @param avg is a pointer to memory of size B that stores the average spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
 	bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
+		progress = 0;
 
-#ifdef CUDA_FOUND
+//#ifdef CUDA_FOUND
 		int dev_count;
 		cudaGetDeviceCount(&dev_count);									//get the number of CUDA devices
 		cudaDeviceProp prop;
 		cudaGetDeviceProperties(&prop, 0);								//get the property of the first device
-		if(dev_count > 0 && prop.major != 9999)							//if the first device is not an emulator
-			return co_matrix_cublas(co, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
-#endif
-		progress = 0;
+		if(dev_count > 0 && prop.major != 9999){							//if the first device is not an emulator
+			int status = co_matrix_cublas(co, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
+			if(status == 0) return true;									//if the cuBLAS function returned correctly, we're done
+		}																	//otherwise continue using the CPU
+//#endif
+		
 		//memory allocation
 		unsigned long long XY = X() * Y();
 		unsigned long long B = Z();
@@ -1097,10 +1099,10 @@ public:
 	}
 
 
-#ifdef CUDA_FOUND
+//#ifdef CUDA_FOUND
 	/// Calculate the covariance matrix of Noise for masked pixels using cuBLAS
 	/// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
-	bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
+	int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
 
 		cudaError_t cudaStat;
 		cublasStatus_t stat;
@@ -1128,11 +1130,9 @@ public:
 		double ger_alpha = 1.0/(double)XY;									//scale the outer product by the inverse of the number of samples (mean outer product)
 		double axpy_alpha = -1;												//multiplication factor for the average spectrum (in order to perform a subtraction)
 
-		stat = cublasCreate(&handle);										//create a cuBLAS instance
-		if (stat != CUBLAS_STATUS_SUCCESS) {								//test the cuBLAS instance to make sure it is valid
-			printf ("CUBLAS initialization failed\n");
-			return EXIT_FAILURE;
-		}
+		CUBLAS_HANDLE_ERROR(cublasCreate(&handle));							//create a cuBLAS instance
+		if (stat != CUBLAS_STATUS_SUCCESS) return 1;						//test the cuBLAS instance to make sure it is valid
+
 		for (unsigned long long xy = 0; xy < XY; xy++){										//for each pixel
 			if (mask == NULL || mask[xy] != 0){
 				pixeld(s, xy);                                                             //retreive the spectrum at the current xy pixel location
@@ -1165,7 +1165,7 @@ public:
 
 		return true;
 	}
-#endif
+//#endif
 
 	/// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision.
 
@@ -1174,14 +1174,18 @@ public:
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
 	bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
 
-#ifdef CUDA_FOUND
+//#ifdef CUDA_FOUND
 		int dev_count;
 		cudaGetDeviceCount(&dev_count);									//get the number of CUDA devices
 		cudaDeviceProp prop;
 		cudaGetDeviceProperties(&prop, 0);								//get the property of the first device
-		if(dev_count > 0 && prop.major != 9999)							//if the first device is not an emulator
-			return coNoise_matrix_cublas(coN, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
-#endif
+		if(dev_count > 0 && prop.major != 9999){							//if the first device is not an emulator
+			int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
+			if(status == 0) return true;									//if the cuBLAS function returned correctly, we're done
+		}																	//otherwise continue using the CPU
+		//if(dev_count > 0 && prop.major != 9999)							//if the first device is not an emulator
+		//	return coNoise_matrix_cublas(coN, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
+//#endif
 
 
 
--
libgit2 0.21.4