fixed merge conflicts

Pavel Govyadinov
2 parents f7b84fb2 cf5b4c92
Showing 28 changed files with 2063 additions and 621 deletions Show diff stats
matlab/loadAgilent.m
stim/biomodels/cellset.h
stim/cuda/ivote/local_max.cuh
stim/cuda/ivote/re_sample.cuh
stim/cuda/ivote/update_dir_global.cuh → stim/cuda/ivote/update_dir_bb.cuh
stim/cuda/ivote/david_update_dir_global.cuh → stim/cuda/ivote/update_dir_threshold_global.cuh
stim/cuda/ivote/vote_atomic_bb.cuh
stim/cuda/ivote/vote_atomic_shared.cuh
stim/cuda/ivote/vote_shared.cuh
stim/cuda/ivote/vote_threshold_global.cuh
stim/cuda/ivote_atomic.cuh → stim/cuda/ivote_atomic_bb.cuh
stim/envi/agilent_binary.h
stim/envi/bil.h
stim/envi/binary.h
stim/envi/bip.h
stim/envi/bsq.h
stim/envi/envi.h
stim/envi/envi_header.h
stim/envi/hsi.h
stim/gl/gl_texture.h
+function S = loadAgilent(filename)
+%LOADAGILENT  Load the image data stored in an Agilent binary file.
+%   S = loadAgilent(filename) reads the band, sample and line counts from
+%   the file header, then reads the data that follows the header as float32
+%   values and returns it as a [samples x lines x bands] array.
+%   An error is raised if the file cannot be opened.
+
+fid = fopen(filename);          %open the file for reading
+if fid == -1                    %fopen returns -1 when the file cannot be opened
+    error('loadAgilent:fileOpen', 'Unable to open file: %s', filename);
+end
+fseek(fid, 9, 'bof');           %skip past the first 9 bytes of the header
+
+bands = fread(fid, 1, 'int16'); %read the number of bands in the file
+fseek(fid, 13, 'cof');          %skip the next 13 bytes in the header
+
+samples = fread(fid, 1, 'int16');   %read the number of samples (X)
+lines = fread(fid, 1, 'int16');     %read the number of lines (Y)
+
+fseek(fid, 1020, 'bof');            %skip past the entire header
+S = fread(fid, [samples lines*bands], 'float32');   %read all the data
+S = reshape(S, [samples, lines, bands]);            %split the second dimension into lines and bands
+fclose(fid);                    %close the file
+
+
+#ifndef STIM_CELLSET_H
+#define STIM_CELLSET_H
+
+#include <stim/math/vec3.h>
+#include <cstdlib>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace stim{
+
+/// Stores a set of cells loaded from a text table.
+/// The first line of the file names the fields (separated by spaces); every
+/// following non-blank line holds one cell with one numeric value per field.
+class cellset{
+private:
+	static const char delim = ' ';						//delimiter separating the field names in the header
+	size_t nvalues;										//number of values allocated for each cell
+protected:
+	std::vector<double*> cells;							//vector storing field data for each cell (each entry is owned by the set and released with free())
+	std::unordered_map<std::string, size_t> fields;		//unordered map storing field->index information for each field
+	size_t ip[3];										//hard code to position indices (for speed)
+
+	/// Reset the bookkeeping values to those of an empty set
+	void init(){
+		nvalues = 0;
+		ip[0] = ip[1] = ip[2] = 0;
+	}
+
+	/// Release the storage of every cell and forget all fields
+	void clear(){
+		for(size_t c = 0; c < cells.size(); c++)
+			free(cells[c]);
+		cells.clear();
+		fields.clear();
+		init();
+	}
+
+	/// Look up a field index without inserting into the map; a missing field maps to index 0
+	size_t index_or_zero(const std::string& name) const{
+		std::unordered_map<std::string, size_t>::const_iterator iter = fields.find(name);
+		return (iter == fields.end()) ? 0 : iter->second;
+	}
+public:
+	/// Constructor - create an empty cell set
+	cellset(){
+		init();
+	}
+
+	/// Constructor - load a cellset from a file
+	cellset(std::string filename){
+		init();											//initialize an empty cellset
+		load(filename);									//load the cellset from an existing file
+	}
+
+	/// Copy constructor - performs a deep copy so that both sets own their cell storage
+	cellset(const cellset& other) : nvalues(other.nvalues), fields(other.fields){
+		for(size_t d = 0; d < 3; d++) ip[d] = other.ip[d];
+		cells.reserve(other.cells.size());
+		for(size_t c = 0; c < other.cells.size(); c++){
+			double* row = (double*) malloc(sizeof(double) * nvalues);
+			for(size_t f = 0; f < nvalues; f++) row[f] = other.cells[c][f];
+			cells.push_back(row);
+		}
+	}
+
+	/// Assignment - copy-and-swap; the previous contents are released with the temporary
+	cellset& operator=(const cellset& other){
+		if(this != &other){
+			cellset tmp(other);
+			cells.swap(tmp.cells);
+			fields.swap(tmp.fields);
+			nvalues = tmp.nvalues;
+			for(size_t d = 0; d < 3; d++) ip[d] = tmp.ip[d];
+		}
+		return *this;
+	}
+
+	/// Destructor - releases the storage allocated for every cell
+	~cellset(){
+		clear();
+	}
+
+	/// Loads a cellset from a file, replacing any cells that were previously loaded
+	/// If the file cannot be opened, the set is left empty.
+	/// @param filename is the name of the text file to read
+	void load(std::string filename){
+		clear();										//release anything loaded previously
+
+		std::ifstream infile(filename);
+		if(!infile.is_open()) return;					//nothing to load
+
+		std::string header;								//allocate space for the file header
+		std::getline(infile, header);					//get the file header
+		if(!header.empty() && header[header.size() - 1] == '\r')
+			header.erase(header.size() - 1);			//drop the carriage return left by Windows line endings
+
+		// break the header into fields
+		std::stringstream ss(header);					//create a string stream
+		std::string field;								//store a single field name
+		size_t i = 0;									//current field index
+		while (std::getline(ss, field, delim)) {		//split the header into individual fields
+			if(field.empty()) continue;					//repeated delimiters do not correspond to a data column
+			fields.insert(std::pair<std::string, size_t>(field, i));	//associate the header name with the column index
+			i++;										//increment the data index
+		}
+		nvalues = i;									//number of columns (counted directly so duplicate names cannot shrink it)
+
+		//load each cell and all associated fields
+		std::string cell_line;							//string holds all information for a cell
+		while(std::getline(infile, cell_line)){			//for each cell entry
+			if(cell_line.find_first_not_of(" \t\r") == std::string::npos)
+				continue;								//skip blank lines (e.g. a trailing newline)
+			double* row = (double*) calloc(nvalues, sizeof(double));	//zero-filled, so short lines leave 0 rather than garbage
+			std::stringstream fss(cell_line);			//turn the string representing the cell into a stringstream
+			for(size_t f = 0; f < nvalues; f++){		//for each field
+				fss>>row[f];							//load the field
+			}
+			cells.push_back(row);
+		}
+		infile.close();									//close the input file
+
+		ip[0] = index_or_zero("x");						//hard code the position indices for speed
+		ip[1] = index_or_zero("y");						//	this assumes all cells have positions
+		ip[2] = index_or_zero("z");
+	}
+
+	/// Return the value a specified field for a cell
+	/// @param c is the cell index
+	/// @param f is the field
+	/// Throws std::out_of_range if the field does not exist.
+	double value(size_t c, std::string f) const{
+		size_t idx = fields.at(f);						//at() never inserts, unlike operator[]
+		return cells[c][idx];
+	}
+
+	/// Returns true if a field with the given name exists in this cell set
+	bool exists(std::string f) const{
+		return fields.find(f) != fields.end();
+	}
+
+	/// Return the position of cell [i]
+	stim::vec3<double> p(size_t i) const{
+		stim::vec3<double> pos(cells[i][ip[0]], cells[i][ip[1]], cells[i][ip[2]]);
+		return pos;
+	}
+
+	/// Return the number of cells in the set
+	size_t size() const{
+		return cells.size();
+	}
+
+	/// Return the maximum value of a field in this cell set (0 if the set is empty)
+	/// Throws std::out_of_range if the field does not exist.
+	double max(std::string field) const{
+		size_t idx = fields.at(field);					//get the field index
+		size_t ncells = cells.size();					//get the total number of cells
+		if(ncells == 0) return 0;						//no cells: avoid returning an uninitialized value
+		double maxval = cells[0][idx];					//start with the first cell
+		for(size_t c = 1; c < ncells; c++){				//for each remaining cell
+			double val = cells[c][idx];					//get the field value for this cell
+			if(val > maxval) maxval = val;				//keep the larger value
+		}
+		return maxval;
+	}
+
+	/// Return the minimum value of a field in this cell set (0 if the set is empty)
+	/// Throws std::out_of_range if the field does not exist.
+	double min(std::string field) const{
+		size_t idx = fields.at(field);					//get the field index
+		size_t ncells = cells.size();					//get the total number of cells
+		if(ncells == 0) return 0;						//no cells: avoid returning an uninitialized value
+		double minval = cells[0][idx];					//start with the first cell
+		for(size_t c = 1; c < ncells; c++){				//for each remaining cell
+			double val = cells[c][idx];					//get the field value for this cell
+			if(val < minval) minval = val;				//keep the smaller value
+		}
+		return minval;
+	}
+
+
+};		//end class cellset
+};		//end namespace stim
+
+#endif
 \ No newline at end of file
@@ -14,7 +14,7 @@ namespace stim{
  
 			// calculate the 2D coordinates for this current thread.
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
  
 			if(xi >= x || yi >= y)
 				return;
@@ -63,8 +63,10 @@ namespace stim{
 		void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
  
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			/*dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/
+			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
+			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  
 			//call the kernel to find the local maximum.
 			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
+#ifndef STIM_CUDA_RE_SAMPLE_H
+#define STIM_CUDA_RE_SAMPLE_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/templates/gaussian_blur.cuh>
+
+namespace stim{
+	namespace cuda{
+
+		/// Kernel that spreads a down-sampled image over the full-resolution grid.
+		/// Pixels whose x and y coordinates are both multiples of the sampling stride
+		/// receive the corresponding down-sampled value; every other pixel is set to zero.
+		/// @param gpuI is the full-resolution output image (x * y elements)
+		/// @param gpuI0 is the down-sampled input image (row length is ceil(x / stride))
+		/// @param resize is the scale factor; the sampling stride is 1/resize truncated to an integer
+		/// @param x is the width of the output image
+		/// @param y is the height of the output image
+		template<typename T>
+		__global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
+
+			unsigned int sigma_ds = 1/resize;								//sampling stride
+			if(sigma_ds == 0) sigma_ds = 1;									//resize > 1 truncates to zero; avoid dividing by zero below
+			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));	//row length of the down-sampled image
+
+			// calculate the 2D coordinates for this current thread.
+			unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			if(xi >= x || yi >= y) return;									//outside of the image
+
+			// convert 2D coordinates to 1D
+			unsigned int i = yi * x + xi;
+
+			//every output pixel is written exactly once: either a sample or zero
+			if(xi % sigma_ds == 0 && yi % sigma_ds == 0)
+				gpuI[i] = gpuI0[(yi/sigma_ds)*x_ds + xi/sigma_ds];
+			else
+				gpuI[i] = 0;
+		}
+
+
+		/// Re-samples a down-sampled 2D image stored on the GPU onto the full-resolution grid
+		/// @param gpuI is the full-resolution output image on the GPU (x * y elements)
+		/// @param gpuI0 is the down-sampled input image on the GPU
+		/// @param resize is the scale factor passed through to the kernel
+		/// @param x is the width of the output image
+		/// @param y is the height of the output image
+		template<typename T>
+		void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
+
+			//one block row per image line; enough blocks along x to cover the width
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+
+			//resample the image (instantiate the kernel for T rather than a hard-coded float)
+			cuda_re_sample<T> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
+
+		}
+
+		/// Re-samples a down-sampled 2D image stored on the CPU onto the full-resolution grid
+		/// @param out receives the full-resolution image (x * y elements, host memory)
+		/// @param in is the down-sampled image (host memory); its size is computed below from x, y and resize
+		/// @param resize is the scale factor; the sampling stride is 1/resize truncated to an integer
+		/// @param x is the width of the output image
+		/// @param y is the height of the output image
+		template<typename T>
+		void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){
+
+			//get the number of pixels and bytes in the full-resolution image
+			unsigned int pixels = x*y;
+			unsigned int bytes = sizeof(T) * pixels;
+
+			//get the size of the down-sampled image
+			unsigned int sigma_ds = 1/resize;
+			if(sigma_ds == 0) sigma_ds = 1;				//resize > 1 truncates to zero; avoid dividing by zero
+			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
+			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
+			unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
+
+			//allocate space on the GPU for the down-sampled input image
+			T* gpuI0;
+			cudaMalloc(&gpuI0, bytes_ds);
+
+			//copy the input image data to the GPU
+			cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice);
+
+			//allocate space on the GPU for the full-resolution output image
+			T* gpuI;
+			cudaMalloc(&gpuI, bytes);
+
+			//run the GPU-based version of the algorithm
+			gpu_re_sample<T>(gpuI, gpuI0, resize, x, y);
+
+			//copy the full-resolution result back to the CPU
+			cudaMemcpy(out, gpuI, bytes, cudaMemcpyDeviceToHost);
+
+			cudaFree(gpuI0);
+			cudaFree(gpuI);
+		}
+	
+	}
+}
+
+#endif
 \ No newline at end of file
-#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
-#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
+#ifndef STIM_CUDA_UPDATE_DIR_BB_H
+#define STIM_CUDA_UPDATE_DIR_BB_H
  
 # include <iostream>
 # include <cuda.h>
@@ -7,8 +7,7 @@
 #include <stim/cuda/sharedmem.cuh>
 #include <stim/visualization/aabb2.h>
 #include <stim/visualization/colormap.h>
-#include <math.h>
-#include "cpyToshare.cuh" 
+#include <math.h> 
  
 //#define RMAX_TEST	8
  
@@ -76,68 +75,6 @@ namespace stim{
 			gpuDir[i] = atan2((T)max_dy, (T)max_dx);
 		}
  
-		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
-		template<typename T>
-		__global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
-
-			
-			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-
-			if(xi >= x || yi >= y) return;													//if the index is outside of the image, terminate the kernel
-
-			int i = yi * x + xi;												// convert 2D coordinates to 1D
-			
-			float theta = gpuGrad[2*i];											// calculate the voting direction based on the grtadient direction - global memory fetch			
-			gpuDir[i] = 0;														//initialize the vote direction to zero			
-			float max = 0;														// define a local variable to maximum value of the vote image in the voting area for this voter
-			int id_x = 0;														// define two local variables for the x and y position of the maximum
-			int id_y = 0;
-			
-			int x_table = 2*rmax +1;											// compute the size of window which will be checked for finding the voting area for this voter
-			int rmax_sq = rmax * rmax;
-			int tx_rmax = threadIdx.x + rmax;
-			float atan_angle;
-			float vote_c;
-			int xidx, yidx, yr_sq, xr_sq;
-			for(int yr = -rmax; yr <= rmax; yr++){
-				yidx = yi + yr;													//compute the index into the image
-				if (yidx >= 0 && yidx < y){									//if the current y-index is inside the image
-					yr_sq = yr * yr;											//compute the square of yr, to save time later
-					for(int xr = -rmax; xr <= rmax; xr++){
-						xidx = xi + xr;
-						if(xidx >= 0 && xidx < x){
-							xr_sq = xr * xr;
-							unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
-
-							// calculate the angle between the voter and the current pixel in x and y directions
-							atan_angle = gpuTable[ind_t];
-							//atan_angle = atan2((T)yr, (T)xr);
-											
-							// check if the current pixel is located in the voting area of this voter.
-							if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) <phi)){
-								
-								vote_c = gpuVote[yidx * x + xidx];				// find the vote value for the current counter
-							// compare the vote value of this pixel with the max value to find the maxima and its index.
-								if  (vote_c>max) {
-
-									max = vote_c;
-									id_x =  xr;
-									id_y =  yr;
-								}
-							}
-						}
-					}
-				}
-			}
-							
-			unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
-			float new_angle = gpuTable[ind_m];
-
-			if(xi < x && yi < y)
-				gpuDir[i] = new_angle;
-		}										//end kernel
  
  
 		// this kernel updates the gradient direction by the calculated voting direction.
@@ -168,9 +105,7 @@ namespace stim{
 			HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );	
  
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-			//dim3 threads(min(x, max_threads), 1);
-			//dim3 blocks(x/threads.x, y);
-
+			
 			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
 			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  
@@ -188,12 +123,12 @@ namespace stim{
  
 			//call the kernel to calculate the new voting direction
 			cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-			stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
+			//stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
  
 			//exit(0);
  
-			threads = dim3( sqrt(max_threads), sqrt(max_threads) );
-			blocks = dim3(x/threads.x + 1, y/threads.y + 1);
+			//threads = dim3( sqrt(max_threads), sqrt(max_threads) );
+			//blocks = dim3(x/threads.x + 1, y/threads.y + 1);
  
 			//call the kernel to update the gradient direction
 			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
-#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
-#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
+#ifndef STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H
+#define STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H
  
 # include <iostream>
 # include <cuda.h>
 #include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
-#include <math.h>
-#include "cpyToshare.cuh" 
-
-#define RMAX_TEST	8
+#include "cpyToshare.cuh"   
  
 namespace stim{
 	namespace cuda{
  
 		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
 		template<typename T>
-		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax,  int x,  int y){
-			extern __shared__ T atan2_table[];
-			
-			//calculate the start point for this block
-			//int bxi = blockIdx.x * blockDim.x;
-
-			stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x);
+		__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x,  int y){
  
-			__syncthreads();
  
-			// calculate the 2D coordinates for this current thread.
-			//int xi = bxi + threadIdx.x;
+			
+			// calculate the coordinate for this current thread.
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-			if(xi >= x || yi >= y) return;													//if the index is outside of the image, terminate the kernel
-
-			int i = yi * x + xi;												// convert 2D coordinates to 1D
+			// calculate the voting direction based on the grtadient direction
+			float theta = gpuTh[3*xi];
  
-			float theta = gpuGrad[2*i];											// calculate the voting direction based on the grtadient direction - global memory fetch			
-			gpuDir[i] = 0;														//initialize the vote direction to zero			
-			float max = 0;														// define a local variable to maximum value of the vote image in the voting area for this voter
-			int id_x = 0;														// define two local variables for the x and y position of the maximum
-			int id_y = 0;
+			//calculate the position and x, y coordinations of this voter in the original image
+			unsigned int i_v = gpuTh[3*xi+2];
+			unsigned int y_v = i_v/x;
+			unsigned int x_v = i_v - (y_v*x);
  
-			int x_table = 2*rmax +1;											// compute the size of window which will be checked for finding the voting area for this voter
+			//initialize the vote direction to zero
+			gpuDir[xi] = 0;
+
+			// define a local variable to maximum value of the vote image in the voting area for this voter
+			float max = 0;
+
+			// define two local variables for the x and y coordinations where the maximum happened
+			int id_x = 0;
+			int id_y = 0;
+
+			// compute the size of window which will be checked for finding the voting area for this voter
+			int x_table = 2*rmax +1;
 			int rmax_sq = rmax * rmax;
 			int tx_rmax = threadIdx.x + rmax;
-			float atan_angle;
-			float vote_c;
-			unsigned int ind_t;
-			for(int yr = -rmax; yr <= rmax; yr++){					//for each counter in the y direction
-				if (yi+yr >= 0 && yi + yr < y){									//if the counter exists (we aren't looking outside of the image)
-					for(int xr = -rmax; xr <= rmax; xr++){					//for each counter in the x direction
-						if((xr * xr + yr *yr)< rmax_sq){								//if the counter is within range of the voter
-
-							ind_t = (rmax - yr) * x_table + rmax - xr;		//calculate the index to the atan2 table							
-							atan_angle = atan2_table[ind_t];								//retrieve the direction vector from the table						
-
-							//atan_angle = atan2((float)yr, (float)xr);
-							
-							if (abs(atan_angle - theta) <phi){							// check if the current pixel is located in the voting angle of this voter.				
-								vote_c = gpuVote[(yi+yr)*x + (xi+xr)];			// find the vote value for the current counter						
-								if(vote_c>max) {								// compare the vote value of this pixel with the max value to find the maxima and its index.
-									max = vote_c;
-									id_x =  xr;
-									id_y =  yr;
-								}
+			if(xi < th_size){
+				
+				for(int yr = -rmax; yr <= rmax; yr++){
+					
+					for(int xr = -rmax; xr <= rmax; xr++){
+
+						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+						// find the angle between the voter and the current pixel in x and y directions
+						float atan_angle = gpuTable[ind_t];
+										
+						// check if the current pixel is located in the voting area of this voter.
+						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+							// find the vote value for the current counter
+							float vote_c = gpuVote[(y_v+yr)*x + (x_v+xr)];
+							// compare the vote value of this pixel with the max value to find the maxima and its index.
+							if  (vote_c>max) {
+
+								max = vote_c;
+								id_x =  xr;
+								id_y =  yr;
 							}
 						}
 					}
 				}
-			}
+			
  
-			unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
-			float new_angle = gpuTable[ind_m];
+				unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+				float new_angle = gpuTable[ind_m];
+				gpuDir[xi] = new_angle;
+			}
  
-			if(xi < x && yi < y)
-				gpuDir[i] = new_angle;
-		}										//end kernel
+		}
  
 		// this kernel updates the gradient direction by the calculated voting direction.
 		template<typename T>
-		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
+		__global__ void cuda_update_grad(T* gpuTh, T* gpuDir, int th_size, int x, int y){
  
-			// calculate the 2D coordinates for this current thread.
+			// calculate the coordinate for this current thread.
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-		
-			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
  
+		
 			//update the gradient image with the vote direction
-			gpuGrad[2*i] = gpuDir[i];
+			gpuTh[3*xi] = gpuDir[xi];
 		}
  
 		template<typename T>
-		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-			
+		void gpu_update_dir(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
  
 			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
+			unsigned int bytes_th = th_size* sizeof(T);
  
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-
-			dim3 threads(sqrt(max_threads), sqrt(max_threads));
-			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
-
-			
+			dim3 threads(max_threads);
+			dim3 blocks(th_size/threads.x+1);
  
 			// allocate space on the GPU for the updated vote direction
 			T* gpuDir;
-			cudaMalloc(&gpuDir, bytes);	
-
-			size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2);
-			std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl;
+			cudaMalloc(&gpuDir, bytes_th);	
  
 			//call the kernel to calculate the new voting direction
-			cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+			cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
  
 			//call the kernel to update the gradient direction
-			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			cuda_update_grad <<< blocks, threads >>>(gpuTh, gpuDir, th_size, x , y);
  
 			//free allocated memory
 			cudaFree(gpuDir);
+#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
+#define STIM_CUDA_VOTE_ATOMIC_BB_H
+
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <stim/visualization/aabb2.h>
+#include <stim/visualization/colormap.h>
+#include <math.h>
+
+namespace stim{
+	namespace cuda{
+
+		// This kernel performs the voting step: each thread is one voter (one pixel of the gradient image)
+		// and atomically adds its gradient magnitude to every pixel that lies inside its voting area.
+		// @param gpuVote is the vote image that is accumulated into (x * y elements)
+		// @param gpuGrad stores two values per pixel: the voting direction (2*i) and the magnitude (2*i + 1)
+		// @param gpuTable is a (2*rmax+1) x (2*rmax+1) table of angles indexed by pixel offset
+		// @param phi is the angular half-width of the voting area
+		// @param rmax is the radius of the voting area
+		// @param x, y are the dimensions of the image
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			//copy the angle table into dynamic shared memory (size is specified at kernel launch)
+			extern __shared__ T S[];
+			T* shared_atan = S;
+			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
+			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
+
+			//make sure the whole table is in shared memory before any thread reads it;
+			//	this barrier must come before the early return below so that every thread in the block reaches it
+			__syncthreads();
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			
+			if(xi >= x || yi >= y) return;								//if the index is outside of the image, terminate the thread
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// fetch the voting direction of this voter
+			float theta = gpuGrad[2*i];
+			// fetch the amount of vote for the voter
+			float mag = gpuGrad[2*i + 1];
+			
+
+			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
+			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));			//insert the point at distance rmax along the voting direction
+			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
+			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
+			
+			// width of the angle table and the squared radius of the voting area
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			
+			int lut_i;													//index into the angle table
+			T dx_sq, dy_sq;												//squared offsets from the voter
+
+			bb.trim_low(0, 0);											//make sure the bounding box doesn't go outside the image
+			bb.trim_high(x-1, y-1);
+
+			int by, bx;													//coordinates of the current point in the bounding box
+			int dx, dy;													//offset of the current point from the voter
+			
+			unsigned int ind_g;											//1D index of the current point in the vote image
+			T alpha;													//table angle for the current offset
+			
+			for(by = bb.low[1]; by <= bb.high[1]; by++){				//for each element in the bounding box
+				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
+				dy_sq = dy * dy;
+				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
+					dx = bx - xi;										//calculate the x coordinate of the current point relative to xi
+					dx_sq = dx * dx;
+					lut_i = (rmax - dy) * x_table + rmax - dx;
+					alpha = shared_atan[lut_i];
+					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){	//inside the radius and within phi of the voting direction
+						ind_g = (by)*x + (bx);
+						atomicAdd(&gpuVote[ind_g], mag);				//several voters may hit the same pixel, so accumulate atomically
+					
+					}
+				}
+			}			
+			
+		}
+	
+
+		// Launches cuda_vote on data that already resides on the GPU.
+		// The angle table is staged in dynamic shared memory, so the program exits with an error
+		// if the table does not fit into the shared memory available per block.
+		// @param gpuVote is the vote image that is accumulated into (x * y elements)
+		// @param gpuGrad stores the voting direction and magnitude for each pixel
+		// @param gpuTable is the (2*rmax+1) x (2*rmax+1) table of angles
+		// @param phi is the angular half-width of the voting area
+		// @param rmax is the radius of the voting area
+		// @param x, y are the dimensions of the image
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+							
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads( sqrt(max_threads), sqrt(max_threads) );			//square thread blocks
+			dim3 blocks(x/threads.x + 1, y/threads.y + 1);					//enough blocks to cover the image (the kernel discards out-of-range threads)
+			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
+			size_t shared_mem_req = table_bytes;// + template_bytes;
+			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;		
+			size_t shared_mem = stim::sharedMemPerBlock();
+			if(shared_mem_req > shared_mem){
+				std::cout<<"Error: insufficient shared memory for this implementation of cuda_vote()."<<std::endl;
+				exit(1);
+			}
+
+			//call the kernel to do the voting
+			cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		// Computes the vote image for data stored on the CPU: uploads the gradient image and angle table,
+		// runs the GPU voting, and downloads the resulting vote image.
+		// @param cpuVote receives the vote image (x * y elements)
+		// @param cpuGrad is the gradient image (2 values per pixel: direction and magnitude)
+		// @param cpuTable is the (2*rmax+1) x (2*rmax+1) table of angles
+		// @param phi is the angular half-width of the voting area
+		// @param rmax is the radius of the voting area
+		// @param x, y are the dimensions of the image
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+			//the kernel only accumulates into the vote image (atomicAdd), so it has to start at zero
+			HANDLE_ERROR(cudaMemset(gpuVote, 0, bytes));
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the Vote Data back to the CPU
+			HANDLE_ERROR(cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuTable));
+			HANDLE_ERROR(cudaFree(gpuVote));
+			HANDLE_ERROR(cudaFree(gpuGrad));
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
@@ -5,7 +5,7 @@
 # include <cuda.h>
 #include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
-#include "cpyToshare.cuh"
+
 //#include "writebackshared.cuh"
 namespace stim{
 	namespace cuda{
+#ifndef STIM_CUDA_VOTE_SHARED_H
+#define STIM_CUDA_VOTE_SHARED_H
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		// This kernel computes the vote for each pixel: it sums the gradient magnitudes of all voters within rmax whose voting direction is within phi of the table angle for their offset. Each thread handles one pixel and writes gpuVote[i].
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ float s_grad[];	//NOTE(review): declared as float while the launcher sizes the buffer with sizeof(T) - confirm T is always float
+
+			//calculate the x coordinate of the first pixel handled by this block
+			int bxi = blockIdx.x * blockDim.x;
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+						
+			// define a local variable to sum the votes from the voters
+			float sum = 0;
+			
+			//calculate the width (in pixels) of the shared memory block: the block width plus rmax on each side
+			int swidth = 2 * rmax + blockDim.x;
+			
+			// compute the width of the atan2 table, the squared radius, and this thread's offset into the shared row
+			int x_table = 2*rmax +1;
+			int rmax_sq = rmax * rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			int bxs = bxi - rmax;	//x coordinate where the shared row starts (rmax to the left of the block)
+			
+			//for every line (along y) within rmax of this pixel
+			for(int yr = -rmax; yr <= rmax; yr++){
+				if (yi+yr<y && yi+yr>=0){
+					//copy the portion of the image row necessary for this block to shared memory (the barriers keep the copy and the reads below from overlapping)
+					__syncthreads();
+					cpyG2S1D2ch<float>(s_grad, gpuGrad, bxs, yi + yr , 2*swidth, 1, threadIdx, blockDim, x, y);	//presumably copies one 2-channel row segment starting at bxs - helper is defined elsewhere, confirm
+					__syncthreads();
+				
+					if(xi < x && yi < y){
+
+						for(int xr = -rmax; xr <= rmax; xr++){
+					
+								//find the location of this voter in the atan2 table
+								int id_t = (yr + rmax) * x_table + xr + rmax;
+
+								// look up the table angle for the offset between the pixel and the current voter
+								float atan_angle = gpuTable[id_t];
+												
+								// read the voting direction and gradient magnitude of the current voter from shared memory (two values per pixel)
+								int idx_share = xr + tx_rmax ;
+								float theta = s_grad[idx_share*2];
+								float mag = s_grad[idx_share*2 + 1];
+							
+
+								// check if the current voter is within rmax of this pixel and is voting towards it (within phi).
+								if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+									sum += mag;		
+
+								}
+						}
+				
+					}
+				}
+			}
+			if(xi < x && yi < y)	//only threads that map to a pixel inside the image write a result
+				gpuVote[i] = sum;
+			
+		}
+
+		// Launches cuda_vote for an image that already resides on the GPU.
+		// Each block is one image row tall and as wide as the device allows; the dynamic shared
+		// memory holds one two-channel row segment that extends rmax pixels past each side of the block.
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//blocks are a single row of threads
+			unsigned int block_width = stim::maxThreadsPerBlock();
+			dim3 threads(block_width, 1);
+
+			//number of blocks needed along x, rounding up when the width is not a multiple of the block width
+			unsigned int blocks_x = x / threads.x;
+			if(x % threads.x != 0)
+				blocks_x = blocks_x + 1;
+			dim3 blocks(blocks_x, y);
+
+			//size of the shared memory: (block width + 2 * rmax) pixels, two values per pixel
+			unsigned int row_pixels = 2*rmax + threads.x;
+			unsigned int share_bytes = row_pixels*2*sizeof(T);
+
+			//run the voting kernel
+			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		// Computes the vote image for data stored on the CPU: the gradient image and the atan2 table
+		// are uploaded, gpu_vote is run, and the vote image is downloaded into cpuVote.
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//sizes (in bytes) of the vote image, the two-channel gradient image, and the atan2 table
+			unsigned int vote_bytes = x * y * sizeof(T);
+			unsigned int grad_bytes = vote_bytes*2;
+			unsigned int table_bytes = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//device buffer for the vote image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, vote_bytes);
+
+			//device buffer for the gradient image, filled from the host copy
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, grad_bytes));
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, grad_bytes, cudaMemcpyHostToDevice));
+
+			//device buffer for the atan2 table, filled from the host copy
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, table_bytes));
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, table_bytes, cudaMemcpyHostToDevice));
+
+			//compute the votes on the GPU
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//download the vote image
+			cudaMemcpy(cpuVote, gpuVote, vote_bytes, cudaMemcpyDeviceToHost) ;
+
+			//release the device buffers
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
+#define STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
+# include <iostream>
+# include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include "cpyToshare.cuh"
+
+namespace stim{
+	namespace cuda{
+
+		/// Voting kernel: each thread handles one voter stored in gpuTh and atomically adds that voter's
+		/// gradient magnitude to every pixel of gpuVote that is located inside the voter's voting area.
+		/// @param gpuVote is the vote image (indexed as [y * x + x]) that accumulates the magnitudes
+		/// @param gpuTh stores three values per voter: [voting direction, gradient magnitude, 1D pixel index]
+		/// @param gpuTable is a (2*rmax+1) x (2*rmax+1) table of angles indexed by the offset from the voter
+		/// @param phi is the maximum allowed difference between the table angle and the voting direction
+		/// @param rmax is the radius of the voting area
+		/// @param th_size is the number of voters stored in gpuTh
+		/// @param x, y are the dimensions of the image
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){
+
+			// calculate the index of the voter processed by this thread
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+
+			// the grid may contain more threads than voters: leave before gpuTh is read out of bounds
+			if(xi >= th_size) return;
+
+			// calculate the voting direction based on the gradient direction
+			float theta = gpuTh[3*xi];
+			// find the gradient magnitude for the current voter
+			float mag = gpuTh[3*xi + 1];
+			// calculate the position and x, y coordinates of this voter in the original image
+			// (signed integers are used so that the boundary tests below are meaningful)
+			int i_v = (int)gpuTh[3*xi + 2];
+			int y_v = i_v / x;
+			int x_v = i_v - (y_v * x);
+
+			// compute the size of the window which will be checked around this voter
+			int x_table = 2*rmax + 1;
+			int rmax_sq = rmax * rmax;
+
+			for(int yr = -rmax; yr <= rmax; yr++){
+				int y_p = y_v + yr;									// row of the current pixel
+				if(y_p < 0 || y_p >= y) continue;					// skip rows outside of the image
+
+				for(int xr = -rmax; xr <= rmax; xr++){
+					int x_p = x_v + xr;								// column of the current pixel
+					if(x_p < 0 || x_p >= x) continue;				// skip columns outside of the image
+
+					// find the location of the current pixel in the atan2 table
+					unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+					// look up the angle between the voter and the current pixel
+					float atan_angle = gpuTable[ind_t];
+
+					// check if the current pixel is located in the voting area of this voter
+					if (((xr * xr + yr * yr) < rmax_sq) && (abs(atan_angle - theta) < phi)){
+						// calculate the 1D index for the current pixel in global memory
+						unsigned int ind_g = y_p * x + x_p;
+						// several voters can vote for the same pixel, so the update has to be atomic
+						atomicAdd(&gpuVote[ind_g], mag);
+					}
+				}
+			}
+		}
+
+		/// Host-side launcher for the threshold-based cuda_vote kernel (one thread per voter).
+		/// @param gpuVote, gpuTh, gpuTable are device pointers that are forwarded unchanged to the kernel
+		/// @param phi, rmax are the voting parameters forwarded to the kernel
+		/// @param th_size is the number of voters; it determines the number of blocks that are launched
+		/// @param x, y are the image dimensions forwarded to the kernel
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
+
+			//use the largest 1D block that the device supports
+			const unsigned int block_size = stim::maxThreadsPerBlock();
+			dim3 threads(block_size);
+
+			//one extra block guarantees that all th_size voters are covered
+			const unsigned int block_count = th_size / threads.x + 1;
+			dim3 blocks(block_count);
+
+			//call the kernel to do the voting
+			cuda_vote <<< blocks, threads >>>(gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x, y);
+		}
+		/// Host wrapper: uploads cpuGrad (2 * x * y values) and cpuTable ((2*rmax+1)^2 values) to the GPU, calls gpu_vote, and copies x * y vote values back into cpuVote.
+		/// NOTE(review): the gpu_vote defined in this file takes an additional th_size argument; the 7-argument call below does not match that signature - confirm which gpu_vote is intended.
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+			// NOTE(review): gpuVote is allocated below but never zero-initialized; if the atomicAdd-based kernel in this file is the one launched, it would accumulate into uninitialized memory - verify.
+			//calculate the number of bytes in the vote image (x * y values of type T)
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table ((2*rmax+1) x (2*rmax+1) values of type T)
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image (this call is not wrapped in HANDLE_ERROR)
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);		
+
+			//allocate space on the GPU for the input Gradient image (twice the size of the vote image)
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+						
+			//call the GPU version of the vote calculation function (see the review note on the argument count above)
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+							
+			//copy the Vote Data back to the CPU (this call is not wrapped in HANDLE_ERROR)
+			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free the device memory allocated by this function
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
-#ifndef STIM_CUDA_IVOTE_ATOMIC_H
-#define STIM_CUDA_IVOTE_ATOMIC_H
+#ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H
+#define STIM_CUDA_IVOTE_ATOMIC_BB_H
  
 #include <stim/cuda/ivote/down_sample.cuh>
 #include <stim/cuda/ivote/local_max.cuh>
-#include <stim/cuda/ivote/update_dir_global.cuh>
-//#include <stim/cuda/ivote/vote_shared_32-32.cuh>
-#include <stim/cuda/ivote/vote_atomic_shared.cuh>
-//#include <stim/cuda/ivote/re_sample.cuh>
+#include <stim/cuda/ivote/update_dir_bb.cuh>
+#include <stim/cuda/ivote/vote_atomic_bb.cuh>
+
 namespace stim{
 	namespace cuda{
  
@@ -35,26 +35,28 @@ public:
 	void alloc(){
 		ptr = (T*) malloc(bytes());
 	}
-	void alloc(short x, short y, short z){
+	void alloc(size_t x, size_t y, size_t z){
 		R[0] = x;
 		R[1] = y;
 		R[2] = z;
 		alloc();
 	}
  
+	/// Create a deep copy of an agilent_binary object
 	void deep_copy(agilent_binary<T>* dst, const agilent_binary<T>* src){
 		dst->alloc(src->R[0], src->R[1], src->R[2]);			//allocate memory
 		memcpy(dst->ptr, src->ptr, bytes());					//copy the data
 		memcpy(dst->Z, src->Z, sizeof(double) * 2);				//copy the data z range
 	}
  
+	/// Default constructor, sets the resolution to zero and the data pointer to NULL
 	agilent_binary(){
-		memset(R, 0, sizeof(short) * 3);				//set the resolution to zero
+		memset(R, 0, sizeof(size_t) * 3);				//set the resolution to zero
 		ptr = NULL;
 	}
  
 	/// Constructor with resolution
-	agilent_binary(short x, short y, short z){
+	agilent_binary(size_t x, size_t y, size_t z){
 		alloc(x, y, z);
 	}
  
@@ -109,13 +111,11 @@ public:
  
 		char zero = 0;
 		for(size_t i = 0; i < 9; i++) outfile.write(&zero, 1);		//write 9 zeros
-		outfile.write((char*)&R[0], 2);
+		outfile.write((char*)&R[2], 2);
 		for(size_t i = 0; i < 13; i++) outfile.write(&zero, 1);		//write 13 zeros
+		outfile.write((char*)&R[0], 2);
 		outfile.write((char*)&R[1], 2);
-		outfile.write((char*)&R[2], 2);
 		for(size_t i = 0; i < 992; i++) outfile.write(&zero, 1);		//write 992 zeros
-		//char zerovec[1020];
-		//outfile.write((char*)zerovec, 1020);
  
 		size_t b = bytes();
 		outfile.write((char*)ptr, b);							//write the data to the output file
@@ -149,7 +149,7 @@ public:
  
 #ifdef CUDA_FOUND
 	/// Perform an FFT and return a binary file with bands in the specified range
-	agilent_binary<T> fft(float band_min, float band_max){
+	agilent_binary<T> fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2){
 		auto total_start = std::chrono::high_resolution_clock::now();
  
 		auto start = std::chrono::high_resolution_clock::now();
@@ -177,8 +177,8 @@ public:
  
 		start = std::chrono::high_resolution_clock::now();
 		int N[1];					//create an array with the interferogram size (required for cuFFT input)
-		N[0] = R[2];				//set the only array value to the interferogram size
-		if(cufftPlanMany(&plan, 1, N, NULL, 1, R[2], NULL, 1, R[2], CUFFT_R2C, batch) != CUFFT_SUCCESS){
+		N[0] = (int)R[2];				//set the only array value to the interferogram size
+		if(cufftPlanMany(&plan, 1, N, NULL, 1, (int)R[2], NULL, 1, (int)R[2], CUFFT_R2C, (int)batch) != CUFFT_SUCCESS){
 			std::cout<<"cuFFT Error: unable to create 1D plan."<<std::endl;
 			exit(1);
 		}
@@ -199,12 +199,13 @@ public:
 		std::complex<T>* cpu_fft = (std::complex<T>*) malloc( R[0] * R[1] * (R[2]/2+1) * sizeof(std::complex<T>) );
 		HANDLE_ERROR(cudaMemcpy(cpu_fft, gpu_fft, R[0] * R[1] * (R[2]/2+1) * sizeof(cufftComplex), cudaMemcpyDeviceToHost));	//copy data from the host to the device
  
-		double int_delta = 0.00012656;									//interferogram sample spacing in centimeters
+		//double int_delta = 0.00012656;									//interferogram sample spacing in centimeters
+		double int_delta = (1.0 / ELWN) * ((double)UDR / 2.0);			//calculate the interferogram spacing
 		double int_length = int_delta * R[2];							//interferogram length in centimeters
 		double fft_delta = 1/int_length;								//spectrum spacing (in inverse centimeters, wavenumber
  
-		size_t start_i = std::ceil(band_min / fft_delta);				//calculate the first band to store
-		size_t size_i = std::floor(band_max / fft_delta) - start_i;		//calculate the number of bands to store
+		size_t start_i = (size_t)std::ceil(band_min / fft_delta);				//calculate the first band to store
+		size_t size_i = (size_t)std::floor(band_max / fft_delta) - start_i;		//calculate the number of bands to store
 		size_t end_i = start_i + size_i;								//last band number
 		agilent_binary<T> result(R[0], R[1], size_i);
 		result.Z[0] = start_i * fft_delta;								//set the range for the FFT result
@@ -1309,6 +1309,66 @@ public:
  
 	}
  
+	/// Multiply every sample of the selected pixels by a constant and write the result to a new binary file.
+
+	/// @param outname is the name of the binary output file
+	/// @param v is the value that each selected sample is multiplied by (after casting to T)
+	/// @param mask is an optional array indexed as [x + y * X()]; a pixel is processed when mask is NULL or its entry is non-zero
+	/// @param PROGRESS enables updates of the progress member (percentage of lines processed)
+	/// @return false if the slice buffer cannot be allocated, true otherwise
+	bool multiply(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){
+		unsigned long long B = Z();									//calculate the number of bands
+		unsigned long long ZX = Z() * X();							//calculate the number of samples in a ZX slice
+		unsigned long long L = ZX * sizeof(T);						//calculate the number of bytes in a ZX slice
+
+		T * c = (T*)malloc( L );									//allocate space for the current ZX slice
+		if(c == NULL) return false;									//nothing can be processed without the slice buffer
+
+		std::ofstream target(outname.c_str(), std::ios::binary);	//open the target binary file
+
+		for(unsigned long long j = 0; j < Y(); j++){				//for each line
+			read_plane_y(c, j);										//load the line into memory
+			for(unsigned long long i = 0; i < B; i++){				//for each band
+				for(unsigned long long m = 0; m < X(); m++){		//for each sample
+					//a NULL mask selects every pixel; the NULL test must short-circuit before the mask is indexed
+					if( mask == NULL || mask[m + j * X()] )
+						c[m + i * X()] *= (T)v;
+				}
+			}
+			target.write(reinterpret_cast<const char*>(c), L);		//write the processed slice into the destination
+
+			if(PROGRESS) progress = (double)(j+1) / Y() * 100;		//update the progress
+		}
+
+		free(c);													//free the slice memory
+		target.close();												//close the output file
+		return true;
+	}
+
+	/// Add a constant to every sample of the selected pixels and write the result to a new binary file.
+
+	/// @param outname is the name of the binary output file
+	/// @param v is the value that is added to each selected sample (after casting to T)
+	/// @param mask is an optional array indexed as [x + y * X()]; a pixel is processed when mask is NULL or its entry is non-zero
+	/// @param PROGRESS enables updates of the progress member (percentage of lines processed)
+	/// @return false if the slice buffer cannot be allocated, true otherwise
+	bool add(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){
+		unsigned long long B = Z();									//calculate the number of bands
+		unsigned long long ZX = Z() * X();							//calculate the number of samples in a ZX slice
+		unsigned long long L = ZX * sizeof(T);						//calculate the number of bytes in a ZX slice
+
+		T * c = (T*)malloc( L );									//allocate space for the current ZX slice
+		if(c == NULL) return false;									//nothing can be processed without the slice buffer
+
+		std::ofstream target(outname.c_str(), std::ios::binary);	//open the target binary file
+
+		for(unsigned long long j = 0; j < Y(); j++){				//for each line
+			read_plane_y(c, j);										//load the line into memory
+			for(unsigned long long i = 0; i < B; i++){				//for each band
+				for(unsigned long long m = 0; m < X(); m++){		//for each sample
+					//a NULL mask selects every pixel; the NULL test must short-circuit before the mask is indexed
+					if( mask == NULL || mask[m + j * X()] )
+						c[m + i * X()] += (T)v;
+				}
+			}
+			target.write(reinterpret_cast<const char*>(c), L);		//write the processed slice into the destination
+
+			if(PROGRESS) progress = (double)(j+1) / Y() * 100;		//update the progress
+		}
+
+		free(c);													//free the slice memory
+		target.close();												//close the output file
+		return true;
+	}
+
 	/// Close the file.
 	bool close(){
 		file.close();
@@ -7,6 +7,13 @@
 #include "../math/vector.h"
 #include <fstream>
 #include <sys/stat.h>
+#include <cstring>
+
+#ifdef _WIN32
+#include <Windows.h>
+#else
+#include <unistd.h>
+#endif
  
 namespace stim{
  
@@ -30,14 +37,16 @@ protected:
  
 	double progress;		//stores the progress on the current operation (accessible using a thread)
  
+	size_t buffer_size;		//available memory for processing large files
  
 	/// Private initialization function used to set default parameters in the data structure.
 	void init(){
-		memset(R, 0, sizeof(unsigned long long) * D);		//initialize the resolution to zero
-		header = 0;									//initialize the header size to zero
+		std::memset(R, 0, sizeof(unsigned long long) * D);		//initialize the resolution to zero
+		header = 0;											//initialize the header size to zero
 		mask = NULL;
  
 		progress = 0;
+		set_buffer();										//set the maximum buffer size to the default
 	}
  
 	/// Private helper function that returns the size of the file on disk using system functions.
@@ -105,6 +114,11 @@ protected:
  
 public:
  
+	/// Default constructor: delegates to init() to set the default parameters of the object.
+	binary(){
+		init();
+	}
+
 	double get_progress(){
 		return progress;
 	}
@@ -113,6 +127,20 @@ public:
 		progress = 0;
 	}
  
+	/// Set the size of the buffer that this class uses for processing, expressed as a fraction of system memory.
+	/// On Windows the fraction is applied to the available physical memory reported by GlobalMemoryStatusEx;
+	/// on other systems it is applied to (number of physical pages) * (page size) as reported by sysconf.
+	/// @param mem_frac is the fraction of memory to use (defaults to 0.5, i.e. 50%)
+	void set_buffer(double mem_frac = 0.5){
+#ifdef _WIN32
+		MEMORYSTATUSEX mem_status;
+		mem_status.dwLength = sizeof (mem_status);			//the structure size has to be set before the query
+		GlobalMemoryStatusEx (&mem_status);
+		buffer_size = (size_t)(mem_status.ullAvailPhys * mem_frac);
+#else
+		size_t page_count = sysconf(_SC_PHYS_PAGES);
+		size_t bytes_per_page = sysconf(_SC_PAGE_SIZE);
+		size_t total_bytes = page_count * bytes_per_page;
+		buffer_size = (size_t)(total_bytes * mem_frac);
+#endif
+	}
+
 	/// Open a binary file for streaming.
  
 	/// @param filename is the name of the binary file
@@ -375,6 +403,96 @@ public:
 		return read_pixel(p, i);
 	}
  
+	/// Reads a block specified by an (x, y, z) position and size using the largest possible contiguous reads
+
+	/// @param dest is a pointer to pre-allocated memory of size [sx * sy * sz] that receives the block (x is the fastest dimension)
+	/// @param x, y, z is the position of the first element of the block within the file
+	/// @param sx, sy, sz is the size of the block along each dimension
+	/// @return true (the state of the stream is not checked)
+	/// NOTE(review): the start position does not include the header member - confirm that this is intended for files with a header.
+	bool read(T* dest, size_t x, size_t y, size_t z, size_t sx, size_t sy, size_t sz){
+
+		size_t size_bytes = sx * sy * sz * sizeof(T);					//size of the block to read in bytes
+
+		size_t start = z * R[0] * R[1] + y * R[0] + x;					//calculate the start position (in elements)
+		file.seekg(start * sizeof(T), std::ios::beg);					//seek to the start position
+
+		if(sx == R[0] && sy == R[1]){									//if sx and sy result in a contiguous volume along z
+			file.read((char*)dest, size_bytes);							//read the block in one pass
+			return true;
+		}
+
+		if(sx == R[0]){													//if sx is contiguous, each z-axis slice can be read in one pass
+			size_t jump_bytes = (R[1] - sy) * R[0] * sizeof(T);			//jump between each slice
+			size_t slice_bytes = sx * sy * sizeof(T);					//size of the slice to be read
+			for(size_t zi = 0; zi < sz; zi++){							//for each z-axis slice
+				file.read((char*)dest, slice_bytes);					//read the slice
+				dest += sx * sy;										//move the destination pointer to the next slice
+				file.seekg(jump_bytes, std::ios::cur);					//skip to the next slice in the file
+			}
+			return true;
+		}
+
+		//in this case, x is not contiguous so the volume must be read line-by-line
+		size_t jump_x_bytes = (R[0] - sx) * sizeof(T);							//number of bytes skipped between two lines of a slice
+		size_t jump_y_bytes = (R[1] - sy) * R[0] * sizeof(T) + jump_x_bytes;	//number of bytes skipped between two slices
+		size_t line_bytes = sx * sizeof(T);										//size of the line to be read
+		for(size_t zi = 0; zi < sz; zi++){								//for each slice
+			for(size_t yi = 0; yi < sy; yi++){							//for each line in the slice
+				file.read((char*)dest, line_bytes);						//read the line to the destination block
+				dest += sx;												//always advance the destination, so the next slice does not overwrite this line
+				if(yi + 1 < sy)
+					file.seekg(jump_x_bytes, std::ios::cur);			//skip to the next line of this slice in the file
+			}
+			file.seekg(jump_y_bytes, std::ios::cur);					//skip to the beginning of the next slice
+		}
+		return true;													//the block has been read, so report success like the other paths
+	}
+
+	/// Re-orders the dimensions of a block of data: dimension d0 of the source becomes the fastest dimension
+	/// of the destination, followed by d1 and then d2.
+
+	/// @param dest is a pointer to pre-allocated memory of size [sx * sy * sz] that receives the permuted block
+	/// @param src is a pointer to the source block (x is the fastest dimension, z the slowest)
+	/// @param sx, sy, sz is the size of the source block along each dimension
+	/// @param d0, d1, d2 is the new order of the dimensions (expected to be a permutation of 0, 1, 2)
+	void permute(T* dest, T* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2){
+		const size_t order[3] = {d0, d1, d2};			//source dimension stored along each destination dimension
+		const size_t extent[3] = {sx, sy, sz};			//size of the source block along each dimension
+		size_t pos[3] = {0, 0, 0};						//current (x, y, z) position in the source block
+
+		//the identity order is not a permutation at all - the data can simply be copied
+		if(d0 == 0 && d1 == 1 && d2 == 2){
+			memcpy(dest, src, sizeof(T) * sx * sy * sz);
+			return;
+		}
+
+		const size_t dest_line = extent[order[0]];					//length of a line in the destination
+		const size_t dest_plane = dest_line * extent[order[1]];		//number of elements in a destination plane
+
+		//x stays the fastest dimension, so whole lines can be copied at once
+		if(d0 == 0){
+			const size_t line_bytes = sizeof(T) * sx;
+			for(size_t z = 0; z < sz; z++){
+				pos[2] = z;
+				for(size_t y = 0; y < sy; y++){
+					pos[1] = y;
+					const size_t from = (z * sy + y) * sx;
+					const size_t to = pos[order[2]] * dest_plane + pos[order[1]] * dest_line;
+					memcpy(dest + to, src + from, line_bytes);
+				}
+			}
+			return;
+		}
+
+		//general case: the source is traversed sequentially and every element is placed individually
+		const T* in = src;
+		for(size_t z = 0; z < sz; z++){
+			pos[2] = z;
+			for(size_t y = 0; y < sy; y++){
+				pos[1] = y;
+				for(size_t x = 0; x < sx; x++){
+					pos[0] = x;
+					const size_t to = pos[order[2]] * dest_plane + pos[order[1]] * dest_line + pos[order[0]];
+					dest[to] = *in++;
+				}
+			}
+		}
+	}
+
 };
  
 }
@@ -373,7 +373,7 @@ public:
 		for(size_t xy = 0; xy < XY; xy++){							//for each pixel
 			memset(spec, 0, Bb);									//set the spectrum to zero
 			if(mask == NULL || mask[xy]){							//if the pixel is masked
-				len = 0;											//initialize the 
+				len = 0;											//initialize the
 				file.read((char*)spec, Bb);							//read a spectrum
 				for(size_t b = 0; b < B; b++)						//for each band
 					len += spec[b]*spec[b];							//add the square of the spectral band
@@ -385,7 +385,7 @@ public:
 				file.seekg(Bb, std::ios::cur);						//otherwise skip a spectrum
 			target.write((char*)spec, Bb);							//output the normalized spectrum
 			if(PROGRESS) progress = (double)(xy + 1) / (double)XY * 100;		//update the progress
-		}		
+		}
 	}
  
  
@@ -1088,6 +1088,232 @@ public:
 		return true;
 	}
  
+
+#ifdef CUDA_FOUND
+	/// Calculate the covariance matrix of Noise for masked pixels using cuBLAS
+	/// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
+
+	/// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
+	/// @param avg is a pointer to memory of size B that stores the average spectrum
+	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location (NULL processes every pixel)
+	/// @param PROGRESS enables updates of the progress member
+	/// @return false if cuBLAS cannot be initialized, true otherwise
+	/// NOTE(review): the outer products are scaled by 1 / (X * Y), while coNoise_matrix divides by the number of masked pixels - confirm which normalization is intended.
+	bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
+
+		cublasHandle_t handle;
+		if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {				//create a cuBLAS instance before anything is allocated
+			printf ("CUBLAS initialization failed\n");
+			return false;													//report the failure (EXIT_FAILURE would convert to true)
+		}
+
+		progress = 0;													    //initialize the progress to zero (0)
+		unsigned long long XY = X() * Y();									//calculate the number of elements in a band image
+		unsigned long long B = Z();											//calculate the number of spectral elements
+
+		double* s = (double*)malloc(sizeof(double) * B);					//allocate space for the spectrum that will be pulled from the file
+
+		double* s_dev;														//device pointer that will store the spectrum on the GPU
+		double* s2_dev;														//device pointer that will store the shifted spectrum on the GPU
+		double* A_dev;														//device pointer that will store the covariance matrix on the GPU
+		double* avg_dev;													//device pointer that will store the average spectrum
+		cudaMalloc(&s_dev, B * sizeof(double));								//allocate space on the CUDA device for the spectrum
+		cudaMalloc(&s2_dev, B * sizeof(double));							//allocate space on the CUDA device for the shifted spectrum
+		cudaMemset(s2_dev, 0, B * sizeof(double));							//initialize s2_dev to zero (the last element is never overwritten)
+		cudaMalloc(&A_dev, B * B * sizeof(double));							//allocate space on the CUDA device for the covariance matrix
+		cudaMemset(A_dev, 0, B * B * sizeof(double));						//initialize the covariance matrix to zero (0)
+		cudaMalloc(&avg_dev, B * sizeof(double));							//allocate space on the CUDA device for the average spectrum
+		cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1);		//copy the average spectrum to the CUDA device
+
+		double ger_alpha = 1.0/(double)XY;									//scale the outer product by the inverse of the number of samples (mean outer product)
+		double axpy_alpha = -1;												//multiplication factor used to perform subtractions
+
+		for (unsigned long long xy = 0; xy < XY; xy++){						//for each pixel
+			if (mask == NULL || mask[xy] != 0){
+				pixeld(s, xy);												//retrieve the spectrum at the current xy pixel location
+
+				cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1);					//copy the spectrum from the host to the device
+				cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1);				//subtract the average spectrum
+
+				//Minimum/Maximum Autocorrelation Factors (MAF) method: subtract the adjacent band from each band
+				if(B > 1)													//B - 1 would underflow for an empty spectrum
+					cudaMemcpy(s2_dev, s_dev + 1, (B-1) * sizeof(double), cudaMemcpyDeviceToDevice);	//copy B-1 elements from the shifted spectrum
+				cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1);				//subtract the shifted spectrum
+
+				cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B);	//accumulate the symmetric outer product
+			}
+			if(PROGRESS) progress = (double)(xy+1) / XY * 100;				//record the current progress
+		}
+
+		cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, coN, (int)B);		//copy the result from the GPU to the CPU
+
+		cudaFree(A_dev);													//clean up allocated device memory
+		cudaFree(s_dev);
+		cudaFree(s2_dev);
+		cudaFree(avg_dev);
+		cublasDestroy(handle);												//release the cuBLAS instance
+		free(s);															//release the host spectrum
+
+		for(unsigned long long i = 0; i < B; i++){							//copy the upper triangular portion to the lower triangular portion
+			for(unsigned long long j = i+1; j < B; j++){
+				coN[B * i + j] = coN[B * j + i];
+			}
+		}
+
+		return true;
+	}
+#endif
+
+	/// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision.
+
+	/// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
+	/// @param avg is a pointer to memory of size B that stores the average spectrum
+	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
+	/// @param PROGRESS enables updates of the progress member
+	/// @return the result of coNoise_matrix_cublas when a CUDA device is used, true otherwise
+	bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
+
+#ifdef CUDA_FOUND
+		int dev_count;
+		cudaGetDeviceCount(&dev_count);									//get the number of CUDA devices
+		cudaDeviceProp prop;
+		cudaGetDeviceProperties(&prop, 0);								//get the property of the first device
+		if(dev_count > 0 && prop.major != 9999)							//if the first device is not an emulator
+			return coNoise_matrix_cublas(coN, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
+#endif
+
+		progress = 0;
+		//memory allocation
+		unsigned long long XY = X() * Y();
+		unsigned long long B = Z();
+		T* temp = (T*)malloc(sizeof(T) * B);
+
+		unsigned long long count = nnz(mask);								//count the number of masked pixels
+
+		//initialize covariance matrix of noise
+		memset(coN, 0, B * B * sizeof(double));
+
+		//calculate covariance matrix
+		double* coN_half = (double*) malloc(B * B * sizeof(double));			//allocate space for a higher-precision intermediate matrix
+		double* temp_precise = (double*) malloc(B * sizeof(double));
+		memset(coN_half, 0, B * B * sizeof(double));							//initialize the high-precision matrix with zeros
+		unsigned long long idx;													//index into the packed upper-triangular matrix
+		for (unsigned long long xy = 0; xy < XY; xy++){
+			if (mask == NULL || mask[xy] != 0){
+				pixel(temp, xy);												//retrieve the spectrum at the current xy pixel location
+				for(unsigned long long b = 0; b < B; b++)						//subtract the mean from this spectrum and increase the precision
+					temp_precise[b] = (double)temp[b] - (double)avg[b];
+
+				//Minimum/Maximum Autocorrelation Factors (MAF) method: subtract the adjacent band from each band
+				for(unsigned long long b2 = 0; b2 + 1 < B; b2++)				//(b2 + 1 < B cannot underflow when B is zero)
+					temp_precise[b2] -=  temp_precise[b2+1];
+
+				idx = 0;
+				for (unsigned long long b0 = 0; b0 < B; b0++){					//for each band
+					for (unsigned long long b1 = b0; b1 < B; b1++)
+						coN_half[idx++] += temp_precise[b0] * temp_precise[b1];
+				}
+			}
+			if(PROGRESS) progress = (double)(xy+1) / XY * 100;
+		}
+		idx = 0;
+		for (unsigned long long i = 0; i < B; i++){								//copy the precision matrix to both halves of the output matrix
+			for (unsigned long long j = i; j < B; j++){
+				coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double) count;
+			}
+		}
+
+		free(temp);
+		free(temp_precise);
+		free(coN_half);															//release the intermediate matrix
+		return true;
+	}
+
+	#ifdef CUDA_FOUND
+    /// Project the spectra onto a set of basis functions using cuBLAS
+	/// @param outfile is the name of the new binary output file that will be created
+	/// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering)
+	/// @param basis a set of basis vectors that the data set will be projected onto (after centering), stored as M rows of B values
+	/// @param M is the number of basis vectors
+	/// @param mask is a character mask used to limit processing to valid pixels (pixels outside of the mask are written as zeros)
+	/// @param PROGRESS enables updates of the progress member
+	/// @return false if cuBLAS cannot be initialized, true otherwise
+	bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){
+
+		cublasHandle_t handle;
+		if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {				//create a cuBLAS instance before anything is allocated
+			printf ("CUBLAS initialization failed\n");
+			return false;													//report the failure (EXIT_FAILURE would convert to true)
+		}
+
+		std::ofstream target(outfile.c_str(), std::ios::binary);			//open the target binary file
+
+		progress = 0;													    //initialize the progress to zero (0)
+		unsigned long long XY = X() * Y();									//calculate the number of elements in a band image
+		unsigned long long B = Z();											//calculate the number of spectral elements
+
+		double* s = (double*)malloc(sizeof(double) * B);					//allocate space for the spectrum that will be pulled from the file
+		double* s_dev;														//device pointer that will store the spectrum on the GPU
+		cudaMalloc(&s_dev, B * sizeof(double));								//allocate space on the CUDA device for the spectrum
+
+		double* basis_dev;													//device pointer that will store the basis on the GPU
+		cudaMalloc(&basis_dev, M * B * sizeof(double));						//allocate space on the CUDA device
+		cudaMemset(basis_dev, 0, M * B * sizeof(double));					//initialize basis_dev to zero (0)
+
+		//transpose the basis matrix (because cuBLAS is column-major)
+		double *basis_Transposed = (double*)malloc(M * B * sizeof(double));
+		for (unsigned long long i = 0; i < M; i++)
+			for (unsigned long long j = 0; j < B; j++)
+				basis_Transposed[i + j * M] = basis[i * B + j];
+
+		cublasSetMatrix((int)M, (int)B, sizeof(double), basis_Transposed, (int)M, basis_dev, (int)M);	//copy the transposed basis to the CUDA device
+		free(basis_Transposed);												//the host copy is no longer required
+
+		double* center_dev;													//device pointer that will store the center (average)
+		cudaMalloc(&center_dev, B * sizeof(double));						//allocate space on the CUDA device for the center (average)
+		cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1);	//copy the center vector (average) to the CUDA device
+
+		double* A = (double*)malloc(sizeof(double) * M);					//allocate space for the projected pixel on the host
+		double* A_dev;														//device pointer that will store the projected pixel on the GPU
+		cudaMalloc(&A_dev, M * sizeof(double));								//allocate space on the CUDA device for the projected pixel
+		cudaMemset(A_dev, 0, M * sizeof(double));							//initialize the projected pixel to zero (0)
+
+		double axpy_alpha = -1;												//multiplication factor for the center (in order to perform a subtraction)
+		double axpy_alpha2 = 1;												//multiplication factor for the matrix-vector multiplication
+		double axpy_beta = 0;												//the previous content of the result vector is discarded
+
+		T* temp = (T*)malloc(sizeof(T) * M);								//allocate space for the projected pixel that is written to disk
+		for (unsigned long long xy = 0; xy < XY; xy++){						//for each pixel
+			if (mask == NULL || mask[xy] != 0){
+				pixeld(s, xy);												//retrieve the spectrum at the current xy pixel location
+
+				cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1);						//copy the spectrum from the host to the device
+				cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1);				//subtract the center (average)
+				cublasDgemv(handle, CUBLAS_OP_N, (int)M, (int)B, &axpy_alpha2, basis_dev, (int)M, s_dev, 1, &axpy_beta, A_dev, 1);	//perform the matrix-vector multiplication
+				cublasGetVector((int)M, sizeof(double), A_dev, 1, A, 1);						//copy the M projected values to the host (A only holds M elements)
+
+				for(unsigned long long m = 0; m < M; m++)
+					temp[m] = (T)A[m];										//cast the projected pixel from double to T
+			}
+			else
+				memset(temp, 0, sizeof(T) * M);								//never write stale or uninitialized values for pixels outside of the mask
+
+			target.write(reinterpret_cast<const char*>(temp), sizeof(T) * M);	//write the projected vector
+			if(PROGRESS) progress = (double)(xy+1) / XY * 100;					//record the current progress
+		}
+
+		//clean up allocated device memory
+		cudaFree(A_dev);
+		cudaFree(s_dev);
+		cudaFree(basis_dev);
+		cudaFree(center_dev);
+		cublasDestroy(handle);												//release the cuBLAS instance
+		free(A);
+		free(s);
+		free(temp);
+		target.close();														//close the output file
+
+		return true;
+	}
+#endif
+
 	/// Project the spectra onto a set of basis functions
 	/// @param outfile is the name of the new binary output file that will be created
 	/// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering)
@@ -1096,6 +1322,14 @@ public:
 	/// @param mask is a character mask used to limit processing to valid pixels
 	bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){
  
+#ifdef CUDA_FOUND
+		int dev_count;
+		cudaGetDeviceCount(&dev_count);									//get the number of CUDA devices
+		cudaDeviceProp prop;
+		cudaGetDeviceProperties(&prop, 0);								//get the property of the first device
+		if(dev_count > 0 && prop.major != 9999)							//if the first device is not an emulator
+			return project_cublas(outfile,center,basis,M,mask,PROGRESS);	 //use cuBLAS to calculate the covariance matrix
+#endif
 		std::ofstream target(outfile.c_str(), std::ios::binary);	//open the target binary file
 		//std::string headername = outfile + ".hdr";					//the header file name
  
@@ -1125,7 +1359,7 @@ public:
 		free(s);													//free temporary storage arrays
 		free(rs);
 		target.close();												//close the output file
-
+		
 		return true;
 	}
  
@@ -1395,6 +1629,52 @@ public:
 		}
 	}
  
+	/// Multiply the spectrum of every selected pixel by a constant and save the result to a new binary file.
+
+	/// @param outname is the name of the binary output file that will be created
+	/// @param v is the constant applied to every band of each selected pixel
+	/// @param mask is an optional array indexed by pixel (y * X() + x); pixels with a zero entry are copied unchanged
+	/// @param PROGRESS when true, the progress member is updated after each pixel
+	/// @return false if the output file or the pixel buffer cannot be created, true otherwise
+	bool multiply(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){
+		std::ofstream target(outname.c_str(), std::ios::binary);		//open the target binary file
+		if(!target) return false;										//the output file could not be created
+
+		unsigned long long N = X() * Y();								//calculate the total number of pixels to be processed
+		unsigned long long B = Z();										//get the number of bands
+		T* s = (T*)malloc(sizeof(T) * B);								//allocate memory to store a pixel
+		if(s == NULL) return false;										//the pixel buffer could not be allocated
+
+		for(unsigned long long n = 0; n < N; n++){						//for each pixel in the image
+			//load the spectrum of pixel n before touching it (previously the buffer was never filled)
+			//NOTE(review): assumes this class derives from hsi<T>, as bsq does - confirm
+			hsi<T>::read(s, n % X(), n / X(), 0, 1, 1, B);
+
+			if(mask == NULL || mask[n]){								//if the pixel is selected by the mask
+				for(size_t b = 0; b < B; b++)							//for each band in the spectrum
+					s[b] *= (T)v;										//multiply
+			}
+
+			target.write((char*)s, sizeof(T) * B);						//write the (possibly scaled) spectrum to the destination
+
+			if(PROGRESS) progress = (double)(n+1) / N * 100;			//set the current progress
+		}																//end for each pixel
+
+		free(s);														//free the spectrum
+		target.close();													//close the output file
+		return true;
+	}
+
+	/// Add a constant to the spectrum of every selected pixel and save the result to a new binary file.
+
+	/// @param outname is the name of the binary output file that will be created
+	/// @param v is the constant added to every band of each selected pixel
+	/// @param mask is an optional array indexed by pixel (y * X() + x); pixels with a zero entry are copied unchanged
+	/// @param PROGRESS when true, the progress member is updated after each pixel
+	/// @return false if the output file or the pixel buffer cannot be created, true otherwise
+	bool add(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){
+		std::ofstream target(outname.c_str(), std::ios::binary);		//open the target binary file
+		if(!target) return false;										//the output file could not be created
+
+		unsigned long long N = X() * Y();								//calculate the total number of pixels to be processed
+		unsigned long long B = Z();										//get the number of bands
+		T* s = (T*)malloc(sizeof(T) * B);								//allocate memory to store a pixel
+		if(s == NULL) return false;										//the pixel buffer could not be allocated
+
+		for(unsigned long long n = 0; n < N; n++){						//for each pixel in the image
+			//load the spectrum of pixel n before touching it (previously the buffer was never filled)
+			//NOTE(review): assumes this class derives from hsi<T>, as bsq does - confirm
+			hsi<T>::read(s, n % X(), n / X(), 0, 1, 1, B);
+
+			if(mask == NULL || mask[n]){								//if the pixel is selected by the mask
+				for(size_t b = 0; b < B; b++)							//for each band in the spectrum
+					s[b] += (T)v;										//add the constant
+			}
+
+			target.write((char*)s, sizeof(T) * B);						//write the (possibly offset) spectrum to the destination
+
+			if(PROGRESS) progress = (double)(n+1) / N * 100;			//set the current progress
+		}																//end for each pixel
+
+		free(s);														//free the spectrum
+		target.close();													//close the output file
+		return true;
+	}
+
  
  
 	/// Close the file.
@@ -9,6 +9,7 @@
 #include <vector>
 #include <deque>
 #include <chrono>
+#include <future>
  
  
  
@@ -376,36 +377,144 @@ public:
  
 	}
  
-	/// Convert the current BSQ file to a BIL file with the specified file name.
-
-	/// @param outname is the name of the output BIL file to be saved to disk.
-	bool bil(std::string outname, bool PROGRESS = false)
-	{
-		//simplify image resolution
-		unsigned long long jump = (Y() - 1) * X() * sizeof(T);
+	/// Read a run of consecutive Y-lines (all samples, all bands) into pre-allocated memory.
+
+	/// @param dest is the destination buffer that receives the block
+	/// @param start is the index of the first Y-line to read
+	/// @param n is the number of Y-lines to read
+	void readlines(T* dest, size_t start, size_t n){
+		const size_t width = X();								//the block spans every sample along X
+		const size_t depth = Z();								//the block spans every band along Z
+		hsi<T>::read(dest, 0, start, 0, width, n, depth);		//fetch the block starting at (0, start, 0)
+	}
  
-		std::ofstream target(outname.c_str(), std::ios::binary);
-		std::string headername = outname + ".hdr";
+	/// Convert this BSQ file to a BIL file.
+
+	/// The image is processed in batches of Y-slices. While one batch is permuted and written,
+	///	the next batch is read asynchronously into the other source buffer.
+	/// @param outname is the name of the output BIL file to be saved to disk
+	/// @param PROGRESS when true, the progress member is updated after each batch
+	/// @return true once every slice has been written (unrecoverable errors call exit(1))
+	bool bil(std::string outname, bool PROGRESS = false){
  
-		unsigned long long L = X();
-		T* line = (T*)malloc(sizeof(T) * L);
+		const size_t buffers = 4;													//number of buffers required for this algorithm
+		size_t mem_per_batch = binary<T>::buffer_size / buffers;					//calculate the maximum memory available for a batch
  
-		for ( unsigned long long y = 0; y < Y(); y++)									//for each y position
-		{
-			file.seekg(y * X() * sizeof(T), std::ios::beg);					//seek to the beginning of the xz slice
-			for ( unsigned long long z = 0; z < Z(); z++ )							//for each band
-			{
-				file.read((char *)line, sizeof(T) * X());					//read a line
-				target.write((char*)line, sizeof(T) * X());					//write the line to the output file
-				file.seekg(jump, std::ios::cur);							//seek to the next band
-				if(PROGRESS) progress = (double)((y+1) * Z() + z + 1) / (Z() * Y()) * 100;	//update the progress counter
+		size_t slice_bytes = X() * Z() * sizeof(T);									//number of bytes in an input batch slice (Y-slice in this case)
+		size_t max_slices_per_batch = mem_per_batch / slice_bytes;					//maximum number of slices we can process in one batch given memory constraints
+		if(max_slices_per_batch == 0){												//if there is insufficient memory for a single slice, report an error
+			std::cout<<"error, insufficient memory for stim::bsq::bil()"<<std::endl;
+			exit(1);
+		}
+		size_t max_batch_bytes = max_slices_per_batch * slice_bytes;				//calculate the amount of memory allocated for each of the four buffers
+
+		T* src[2];																	//source double-buffer for asynchronous batching
+		src[0] = (T*) malloc(max_batch_bytes);
+		src[1] = (T*) malloc(max_batch_bytes);
+		T* dst[2];																	//destination double-buffer for asynchronous batching
+		dst[0] = (T*) malloc(max_batch_bytes);
+		dst[1] = (T*) malloc(max_batch_bytes);
+		if(!src[0] || !src[1] || !dst[0] || !dst[1]){								//stop before any buffer is dereferenced if an allocation failed
+			std::cout<<"error, unable to allocate buffers for stim::bsq::bil()"<<std::endl;
+			exit(1);
+		}
+
+		size_t N[2];																//number of slices stored in buffers 0 and 1
+		N[0] = N[1] = min(Y(), max_slices_per_batch);								//start with the maximum number of slices that can be stored (may be the entire data set)
+
+		std::ofstream target(outname.c_str(), std::ios::binary);					//open an output file for writing
+																		//initialize with buffer 0 (used for double buffering)
+		size_t y_load = 0;														//number of slices that have been (or are being) loaded
+		size_t y_proc = 0;														//number of slices that have been permuted and written
+		std::future<void> rthread;												//handle for the asynchronous read of the next batch
+
+		readlines(src[0], 0, N[0]);												//read the first batch into the 0 source buffer
+		y_load += N[0];															//increment the loaded slice counter
+		int b = 1;																//index of the buffer that receives the next asynchronous read
+
+		std::chrono::high_resolution_clock::time_point t_start;						//high-resolution timers
+		std::chrono::high_resolution_clock::time_point t_end;
+		size_t t_batch;																//number of milliseconds to process a batch
+		size_t t_total = 0;
+		while(y_proc < Y()){													//while there are still slices to be processed
+			t_start = std::chrono::high_resolution_clock::now();					//start the timer for this batch
+			if(y_load < Y()){													//if there are still slices to be loaded, load them
+				if(y_load + N[b] > Y()) N[b] = Y() - y_load;					//if the next batch would process more than the total slices, adjust the batch size
+				rthread = std::async(std::launch::async, &stim::bsq<T>::readlines, this, src[b], y_load, N[b]);
+
+				y_load += N[b];													//increment the number of loaded slices
 			}
+
+			b = !b;																//swap the double-buffer
+
+			binary<T>::permute(dst[b], src[b], X(), N[b], Z(), 0, 2, 1);		//permute the batch to a BIL file
+			target.write((char*)dst[b], N[b] * slice_bytes);					//write the permuted data to the output file
+			y_proc += N[b];														//increment the counter of processed slices
+			if(PROGRESS) progress = (double)y_proc / Y() * 100;					//report the fraction of processed slices (reaches exactly 100 at the end)
+			t_end = std::chrono::high_resolution_clock::now();
+			t_batch = std::chrono::duration_cast<std::chrono::milliseconds>(t_end-t_start).count();
+			t_total += t_batch;
+			if(rthread.valid()) rthread.wait();									//wait for the pending read; the future is empty when everything fit in the first batch
 		}
  
-		free(line);
-		target.close();
+		std::cout<<"Total time to execute: "<<t_total<<" ms"<<std::endl;
+		free(src[0]);															//free buffer resources
+		free(src[1]);
+		free(dst[0]);
+		free(dst[1]);
+		return true;															//return true
+	}
  
-		return true;
+	/// Convert this BSQ file to a BIP file.
+
+	/// The image is processed in batches of Y-slices. While one batch is permuted and written,
+	///	the next batch is read asynchronously into the other source buffer.
+	/// @param outname is the name of the output BIP file to be saved to disk
+	/// @param PROGRESS when true, the progress member is updated after each batch
+	/// @return true once every slice has been written (unrecoverable errors call exit(1))
+	bool bip(std::string outname, bool PROGRESS = false){
+
+		const size_t buffers = 4;													//number of buffers required for this algorithm
+		size_t mem_per_batch = binary<T>::buffer_size / buffers;					//calculate the maximum memory available for a batch
+
+		size_t slice_bytes = X() * Z() * sizeof(T);									//number of bytes in an input batch slice (Y-slice in this case)
+		size_t max_slices_per_batch = mem_per_batch / slice_bytes;					//maximum number of slices we can process in one batch given memory constraints
+		if(max_slices_per_batch == 0){												//if there is insufficient memory for a single slice, report an error
+			std::cout<<"error, insufficient memory for stim::bsq::bip()"<<std::endl;
+			exit(1);
+		}
+		size_t max_batch_bytes = max_slices_per_batch * slice_bytes;				//calculate the amount of memory allocated for each of the four buffers
+
+		T* src[2];																	//source double-buffer for asynchronous batching
+		src[0] = (T*) malloc(max_batch_bytes);
+		src[1] = (T*) malloc(max_batch_bytes);
+		T* dst[2];																	//destination double-buffer for asynchronous batching
+		dst[0] = (T*) malloc(max_batch_bytes);
+		dst[1] = (T*) malloc(max_batch_bytes);
+		if(!src[0] || !src[1] || !dst[0] || !dst[1]){								//stop before any buffer is dereferenced if an allocation failed
+			std::cout<<"error, unable to allocate buffers for stim::bsq::bip()"<<std::endl;
+			exit(1);
+		}
+
+		size_t N[2];																//number of slices stored in buffers 0 and 1
+		N[0] = N[1] = min(Y(), max_slices_per_batch);								//start with the maximum number of slices that can be stored (may be the entire data set)
+
+		std::ofstream target(outname.c_str(), std::ios::binary);					//open an output file for writing
+																		//initialize with buffer 0 (used for double buffering)
+		size_t y_load = 0;														//number of slices that have been (or are being) loaded
+		size_t y_proc = 0;														//number of slices that have been permuted and written
+		std::future<void> rthread;												//handle for the asynchronous read of the next batch
+
+		readlines(src[0], 0, N[0]);												//read the first batch into the 0 source buffer
+		y_load += N[0];															//increment the loaded slice counter
+		int b = 1;																//index of the buffer that receives the next asynchronous read
+
+		std::chrono::high_resolution_clock::time_point t_start;						//high-resolution timers
+		std::chrono::high_resolution_clock::time_point t_end;
+		size_t t_batch;																//number of milliseconds to process a batch
+		size_t t_total = 0;
+		while(y_proc < Y()){													//while there are still slices to be processed
+			t_start = std::chrono::high_resolution_clock::now();					//start the timer for this batch
+			if(y_load < Y()){													//if there are still slices to be loaded, load them
+				if(y_load + N[b] > Y()) N[b] = Y() - y_load;					//if the next batch would process more than the total slices, adjust the batch size
+				rthread = std::async(std::launch::async, &stim::bsq<T>::readlines, this, src[b], y_load, N[b]);
+
+				y_load += N[b];													//increment the number of loaded slices
+			}
+
+			b = !b;																//swap the double-buffer
+
+			binary<T>::permute(dst[b], src[b], X(), N[b], Z(), 2, 0, 1);		//permute the batch to a BIP file
+			target.write((char*)dst[b], N[b] * slice_bytes);					//write the permuted data to the output file
+			y_proc += N[b];														//increment the counter of processed slices
+			if(PROGRESS) progress = (double)y_proc / Y() * 100;					//report the fraction of processed slices (reaches exactly 100 at the end)
+			t_end = std::chrono::high_resolution_clock::now();
+			t_batch = std::chrono::duration_cast<std::chrono::milliseconds>(t_end-t_start).count();
+			t_total += t_batch;
+			if(rthread.valid()) rthread.wait();									//wait for the pending read; the future is empty when everything fit in the first batch
+		}
+
+		std::cout<<"Total time to execute: "<<t_total<<" ms"<<std::endl;
+		free(src[0]);															//free buffer resources
+		free(src[1]);
+		free(dst[0]);
+		free(dst[1]);
+		return true;															//return true
 	}
  
 	/// Return a baseline corrected band given two adjacent baseline points and their bands. The result is stored in a pre-allocated array.
@@ -1238,6 +1347,60 @@ public:
 			if(PROGRESS) progress = (double)(b+1) / (double)B * 100;
 		}
  
+	}	//end deriv
+
+	/// Multiply every selected pixel of every band by a constant and save the result to a new binary file.
+
+	/// @param outname is the name of the binary output file that will be created
+	/// @param v is the constant applied to each selected pixel
+	/// @param mask is an optional array with one entry per pixel of a band; pixels with a zero entry are copied unchanged
+	/// @param PROGRESS when true, the progress member is updated after each band
+	/// @return false if the output file or the band buffer cannot be created, true otherwise
+	bool multiply(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){
+		unsigned long long B = Z();									//calculate the number of bands
+		unsigned long long XY = X() * Y();							//calculate the number of pixels in a band
+		unsigned long long S = XY * sizeof(T);						//calculate the number of bytes in a band
+
+		std::ofstream target(outname.c_str(), std::ios::binary);	//open the target binary file
+		if(!target) return false;									//the output file could not be created
+
+		T* c = (T*)malloc( S );										//allocate memory for the band image
+		if(c == NULL) return false;									//the band buffer could not be allocated
+
+		for(unsigned long long j = 0; j < B; j++){					//for each band
+			band_index(c, j);										//load the current band
+			for(unsigned long long i = 0; i < XY; i++){				//for each pixel
+				if(mask == NULL || mask[i])							//if the pixel is selected by the mask
+					c[i] *= (T)v;									//perform the multiplication
+			}
+			target.write(reinterpret_cast<const char*>(c), S);		//write the scaled band into the destination
+
+			if(PROGRESS) progress = (double)(j+1) / B * 100;		//update the progress
+		}
+
+		free(c);													//free the band
+		target.close();												//close the output file
+		return true;
+	}
+
+	/// Add a constant to every selected pixel of every band and save the result to a new binary file.
+
+	/// @param outname is the name of the binary output file that will be created
+	/// @param v is the constant added to each selected pixel
+	/// @param mask is an optional array with one entry per pixel of a band; pixels with a zero entry are copied unchanged
+	/// @param PROGRESS when true, the progress member is updated after each band
+	/// @return false if the output file or the band buffer cannot be created, true otherwise
+	bool add(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){
+		unsigned long long B = Z();									//calculate the number of bands
+		unsigned long long XY = X() * Y();							//calculate the number of pixels in a band
+		unsigned long long S = XY * sizeof(T);						//calculate the number of bytes in a band
+
+		std::ofstream target(outname.c_str(), std::ios::binary);	//open the target binary file
+		if(!target) return false;									//the output file could not be created
+
+		T* c = (T*)malloc( S );										//allocate memory for the band image
+		if(c == NULL) return false;									//the band buffer could not be allocated
+
+		for(unsigned long long j = 0; j < B; j++){					//for each band
+			band_index(c, j);										//load the current band
+			for(unsigned long long i = 0; i < XY; i++){				//for each pixel
+				if(mask == NULL || mask[i])							//if the pixel is selected by the mask
+					c[i] += (T)v;									//perform the addition
+			}
+			target.write(reinterpret_cast<const char*>(c), S);		//write the offset band into the destination
+
+			if(PROGRESS) progress = (double)(j+1) / B * 100;		//update the progress
+		}
+
+		free(c);													//free the band
+		target.close();												//close the output file
+		return true;
 	}
  
  
@@ -7,6 +7,7 @@
 #include "../envi/bil.h"
 #include "../math/fd_coefficients.h"
 #include <iostream>
+#include <fstream>
 //#include "../image/image.h"
  
 namespace stim{
@@ -58,15 +59,17 @@ class envi{
 		for(size_t i = 0; i < len; i++)
 			cast(&dst[i], &src[i]);
 	}
-
+	
 public:
+	envi_header header;
  
+	
 	/// Default constructor
 	envi(){
 		file = NULL;				//set the file pointer to NULL
 	}
  
-	envi_header header;
+	
  
 	void* malloc_spectrum(){
 		return alloc_array(header.bands);
@@ -76,6 +79,40 @@ public:
 		return alloc_array(header.samples * header.lines);
 	}
  
+	/// Forward a buffer-size request to the underlying streaming file object.
+
+	/// The call is dispatched on the header's interleave and data type. An unknown data type
+	///	only prints a message; an unknown interleave prints a message and terminates the program.
+	/// @param memfrac is passed unchanged to the set_buffer() function of the file object
+	void set_buffer(double memfrac = 0.5){
+		const bool is_bsq = (header.interleave == envi_header::BSQ);
+		const bool is_bil = (header.interleave == envi_header::BIL);
+		const bool is_bip = (header.interleave == envi_header::BIP);
+
+		if(!is_bsq && !is_bil && !is_bip){							//none of the known file layouts
+			std::cout<<"ERROR: unidentified file type"<<std::endl;
+			exit(1);
+		}
+
+		const bool is_f32 = (header.data_type == envi_header::float32);
+		const bool is_f64 = (header.data_type == envi_header::float64);
+
+		if(!is_f32 && !is_f64){										//known layout, but unsupported sample type
+			std::cout<<"ERROR: unidentified data type"<<std::endl;
+			return;
+		}
+
+		if(is_bsq){													//the file object is a bsq stream
+			if(is_f32)	((bsq<float>*)file)->set_buffer(memfrac);
+			else		((bsq<double>*)file)->set_buffer(memfrac);
+		}
+		else if(is_bil){											//the file object is a bil stream
+			if(is_f32)	((bil<float>*)file)->set_buffer(memfrac);
+			else		((bil<double>*)file)->set_buffer(memfrac);
+		}
+		else{														//the file object is a bip stream
+			if(is_f32)	((bip<float>*)file)->set_buffer(memfrac);
+			else		((bip<double>*)file)->set_buffer(memfrac);
+		}
+	}
+
 	/// Returns the size of the data type in bytes
 	unsigned int type_size(){
 		if(header.data_type == envi_header::float32) return 4;
@@ -224,6 +261,37 @@ public:
  
 	}
  
+	/// Open an Agilent binary file as an ENVI stream
+
+	/// The band count and image dimensions are read from fixed offsets in the file, then the
+	///	ENVI header is filled in (32-bit float, BSQ, 1020-byte offset) and the stream is opened.
+	/// @param filename is the name of the Agilent binary file
+	/// @return false if the file cannot be opened or its dimension fields cannot be read, true otherwise
+	bool open_agilent(std::string filename){
+		fname = filename;												//store the file name
+
+		//Open the file temporarily to get the header information
+		FILE* f = fopen(filename.c_str(), "rb");						//open in binary mode so no text-mode translation is applied to the bytes
+		if(f == NULL) return false;										//return false if no file is opened
+
+		short b = 0;													//number of bands
+		short x = 0, y = 0;												//image x and y size
+		bool ok = fseek(f, 9, SEEK_SET) == 0							//seek to the number of bands
+			&& fread(&b, sizeof(short), 1, f) == 1						//read the number of bands
+			&& fseek(f, 13, SEEK_CUR) == 0								//skip to the x and y dimensions
+			&& fread(&x, sizeof(short), 1, f) == 1						//read the image x size
+			&& fread(&y, sizeof(short), 1, f) == 1;						//read the image y size
+		fclose(f);														//close the file
+		if(!ok) return false;											//the file is too short to hold the dimension fields
+
+		//store the information from the Agilent header in the ENVI header
+		header.bands = b;
+		header.samples = x;
+		header.lines = y;
+		header.data_type = envi_header::float32;						//all values are 32-bit floats
+		header.header_offset = 1020;									//number of bytes in an Agilent binary header
+		header.interleave = envi_header::BSQ;							//all Agilent binary files are BSQ
+
+		allocate();														//allocate the streaming file object
+		open();															//open the file for streaming
+
+		return true;
+	}
+
 	/// Open an existing ENVI file given the filename and a header structure
  
 	/// @param filename is the name of the ENVI binary file
@@ -257,7 +325,6 @@ public:
 		//header.load(headername);
  
 		return open(filename, h);
-
 	}
  
 	/// Normalize a hyperspectral ENVI file given a band number and threshold.
@@ -454,9 +521,9 @@ public:
 				else if(interleave == envi_header::BIL)			//convert BSQ -> BIL
 					((bsq<float>*)file)->bil(outfile, PROGRESS);
 				else if(interleave == envi_header::BIP){			//ERROR
-					std::cout<<"ERROR: conversion from BSQ to BIP isn't practical; use BSQ->BIL->BIP instead"<<std::endl;
-					//return ((bsq<float>*)file)->bip(outfile, PROGRESS);
-					exit(1);
+					//std::cout<<"ERROR: conversion from BSQ to BIP isn't practical; use BSQ->BIL->BIP instead"<<std::endl;
+					((bsq<float>*)file)->bip(outfile, PROGRESS);
+					//exit(1);
 				}
 			}
  
@@ -468,9 +535,9 @@ public:
 				else if(interleave == envi_header::BIL)					//convert BSQ -> BIL
 					((bsq<double>*)file)->bil(outfile, PROGRESS);
 				else if(interleave == envi_header::BIP){					//ERROR
-					std::cout<<"ERROR: conversion from BSQ to BIP isn't practical; use BSQ->BIL->BIP instead"<<std::endl;
-					//return ((bsq<float>*)file)->bip(outfile, PROGRESS);
-					exit(1);
+					//std::cout<<"ERROR: conversion from BSQ to BIP isn't practical; use BSQ->BIL->BIP instead"<<std::endl;
+					((bsq<double>*)file)->bip(outfile, PROGRESS);		//convert BSQ -> BIP; this branch handles doubles (see the bsq<double> BIL case above), so cast to bsq<double>
+					//exit(1);
 				}
 			}
  
@@ -1106,46 +1173,6 @@ public:
 		return false;
 	}
  
-	/// Retrieve a spectrum from the specified location
-
-	/// @param ptr is a pointer to pre-allocated memory of size B*sizeof(T)
-	/// @param x is the x-coordinate of the spectrum
-	/// @param y is the y-coordinate of the spectrum
-	/*bool spectrum(void* ptr, unsigned long long x, unsigned long long y, bool PROGRESS = false){
-
-		if(header.interleave == envi_header::BSQ){		//if the infile is bsq file
-			if(header.data_type ==envi_header::float32)
-				return ((bsq<float>*)file)->spectrum((float*)ptr, x, y, PROGRESS);
-			else if (header.data_type == envi_header::float64)
-				return ((bsq<double>*)file)->spectrum((double*)ptr, x, y, PROGRESS);
-			else{
-				std::cout << "ERROR: unidentified data type" << std::endl;
-				exit(1);
-			}
-		}
-		else if (header.interleave == envi_header::BIL){
-			if (header.data_type == envi_header::float32)
-				return ((bil<float>*)file)->spectrum((float*)ptr, x, y, PROGRESS);
-			else if (header.data_type == envi_header::float64)
-				return ((bil<double>*)file)->spectrum((double*)ptr, x, y, PROGRESS);
-			else{
-				std::cout << "ERROR: unidentified data type" << std::endl;
-				exit(1);
-			}
-		}
-		else if (header.interleave == envi_header::BIP){
-			if (header.data_type == envi_header::float32)
-				return ((bip<float>*)file)->spectrum((float*)ptr, x, y, PROGRESS);
-			else if (header.data_type == envi_header::float64)
-				return ((bip<double>*)file)->spectrum((double*)ptr, x, y, PROGRESS);
-			else{
-				std::cout << "ERROR: unidentified data type" << std::endl;
-				exit(1);
-			}
-		}
-		return false;
-	}*/
-
 	// Retrieve a spectrum at the specified 1D location
  
 	/// @param ptr is a pointer to pre-allocated memory of size B*sizeof(T)
@@ -1209,50 +1236,6 @@ public:
 	void spectrum(T* ptr, size_t x, size_t y, bool PROGRESS = false){
  
 		spectrum<T>(ptr, y * header.samples + x, PROGRESS);
-		/*void* temp = alloc_array<T>(header.bands);		//allocate space for the output array
-
-		if(header.interleave == envi_header::BSQ){		//if the infile is bsq file
-			if(header.data_type ==envi_header::float32){
-				((bsq<float>*)file)->spectrum((float*)temp, x, y, PROGRESS);
-				cast<T, float>(ptr, temp, header.bands);
-			}
-			else if (header.data_type == envi_header::float64){
-				((bsq<double>*)file)->spectrum((double*)temp, x, y, PROGRESS);
-				cast<T, double>(ptr, temp, header.bands);
-			}
-			else{
-				std::cout << "ERROR: unidentified data type" << std::endl;
-				exit(1);
-			}
-		}
-		else if (header.interleave == envi_header::BIL){
-			if (header.data_type == envi_header::float32){
-				((bil<float>*)file)->spectrum((float*)temp, x, y, PROGRESS);
-				cast<T, float>(ptr, temp, header.bands);
-			}
-			else if (header.data_type == envi_header::float64){
-				((bil<double>*)file)->spectrum((double*)temp, x, y, PROGRESS);
-				cast<T, double>(ptr, temp, header.bands);
-			}
-			else{
-				std::cout << "ERROR: unidentified data type" << std::endl;
-				exit(1);
-			}
-		}
-		else if (header.interleave == envi_header::BIP){
-			if (header.data_type == envi_header::float32){
-				((bip<float>*)file)->spectrum((float*)temp, x, y, PROGRESS);
-				cast<T, float>(ptr, temp, header.bands);
-			}
-			else if (header.data_type == envi_header::float64){
-				((bip<double>*)file)->spectrum((double*)temp, x, y, PROGRESS);
-				cast<T, double>(ptr, temp, header.bands);
-			}
-			else{
-				std::cout << "ERROR: unidentified data type" << std::endl;
-				exit(1);
-			}
-		}*/
 	}
  
 	/// Retrieve a single band (based on index) and stores it in pre-allocated memory.
@@ -1340,14 +1323,6 @@ public:
 		if (header.interleave == envi_header::BSQ){
 			std::cout<<"ERROR: calculating the covariance matrix for a BSQ file is impractical; convert to BIL or BIP first"<<std::endl;
 			exit(1);
-			/*if (header.data_type == envi_header::float32)
-				return ((bsq<float>*)file)->co_matrix(co, avg, mask, PROGRESS);
-			else if (header.data_type == envi_header::float64)
-				return ((bsq<double>*)file)->co_matrix(co, avg, mask, PROGRESS);
-			else{
-				std::cout << "ERROR: unidentified data type" << std::endl;
-				exit(1);
-			}*/
 		}
 		else if (header.interleave == envi_header::BIL){
 			if (header.data_type == envi_header::float32)
@@ -1372,6 +1347,35 @@ public:
 		return false;
 	}
  
+	/// Compute the noise covariance matrix over all masked pixels in the image (supported for BIP files only).
+
+	/// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting noise covariance matrix
+	/// @param avg is a pointer to memory of size B that stores the average spectrum
+	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
+	/// @param PROGRESS is forwarded to the underlying file object
+	bool coNoise_matrix(double* coN, double* avg, unsigned char* mask, bool PROGRESS = false){
+		//BSQ and BIL layouts are rejected outright
+		if (header.interleave == envi_header::BSQ){
+			std::cout<<"ERROR: calculating the covariance matrix of noise for a BSQ file is impractical; convert to BIP first"<<std::endl;
+			exit(1);
+		}
+		if (header.interleave == envi_header::BIL){
+			std::cout<<"ERROR: calculating the covariance matrix of noise for a BIL file is impractical; convert to BIP first"<<std::endl;
+			exit(1);
+		}
+
+		//anything that is not BIP at this point is an unknown layout
+		if (header.interleave != envi_header::BIP)
+			return false;
+
+		//dispatch on the sample type stored in the file
+		if (header.data_type == envi_header::float32)
+			return ((bip<float>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS);
+		if (header.data_type == envi_header::float64)
+			return ((bip<double>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS);
+
+		std::cout << "ERROR: unidentified data type" << std::endl;
+		exit(1);
+		return false;												//not reached
+	}
  
 	/// Crop a region of the image and save it to a new file.
  
@@ -1635,7 +1639,81 @@ public:
 		}
 		exit(1);
 	}
-};
+
+	/// Multiply all masked pixels by a constant and save the result (binary file plus ".hdr" header).
+
+	/// @param outfile is the name of the output binary file; the header is saved as outfile + ".hdr"
+	/// @param v is the constant that each selected value is multiplied by
+	/// @param mask is an optional per-pixel mask forwarded to the underlying file object
+	/// @param PROGRESS is forwarded to the underlying file object
+	void multiply(std::string outfile, double v, unsigned char* mask = NULL, bool PROGRESS = false){
+		header.save(outfile + ".hdr");
+		if (header.interleave == envi_header::BSQ){
+			if (header.data_type == envi_header::float32)
+				((bsq<float>*)file)->multiply(outfile, v, mask, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				((bsq<double>*)file)->multiply(outfile, v, mask, PROGRESS);
+			else{
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+
+		else if (header.interleave == envi_header::BIL){
+			if (header.data_type == envi_header::float32)
+				((bil<float>*)file)->multiply(outfile, v, mask, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				((bil<double>*)file)->multiply(outfile, v, mask, PROGRESS);
+			else{
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+
+		else if (header.interleave == envi_header::BIP){
+			if (header.data_type == envi_header::float32)
+				((bip<float>*)file)->multiply(outfile, v, mask, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				((bip<double>*)file)->multiply(outfile, v, mask, PROGRESS);
+			else{
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+
+		else{
+			//only an unknown interleave is fatal; previously exit(1) also ran after a successful multiply
+			std::cout << "ERROR: unidentified file type" << std::endl;
+			exit(1);
+		}
+	}
+
+	/// Add a constant to all masked pixels and save the result (binary file plus ".hdr" header).
+
+	/// @param outfile is the name of the output binary file; the header is saved as outfile + ".hdr"
+	/// @param v is the constant that is added to each selected value
+	/// @param mask is an optional per-pixel mask forwarded to the underlying file object
+	/// @param PROGRESS is forwarded to the underlying file object
+	void add(std::string outfile, double v, unsigned char* mask = NULL, bool PROGRESS = false){
+		header.save(outfile + ".hdr");
+		if (header.interleave == envi_header::BSQ){
+			if (header.data_type == envi_header::float32)
+				((bsq<float>*)file)->add(outfile, v, mask, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				((bsq<double>*)file)->add(outfile, v, mask, PROGRESS);
+			else{
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+
+		else if (header.interleave == envi_header::BIL){
+			if (header.data_type == envi_header::float32)
+				((bil<float>*)file)->add(outfile, v, mask, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				((bil<double>*)file)->add(outfile, v, mask, PROGRESS);
+			else{
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+
+		else if (header.interleave == envi_header::BIP){
+			if (header.data_type == envi_header::float32)
+				((bip<float>*)file)->add(outfile, v, mask, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				((bip<double>*)file)->add(outfile, v, mask, PROGRESS);
+			else{
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+
+		else{
+			//only an unknown interleave is fatal; previously exit(1) also ran after a successful add
+			std::cout << "ERROR: unidentified file type" << std::endl;
+			exit(1);
+		}
+	}
+};	//end ENVI
  
 }	//end namespace rts
  
@@ -440,9 +440,24 @@ struct envi_header
 	}
  
 	/// Convert a wavelength to a band index (or a pair of surrounding band indices)
+	///		if the file doesn't specify wavelengths, w is assumed to be a band index
 	std::vector<size_t> band_index(double w){
 		std::vector<size_t> idx;										//create an empty array of indices
-		if(w < wavelength[0] || w > wavelength[bands-1]) return idx;	//if the wavelength range is outside of the file, return an empty array
+		if(wavelength.size() == 0){										//if a wavelength vector doesn't exist, assume the passed value is a band
+			if(w < 0 || w > bands-1) return idx;						//if the band is outside the given band range, return an empty vector
+			size_t low, high;											//allocate space for the floor and ceiling
+			low = (size_t)std::floor(w);										//calculate the floor
+			high = (size_t)std::ceil(w);										//calculate the ceiling
+			if(low == high)												//if the floor and ceiling are the same
+				idx.push_back(low);										//return a vector with one element (the given w matches a band exactly)
+			else{
+				idx.resize(2);											//otherwise return the floor and ceiling
+				idx[0] = low;
+				idx[1] = high;
+			}
+			return idx;
+		}
+		else if(w < wavelength[0] || w > wavelength[bands-1]) return idx;	//if the wavelength range is outside of the file, return an empty array
  
 		for(size_t b = 0; b < bands; b++){							//for each band in the wavelength vector
 			if(wavelength[b] == w){									//if an exact match is found
@@ -149,13 +149,13 @@ public:
 			for(size_t i = 0; i < R[0] * R[1]; i++){		//for each pixel in that page
  
 #ifdef _WIN32
-				if(!_finite(page[i])){						//if the value at index i is finite
+				if(!_finite(page[i])){						//if the value at index i is not finite
 #else
-				if(!std::isfinite(page[i])){					//C++11 implementation
+				if(!std::isfinite(page[i])){				//C++11 implementation
 #endif
 					size_t x, y, b;
-					xyb(p * R[0] * R[1] + i, x, y, b);							//find the 3D coordinates of the value
-					mask[ y * X() + x ] = 0;				//mask the pixel (it's not bad)
+					xyb(p * R[0] * R[1] + i, x, y, b);		//find the 3D coordinates of the value
+					mask[ y * X() + x ] = 0;				//remove the pixel (it's bad)
 				}
 			}
 			if(PROGRESS) progress = (double)(p + 1) / (double)R[2] * 100;
@@ -202,6 +202,24 @@ public:
 		}
 	}
  
+	/// Read a 3D block given in (x, y, z) image coordinates into pre-allocated memory.
+
+	/// The position and size are re-ordered through the O[] index table before being handed
+	///	to binary<T>::read(); a failed read prints the requested block and terminates the program.
+	/// @param dest is the destination buffer
+	/// @param x, y, z give the first element of the block
+	/// @param sx, sy, sz give the size of the block along each axis
+	void read(T* dest, size_t x, size_t y, size_t z, size_t sx, size_t sy, size_t sz){
+		const size_t origin[3] = {x, y, z};			//requested position, in image order
+		const size_t extent[3] = {sx, sy, sz};		//requested size, in image order
+
+		size_t d[3];								//position in the binary coordinate system
+		size_t sd[3];								//size in the binary coordinate system
+		for(size_t axis = 0; axis < 3; axis++){		//map each image axis onto its binary axis
+			d[O[axis]] = origin[axis];
+			sd[O[axis]] = extent[axis];
+		}
+
+		if(binary<T>::read(dest, d[0], d[1], d[2], sd[0], sd[1], sd[2]))
+			return;									//the block was read successfully
+
+		std::cout<<"error reading block in stim::hsi: ("<<d[0]<<", "<<d[1]<<", "<<d[2]<<") - ["<<sd[0]<<", "<<sd[1]<<", "<<sd[2]<<"]"<<std::endl;
+		exit(1);
+	}
+
 };
  
 }		//end namespace STIM
 #ifndef STIM_GL_TEXTURE_H
 #define STIM_GL_TEXTURE_H
  
-
-
-
-/*
-includes not necessary (yet)
-
-#include <iterator>
-#include <algorithm>
-
-
-*/
-
 #include <math.h>
 #include <iostream>
 #include <vector>
 #include "../grids/image_stack.h"
-#include <GL/glut.h>
-//#include <GL/glext.h>
-#include "./error.h"	
+//Visual Studio requires GLEW
+#ifdef _WIN32
+	#include <GL/glew.h>
+#endif
+//#include <GL/glut.h>
+#include <stim/gl/error.h>	
 namespace stim{
  
 /*
@@ -27,195 +18,282 @@ class gl_texture
 	Uses image_stack class in order to create a texture object.
 */
  
-template<typename T>
-class gl_texture : public virtual image_stack<T>
+template<typename T, typename F = float>
+class gl_texture : public virtual image_stack<T, F>
 {
-	private:
-		///	Sets the internal texture_type, based on the data
-		///	size. Either 3D, 2D, 1D textures.
-		
-		void
-		setTextureType()
-		{
-			if (R[3] > 1)
-				texture_type = GL_TEXTURE_3D;
-			else if (R[3] == 1 && R[2] == 0)
-				texture_type = GL_TEXTURE_1D;
-			else if (R[3] == 1)
-				texture_type = GL_TEXTURE_2D;
-		}
 	protected:
-		std::string path;
+		//std::string path;
 		GLuint texID;				//OpenGL object
 		GLenum texture_type;			//1D, 2D, 3D
-		GLint interpType;
-		GLint texWrap;	
-		GLenum type;
-		GLenum format;	
+		GLint interpolation;
+		GLint wrap;	
+		GLenum cpu_type;
+		GLenum gpu_type;
+		GLenum format;					//format for the texture (GL_RGBA, GL_LUMINANCE, etc.)
 		using image_stack<T>::R;
-		using image_stack<T>::S;
+		//using image_stack<T>::S;
 		using image_stack<T>::ptr;
-		using image_stack<T>::samples;
+
+		///	Chooses texture_type from the highest dimension of R (R[3], then R[2], then R[1])
+		///	that holds more than one sample. texture_type is left untouched when none does.
+		void setTextureType(){
+			if (R[3] > 1){						//more than one sample along the third dimension
+				texture_type = GL_TEXTURE_3D;
+				return;
+			}
+			if (R[2] > 1){						//more than one sample along the second dimension
+				texture_type = GL_TEXTURE_2D;
+				return;
+			}
+			if (R[1] > 1)						//more than one sample along the first dimension
+				texture_type = GL_TEXTURE_1D;
+		}
+
+		//initializes important variables
+		void init() {
+			texID = 0;							//initialize texture ID to zero, default if OpenGL returns an error
+			//memset(R, 0, sizeof(size_t));
+			//memset(grid<T, 4, F>::S, 0, sizeof(F));
+		}
+
+		/// Guesses the color format of the texture from the number of channels stored in R[0].
+		/// @return GL_LUMINANCE, GL_RG, GL_RGB or GL_RGBA for 1, 2, 3 or 4 channels
+		/// Prints an error message and terminates the program for any other channel count.
+		GLenum guess_format(){
+			size_t channels = R[0];
+			switch(channels){
+			case 1:
+				return GL_LUMINANCE;
+			case 2:
+				return GL_RG;
+			case 3:
+				return GL_RGB;
+			case 4:
+				return GL_RGBA;
+			default:
+				//report the channel count that was actually tested (R only has indices 0-3, so R[4] is out of bounds)
+				std::cout<<"Error in stim::gl_texture - unable to guess texture format based on number of channels ("<<channels<<")"<<std::endl;
+				exit(1);
+			}
+		}
+
+		/// Guesses the OpenGL CPU-side data type from the template parameter T.
+		/// @return the GL type enum matching T (unsigned char, char, unsigned short, short, unsigned int, int or float)
+		/// Any other T (for example double) matches none of the tests below, so an error is printed and the program exits.
+		GLenum guess_cpu_type(){
+			// The following is C++ 11 code, but causes problems on some compilers (ex. nvcc). Below is my best approximation to a solution
+			// NOTE(review): the commented-out lines return CV_* type codes rather than GL enums - presumably kept only as a reference for the std::is_same pattern
+
+			//if(std::is_same<T, unsigned char>::value)	return CV_MAKETYPE(CV_8U, (int)C());
+			//if(std::is_same<T, char>::value)			return CV_MAKETYPE(CV_8S, (int)C());
+			//if(std::is_same<T, unsigned short>::value)	return CV_MAKETYPE(CV_16U, (int)C());
+			//if(std::is_same<T, short>::value)			return CV_MAKETYPE(CV_16S, (int)C());
+			//if(std::is_same<T, int>::value)				return CV_MAKETYPE(CV_32S, (int)C());
+			//if(std::is_same<T, float>::value)			return CV_MAKETYPE(CV_32F, (int)C());
+			//if(std::is_same<T, double>::value)			return CV_MAKETYPE(CV_64F, (int)C());
+
+			//compare T against each supported type at run time using typeid
+			if(typeid(T) == typeid(unsigned char))		return GL_UNSIGNED_BYTE;
+			if(typeid(T) == typeid(char))				return GL_BYTE;
+			if(typeid(T) == typeid(unsigned short))		return GL_UNSIGNED_SHORT;
+			if(typeid(T) == typeid(short))				return GL_SHORT;
+			if(typeid(T) == typeid(unsigned int))		return GL_UNSIGNED_INT;
+			if(typeid(T) == typeid(int))				return GL_INT;
+			if(typeid(T) == typeid(float))				return GL_FLOAT;
+
+			//no supported type matched
+			std::cout<<"ERROR in stim::gl_texture - no valid data type found"<<std::endl;
+			exit(1);
+		}
+
+		/// Guesses the "internal format" of the texture to closely approximate the original format.
+		/// The choice is made from the members format (outer switch) and cpu_type (inner switch),
+		/// so both must be set before this is called.
+		/// @return a sized internal format for GL_LUMINANCE, GL_RGB or GL_RGBA data
+		/// Any other format (including GL_RG) or an unlisted cpu_type prints an error and exits.
+		GLint guess_gpu_type(){
+			switch(format){
+			case GL_LUMINANCE:							//single-channel data
+				switch(cpu_type){
+				case GL_BYTE:
+				case GL_UNSIGNED_BYTE:
+					return GL_LUMINANCE8;
+				case GL_SHORT:
+				case GL_UNSIGNED_SHORT:
+					return GL_LUMINANCE16;
+				case GL_INT:
+				case GL_UNSIGNED_INT:
+					return GL_LUMINANCE32I_EXT;
+				case GL_FLOAT:
+					return GL_LUMINANCE32F_ARB;
+				default:
+					std::cout<<"error in stim::gl_texture - unable to guess GPU internal format"<<std::endl;
+					exit(1);
+				}
+			case GL_RGB:								//three-channel data
+				switch(cpu_type){
+				case GL_BYTE:
+				case GL_UNSIGNED_BYTE:
+					return GL_RGB8;
+				case GL_SHORT:
+				case GL_UNSIGNED_SHORT:
+					return GL_RGB16;
+				case GL_INT:
+				case GL_UNSIGNED_INT:
+					return GL_RGB32I;
+				case GL_FLOAT:
+					return GL_RGB32F;
+				default:
+					std::cout<<"error in stim::gl_texture - unable to guess GPU internal format"<<std::endl;
+					exit(1);
+				}
+			case GL_RGBA:								//four-channel data
+				switch(cpu_type){
+				case GL_BYTE:
+				case GL_UNSIGNED_BYTE:
+					return GL_RGBA8;
+				case GL_SHORT:
+				case GL_UNSIGNED_SHORT:
+					return GL_RGBA16;
+				case GL_INT:
+				case GL_UNSIGNED_INT:
+					return GL_RGBA32I;
+				case GL_FLOAT:
+					return GL_RGBA32F;
+				default:
+					std::cout<<"error in stim::gl_texture - unable to guess GPU internal format"<<std::endl;
+					exit(1);
+				}
+			default:									//format has no mapping here
+				std::cout<<"error in stim::gl_texture - unable to guess GPU internal format"<<std::endl;
+				exit(1);
+			}
+		}
+		///	Creates this texture in the current OpenGL context.
+		///	Generates a texture name into texID, binds it to texture_type, applies the stored
+		///	interpolation and wrap settings, and uploads the data at ptr using gpu_type as the
+		///	internal format and format/cpu_type as the description of the source data.
+		///	An unrecognized texture_type prints an error message and terminates the program.
+		void generate_texture(){
+			glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+			CHECK_OPENGL_ERROR
+			glGenTextures(1, &texID);
+			CHECK_OPENGL_ERROR
+			glBindTexture(texture_type, texID);
+			CHECK_OPENGL_ERROR
+			glTexParameteri(texture_type, GL_TEXTURE_MIN_FILTER, interpolation);
+			CHECK_OPENGL_ERROR
+			glTexParameteri(texture_type, GL_TEXTURE_MAG_FILTER, interpolation);
+			CHECK_OPENGL_ERROR
+			switch(texture_type){
+				//3D: wrap along S, T and R, sized by R[1] x R[2] x R[3]
+				//(no per-call CHECK_OPENGL_ERROR in this branch; the next check is the one after the switch)
+				case GL_TEXTURE_3D:
+					glTexParameteri(texture_type, GL_TEXTURE_WRAP_S, wrap);
+					glTexParameteri(texture_type, GL_TEXTURE_WRAP_T, wrap);
+					glTexParameteri(texture_type, GL_TEXTURE_WRAP_R, wrap);
+					glTexImage3D(texture_type, 0, gpu_type, (GLsizei)R[1], (GLsizei)R[2], (GLsizei)R[3], 0, format, cpu_type, ptr);
+					break;
+				//2D: wrap along S and T, sized by R[1] x R[2]
+				case GL_TEXTURE_2D:
+					glTexParameteri(texture_type, GL_TEXTURE_WRAP_S, wrap);
+					CHECK_OPENGL_ERROR
+					glTexParameteri(texture_type, GL_TEXTURE_WRAP_T, wrap);
+					CHECK_OPENGL_ERROR
+					glTexImage2D(texture_type, 0, gpu_type, (GLsizei)R[1], (GLsizei)R[2], 0, format, cpu_type, ptr);
+					CHECK_OPENGL_ERROR
+					break;
+				//1D: wrap along S only, sized by R[1]
+				case GL_TEXTURE_1D:
+					glTexParameteri(texture_type, GL_TEXTURE_WRAP_S, wrap);
+					CHECK_OPENGL_ERROR
+					glTexImage1D(texture_type, 0, gpu_type, (GLsizei)R[1], 0, format, cpu_type, ptr);
+					CHECK_OPENGL_ERROR
+					break;
+				default:
+					std::cout<<"Error in stim::gl_texture - unrecognized texture target when generating texture"<<std::endl;
+					exit(1);
+					break;
+			}
+			CHECK_OPENGL_ERROR
+		}
+		void guess_parameters(){
+			setTextureType();									//set the texture type: 1D, 2D, 3D
+			format = guess_format();							//guess the texture format based on the number of image channels
+			cpu_type = guess_cpu_type();						//guess the CPU type based on the template
+			gpu_type = guess_gpu_type();						//guess the GPU type based on the format and template
+		}
  
 	public:
  
 		///default constructor
-		gl_texture()
-		{
-			
+		/// Creates an empty texture object; no image data is loaded and no OpenGL texture is generated.
+		/// @param interp is the texture interpolation mode (defaults to GL_LINEAR)
+		/// @param twrap is the texture wrap mode (defaults to GL_REPEAT)
+		gl_texture(	GLint interp = GL_LINEAR,					//default to linear interpolation
+					GLint twrap = GL_REPEAT)					//default repeating the texture at the edges
+						: image_stack<T, F>() {					//name the declared base class (image_stack<T> is only a base when F is float)
+			init();												//initialize the texture with NULL values
+			interpolation = interp;								//store the interpolation type
+			wrap = twrap;										//store the wrap type
 		}
  
-		///@param string path to the directory with the image files.
-		///Creates an instance of the gl_texture object with a path to the data.
+		///Creates an instance of the gl_texture object and initializes it with a file list
+		///@param file_mask is a mask indicating the files to load
+		///@param interp is the type of texture interpolation (defaults to GL_LINEAR)
+		///@param twrap is the type of texture wrapping (defaults to GL_REPEAT)
+
+		gl_texture(std::string file_mask, GLint interp = GL_LINEAR, GLint twrap = GL_REPEAT){
+			init();
+			interpolation = interp;								//store interpolation type
+			wrap = twrap;										//store wrap type
+			load_images(file_mask);
+		}
  
-		gl_texture(std::string file_mask)
-		{
-			//path = file_mask;
-			image_stack<T>::load_images(file_mask);
-			setTextureType();
+		///Creates an instance of gl_texture and initializes with a file list
+		///@param file_list is a list of files
+		///@param interp is the type of texture interpolation (GL_LINEAR, GL_NEAREST)
+		///@param twrap is the type of texture wrapping
+		gl_texture(std::vector<std::string> file_list, GLint interp = GL_LINEAR, GLint twrap = GL_REPEAT){
+			init();
+			interpolation = interp;
+			wrap = twrap;
+			load_images(file_list);
+		}
+
+		///Generates the OpenGL texture for this object in the current context.
+		///May only be done once: if a texture ID has already been assigned, an error is
+		///printed and the program terminates.
+		void attach(){
+			if(texID != 0){										//a non-zero ID means the texture was already generated
+				std::cout<<"Texture has already been attached to a context."<<std::endl;
+				exit(1);
+			}
+			generate_texture();									//first attachment: create the texture
+		}
+
+		//binds a texture to be the current render source
+		void bind(){
+			glBindTexture(texture_type, texID);					//bind the texture to the appropriate texture target
+			CHECK_OPENGL_ERROR
 		}
  
 		///returns the dimentions of the data in the x, y, z directions. 
-		vec<int>
-		getSize()
-		{
+		vec<int> getSize(){
 			stim::vec<int> size(R[1], R[2], R[3]);
 			return size;
 		}
  
-		///@param GLint interp     --GL_LINEAR, GL_NEAREST...
-		///@param GLint twrap      --GL_REPEAR, GL_CLAMP_TO_EDGE...
-		///@param GLenum dataType  --GL_UNSIGNED_BYTE, GL_FLOAT16...
-		///@param GLenum dataFormat--GL_LUMINANCE, GL_RGB...
-		///	Texture paramenters.
-		void
-		setTexParam(GLint interp = GL_LINEAR,
-			    GLint twrap = GL_CLAMP_TO_EDGE,
-			    GLenum dataType = GL_UNSIGNED_BYTE,
-			    GLenum dataFormat = GL_LUMINANCE)
-		{
-			interpType = interp;
-			texWrap    = twrap;
-			type	   = dataType;
-			format	   = dataFormat;
+		/// Returns the dimensions of the data in the x, y, z directions through the output arguments.
+		/// R[0] holds the number of color channels, so the spatial sizes are R[1], R[2] and R[3]
+		/// (the same elements returned by the vec<int> overload of getSize).
+		void getSize(size_t& x, size_t& y, size_t& z) {
+			x = R[1]; y = R[2]; z = R[3];
 		}
  
 		///@param x size of the voxel in x direction
 		///@param y size of the voxel in y direction
 		///@param z size of the voxel in z direction
 		///	Sets the dimenstions of the voxels.
-		void
-		setDims(float x, float y, float z)
-		{
-			S[1] = x;
-			S[2] = y;
-			S[3] = z;
+		void setSpacing(float sx, float sy, float sz){
+			grid<T, 4, F>::S[1] = sx;
+			grid<T, 4, F>::S[2] = sy;
+			grid<T, 4, F>::S[3] = sz;
 		}
  
 		///Returns a stim::vec that contains the x, y, z sizes of the voxel.
-		vec<float>
-		getDims()
-		{
-			vec<float> dims(S[1], S[2], S[3]);
+		vec<float> getDims(){
+			vec<float> dims(grid<T, 4, F>::S[1], grid<T, 4, F>::S[2], grid<T, 4, F>::S[3]);
 			return dims;
-		}	
+		}
  
-		///@param file_Path location of the directory with the files
-		///	Sets the path and calls the loader on that path.
-		void
-		setPath(std::string file_path)
-		{
-			path = file_path;
-			image_stack<T>::load_images(path.append("/*.jpg"));
-			setTextureType();
+		/// Loads a series of files specified by a list of strings
+		/// @param file_list is the vector of file names as strings
+		void load_images(std::vector<std::string> file_list){
+			image_stack<T, F>::load_images(file_list);			//load the images
+			guess_parameters();
 		}
-		
-		///	Returns an std::string path associated with an instance of the gl_texture class.
-		std::string
-		getPath()
-		{
-			return path;
+
+		///@param file_mask specifies the file(s) to be loaded
+		///	Sets the path and calls the loader on that path.
+		void load_images(std::string file_mask){
+			image_stack<T, F>::load_images(file_mask);			//load images through the declared base class (matches the file-list overload)
+			guess_parameters();									//derive texture type, format and data types from the loaded data
 		}
  
 		///	Returns the GLuint id of the texture created by/associated with the 
-		///	instance of the gl_texture class.
-				
-		GLuint
-		getTexture()
-		{
+		///	instance of the gl_texture class.				
+		GLuint getTexture(){
 			return texID;
 		}
  
-		///	Creates a texture and from the loaded data and
-		///	assigns that texture to texID
-		//TO DO :::: 1D textures
-	        //TO DO:::add methods for handling the cases of T
-		// and convert them to GL equivalent.
-		// i.e. an overloaded function that handles paramenter conversion.	
-		void
-		createTexture()
-		{
-			glPixelStorei(GL_UNPACK_ALIGNMENT,1);
-			glGenTextures(1, &texID);
-			glBindTexture(texture_type, texID);
-			glTexParameteri(texture_type,
-				 GL_TEXTURE_MIN_FILTER,
-				 interpType);
-			glTexParameteri(texture_type,
-				 GL_TEXTURE_MAG_FILTER,
-				 interpType);
-			switch(texture_type)
-			{
-				case GL_TEXTURE_3D:
-					glTexParameteri(texture_type,
-						 GL_TEXTURE_WRAP_S,texWrap);
-			// GL_REPEAT);
-			// GL_CLAMP_TO_EDGE);
-					glTexParameteri(texture_type,
-						 GL_TEXTURE_WRAP_T,texWrap);
-			// GL_REPEAT);
-			// GL_CLAMP_TO_EDGE);
-					glTexParameteri(texture_type,
-						 GL_TEXTURE_WRAP_R,texWrap);
-			// GL_REPEAT);
-			// GL_CLAMP_TO_EDGE);
-					glTexImage3D(texture_type,
-						0,
-					//	GL_RGB16,
-						1,
-						R[1],
-						R[2],
-						R[3],
-						0,
-						format,
-						type, 
-						ptr);
-					//GL_UNSIGNED_BYTE can be TYPES, convert to GL equivalents
-					glPixelStorei(GL_PACK_ALIGNMENT,1);
-					break;
-				case GL_TEXTURE_2D:
-					glTexParameteri(texture_type,
-						 GL_TEXTURE_WRAP_S, texWrap);
-					glTexParameteri(texture_type,
-						 GL_TEXTURE_WRAP_T, texWrap);
-					glTexImage2D(texture_type,
-						0,
-						1,
-						R[1],
-						R[2],
-						0,
-						format,
-						type, 
-						ptr);
-					break;
-			}
-		}
-			///Temporary methods for debugging and testing are below.
-			///Self-explanatory.
  
-		T*
-		getData()
-		{
+		T* getData(){
 			return ptr;
 		}
  
@@ -15,78 +15,119 @@ namespace stim{
 	Functions are provided for saving and loading binary data.
  
 **/
-template<typename T, unsigned int D = 1>
+template<typename T, unsigned int D = 1, typename F = float>
 class grid{
  
 protected:
  
-	stim::vec<unsigned long> R;		//elements in each dimension
-	stim::vec<float> S;
+	size_t R[D];							//elements in each dimension
+	F S[D];									//spacing between element samples
 	T* ptr;									//pointer to the data (on the GPU or CPU)
  
-	///Return the total number of values in the binary file
-	unsigned long samples(){
-
-		unsigned long s = 1;
-		for(unsigned int d = 0; d < D; d++)
-			s *= R[d];
-
-		return s;
-	}
+	
  
 	///Initializes a grid by allocating the necessary memory and setting all values to zero
-	void init(){
-
-		//calculate the total number of values
-		unsigned long S = samples();
-
-		//allocate memory to store the grid
-		ptr = (T*)malloc(sizeof(T) * S);
-
-		//initialize the memory to zero
-		memset(ptr, 0, sizeof(T) * S);
+	void init(){		
+		ptr = NULL;										//initialize the data pointer to NULL
+		memset(R, 0, sizeof(size_t) * D);				//set the resolution to zero
+		for(size_t d = 0; d < D; d++) S[d] = (F)1.0;	//initialize the spacing to unity
+	}
  
+	/// Allocates zero-initialized storage for samples() elements of type T,
+	/// releasing any block that ptr already refers to.
+	void alloc(){
+		if(ptr != NULL) free(ptr);						//if memory has already been allocated, free it
+		size_t N = samples();							//calculate the total number of values
+		ptr = (T*)calloc(N, sizeof(T));					//calloc takes (element count, element size)
 	}
  
 public:
  
 	///Default constructor doesn't do anything
 	grid(){
-		ptr = NULL;			//set the pointer to NULL so that we know nothing is allocated
+		init();
 	}
  
 	///Constructor used to specify the grid size as a vector
  
 	/// @param _R is a vector describing the grid resolution
-	grid( stim::vec<unsigned long> _R){
-		
-		//set the grid resolution
-		R = _R;
-
+	grid( stim::vec<size_t> _R){
+		init();											//reset ptr, R and S first: init() zeroes R, so it must run before R is set
+		for (size_t d = 0; d < D; d++)
+			R[d] = _R[d];								//store the requested resolution
+		alloc();										//allocate zero-initialized storage for the grid
 	}
  
+	///Return the total number of values in the binary file
+	size_t samples(){
+		size_t s = 1;
+		for(size_t d = 0; d < D; d++)
+			s *= R[d];
+		return s;
+	}
+
+	///Return the number of bytes in the binary file
+	size_t bytes(){
+		return samples() * sizeof(T);
+	}
+
 	void
-	setDim(stim::vec<float> s)
-	{
-		S = s;
+	setDim(stim::vec<float> s){
+		for(size_t d = 0; d < D; d++)
+			S[d] = s[d];
 	}
  
 	///Constructor used to specify the grid size as a set of parameters
-
 	/// @param X0... is a list of values describing the grid size along each dimension
-	grid( unsigned long X0, ...){
+	/*grid( size_t X0, ...){
+		R[0] = X0;									//set the grid size of the first dimension
+		va_list ap;									//get a variable list
+		va_start(ap, X0);							//start the variable list at the first element
+		for(size_t d = 1; d<D; d++)					//for each additional element
+			R[d] = va_arg(ap, size_t);				//read the value from the variable list as a size_t
+		va_end(ap);
+		init();										//initialize the grid
+	}*/
+
+	///Set the spacing between grid sample points
+	/// @param X0... is a list of values describing the grid sample spacing
+	/*void spacing(F X0, ...) {
+		S[0] = X0;											//set the grid size of the first dimension
+		va_list ap;											//get a variable list
+		va_start(ap, X0);									//start the variable list at the first element
+		for (size_t d = 1; d<D; d++)						//for each additional element
+			S[d] = va_arg(ap, F);						//read the value from the variable list as a size_t
+		va_end(ap);
+	}*/
+
+	/// Set the spacing between grid sample points for the specified dimension
+	void spacing(size_t d, F sp){
+		if(d < D) S[d] = sp;
+		else{
+			std::cout<<"error in stim::grid::spacing() - insufficient dimensions"<<std::endl;
+			exit(1);
+		}
+	}
  
-		R[0] = X0;
+	/// Returns the sample spacing stored for dimension d.
+	/// Prints an error message and terminates the program when d is not a valid dimension (d >= D).
+	F spacing(size_t d){
+		if(d >= D){										//reject dimensions the grid does not have
+			std::cout<<"error in stim::grid::spacing() - insufficient dimensions"<<std::endl;
+			exit(1);
+		}
+		return S[d];
+	}
  
-		va_list ap;
-		va_start(ap, X0);
-		for(unsigned int d = 1; d<D; d++)
-			R[d] = va_arg(ap, unsigned long);
-		va_end(ap);
+	/// Get the sample spacing for the given dimension
+	F get_spacing(size_t d) {
+		return S[d];
+	}
  
-		init();
+	/// Get the size of the grid along the specified dimension
+	F size(size_t d){
+		return (F)R[d] * S[d];
+	}
  
+	/// Return the number of samples
+	size_t samples(size_t d){
+		return R[d];
 	}
  
 	///Writes the binary data to disk
@@ -94,13 +135,9 @@ public:
 	/// @param filename is the name of the binary file to be written
 	void write(std::string filename){
  
-		std::fstream file;
-
-		//open the file as binary for reading
-		file.open(filename.c_str(), std::ios::out | std::ios::binary);
-
-		//write file to disk
-		file.write((char *)ptr, samples() * sizeof(T));
+		std::fstream file;		
+		file.open(filename.c_str(), std::ios::out | std::ios::binary);		//open the file as binary for reading		
+		file.write((char *)ptr, samples() * sizeof(T));						//write file to disk
 	}
  
 	///Loads a binary file from disk
@@ -108,66 +145,52 @@ public:
 	/// @param filename is the name of the file containing the binary data
 	/// @param S is the size of the binary file along each dimension
 	/// @param header is the size of the header in bytes
-	void read(std::string filename, stim::vec<unsigned long> S, unsigned long header = 0){
-
-		R = S;	//set the sample resolution
-
-		//allocate space for the data
-		init();
-
-		std::fstream file;
-
-		//open the file as binary for writing
-		file.open(filename.c_str(), std::ios::in | std::ios::binary);
-
-		//seek past the header
-		file.seekg(header, std::ios::beg);
-
-
-		//read the data
-		file.read((char *)ptr, samples() * sizeof(T));
+	void read(std::string filename, stim::vec<size_t> X, unsigned long header = 0){
+		for(size_t d = 0; d < D; d++)
+			R[d] = X[d];													//set the sample resolution
+		alloc();															//allocate space for the data (init() would zero R again and leave ptr NULL)
+		std::fstream file;
+		file.open(filename.c_str(), std::ios::in | std::ios::binary);		//open the file as binary for reading
+		file.seekg(header, std::ios::beg);									//seek past the header
+		file.read((char *)ptr, samples() * sizeof(T));						//read the data
 	}
  
 	///Gets a single value from the grid given a set of coordinates
-
 	/// @param x0... is a list of coordinates specifying the desired value
-	T get(unsigned long x0, ...){
+	/*T get(unsigned long x0, ...){
  
-		va_list ap;
+		va_list ap;									//create a variable list
  
-		unsigned long F = 1;
-		unsigned long p = x0;
+		unsigned long F = 1;						//initialize the dimension size to 1
+		unsigned long idx = x0;
  
-		va_start(ap, x0);
-		for(unsigned int d = 1; d<D; d++){
-			F *= R[d-1];
-			p += va_arg(ap, unsigned int) * F;
+		va_start(ap, x0);							//start a variable list
+		for(unsigned int d = 1; d<D; d++){			//for each dimension
+			F *= R[d-1];							//get the size of the first dimension
+			idx += va_arg(ap, unsigned int) * F;	//increment the index
 		}
 		va_end(ap);
  
-		return ptr[p];
-	}
+		return ptr[idx];							//access the appropriate element and return the value
+	}*/
  
 	///Sets a value in the grid
  
 	/// @param value is the grid point value
 	/// @x0... is the coordinate of the value to be set
-	void set(T value, unsigned long x0, ...){
-
-		va_list ap;
-		
-		unsigned long F = 1;
-		unsigned long p = x0;
-
-		va_start(ap, x0);
-		for(unsigned int d = 1; d<D; d++){
-			F *= R[d-1];
-			p += va_arg(ap, unsigned int) * F;
+	/*void set(T value, unsigned long x0, ...){
+		va_list ap;									//create a variable list		
+		unsigned long F = 1;						//initialize the dimension counter to 1
+		unsigned long idx = x0;						//initialize the index to the first variable
+
+		va_start(ap, x0);							//start the variable list
+		for(unsigned int d = 1; d<D; d++){			//for each dimension
+			F *= R[d - 1];
+			idx += va_arg(ap, unsigned int) * F;	//update the index
 		}
 		va_end(ap);
-
-		ptr[p] = value;
-	}
+		ptr[idx] = value;							//set the value at the indexed location
+	}*/
  
  
 	///Outputs grid data as a string
@@ -179,13 +202,11 @@ public:
 		for(unsigned int d = 0; d<D; d++){
 			if(d!=0) result<<", ";
 			result<<R[d];
-
 		}
-
 		result<<"]"<<std::endl;
  
 		//calculate the number of values to output
-		unsigned long nV = min((unsigned long long)R[0], (unsigned long long)10);
+		unsigned long nV = std::min((unsigned long long)R[0], (unsigned long long)10);
  
 		for(unsigned long v = 0; v<nV; v++){
 			result<<ptr[v];
@@ -8,83 +8,112 @@
  
 namespace stim{
  
-/**This class is used to load 3D grid data from stacks of images
-	The class uses a 4D grid object, where the first dimension is a color value.
-**/
-template<typename T>
-class image_stack : public virtual stim::grid<T, 4>{
+///This class is used to load 3D grid data from stacks of images
+//	The class uses a 4D grid object, where the first dimension is a color value.
+template<typename T, typename F = float>
+class image_stack : public virtual stim::grid<T, 4, F>{
  
 	enum image_type {stimAuto, stimMono, stimRGB, stimRGBA};
  
 protected:
-	using stim::grid<T, 4>::S;
+	//using stim::grid<T, 4>::S;
 	using stim::grid<T, 4>::R;
 	using stim::grid<T, 4>::ptr;
-	using stim::grid<T, 4>::samples;
 	using stim::grid<T, 4>::read;
  
 public:
+	//default constructor
+	image_stack() : grid<T, 4>() {
  
-	///Load an image stack based on a file mask. Images are loaded in alphanumeric order.
+	}
  
-	/// @param file_mask is the mask describing images to be loaded
-	void load_images(std::string file_mask){
+	/// Overloads grid::samples() to return the number of samples associated with a given spatial dimension
+	///		this is necessary because R[0] stores the color
+	size_t samples(size_t d){
+		return grid<T, 4, F>::samples(d + 1);
+	}
  
-		stim::filename file_path(file_mask);
+	size_t samples(){
+		return R[1] * R[2] * R[3];						//return the number of spatial samples
+	}
+
+	/// Returns the number of color channels
+	size_t channels(){
+		return R[0];
+	}
+
+	/// Overloads grid::size() to return the size of the grid associated with a given spatial dimension
+	F size(size_t d){
+		return grid<T, 4, F>::size(d + 1);
+	}
  
-		//get the list of files
-		std::vector<stim::filename> file_list = file_path.get_list();
+	/// Sets the spacing between samples in the image stack
+	void spacing(F sx, F sy, F sz){
+		grid<T, 4, F>::S[1] = sx;			//set the sample spacing for the appropriate spatial dimension
+		grid<T, 4, F>::S[2] = sy;
+		grid<T, 4, F>::S[3] = sz;
+	}
+
+	F spacing(size_t d){
+		return grid<T, 4, F>::spacing(d + 1);
+	}
+
+	/// Overloads the spacing parameter to set the size of the grid associated with a given spatial dimension
+	//void spacing(F sx, F sy = 1.0f, F sz = 1.0f){
+	//	grid<T, 4, F>::spacing((F)1.0, sx, sy, sz);
+	//}
+
+	/// Load all of the images specified by a list of strings
+	/// @param string_list is a list of file names specifying images
+	void load_images(std::vector<std::string> string_list){
  
 		//if there are no matching files, exit
-		if(file_list.size() == 0){
+		if(string_list.size() == 0){
 			std::cout<<"STIM ERROR (image_stack): No matching files for loading a stack."<<std::endl;
 			exit(1);
 		}
-		//for(int i = 0; i < file_list.size(); i++)
-		//	std::cout << file_list[i].str() << std::endl;
  
-		//load the first image and set all of the image_stack properties
-		stim::image<T> I(file_list[0].str());
+		stim::image<T> I(string_list[0]);		//load the first image and set all of the image_stack proparties
  
-		//set the image resolution and number of channels
-		R.push(I.channels());
-		R.push(I.width());
-		R.push(I.height());
-		R.push(file_list.size());
+		R[0] = I.channels();				//set the number of color channels
+		R[1] = I.width();				//set the stack height and width based on the image size
+		R[2] = I.height();
+		R[3] = string_list.size();			//set the stack z-resolution based on the number of images
  
-		//allocate storage space
-		ptr = (T*)malloc(sizeof(T) * samples());
+		ptr = (T*)malloc(grid<T, 4, F>::bytes());	//allocate storage space
  
 		//load and copy each image into the grid
-		for(unsigned int i = 0; i<R[3]; i++){
-			//load the image
-			stim::image<T> I(file_list[i].str());
+		for(unsigned int i = 0; i<R[3]; i++){				//for each image in the list			
+			stim::image<T> I(string_list[i]);			//load the image			
+			I.get_interleaved_rgb(&ptr[ i * R[0] * R[1] * R[2] ]);	//retrieve the interlaced data from the image - store it in the grid			
+		}
+	}
  
-			//retrieve the interlaced data from the image - store it in the grid
-			I.get_interleaved_rgb(&ptr[ i * R[0] * R[1] * R[2] ]);
-			
+	/// Load a stack of images based on a file mask. Images are loaded in alphanumeric order
+	/// @param file_mask is the mask describing the images to be loaded
+	void load_images(std::string file_mask){
+		stim::filename file_path(file_mask);				//get the path for the images
+		std::vector<stim::filename> file_list = file_path.get_list();	//get the list of files
+		std::vector<std::string> string_list(file_list.size());		//allocate space for an array of strings
+		for(size_t f = 0; f < file_list.size(); f++){			//for each file name in the list
+			string_list[f] = file_list[f].str();			//convert the file name to a string
 		}
+		load_images(string_list);					//load all of the images in the list
 	}
  
 	///Inserts image I into slot i.
 	/// @param stim::image<T> I; image to insert.
 	/// @int I, where to place the image.
-	void insert_image(stim::image<T> I, int i)
-	{
+	void insert_image(stim::image<T> I, int i){
 		I.get_interleaved_rgb(&ptr[i *R[0] *R[1] *R[2] ]);
 	}
  
 	///Saves a single page to an image file
 	/// @param file_name is the name of the image file to be created
 	/// @param i is the page to be saved
-	void save_image(std::string file_name, unsigned int i){
-
-		//create an image
-		stim::image<T> I;
-
-		//retrieve the interlaced data from the image - store it in the grid
-		I.set_interleaved_rgb(&ptr[ i * R[0] * R[1] * R[2] ], R[1], R[2], R[0]);
-
+	void save_image(std::string file_name, unsigned int i){		
+		stim::image<T> I;											//create an image		
+		I.set_interleaved_rgb(&ptr[ i * R[0] * R[1] * R[2] ], R[1], R[2], R[0]);	//retrieve the interlaced data from the image - store it in the grid
 		I.save(file_name);
 	}
  
@@ -96,10 +125,10 @@ public:
 	void
 	set_dim(float x, float y, float z)
 	{
-		S[0] = 1;
-		S[1] = x;
-		S[2] = y;
-		S[3] = z;
+		grid<T, 4, F>::S[0] = 1;
+		grid<T, 4, F>::S[1] = x;
+		grid<T, 4, F>::S[2] = y;
+		grid<T, 4, F>::S[3] = z;
 	}
  
 	///set dimensions of the grid.
@@ -124,12 +153,6 @@ public:
  
 		stim::filename file_path(file_mask);
  
-		//if the file path is relative, update it with the current working directory
-//		if(file_path.is_relative()){
-//			stim::filename wd = stim::filename::cwd();
-//			file_path = wd.get_relative(file_mask);
-//		}
-
 		//create a list of file names
 		std::vector<std::string> file_list = stim::wildcards::increment(file_path.str(), 0, R[3]-1, 1);
  
@@ -159,7 +159,10 @@ public:
 			std::cout<<"ERROR stim::image::load() - unable to find image "<<filename<<std::endl;
 			exit(1);
 		}
-		allocate(cvImage.cols, cvImage.rows, cvImage.channels());			//allocate space for the image
+		int cols = cvImage.cols;
+		int rows = cvImage.rows;
+		int channels = cvImage.channels();
+		allocate(cols, rows, channels);			//allocate space for the image
 		unsigned char* cv_ptr = (unsigned char*)cvImage.data;
 		if(C() == 1)														//if this is a single-color image, just copy the data
 			memcpy(img, cv_ptr, bytes());
@@ -217,6 +217,7 @@ public:
 		return result;
 	}
  
+//#ifndef __NVCC__
 	/// Outputs the vector as a string
 	std::string str() const{
 		std::stringstream ss;
@@ -234,6 +235,7 @@ public:
  
 		return ss.str();
 	}
+//#endif
  
 	size_t size(){ return 3; }
  
@@ -523,7 +523,11 @@ namespace stim{
         std::string arg(size_t a){
         	return args[a];
         }
-
+	
+	/// Returns an std::vector of argument strings
+	std::vector<std::string> arg_vector(){
+		return args;
+	}
         ///Returns an object describing the argument
  
         /// @param _name is the name of the requested argument
@@ -110,12 +110,17 @@ protected:
 			unix_dir = unix_dir.substr(2, unix_dir.length()-2);	//extract the directory structure
 		}
  
-		if(unix_dir.at(0) == '/'){						//if there is a leading slash
-			relative = false;								//the path is not relative
-			unix_dir = unix_dir.substr(1, unix_dir.length() - 1);	//remove the slash
+		if(drive.size() != 0){
+			relative = false;
+		}
+		if(unix_dir.size() > 0){										//if there is a directory specified, remove surrounding slashes
+			if(unix_dir[0] == '/'){						//if there is a leading slash
+				relative = false;								//the path is not relative
+				unix_dir = unix_dir.substr(1, unix_dir.length() - 1);	//remove the slash
+			}
+			if(unix_dir[unix_dir.size()-1] == '/')
+				unix_dir = unix_dir.substr(0, unix_dir.length() - 1);
 		}
-		if(unix_dir.at(unix_dir.size()-1) == '/')
-			unix_dir = unix_dir.substr(0, unix_dir.length() - 1);
  
 		path = stim::parser::split(unix_dir, '/');					//split up the directory structure
  
@@ -186,6 +186,7 @@ public:
 		d = vec3<float>(0, 0, 1);
 		up = vec3<float>(0, 1, 0);
 		focus = 1;
+		fov = 60;
  
 	}
  
@@ -4,13 +4,13 @@
 #include <string>
 #include <stdlib.h>
 #include <cmath>
-#include "cublas_v2.h"
  
 #ifdef _WIN32
 	#include <float.h>
 #endif
  
 #ifdef __CUDACC__
+#include "cublas_v2.h"
 #include <stim/cuda/cudatools/error.h>
 #endif