Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib

Pavel Govyadinov
2 parents 42b3c74f c6251f8b
Showing 4 changed files with 375 additions and 69 deletions Show diff stats
stim/envi/binary.h
stim/envi/bsq.h
stim/envi/envi.h
stim/envi/hsi.h
@@ -8,7 +8,7 @@
 #include <fstream>
 #include <sys/stat.h>
 #include <cstring>
-
+#include <chrono>
 #ifdef _WIN32
 #include <Windows.h>
 #else
@@ -17,6 +17,186 @@
 namespace stim{
+/// This class calculates the optimal setting for independent parameter b (batch size) for
+///		minimizing the dependent parameter bps (bytes per second)
+class stream_optimizer{
+protected:
+	size_t Bps[2];							//bytes per second for the previous batch
+	size_t interval_B;					//number of bytes processed this interval
+	size_t interval_ms;					//number of milliseconds spent in the current interval
+	size_t n[2];							//current batch size (in bytes)
+	size_t h;							//spacing used for finite difference calculations
+	size_t dn;							//delta value (in bytes) for setting the batch size (minimum change in batch parameter)
+	size_t maxn;						//maximum value for the batch size
+
+	double alpha;						//alpha value controls the factor of the gradient that is used to calculate the next point (speed of convergence)
+
+	bool sample_step;					//calculating the derivative (this class alternates between calculating dBps and B)
+	bool forward_diff;					//evaluate the derivative using forward differences
+
+	size_t window_ms;					//size of the interval (in milliseconds) integrated to get a reliable bps value
+
+	// This function rounds x to the nearest value within dB
+	size_t round_limit(double n0){
+		if(n0 < 0) return dn;					//if n0 is less than zero, return the lowest possible n
+
+		size_t new_n = (size_t)(n0 + 0.5);		//now n0 must be positive, so round it to the nearest integer
+		if(new_n > maxn) new_n = maxn;			//limit the returned size of x to within the specified bounds
+
+		size_t lowest = new_n / dn;
+		size_t highest = lowest + dn;
+		size_t diff[2] = {new_n - lowest, highest - new_n};	//calculate the two differences
+		if(diff[0] < diff[1])
+			return lowest;
+		return highest;
+	}
+
+public:
+
+	//constructor initializes a stream optimizer
+	stream_optimizer(size_t min_batch_size, size_t max_batch_size, double a = 0.003, size_t probe_step = 5, size_t window = 2000){
+		//Bps = 0;						//initialize to zero bytes per second processed
+		Bps[0] = Bps[1] = 0;			//initialize the bits per second to 0
+		interval_B = 0;					//zero bytes have been processed at initialization
+		interval_ms = 0;				//no time has been spent on the batch so far
+		dn = min_batch_size;			//set the minimum batch size as the minimum change in batch size
+		maxn = max_batch_size;			//set the maximum batch size
+		n[0] = max_batch_size;			//set B
+		h = (max_batch_size / min_batch_size) / probe_step * dn;
+		std::cout<<"h = "<<h<<std::endl;
+		if(h < dn) h = dn;
+		alpha = a;
+		//n[0] = round_limit( (max_batch_size - min_batch_size)/2 );
+		window_ms = window;		//minimum integration interval (for getting a reliable bps measure)
+		sample_step = true;					//the first step is to calculate the derivative
+		forward_diff = true;			//start with the forward difference (since we start at the maximum batch size)
+	}
+
+	size_t update(size_t bytes_processed, size_t ms_spent, size_t& data_rate, bool VERBOSE = false){
+		interval_B += bytes_processed;		//increment the number of bytes processed
+		interval_ms += ms_spent;			//increment the number of milliseconds spent processing
+		data_rate = interval_B / interval_ms;
+
+		//if we have sufficient information to evaluate the optimization function at this point
+		if(interval_ms < window_ms){					//if insufficient time has passed to get a reliable Bps measurement
+			return n[0];
+		}
+		else{											//if we have collected enough information for a reliable Bps estimate
+			
+			if(Bps[0] == 0){							//if n[0] hasn't been evaluated yet, this is the first step
+				Bps[0] = data_rate;						//set the initial Bps value
+				n[1] = n[0] - h;						//set the position of the next sample point
+				if(VERBOSE)
+					std::cout<<"Bps value at n = "<<n[0]<<" is "<<Bps[0]<<" Bps, probing n = "<<n[1]<<std::endl;
+				return n[1];							//return the probe point
+			}
+			else{
+				Bps[1] = data_rate;						//set the Bps for the current point (n[1])
+
+				double Bps_p;							//allocate a variable for the derivative
+				//calculate the derivative
+				if(n[0] < n[1]){						//if the current point is less than the previous one (probably the most common)
+					Bps_p = ((double)Bps[1] - (double)Bps[0]) / (double)h;		//calculate the derivative using the forward finite difference
+				}
+				else{
+					Bps_p = ((double)Bps[0] - (double)Bps[1]) / (double)h;		//calculate the derivative using the backward finite difference
+				}
+				if(VERBOSE)
+					std::cout<<"     probed n = "<<n[1]<<" with "<<Bps[1]<<" Bps, gradient = "<<Bps_p<<" Bps"<<std::endl;
+
+				double new_n_precise = n[0] + alpha * Bps_p;			//calculate the next point (snap to closest integer)
+				size_t new_n_nearest = round_limit(new_n_precise);		//calculate the next point (given batch parameters)
+
+				if(new_n_nearest == n[0]){								//if the newest point is the same as the original point
+					Bps[0] = Bps[1];									//update the Bps
+					//if(n[0] == dn) n[1] = n[0] + h;					//if we're on the left edge, probe forward
+					//else n[1] = n[0] - h;								//otherwise probe backwards
+					if(VERBOSE)
+						std::cout<<"     staying at n = "<<n[0]<<" for now"<<std::endl;
+					//return n[1];										//return the probe point
+
+					Bps[0] = 0;											//reset the Bps for the current point
+					return n[0];										//return the current point for a re-calculation
+				}
+				else{													//if the newest point is different from the original point
+					n[0] = new_n_nearest;								//move to the new point
+					Bps[0] = 0;											//set the Bps to zero (point hasn't been tested)
+					if(VERBOSE)
+						std::cout<<"     moving to n = "<<n[0]<<std::endl;
+					return n[0];										//return the new point
+				}
+			}
+		}
+	}
+
+	/*// this function updates the optimizer, given the number of bytes processed in an interval and time spent processing
+	size_t update(size_t bytes_processed, size_t ms_spent){
+		interval_B += bytes_processed;		//increment the number of bytes processed
+		interval_ms += ms_spent;			//increment the number of milliseconds spent processing
+
+		//if we have sufficient information to evaluate the optimization function at this point
+		if(interval_ms >= window_ms){					//if sufficient time has passed to get a reliable Bps measurement
+			size_t new_Bps = interval_B / interval_ms;	//calculate the current Bps
+
+			if(sample_step){							//if this is a sample step, collect the information for Bps = f(n0)
+				Bps = new_Bps;							//set the Bps to the evaluated value
+				n[1] = n[0] - dn;								//reduce the batch size by one delta to take a second sample
+				if(n[1] == 0){							//if the resulting batch size is zero
+					n[1] = 2*dn;						//we're at the left edge: set the new sample point to 2*dn
+				}
+
+				interval_B = interval_ms = 0;			//start a new interval at the new sample point
+				sample_step = false;					//next step will calculate the new batch size via optimization
+				return n[1];								//return the new batch size
+			}
+			else{								//if we have sufficient information to evaluate the derivative and optimize
+				double f = (double)new_Bps;				//we have evaluated the function at this location
+				double fprime;
+				if(n[1] < n[0] ){									//if the new point is less than the previous point (usually the case)
+					fprime = (double)(Bps - new_Bps) / (double)dn;	//calculate the forward difference
+				}
+				else{												//if the new point is larger (only happens at the minimum limit)
+					fprime = (double)(new_Bps - Bps) / (double)dn;	//calculate the backward difference
+				}
+				size_t bestn = n[1] - (size_t)(f / fprime);			//calculate the best value for B using Newton's method
+				n[0] = round_limit( (size_t)bestn );						//set the new dependent point
+				sample_step = true;									//the next step will be a sample step
+			}
+
+		}
+		if(sample_step) return n[0];
+		return n[1];										//insufficient information, keep the same batch size
+	}*/
+
+	/*size_t update(size_t bytes_processed, size_t ms_spent){
+		interval_B += bytes_processed;		//increment the number of bytes processed
+		interval_ms += ms_spent;			//increment the number of milliseconds spent processing
+
+		//if( Bps[0] == 0 ){				//if the left boundary hasn't been processed
+
+
+		//if we have sufficient information to evaluate the optimization function at this point
+		if(interval_ms >= window_ms){
+			size_t new_Bps = interval_B / interval_ms;	//calculate the current Bps
+
+			if(Bps[0] == 0)							//if the left interval Bps hasn't been calculated
+				Bps[0] = interval_B / interval_ms;	//that is the interval being processed
+			else
+				Bps[1] = interval_B / interval_ms;	//otherwise the right interval is being processed
+
+			if(Bps[0] != 0 && Bps[1] != 0){			//if both intervals have been processed
+
+
+		}
+	}*/
+
+	/*size_t update(size_t bytes_processed, size_t ms_spent, size_t& data_rate, bool VERBOSE){
+		size_t time = update(bytes_processed, ms_spent, VERBOSE);
+		data_rate = Bps[0];
+		return time;
+	}*/
+};
+
 /** This class manages the streaming of large multidimensional binary files.
  *  Generally these are hyperspectral files with 2 spatial and 1 spectral dimension. However, this class supports
  *  other dimensions via the template parameter D.
@@ -36,6 +216,7 @@ protected:
 	unsigned char* mask;	//pointer to a character array: 0 = background, 1 = foreground (or valid data)
 	double progress;		//stores the progress on the current operation (accessible using a thread)
+	size_t data_rate;		//data rate (currently in Bps)
 	size_t buffer_size;		//available memory for processing large files
@@ -45,8 +226,9 @@ protected:
 		header = 0;											//initialize the header size to zero
 		mask = NULL;
-		progress = 0;
-		set_buffer();										//set the maximum buffer size to the default
+		progress = 0;										//initialize the progress for any algorithm to zero
+		data_rate = 0;										//initialize the data rate to zero
+		set_buffer_frac();										//set the maximum buffer size to the default
 	}
 	/// Private helper function that returns the size of the file on disk using system functions.
@@ -127,8 +309,12 @@ public:
 		progress = 0;
 	}
+	size_t get_data_rate(){
+		return data_rate;
+	}
+
 	//specify the maximum fraction of available memory that this class will use for buffering
-	void set_buffer(double mem_frac = 0.5){				//default to 50%
+	void set_buffer_frac(double mem_frac = 0.5){				//default to 50%
 #ifdef _WIN32
 		MEMORYSTATUSEX statex;
 		statex.dwLength = sizeof (statex);
@@ -141,6 +327,10 @@ public:
 #endif
 	}
+	void set_buffer_raw(size_t bytes){
+		buffer_size = bytes;
+	}
+
 	/// Open a binary file for streaming.
 	/// @param filename is the name of the binary file
@@ -404,8 +594,8 @@ public:
 	}
 	/// Reads a block specified by an (x, y, z) position and size using the largest possible contiguous reads
-	bool read(T* dest, size_t x, size_t y, size_t z, size_t sx, size_t sy, size_t sz){
-
+	size_t read(T* dest, size_t x, size_t y, size_t z, size_t sx, size_t sy, size_t sz){
+		auto t0 = std::chrono::high_resolution_clock::now();
 		size_t size_bytes = sx * sy * sz * sizeof(T);					//size of the block to read in bytes
 		size_t start = z * R[0] * R[1] + y * R[0] + x;						//calculate the start postion
@@ -415,10 +605,8 @@ public:
 		if(sx == R[0] && sy == R[1]){				//if sx and sy result in a contiguous volume along z
 			file.read((char*)dest, size_bytes);			//read the block in one pass
-			return true;
 		}
-
-		if(sx == R[0]){												//if sx is contiguous, read each z-axis slice can be read in one pass
+		else if(sx == R[0]){												//if sx is contiguous, read each z-axis slice can be read in one pass
 			size_t jump_bytes = (R[1] - sy) * R[0] * sizeof(T);		//jump between each slice
 			size_t slice_bytes = sx * sy * sizeof(T);				//size of the slice to be read
 			for(size_t zi = 0; zi < sz; zi++){						//for each z-axis slice
@@ -426,29 +614,31 @@ public:
 				dest += sx * sy;									//move the destination pointer to the next slice
 				file.seekg(jump_bytes, std::ios::cur);				//skip to the next slice in the file
 			}
-			return true;
 		}
-
-		//in this case, x is not contiguous so the volume must be read line-by-line
-		size_t jump_x_bytes = (R[0] - sx) * sizeof(T);				//number of bytes skipped in the x direction
-		size_t jump_y_bytes = (R[1] - sy) * R[0] * sizeof(T) + jump_x_bytes;	//number of bytes skipped between slices
-		size_t line_bytes = sx * sizeof(T);							//size of the line to be read
-		size_t zi, yi;
-		for(zi = 0; zi < sz; zi++){									//for each slice
-			file.read((char*)dest, line_bytes);							//read the first line
-			for(yi = 1; yi < sy; yi++){								//read each additional line
-				dest += sx;											//move the pointer in the destination block to the next line
-				file.seekg(jump_x_bytes, std::ios::cur);			//skip to the next line in the file
-				file.read((char*)dest, line_bytes);						//read the line to the destination block
+		else{
+			//in this case, x is not contiguous so the volume must be read line-by-line
+			size_t jump_x_bytes = (R[0] - sx) * sizeof(T);				//number of bytes skipped in the x direction
+			size_t jump_y_bytes = (R[1] - sy) * R[0] * sizeof(T) + jump_x_bytes;	//number of bytes skipped between slices
+			size_t line_bytes = sx * sizeof(T);							//size of the line to be read
+			size_t zi, yi;
+			for(zi = 0; zi < sz; zi++){									//for each slice
+				file.read((char*)dest, line_bytes);							//read the first line
+				for(yi = 1; yi < sy; yi++){								//read each additional line
+					dest += sx;											//move the pointer in the destination block to the next line
+					file.seekg(jump_x_bytes, std::ios::cur);			//skip to the next line in the file
+					file.read((char*)dest, line_bytes);						//read the line to the destination block
+				}
+				file.seekg(jump_y_bytes, std::ios::cur);				//skip to the beginning of the next slice
 			}
-			file.seekg(jump_y_bytes, std::ios::cur);				//skip to the beginning of the next slice
 		}
-		return false;
+		auto t1 = std::chrono::high_resolution_clock::now();
+		return std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count();
 	}
 	// permutes a block of data from the current interleave to the interleave specified (re-arranged dimensions to the order specified by [d0, d1, d2])
-	void permute(T* dest, T* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2){
+	size_t permute(T* dest, T* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2){
+		auto t0 = std::chrono::high_resolution_clock::now();
 		size_t d[3] = {d0, d1, d2};
 		size_t s[3] = {sx, sy, sz};
 		size_t p[3];// = {x, y, z};
@@ -467,7 +657,6 @@ public:
 					p[1] = y;
 					src_idx = z * sx * sy + y * sx;
 					dest_idx = p[d[2]] * s[d[0]] * s[d[1]] + p[d[1]] * s[d[0]];
-					//std::cout<<z<<", "<<y<<" ------- "<<p[d[2]]<<" * "<<s[d[0]]<<" * "<<s[d[1]]<<" + "<<p[d[1]]<<" * "<<s[d[0]]<<std::endl;
 					memcpy(dest + dest_idx, src + src_idx, x_bytes);
 				}
 			}
@@ -491,6 +680,8 @@ public:
 				}
 			}
 		}
+		auto t1 = std::chrono::high_resolution_clock::now();
+		return std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count();
 	}
 };
@@ -10,7 +10,7 @@
 #include <deque>
 #include <chrono>
 #include <future>
-
+#include <algorithm>
 namespace stim{
@@ -377,24 +377,40 @@ public:
 	}
-	void readlines(T* dest, size_t start, size_t n){
-		hsi<T>::read(dest, 0, start, 0, X(), n, Z());
+	size_t readlines(T* dest, size_t start, size_t n){
+		return hsi<T>::read(dest, 0, start, 0, X(), n, Z());
+	}
+
+	size_t writeblock(std::ofstream* f, T* src, size_t n){
+		auto t0 = std::chrono::high_resolution_clock::now();
+		f->write((char*)src, n);
+		auto t1 = std::chrono::high_resolution_clock::now();
+		return std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count();
 	}
 	/// Convert this BSQ file to a BIL
-	bool bil(std::string outname, bool PROGRESS = false){
+	bool bil(std::string outname, bool PROGRESS = false, bool VERBOSE = false, bool OPTIMIZATION = true){
 		const size_t buffers = 4;													//number of buffers required for this algorithm
+		
 		size_t mem_per_batch = binary<T>::buffer_size / buffers;					//calculate the maximum memory available for a batch
 		size_t slice_bytes = X() * Z() * sizeof(T);									//number of bytes in an input batch slice (Y-slice in this case)
 		size_t max_slices_per_batch = mem_per_batch / slice_bytes;					//maximum number of slices we can process in one batch given memory constraints
+
+		//if(VERBOSE){
+			std::cout<<"maximum memory available for processing: "<<(double)binary<T>::buffer_size/(double)1000000<<" MB"<<std::endl;
+			std::cout<<"     this supports a batch size of "<<max_slices_per_batch<<" Y-axis slices ("<<X()<<" x "<<Z()<<") = "<<X() * Z() * sizeof(T) * max_slices_per_batch/1000000<<" MB"<<std::endl;
+		//}
+
 		if(max_slices_per_batch == 0){														//if there is insufficient memory for a single slice, throw an error
 			std::cout<<"error, insufficient memory for stim::bsq::bil()"<<std::endl;
 			exit(1);
 		}
 		size_t max_batch_bytes = max_slices_per_batch * slice_bytes;				//calculate the amount of memory that will be allocated for all four buffers
+		stream_optimizer O(1, max_slices_per_batch);
+
 		T* src[2];																	//source double-buffer for asynchronous batching
 		src[0] = (T*) malloc(max_batch_bytes);
 		src[1] = (T*) malloc(max_batch_bytes);
@@ -403,54 +419,70 @@ public:
 		dst[1] = (T*) malloc(max_batch_bytes);
 		size_t N[2];																		//number of slices stored in buffers 0 and 1
-		N[0] = N[1] = min(Y(), max_slices_per_batch);										//start with the maximum number of slices that can be stored (may be the entire data set)
+		N[0] = N[1] = std::min<size_t>(Y(), max_slices_per_batch);										//start with the maximum number of slices that can be stored (may be the entire data set)
 		std::ofstream target(outname.c_str(), std::ios::binary);					//open an output file for writing
 																		//initialize with buffer 0 (used for double buffering)
 		size_t y_load = 0;
 		size_t y_proc = 0;
-		std::future<void> rthread;
+		std::future<size_t> rthread;
 		std::future<std::ostream&> wthread;										//create asynchronous threads for reading and writing
-		readlines(src[0], 0, N[0]);												//read the first batch into the 0 source buffer
-		y_load += N[0];															//increment the loaded slice counter
-		int b = 1;
-
-		std::chrono::high_resolution_clock::time_point t_start;						//high-resolution timers
-		std::chrono::high_resolution_clock::time_point t_end;
+		std::chrono::high_resolution_clock::time_point t_start, pt_start;						//high-resolution timers
+		std::chrono::high_resolution_clock::time_point t_end, pt_end;
 		size_t t_batch;																//number of milliseconds to process a batch
-		size_t t_total = 0;
+		size_t t_total = 0;														//total time for operation
+		size_t pt_total = 0;													//total time spent processing data
+		size_t rt_total = 0;													//total time spent reading data
+		size_t wt_total = 0;
+		size_t dr = 0;
+		
+		rt_total += readlines(src[0], 0, N[0]);					//read the first batch into the 0 source buffer
+		y_load += N[0];											//increment the loaded slice counter
+		int b = 1;												//initialize the double buffer to 0
 		while(y_proc < Y()){													//while there are still slices to be processed
 			t_start = std::chrono::high_resolution_clock::now();					//start the timer for this batch
 			if(y_load < Y()){													//if there are still slices to be loaded, load them
+				//if(y_proc > 0){
+					
+					
+				//}
 				if(y_load + N[b] > Y()) N[b] = Y() - y_load;					//if the next batch would process more than the total slices, adjust the batch size
 				rthread = std::async(std::launch::async, &stim::bsq<T>::readlines, this, src[b], y_load, N[b]);
-				
+				//rt_total += rthread.get();
 				y_load += N[b];													//increment the number of loaded slices
 			}
 			b = !b;																//swap the double-buffer
-
-			binary<T>::permute(dst[b], src[b], X(), N[b], Z(), 0, 2, 1);		//permute the batch to a BIL file
-			target.write((char*)dst[b], N[b] * slice_bytes);					//write the permuted data to the output file
+			pt_total += binary<T>::permute(dst[b], src[b], X(), N[b], Z(), 0, 2, 1);		//permute the batch to a BIL file
+			wt_total += writeblock(&target, dst[b], N[b] * slice_bytes);			//write the permuted data to the output file
 			y_proc += N[b];														//increment the counter of processed pixels
 			if(PROGRESS) progress = (double)( y_proc + 1 ) / Y() * 100;			//increment the progress counter based on the number of processed pixels
+			if(y_load < Y()) rt_total += rthread.get();					//if a new batch was set to load, make sure it loads after calculations
 			t_end = std::chrono::high_resolution_clock::now();
 			t_batch = std::chrono::duration_cast<std::chrono::milliseconds>(t_end-t_start).count();
 			t_total += t_batch;
-			rthread.wait();
+			if(OPTIMIZATION)
+				N[b] = O.update(N[!b] * slice_bytes, t_batch, binary<T>::data_rate, VERBOSE);					//set the batch size based on optimization
+			//binary<T>::data_rate = dr;
+			//std::cout<<"New N = "<<N[!b]<<" selected with "<<(double)data_rate / 1000000<<" MB/s"<<std::endl;
 		}
-
-		std::cout<<"Total time to execute: "<<t_total<<" ms"<<std::endl;
+		
 		free(src[0]);															//free buffer resources
 		free(src[1]);
 		free(dst[0]);
 		free(dst[1]);
+		if(VERBOSE){
+			std::cout<<"total time to execute bsq::bil(): "<<t_total<<" ms"<<std::endl;
+			std::cout<<"     total time spent processing: "<<pt_total<<" ms"<<std::endl;
+			std::cout<<"        total time spent reading: "<<rt_total<<" ms"<<std::endl;
+			std::cout<<"        total time spent writing: "<<wt_total<<" ms"<<std::endl;
+		}
 		return true;															//return true
 	}
 	/// Convert this BSQ file to a BIP
-	bool bip(std::string outname, bool PROGRESS = false){
+	bool bip(std::string outname, bool PROGRESS = false, bool VERBOSE = false){
 		const size_t buffers = 4;													//number of buffers required for this algorithm
 		size_t mem_per_batch = binary<T>::buffer_size / buffers;					//calculate the maximum memory available for a batch
@@ -471,13 +503,13 @@ public:
 		dst[1] = (T*) malloc(max_batch_bytes);
 		size_t N[2];																		//number of slices stored in buffers 0 and 1
-		N[0] = N[1] = min(Y(), max_slices_per_batch);										//start with the maximum number of slices that can be stored (may be the entire data set)
+		N[0] = N[1] = std::min<size_t>(Y(), max_slices_per_batch);										//start with the maximum number of slices that can be stored (may be the entire data set)
 		std::ofstream target(outname.c_str(), std::ios::binary);					//open an output file for writing
 																		//initialize with buffer 0 (used for double buffering)
 		size_t y_load = 0;
 		size_t y_proc = 0;
-		std::future<void> rthread;
+		std::future<size_t> rthread;
 		std::future<std::ostream&> wthread;										//create asynchronous threads for reading and writing
 		readlines(src[0], 0, N[0]);												//read the first batch into the 0 source buffer
@@ -488,6 +520,8 @@ public:
 		std::chrono::high_resolution_clock::time_point t_end;
 		size_t t_batch;																//number of milliseconds to process a batch
 		size_t t_total = 0;
+		size_t pt_total = 0;
+		size_t rt_total = 0;
 		while(y_proc < Y()){													//while there are still slices to be processed
 			t_start = std::chrono::high_resolution_clock::now();					//start the timer for this batch
 			if(y_load < Y()){													//if there are still slices to be loaded, load them
@@ -499,17 +533,21 @@ public:
 			b = !b;																//swap the double-buffer
-			binary<T>::permute(dst[b], src[b], X(), N[b], Z(), 2, 0, 1);		//permute the batch to a BIP file
+			pt_total += binary<T>::permute(dst[b], src[b], X(), N[b], Z(), 2, 0, 1);		//permute the batch to a BIP file
 			target.write((char*)dst[b], N[b] * slice_bytes);					//write the permuted data to the output file
 			y_proc += N[b];														//increment the counter of processed pixels
 			if(PROGRESS) progress = (double)( y_proc + 1 ) / Y() * 100;			//increment the progress counter based on the number of processed pixels
 			t_end = std::chrono::high_resolution_clock::now();
 			t_batch = std::chrono::duration_cast<std::chrono::milliseconds>(t_end-t_start).count();
 			t_total += t_batch;
-			rthread.wait();
+			if(y_load < Y()) rt_total += rthread.get();					//if a batch was threaded to load, make sure it finishes
 		}
-		std::cout<<"Total time to execute: "<<t_total<<" ms"<<std::endl;
+		if(VERBOSE){
+			std::cout<<"total time to execute bsq::bil(): "<<t_total<<" ms"<<std::endl;
+			std::cout<<"     total time spent processing: "<<pt_total<<" ms"<<std::endl;
+			std::cout<<"        total time spent reading: "<<rt_total<<" ms"<<std::endl;
+		}
 		free(src[0]);															//free buffer resources
 		free(src[1]);
 		free(dst[0]);
@@ -79,30 +79,64 @@ public:
 		return alloc_array(header.samples * header.lines);
 	}
-	void set_buffer(double memfrac = 0.5){
+	void set_buffer_frac(double memfrac = 0.5){
 		if(header.interleave == envi_header::BSQ){		//if the infile is bsq file
 			if(header.data_type ==envi_header::float32)
-				((bsq<float>*)file)->set_buffer(memfrac);
+				((bsq<float>*)file)->set_buffer_frac(memfrac);
 			else if(header.data_type == envi_header::float64)
-				((bsq<double>*)file)->set_buffer(memfrac);
+				((bsq<double>*)file)->set_buffer_frac(memfrac);
 			else
 				std::cout<<"ERROR: unidentified data type"<<std::endl;
 		}
 		else if(header.interleave == envi_header::BIL){		//if the infile is bil file
 			if(header.data_type ==envi_header::float32)
-				((bil<float>*)file)->set_buffer(memfrac);
+				((bil<float>*)file)->set_buffer_frac(memfrac);
 			else if(header.data_type == envi_header::float64)
-				((bil<double>*)file)->set_buffer(memfrac);
+				((bil<double>*)file)->set_buffer_frac(memfrac);
 			else
 				std::cout<<"ERROR: unidentified data type"<<std::endl;
 		}
 		else if(header.interleave == envi_header::BIP){		//if the infile is bip file
 			if(header.data_type ==envi_header::float32)
-				((bip<float>*)file)->set_buffer(memfrac);
+				((bip<float>*)file)->set_buffer_frac(memfrac);
 			else if(header.data_type == envi_header::float64)
-				((bip<double>*)file)->set_buffer(memfrac);
+				((bip<double>*)file)->set_buffer_frac(memfrac);
+			else
+				std::cout<<"ERROR: unidentified data type"<<std::endl;
+		}
+
+		else{
+			std::cout<<"ERROR: unidentified file type"<<std::endl;
+			exit(1);
+		}
+	}
+
+	void set_buffer_raw(size_t bytes){
+		if(header.interleave == envi_header::BSQ){		//if the infile is bsq file
+			if(header.data_type ==envi_header::float32)
+				((bsq<float>*)file)->set_buffer_raw(bytes);
+			else if(header.data_type == envi_header::float64)
+				((bsq<double>*)file)->set_buffer_raw(bytes);
+			else
+				std::cout<<"ERROR: unidentified data type"<<std::endl;
+		}
+
+		else if(header.interleave == envi_header::BIL){		//if the infile is bil file
+			if(header.data_type ==envi_header::float32)
+				((bil<float>*)file)->set_buffer_raw(bytes);
+			else if(header.data_type == envi_header::float64)
+				((bil<double>*)file)->set_buffer_raw(bytes);
+			else
+				std::cout<<"ERROR: unidentified data type"<<std::endl;
+		}
+
+		else if(header.interleave == envi_header::BIP){		//if the infile is bip file
+			if(header.data_type ==envi_header::float32)
+				((bip<float>*)file)->set_buffer_raw(bytes);
+			else if(header.data_type == envi_header::float64)
+				((bip<double>*)file)->set_buffer_raw(bytes);
 			else
 				std::cout<<"ERROR: unidentified data type"<<std::endl;
 		}
@@ -121,6 +155,16 @@ public:
 		exit(1);
 	}
+	size_t X(){ return header.samples; }
+	size_t Y(){ return header.lines; }
+	size_t Z(){ return header.bands; }
+	size_t B(){ return Z();	}
+
+	/// Return the size of the data set in bytes
+	size_t bytes(){
+		return X() * Y() * Z() * type_size();
+	}
+
 	/// Returns the progress of the current processing operation as a percentage
 	void reset_progress(){
@@ -193,6 +237,42 @@ public:
 		return 0;
 	}
+	/// Returns the progress of the current processing operation as a percentage
+	size_t data_rate(){
+
+		if(header.interleave == envi_header::BSQ){		//if the infile is bsq file
+			if(header.data_type ==envi_header::float32)
+				return ((bsq<float>*)file)->get_data_rate();
+			else if(header.data_type == envi_header::float64)
+				return ((bsq<double>*)file)->get_data_rate();
+			else
+				std::cout<<"ERROR: unidentified data type"<<std::endl;
+		}
+
+		else if(header.interleave == envi_header::BIL){		//if the infile is bil file
+			if(header.data_type ==envi_header::float32)
+				return ((bil<float>*)file)->get_data_rate();
+			else if(header.data_type == envi_header::float64)
+				return ((bil<double>*)file)->get_data_rate();
+			else
+				std::cout<<"ERROR: unidentified data type"<<std::endl;
+		}
+
+		else if(header.interleave == envi_header::BIP){		//if the infile is bip file
+			if(header.data_type ==envi_header::float32)
+				return ((bip<float>*)file)->get_data_rate();
+			else if(header.data_type == envi_header::float64)
+				return ((bip<double>*)file)->get_data_rate();
+			else
+				std::cout<<"ERROR: unidentified data type"<<std::endl;
+		}
+
+		else{
+			std::cout<<"ERROR: unidentified file type"<<std::endl;
+		}
+		return 0;
+	}
+
 	/// Allocate memory for a new ENVI file based on the current interleave format (BIP, BIL, BSQ) and data type.
 	void allocate(){
@@ -509,7 +589,7 @@ public:
 	/// @param outfile is the file name for the converted output
 	/// @param interleave is the interleave format for the destination file
-	bool convert(std::string outfile, stim::envi_header::interleaveType interleave, bool PROGRESS = false){
+	bool convert(std::string outfile, stim::envi_header::interleaveType interleave, bool PROGRESS = false, bool VERBOSE = false, bool OPTIMIZATION = true){
 		if(header.interleave == envi_header::BSQ){			//if the infile is bsq file
@@ -519,10 +599,10 @@ public:
 					exit(1);
 				}
 				else if(interleave == envi_header::BIL)			//convert BSQ -> BIL
-					((bsq<float>*)file)->bil(outfile, PROGRESS);
+					((bsq<float>*)file)->bil(outfile, PROGRESS, VERBOSE, OPTIMIZATION);
 				else if(interleave == envi_header::BIP){			//ERROR
 					//std::cout<<"ERROR: conversion from BSQ to BIP isn't practical; use BSQ->BIL->BIP instead"<<std::endl;
-					((bsq<float>*)file)->bip(outfile, PROGRESS);
+					((bsq<float>*)file)->bip(outfile, PROGRESS, VERBOSE);
 					//exit(1);
 				}
 			}
@@ -533,10 +613,10 @@ public:
 					exit(1);
 				}
 				else if(interleave == envi_header::BIL)					//convert BSQ -> BIL
-					((bsq<double>*)file)->bil(outfile, PROGRESS);
+					((bsq<double>*)file)->bil(outfile, PROGRESS, OPTIMIZATION);
 				else if(interleave == envi_header::BIP){					//ERROR
 					//std::cout<<"ERROR: conversion from BSQ to BIP isn't practical; use BSQ->BIL->BIP instead"<<std::endl;
-					((bsq<float>*)file)->bip(outfile, PROGRESS);
+					((bsq<float>*)file)->bip(outfile, PROGRESS, OPTIMIZATION);
 					//exit(1);
 				}
 			}
@@ -202,7 +202,7 @@ public:
 		}
 	}
-	void read(T* dest, size_t x, size_t y, size_t z, size_t sx, size_t sy, size_t sz){
+	size_t read(T* dest, size_t x, size_t y, size_t z, size_t sx, size_t sy, size_t sz){
 		size_t d[3];					//position in the binary coordinate system
 		size_t sd[3];					//size in the binary coordinate system
@@ -214,10 +214,7 @@ public:
 		sd[O[1]] = sy;
 		sd[O[2]] = sz;
-		if(!binary<T>::read(dest, d[0], d[1], d[2], sd[0], sd[1], sd[2])){
-			std::cout<<"error reading block in stim::hsi: ("<<d[0]<<", "<<d[1]<<", "<<d[2]<<") - ["<<sd[0]<<", "<<sd[1]<<", "<<sd[2]<<"]"<<std::endl;
-			exit(1);
-		}
+		return binary<T>::read(dest, d[0], d[1], d[2], sd[0], sd[1], sd[2]);
 	}
 };