diff --git a/stim/envi/bip.h b/stim/envi/bip.h
index 8ff9ab5..215ff38 100644
--- a/stim/envi/bip.h
+++ b/stim/envi/bip.h
@@ -1107,10 +1107,7 @@ public:
 	}
 
 
-//#ifdef CUDA_FOUND
-	/// Calculate the covariance matrix of Noise for masked pixels using cuBLAS
-	/// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
-	int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
+	int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false) {
 
 		cudaError_t cudaStat;
 		cublasStatus_t stat;
@@ -1123,9 +1120,10 @@ public:
 		double* s = (double*)malloc(sizeof(double) * B);					//allocate space for the spectrum that will be pulled from the file
 		double* s_dev;														//declare a device pointer that will store the spectrum on the GPU
 
-        double* s2_dev;														//  device pointer on the GPU
-        cudaStat = cudaMalloc(&s2_dev, B * sizeof(double));					//  allocate space on the CUDA device
-        cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double));               //  initialize s2_dev to zero (0)
+		double* s2 = (double*)malloc(sizeof(double) * B);					//allocate space for the spectrum of second pixel that will be pulled from the file
+		double* s2_dev;														//  device pointer on the GPU
+		cudaStat = cudaMalloc(&s2_dev, B * sizeof(double));					//  allocate space on the CUDA device
+		cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double));               //  initialize s2_dev to zero (0)
 
 		double* A_dev;														//declare a device pointer that will store the covariance matrix on the GPU
 		double* avg_dev;													//declare a device pointer that will store the average spectrum
@@ -1135,26 +1133,32 @@ public:
 		cudaStat = cudaMalloc(&avg_dev, B * sizeof(double));				//allocate space on the CUDA device for the average spectrum
 		stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1);		//copy the average spectrum to the CUDA device
 
-		double ger_alpha = 1.0/(double)XY;									//scale the outer product by the inverse of the number of samples (mean outer product)
+		double ger_alpha = 1.0 / (double)XY;									//scale the outer product by the inverse of the number of samples (mean outer product)
 		double axpy_alpha = -1;												//multiplication factor for the average spectrum (in order to perform a subtraction)
 
 		CUBLAS_HANDLE_ERROR(cublasCreate(&handle));							//create a cuBLAS instance
 		if (stat != CUBLAS_STATUS_SUCCESS) return 1;						//test the cuBLAS instance to make sure it is valid
 
-		for (unsigned long long xy = 0; xy < XY; xy++){										//for each pixel
-			if (mask == NULL || mask[xy] != 0){
-				pixeld(s, xy);                                                             //retreive the spectrum at the current xy pixel location
-
-				stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1);						//copy the spectrum from the host to the device
+		for (unsigned long long xy = 0; xy < XY; xy++) {										//for each pixel
+			if (mask == NULL || mask[xy] != 0) {
+				pixeld(s, xy);                                        //retreive the spectrum at the current xy pixel location
+				if (xy < XY - X()) {
+					pixeld(s2, xy + X());                              //retreive the spectrum at the current xy+X pixel location, which is adjacent (bellow) to the pixel at xy location (in y direction)
+				}
+				else {
+					pixeld(s2, xy - X());                               //for the last row we consider the the adjacent pixel which is located above pixel xy
+				}
+				stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1);						//copy the spectrum of first pixel from the host to the device
 				stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1);				//subtract the average spectrum
 
-                cudaMemcpy(s2_dev, s_dev + 1 , (B-1) * sizeof(double), cudaMemcpyDeviceToDevice);    //copy B-1 elements from shifted source data (s_dev) to device pointer (s2_dev )
-                stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1);	   //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them )
+				stat = cublasSetVector((int)B, sizeof(double), s2, 1, s2_dev, 1);					//copy the spectrum of second pixel from the host to the device
+				stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s2_dev, 1);				//subtract the average spectrum
 
+				stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1);	   //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (in y direction)
 
 				stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B);	//calculate the covariance matrix (symmetric outer product)
 			}
-			if(PROGRESS) progress = (double)(xy+1) / XY * 100;													//record the current progress
+			if (PROGRESS) progress = (double)(xy + 1) / XY * 100;													//record the current progress
 
 		}
 
@@ -1165,22 +1169,22 @@ public:
 		cudaFree(s2_dev);
 		cudaFree(avg_dev);
 
-		for(unsigned long long i = 0; i < B; i++){										//copy the upper triangular portion to the lower triangular portion
-			for(unsigned long long j = i+1; j < B; j++){
+		for (unsigned long long i = 0; i < B; i++) {										//copy the upper triangular portion to the lower triangular portion
+			for (unsigned long long j = i + 1; j < B; j++) {
 				coN[B * i + j] = coN[B * j + i];
 			}
 		}
 
 		return 0;
 	}
-//#endif
+	//#endif
 
 	/// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision.
 
 	/// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
 	/// @param avg is a pointer to memory of size B that stores the average spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
-	bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false){
+	bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false) {
 
 		if (cuda_device >= 0) {													//if a CUDA device is specified
 			int dev_count;
@@ -1194,7 +1198,7 @@ public:
 					int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
 					if (status == 0) return true;									//if the cuBLAS function returned correctly, we're done
 				}
-			}																	//otherwise continue using the CPU		
+			}																	//otherwise continue using the CPU
 			std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl;
 		}
 
@@ -1203,43 +1207,55 @@ public:
 		unsigned long long XY = X() * Y();
 		unsigned long long B = Z();
 		T* temp = (T*)malloc(sizeof(T) * B);
+		T* temp2 = (T*)malloc(sizeof(T) * B);
 
 		unsigned long long count = nnz(mask);								//count the number of masked pixels
 
-		//initialize covariance matrix of noise
+																			//initialize covariance matrix of noise
 		memset(coN, 0, B * B * sizeof(double));
 
 		//calculate covariance matrix
-		double* coN_half = (double*) malloc(B * B * sizeof(double));			//allocate space for a higher-precision intermediate matrix
-		double* temp_precise = (double*) malloc(B * sizeof(double));
+		double* coN_half = (double*)malloc(B * B * sizeof(double));			//allocate space for a higher-precision intermediate matrix
+		double* temp_precise = (double*)malloc(B * sizeof(double));
+		double* temp_precise2 = (double*)malloc(B * sizeof(double));
 		memset(coN_half, 0, B * B * sizeof(double));							//initialize the high-precision matrix with zeros
 		unsigned long long idx;													//stores i*B to speed indexing
-		for (unsigned long long xy = 0; xy < XY; xy++){
-			if (mask == NULL || mask[xy] != 0){
-				pixel(temp, xy);												//retreive the spectrum at the current xy pixel location
-				for(unsigned long long b = 0; b < B; b++)									//subtract the mean from this spectrum and increase the precision
+		for (unsigned long long xy = 0; xy < XY; xy++) {
+			if (mask == NULL || mask[xy] != 0) {
+				pixel(temp, xy);                                        //retreive the spectrum at the current xy pixel location
+				if (xy < XY - X()) {
+					pixel(temp2, xy + X());                              //retreive the spectrum at the current xy+X pixel location, which is adjacent (bellow) to the pixel at xy location (in y direction)
+				}
+				else {
+					pixel(temp2, xy - X());                               //for the last row we consider the the adjacent pixel which is located above pixel xy
+				}
+				for (unsigned long long b = 0; b < B; b++) {									//subtract the mean from this spectrum and increase the precision
 					temp_precise[b] = (double)temp[b] - (double)avg[b];
+					temp_precise2[b] = (double)temp2[b] - (double)avg[b];
+				}
 
-                for(unsigned long long b2 = 0; b2 < B-1; b2++)	    //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them )
-					temp_precise[b2] -=  temp_precise[b2+1];
+				for (unsigned long long b2 = 0; b2 < B; b2++)	    //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (in y direction)
+					temp_precise[b2] -= temp_precise2[b2];
 
 				idx = 0;
-				for (unsigned long long b0 = 0; b0 < B; b0++){								//for each band
+				for (unsigned long long b0 = 0; b0 < B; b0++) {								//for each band
 					for (unsigned long long b1 = b0; b1 < B; b1++)
 						coN_half[idx++] += temp_precise[b0] * temp_precise[b1];
 				}
 			}
-			if(PROGRESS) progress = (double)(xy+1) / XY * 100;
+			if (PROGRESS) progress = (double)(xy + 1) / XY * 100;
 		}
 		idx = 0;
-		for (unsigned long long i = 0; i < B; i++){										//copy the precision matrix to both halves of the output matrix
-			for (unsigned long long j = i; j < B; j++){
-				coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double) count;
+		for (unsigned long long i = 0; i < B; i++) {										//copy the precision matrix to both halves of the output matrix
+			for (unsigned long long j = i; j < B; j++) {
+				coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double)count;
 			}
 		}
 
 		free(temp);
+		free(temp2);
 		free(temp_precise);
+		free(temp_precise2);
 		return true;
 	}
 
--
libgit2 0.21.4