Merge branch 'sam'

David Mayerich
2 parents 5b108e44 334547d8
Showing 2 changed files with 266 additions and 2 deletions Show diff stats
stim/envi/bip.h
stim/envi/envi.h
@@ -373,7 +373,7 @@ public:
 		for(size_t xy = 0; xy < XY; xy++){							//for each pixel
 			memset(spec, 0, Bb);									//set the spectrum to zero
 			if(mask == NULL || mask[xy]){							//if the pixel is masked
-				len = 0;											//initialize the 
+				len = 0;											//initialize the
 				file.read((char*)spec, Bb);							//read a spectrum
 				for(size_t b = 0; b < B; b++)						//for each band
 					len += spec[b]*spec[b];							//add the square of the spectral band
@@ -385,7 +385,7 @@ public:
 				file.seekg(Bb, std::ios::cur);						//otherwise skip a spectrum
 			target.write((char*)spec, Bb);							//output the normalized spectrum
 			if(PROGRESS) progress = (double)(xy + 1) / (double)XY * 100;		//update the progress
-		}		
+		}
 	}
  
  
@@ -1088,6 +1088,233 @@ public:
 		return true;
 	}
  
+
+	#ifdef CUDA_FOUND
+	/// Calculate the covariance matrix of Noise for masked pixels using cuBLAS
+	/// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
+	bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
+
+		cudaError_t cudaStat;
+		cublasStatus_t stat;
+		cublasHandle_t handle;
+
+		progress = 0;													    //initialize the progress to zero (0)
+		unsigned long long XY = X() * Y();									//calculate the number of elements in a band image
+		unsigned long long B = Z();											//calculate the number of spectral elements
+
+		double* s = (double*)malloc(sizeof(double) * B);					//allocate space for the spectrum that will be pulled from the file
+		double* s_dev;														//declare a device pointer that will store the spectrum on the GPU
+
+        double* s2_dev;														//  device pointer on the GPU
+        cudaStat = cudaMalloc(&s2_dev, B * sizeof(double));					//  allocate space on the CUDA device
+        cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double));               //  initialize s2_dev to zero (0)
+
+		double* A_dev;														//declare a device pointer that will store the covariance matrix on the GPU
+		double* avg_dev;													//declare a device pointer that will store the average spectrum
+		cudaStat = cudaMalloc(&s_dev, B * sizeof(double));					//allocate space on the CUDA device for the spectrum
+		cudaStat = cudaMalloc(&A_dev, B * B * sizeof(double));				//allocate space on the CUDA device for the covariance matrix
+		cudaStat = cudaMemset(A_dev, 0, B * B * sizeof(double));			//initialize the covariance matrix to zero (0)
+		cudaStat = cudaMalloc(&avg_dev, B * sizeof(double));				//allocate space on the CUDA device for the average spectrum
+		stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1);		//copy the average spectrum to the CUDA device
+
+		double ger_alpha = 1.0/(double)XY;									//scale the outer product by the inverse of the number of samples (mean outer product)
+		double axpy_alpha = -1;												//multiplication factor for the average spectrum (in order to perform a subtraction)
+
+		stat = cublasCreate(&handle);										//create a cuBLAS instance
+		if (stat != CUBLAS_STATUS_SUCCESS) {								//test the cuBLAS instance to make sure it is valid
+			printf ("CUBLAS initialization failed\n");
+			return EXIT_FAILURE;
+		}
+		for (unsigned long long xy = 0; xy < XY; xy++){										//for each pixel
+			if (mask == NULL || mask[xy] != 0){
+				pixeld(s, xy);                                                             //retreive the spectrum at the current xy pixel location
+
+				stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1);						//copy the spectrum from the host to the device
+				stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1);				//subtract the average spectrum
+
+                cudaMemcpy(s2_dev, s_dev + 1 , (B-1) * sizeof(double), cudaMemcpyDeviceToDevice);    //copy B-1 elements from shifted source data (s_dev) to device pointer (s2_dev )
+                stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1);	   //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them )
+
+
+				stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B);	//calculate the covariance matrix (symmetric outer product)
+			}
+			if(PROGRESS) progress = (double)(xy+1) / XY * 100;													//record the current progress
+
+		}
+
+		cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, coN, (int)B);					//copy the result from the GPU to the CPU
+
+		cudaFree(A_dev);														//clean up allocated device memory
+		cudaFree(s_dev);
+		cudaFree(s2_dev);
+		cudaFree(avg_dev);
+
+		for(unsigned long long i = 0; i < B; i++){										//copy the upper triangular portion to the lower triangular portion
+			for(unsigned long long j = i+1; j < B; j++){
+				coN[B * i + j] = coN[B * j + i];
+			}
+		}
+
+		return true;
+	}
+#endif
+
+	/// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision.
+
+	/// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
+	/// @param avg is a pointer to memory of size B that stores the average spectrum
+	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
+	bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
+
+#ifdef CUDA_FOUND
+		int dev_count;
+		cudaGetDeviceCount(&dev_count);									//get the number of CUDA devices
+		cudaDeviceProp prop;
+		cudaGetDeviceProperties(&prop, 0);								//get the property of the first device
+		if(dev_count > 0 && prop.major != 9999)							//if the first device is not an emulator
+			return coNoise_matrix_cublas(coN, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
+#endif
+
+
+
+		progress = 0;
+		//memory allocation
+		unsigned long long XY = X() * Y();
+		unsigned long long B = Z();
+		T* temp = (T*)malloc(sizeof(T) * B);
+
+		unsigned long long count = nnz(mask);								//count the number of masked pixels
+
+		//initialize covariance matrix of noise
+		memset(coN, 0, B * B * sizeof(double));
+
+		//calculate covariance matrix
+		double* coN_half = (double*) malloc(B * B * sizeof(double));			//allocate space for a higher-precision intermediate matrix
+		double* temp_precise = (double*) malloc(B * sizeof(double));
+		memset(coN_half, 0, B * B * sizeof(double));							//initialize the high-precision matrix with zeros
+		unsigned long long idx;													//stores i*B to speed indexing
+		for (unsigned long long xy = 0; xy < XY; xy++){
+			if (mask == NULL || mask[xy] != 0){
+				pixel(temp, xy);												//retreive the spectrum at the current xy pixel location
+				for(unsigned long long b = 0; b < B; b++)									//subtract the mean from this spectrum and increase the precision
+					temp_precise[b] = (double)temp[b] - (double)avg[b];
+
+                for(unsigned long long b2 = 0; b2 < B-1; b2++)	    //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them )
+					temp_precise[b2] -=  temp_precise[b2+1];
+
+				idx = 0;
+				for (unsigned long long b0 = 0; b0 < B; b0++){								//for each band
+					for (unsigned long long b1 = b0; b1 < B; b1++)
+						coN_half[idx++] += temp_precise[b0] * temp_precise[b1];
+				}
+			}
+			if(PROGRESS) progress = (double)(xy+1) / XY * 100;
+		}
+		idx = 0;
+		for (unsigned long long i = 0; i < B; i++){										//copy the precision matrix to both halves of the output matrix
+			for (unsigned long long j = i; j < B; j++){
+				coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double) count;
+			}
+		}
+
+		free(temp);
+		free(temp_precise);
+		return true;
+	}
+
+	#ifdef CUDA_FOUND
+    /// Project the spectra onto a set of basis functions
+	/// @param outfile is the name of the new binary output file that will be created
+	/// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering)
+	/// @param basis a set of basis vectors that the data set will be projected onto (after centering)
+	/// @param M is the number of basis vectors
+	/// @param mask is a character mask used to limit processing to valid pixels
+	bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){
+
+		cudaError_t cudaStat;
+		cublasStatus_t stat;
+		cublasHandle_t handle;
+
+		std::ofstream target(outfile.c_str(), std::ios::binary);	//open the target binary file
+		//std::string headername = outfile + ".hdr";					//the header file name
+
+		progress = 0;													    //initialize the progress to zero (0)
+		unsigned long long XY = X() * Y();									//calculate the number of elements in a band image
+		unsigned long long B = Z();											//calculate the number of spectral elements
+
+		double* s = (double*)malloc(sizeof(double) * B);					//allocate space for the spectrum that will be pulled from the file
+		double* s_dev;														//declare a device pointer that will store the spectrum on the GPU
+		cudaStat = cudaMalloc(&s_dev, B * sizeof(double));					//allocate space on the CUDA device for the spectrum
+
+
+        double* basis_dev;														//  device pointer on the GPU
+        cudaStat = cudaMalloc(&basis_dev, M * B * sizeof(double));					//  allocate space on the CUDA device
+        cudaStat = cudaMemset(basis_dev, 0, M * B * sizeof(double));               //  initialize basis_dev to zero (0)
+
+
+        /// transposing basis matrix (because cuBLAS is column-major)
+        double *basis_Transposed = (double*)malloc(M * B * sizeof(double));
+        memset(basis_Transposed, 0, M * B * sizeof(double));
+        for (int i = 0; i<M; i++)
+            for (int j = 0; j<B; j++)
+            basis_Transposed[i+j*M] = basis[i*B+j];
+
+        stat = cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M);  //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format)
+
+		double* center_dev;													//declare a device pointer that will store the center (average)
+		cudaStat = cudaMalloc(&center_dev, B * sizeof(double));				//allocate space on the CUDA device for the center (average)
+		stat = cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1);		//copy the center vector (average) to the CUDA device (from host to device)
+
+
+        double* A = (double*)malloc(sizeof(double) * M);					//allocate space for the projected pixel on the host
+        double* A_dev;														//declare a device pointer that will store the projected pixel on the GPU
+		cudaStat = cudaMalloc(&A_dev,M * sizeof(double));				    //allocate space on the CUDA device for the projected pixel
+		cudaStat = cudaMemset(A_dev, 0,M * sizeof(double));		        	//initialize the projected pixel to zero (0)
+
+		double axpy_alpha = -1;												//multiplication factor for the center (in order to perform a subtraction)
+		double axpy_alpha2 = 1;												//multiplication factor for the matrix-vector multiplication
+        double axpy_beta = 0;												//multiplication factor for the matrix-vector multiplication (there is no second scalor)
+
+		stat = cublasCreate(&handle);										//create a cuBLAS instance
+		if (stat != CUBLAS_STATUS_SUCCESS) {								//test the cuBLAS instance to make sure it is valid
+			printf ("CUBLAS initialization failed\n");
+			return EXIT_FAILURE;
+		}
+
+        T* temp = (T*)malloc(sizeof(T) * M);                                             //allocate space for the projected pixel to be written on the disc
+
+		for (unsigned long long xy = 0; xy < XY; xy++){										//for each pixel
+			if (mask == NULL || mask[xy] != 0){
+				pixeld(s, xy);                                                  //retreive the spectrum at the current xy pixel location
+
+				stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1);						    //copy the spectrum from the host to the device
+                stat = cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1);				//subtract the center (average)
+                stat = cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1);         //performs the matrix-vector multiplication
+                stat = cublasGetVector((int)B, sizeof(double), A_dev, 1, A, 1);		                    //copy the projected pixel to the host (from GPU to CPU)
+
+                std::copy(A, A + M, temp);                                          //casting projected pixel from double to whatever T is
+
+			}
+
+			target.write(reinterpret_cast<const char*>(temp), sizeof(T) * M);					  //write the projected vector
+			if(PROGRESS) progress = (double)(xy+1) / XY * 100;									    //record the current progress
+
+		}
+
+        //clean up allocated device memory
+		cudaFree(A_dev);
+		cudaFree(s_dev);
+        cudaFree(basis_dev);
+		cudaFree(center_dev);
+		free(A);
+		free(s);
+		free(temp);
+		target.close();												//close the output file
+
+		return true;
+	}
+#endif
+
 	/// Project the spectra onto a set of basis functions
 	/// @param outfile is the name of the new binary output file that will be created
 	/// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering)
@@ -1096,6 +1323,14 @@ public:
 	/// @param mask is a character mask used to limit processing to valid pixels
 	bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){
  
+#ifdef CUDA_FOUND
+		int dev_count;
+		cudaGetDeviceCount(&dev_count);									//get the number of CUDA devices
+		cudaDeviceProp prop;
+		cudaGetDeviceProperties(&prop, 0);								//get the property of the first device
+		if(dev_count > 0 && prop.major != 9999)							//if the first device is not an emulator
+			return project_cublas(outfile,center,basis,M,mask,PROGRESS);	 //use cuBLAS to calculate the covariance matrix
+#endif
 		std::ofstream target(outfile.c_str(), std::ios::binary);	//open the target binary file
 		//std::string headername = outfile + ".hdr";					//the header file name
  
@@ -1403,6 +1403,35 @@ public:
 		return false;
 	}
  
+	/// Calculate the covariance of noise matrix for all masked pixels in the image.
+
+	/// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
+	/// @param avg is a pointer to memory of size B that stores the average spectrum
+	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
+	bool coNoise_matrix(double* coN, double* avg, unsigned char* mask, bool PROGRESS = false){
+		if (header.interleave == envi_header::BSQ){
+			std::cout<<"ERROR: calculating the covariance matrix of noise for a BSQ file is impractical; convert to BIP first"<<std::endl;
+			exit(1);
+		}
+
+
+		else if (header.interleave == envi_header::BIL){
+		        std::cout<<"ERROR: calculating the covariance matrix of noise for a BIL file is impractical; convert to BIP first"<<std::endl;
+                exit(1);
+			 }
+
+		else if (header.interleave == envi_header::BIP){
+			if (header.data_type == envi_header::float32)
+				return ((bip<float>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				return ((bip<double>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS);
+			else{
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+		return false;
+	}
  
 	/// Crop a region of the image and save it to a new file.