From 1c92dffecd442c80bf3562f2958b9bd1aa775ac0 Mon Sep 17 00:00:00 2001
From: David <mayerich@uh.edu>
Date: Fri, 23 Dec 2016 02:30:04 -0600
Subject: [PATCH] simplified CUDA implementation in covariance and projection calculations

---
 stim/envi/bip.h    | 136 +++++++++++++++++++++++++++++++++++++++++++++++-----------------------------------------------------------------------------------------
 stim/math/matrix.h |  60 ++++++++++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 95 insertions(+), 101 deletions(-)

diff --git a/stim/envi/bip.h b/stim/envi/bip.h
index d7d8d56..ede6684 100644
--- a/stim/envi/bip.h
+++ b/stim/envi/bip.h
@@ -1015,7 +1015,7 @@ public:
 		stat = cublasCreate(&handle);										//create a cuBLAS instance
 		if (stat != CUBLAS_STATUS_SUCCESS) return 1;						//test the cuBLAS instance to make sure it is valid
 
-		else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
+		//else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
 		for (unsigned long long xy = 0; xy < XY; xy++){										//for each pixel
 			if (mask == NULL || mask[xy] != 0){
 				pixeld(s, xy);																	//retreive the spectrum at the current xy pixel location
@@ -1054,31 +1054,17 @@ public:
 		if(cuda_device >= 0){													//if a CUDA device is specified
 			int dev_count;
 			HANDLE_ERROR(cudaGetDeviceCount(&dev_count));						//get the number of CUDA devices
-			std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl;		//output the number of CUDA devices
-			cudaDeviceProp prop;
-			//int best_device_id = 0;													//stores the best CUDA device
-			//float best_device_cc = 0.0f;												//stores the compute capability of the best device
-			std::cout<<"CUDA devices----"<<std::endl;
-			for(int d = 0; d < dev_count; d++){									//for each CUDA device
-				cudaGetDeviceProperties(&prop, d);								//get the property of the first device
-				//float cc = prop.major + prop.minor / 10.0f;						//calculate the compute capability
-				std::cout<<d<<":  ["<<prop.major<<"."<<prop.minor<<"]      "<<prop.name<<std::endl;	//display the device information
-				//if(cc > best_device_cc){
-				//	best_device_cc = cc;										//if this is better than the previous device, use it
-				//	best_device_id = d;
-				//}
-			}		
 			if(dev_count > 0 && dev_count > cuda_device){							//if the first device is not an emulator
+				cudaDeviceProp prop;
 				cudaGetDeviceProperties(&prop, cuda_device);									//get the property of the requested CUDA device
 				if (prop.major != 9999) {
-					std::cout << "Using device " << cuda_device << std::endl;
+					std::cout << "Using CUDA device [" << cuda_device << "] to calculate the mean covariance matrix..."<<std::endl;
 					HANDLE_ERROR(cudaSetDevice(cuda_device));
 					int status = co_matrix_cublas(co, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
 					if (status == 0) return true;									//if the cuBLAS function returned correctly, we're done
 				}
-			}																	//otherwise continue using the CPU
-		
-			std::cout<<"No supported CUDA devices found or cuBLAS failed. Using CPU"<<std::endl;
+			}																	//otherwise continue using the CPU		
+			std::cout<<"WARNING: cuBLAS failed, using CPU"<<std::endl;
 		}
 		//memory allocation
 		unsigned long long XY = X() * Y();
@@ -1196,32 +1182,20 @@ public:
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
 	bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false){
 
-		if(cuda_device >= 0){
+		if (cuda_device >= 0) {													//if a CUDA device is specified
 			int dev_count;
 			HANDLE_ERROR(cudaGetDeviceCount(&dev_count));						//get the number of CUDA devices
-			std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl;		//output the number of CUDA devices
-			cudaDeviceProp prop;
-			//int best_device_id = 0;													//stores the best CUDA device
-			//float best_device_cc = 0.0f;												//stores the compute capability of the best device
-			std::cout<<"CUDA devices:"<<std::endl;
-			for(int d = 0; d < dev_count; d++){									//for each CUDA device
-				cudaGetDeviceProperties(&prop, d);								//get the property of the first device
-				//float cc = prop.major + prop.minor / 10.0f;						//calculate the compute capability
-				std::cout<<d<<":   ("<<prop.major<<"."<<prop.minor<<")      "<<prop.name<<std::endl;	//display the device information
-				//if(cc > best_device_cc){
-				//	best_device_cc = cc;										//if this is better than the previous device, use it
-				//	best_device_id = d;
-				//}
-			}		
-		
-			if(dev_count > 0 && prop.major != 9999 && dev_count > cuda_device){							//if the first device is not an emulator
-				std::cout<<"Using device "<< cuda_device <<std::endl;
-				HANDLE_ERROR(cudaSetDevice(cuda_device));
-				int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
-				if(status == 0) return true;									//if the cuBLAS function returned correctly, we're done
-			}																	//otherwise continue using the CPU
-		
-			std::cout<<"cuBLAS initialization failed - using CPU"<<std::endl;
+			if (dev_count > 0 && dev_count > cuda_device) {							//if the first device is not an emulator
+				cudaDeviceProp prop;
+				cudaGetDeviceProperties(&prop, cuda_device);									//get the property of the requested CUDA device
+				if (prop.major != 9999) {
+					std::cout << "Using CUDA device [" << cuda_device << "] to calculate the noise covariance matrix..." << std::endl;
+					HANDLE_ERROR(cudaSetDevice(cuda_device));
+					int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
+					if (status == 0) return true;									//if the cuBLAS function returned correctly, we're done
+				}
+			}																	//otherwise continue using the CPU		
+			std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl;
 		}
 
 		progress = 0;
@@ -1277,8 +1251,8 @@ public:
 	/// @param mask is a character mask used to limit processing to valid pixels
 	bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){
 
-		cudaError_t cudaStat;
-		cublasStatus_t stat;
+		//cudaError_t cudaStat;
+		//cublasStatus_t stat;
 		cublasHandle_t handle;
 
 		std::ofstream target(outfile.c_str(), std::ios::binary);	//open the target binary file
@@ -1289,12 +1263,12 @@ public:
 
 		double* s = (double*)malloc(sizeof(double) * B);					//allocate space for the spectrum that will be pulled from the file
 		double* s_dev;														//declare a device pointer that will store the spectrum on the GPU
-		cudaStat = cudaMalloc(&s_dev, B * sizeof(double));					//allocate space on the CUDA device for the spectrum
+		HANDLE_ERROR(cudaMalloc(&s_dev, B * sizeof(double)));				//allocate space on the CUDA device for the spectrum
 
 
         double* basis_dev;														//  device pointer on the GPU
-        cudaStat = cudaMalloc(&basis_dev, M * B * sizeof(double));					//  allocate space on the CUDA device
-        cudaStat = cudaMemset(basis_dev, 0, M * B * sizeof(double));               //  initialize basis_dev to zero (0)
+        HANDLE_ERROR(cudaMalloc(&basis_dev, M * B * sizeof(double)));					//  allocate space on the CUDA device
+        HANDLE_ERROR(cudaMemset(basis_dev, 0, M * B * sizeof(double)));               //  initialize basis_dev to zero (0)
 
 
         /// transposing basis matrix (because cuBLAS is column-major)
@@ -1303,28 +1277,24 @@ public:
         for (int i = 0; i<M; i++)
             for (int j = 0; j<B; j++)
             basis_Transposed[i+j*M] = basis[i*B+j];
+		//copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format)
+		CUBLAS_HANDLE_ERROR(cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M));
 
-        stat = cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M);  //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format)
-
-		double* center_dev;													//declare a device pointer that will store the center (average)
-		cudaStat = cudaMalloc(&center_dev, B * sizeof(double));				//allocate space on the CUDA device for the center (average)
-		stat = cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1);		//copy the center vector (average) to the CUDA device (from host to device)
+		double* center_dev;																						//declare a device pointer that will store the center (average)
+		HANDLE_ERROR(cudaMalloc(&center_dev, B * sizeof(double)));									//allocate space on the CUDA device for the center (average)
+		CUBLAS_HANDLE_ERROR(cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1));			//copy the center vector (average) to the CUDA device (from host to device)
 
 
-        double* A = (double*)malloc(sizeof(double) * M);					//allocate space for the projected pixel on the host
-        double* A_dev;														//declare a device pointer that will store the projected pixel on the GPU
-		cudaStat = cudaMalloc(&A_dev,M * sizeof(double));				    //allocate space on the CUDA device for the projected pixel
-		cudaStat = cudaMemset(A_dev, 0,M * sizeof(double));		        	//initialize the projected pixel to zero (0)
+        double* A = (double*)malloc(sizeof(double) * M);								//allocate space for the projected pixel on the host
+        double* A_dev;																	//declare a device pointer that will store the projected pixel on the GPU
+		HANDLE_ERROR(cudaMalloc(&A_dev,M * sizeof(double)));				    //allocate space on the CUDA device for the projected pixel
+		HANDLE_ERROR(cudaMemset(A_dev, 0,M * sizeof(double)));		        //initialize the projected pixel to zero (0)
 
 		double axpy_alpha = -1;												//multiplication factor for the center (in order to perform a subtraction)
 		double axpy_alpha2 = 1;												//multiplication factor for the matrix-vector multiplication
         double axpy_beta = 0;												//multiplication factor for the matrix-vector multiplication (there is no second scalor)
 
-		stat = cublasCreate(&handle);										//create a cuBLAS instance
-		if (stat != CUBLAS_STATUS_SUCCESS) {								//test the cuBLAS instance to make sure it is valid
-			printf ("CUBLAS initialization failed\n");
-			return EXIT_FAILURE;
-		}
+		CUBLAS_HANDLE_ERROR(cublasCreate(&handle));					//create a cuBLAS instance
 
         T* temp = (T*)malloc(sizeof(T) * M);													//allocate space for the projected pixel to be written on the disc
 		size_t i;
@@ -1332,14 +1302,11 @@ public:
 			if (mask == NULL || mask[xy] != 0){
 				pixeld(s, xy);																	//retreive the spectrum at the current xy pixel location
 
-				stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1);						    //copy the spectrum from the host to the device
-                stat = cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1);				//subtract the center (average)
-                stat = cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1);         //performs the matrix-vector multiplication
-                stat = cublasGetVector((int)M, sizeof(double), A_dev, 1, A, 1);					//copy the projected pixel to the host (from GPU to CPU)
-
-				//stat = cublasGetVector((int)B, sizeof(double), s_dev, 1, A, 1);					//copy the projected pixel to the host (from GPU to CPU)
-
-                //std::copy<double*, T*>(A, A + M, temp);											
+				CUBLAS_HANDLE_ERROR(cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1));						    //copy the spectrum from the host to the device
+                CUBLAS_HANDLE_ERROR(cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1));					//subtract the center (average)
+                CUBLAS_HANDLE_ERROR(cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1));         //performs the matrix-vector multiplication
+                CUBLAS_HANDLE_ERROR(cublasGetVector((int)M, sizeof(double), A_dev, 1, A, 1));							//copy the projected pixel to the host (from GPU to CPU)
+						
 				for(i = 0; i < M; i++)	temp[i] = (T)A[i];										//casting projected pixel from double to whatever T is
 			}
 			else
@@ -1351,10 +1318,11 @@ public:
 		}
 
         //clean up allocated device memory
-		cudaFree(A_dev);
-		cudaFree(s_dev);
-        cudaFree(basis_dev);
-		cudaFree(center_dev);
+		HANDLE_ERROR(cudaFree(A_dev));
+		HANDLE_ERROR(cudaFree(s_dev));
+		HANDLE_ERROR(cudaFree(basis_dev));
+		HANDLE_ERROR(cudaFree(center_dev));
+		CUBLAS_HANDLE_ERROR(cublasDestroy(handle));
 		free(A);
 		free(s);
 		free(temp);
@@ -1370,29 +1338,19 @@ public:
 	/// @param M is the number of basis vectors
 	/// @param mask is a character mask used to limit processing to valid pixels
 	bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, int cuda_device = 0, bool PROGRESS = false){
-
 		if (cuda_device >= 0) {													//if a CUDA device is specified
 			int dev_count;
 			HANDLE_ERROR(cudaGetDeviceCount(&dev_count));						//get the number of CUDA devices
-			std::cout << "Number of CUDA devices: " << dev_count << std::endl;		//output the number of CUDA devices
-			cudaDeviceProp prop;
-			//int best_device_id = 0;													//stores the best CUDA device
-			//float best_device_cc = 0.0f;												//stores the compute capability of the best device
-			std::cout << "CUDA devices----" << std::endl;
-			for (int d = 0; d < dev_count; d++) {									//for each CUDA device
-				cudaGetDeviceProperties(&prop, d);								//get the property of the first device
-																				//float cc = prop.major + prop.minor / 10.0f;						//calculate the compute capability
-				std::cout << d << ":  [" << prop.major << "." << prop.minor << "]      " << prop.name << std::endl;	//display the device information
-			}
 			if (dev_count > 0 && dev_count > cuda_device) {							//if the first device is not an emulator
+				cudaDeviceProp prop;
 				cudaGetDeviceProperties(&prop, cuda_device);									//get the property of the requested CUDA device
 				if (prop.major != 9999) {
-					std::cout << "Using device " << cuda_device << std::endl;
+					std::cout << "Using CUDA device [" << cuda_device << "] to perform a basis projection..." << std::endl;
 					HANDLE_ERROR(cudaSetDevice(cuda_device));
-					int status = project_cublas(outfile, center, basis, M, mask, PROGRESS);	 //use cuBLAS to calculate the covariance matrix
-					if (status == 0) return true;									//if the cuBLAS function returned correctly, we're done
+					return project_cublas(outfile, center, basis, M, mask, PROGRESS);
 				}
-			}																	//otherwise continue using the CPU
+			}																	//otherwise continue using the CPU		
+			std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl;
 		}
 		
 		std::ofstream target(outfile.c_str(), std::ios::binary);	//open the target binary file
diff --git a/stim/math/matrix.h b/stim/math/matrix.h
index 9af628d..01f20e6 100644
--- a/stim/math/matrix.h
+++ b/stim/math/matrix.h
@@ -6,7 +6,7 @@
 #include <iostream>
 #include <stim/math/vector.h>
 #include <stim/math/vec3.h>
-#include <stim/cuda/cudatools/callable.h>
+//#include <stim/cuda/cudatools/callable.h>
 
 namespace stim{
 
@@ -56,7 +56,7 @@ public:
 		return C;
 	}
 
-	T& operator()(int row, int col) {
+	T& operator()(size_t row, size_t col) {
 		return at(row, col);
 	}
 
@@ -122,10 +122,11 @@ public:
 
 		matrix<T> result(R, rhs.C);				//create the output matrix
 		T inner;								//stores the running inner product
-		for(size_t c = 0; c < rhs.C; c++){
-			for(size_t r = 0; r < R; r++){
+		size_t c, r, i;
+		for(c = 0; c < rhs.C; c++){
+			for(r = 0; r < R; r++){
 				inner = (T)0;
-				for(size_t i = 0; i < C; i++){
+				for(i = 0; i < C; i++){
 					inner += at(r, i) * rhs.at(i, c);
 				}
 				result.M[c * R + r] = inner;
@@ -151,23 +152,58 @@ public:
 		return result;
 	}
 
-	std::string toStr()
-	{
+	/// Sort rows of the matrix by the specified indices
+	matrix<T> sort_rows(size_t* idx) {
+		matrix<T> result(C, R);					//create the output matrix
+		size_t r, c;
+		for (c = 0; c < C; c++) {								//for each column
+			for (r = 0; r < R; r++) {							//for each row element
+				result.M[c * R + r] = M[c * R + idx[r]];		//copy each element of the row into its new position
+			}
+		}
+		return result;
+	}
+
+	/// Sort columns of the matrix by the specified indices
+	matrix<T> sort_cols(size_t* idx) {
+		matrix<T> result(C, R);
+		size_t c;
+		for (c = 0; c < C; c++) {											//for each column
+			memcpy(&result.M[c * R], &M[idx[c] * R], sizeof(T) * R);		//copy the entire column from this matrix to the appropriate location
+		}
+		return result;
+	}
+
+	std::string toStr() {
 		std::stringstream ss;
 
-		for(int r = 0; r < R; r++)
-		{
+		for(int r = 0; r < R; r++) {
 			ss << "| ";
-			for(int c=0; c<C; c++)
-			{
+			for(int c=0; c<C; c++) {
 				ss << M[c * R + r] << " ";
 			}
 			ss << "|" << std::endl;
 		}
-
 		return ss.str();
 	}
 
+	std::string csv() {
+		std::stringstream csvss;
+		for (size_t i = 0; i < B; i++) {
+			csvss << M[i];
+			for (size_t j = 0; j < B; j++) csvss << ", " << M[j * B + i];
+			csvss << std::endl;
+		}
+		return csvss.str();
+	}
+
+	//save the data as a CSV file
+	void csv(std::string filename) {
+		ofstream basisfile(filename.c_str());
+		basisfile << csv();
+		basisfile.close();
+	}
+
 	static matrix<T> I(size_t N) {
 
 		matrix<T> result(N, N);							//create the identity matrix
--
libgit2 0.21.4