From 1c92dffecd442c80bf3562f2958b9bd1aa775ac0 Mon Sep 17 00:00:00 2001 From: David Date: Fri, 23 Dec 2016 02:30:04 -0600 Subject: [PATCH] simplified CUDA implementation in covariance and projection calculations --- stim/envi/bip.h | 136 +++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------------------------------------------------------- stim/math/matrix.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 95 insertions(+), 101 deletions(-) diff --git a/stim/envi/bip.h b/stim/envi/bip.h index d7d8d56..ede6684 100644 --- a/stim/envi/bip.h +++ b/stim/envi/bip.h @@ -1015,7 +1015,7 @@ public: stat = cublasCreate(&handle); //create a cuBLAS instance if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid - else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<= 0){ //if a CUDA device is specified int dev_count; HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices - std::cout<<"Number of CUDA devices: "< best_device_cc){ - // best_device_cc = cc; //if this is better than the previous device, use it - // best_device_id = d; - //} - } if(dev_count > 0 && dev_count > cuda_device){ //if the first device is not an emulator + cudaDeviceProp prop; cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device if (prop.major != 9999) { - std::cout << "Using device " << cuda_device << std::endl; + std::cout << "Using CUDA device [" << cuda_device << "] to calculate the mean covariance matrix..."<= 0){ + if (cuda_device >= 0) { //if a CUDA device is specified int dev_count; HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices - std::cout<<"Number of CUDA devices: "< best_device_cc){ - // best_device_cc = cc; //if this is better than the previous device, use it - // best_device_id = d; - //} - } - - if(dev_count > 0 && prop.major != 9999 && dev_count > cuda_device){ //if the first device is not an emulator - std::cout<<"Using device "<< cuda_device < 0 && dev_count > cuda_device) { //if the first device is not an emulator + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device + if (prop.major != 9999) { + std::cout << "Using CUDA device [" << cuda_device << "] to calculate the noise covariance matrix..." << std::endl; + HANDLE_ERROR(cudaSetDevice(cuda_device)); + int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix + if (status == 0) return true; //if the cuBLAS function returned correctly, we're done + } + } //otherwise continue using the CPU + std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl; } progress = 0; @@ -1277,8 +1251,8 @@ public: /// @param mask is a character mask used to limit processing to valid pixels bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){ - cudaError_t cudaStat; - cublasStatus_t stat; + //cudaError_t cudaStat; + //cublasStatus_t stat; cublasHandle_t handle; std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file @@ -1289,12 +1263,12 @@ public: double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file double* s_dev; //declare a device pointer that will store the spectrum on the GPU - cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum + HANDLE_ERROR(cudaMalloc(&s_dev, B * sizeof(double))); //allocate space on the CUDA device for the spectrum double* basis_dev; // device pointer on the GPU - cudaStat = cudaMalloc(&basis_dev, M * B * sizeof(double)); // allocate space on the CUDA device - cudaStat = cudaMemset(basis_dev, 0, M * B * sizeof(double)); // initialize basis_dev to zero (0) + HANDLE_ERROR(cudaMalloc(&basis_dev, M * B * sizeof(double))); // allocate space on the CUDA device + HANDLE_ERROR(cudaMemset(basis_dev, 0, M * B * sizeof(double))); // initialize basis_dev to zero (0) /// transposing basis matrix (because cuBLAS is column-major) @@ -1303,28 +1277,24 @@ public: for (int i = 0; i(A, A + M, temp); + CUBLAS_HANDLE_ERROR(cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1)); //copy the spectrum from the host to the device + CUBLAS_HANDLE_ERROR(cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1)); //subtract the center (average) + CUBLAS_HANDLE_ERROR(cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1)); //performs the matrix-vector multiplication + CUBLAS_HANDLE_ERROR(cublasGetVector((int)M, sizeof(double), A_dev, 1, A, 1)); //copy the projected pixel to the host (from GPU to CPU) + for(i = 0; i < M; i++) temp[i] = (T)A[i]; //casting projected pixel from double to whatever T is } else @@ -1351,10 +1318,11 @@ public: } //clean up allocated device memory - cudaFree(A_dev); - cudaFree(s_dev); - cudaFree(basis_dev); - cudaFree(center_dev); + HANDLE_ERROR(cudaFree(A_dev)); + HANDLE_ERROR(cudaFree(s_dev)); + HANDLE_ERROR(cudaFree(basis_dev)); + HANDLE_ERROR(cudaFree(center_dev)); + CUBLAS_HANDLE_ERROR(cublasDestroy(handle)); free(A); free(s); free(temp); @@ -1370,29 +1338,19 @@ public: /// @param M is the number of basis vectors /// @param mask is a character mask used to limit processing to valid pixels bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, int cuda_device = 0, bool PROGRESS = false){ - if (cuda_device >= 0) { //if a CUDA device is specified int dev_count; HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices - std::cout << "Number of CUDA devices: " << dev_count << std::endl; //output the number of CUDA devices - cudaDeviceProp prop; - //int best_device_id = 0; //stores the best CUDA device - //float best_device_cc = 0.0f; //stores the compute capability of the best device - std::cout << "CUDA devices----" << std::endl; - for (int d = 0; d < dev_count; d++) { //for each CUDA device - cudaGetDeviceProperties(&prop, d); //get the property of the first device - //float cc = prop.major + prop.minor / 10.0f; //calculate the compute capability - std::cout << d << ": [" << prop.major << "." << prop.minor << "] " << prop.name << std::endl; //display the device information - } if (dev_count > 0 && dev_count > cuda_device) { //if the first device is not an emulator + cudaDeviceProp prop; cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device if (prop.major != 9999) { - std::cout << "Using device " << cuda_device << std::endl; + std::cout << "Using CUDA device [" << cuda_device << "] to perform a basis projection..." << std::endl; HANDLE_ERROR(cudaSetDevice(cuda_device)); - int status = project_cublas(outfile, center, basis, M, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix - if (status == 0) return true; //if the cuBLAS function returned correctly, we're done + return project_cublas(outfile, center, basis, M, mask, PROGRESS); } - } //otherwise continue using the CPU + } //otherwise continue using the CPU + std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl; } std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file diff --git a/stim/math/matrix.h b/stim/math/matrix.h index 9af628d..01f20e6 100644 --- a/stim/math/matrix.h +++ b/stim/math/matrix.h @@ -6,7 +6,7 @@ #include #include #include -#include +//#include namespace stim{ @@ -56,7 +56,7 @@ public: return C; } - T& operator()(int row, int col) { + T& operator()(size_t row, size_t col) { return at(row, col); } @@ -122,10 +122,11 @@ public: matrix result(R, rhs.C); //create the output matrix T inner; //stores the running inner product - for(size_t c = 0; c < rhs.C; c++){ - for(size_t r = 0; r < R; r++){ + size_t c, r, i; + for(c = 0; c < rhs.C; c++){ + for(r = 0; r < R; r++){ inner = (T)0; - for(size_t i = 0; i < C; i++){ + for(i = 0; i < C; i++){ inner += at(r, i) * rhs.at(i, c); } result.M[c * R + r] = inner; @@ -151,23 +152,58 @@ public: return result; } - std::string toStr() - { + /// Sort rows of the matrix by the specified indices + matrix sort_rows(size_t* idx) { + matrix result(C, R); //create the output matrix + size_t r, c; + for (c = 0; c < C; c++) { //for each column + for (r = 0; r < R; r++) { //for each row element + result.M[c * R + r] = M[c * R + idx[r]]; //copy each element of the row into its new position + } + } + return result; + } + + /// Sort columns of the matrix by the specified indices + matrix sort_cols(size_t* idx) { + matrix result(C, R); + size_t c; + for (c = 0; c < C; c++) { //for each column + memcpy(&result.M[c * R], &M[idx[c] * R], sizeof(T) * R); //copy the entire column from this matrix to the appropriate location + } + return result; + } + + std::string toStr() { std::stringstream ss; - for(int r = 0; r < R; r++) - { + for(int r = 0; r < R; r++) { ss << "| "; - for(int c=0; c I(size_t N) { matrix result(N, N); //create the identity matrix -- libgit2 0.21.4