Commit c6f526c207824e0e8653cd892784720487ea4eb9

Authored by David Mayerich
1 parent 718ee79b

added cuBLAS support for computing a covariance matrix for BIP files

Showing 1 changed file with 79 additions and 0 deletions   Show diff stats
stim/envi/bip.h
... ... @@ -7,6 +7,10 @@
7 7 #include <cstring>
8 8 #include <utility>
9 9  
  10 +//CUDA
  11 +#include <cuda_runtime.h>
  12 +#include "cublas_v2.h"
  13 +
10 14 namespace stim{
11 15  
12 16 /**
... ... @@ -208,6 +212,23 @@ public:
208 212 return true;
209 213 }
210 214  
  215 + /// Retrieve a single pixel and store it in a pre-allocated double array.
  216 + bool pixeld(double* p, unsigned n){
  217 + unsigned bandnum = X() * Y(); //calculate numbers in one band
  218 + if ( n >= bandnum){ //make sure the pixel number is right
  219 + std::cout<<"ERROR: sample or line out of range"<<std::endl;
  220 + return false;
  221 + }
  222 + unsigned B = Z();
  223 +
  224 + T* temp = (T*) malloc(B * sizeof(T)); //allocate space for the raw pixel data
  225 + file.seekg(n * B * sizeof(T), std::ios::beg); //point to the certain pixel
  226 + file.read((char *)temp, sizeof(T) * B); //read the spectrum from disk to the temp pointer
  227 +
  228 + for(unsigned int i = 0; i < B; i++) //for each element of the spectrum
  229 + p[i] = (double) temp[i]; //cast each element to a double value
  230 + }
  231 +
211 232 /// Retrieve a single pixel and stores it in pre-allocated memory.
212 233  
213 234 /// @param p is a pointer to pre-allocated memory at least sizeof(T) in size.
... ... @@ -989,6 +1010,60 @@ public:
989 1010 thread_data = 100;
990 1011 return true;
991 1012 }
  1013 +#ifdef CUDA_FOUND
  1014 + /// Calculate the covariance matrix for masked pixels using cuBLAS
  1015 + bool cu_co_matrix(double* co, double* avg, unsigned char *mask){
  1016 +
  1017 + cudaError_t cudaStat;
  1018 + cublasStatus_t stat;
  1019 + cublasHandle_t handle;
  1020 +
  1021 + thread_data = 0; //initialize the progress to zero (0)
  1022 + unsigned long long XY = X() * Y(); //calculate the number of elements in a band image
  1023 + unsigned long long B = Z(); //calculate the number of spectral elements
  1024 +
  1025 + double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file
  1026 + double* s_dev; //declare a device pointer that will store the spectrum on the GPU
  1027 + double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU
  1028 + double* avg_dev; //declare a device pointer that will store the average spectrum
  1029 + cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum
  1030 + cudaStat = cudaMalloc(&A_dev, B * B * sizeof(double)); //allocate space on the CUDA device for the covariance matrix
  1031 + cudaStat = cudaMemset(A_dev, 0, B * B * sizeof(double)); //initialize the covariance matrix to zero (0)
  1032 + cudaStat = cudaMalloc(&avg_dev, B * sizeof(double)); //allocate space on the CUDA device for the average spectrum
  1033 + stat = cublasSetVector(B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device
  1034 +
  1035 + double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product)
  1036 + double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction)
  1037 +
  1038 + stat = cublasCreate(&handle); //create a cuBLAS instance
  1039 + if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid
  1040 + printf ("CUBLAS initialization failed\n");
  1041 + return EXIT_FAILURE;
  1042 + }
  1043 + for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel
  1044 + pixeld(s, xy); //retreive the spectrum at the current xy pixel location
  1045 + stat = cublasSetVector(B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device
  1046 + stat = cublasDaxpy(handle, B, &axpy_alpha, avg_dev, 1, s_dev, 1); //subtract the average spectrum
  1047 + stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, B, &ger_alpha, s_dev, 1, A_dev, B); //calculate the covariance matrix (symmetric outer product)
  1048 + thread_data = (double)xy / XY * 100; //record the current progress
  1049 + }
  1050 +
  1051 + cublasGetMatrix(B, B, sizeof(double), A_dev, B, co, B); //copy the result from the GPU to the CPU
  1052 +
  1053 + cudaFree(A_dev); //clean up allocated device memory
  1054 + cudaFree(s_dev);
  1055 + cudaFree(avg_dev);
  1056 +
  1057 + for(unsigned i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion
  1058 + for(unsigned j = i+1; j < B; j++){
  1059 + co[B * i + j] = co[B * j + i];
  1060 + }
  1061 + }
  1062 +
  1063 + thread_data = 100; //processing complete
  1064 + return true;
  1065 + }
  1066 +#endif
992 1067  
993 1068 /// Calculate the covariance matrix for all masked pixels in the image with 64-bit floating point precision.
994 1069  
... ... @@ -996,6 +1071,10 @@ public:
996 1071 /// @param avg is a pointer to memory of size B that stores the average spectrum
997 1072 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
998 1073 bool co_matrix(double* co, double* avg, unsigned char *mask){
  1074 +
  1075 +#ifdef CUDA_FOUND
  1076 + return cu_co_matrix(co, avg, mask); //if CUDA is available, use cuBLAS to calculate the covariance matrix
  1077 +#endif
999 1078 thread_data = 0;
1000 1079 //memory allocation
1001 1080 unsigned long long XY = X() * Y();
... ...