Commit c6f526c207824e0e8653cd892784720487ea4eb9
1 parent
718ee79b
added cuBLAS support for computing a covariance matrix for BIP files
Showing
1 changed file
with
79 additions
and
0 deletions
Show diff stats
stim/envi/bip.h
... | ... | @@ -7,6 +7,10 @@ |
7 | 7 | #include <cstring> |
8 | 8 | #include <utility> |
9 | 9 | |
10 | +//CUDA | |
11 | +#include <cuda_runtime.h> | |
12 | +#include "cublas_v2.h" | |
13 | + | |
10 | 14 | namespace stim{ |
11 | 15 | |
12 | 16 | /** |
... | ... | @@ -208,6 +212,23 @@ public: |
208 | 212 | return true; |
209 | 213 | } |
210 | 214 | |
215 | + /// Retrieve a single pixel and store it in a pre-allocated double array. | |
216 | + bool pixeld(double* p, unsigned n){ | |
217 | + unsigned bandnum = X() * Y(); //calculate numbers in one band | |
218 | + if ( n >= bandnum){ //make sure the pixel number is right | |
219 | + std::cout<<"ERROR: sample or line out of range"<<std::endl; | |
220 | + return false; | |
221 | + } | |
222 | + unsigned B = Z(); | |
223 | + | |
224 | + T* temp = (T*) malloc(B * sizeof(T)); //allocate space for the raw pixel data | |
225 | + file.seekg(n * B * sizeof(T), std::ios::beg); //point to the certain pixel | |
226 | + file.read((char *)temp, sizeof(T) * B); //read the spectrum from disk to the temp pointer | |
227 | + | |
228 | + for(unsigned int i = 0; i < B; i++) //for each element of the spectrum | |
229 | + p[i] = (double) temp[i]; //cast each element to a double value | |
230 | + } | |
231 | + | |
211 | 232 | /// Retrieve a single pixel and stores it in pre-allocated memory. |
212 | 233 | |
213 | 234 | /// @param p is a pointer to pre-allocated memory at least sizeof(T) in size. |
... | ... | @@ -989,6 +1010,60 @@ public: |
989 | 1010 | thread_data = 100; |
990 | 1011 | return true; |
991 | 1012 | } |
1013 | +#ifdef CUDA_FOUND | |
1014 | + /// Calculate the covariance matrix for masked pixels using cuBLAS | |
1015 | + bool cu_co_matrix(double* co, double* avg, unsigned char *mask){ | |
1016 | + | |
1017 | + cudaError_t cudaStat; | |
1018 | + cublasStatus_t stat; | |
1019 | + cublasHandle_t handle; | |
1020 | + | |
1021 | + thread_data = 0; //initialize the progress to zero (0) | |
1022 | + unsigned long long XY = X() * Y(); //calculate the number of elements in a band image | |
1023 | + unsigned long long B = Z(); //calculate the number of spectral elements | |
1024 | + | |
1025 | + double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file | |
1026 | + double* s_dev; //declare a device pointer that will store the spectrum on the GPU | |
1027 | + double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU | |
1028 | + double* avg_dev; //declare a device pointer that will store the average spectrum | |
1029 | + cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum | |
1030 | + cudaStat = cudaMalloc(&A_dev, B * B * sizeof(double)); //allocate space on the CUDA device for the covariance matrix | |
1031 | + cudaStat = cudaMemset(A_dev, 0, B * B * sizeof(double)); //initialize the covariance matrix to zero (0) | |
1032 | + cudaStat = cudaMalloc(&avg_dev, B * sizeof(double)); //allocate space on the CUDA device for the average spectrum | |
1033 | + stat = cublasSetVector(B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device | |
1034 | + | |
1035 | + double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) | |
1036 | + double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) | |
1037 | + | |
1038 | + stat = cublasCreate(&handle); //create a cuBLAS instance | |
1039 | + if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid | |
1040 | + printf ("CUBLAS initialization failed\n"); | |
1041 | + return EXIT_FAILURE; | |
1042 | + } | |
1043 | + for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel | |
1044 | + pixeld(s, xy); //retreive the spectrum at the current xy pixel location | |
1045 | + stat = cublasSetVector(B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device | |
1046 | + stat = cublasDaxpy(handle, B, &axpy_alpha, avg_dev, 1, s_dev, 1); //subtract the average spectrum | |
1047 | + stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, B, &ger_alpha, s_dev, 1, A_dev, B); //calculate the covariance matrix (symmetric outer product) | |
1048 | + thread_data = (double)xy / XY * 100; //record the current progress | |
1049 | + } | |
1050 | + | |
1051 | + cublasGetMatrix(B, B, sizeof(double), A_dev, B, co, B); //copy the result from the GPU to the CPU | |
1052 | + | |
1053 | + cudaFree(A_dev); //clean up allocated device memory | |
1054 | + cudaFree(s_dev); | |
1055 | + cudaFree(avg_dev); | |
1056 | + | |
1057 | + for(unsigned i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion | |
1058 | + for(unsigned j = i+1; j < B; j++){ | |
1059 | + co[B * i + j] = co[B * j + i]; | |
1060 | + } | |
1061 | + } | |
1062 | + | |
1063 | + thread_data = 100; //processing complete | |
1064 | + return true; | |
1065 | + } | |
1066 | +#endif | |
992 | 1067 | |
993 | 1068 | /// Calculate the covariance matrix for all masked pixels in the image with 64-bit floating point precision. |
994 | 1069 | |
... | ... | @@ -996,6 +1071,10 @@ public: |
996 | 1071 | /// @param avg is a pointer to memory of size B that stores the average spectrum |
997 | 1072 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location |
998 | 1073 | bool co_matrix(double* co, double* avg, unsigned char *mask){ |
1074 | + | |
1075 | +#ifdef CUDA_FOUND | |
1076 | + return cu_co_matrix(co, avg, mask); //if CUDA is available, use cuBLAS to calculate the covariance matrix | |
1077 | +#endif | |
999 | 1078 | thread_data = 0; |
1000 | 1079 | //memory allocation |
1001 | 1080 | unsigned long long XY = X() * Y(); | ... | ... |