Commit 2781a48c74b990c3242b2f862ec696de77483afc

Authored by David Mayerich
1 parent 9f87643c

fixed cuBLAS functionality, improved error messages

stim/cuda/cudatools/error.h
  1 +#ifndef STIM_CUDA_ERROR_H
  2 +#define STIM_CUDA_ERROR_H
  3 +
1 #include <stdio.h> 4 #include <stdio.h>
2 #include <iostream> 5 #include <iostream>
3 using namespace std; 6 using namespace std;
4 #include "cuda_runtime.h" 7 #include "cuda_runtime.h"
5 #include "device_launch_parameters.h" 8 #include "device_launch_parameters.h"
6 #include "cufft.h" 9 #include "cufft.h"
7 -  
8 -#ifndef CUDA_HANDLE_ERROR_H  
9 -#define CUDA_HANDLE_ERROR_H 10 +#include "cublas_v2.h"
10 11
11 //handle error macro 12 //handle error macro
12 -static void HandleError( cudaError_t err, const char *file, int line ) { 13 +static void cuHandleError( cudaError_t err, const char *file, int line ) {
13 if (err != cudaSuccess) { 14 if (err != cudaSuccess) {
14 printf("%s in %s at line %d\n", cudaGetErrorString( err ), file, line ); 15 printf("%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
  16 +
15 } 17 }
16 } 18 }
17 -#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) 19 +#define HANDLE_ERROR( err ) (cuHandleError( err, __FILE__, __LINE__ ))
18 20
19 -static void CufftError( cufftResult err ) 21 +static void cufftHandleError( cufftResult err, const char*file, int line )
20 { 22 {
21 if (err != CUFFT_SUCCESS) 23 if (err != CUFFT_SUCCESS)
22 { 24 {
@@ -37,7 +39,29 @@ static void CufftError( cufftResult err ) @@ -37,7 +39,29 @@ static void CufftError( cufftResult err )
37 39
38 } 40 }
39 } 41 }
  42 +#define CUFFT_HANDLE_ERROR( err ) (cufftHandleError( err, __FILE__, __LINE__ ))
40 43
  44 +static void cublasHandleError( cublasStatus_t err, const char*file, int line ){
  45 + if(err != CUBLAS_STATUS_SUCCESS){
  46 + if(err == CUBLAS_STATUS_NOT_INITIALIZED)
  47 + std::cout<<"CUBLAS_STATUS_NOT_INITIALIZED" <<" in file "<<file<<" line "<<std::endl;
  48 + else if(err == CUBLAS_STATUS_ALLOC_FAILED)
  49 + std::cout<<"CUBLAS_STATUS_ALLOC_FAILED" <<" in file "<<file<<" line "<<std::endl;
  50 + else if(err == CUBLAS_STATUS_INVALID_VALUE)
  51 + std::cout<<"CUBLAS_STATUS_INVALID_VALUE" <<" in file "<<file<<" line "<<std::endl;
  52 + else if(err == CUBLAS_STATUS_ARCH_MISMATCH)
  53 + std::cout<<"CUBLAS_STATUS_ARCH_MISMATCH" <<" in file "<<file<<" line "<<std::endl;
  54 + else if(err == CUBLAS_STATUS_MAPPING_ERROR)
  55 + std::cout<<"CUBLAS_STATUS_MAPPING_ERROR" <<" in file "<<file<<" line "<<std::endl;
  56 + else if(err == CUBLAS_STATUS_EXECUTION_FAILED)
  57 + std::cout<<"CUBLAS_STATUS_EXECUTION_FAILED" <<" in file "<<file<<" line "<<std::endl;
  58 + else if(err == CUBLAS_STATUS_INTERNAL_ERROR)
  59 + std::cout<<"CUBLAS_STATUS_INTERNAL_ERROR" <<" in file "<<file<<" line "<<std::endl;
  60 + else
  61 + std::cout<<"Unknown error"<<" in file "<<file<<" line "<<std::endl;
  62 + }
  63 +}
  64 +#define CUBLAS_HANDLE_ERROR( err ) (cublasHandleError( err, __FILE__, __LINE__ ))
41 65
42 66
43 #endif 67 #endif
stim/envi/agilent_binary.h
@@ -6,11 +6,11 @@ @@ -6,11 +6,11 @@
6 #include <fstream> 6 #include <fstream>
7 7
8 //CUDA 8 //CUDA
9 -#ifdef CUDA_FOUND  
10 - #include <cuda_runtime.h>  
11 - #include "cufft.h"  
12 - #include <stim/cuda/cudatools/error.h>  
13 -#endif 9 +//#ifdef CUDA_FOUND
  10 +#include <cuda_runtime.h>
  11 +#include "cufft.h"
  12 +#include <stim/cuda/cudatools/error.h>
  13 +//#endif
14 14
15 namespace stim{ 15 namespace stim{
16 16
@@ -42,6 +42,10 @@ public: @@ -42,6 +42,10 @@ public:
42 alloc(); 42 alloc();
43 } 43 }
44 44
  45 + size_t dim(size_t i){
  46 + return R[i];
  47 + }
  48 +
45 /// Create a deep copy of an agileng_binary object 49 /// Create a deep copy of an agileng_binary object
46 void deep_copy(agilent_binary<T>* dst, const agilent_binary<T>* src){ 50 void deep_copy(agilent_binary<T>* dst, const agilent_binary<T>* src){
47 dst->alloc(src->R[0], src->R[1], src->R[2]); //allocate memory 51 dst->alloc(src->R[0], src->R[1], src->R[2]); //allocate memory
@@ -169,7 +173,6 @@ public: @@ -169,7 +173,6 @@ public:
169 void zeropad(){ 173 void zeropad(){
170 size_t newZ = pow(2, ceil(log(R[2])/log(2))); //find the nearest power-of-two 174 size_t newZ = pow(2, ceil(log(R[2])/log(2))); //find the nearest power-of-two
171 size_t n = newZ - R[2]; //calculate the number of bands to add 175 size_t n = newZ - R[2]; //calculate the number of bands to add
172 - std::cout<<"band padding: "<<n<<std::endl;  
173 zeropad(n); //add the padding 176 zeropad(n); //add the padding
174 } 177 }
175 178
@@ -184,7 +187,7 @@ public: @@ -184,7 +187,7 @@ public:
184 ptr[i] = -log10(ptr[i] / background->ptr[i]); 187 ptr[i] = -log10(ptr[i] / background->ptr[i]);
185 } 188 }
186 189
187 -#ifdef CUDA_FOUND 190 +//#ifdef CUDA_FOUND
188 /// Perform an FFT and return a binary file with bands in the specified range 191 /// Perform an FFT and return a binary file with bands in the specified range
189 agilent_binary<T> fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2){ 192 agilent_binary<T> fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2){
190 auto total_start = std::chrono::high_resolution_clock::now(); 193 auto total_start = std::chrono::high_resolution_clock::now();
@@ -271,7 +274,7 @@ public: @@ -271,7 +274,7 @@ public:
271 274
272 return result; 275 return result;
273 } 276 }
274 -#endif 277 +//#endif
275 278
276 }; 279 };
277 280
@@ -4,6 +4,7 @@ @@ -4,6 +4,7 @@
4 #include "../envi/envi_header.h" 4 #include "../envi/envi_header.h"
5 #include "../envi/hsi.h" 5 #include "../envi/hsi.h"
6 #include "../math/fd_coefficients.h" 6 #include "../math/fd_coefficients.h"
  7 +#include <stim/cuda/cudatools/error.h>
7 #include <cstring> 8 #include <cstring>
8 #include <utility> 9 #include <utility>
9 #include <deque> 10 #include <deque>
@@ -224,10 +225,44 @@ public: @@ -224,10 +225,44 @@ public:
224 } 225 }
225 226
226 //given a Y ,return a XZ slice 227 //given a Y ,return a XZ slice
227 - bool read_plane_y(T * p, unsigned long long y){ 228 + bool read_plane_xz(T * p, size_t y){
228 return binary<T>::read_plane_2(p, y); 229 return binary<T>::read_plane_2(p, y);
229 } 230 }
230 231
  232 + //given a Y, return ZX slice (transposed such that the spectrum is the leading dimension)
  233 + int read_plane_zx(T* p, size_t y){
  234 + T* temp = (T*) malloc(X() * Z() * sizeof(T)); //allocate space to store the temporary xz plane
  235 + binary<T>::read_plane_2(temp, y); //load the plane from disk
  236 + size_t z, x;
  237 + for(z = 0; z < Z(); z++){
  238 + for(x = 0; x <= z; x++){
  239 + p[x * Z() + z] = temp[z * X() + x]; //copy to the destination frame
  240 + }
  241 + }
  242 + }
  243 +
  244 + //load a frame y into a pre-allocated double-precision array
  245 + int read_plane_xzd(double* f, size_t y){
  246 + size_t XB = X() * B();
  247 + T* temp = (T*) malloc(XB * sizeof(T)); //create a temporary location to store the plane at current precision
  248 + if(!read_plane_y(temp, y)) return 1; //read the plane in its native format, if it fails return a 1
  249 + for(size_t i = 0; i < XB; i++) f[i] = temp[i]; //convert the plane to a double
  250 + return 0;
  251 + }
  252 +
  253 + //given a Y, return ZX slice (transposed such that the spectrum is the leading dimension)
  254 + int read_plane_zxd(double* p, size_t y){
  255 + T* temp = (T*) malloc(X() * Z() * sizeof(T)); //allocate space to store the temporary xz plane
  256 + binary<T>::read_plane_2(temp, y); //load the plane from disk
  257 + size_t z, x;
  258 + for(z = 0; z < Z(); z++){
  259 + for(x = 0; x < X(); x++){
  260 + p[x * Z() + z] = (double)temp[z * X() + x]; //copy to the destination frame
  261 + }
  262 + }
  263 + return 0;
  264 + }
  265 +
231 266
232 /// Perform baseline correction given a list of baseline points and stores the result in a new BSQ file. 267 /// Perform baseline correction given a list of baseline points and stores the result in a new BSQ file.
233 268
@@ -268,7 +303,7 @@ public: @@ -268,7 +303,7 @@ public:
268 for (unsigned long long k =0; k < Y(); k++) 303 for (unsigned long long k =0; k < Y(); k++)
269 { 304 {
270 //get the current y slice 305 //get the current y slice
271 - read_plane_y(c, k); 306 + read_plane_xz(c, k);
272 307
273 //initialize lownum, highnum, low, high 308 //initialize lownum, highnum, low, high
274 ai = w[0]; 309 ai = w[0];
@@ -369,7 +404,7 @@ public: @@ -369,7 +404,7 @@ public:
369 404
370 for(unsigned long long j = 0; j < Y(); j++) 405 for(unsigned long long j = 0; j < Y(); j++)
371 { 406 {
372 - read_plane_y(c, j); 407 + read_plane_xz(c, j);
373 for(unsigned long long i = 0; i < B; i++) 408 for(unsigned long long i = 0; i < B; i++)
374 { 409 {
375 for(unsigned long long m = 0; m < X(); m++) 410 for(unsigned long long m = 0; m < X(); m++)
@@ -469,7 +504,7 @@ public: @@ -469,7 +504,7 @@ public:
469 504
470 for ( unsigned long long i = 0; i < Y(); i++) 505 for ( unsigned long long i = 0; i < Y(); i++)
471 { 506 {
472 - read_plane_y(p, i); 507 + read_plane_xz(p, i);
473 for ( unsigned long long k = 0; k < Z(); k++) 508 for ( unsigned long long k = 0; k < Z(); k++)
474 { 509 {
475 unsigned long long ks = k * X(); 510 unsigned long long ks = k * X();
@@ -863,7 +898,7 @@ public: @@ -863,7 +898,7 @@ public:
863 898
864 for (unsigned long long i = 0; i < Y(); i++) //for each value in Y() (BIP should be X) 899 for (unsigned long long i = 0; i < Y(); i++) //for each value in Y() (BIP should be X)
865 { 900 {
866 - read_plane_y(temp, i); //retrieve an ZX slice, stored in temp 901 + read_plane_xz(temp, i); //retrieve an ZX slice, stored in temp
867 for ( unsigned long long j = 0; j < Z(); j++) //for each Z() (Y) 902 for ( unsigned long long j = 0; j < Z(); j++) //for each Z() (Y)
868 { 903 {
869 for (unsigned long long k = 0; k < X(); k++) //for each band 904 for (unsigned long long k = 0; k < X(); k++) //for each band
@@ -933,7 +968,7 @@ public: @@ -933,7 +968,7 @@ public:
933 //for each slice along the y axis 968 //for each slice along the y axis
934 for (unsigned long long y = 0; y < Y(); y++) //Select a page by choosing Y coordinate, Y() 969 for (unsigned long long y = 0; y < Y(); y++) //Select a page by choosing Y coordinate, Y()
935 { 970 {
936 - read_plane_y(slice, y); //retrieve an ZX page, store in "slice" 971 + read_plane_xz(slice, y); //retrieve an ZX page, store in "slice"
937 972
938 //for each sample along X 973 //for each sample along X
939 for (unsigned long long x = 0; x < X(); x++) //Select a pixel by choosing X coordinate in the page, X() 974 for (unsigned long long x = 0; x < X(); x++) //Select a pixel by choosing X coordinate in the page, X()
@@ -1004,7 +1039,7 @@ public: @@ -1004,7 +1039,7 @@ public:
1004 1039
1005 double x; //create a register to store the pixel value 1040 double x; //create a register to store the pixel value
1006 for (unsigned long long k = 0; k < Y(); k++){ 1041 for (unsigned long long k = 0; k < Y(); k++){
1007 - read_plane_y(temp, k); 1042 + read_plane_xz(temp, k);
1008 unsigned long long kx = k * X(); 1043 unsigned long long kx = k * X();
1009 for (unsigned long long i = 0; i < X(); i++){ 1044 for (unsigned long long i = 0; i < X(); i++){
1010 if (mask == NULL || mask[kx + i] != 0){ 1045 if (mask == NULL || mask[kx + i] != 0){
@@ -1025,6 +1060,68 @@ public: @@ -1025,6 +1060,68 @@ public:
1025 return true; 1060 return true;
1026 } 1061 }
1027 1062
  1063 + int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
  1064 + cudaError_t cudaStat;
  1065 + cublasStatus_t stat;
  1066 + cublasHandle_t handle;
  1067 +
  1068 + progress = 0; //initialize the progress to zero (0)
  1069 + size_t XY = X() * Y(); //calculate the number of elements in a band image
  1070 + size_t XB = X() * Z();
  1071 + size_t B = Z(); //calculate the number of spectral elements
  1072 +
  1073 + double* F = (double*)malloc(sizeof(double) * B * X()); //allocate space for the frame that will be pulled from the file
  1074 + double* F_dev;
  1075 + HANDLE_ERROR(cudaMalloc(&F_dev, X() * B * sizeof(double))); //allocate space for the frame on the GPU
  1076 + double* s_dev; //declare a device pointer that will store the spectrum on the GPU
  1077 + double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU
  1078 + double* avg_dev; //declare a device pointer that will store the average spectrum
  1079 + HANDLE_ERROR(cudaMalloc(&s_dev, B * sizeof(double))); //allocate space on the CUDA device for a spectrum
  1080 + HANDLE_ERROR(cudaMalloc(&A_dev, B * B * sizeof(double))); //allocate space on the CUDA device for the covariance matrix
  1081 + HANDLE_ERROR(cudaMemset(A_dev, 0, B * B * sizeof(double))); //initialize the covariance matrix to zero (0)
  1082 + HANDLE_ERROR(cudaMalloc(&avg_dev, XB * sizeof(double))); //allocate space on the CUDA device for the average spectrum
  1083 + for(size_t x = 0; x < X(); x++) //make multiple copies of the average spectrum in order to build a matrix
  1084 + HANDLE_ERROR(cudaMemcpy(&avg_dev[x * B], avg, B * sizeof(double), cudaMemcpyHostToDevice));
  1085 + //stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device
  1086 +
  1087 + double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product)
  1088 + double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction)
  1089 +
  1090 + CUBLAS_HANDLE_ERROR(stat = cublasCreate(&handle)); //create a cuBLAS instance
  1091 + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid
  1092 +
  1093 + else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
  1094 + double beta = 1.0;
  1095 + size_t x, y;
  1096 + for(y = 0; y < Y(); y++){ //for each line
  1097 + read_plane_zxd(F, y); //read a frame from the file
  1098 + HANDLE_ERROR(cudaMemcpy(F_dev, F, XB * sizeof(double), cudaMemcpyHostToDevice)); //copy the frame to the GPU
  1099 + CUBLAS_HANDLE_ERROR(cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, (int)B, (int)X(), &axpy_alpha, avg_dev, (int)B, &beta, F_dev, (int)B, F_dev, (int)B));//subtract the mean spectrum
  1100 +
  1101 + for(x = 0; x < X(); x++)
  1102 + CUBLAS_HANDLE_ERROR(cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, &F_dev[x*B], 1, A_dev, (int)B)); //perform an outer product
  1103 + if(PROGRESS) progress = (double)(y + 1) / Y() * 100;
  1104 + }
  1105 +
  1106 + cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, co, (int)B); //copy the result from the GPU to the CPU
  1107 +
  1108 + cudaFree(A_dev); //clean up allocated device memory
  1109 + cudaFree(s_dev);
  1110 + cudaFree(avg_dev);
  1111 +
  1112 + for(unsigned long long i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion
  1113 + for(unsigned long long j = i+1; j < B; j++){
  1114 + co[B * i + j] = co[B * j + i];
  1115 + }
  1116 + }
  1117 +
  1118 + return 0;
  1119 +
  1120 +
  1121 +
  1122 + }
  1123 +
  1124 +
1028 /// Calculate the covariance matrix for all masked pixels in the image. 1125 /// Calculate the covariance matrix for all masked pixels in the image.
1029 1126
1030 /// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix 1127 /// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
@@ -1032,6 +1129,16 @@ public: @@ -1032,6 +1129,16 @@ public:
1032 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location 1129 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
1033 bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ 1130 bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
1034 progress = 0; 1131 progress = 0;
  1132 +
  1133 + int dev_count;
  1134 + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices
  1135 + cudaDeviceProp prop;
  1136 + cudaGetDeviceProperties(&prop, 0); //get the property of the first device
  1137 + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator
  1138 + int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
  1139 + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done
  1140 + } //otherwise continue using the CPU
  1141 +
1035 //memory allocation 1142 //memory allocation
1036 unsigned long long xy = X() * Y(); 1143 unsigned long long xy = X() * Y();
1037 unsigned long long B = Z(); 1144 unsigned long long B = Z();
@@ -1328,7 +1435,7 @@ public: @@ -1328,7 +1435,7 @@ public:
1328 c = (T*)malloc( L ); //allocate space for the slice 1435 c = (T*)malloc( L ); //allocate space for the slice
1329 1436
1330 for(unsigned long long j = 0; j < Y(); j++){ //for each line 1437 for(unsigned long long j = 0; j < Y(); j++){ //for each line
1331 - read_plane_y(c, j); //load the line into memory 1438 + read_plane_xz(c, j); //load the line into memory
1332 for(unsigned long long i = 0; i < B; i++){ //for each band 1439 for(unsigned long long i = 0; i < B; i++){ //for each band
1333 for(unsigned long long m = 0; m < X(); m++){ //for each sample 1440 for(unsigned long long m = 0; m < X(); m++){ //for each sample
1334 if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked 1441 if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked
@@ -1358,7 +1465,7 @@ public: @@ -1358,7 +1465,7 @@ public:
1358 c = (T*)malloc( L ); //allocate space for the slice 1465 c = (T*)malloc( L ); //allocate space for the slice
1359 1466
1360 for(unsigned long long j = 0; j < Y(); j++){ //for each line 1467 for(unsigned long long j = 0; j < Y(); j++){ //for each line
1361 - read_plane_y(c, j); //load the line into memory 1468 + read_plane_xz(c, j); //load the line into memory
1362 for(unsigned long long i = 0; i < B; i++){ //for each band 1469 for(unsigned long long i = 0; i < B; i++){ //for each band
1363 for(unsigned long long m = 0; m < X(); m++){ //for each sample 1470 for(unsigned long long m = 0; m < X(); m++){ //for each sample
1364 if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked 1471 if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked
@@ -8,10 +8,11 @@ @@ -8,10 +8,11 @@
8 #include <utility> 8 #include <utility>
9 9
10 //CUDA 10 //CUDA
11 -#ifdef CUDA_FOUND  
12 - #include <cuda_runtime.h>  
13 - #include "cublas_v2.h"  
14 -#endif 11 +//#ifdef CUDA_FOUND
  12 +#include <stim/cuda/cudatools/error.h>
  13 +#include <cuda_runtime.h>
  14 +#include "cublas_v2.h"
  15 +//#endif
15 16
16 namespace stim{ 17 namespace stim{
17 18
@@ -257,7 +258,7 @@ public: @@ -257,7 +258,7 @@ public:
257 } 258 }
258 259
259 //given a Y ,return a ZX slice 260 //given a Y ,return a ZX slice
260 - bool read_plane_y(T * p, unsigned long long y){ 261 + bool read_plane_y(T * p, size_t y){
261 return binary<T>::read_plane_2(p, y); 262 return binary<T>::read_plane_2(p, y);
262 } 263 }
263 264
@@ -982,16 +983,15 @@ public: @@ -982,16 +983,15 @@ public:
982 free(temp); 983 free(temp);
983 return true; 984 return true;
984 } 985 }
985 -#ifdef CUDA_FOUND 986 +//#ifdef CUDA_FOUND
986 /// Calculate the covariance matrix for masked pixels using cuBLAS 987 /// Calculate the covariance matrix for masked pixels using cuBLAS
987 /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra 988 /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
988 - bool co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ 989 + int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
989 990
990 cudaError_t cudaStat; 991 cudaError_t cudaStat;
991 cublasStatus_t stat; 992 cublasStatus_t stat;
992 cublasHandle_t handle; 993 cublasHandle_t handle;
993 994
994 - progress = 0; //initialize the progress to zero (0)  
995 unsigned long long XY = X() * Y(); //calculate the number of elements in a band image 995 unsigned long long XY = X() * Y(); //calculate the number of elements in a band image
996 unsigned long long B = Z(); //calculate the number of spectral elements 996 unsigned long long B = Z(); //calculate the number of spectral elements
997 997
@@ -1009,10 +1009,9 @@ public: @@ -1009,10 +1009,9 @@ public:
1009 double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) 1009 double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction)
1010 1010
1011 stat = cublasCreate(&handle); //create a cuBLAS instance 1011 stat = cublasCreate(&handle); //create a cuBLAS instance
1012 - if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid  
1013 - printf ("CUBLAS initialization failed\n");  
1014 - return EXIT_FAILURE;  
1015 - } 1012 + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid
  1013 +
  1014 + else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
1016 for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel 1015 for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel
1017 if (mask == NULL || mask[xy] != 0){ 1016 if (mask == NULL || mask[xy] != 0){
1018 pixeld(s, xy); //retreive the spectrum at the current xy pixel location 1017 pixeld(s, xy); //retreive the spectrum at the current xy pixel location
@@ -1036,9 +1035,9 @@ public: @@ -1036,9 +1035,9 @@ public:
1036 } 1035 }
1037 } 1036 }
1038 1037
1039 - return true; 1038 + return 0;
1040 } 1039 }
1041 -#endif 1040 +//#endif
1042 1041
1043 /// Calculate the covariance matrix for all masked pixels in the image with 64-bit floating point precision. 1042 /// Calculate the covariance matrix for all masked pixels in the image with 64-bit floating point precision.
1044 1043
@@ -1046,16 +1045,19 @@ public: @@ -1046,16 +1045,19 @@ public:
1046 /// @param avg is a pointer to memory of size B that stores the average spectrum 1045 /// @param avg is a pointer to memory of size B that stores the average spectrum
1047 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location 1046 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
1048 bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ 1047 bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
  1048 + progress = 0;
1049 1049
1050 -#ifdef CUDA_FOUND 1050 +//#ifdef CUDA_FOUND
1051 int dev_count; 1051 int dev_count;
1052 cudaGetDeviceCount(&dev_count); //get the number of CUDA devices 1052 cudaGetDeviceCount(&dev_count); //get the number of CUDA devices
1053 cudaDeviceProp prop; 1053 cudaDeviceProp prop;
1054 cudaGetDeviceProperties(&prop, 0); //get the property of the first device 1054 cudaGetDeviceProperties(&prop, 0); //get the property of the first device
1055 - if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator  
1056 - return co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix  
1057 -#endif  
1058 - progress = 0; 1055 + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator
  1056 + int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
  1057 + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done
  1058 + } //otherwise continue using the CPU
  1059 +//#endif
  1060 +
1059 //memory allocation 1061 //memory allocation
1060 unsigned long long XY = X() * Y(); 1062 unsigned long long XY = X() * Y();
1061 unsigned long long B = Z(); 1063 unsigned long long B = Z();
@@ -1097,10 +1099,10 @@ public: @@ -1097,10 +1099,10 @@ public:
1097 } 1099 }
1098 1100
1099 1101
1100 -#ifdef CUDA_FOUND 1102 +//#ifdef CUDA_FOUND
1101 /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS 1103 /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS
1102 /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra 1104 /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
1103 - bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ 1105 + int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
1104 1106
1105 cudaError_t cudaStat; 1107 cudaError_t cudaStat;
1106 cublasStatus_t stat; 1108 cublasStatus_t stat;
@@ -1128,11 +1130,9 @@ public: @@ -1128,11 +1130,9 @@ public:
1128 double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) 1130 double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product)
1129 double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) 1131 double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction)
1130 1132
1131 - stat = cublasCreate(&handle); //create a cuBLAS instance  
1132 - if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid  
1133 - printf ("CUBLAS initialization failed\n");  
1134 - return EXIT_FAILURE;  
1135 - } 1133 + CUBLAS_HANDLE_ERROR(cublasCreate(&handle)); //create a cuBLAS instance
  1134 + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid
  1135 +
1136 for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel 1136 for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel
1137 if (mask == NULL || mask[xy] != 0){ 1137 if (mask == NULL || mask[xy] != 0){
1138 pixeld(s, xy); //retreive the spectrum at the current xy pixel location 1138 pixeld(s, xy); //retreive the spectrum at the current xy pixel location
@@ -1165,7 +1165,7 @@ public: @@ -1165,7 +1165,7 @@ public:
1165 1165
1166 return true; 1166 return true;
1167 } 1167 }
1168 -#endif 1168 +//#endif
1169 1169
1170 /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision. 1170 /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision.
1171 1171
@@ -1174,14 +1174,18 @@ public: @@ -1174,14 +1174,18 @@ public:
1174 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location 1174 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
1175 bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ 1175 bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
1176 1176
1177 -#ifdef CUDA_FOUND 1177 +//#ifdef CUDA_FOUND
1178 int dev_count; 1178 int dev_count;
1179 cudaGetDeviceCount(&dev_count); //get the number of CUDA devices 1179 cudaGetDeviceCount(&dev_count); //get the number of CUDA devices
1180 cudaDeviceProp prop; 1180 cudaDeviceProp prop;
1181 cudaGetDeviceProperties(&prop, 0); //get the property of the first device 1181 cudaGetDeviceProperties(&prop, 0); //get the property of the first device
1182 - if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator  
1183 - return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix  
1184 -#endif 1182 + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator
  1183 + int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
  1184 + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done
  1185 + } //otherwise continue using the CPU
  1186 + //if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator
  1187 + // return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
  1188 +//#endif
1185 1189
1186 1190
1187 1191