Commit 2781a48c74b990c3242b2f862ec696de77483afc

Authored by David Mayerich
1 parent 9f87643c

fixed cuBLAS functionality, improved error messages

stim/cuda/cudatools/error.h
  1 +#ifndef STIM_CUDA_ERROR_H
  2 +#define STIM_CUDA_ERROR_H
  3 +
1 4 #include <stdio.h>
2 5 #include <iostream>
3 6 using namespace std;
4 7 #include "cuda_runtime.h"
5 8 #include "device_launch_parameters.h"
6 9 #include "cufft.h"
7   -
8   -#ifndef CUDA_HANDLE_ERROR_H
9   -#define CUDA_HANDLE_ERROR_H
  10 +#include "cublas_v2.h"
10 11  
11 12 //handle error macro
12   -static void HandleError( cudaError_t err, const char *file, int line ) {
  13 +static void cuHandleError( cudaError_t err, const char *file, int line ) {
13 14 if (err != cudaSuccess) {
14 15 printf("%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
  16 +
15 17 }
16 18 }
17   -#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
  19 +#define HANDLE_ERROR( err ) (cuHandleError( err, __FILE__, __LINE__ ))
18 20  
19   -static void CufftError( cufftResult err )
  21 +static void cufftHandleError( cufftResult err, const char*file, int line )
20 22 {
21 23 if (err != CUFFT_SUCCESS)
22 24 {
... ... @@ -37,7 +39,29 @@ static void CufftError( cufftResult err )
37 39  
38 40 }
39 41 }
  42 +#define CUFFT_HANDLE_ERROR( err ) (cufftHandleError( err, __FILE__, __LINE__ ))
40 43  
  44 +static void cublasHandleError( cublasStatus_t err, const char*file, int line ){
  45 + if(err != CUBLAS_STATUS_SUCCESS){
  46 + if(err == CUBLAS_STATUS_NOT_INITIALIZED)
  47 + std::cout<<"CUBLAS_STATUS_NOT_INITIALIZED" <<" in file "<<file<<" line "<<std::endl;
  48 + else if(err == CUBLAS_STATUS_ALLOC_FAILED)
  49 + std::cout<<"CUBLAS_STATUS_ALLOC_FAILED" <<" in file "<<file<<" line "<<std::endl;
  50 + else if(err == CUBLAS_STATUS_INVALID_VALUE)
  51 + std::cout<<"CUBLAS_STATUS_INVALID_VALUE" <<" in file "<<file<<" line "<<std::endl;
  52 + else if(err == CUBLAS_STATUS_ARCH_MISMATCH)
  53 + std::cout<<"CUBLAS_STATUS_ARCH_MISMATCH" <<" in file "<<file<<" line "<<std::endl;
  54 + else if(err == CUBLAS_STATUS_MAPPING_ERROR)
  55 + std::cout<<"CUBLAS_STATUS_MAPPING_ERROR" <<" in file "<<file<<" line "<<std::endl;
  56 + else if(err == CUBLAS_STATUS_EXECUTION_FAILED)
  57 + std::cout<<"CUBLAS_STATUS_EXECUTION_FAILED" <<" in file "<<file<<" line "<<std::endl;
  58 + else if(err == CUBLAS_STATUS_INTERNAL_ERROR)
  59 + std::cout<<"CUBLAS_STATUS_INTERNAL_ERROR" <<" in file "<<file<<" line "<<std::endl;
  60 + else
  61 + std::cout<<"Unknown error"<<" in file "<<file<<" line "<<std::endl;
  62 + }
  63 +}
  64 +#define CUBLAS_HANDLE_ERROR( err ) (cublasHandleError( err, __FILE__, __LINE__ ))
41 65  
42 66  
43 67 #endif
... ...
stim/envi/agilent_binary.h
... ... @@ -6,11 +6,11 @@
6 6 #include <fstream>
7 7  
8 8 //CUDA
9   -#ifdef CUDA_FOUND
10   - #include <cuda_runtime.h>
11   - #include "cufft.h"
12   - #include <stim/cuda/cudatools/error.h>
13   -#endif
  9 +//#ifdef CUDA_FOUND
  10 +#include <cuda_runtime.h>
  11 +#include "cufft.h"
  12 +#include <stim/cuda/cudatools/error.h>
  13 +//#endif
14 14  
15 15 namespace stim{
16 16  
... ... @@ -42,6 +42,10 @@ public:
42 42 alloc();
43 43 }
44 44  
  45 + size_t dim(size_t i){
  46 + return R[i];
  47 + }
  48 +
45 49 /// Create a deep copy of an agileng_binary object
46 50 void deep_copy(agilent_binary<T>* dst, const agilent_binary<T>* src){
47 51 dst->alloc(src->R[0], src->R[1], src->R[2]); //allocate memory
... ... @@ -169,7 +173,6 @@ public:
169 173 void zeropad(){
170 174 size_t newZ = pow(2, ceil(log(R[2])/log(2))); //find the nearest power-of-two
171 175 size_t n = newZ - R[2]; //calculate the number of bands to add
172   - std::cout<<"band padding: "<<n<<std::endl;
173 176 zeropad(n); //add the padding
174 177 }
175 178  
... ... @@ -184,7 +187,7 @@ public:
184 187 ptr[i] = -log10(ptr[i] / background->ptr[i]);
185 188 }
186 189  
187   -#ifdef CUDA_FOUND
  190 +//#ifdef CUDA_FOUND
188 191 /// Perform an FFT and return a binary file with bands in the specified range
189 192 agilent_binary<T> fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2){
190 193 auto total_start = std::chrono::high_resolution_clock::now();
... ... @@ -271,7 +274,7 @@ public:
271 274  
272 275 return result;
273 276 }
274   -#endif
  277 +//#endif
275 278  
276 279 };
277 280  
... ...
stim/envi/bil.h
... ... @@ -4,6 +4,7 @@
4 4 #include "../envi/envi_header.h"
5 5 #include "../envi/hsi.h"
6 6 #include "../math/fd_coefficients.h"
  7 +#include <stim/cuda/cudatools/error.h>
7 8 #include <cstring>
8 9 #include <utility>
9 10 #include <deque>
... ... @@ -224,10 +225,44 @@ public:
224 225 }
225 226  
226 227 //given a Y ,return a XZ slice
227   - bool read_plane_y(T * p, unsigned long long y){
  228 + bool read_plane_xz(T * p, size_t y){
228 229 return binary<T>::read_plane_2(p, y);
229 230 }
230 231  
  232 + //given a Y, return ZX slice (transposed such that the spectrum is the leading dimension)
  233 + int read_plane_zx(T* p, size_t y){
  234 + T* temp = (T*) malloc(X() * Z() * sizeof(T)); //allocate space to store the temporary xz plane
  235 + binary<T>::read_plane_2(temp, y); //load the plane from disk
  236 + size_t z, x;
  237 + for(z = 0; z < Z(); z++){
  238 + for(x = 0; x <= z; x++){
  239 + p[x * Z() + z] = temp[z * X() + x]; //copy to the destination frame
  240 + }
  241 + }
  242 + }
  243 +
  244 + //load a frame y into a pre-allocated double-precision array
  245 + int read_plane_xzd(double* f, size_t y){
  246 + size_t XB = X() * B();
  247 + T* temp = (T*) malloc(XB * sizeof(T)); //create a temporary location to store the plane at current precision
  248 + if(!read_plane_y(temp, y)) return 1; //read the plane in its native format, if it fails return a 1
  249 + for(size_t i = 0; i < XB; i++) f[i] = temp[i]; //convert the plane to a double
  250 + return 0;
  251 + }
  252 +
  253 + //given a Y, return ZX slice (transposed such that the spectrum is the leading dimension)
  254 + int read_plane_zxd(double* p, size_t y){
  255 + T* temp = (T*) malloc(X() * Z() * sizeof(T)); //allocate space to store the temporary xz plane
  256 + binary<T>::read_plane_2(temp, y); //load the plane from disk
  257 + size_t z, x;
  258 + for(z = 0; z < Z(); z++){
  259 + for(x = 0; x < X(); x++){
  260 + p[x * Z() + z] = (double)temp[z * X() + x]; //copy to the destination frame
  261 + }
  262 + }
  263 + return 0;
  264 + }
  265 +
231 266  
232 267 /// Perform baseline correction given a list of baseline points and stores the result in a new BSQ file.
233 268  
... ... @@ -268,7 +303,7 @@ public:
268 303 for (unsigned long long k =0; k < Y(); k++)
269 304 {
270 305 //get the current y slice
271   - read_plane_y(c, k);
  306 + read_plane_xz(c, k);
272 307  
273 308 //initialize lownum, highnum, low, high
274 309 ai = w[0];
... ... @@ -369,7 +404,7 @@ public:
369 404  
370 405 for(unsigned long long j = 0; j < Y(); j++)
371 406 {
372   - read_plane_y(c, j);
  407 + read_plane_xz(c, j);
373 408 for(unsigned long long i = 0; i < B; i++)
374 409 {
375 410 for(unsigned long long m = 0; m < X(); m++)
... ... @@ -469,7 +504,7 @@ public:
469 504  
470 505 for ( unsigned long long i = 0; i < Y(); i++)
471 506 {
472   - read_plane_y(p, i);
  507 + read_plane_xz(p, i);
473 508 for ( unsigned long long k = 0; k < Z(); k++)
474 509 {
475 510 unsigned long long ks = k * X();
... ... @@ -863,7 +898,7 @@ public:
863 898  
864 899 for (unsigned long long i = 0; i < Y(); i++) //for each value in Y() (BIP should be X)
865 900 {
866   - read_plane_y(temp, i); //retrieve an ZX slice, stored in temp
  901 + read_plane_xz(temp, i); //retrieve an ZX slice, stored in temp
867 902 for ( unsigned long long j = 0; j < Z(); j++) //for each Z() (Y)
868 903 {
869 904 for (unsigned long long k = 0; k < X(); k++) //for each band
... ... @@ -933,7 +968,7 @@ public:
933 968 //for each slice along the y axis
934 969 for (unsigned long long y = 0; y < Y(); y++) //Select a page by choosing Y coordinate, Y()
935 970 {
936   - read_plane_y(slice, y); //retrieve an ZX page, store in "slice"
  971 + read_plane_xz(slice, y); //retrieve an ZX page, store in "slice"
937 972  
938 973 //for each sample along X
939 974 for (unsigned long long x = 0; x < X(); x++) //Select a pixel by choosing X coordinate in the page, X()
... ... @@ -1004,7 +1039,7 @@ public:
1004 1039  
1005 1040 double x; //create a register to store the pixel value
1006 1041 for (unsigned long long k = 0; k < Y(); k++){
1007   - read_plane_y(temp, k);
  1042 + read_plane_xz(temp, k);
1008 1043 unsigned long long kx = k * X();
1009 1044 for (unsigned long long i = 0; i < X(); i++){
1010 1045 if (mask == NULL || mask[kx + i] != 0){
... ... @@ -1025,6 +1060,68 @@ public:
1025 1060 return true;
1026 1061 }
1027 1062  
  1063 + int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
  1064 + cudaError_t cudaStat;
  1065 + cublasStatus_t stat;
  1066 + cublasHandle_t handle;
  1067 +
  1068 + progress = 0; //initialize the progress to zero (0)
  1069 + size_t XY = X() * Y(); //calculate the number of elements in a band image
  1070 + size_t XB = X() * Z();
  1071 + size_t B = Z(); //calculate the number of spectral elements
  1072 +
  1073 + double* F = (double*)malloc(sizeof(double) * B * X()); //allocate space for the frame that will be pulled from the file
  1074 + double* F_dev;
  1075 + HANDLE_ERROR(cudaMalloc(&F_dev, X() * B * sizeof(double))); //allocate space for the frame on the GPU
  1076 + double* s_dev; //declare a device pointer that will store the spectrum on the GPU
  1077 + double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU
  1078 + double* avg_dev; //declare a device pointer that will store the average spectrum
  1079 + HANDLE_ERROR(cudaMalloc(&s_dev, B * sizeof(double))); //allocate space on the CUDA device for a spectrum
  1080 + HANDLE_ERROR(cudaMalloc(&A_dev, B * B * sizeof(double))); //allocate space on the CUDA device for the covariance matrix
  1081 + HANDLE_ERROR(cudaMemset(A_dev, 0, B * B * sizeof(double))); //initialize the covariance matrix to zero (0)
  1082 + HANDLE_ERROR(cudaMalloc(&avg_dev, XB * sizeof(double))); //allocate space on the CUDA device for the average spectrum
  1083 + for(size_t x = 0; x < X(); x++) //make multiple copies of the average spectrum in order to build a matrix
  1084 + HANDLE_ERROR(cudaMemcpy(&avg_dev[x * B], avg, B * sizeof(double), cudaMemcpyHostToDevice));
  1085 + //stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device
  1086 +
  1087 + double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product)
  1088 + double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction)
  1089 +
  1090 + CUBLAS_HANDLE_ERROR(stat = cublasCreate(&handle)); //create a cuBLAS instance
  1091 + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid
  1092 +
  1093 + else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
  1094 + double beta = 1.0;
  1095 + size_t x, y;
  1096 + for(y = 0; y < Y(); y++){ //for each line
  1097 + read_plane_zxd(F, y); //read a frame from the file
  1098 + HANDLE_ERROR(cudaMemcpy(F_dev, F, XB * sizeof(double), cudaMemcpyHostToDevice)); //copy the frame to the GPU
  1099 + CUBLAS_HANDLE_ERROR(cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, (int)B, (int)X(), &axpy_alpha, avg_dev, (int)B, &beta, F_dev, (int)B, F_dev, (int)B));//subtract the mean spectrum
  1100 +
  1101 + for(x = 0; x < X(); x++)
  1102 + CUBLAS_HANDLE_ERROR(cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, &F_dev[x*B], 1, A_dev, (int)B)); //perform an outer product
  1103 + if(PROGRESS) progress = (double)(y + 1) / Y() * 100;
  1104 + }
  1105 +
  1106 + cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, co, (int)B); //copy the result from the GPU to the CPU
  1107 +
  1108 + cudaFree(A_dev); //clean up allocated device memory
  1109 + cudaFree(s_dev);
  1110 + cudaFree(avg_dev);
  1111 +
  1112 + for(unsigned long long i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion
  1113 + for(unsigned long long j = i+1; j < B; j++){
  1114 + co[B * i + j] = co[B * j + i];
  1115 + }
  1116 + }
  1117 +
  1118 + return 0;
  1119 +
  1120 +
  1121 +
  1122 + }
  1123 +
  1124 +
1028 1125 /// Calculate the covariance matrix for all masked pixels in the image.
1029 1126  
1030 1127 /// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
... ... @@ -1032,6 +1129,16 @@ public:
1032 1129 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
1033 1130 bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
1034 1131 progress = 0;
  1132 +
  1133 + int dev_count;
  1134 + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices
  1135 + cudaDeviceProp prop;
  1136 + cudaGetDeviceProperties(&prop, 0); //get the property of the first device
  1137 + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator
  1138 + int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
  1139 + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done
  1140 + } //otherwise continue using the CPU
  1141 +
1035 1142 //memory allocation
1036 1143 unsigned long long xy = X() * Y();
1037 1144 unsigned long long B = Z();
... ... @@ -1328,7 +1435,7 @@ public:
1328 1435 c = (T*)malloc( L ); //allocate space for the slice
1329 1436  
1330 1437 for(unsigned long long j = 0; j < Y(); j++){ //for each line
1331   - read_plane_y(c, j); //load the line into memory
  1438 + read_plane_xz(c, j); //load the line into memory
1332 1439 for(unsigned long long i = 0; i < B; i++){ //for each band
1333 1440 for(unsigned long long m = 0; m < X(); m++){ //for each sample
1334 1441 if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked
... ... @@ -1358,7 +1465,7 @@ public:
1358 1465 c = (T*)malloc( L ); //allocate space for the slice
1359 1466  
1360 1467 for(unsigned long long j = 0; j < Y(); j++){ //for each line
1361   - read_plane_y(c, j); //load the line into memory
  1468 + read_plane_xz(c, j); //load the line into memory
1362 1469 for(unsigned long long i = 0; i < B; i++){ //for each band
1363 1470 for(unsigned long long m = 0; m < X(); m++){ //for each sample
1364 1471 if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked
... ...
stim/envi/bip.h
... ... @@ -8,10 +8,11 @@
8 8 #include <utility>
9 9  
10 10 //CUDA
11   -#ifdef CUDA_FOUND
12   - #include <cuda_runtime.h>
13   - #include "cublas_v2.h"
14   -#endif
  11 +//#ifdef CUDA_FOUND
  12 +#include <stim/cuda/cudatools/error.h>
  13 +#include <cuda_runtime.h>
  14 +#include "cublas_v2.h"
  15 +//#endif
15 16  
16 17 namespace stim{
17 18  
... ... @@ -257,7 +258,7 @@ public:
257 258 }
258 259  
259 260 //given a Y ,return a ZX slice
260   - bool read_plane_y(T * p, unsigned long long y){
  261 + bool read_plane_y(T * p, size_t y){
261 262 return binary<T>::read_plane_2(p, y);
262 263 }
263 264  
... ... @@ -982,16 +983,15 @@ public:
982 983 free(temp);
983 984 return true;
984 985 }
985   -#ifdef CUDA_FOUND
  986 +//#ifdef CUDA_FOUND
986 987 /// Calculate the covariance matrix for masked pixels using cuBLAS
987 988 /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
988   - bool co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
  989 + int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
989 990  
990 991 cudaError_t cudaStat;
991 992 cublasStatus_t stat;
992 993 cublasHandle_t handle;
993 994  
994   - progress = 0; //initialize the progress to zero (0)
995 995 unsigned long long XY = X() * Y(); //calculate the number of elements in a band image
996 996 unsigned long long B = Z(); //calculate the number of spectral elements
997 997  
... ... @@ -1009,10 +1009,9 @@ public:
1009 1009 double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction)
1010 1010  
1011 1011 stat = cublasCreate(&handle); //create a cuBLAS instance
1012   - if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid
1013   - printf ("CUBLAS initialization failed\n");
1014   - return EXIT_FAILURE;
1015   - }
  1012 + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid
  1013 +
  1014 + else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
1016 1015 for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel
1017 1016 if (mask == NULL || mask[xy] != 0){
1018 1017 pixeld(s, xy); //retreive the spectrum at the current xy pixel location
... ... @@ -1036,9 +1035,9 @@ public:
1036 1035 }
1037 1036 }
1038 1037  
1039   - return true;
  1038 + return 0;
1040 1039 }
1041   -#endif
  1040 +//#endif
1042 1041  
1043 1042 /// Calculate the covariance matrix for all masked pixels in the image with 64-bit floating point precision.
1044 1043  
... ... @@ -1046,16 +1045,19 @@ public:
1046 1045 /// @param avg is a pointer to memory of size B that stores the average spectrum
1047 1046 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
1048 1047 bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
  1048 + progress = 0;
1049 1049  
1050   -#ifdef CUDA_FOUND
  1050 +//#ifdef CUDA_FOUND
1051 1051 int dev_count;
1052 1052 cudaGetDeviceCount(&dev_count); //get the number of CUDA devices
1053 1053 cudaDeviceProp prop;
1054 1054 cudaGetDeviceProperties(&prop, 0); //get the property of the first device
1055   - if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator
1056   - return co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
1057   -#endif
1058   - progress = 0;
  1055 + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator
  1056 + int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
  1057 + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done
  1058 + } //otherwise continue using the CPU
  1059 +//#endif
  1060 +
1059 1061 //memory allocation
1060 1062 unsigned long long XY = X() * Y();
1061 1063 unsigned long long B = Z();
... ... @@ -1097,10 +1099,10 @@ public:
1097 1099 }
1098 1100  
1099 1101  
1100   -#ifdef CUDA_FOUND
  1102 +//#ifdef CUDA_FOUND
1101 1103 /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS
1102 1104 /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
1103   - bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
  1105 + int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
1104 1106  
1105 1107 cudaError_t cudaStat;
1106 1108 cublasStatus_t stat;
... ... @@ -1128,11 +1130,9 @@ public:
1128 1130 double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product)
1129 1131 double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction)
1130 1132  
1131   - stat = cublasCreate(&handle); //create a cuBLAS instance
1132   - if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid
1133   - printf ("CUBLAS initialization failed\n");
1134   - return EXIT_FAILURE;
1135   - }
  1133 + CUBLAS_HANDLE_ERROR(cublasCreate(&handle)); //create a cuBLAS instance
  1134 + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid
  1135 +
1136 1136 for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel
1137 1137 if (mask == NULL || mask[xy] != 0){
1138 1138 pixeld(s, xy); //retreive the spectrum at the current xy pixel location
... ... @@ -1165,7 +1165,7 @@ public:
1165 1165  
1166 1166 return true;
1167 1167 }
1168   -#endif
  1168 +//#endif
1169 1169  
1170 1170 /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision.
1171 1171  
... ... @@ -1174,14 +1174,18 @@ public:
1174 1174 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
1175 1175 bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
1176 1176  
1177   -#ifdef CUDA_FOUND
  1177 +//#ifdef CUDA_FOUND
1178 1178 int dev_count;
1179 1179 cudaGetDeviceCount(&dev_count); //get the number of CUDA devices
1180 1180 cudaDeviceProp prop;
1181 1181 cudaGetDeviceProperties(&prop, 0); //get the property of the first device
1182   - if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator
1183   - return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
1184   -#endif
  1182 + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator
  1183 + int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
  1184 + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done
  1185 + } //otherwise continue using the CPU
  1186 + //if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator
  1187 + // return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
  1188 +//#endif
1185 1189  
1186 1190  
1187 1191  
... ...