Commit 2781a48c74b990c3242b2f862ec696de77483afc
1 parent
9f87643c
fixed cuBLAS functionality, improved error messages
Showing
4 changed files
with
192 additions
and
54 deletions
Show diff stats
stim/cuda/cudatools/error.h
1 | +#ifndef STIM_CUDA_ERROR_H | |
2 | +#define STIM_CUDA_ERROR_H | |
3 | + | |
1 | 4 | #include <stdio.h> |
2 | 5 | #include <iostream> |
3 | 6 | using namespace std; |
4 | 7 | #include "cuda_runtime.h" |
5 | 8 | #include "device_launch_parameters.h" |
6 | 9 | #include "cufft.h" |
7 | - | |
8 | -#ifndef CUDA_HANDLE_ERROR_H | |
9 | -#define CUDA_HANDLE_ERROR_H | |
10 | +#include "cublas_v2.h" | |
10 | 11 | |
11 | 12 | //handle error macro |
12 | -static void HandleError( cudaError_t err, const char *file, int line ) { | |
13 | +static void cuHandleError( cudaError_t err, const char *file, int line ) { | |
13 | 14 | if (err != cudaSuccess) { |
14 | 15 | printf("%s in %s at line %d\n", cudaGetErrorString( err ), file, line ); |
16 | + | |
15 | 17 | } |
16 | 18 | } |
17 | -#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) | |
19 | +#define HANDLE_ERROR( err ) (cuHandleError( err, __FILE__, __LINE__ )) | |
18 | 20 | |
19 | -static void CufftError( cufftResult err ) | |
21 | +static void cufftHandleError( cufftResult err, const char*file, int line ) | |
20 | 22 | { |
21 | 23 | if (err != CUFFT_SUCCESS) |
22 | 24 | { |
... | ... | @@ -37,7 +39,29 @@ static void CufftError( cufftResult err ) |
37 | 39 | |
38 | 40 | } |
39 | 41 | } |
42 | +#define CUFFT_HANDLE_ERROR( err ) (cufftHandleError( err, __FILE__, __LINE__ )) | |
40 | 43 | |
44 | +static void cublasHandleError( cublasStatus_t err, const char*file, int line ){ | |
45 | + if(err != CUBLAS_STATUS_SUCCESS){ | |
46 | + if(err == CUBLAS_STATUS_NOT_INITIALIZED) | |
47 | + std::cout<<"CUBLAS_STATUS_NOT_INITIALIZED" <<" in file "<<file<<" line "<<std::endl; | |
48 | + else if(err == CUBLAS_STATUS_ALLOC_FAILED) | |
49 | + std::cout<<"CUBLAS_STATUS_ALLOC_FAILED" <<" in file "<<file<<" line "<<std::endl; | |
50 | + else if(err == CUBLAS_STATUS_INVALID_VALUE) | |
51 | + std::cout<<"CUBLAS_STATUS_INVALID_VALUE" <<" in file "<<file<<" line "<<std::endl; | |
52 | + else if(err == CUBLAS_STATUS_ARCH_MISMATCH) | |
53 | + std::cout<<"CUBLAS_STATUS_ARCH_MISMATCH" <<" in file "<<file<<" line "<<std::endl; | |
54 | + else if(err == CUBLAS_STATUS_MAPPING_ERROR) | |
55 | + std::cout<<"CUBLAS_STATUS_MAPPING_ERROR" <<" in file "<<file<<" line "<<std::endl; | |
56 | + else if(err == CUBLAS_STATUS_EXECUTION_FAILED) | |
57 | + std::cout<<"CUBLAS_STATUS_EXECUTION_FAILED" <<" in file "<<file<<" line "<<std::endl; | |
58 | + else if(err == CUBLAS_STATUS_INTERNAL_ERROR) | |
59 | + std::cout<<"CUBLAS_STATUS_INTERNAL_ERROR" <<" in file "<<file<<" line "<<std::endl; | |
60 | + else | |
61 | + std::cout<<"Unknown error"<<" in file "<<file<<" line "<<std::endl; | |
62 | + } | |
63 | +} | |
64 | +#define CUBLAS_HANDLE_ERROR( err ) (cublasHandleError( err, __FILE__, __LINE__ )) | |
41 | 65 | |
42 | 66 | |
43 | 67 | #endif | ... | ... |
stim/envi/agilent_binary.h
... | ... | @@ -6,11 +6,11 @@ |
6 | 6 | #include <fstream> |
7 | 7 | |
8 | 8 | //CUDA |
9 | -#ifdef CUDA_FOUND | |
10 | - #include <cuda_runtime.h> | |
11 | - #include "cufft.h" | |
12 | - #include <stim/cuda/cudatools/error.h> | |
13 | -#endif | |
9 | +//#ifdef CUDA_FOUND | |
10 | +#include <cuda_runtime.h> | |
11 | +#include "cufft.h" | |
12 | +#include <stim/cuda/cudatools/error.h> | |
13 | +//#endif | |
14 | 14 | |
15 | 15 | namespace stim{ |
16 | 16 | |
... | ... | @@ -42,6 +42,10 @@ public: |
42 | 42 | alloc(); |
43 | 43 | } |
44 | 44 | |
45 | + size_t dim(size_t i){ | |
46 | + return R[i]; | |
47 | + } | |
48 | + | |
45 | 49 | /// Create a deep copy of an agileng_binary object |
46 | 50 | void deep_copy(agilent_binary<T>* dst, const agilent_binary<T>* src){ |
47 | 51 | dst->alloc(src->R[0], src->R[1], src->R[2]); //allocate memory |
... | ... | @@ -169,7 +173,6 @@ public: |
169 | 173 | void zeropad(){ |
170 | 174 | size_t newZ = pow(2, ceil(log(R[2])/log(2))); //find the nearest power-of-two |
171 | 175 | size_t n = newZ - R[2]; //calculate the number of bands to add |
172 | - std::cout<<"band padding: "<<n<<std::endl; | |
173 | 176 | zeropad(n); //add the padding |
174 | 177 | } |
175 | 178 | |
... | ... | @@ -184,7 +187,7 @@ public: |
184 | 187 | ptr[i] = -log10(ptr[i] / background->ptr[i]); |
185 | 188 | } |
186 | 189 | |
187 | -#ifdef CUDA_FOUND | |
190 | +//#ifdef CUDA_FOUND | |
188 | 191 | /// Perform an FFT and return a binary file with bands in the specified range |
189 | 192 | agilent_binary<T> fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2){ |
190 | 193 | auto total_start = std::chrono::high_resolution_clock::now(); |
... | ... | @@ -271,7 +274,7 @@ public: |
271 | 274 | |
272 | 275 | return result; |
273 | 276 | } |
274 | -#endif | |
277 | +//#endif | |
275 | 278 | |
276 | 279 | }; |
277 | 280 | ... | ... |
stim/envi/bil.h
... | ... | @@ -4,6 +4,7 @@ |
4 | 4 | #include "../envi/envi_header.h" |
5 | 5 | #include "../envi/hsi.h" |
6 | 6 | #include "../math/fd_coefficients.h" |
7 | +#include <stim/cuda/cudatools/error.h> | |
7 | 8 | #include <cstring> |
8 | 9 | #include <utility> |
9 | 10 | #include <deque> |
... | ... | @@ -224,10 +225,44 @@ public: |
224 | 225 | } |
225 | 226 | |
226 | 227 | //given a Y ,return a XZ slice |
227 | - bool read_plane_y(T * p, unsigned long long y){ | |
228 | + bool read_plane_xz(T * p, size_t y){ | |
228 | 229 | return binary<T>::read_plane_2(p, y); |
229 | 230 | } |
230 | 231 | |
232 | + //given a Y, return ZX slice (transposed such that the spectrum is the leading dimension) | |
233 | + int read_plane_zx(T* p, size_t y){ | |
234 | + T* temp = (T*) malloc(X() * Z() * sizeof(T)); //allocate space to store the temporary xz plane | |
235 | + binary<T>::read_plane_2(temp, y); //load the plane from disk | |
236 | + size_t z, x; | |
237 | + for(z = 0; z < Z(); z++){ | |
238 | + for(x = 0; x <= z; x++){ | |
239 | + p[x * Z() + z] = temp[z * X() + x]; //copy to the destination frame | |
240 | + } | |
241 | + } | |
242 | + } | |
243 | + | |
244 | + //load a frame y into a pre-allocated double-precision array | |
245 | + int read_plane_xzd(double* f, size_t y){ | |
246 | + size_t XB = X() * B(); | |
247 | + T* temp = (T*) malloc(XB * sizeof(T)); //create a temporary location to store the plane at current precision | |
248 | + if(!read_plane_y(temp, y)) return 1; //read the plane in its native format, if it fails return a 1 | |
249 | + for(size_t i = 0; i < XB; i++) f[i] = temp[i]; //convert the plane to a double | |
250 | + return 0; | |
251 | + } | |
252 | + | |
253 | + //given a Y, return ZX slice (transposed such that the spectrum is the leading dimension) | |
254 | + int read_plane_zxd(double* p, size_t y){ | |
255 | + T* temp = (T*) malloc(X() * Z() * sizeof(T)); //allocate space to store the temporary xz plane | |
256 | + binary<T>::read_plane_2(temp, y); //load the plane from disk | |
257 | + size_t z, x; | |
258 | + for(z = 0; z < Z(); z++){ | |
259 | + for(x = 0; x < X(); x++){ | |
260 | + p[x * Z() + z] = (double)temp[z * X() + x]; //copy to the destination frame | |
261 | + } | |
262 | + } | |
263 | + return 0; | |
264 | + } | |
265 | + | |
231 | 266 | |
232 | 267 | /// Perform baseline correction given a list of baseline points and stores the result in a new BSQ file. |
233 | 268 | |
... | ... | @@ -268,7 +303,7 @@ public: |
268 | 303 | for (unsigned long long k =0; k < Y(); k++) |
269 | 304 | { |
270 | 305 | //get the current y slice |
271 | - read_plane_y(c, k); | |
306 | + read_plane_xz(c, k); | |
272 | 307 | |
273 | 308 | //initialize lownum, highnum, low, high |
274 | 309 | ai = w[0]; |
... | ... | @@ -369,7 +404,7 @@ public: |
369 | 404 | |
370 | 405 | for(unsigned long long j = 0; j < Y(); j++) |
371 | 406 | { |
372 | - read_plane_y(c, j); | |
407 | + read_plane_xz(c, j); | |
373 | 408 | for(unsigned long long i = 0; i < B; i++) |
374 | 409 | { |
375 | 410 | for(unsigned long long m = 0; m < X(); m++) |
... | ... | @@ -469,7 +504,7 @@ public: |
469 | 504 | |
470 | 505 | for ( unsigned long long i = 0; i < Y(); i++) |
471 | 506 | { |
472 | - read_plane_y(p, i); | |
507 | + read_plane_xz(p, i); | |
473 | 508 | for ( unsigned long long k = 0; k < Z(); k++) |
474 | 509 | { |
475 | 510 | unsigned long long ks = k * X(); |
... | ... | @@ -863,7 +898,7 @@ public: |
863 | 898 | |
864 | 899 | for (unsigned long long i = 0; i < Y(); i++) //for each value in Y() (BIP should be X) |
865 | 900 | { |
866 | - read_plane_y(temp, i); //retrieve an ZX slice, stored in temp | |
901 | + read_plane_xz(temp, i); //retrieve an ZX slice, stored in temp | |
867 | 902 | for ( unsigned long long j = 0; j < Z(); j++) //for each Z() (Y) |
868 | 903 | { |
869 | 904 | for (unsigned long long k = 0; k < X(); k++) //for each band |
... | ... | @@ -933,7 +968,7 @@ public: |
933 | 968 | //for each slice along the y axis |
934 | 969 | for (unsigned long long y = 0; y < Y(); y++) //Select a page by choosing Y coordinate, Y() |
935 | 970 | { |
936 | - read_plane_y(slice, y); //retrieve an ZX page, store in "slice" | |
971 | + read_plane_xz(slice, y); //retrieve an ZX page, store in "slice" | |
937 | 972 | |
938 | 973 | //for each sample along X |
939 | 974 | for (unsigned long long x = 0; x < X(); x++) //Select a pixel by choosing X coordinate in the page, X() |
... | ... | @@ -1004,7 +1039,7 @@ public: |
1004 | 1039 | |
1005 | 1040 | double x; //create a register to store the pixel value |
1006 | 1041 | for (unsigned long long k = 0; k < Y(); k++){ |
1007 | - read_plane_y(temp, k); | |
1042 | + read_plane_xz(temp, k); | |
1008 | 1043 | unsigned long long kx = k * X(); |
1009 | 1044 | for (unsigned long long i = 0; i < X(); i++){ |
1010 | 1045 | if (mask == NULL || mask[kx + i] != 0){ |
... | ... | @@ -1025,6 +1060,68 @@ public: |
1025 | 1060 | return true; |
1026 | 1061 | } |
1027 | 1062 | |
1063 | + int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ | |
1064 | + cudaError_t cudaStat; | |
1065 | + cublasStatus_t stat; | |
1066 | + cublasHandle_t handle; | |
1067 | + | |
1068 | + progress = 0; //initialize the progress to zero (0) | |
1069 | + size_t XY = X() * Y(); //calculate the number of elements in a band image | |
1070 | + size_t XB = X() * Z(); | |
1071 | + size_t B = Z(); //calculate the number of spectral elements | |
1072 | + | |
1073 | + double* F = (double*)malloc(sizeof(double) * B * X()); //allocate space for the frame that will be pulled from the file | |
1074 | + double* F_dev; | |
1075 | + HANDLE_ERROR(cudaMalloc(&F_dev, X() * B * sizeof(double))); //allocate space for the frame on the GPU | |
1076 | + double* s_dev; //declare a device pointer that will store the spectrum on the GPU | |
1077 | + double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU | |
1078 | + double* avg_dev; //declare a device pointer that will store the average spectrum | |
1079 | + HANDLE_ERROR(cudaMalloc(&s_dev, B * sizeof(double))); //allocate space on the CUDA device for a spectrum | |
1080 | + HANDLE_ERROR(cudaMalloc(&A_dev, B * B * sizeof(double))); //allocate space on the CUDA device for the covariance matrix | |
1081 | + HANDLE_ERROR(cudaMemset(A_dev, 0, B * B * sizeof(double))); //initialize the covariance matrix to zero (0) | |
1082 | + HANDLE_ERROR(cudaMalloc(&avg_dev, XB * sizeof(double))); //allocate space on the CUDA device for the average spectrum | |
1083 | + for(size_t x = 0; x < X(); x++) //make multiple copies of the average spectrum in order to build a matrix | |
1084 | + HANDLE_ERROR(cudaMemcpy(&avg_dev[x * B], avg, B * sizeof(double), cudaMemcpyHostToDevice)); | |
1085 | + //stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device | |
1086 | + | |
1087 | + double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) | |
1088 | + double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) | |
1089 | + | |
1090 | + CUBLAS_HANDLE_ERROR(stat = cublasCreate(&handle)); //create a cuBLAS instance | |
1091 | + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid | |
1092 | + | |
1093 | + else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl; | |
1094 | + double beta = 1.0; | |
1095 | + size_t x, y; | |
1096 | + for(y = 0; y < Y(); y++){ //for each line | |
1097 | + read_plane_zxd(F, y); //read a frame from the file | |
1098 | + HANDLE_ERROR(cudaMemcpy(F_dev, F, XB * sizeof(double), cudaMemcpyHostToDevice)); //copy the frame to the GPU | |
1099 | + CUBLAS_HANDLE_ERROR(cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, (int)B, (int)X(), &axpy_alpha, avg_dev, (int)B, &beta, F_dev, (int)B, F_dev, (int)B));//subtract the mean spectrum | |
1100 | + | |
1101 | + for(x = 0; x < X(); x++) | |
1102 | + CUBLAS_HANDLE_ERROR(cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, &F_dev[x*B], 1, A_dev, (int)B)); //perform an outer product | |
1103 | + if(PROGRESS) progress = (double)(y + 1) / Y() * 100; | |
1104 | + } | |
1105 | + | |
1106 | + cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, co, (int)B); //copy the result from the GPU to the CPU | |
1107 | + | |
1108 | + cudaFree(A_dev); //clean up allocated device memory | |
1109 | + cudaFree(s_dev); | |
1110 | + cudaFree(avg_dev); | |
1111 | + | |
1112 | + for(unsigned long long i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion | |
1113 | + for(unsigned long long j = i+1; j < B; j++){ | |
1114 | + co[B * i + j] = co[B * j + i]; | |
1115 | + } | |
1116 | + } | |
1117 | + | |
1118 | + return 0; | |
1119 | + | |
1120 | + | |
1121 | + | |
1122 | + } | |
1123 | + | |
1124 | + | |
1028 | 1125 | /// Calculate the covariance matrix for all masked pixels in the image. |
1029 | 1126 | |
1030 | 1127 | /// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix |
... | ... | @@ -1032,6 +1129,16 @@ public: |
1032 | 1129 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location |
1033 | 1130 | bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ |
1034 | 1131 | progress = 0; |
1132 | + | |
1133 | + int dev_count; | |
1134 | + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices | |
1135 | + cudaDeviceProp prop; | |
1136 | + cudaGetDeviceProperties(&prop, 0); //get the property of the first device | |
1137 | + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator | |
1138 | + int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | |
1139 | + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done | |
1140 | + } //otherwise continue using the CPU | |
1141 | + | |
1035 | 1142 | //memory allocation |
1036 | 1143 | unsigned long long xy = X() * Y(); |
1037 | 1144 | unsigned long long B = Z(); |
... | ... | @@ -1328,7 +1435,7 @@ public: |
1328 | 1435 | c = (T*)malloc( L ); //allocate space for the slice |
1329 | 1436 | |
1330 | 1437 | for(unsigned long long j = 0; j < Y(); j++){ //for each line |
1331 | - read_plane_y(c, j); //load the line into memory | |
1438 | + read_plane_xz(c, j); //load the line into memory | |
1332 | 1439 | for(unsigned long long i = 0; i < B; i++){ //for each band |
1333 | 1440 | for(unsigned long long m = 0; m < X(); m++){ //for each sample |
1334 | 1441 | if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked |
... | ... | @@ -1358,7 +1465,7 @@ public: |
1358 | 1465 | c = (T*)malloc( L ); //allocate space for the slice |
1359 | 1466 | |
1360 | 1467 | for(unsigned long long j = 0; j < Y(); j++){ //for each line |
1361 | - read_plane_y(c, j); //load the line into memory | |
1468 | + read_plane_xz(c, j); //load the line into memory | |
1362 | 1469 | for(unsigned long long i = 0; i < B; i++){ //for each band |
1363 | 1470 | for(unsigned long long m = 0; m < X(); m++){ //for each sample |
1364 | 1471 | if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked | ... | ... |
stim/envi/bip.h
... | ... | @@ -8,10 +8,11 @@ |
8 | 8 | #include <utility> |
9 | 9 | |
10 | 10 | //CUDA |
11 | -#ifdef CUDA_FOUND | |
12 | - #include <cuda_runtime.h> | |
13 | - #include "cublas_v2.h" | |
14 | -#endif | |
11 | +//#ifdef CUDA_FOUND | |
12 | +#include <stim/cuda/cudatools/error.h> | |
13 | +#include <cuda_runtime.h> | |
14 | +#include "cublas_v2.h" | |
15 | +//#endif | |
15 | 16 | |
16 | 17 | namespace stim{ |
17 | 18 | |
... | ... | @@ -257,7 +258,7 @@ public: |
257 | 258 | } |
258 | 259 | |
259 | 260 | //given a Y ,return a ZX slice |
260 | - bool read_plane_y(T * p, unsigned long long y){ | |
261 | + bool read_plane_y(T * p, size_t y){ | |
261 | 262 | return binary<T>::read_plane_2(p, y); |
262 | 263 | } |
263 | 264 | |
... | ... | @@ -982,16 +983,15 @@ public: |
982 | 983 | free(temp); |
983 | 984 | return true; |
984 | 985 | } |
985 | -#ifdef CUDA_FOUND | |
986 | +//#ifdef CUDA_FOUND | |
986 | 987 | /// Calculate the covariance matrix for masked pixels using cuBLAS |
987 | 988 | /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra |
988 | - bool co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ | |
989 | + int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ | |
989 | 990 | |
990 | 991 | cudaError_t cudaStat; |
991 | 992 | cublasStatus_t stat; |
992 | 993 | cublasHandle_t handle; |
993 | 994 | |
994 | - progress = 0; //initialize the progress to zero (0) | |
995 | 995 | unsigned long long XY = X() * Y(); //calculate the number of elements in a band image |
996 | 996 | unsigned long long B = Z(); //calculate the number of spectral elements |
997 | 997 | |
... | ... | @@ -1009,10 +1009,9 @@ public: |
1009 | 1009 | double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) |
1010 | 1010 | |
1011 | 1011 | stat = cublasCreate(&handle); //create a cuBLAS instance |
1012 | - if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid | |
1013 | - printf ("CUBLAS initialization failed\n"); | |
1014 | - return EXIT_FAILURE; | |
1015 | - } | |
1012 | + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid | |
1013 | + | |
1014 | + else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl; | |
1016 | 1015 | for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel |
1017 | 1016 | if (mask == NULL || mask[xy] != 0){ |
1018 | 1017 | pixeld(s, xy); //retreive the spectrum at the current xy pixel location |
... | ... | @@ -1036,9 +1035,9 @@ public: |
1036 | 1035 | } |
1037 | 1036 | } |
1038 | 1037 | |
1039 | - return true; | |
1038 | + return 0; | |
1040 | 1039 | } |
1041 | -#endif | |
1040 | +//#endif | |
1042 | 1041 | |
1043 | 1042 | /// Calculate the covariance matrix for all masked pixels in the image with 64-bit floating point precision. |
1044 | 1043 | |
... | ... | @@ -1046,16 +1045,19 @@ public: |
1046 | 1045 | /// @param avg is a pointer to memory of size B that stores the average spectrum |
1047 | 1046 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location |
1048 | 1047 | bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ |
1048 | + progress = 0; | |
1049 | 1049 | |
1050 | -#ifdef CUDA_FOUND | |
1050 | +//#ifdef CUDA_FOUND | |
1051 | 1051 | int dev_count; |
1052 | 1052 | cudaGetDeviceCount(&dev_count); //get the number of CUDA devices |
1053 | 1053 | cudaDeviceProp prop; |
1054 | 1054 | cudaGetDeviceProperties(&prop, 0); //get the property of the first device |
1055 | - if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator | |
1056 | - return co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | |
1057 | -#endif | |
1058 | - progress = 0; | |
1055 | + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator | |
1056 | + int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | |
1057 | + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done | |
1058 | + } //otherwise continue using the CPU | |
1059 | +//#endif | |
1060 | + | |
1059 | 1061 | //memory allocation |
1060 | 1062 | unsigned long long XY = X() * Y(); |
1061 | 1063 | unsigned long long B = Z(); |
... | ... | @@ -1097,10 +1099,10 @@ public: |
1097 | 1099 | } |
1098 | 1100 | |
1099 | 1101 | |
1100 | -#ifdef CUDA_FOUND | |
1102 | +//#ifdef CUDA_FOUND | |
1101 | 1103 | /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS |
1102 | 1104 | /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra |
1103 | - bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ | |
1105 | + int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ | |
1104 | 1106 | |
1105 | 1107 | cudaError_t cudaStat; |
1106 | 1108 | cublasStatus_t stat; |
... | ... | @@ -1128,11 +1130,9 @@ public: |
1128 | 1130 | double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) |
1129 | 1131 | double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) |
1130 | 1132 | |
1131 | - stat = cublasCreate(&handle); //create a cuBLAS instance | |
1132 | - if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid | |
1133 | - printf ("CUBLAS initialization failed\n"); | |
1134 | - return EXIT_FAILURE; | |
1135 | - } | |
1133 | + CUBLAS_HANDLE_ERROR(cublasCreate(&handle)); //create a cuBLAS instance | |
1134 | + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid | |
1135 | + | |
1136 | 1136 | for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel |
1137 | 1137 | if (mask == NULL || mask[xy] != 0){ |
1138 | 1138 | pixeld(s, xy); //retreive the spectrum at the current xy pixel location |
... | ... | @@ -1165,7 +1165,7 @@ public: |
1165 | 1165 | |
1166 | 1166 | return true; |
1167 | 1167 | } |
1168 | -#endif | |
1168 | +//#endif | |
1169 | 1169 | |
1170 | 1170 | /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision. |
1171 | 1171 | |
... | ... | @@ -1174,14 +1174,18 @@ public: |
1174 | 1174 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location |
1175 | 1175 | bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ |
1176 | 1176 | |
1177 | -#ifdef CUDA_FOUND | |
1177 | +//#ifdef CUDA_FOUND | |
1178 | 1178 | int dev_count; |
1179 | 1179 | cudaGetDeviceCount(&dev_count); //get the number of CUDA devices |
1180 | 1180 | cudaDeviceProp prop; |
1181 | 1181 | cudaGetDeviceProperties(&prop, 0); //get the property of the first device |
1182 | - if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator | |
1183 | - return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | |
1184 | -#endif | |
1182 | + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator | |
1183 | + int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | |
1184 | + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done | |
1185 | + } //otherwise continue using the CPU | |
1186 | + //if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator | |
1187 | + // return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | |
1188 | +//#endif | |
1185 | 1189 | |
1186 | 1190 | |
1187 | 1191 | ... | ... |