Commit 2781a48c74b990c3242b2f862ec696de77483afc
1 parent
9f87643c
fixed cuBLAS functionality, improved error messages
Showing
4 changed files
with
192 additions
and
54 deletions
Show diff stats
stim/cuda/cudatools/error.h
1 | +#ifndef STIM_CUDA_ERROR_H | ||
2 | +#define STIM_CUDA_ERROR_H | ||
3 | + | ||
1 | #include <stdio.h> | 4 | #include <stdio.h> |
2 | #include <iostream> | 5 | #include <iostream> |
3 | using namespace std; | 6 | using namespace std; |
4 | #include "cuda_runtime.h" | 7 | #include "cuda_runtime.h" |
5 | #include "device_launch_parameters.h" | 8 | #include "device_launch_parameters.h" |
6 | #include "cufft.h" | 9 | #include "cufft.h" |
7 | - | ||
8 | -#ifndef CUDA_HANDLE_ERROR_H | ||
9 | -#define CUDA_HANDLE_ERROR_H | 10 | +#include "cublas_v2.h" |
10 | 11 | ||
11 | //handle error macro | 12 | //handle error macro |
12 | -static void HandleError( cudaError_t err, const char *file, int line ) { | 13 | +static void cuHandleError( cudaError_t err, const char *file, int line ) { |
13 | if (err != cudaSuccess) { | 14 | if (err != cudaSuccess) { |
14 | printf("%s in %s at line %d\n", cudaGetErrorString( err ), file, line ); | 15 | printf("%s in %s at line %d\n", cudaGetErrorString( err ), file, line ); |
16 | + | ||
15 | } | 17 | } |
16 | } | 18 | } |
17 | -#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) | 19 | +#define HANDLE_ERROR( err ) (cuHandleError( err, __FILE__, __LINE__ )) |
18 | 20 | ||
19 | -static void CufftError( cufftResult err ) | 21 | +static void cufftHandleError( cufftResult err, const char*file, int line ) |
20 | { | 22 | { |
21 | if (err != CUFFT_SUCCESS) | 23 | if (err != CUFFT_SUCCESS) |
22 | { | 24 | { |
@@ -37,7 +39,29 @@ static void CufftError( cufftResult err ) | @@ -37,7 +39,29 @@ static void CufftError( cufftResult err ) | ||
37 | 39 | ||
38 | } | 40 | } |
39 | } | 41 | } |
42 | +#define CUFFT_HANDLE_ERROR( err ) (cufftHandleError( err, __FILE__, __LINE__ )) | ||
40 | 43 | ||
44 | +static void cublasHandleError( cublasStatus_t err, const char*file, int line ){ | ||
45 | + if(err != CUBLAS_STATUS_SUCCESS){ | ||
46 | + if(err == CUBLAS_STATUS_NOT_INITIALIZED) | ||
47 | + std::cout<<"CUBLAS_STATUS_NOT_INITIALIZED" <<" in file "<<file<<" line "<<std::endl; | ||
48 | + else if(err == CUBLAS_STATUS_ALLOC_FAILED) | ||
49 | + std::cout<<"CUBLAS_STATUS_ALLOC_FAILED" <<" in file "<<file<<" line "<<std::endl; | ||
50 | + else if(err == CUBLAS_STATUS_INVALID_VALUE) | ||
51 | + std::cout<<"CUBLAS_STATUS_INVALID_VALUE" <<" in file "<<file<<" line "<<std::endl; | ||
52 | + else if(err == CUBLAS_STATUS_ARCH_MISMATCH) | ||
53 | + std::cout<<"CUBLAS_STATUS_ARCH_MISMATCH" <<" in file "<<file<<" line "<<std::endl; | ||
54 | + else if(err == CUBLAS_STATUS_MAPPING_ERROR) | ||
55 | + std::cout<<"CUBLAS_STATUS_MAPPING_ERROR" <<" in file "<<file<<" line "<<std::endl; | ||
56 | + else if(err == CUBLAS_STATUS_EXECUTION_FAILED) | ||
57 | + std::cout<<"CUBLAS_STATUS_EXECUTION_FAILED" <<" in file "<<file<<" line "<<std::endl; | ||
58 | + else if(err == CUBLAS_STATUS_INTERNAL_ERROR) | ||
59 | + std::cout<<"CUBLAS_STATUS_INTERNAL_ERROR" <<" in file "<<file<<" line "<<std::endl; | ||
60 | + else | ||
61 | + std::cout<<"Unknown error"<<" in file "<<file<<" line "<<std::endl; | ||
62 | + } | ||
63 | +} | ||
64 | +#define CUBLAS_HANDLE_ERROR( err ) (cublasHandleError( err, __FILE__, __LINE__ )) | ||
41 | 65 | ||
42 | 66 | ||
43 | #endif | 67 | #endif |
stim/envi/agilent_binary.h
@@ -6,11 +6,11 @@ | @@ -6,11 +6,11 @@ | ||
6 | #include <fstream> | 6 | #include <fstream> |
7 | 7 | ||
8 | //CUDA | 8 | //CUDA |
9 | -#ifdef CUDA_FOUND | ||
10 | - #include <cuda_runtime.h> | ||
11 | - #include "cufft.h" | ||
12 | - #include <stim/cuda/cudatools/error.h> | ||
13 | -#endif | 9 | +//#ifdef CUDA_FOUND |
10 | +#include <cuda_runtime.h> | ||
11 | +#include "cufft.h" | ||
12 | +#include <stim/cuda/cudatools/error.h> | ||
13 | +//#endif | ||
14 | 14 | ||
15 | namespace stim{ | 15 | namespace stim{ |
16 | 16 | ||
@@ -42,6 +42,10 @@ public: | @@ -42,6 +42,10 @@ public: | ||
42 | alloc(); | 42 | alloc(); |
43 | } | 43 | } |
44 | 44 | ||
45 | + size_t dim(size_t i){ | ||
46 | + return R[i]; | ||
47 | + } | ||
48 | + | ||
45 | /// Create a deep copy of an agileng_binary object | 49 | /// Create a deep copy of an agileng_binary object |
46 | void deep_copy(agilent_binary<T>* dst, const agilent_binary<T>* src){ | 50 | void deep_copy(agilent_binary<T>* dst, const agilent_binary<T>* src){ |
47 | dst->alloc(src->R[0], src->R[1], src->R[2]); //allocate memory | 51 | dst->alloc(src->R[0], src->R[1], src->R[2]); //allocate memory |
@@ -169,7 +173,6 @@ public: | @@ -169,7 +173,6 @@ public: | ||
169 | void zeropad(){ | 173 | void zeropad(){ |
170 | size_t newZ = pow(2, ceil(log(R[2])/log(2))); //find the nearest power-of-two | 174 | size_t newZ = pow(2, ceil(log(R[2])/log(2))); //find the nearest power-of-two |
171 | size_t n = newZ - R[2]; //calculate the number of bands to add | 175 | size_t n = newZ - R[2]; //calculate the number of bands to add |
172 | - std::cout<<"band padding: "<<n<<std::endl; | ||
173 | zeropad(n); //add the padding | 176 | zeropad(n); //add the padding |
174 | } | 177 | } |
175 | 178 | ||
@@ -184,7 +187,7 @@ public: | @@ -184,7 +187,7 @@ public: | ||
184 | ptr[i] = -log10(ptr[i] / background->ptr[i]); | 187 | ptr[i] = -log10(ptr[i] / background->ptr[i]); |
185 | } | 188 | } |
186 | 189 | ||
187 | -#ifdef CUDA_FOUND | 190 | +//#ifdef CUDA_FOUND |
188 | /// Perform an FFT and return a binary file with bands in the specified range | 191 | /// Perform an FFT and return a binary file with bands in the specified range |
189 | agilent_binary<T> fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2){ | 192 | agilent_binary<T> fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2){ |
190 | auto total_start = std::chrono::high_resolution_clock::now(); | 193 | auto total_start = std::chrono::high_resolution_clock::now(); |
@@ -271,7 +274,7 @@ public: | @@ -271,7 +274,7 @@ public: | ||
271 | 274 | ||
272 | return result; | 275 | return result; |
273 | } | 276 | } |
274 | -#endif | 277 | +//#endif |
275 | 278 | ||
276 | }; | 279 | }; |
277 | 280 |
stim/envi/bil.h
@@ -4,6 +4,7 @@ | @@ -4,6 +4,7 @@ | ||
4 | #include "../envi/envi_header.h" | 4 | #include "../envi/envi_header.h" |
5 | #include "../envi/hsi.h" | 5 | #include "../envi/hsi.h" |
6 | #include "../math/fd_coefficients.h" | 6 | #include "../math/fd_coefficients.h" |
7 | +#include <stim/cuda/cudatools/error.h> | ||
7 | #include <cstring> | 8 | #include <cstring> |
8 | #include <utility> | 9 | #include <utility> |
9 | #include <deque> | 10 | #include <deque> |
@@ -224,10 +225,44 @@ public: | @@ -224,10 +225,44 @@ public: | ||
224 | } | 225 | } |
225 | 226 | ||
226 | //given a Y ,return a XZ slice | 227 | //given a Y ,return a XZ slice |
227 | - bool read_plane_y(T * p, unsigned long long y){ | 228 | + bool read_plane_xz(T * p, size_t y){ |
228 | return binary<T>::read_plane_2(p, y); | 229 | return binary<T>::read_plane_2(p, y); |
229 | } | 230 | } |
230 | 231 | ||
232 | + //given a Y, return ZX slice (transposed such that the spectrum is the leading dimension) | ||
233 | + int read_plane_zx(T* p, size_t y){ | ||
234 | + T* temp = (T*) malloc(X() * Z() * sizeof(T)); //allocate space to store the temporary xz plane | ||
235 | + binary<T>::read_plane_2(temp, y); //load the plane from disk | ||
236 | + size_t z, x; | ||
237 | + for(z = 0; z < Z(); z++){ | ||
238 | + for(x = 0; x <= z; x++){ | ||
239 | + p[x * Z() + z] = temp[z * X() + x]; //copy to the destination frame | ||
240 | + } | ||
241 | + } | ||
242 | + } | ||
243 | + | ||
244 | + //load a frame y into a pre-allocated double-precision array | ||
245 | + int read_plane_xzd(double* f, size_t y){ | ||
246 | + size_t XB = X() * B(); | ||
247 | + T* temp = (T*) malloc(XB * sizeof(T)); //create a temporary location to store the plane at current precision | ||
248 | + if(!read_plane_y(temp, y)) return 1; //read the plane in its native format, if it fails return a 1 | ||
249 | + for(size_t i = 0; i < XB; i++) f[i] = temp[i]; //convert the plane to a double | ||
250 | + return 0; | ||
251 | + } | ||
252 | + | ||
253 | + //given a Y, return ZX slice (transposed such that the spectrum is the leading dimension) | ||
254 | + int read_plane_zxd(double* p, size_t y){ | ||
255 | + T* temp = (T*) malloc(X() * Z() * sizeof(T)); //allocate space to store the temporary xz plane | ||
256 | + binary<T>::read_plane_2(temp, y); //load the plane from disk | ||
257 | + size_t z, x; | ||
258 | + for(z = 0; z < Z(); z++){ | ||
259 | + for(x = 0; x < X(); x++){ | ||
260 | + p[x * Z() + z] = (double)temp[z * X() + x]; //copy to the destination frame | ||
261 | + } | ||
262 | + } | ||
263 | + return 0; | ||
264 | + } | ||
265 | + | ||
231 | 266 | ||
232 | /// Perform baseline correction given a list of baseline points and stores the result in a new BSQ file. | 267 | /// Perform baseline correction given a list of baseline points and stores the result in a new BSQ file. |
233 | 268 | ||
@@ -268,7 +303,7 @@ public: | @@ -268,7 +303,7 @@ public: | ||
268 | for (unsigned long long k =0; k < Y(); k++) | 303 | for (unsigned long long k =0; k < Y(); k++) |
269 | { | 304 | { |
270 | //get the current y slice | 305 | //get the current y slice |
271 | - read_plane_y(c, k); | 306 | + read_plane_xz(c, k); |
272 | 307 | ||
273 | //initialize lownum, highnum, low, high | 308 | //initialize lownum, highnum, low, high |
274 | ai = w[0]; | 309 | ai = w[0]; |
@@ -369,7 +404,7 @@ public: | @@ -369,7 +404,7 @@ public: | ||
369 | 404 | ||
370 | for(unsigned long long j = 0; j < Y(); j++) | 405 | for(unsigned long long j = 0; j < Y(); j++) |
371 | { | 406 | { |
372 | - read_plane_y(c, j); | 407 | + read_plane_xz(c, j); |
373 | for(unsigned long long i = 0; i < B; i++) | 408 | for(unsigned long long i = 0; i < B; i++) |
374 | { | 409 | { |
375 | for(unsigned long long m = 0; m < X(); m++) | 410 | for(unsigned long long m = 0; m < X(); m++) |
@@ -469,7 +504,7 @@ public: | @@ -469,7 +504,7 @@ public: | ||
469 | 504 | ||
470 | for ( unsigned long long i = 0; i < Y(); i++) | 505 | for ( unsigned long long i = 0; i < Y(); i++) |
471 | { | 506 | { |
472 | - read_plane_y(p, i); | 507 | + read_plane_xz(p, i); |
473 | for ( unsigned long long k = 0; k < Z(); k++) | 508 | for ( unsigned long long k = 0; k < Z(); k++) |
474 | { | 509 | { |
475 | unsigned long long ks = k * X(); | 510 | unsigned long long ks = k * X(); |
@@ -863,7 +898,7 @@ public: | @@ -863,7 +898,7 @@ public: | ||
863 | 898 | ||
864 | for (unsigned long long i = 0; i < Y(); i++) //for each value in Y() (BIP should be X) | 899 | for (unsigned long long i = 0; i < Y(); i++) //for each value in Y() (BIP should be X) |
865 | { | 900 | { |
866 | - read_plane_y(temp, i); //retrieve an ZX slice, stored in temp | 901 | + read_plane_xz(temp, i); //retrieve an ZX slice, stored in temp |
867 | for ( unsigned long long j = 0; j < Z(); j++) //for each Z() (Y) | 902 | for ( unsigned long long j = 0; j < Z(); j++) //for each Z() (Y) |
868 | { | 903 | { |
869 | for (unsigned long long k = 0; k < X(); k++) //for each band | 904 | for (unsigned long long k = 0; k < X(); k++) //for each band |
@@ -933,7 +968,7 @@ public: | @@ -933,7 +968,7 @@ public: | ||
933 | //for each slice along the y axis | 968 | //for each slice along the y axis |
934 | for (unsigned long long y = 0; y < Y(); y++) //Select a page by choosing Y coordinate, Y() | 969 | for (unsigned long long y = 0; y < Y(); y++) //Select a page by choosing Y coordinate, Y() |
935 | { | 970 | { |
936 | - read_plane_y(slice, y); //retrieve an ZX page, store in "slice" | 971 | + read_plane_xz(slice, y); //retrieve an ZX page, store in "slice" |
937 | 972 | ||
938 | //for each sample along X | 973 | //for each sample along X |
939 | for (unsigned long long x = 0; x < X(); x++) //Select a pixel by choosing X coordinate in the page, X() | 974 | for (unsigned long long x = 0; x < X(); x++) //Select a pixel by choosing X coordinate in the page, X() |
@@ -1004,7 +1039,7 @@ public: | @@ -1004,7 +1039,7 @@ public: | ||
1004 | 1039 | ||
1005 | double x; //create a register to store the pixel value | 1040 | double x; //create a register to store the pixel value |
1006 | for (unsigned long long k = 0; k < Y(); k++){ | 1041 | for (unsigned long long k = 0; k < Y(); k++){ |
1007 | - read_plane_y(temp, k); | 1042 | + read_plane_xz(temp, k); |
1008 | unsigned long long kx = k * X(); | 1043 | unsigned long long kx = k * X(); |
1009 | for (unsigned long long i = 0; i < X(); i++){ | 1044 | for (unsigned long long i = 0; i < X(); i++){ |
1010 | if (mask == NULL || mask[kx + i] != 0){ | 1045 | if (mask == NULL || mask[kx + i] != 0){ |
@@ -1025,6 +1060,68 @@ public: | @@ -1025,6 +1060,68 @@ public: | ||
1025 | return true; | 1060 | return true; |
1026 | } | 1061 | } |
1027 | 1062 | ||
1063 | + int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ | ||
1064 | + cudaError_t cudaStat; | ||
1065 | + cublasStatus_t stat; | ||
1066 | + cublasHandle_t handle; | ||
1067 | + | ||
1068 | + progress = 0; //initialize the progress to zero (0) | ||
1069 | + size_t XY = X() * Y(); //calculate the number of elements in a band image | ||
1070 | + size_t XB = X() * Z(); | ||
1071 | + size_t B = Z(); //calculate the number of spectral elements | ||
1072 | + | ||
1073 | + double* F = (double*)malloc(sizeof(double) * B * X()); //allocate space for the frame that will be pulled from the file | ||
1074 | + double* F_dev; | ||
1075 | + HANDLE_ERROR(cudaMalloc(&F_dev, X() * B * sizeof(double))); //allocate space for the frame on the GPU | ||
1076 | + double* s_dev; //declare a device pointer that will store the spectrum on the GPU | ||
1077 | + double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU | ||
1078 | + double* avg_dev; //declare a device pointer that will store the average spectrum | ||
1079 | + HANDLE_ERROR(cudaMalloc(&s_dev, B * sizeof(double))); //allocate space on the CUDA device for a spectrum | ||
1080 | + HANDLE_ERROR(cudaMalloc(&A_dev, B * B * sizeof(double))); //allocate space on the CUDA device for the covariance matrix | ||
1081 | + HANDLE_ERROR(cudaMemset(A_dev, 0, B * B * sizeof(double))); //initialize the covariance matrix to zero (0) | ||
1082 | + HANDLE_ERROR(cudaMalloc(&avg_dev, XB * sizeof(double))); //allocate space on the CUDA device for the average spectrum | ||
1083 | + for(size_t x = 0; x < X(); x++) //make multiple copies of the average spectrum in order to build a matrix | ||
1084 | + HANDLE_ERROR(cudaMemcpy(&avg_dev[x * B], avg, B * sizeof(double), cudaMemcpyHostToDevice)); | ||
1085 | + //stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device | ||
1086 | + | ||
1087 | + double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) | ||
1088 | + double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) | ||
1089 | + | ||
1090 | + CUBLAS_HANDLE_ERROR(stat = cublasCreate(&handle)); //create a cuBLAS instance | ||
1091 | + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid | ||
1092 | + | ||
1093 | + else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl; | ||
1094 | + double beta = 1.0; | ||
1095 | + size_t x, y; | ||
1096 | + for(y = 0; y < Y(); y++){ //for each line | ||
1097 | + read_plane_zxd(F, y); //read a frame from the file | ||
1098 | + HANDLE_ERROR(cudaMemcpy(F_dev, F, XB * sizeof(double), cudaMemcpyHostToDevice)); //copy the frame to the GPU | ||
1099 | + CUBLAS_HANDLE_ERROR(cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, (int)B, (int)X(), &axpy_alpha, avg_dev, (int)B, &beta, F_dev, (int)B, F_dev, (int)B));//subtract the mean spectrum | ||
1100 | + | ||
1101 | + for(x = 0; x < X(); x++) | ||
1102 | + CUBLAS_HANDLE_ERROR(cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, &F_dev[x*B], 1, A_dev, (int)B)); //perform an outer product | ||
1103 | + if(PROGRESS) progress = (double)(y + 1) / Y() * 100; | ||
1104 | + } | ||
1105 | + | ||
1106 | + cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, co, (int)B); //copy the result from the GPU to the CPU | ||
1107 | + | ||
1108 | + cudaFree(A_dev); //clean up allocated device memory | ||
1109 | + cudaFree(s_dev); | ||
1110 | + cudaFree(avg_dev); | ||
1111 | + | ||
1112 | + for(unsigned long long i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion | ||
1113 | + for(unsigned long long j = i+1; j < B; j++){ | ||
1114 | + co[B * i + j] = co[B * j + i]; | ||
1115 | + } | ||
1116 | + } | ||
1117 | + | ||
1118 | + return 0; | ||
1119 | + | ||
1120 | + | ||
1121 | + | ||
1122 | + } | ||
1123 | + | ||
1124 | + | ||
1028 | /// Calculate the covariance matrix for all masked pixels in the image. | 1125 | /// Calculate the covariance matrix for all masked pixels in the image. |
1029 | 1126 | ||
1030 | /// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix | 1127 | /// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix |
@@ -1032,6 +1129,16 @@ public: | @@ -1032,6 +1129,16 @@ public: | ||
1032 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location | 1129 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location |
1033 | bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ | 1130 | bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ |
1034 | progress = 0; | 1131 | progress = 0; |
1132 | + | ||
1133 | + int dev_count; | ||
1134 | + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices | ||
1135 | + cudaDeviceProp prop; | ||
1136 | + cudaGetDeviceProperties(&prop, 0); //get the property of the first device | ||
1137 | + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator | ||
1138 | + int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | ||
1139 | + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done | ||
1140 | + } //otherwise continue using the CPU | ||
1141 | + | ||
1035 | //memory allocation | 1142 | //memory allocation |
1036 | unsigned long long xy = X() * Y(); | 1143 | unsigned long long xy = X() * Y(); |
1037 | unsigned long long B = Z(); | 1144 | unsigned long long B = Z(); |
@@ -1328,7 +1435,7 @@ public: | @@ -1328,7 +1435,7 @@ public: | ||
1328 | c = (T*)malloc( L ); //allocate space for the slice | 1435 | c = (T*)malloc( L ); //allocate space for the slice |
1329 | 1436 | ||
1330 | for(unsigned long long j = 0; j < Y(); j++){ //for each line | 1437 | for(unsigned long long j = 0; j < Y(); j++){ //for each line |
1331 | - read_plane_y(c, j); //load the line into memory | 1438 | + read_plane_xz(c, j); //load the line into memory |
1332 | for(unsigned long long i = 0; i < B; i++){ //for each band | 1439 | for(unsigned long long i = 0; i < B; i++){ //for each band |
1333 | for(unsigned long long m = 0; m < X(); m++){ //for each sample | 1440 | for(unsigned long long m = 0; m < X(); m++){ //for each sample |
1334 | if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked | 1441 | if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked |
@@ -1358,7 +1465,7 @@ public: | @@ -1358,7 +1465,7 @@ public: | ||
1358 | c = (T*)malloc( L ); //allocate space for the slice | 1465 | c = (T*)malloc( L ); //allocate space for the slice |
1359 | 1466 | ||
1360 | for(unsigned long long j = 0; j < Y(); j++){ //for each line | 1467 | for(unsigned long long j = 0; j < Y(); j++){ //for each line |
1361 | - read_plane_y(c, j); //load the line into memory | 1468 | + read_plane_xz(c, j); //load the line into memory |
1362 | for(unsigned long long i = 0; i < B; i++){ //for each band | 1469 | for(unsigned long long i = 0; i < B; i++){ //for each band |
1363 | for(unsigned long long m = 0; m < X(); m++){ //for each sample | 1470 | for(unsigned long long m = 0; m < X(); m++){ //for each sample |
1364 | if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked | 1471 | if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked |
stim/envi/bip.h
@@ -8,10 +8,11 @@ | @@ -8,10 +8,11 @@ | ||
8 | #include <utility> | 8 | #include <utility> |
9 | 9 | ||
10 | //CUDA | 10 | //CUDA |
11 | -#ifdef CUDA_FOUND | ||
12 | - #include <cuda_runtime.h> | ||
13 | - #include "cublas_v2.h" | ||
14 | -#endif | 11 | +//#ifdef CUDA_FOUND |
12 | +#include <stim/cuda/cudatools/error.h> | ||
13 | +#include <cuda_runtime.h> | ||
14 | +#include "cublas_v2.h" | ||
15 | +//#endif | ||
15 | 16 | ||
16 | namespace stim{ | 17 | namespace stim{ |
17 | 18 | ||
@@ -257,7 +258,7 @@ public: | @@ -257,7 +258,7 @@ public: | ||
257 | } | 258 | } |
258 | 259 | ||
259 | //given a Y ,return a ZX slice | 260 | //given a Y ,return a ZX slice |
260 | - bool read_plane_y(T * p, unsigned long long y){ | 261 | + bool read_plane_y(T * p, size_t y){ |
261 | return binary<T>::read_plane_2(p, y); | 262 | return binary<T>::read_plane_2(p, y); |
262 | } | 263 | } |
263 | 264 | ||
@@ -982,16 +983,15 @@ public: | @@ -982,16 +983,15 @@ public: | ||
982 | free(temp); | 983 | free(temp); |
983 | return true; | 984 | return true; |
984 | } | 985 | } |
985 | -#ifdef CUDA_FOUND | 986 | +//#ifdef CUDA_FOUND |
986 | /// Calculate the covariance matrix for masked pixels using cuBLAS | 987 | /// Calculate the covariance matrix for masked pixels using cuBLAS |
987 | /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra | 988 | /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra |
988 | - bool co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ | 989 | + int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ |
989 | 990 | ||
990 | cudaError_t cudaStat; | 991 | cudaError_t cudaStat; |
991 | cublasStatus_t stat; | 992 | cublasStatus_t stat; |
992 | cublasHandle_t handle; | 993 | cublasHandle_t handle; |
993 | 994 | ||
994 | - progress = 0; //initialize the progress to zero (0) | ||
995 | unsigned long long XY = X() * Y(); //calculate the number of elements in a band image | 995 | unsigned long long XY = X() * Y(); //calculate the number of elements in a band image |
996 | unsigned long long B = Z(); //calculate the number of spectral elements | 996 | unsigned long long B = Z(); //calculate the number of spectral elements |
997 | 997 | ||
@@ -1009,10 +1009,9 @@ public: | @@ -1009,10 +1009,9 @@ public: | ||
1009 | double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) | 1009 | double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) |
1010 | 1010 | ||
1011 | stat = cublasCreate(&handle); //create a cuBLAS instance | 1011 | stat = cublasCreate(&handle); //create a cuBLAS instance |
1012 | - if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid | ||
1013 | - printf ("CUBLAS initialization failed\n"); | ||
1014 | - return EXIT_FAILURE; | ||
1015 | - } | 1012 | + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid |
1013 | + | ||
1014 | + else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl; | ||
1016 | for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel | 1015 | for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel |
1017 | if (mask == NULL || mask[xy] != 0){ | 1016 | if (mask == NULL || mask[xy] != 0){ |
1018 | pixeld(s, xy); //retreive the spectrum at the current xy pixel location | 1017 | pixeld(s, xy); //retreive the spectrum at the current xy pixel location |
@@ -1036,9 +1035,9 @@ public: | @@ -1036,9 +1035,9 @@ public: | ||
1036 | } | 1035 | } |
1037 | } | 1036 | } |
1038 | 1037 | ||
1039 | - return true; | 1038 | + return 0; |
1040 | } | 1039 | } |
1041 | -#endif | 1040 | +//#endif |
1042 | 1041 | ||
1043 | /// Calculate the covariance matrix for all masked pixels in the image with 64-bit floating point precision. | 1042 | /// Calculate the covariance matrix for all masked pixels in the image with 64-bit floating point precision. |
1044 | 1043 | ||
@@ -1046,16 +1045,19 @@ public: | @@ -1046,16 +1045,19 @@ public: | ||
1046 | /// @param avg is a pointer to memory of size B that stores the average spectrum | 1045 | /// @param avg is a pointer to memory of size B that stores the average spectrum |
1047 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location | 1046 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location |
1048 | bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ | 1047 | bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ |
1048 | + progress = 0; | ||
1049 | 1049 | ||
1050 | -#ifdef CUDA_FOUND | 1050 | +//#ifdef CUDA_FOUND |
1051 | int dev_count; | 1051 | int dev_count; |
1052 | cudaGetDeviceCount(&dev_count); //get the number of CUDA devices | 1052 | cudaGetDeviceCount(&dev_count); //get the number of CUDA devices |
1053 | cudaDeviceProp prop; | 1053 | cudaDeviceProp prop; |
1054 | cudaGetDeviceProperties(&prop, 0); //get the property of the first device | 1054 | cudaGetDeviceProperties(&prop, 0); //get the property of the first device |
1055 | - if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator | ||
1056 | - return co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | ||
1057 | -#endif | ||
1058 | - progress = 0; | 1055 | + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator |
1056 | + int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | ||
1057 | + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done | ||
1058 | + } //otherwise continue using the CPU | ||
1059 | +//#endif | ||
1060 | + | ||
1059 | //memory allocation | 1061 | //memory allocation |
1060 | unsigned long long XY = X() * Y(); | 1062 | unsigned long long XY = X() * Y(); |
1061 | unsigned long long B = Z(); | 1063 | unsigned long long B = Z(); |
@@ -1097,10 +1099,10 @@ public: | @@ -1097,10 +1099,10 @@ public: | ||
1097 | } | 1099 | } |
1098 | 1100 | ||
1099 | 1101 | ||
1100 | -#ifdef CUDA_FOUND | 1102 | +//#ifdef CUDA_FOUND |
1101 | /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS | 1103 | /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS |
1102 | /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra | 1104 | /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra |
1103 | - bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ | 1105 | + int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ |
1104 | 1106 | ||
1105 | cudaError_t cudaStat; | 1107 | cudaError_t cudaStat; |
1106 | cublasStatus_t stat; | 1108 | cublasStatus_t stat; |
@@ -1128,11 +1130,9 @@ public: | @@ -1128,11 +1130,9 @@ public: | ||
1128 | double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) | 1130 | double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) |
1129 | double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) | 1131 | double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) |
1130 | 1132 | ||
1131 | - stat = cublasCreate(&handle); //create a cuBLAS instance | ||
1132 | - if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid | ||
1133 | - printf ("CUBLAS initialization failed\n"); | ||
1134 | - return EXIT_FAILURE; | ||
1135 | - } | 1133 | + CUBLAS_HANDLE_ERROR(cublasCreate(&handle)); //create a cuBLAS instance |
1134 | + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid | ||
1135 | + | ||
1136 | for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel | 1136 | for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel |
1137 | if (mask == NULL || mask[xy] != 0){ | 1137 | if (mask == NULL || mask[xy] != 0){ |
1138 | pixeld(s, xy); //retreive the spectrum at the current xy pixel location | 1138 | pixeld(s, xy); //retreive the spectrum at the current xy pixel location |
@@ -1165,7 +1165,7 @@ public: | @@ -1165,7 +1165,7 @@ public: | ||
1165 | 1165 | ||
1166 | return true; | 1166 | return true; |
1167 | } | 1167 | } |
1168 | -#endif | 1168 | +//#endif |
1169 | 1169 | ||
1170 | /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision. | 1170 | /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision. |
1171 | 1171 | ||
@@ -1174,14 +1174,18 @@ public: | @@ -1174,14 +1174,18 @@ public: | ||
1174 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location | 1174 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location |
1175 | bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ | 1175 | bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ |
1176 | 1176 | ||
1177 | -#ifdef CUDA_FOUND | 1177 | +//#ifdef CUDA_FOUND |
1178 | int dev_count; | 1178 | int dev_count; |
1179 | cudaGetDeviceCount(&dev_count); //get the number of CUDA devices | 1179 | cudaGetDeviceCount(&dev_count); //get the number of CUDA devices |
1180 | cudaDeviceProp prop; | 1180 | cudaDeviceProp prop; |
1181 | cudaGetDeviceProperties(&prop, 0); //get the property of the first device | 1181 | cudaGetDeviceProperties(&prop, 0); //get the property of the first device |
1182 | - if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator | ||
1183 | - return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | ||
1184 | -#endif | 1182 | + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator |
1183 | + int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | ||
1184 | + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done | ||
1185 | + } //otherwise continue using the CPU | ||
1186 | + //if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator | ||
1187 | + // return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | ||
1188 | +//#endif | ||
1185 | 1189 | ||
1186 | 1190 | ||
1187 | 1191 |