Commit ebd0245245fde20cb2e3cf63a19614c7b8286467

Authored by David Mayerich
1 parent 1808c255

added cuBLAS error messages

stim/envi/agilent_binary.h
@@ -171,7 +171,7 @@ public: @@ -171,7 +171,7 @@ public:
171 171
172 //pads to the nearest power-of-two 172 //pads to the nearest power-of-two
173 void zeropad(){ 173 void zeropad(){
174 - size_t newZ = pow(2, ceil(log(R[2])/log(2))); //find the nearest power-of-two 174 + size_t newZ = (size_t)pow(2, ceil(log(R[2])/log(2))); //find the nearest power-of-two
175 size_t n = newZ - R[2]; //calculate the number of bands to add 175 size_t n = newZ - R[2]; //calculate the number of bands to add
176 zeropad(n); //add the padding 176 zeropad(n); //add the padding
177 } 177 }
@@ -243,7 +243,7 @@ public: @@ -243,7 +243,7 @@ public:
243 243
244 //load a frame y into a pre-allocated double-precision array 244 //load a frame y into a pre-allocated double-precision array
245 int read_plane_xzd(double* f, size_t y){ 245 int read_plane_xzd(double* f, size_t y){
246 - size_t XB = X() * B(); 246 + size_t XB = X() * Z();
247 T* temp = (T*) malloc(XB * sizeof(T)); //create a temporary location to store the plane at current precision 247 T* temp = (T*) malloc(XB * sizeof(T)); //create a temporary location to store the plane at current precision
248 if(!read_plane_y(temp, y)) return 1; //read the plane in its native format, if it fails return a 1 248 if(!read_plane_y(temp, y)) return 1; //read the plane in its native format, if it fails return a 1
249 for(size_t i = 0; i < XB; i++) f[i] = temp[i]; //convert the plane to a double 249 for(size_t i = 0; i < XB; i++) f[i] = temp[i]; //convert the plane to a double
@@ -1061,7 +1061,6 @@ public: @@ -1061,7 +1061,6 @@ public:
1061 } 1061 }
1062 1062
1063 int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ 1063 int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
1064 - cudaError_t cudaStat;  
1065 cublasStatus_t stat; 1064 cublasStatus_t stat;
1066 cublasHandle_t handle; 1065 cublasHandle_t handle;
1067 1066
@@ -1131,13 +1130,30 @@ public: @@ -1131,13 +1130,30 @@ public:
1131 progress = 0; 1130 progress = 0;
1132 1131
1133 int dev_count; 1132 int dev_count;
1134 - cudaGetDeviceCount(&dev_count); //get the number of CUDA devices 1133 + HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices
  1134 + std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl; //output the number of CUDA devices
1135 cudaDeviceProp prop; 1135 cudaDeviceProp prop;
1136 - cudaGetDeviceProperties(&prop, 0); //get the property of the first device 1136 + int best_device_id = 0; //stores the best CUDA device
  1137 + float best_device_cc = 0.0f; //stores the compute capability of the best device
  1138 + std::cout<<"CUDA devices:"<<std::endl;
  1139 + for(int d = 0; d < dev_count; d++){ //for each CUDA device
  1140 + cudaGetDeviceProperties(&prop, d); //get the properties of the current device (d)
  1141 + float cc = prop.major + prop.minor / 10.0f; //calculate the compute capability
  1142 + std::cout<<"("<<prop.major<<"."<<prop.minor<<") "<<prop.name<<std::endl; //display the device information
  1143 + if(cc > best_device_cc){
  1144 + best_device_cc = cc; //if this is better than the previous device, use it
  1145 + best_device_id = d;
  1146 + }
  1147 + }
  1148 +
1137 if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator 1149 if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator
  1150 + std::cout<<"Using device "<<best_device_id<<std::endl;
  1151 + HANDLE_ERROR(cudaSetDevice(best_device_id));
1138 int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix 1152 int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
1139 if(status == 0) return true; //if the cuBLAS function returned correctly, we're done 1153 if(status == 0) return true; //if the cuBLAS function returned correctly, we're done
1140 } //otherwise continue using the CPU 1154 } //otherwise continue using the CPU
  1155 +
  1156 + std::cout<<"No supported CUDA devices found or cuBLAS failed. Using CPU"<<std::endl;
1141 1157
1142 //memory allocation 1158 //memory allocation
1143 unsigned long long xy = X() * Y(); 1159 unsigned long long xy = X() * Y();
@@ -1047,17 +1047,31 @@ public: @@ -1047,17 +1047,31 @@ public:
1047 bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ 1047 bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
1048 progress = 0; 1048 progress = 0;
1049 1049
1050 -//#ifdef CUDA_FOUND  
1051 int dev_count; 1050 int dev_count;
1052 - cudaGetDeviceCount(&dev_count); //get the number of CUDA devices 1051 + HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices
  1052 + std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl; //output the number of CUDA devices
1053 cudaDeviceProp prop; 1053 cudaDeviceProp prop;
1054 - cudaGetDeviceProperties(&prop, 0); //get the property of the first device 1054 + int best_device_id = 0; //stores the best CUDA device
  1055 + float best_device_cc = 0.0f; //stores the compute capability of the best device
  1056 + std::cout<<"CUDA devices:"<<std::endl;
  1057 + for(int d = 0; d < dev_count; d++){ //for each CUDA device
  1058 + cudaGetDeviceProperties(&prop, d); //get the properties of the current device (d)
  1059 + float cc = prop.major + prop.minor / 10.0f; //calculate the compute capability
  1060 + std::cout<<d<<": ("<<prop.major<<"."<<prop.minor<<") "<<prop.name<<std::endl; //display the device information
  1061 + if(cc > best_device_cc){
  1062 + best_device_cc = cc; //if this is better than the previous device, use it
  1063 + best_device_id = d;
  1064 + }
  1065 + }
  1066 +
1055 if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator 1067 if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator
  1068 + std::cout<<"Using device "<<best_device_id<<std::endl;
  1069 + HANDLE_ERROR(cudaSetDevice(best_device_id));
1056 int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix 1070 int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
1057 if(status == 0) return true; //if the cuBLAS function returned correctly, we're done 1071 if(status == 0) return true; //if the cuBLAS function returned correctly, we're done
1058 } //otherwise continue using the CPU 1072 } //otherwise continue using the CPU
1059 -//#endif  
1060 1073
  1074 + std::cout<<"No supported CUDA devices found or cuBLAS failed. Using CPU"<<std::endl;
1061 //memory allocation 1075 //memory allocation
1062 unsigned long long XY = X() * Y(); 1076 unsigned long long XY = X() * Y();
1063 unsigned long long B = Z(); 1077 unsigned long long B = Z();
@@ -1174,20 +1188,31 @@ public: @@ -1174,20 +1188,31 @@ public:
1174 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location 1188 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
1175 bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ 1189 bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
1176 1190
1177 -//#ifdef CUDA_FOUND  
1178 int dev_count; 1191 int dev_count;
1179 - cudaGetDeviceCount(&dev_count); //get the number of CUDA devices 1192 + HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices
  1193 + std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl; //output the number of CUDA devices
1180 cudaDeviceProp prop; 1194 cudaDeviceProp prop;
1181 - cudaGetDeviceProperties(&prop, 0); //get the property of the first device 1195 + int best_device_id = 0; //stores the best CUDA device
  1196 + float best_device_cc = 0.0f; //stores the compute capability of the best device
  1197 + std::cout<<"CUDA devices:"<<std::endl;
  1198 + for(int d = 0; d < dev_count; d++){ //for each CUDA device
  1199 + cudaGetDeviceProperties(&prop, d); //get the properties of the current device (d)
  1200 + float cc = prop.major + prop.minor / 10.0f; //calculate the compute capability
  1201 + std::cout<<d<<": ("<<prop.major<<"."<<prop.minor<<") "<<prop.name<<std::endl; //display the device information
  1202 + if(cc > best_device_cc){
  1203 + best_device_cc = cc; //if this is better than the previous device, use it
  1204 + best_device_id = d;
  1205 + }
  1206 + }
  1207 +
1182 if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator 1208 if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator
  1209 + std::cout<<"Using device "<<best_device_id<<std::endl;
  1210 + HANDLE_ERROR(cudaSetDevice(best_device_id));
1183 int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix 1211 int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
1184 if(status == 0) return true; //if the cuBLAS function returned correctly, we're done 1212 if(status == 0) return true; //if the cuBLAS function returned correctly, we're done
1185 } //otherwise continue using the CPU 1213 } //otherwise continue using the CPU
1186 - //if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator  
1187 - // return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix  
1188 -//#endif  
1189 -  
1190 - 1214 +
  1215 + std::cout<<"cuBLAS initialization failed - using CPU"<<std::endl;
1191 1216
1192 progress = 0; 1217 progress = 0;
1193 //memory allocation 1218 //memory allocation
@@ -1265,7 +1265,6 @@ public: @@ -1265,7 +1265,6 @@ public:
1265 yi > sy/2 && yi < Y() - sy/2){ 1265 yi > sy/2 && yi < Y() - sy/2){
1266 size_t cx = xi - sx/2; //calculate the corner position for the subimage 1266 size_t cx = xi - sx/2; //calculate the corner position for the subimage
1267 size_t cy = yi - sy/2; 1267 size_t cy = yi - sy/2;
1268 - size_t cxi, cyi;  
1269 for(size_t syi = 0; syi < sy; syi++){ //for each line in the subimage 1268 for(size_t syi = 0; syi < sy; syi++){ //for each line in the subimage
1270 size_t src_i = (cy + syi) * X() + cx; 1269 size_t src_i = (cy + syi) * X() + cx;
1271 //size_t dst_i = syi * (N * sx) + n * sx; 1270 //size_t dst_i = syi * (N * sx) + n * sx;