Commit 1c92dffecd442c80bf3562f2958b9bd1aa775ac0

Authored by David Mayerich
1 parent f78ceb64

simplified CUDA implementation in covariance and projection calculations

Also updated stim::matrix to allow re-arranging of rows and columns, as well as saving to CSV files.
Showing 2 changed files with 95 additions and 101 deletions   Show diff stats
stim/envi/bip.h
... ... @@ -1015,7 +1015,7 @@ public:
1015 1015 stat = cublasCreate(&handle); //create a cuBLAS instance
1016 1016 if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid
1017 1017  
1018   - else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
  1018 + //else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
1019 1019 for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel
1020 1020 if (mask == NULL || mask[xy] != 0){
1021 1021 pixeld(s, xy); //retreive the spectrum at the current xy pixel location
... ... @@ -1054,31 +1054,17 @@ public:
1054 1054 if(cuda_device >= 0){ //if a CUDA device is specified
1055 1055 int dev_count;
1056 1056 HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices
1057   - std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl; //output the number of CUDA devices
1058   - cudaDeviceProp prop;
1059   - //int best_device_id = 0; //stores the best CUDA device
1060   - //float best_device_cc = 0.0f; //stores the compute capability of the best device
1061   - std::cout<<"CUDA devices----"<<std::endl;
1062   - for(int d = 0; d < dev_count; d++){ //for each CUDA device
1063   - cudaGetDeviceProperties(&prop, d); //get the property of the first device
1064   - //float cc = prop.major + prop.minor / 10.0f; //calculate the compute capability
1065   - std::cout<<d<<": ["<<prop.major<<"."<<prop.minor<<"] "<<prop.name<<std::endl; //display the device information
1066   - //if(cc > best_device_cc){
1067   - // best_device_cc = cc; //if this is better than the previous device, use it
1068   - // best_device_id = d;
1069   - //}
1070   - }
1071 1057 if(dev_count > 0 && dev_count > cuda_device){ //if the first device is not an emulator
  1058 + cudaDeviceProp prop;
1072 1059 cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device
1073 1060 if (prop.major != 9999) {
1074   - std::cout << "Using device " << cuda_device << std::endl;
  1061 + std::cout << "Using CUDA device [" << cuda_device << "] to calculate the mean covariance matrix..."<<std::endl;
1075 1062 HANDLE_ERROR(cudaSetDevice(cuda_device));
1076 1063 int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
1077 1064 if (status == 0) return true; //if the cuBLAS function returned correctly, we're done
1078 1065 }
1079   - } //otherwise continue using the CPU
1080   -
1081   - std::cout<<"No supported CUDA devices found or cuBLAS failed. Using CPU"<<std::endl;
  1066 + } //otherwise continue using the CPU
  1067 + std::cout<<"WARNING: cuBLAS failed, using CPU"<<std::endl;
1082 1068 }
1083 1069 //memory allocation
1084 1070 unsigned long long XY = X() * Y();
... ... @@ -1196,32 +1182,20 @@ public:
1196 1182 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
1197 1183 bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false){
1198 1184  
1199   - if(cuda_device >= 0){
  1185 + if (cuda_device >= 0) { //if a CUDA device is specified
1200 1186 int dev_count;
1201 1187 HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices
1202   - std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl; //output the number of CUDA devices
1203   - cudaDeviceProp prop;
1204   - //int best_device_id = 0; //stores the best CUDA device
1205   - //float best_device_cc = 0.0f; //stores the compute capability of the best device
1206   - std::cout<<"CUDA devices:"<<std::endl;
1207   - for(int d = 0; d < dev_count; d++){ //for each CUDA device
1208   - cudaGetDeviceProperties(&prop, d); //get the property of the first device
1209   - //float cc = prop.major + prop.minor / 10.0f; //calculate the compute capability
1210   - std::cout<<d<<": ("<<prop.major<<"."<<prop.minor<<") "<<prop.name<<std::endl; //display the device information
1211   - //if(cc > best_device_cc){
1212   - // best_device_cc = cc; //if this is better than the previous device, use it
1213   - // best_device_id = d;
1214   - //}
1215   - }
1216   -
1217   - if(dev_count > 0 && prop.major != 9999 && dev_count > cuda_device){ //if the first device is not an emulator
1218   - std::cout<<"Using device "<< cuda_device <<std::endl;
1219   - HANDLE_ERROR(cudaSetDevice(cuda_device));
1220   - int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
1221   - if(status == 0) return true; //if the cuBLAS function returned correctly, we're done
1222   - } //otherwise continue using the CPU
1223   -
1224   - std::cout<<"cuBLAS initialization failed - using CPU"<<std::endl;
  1188 + if (dev_count > 0 && dev_count > cuda_device) { //if the first device is not an emulator
  1189 + cudaDeviceProp prop;
  1190 + cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device
  1191 + if (prop.major != 9999) {
  1192 + std::cout << "Using CUDA device [" << cuda_device << "] to calculate the noise covariance matrix..." << std::endl;
  1193 + HANDLE_ERROR(cudaSetDevice(cuda_device));
  1194 + int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
  1195 + if (status == 0) return true; //if the cuBLAS function returned correctly, we're done
  1196 + }
  1197 + } //otherwise continue using the CPU
  1198 + std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl;
1225 1199 }
1226 1200  
1227 1201 progress = 0;
... ... @@ -1277,8 +1251,8 @@ public:
1277 1251 /// @param mask is a character mask used to limit processing to valid pixels
1278 1252 bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){
1279 1253  
1280   - cudaError_t cudaStat;
1281   - cublasStatus_t stat;
  1254 + //cudaError_t cudaStat;
  1255 + //cublasStatus_t stat;
1282 1256 cublasHandle_t handle;
1283 1257  
1284 1258 std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file
... ... @@ -1289,12 +1263,12 @@ public:
1289 1263  
1290 1264 double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file
1291 1265 double* s_dev; //declare a device pointer that will store the spectrum on the GPU
1292   - cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum
  1266 + HANDLE_ERROR(cudaMalloc(&s_dev, B * sizeof(double))); //allocate space on the CUDA device for the spectrum
1293 1267  
1294 1268  
1295 1269 double* basis_dev; // device pointer on the GPU
1296   - cudaStat = cudaMalloc(&basis_dev, M * B * sizeof(double)); // allocate space on the CUDA device
1297   - cudaStat = cudaMemset(basis_dev, 0, M * B * sizeof(double)); // initialize basis_dev to zero (0)
  1270 + HANDLE_ERROR(cudaMalloc(&basis_dev, M * B * sizeof(double))); // allocate space on the CUDA device
  1271 + HANDLE_ERROR(cudaMemset(basis_dev, 0, M * B * sizeof(double))); // initialize basis_dev to zero (0)
1298 1272  
1299 1273  
1300 1274 /// transposing basis matrix (because cuBLAS is column-major)
... ... @@ -1303,28 +1277,24 @@ public:
1303 1277 for (int i = 0; i<M; i++)
1304 1278 for (int j = 0; j<B; j++)
1305 1279 basis_Transposed[i+j*M] = basis[i*B+j];
  1280 + //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format)
  1281 + CUBLAS_HANDLE_ERROR(cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M));
1306 1282  
1307   - stat = cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M); //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format)
1308   -
1309   - double* center_dev; //declare a device pointer that will store the center (average)
1310   - cudaStat = cudaMalloc(&center_dev, B * sizeof(double)); //allocate space on the CUDA device for the center (average)
1311   - stat = cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1); //copy the center vector (average) to the CUDA device (from host to device)
  1283 + double* center_dev; //declare a device pointer that will store the center (average)
  1284 + HANDLE_ERROR(cudaMalloc(&center_dev, B * sizeof(double))); //allocate space on the CUDA device for the center (average)
  1285 + CUBLAS_HANDLE_ERROR(cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1)); //copy the center vector (average) to the CUDA device (from host to device)
1312 1286  
1313 1287  
1314   - double* A = (double*)malloc(sizeof(double) * M); //allocate space for the projected pixel on the host
1315   - double* A_dev; //declare a device pointer that will store the projected pixel on the GPU
1316   - cudaStat = cudaMalloc(&A_dev,M * sizeof(double)); //allocate space on the CUDA device for the projected pixel
1317   - cudaStat = cudaMemset(A_dev, 0,M * sizeof(double)); //initialize the projected pixel to zero (0)
  1288 + double* A = (double*)malloc(sizeof(double) * M); //allocate space for the projected pixel on the host
  1289 + double* A_dev; //declare a device pointer that will store the projected pixel on the GPU
  1290 + HANDLE_ERROR(cudaMalloc(&A_dev,M * sizeof(double))); //allocate space on the CUDA device for the projected pixel
  1291 + HANDLE_ERROR(cudaMemset(A_dev, 0,M * sizeof(double))); //initialize the projected pixel to zero (0)
1318 1292  
1319 1293 double axpy_alpha = -1; //multiplication factor for the center (in order to perform a subtraction)
1320 1294 double axpy_alpha2 = 1; //multiplication factor for the matrix-vector multiplication
1321 1295 double axpy_beta = 0; //multiplication factor for the matrix-vector multiplication (there is no second scalor)
1322 1296  
1323   - stat = cublasCreate(&handle); //create a cuBLAS instance
1324   - if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid
1325   - printf ("CUBLAS initialization failed\n");
1326   - return EXIT_FAILURE;
1327   - }
  1297 + CUBLAS_HANDLE_ERROR(cublasCreate(&handle)); //create a cuBLAS instance
1328 1298  
1329 1299 T* temp = (T*)malloc(sizeof(T) * M); //allocate space for the projected pixel to be written on the disc
1330 1300 size_t i;
... ... @@ -1332,14 +1302,11 @@ public:
1332 1302 if (mask == NULL || mask[xy] != 0){
1333 1303 pixeld(s, xy); //retreive the spectrum at the current xy pixel location
1334 1304  
1335   - stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device
1336   - stat = cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1); //subtract the center (average)
1337   - stat = cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1); //performs the matrix-vector multiplication
1338   - stat = cublasGetVector((int)M, sizeof(double), A_dev, 1, A, 1); //copy the projected pixel to the host (from GPU to CPU)
1339   -
1340   - //stat = cublasGetVector((int)B, sizeof(double), s_dev, 1, A, 1); //copy the projected pixel to the host (from GPU to CPU)
1341   -
1342   - //std::copy<double*, T*>(A, A + M, temp);
  1305 + CUBLAS_HANDLE_ERROR(cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1)); //copy the spectrum from the host to the device
  1306 + CUBLAS_HANDLE_ERROR(cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1)); //subtract the center (average)
  1307 + CUBLAS_HANDLE_ERROR(cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1)); //performs the matrix-vector multiplication
  1308 + CUBLAS_HANDLE_ERROR(cublasGetVector((int)M, sizeof(double), A_dev, 1, A, 1)); //copy the projected pixel to the host (from GPU to CPU)
  1309 +
1343 1310 for(i = 0; i < M; i++) temp[i] = (T)A[i]; //casting projected pixel from double to whatever T is
1344 1311 }
1345 1312 else
... ... @@ -1351,10 +1318,11 @@ public:
1351 1318 }
1352 1319  
1353 1320 //clean up allocated device memory
1354   - cudaFree(A_dev);
1355   - cudaFree(s_dev);
1356   - cudaFree(basis_dev);
1357   - cudaFree(center_dev);
  1321 + HANDLE_ERROR(cudaFree(A_dev));
  1322 + HANDLE_ERROR(cudaFree(s_dev));
  1323 + HANDLE_ERROR(cudaFree(basis_dev));
  1324 + HANDLE_ERROR(cudaFree(center_dev));
  1325 + CUBLAS_HANDLE_ERROR(cublasDestroy(handle));
1358 1326 free(A);
1359 1327 free(s);
1360 1328 free(temp);
... ... @@ -1370,29 +1338,19 @@ public:
1370 1338 /// @param M is the number of basis vectors
1371 1339 /// @param mask is a character mask used to limit processing to valid pixels
1372 1340 bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, int cuda_device = 0, bool PROGRESS = false){
1373   -
1374 1341 if (cuda_device >= 0) { //if a CUDA device is specified
1375 1342 int dev_count;
1376 1343 HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices
1377   - std::cout << "Number of CUDA devices: " << dev_count << std::endl; //output the number of CUDA devices
1378   - cudaDeviceProp prop;
1379   - //int best_device_id = 0; //stores the best CUDA device
1380   - //float best_device_cc = 0.0f; //stores the compute capability of the best device
1381   - std::cout << "CUDA devices----" << std::endl;
1382   - for (int d = 0; d < dev_count; d++) { //for each CUDA device
1383   - cudaGetDeviceProperties(&prop, d); //get the property of the first device
1384   - //float cc = prop.major + prop.minor / 10.0f; //calculate the compute capability
1385   - std::cout << d << ": [" << prop.major << "." << prop.minor << "] " << prop.name << std::endl; //display the device information
1386   - }
1387 1344 if (dev_count > 0 && dev_count > cuda_device) { //if the first device is not an emulator
  1345 + cudaDeviceProp prop;
1388 1346 cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device
1389 1347 if (prop.major != 9999) {
1390   - std::cout << "Using device " << cuda_device << std::endl;
  1348 + std::cout << "Using CUDA device [" << cuda_device << "] to perform a basis projection..." << std::endl;
1391 1349 HANDLE_ERROR(cudaSetDevice(cuda_device));
1392   - int status = project_cublas(outfile, center, basis, M, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
1393   - if (status == 0) return true; //if the cuBLAS function returned correctly, we're done
  1350 + return project_cublas(outfile, center, basis, M, mask, PROGRESS);
1394 1351 }
1395   - } //otherwise continue using the CPU
  1352 + } //otherwise continue using the CPU
  1353 + std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl;
1396 1354 }
1397 1355  
1398 1356 std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file
... ...
stim/math/matrix.h
... ... @@ -6,7 +6,7 @@
6 6 #include <iostream>
7 7 #include <stim/math/vector.h>
8 8 #include <stim/math/vec3.h>
9   -#include <stim/cuda/cudatools/callable.h>
  9 +//#include <stim/cuda/cudatools/callable.h>
10 10  
11 11 namespace stim{
12 12  
... ... @@ -56,7 +56,7 @@ public:
56 56 return C;
57 57 }
58 58  
59   - T& operator()(int row, int col) {
  59 + T& operator()(size_t row, size_t col) {
60 60 return at(row, col);
61 61 }
62 62  
... ... @@ -122,10 +122,11 @@ public:
122 122  
123 123 matrix<T> result(R, rhs.C); //create the output matrix
124 124 T inner; //stores the running inner product
125   - for(size_t c = 0; c < rhs.C; c++){
126   - for(size_t r = 0; r < R; r++){
  125 + size_t c, r, i;
  126 + for(c = 0; c < rhs.C; c++){
  127 + for(r = 0; r < R; r++){
127 128 inner = (T)0;
128   - for(size_t i = 0; i < C; i++){
  129 + for(i = 0; i < C; i++){
129 130 inner += at(r, i) * rhs.at(i, c);
130 131 }
131 132 result.M[c * R + r] = inner;
... ... @@ -151,23 +152,58 @@ public:
151 152 return result;
152 153 }
153 154  
154   - std::string toStr()
155   - {
  155 + /// Sort rows of the matrix by the specified indices
  156 + matrix<T> sort_rows(size_t* idx) {
  157 + matrix<T> result(C, R); //create the output matrix
  158 + size_t r, c;
  159 + for (c = 0; c < C; c++) { //for each column
  160 + for (r = 0; r < R; r++) { //for each row element
  161 + result.M[c * R + r] = M[c * R + idx[r]]; //copy each element of the row into its new position
  162 + }
  163 + }
  164 + return result;
  165 + }
  166 +
  167 + /// Sort columns of the matrix by the specified indices
  168 + matrix<T> sort_cols(size_t* idx) {
  169 + matrix<T> result(C, R);
  170 + size_t c;
  171 + for (c = 0; c < C; c++) { //for each column
  172 + memcpy(&result.M[c * R], &M[idx[c] * R], sizeof(T) * R); //copy the entire column from this matrix to the appropriate location
  173 + }
  174 + return result;
  175 + }
  176 +
  177 + std::string toStr() {
156 178 std::stringstream ss;
157 179  
158   - for(int r = 0; r < R; r++)
159   - {
  180 + for(int r = 0; r < R; r++) {
160 181 ss << "| ";
161   - for(int c=0; c<C; c++)
162   - {
  182 + for(int c=0; c<C; c++) {
163 183 ss << M[c * R + r] << " ";
164 184 }
165 185 ss << "|" << std::endl;
166 186 }
167   -
168 187 return ss.str();
169 188 }
170 189  
  190 + std::string csv() {
  191 + std::stringstream csvss;
  192 + for (size_t i = 0; i < B; i++) {
  193 + csvss << M[i];
  194 + for (size_t j = 0; j < B; j++) csvss << ", " << M[j * B + i];
  195 + csvss << std::endl;
  196 + }
  197 + return csvss.str();
  198 + }
  199 +
  200 + //save the data as a CSV file
  201 + void csv(std::string filename) {
  202 + ofstream basisfile(filename.c_str());
  203 + basisfile << csv();
  204 + basisfile.close();
  205 + }
  206 +
171 207 static matrix<T> I(size_t N) {
172 208  
173 209 matrix<T> result(N, N); //create the identity matrix
... ...