Commit 1c92dffecd442c80bf3562f2958b9bd1aa775ac0
1 parent
f78ceb64
simplified CUDA implementation in covariance and projection calculations
Also updated stim::matrix to allow re-arranging of rows and columns, as well as saving to CSV files.
Showing
2 changed files
with
95 additions
and
101 deletions
Show diff stats
stim/envi/bip.h
... | ... | @@ -1015,7 +1015,7 @@ public: |
1015 | 1015 | stat = cublasCreate(&handle); //create a cuBLAS instance |
1016 | 1016 | if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid |
1017 | 1017 | |
1018 | - else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl; | |
1018 | + //else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl; | |
1019 | 1019 | for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel |
1020 | 1020 | if (mask == NULL || mask[xy] != 0){ |
1021 | 1021 | pixeld(s, xy); //retrieve the spectrum at the current xy pixel location |
... | ... | @@ -1054,31 +1054,17 @@ public: |
1054 | 1054 | if(cuda_device >= 0){ //if a CUDA device is specified |
1055 | 1055 | int dev_count; |
1056 | 1056 | HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices |
1057 | - std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl; //output the number of CUDA devices | |
1058 | - cudaDeviceProp prop; | |
1059 | - //int best_device_id = 0; //stores the best CUDA device | |
1060 | - //float best_device_cc = 0.0f; //stores the compute capability of the best device | |
1061 | - std::cout<<"CUDA devices----"<<std::endl; | |
1062 | - for(int d = 0; d < dev_count; d++){ //for each CUDA device | |
1063 | - cudaGetDeviceProperties(&prop, d); //get the property of the first device | |
1064 | - //float cc = prop.major + prop.minor / 10.0f; //calculate the compute capability | |
1065 | - std::cout<<d<<": ["<<prop.major<<"."<<prop.minor<<"] "<<prop.name<<std::endl; //display the device information | |
1066 | - //if(cc > best_device_cc){ | |
1067 | - // best_device_cc = cc; //if this is better than the previous device, use it | |
1068 | - // best_device_id = d; | |
1069 | - //} | |
1070 | - } | |
1071 | 1057 | if(dev_count > 0 && dev_count > cuda_device){ //if the first device is not an emulator |
1058 | + cudaDeviceProp prop; | |
1072 | 1059 | cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device |
1073 | 1060 | if (prop.major != 9999) { |
1074 | - std::cout << "Using device " << cuda_device << std::endl; | |
1061 | + std::cout << "Using CUDA device [" << cuda_device << "] to calculate the mean covariance matrix..."<<std::endl; | |
1075 | 1062 | HANDLE_ERROR(cudaSetDevice(cuda_device)); |
1076 | 1063 | int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix |
1077 | 1064 | if (status == 0) return true; //if the cuBLAS function returned correctly, we're done |
1078 | 1065 | } |
1079 | - } //otherwise continue using the CPU | |
1080 | - | |
1081 | - std::cout<<"No supported CUDA devices found or cuBLAS failed. Using CPU"<<std::endl; | |
1066 | + } //otherwise continue using the CPU | |
1067 | + std::cout<<"WARNING: cuBLAS failed, using CPU"<<std::endl; | |
1082 | 1068 | } |
1083 | 1069 | //memory allocation |
1084 | 1070 | unsigned long long XY = X() * Y(); |
... | ... | @@ -1196,32 +1182,20 @@ public: |
1196 | 1182 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location |
1197 | 1183 | bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false){ |
1198 | 1184 | |
1199 | - if(cuda_device >= 0){ | |
1185 | + if (cuda_device >= 0) { //if a CUDA device is specified | |
1200 | 1186 | int dev_count; |
1201 | 1187 | HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices |
1202 | - std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl; //output the number of CUDA devices | |
1203 | - cudaDeviceProp prop; | |
1204 | - //int best_device_id = 0; //stores the best CUDA device | |
1205 | - //float best_device_cc = 0.0f; //stores the compute capability of the best device | |
1206 | - std::cout<<"CUDA devices:"<<std::endl; | |
1207 | - for(int d = 0; d < dev_count; d++){ //for each CUDA device | |
1208 | - cudaGetDeviceProperties(&prop, d); //get the property of the first device | |
1209 | - //float cc = prop.major + prop.minor / 10.0f; //calculate the compute capability | |
1210 | - std::cout<<d<<": ("<<prop.major<<"."<<prop.minor<<") "<<prop.name<<std::endl; //display the device information | |
1211 | - //if(cc > best_device_cc){ | |
1212 | - // best_device_cc = cc; //if this is better than the previous device, use it | |
1213 | - // best_device_id = d; | |
1214 | - //} | |
1215 | - } | |
1216 | - | |
1217 | - if(dev_count > 0 && prop.major != 9999 && dev_count > cuda_device){ //if the first device is not an emulator | |
1218 | - std::cout<<"Using device "<< cuda_device <<std::endl; | |
1219 | - HANDLE_ERROR(cudaSetDevice(cuda_device)); | |
1220 | - int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | |
1221 | - if(status == 0) return true; //if the cuBLAS function returned correctly, we're done | |
1222 | - } //otherwise continue using the CPU | |
1223 | - | |
1224 | - std::cout<<"cuBLAS initialization failed - using CPU"<<std::endl; | |
1188 | + if (dev_count > 0 && dev_count > cuda_device) { //if the first device is not an emulator | |
1189 | + cudaDeviceProp prop; | |
1190 | + cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device | |
1191 | + if (prop.major != 9999) { | |
1192 | + std::cout << "Using CUDA device [" << cuda_device << "] to calculate the noise covariance matrix..." << std::endl; | |
1193 | + HANDLE_ERROR(cudaSetDevice(cuda_device)); | |
1194 | + int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | |
1195 | + if (status == 0) return true; //if the cuBLAS function returned correctly, we're done | |
1196 | + } | |
1197 | + } //otherwise continue using the CPU | |
1198 | + std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl; | |
1225 | 1199 | } |
1226 | 1200 | |
1227 | 1201 | progress = 0; |
... | ... | @@ -1277,8 +1251,8 @@ public: |
1277 | 1251 | /// @param mask is a character mask used to limit processing to valid pixels |
1278 | 1252 | bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){ |
1279 | 1253 | |
1280 | - cudaError_t cudaStat; | |
1281 | - cublasStatus_t stat; | |
1254 | + //cudaError_t cudaStat; | |
1255 | + //cublasStatus_t stat; | |
1282 | 1256 | cublasHandle_t handle; |
1283 | 1257 | |
1284 | 1258 | std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file |
... | ... | @@ -1289,12 +1263,12 @@ public: |
1289 | 1263 | |
1290 | 1264 | double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file |
1291 | 1265 | double* s_dev; //declare a device pointer that will store the spectrum on the GPU |
1292 | - cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum | |
1266 | + HANDLE_ERROR(cudaMalloc(&s_dev, B * sizeof(double))); //allocate space on the CUDA device for the spectrum | |
1293 | 1267 | |
1294 | 1268 | |
1295 | 1269 | double* basis_dev; // device pointer on the GPU |
1296 | - cudaStat = cudaMalloc(&basis_dev, M * B * sizeof(double)); // allocate space on the CUDA device | |
1297 | - cudaStat = cudaMemset(basis_dev, 0, M * B * sizeof(double)); // initialize basis_dev to zero (0) | |
1270 | + HANDLE_ERROR(cudaMalloc(&basis_dev, M * B * sizeof(double))); // allocate space on the CUDA device | |
1271 | + HANDLE_ERROR(cudaMemset(basis_dev, 0, M * B * sizeof(double))); // initialize basis_dev to zero (0) | |
1298 | 1272 | |
1299 | 1273 | |
1300 | 1274 | /// transposing basis matrix (because cuBLAS is column-major) |
... | ... | @@ -1303,28 +1277,24 @@ public: |
1303 | 1277 | for (int i = 0; i<M; i++) |
1304 | 1278 | for (int j = 0; j<B; j++) |
1305 | 1279 | basis_Transposed[i+j*M] = basis[i*B+j]; |
1280 | + //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format) | |
1281 | + CUBLAS_HANDLE_ERROR(cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M)); | |
1306 | 1282 | |
1307 | - stat = cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M); //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format) | |
1308 | - | |
1309 | - double* center_dev; //declare a device pointer that will store the center (average) | |
1310 | - cudaStat = cudaMalloc(¢er_dev, B * sizeof(double)); //allocate space on the CUDA device for the center (average) | |
1311 | - stat = cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1); //copy the center vector (average) to the CUDA device (from host to device) | |
1283 | + double* center_dev; //declare a device pointer that will store the center (average) | |
1284 | + HANDLE_ERROR(cudaMalloc(¢er_dev, B * sizeof(double))); //allocate space on the CUDA device for the center (average) | |
1285 | + CUBLAS_HANDLE_ERROR(cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1)); //copy the center vector (average) to the CUDA device (from host to device) | |
1312 | 1286 | |
1313 | 1287 | |
1314 | - double* A = (double*)malloc(sizeof(double) * M); //allocate space for the projected pixel on the host | |
1315 | - double* A_dev; //declare a device pointer that will store the projected pixel on the GPU | |
1316 | - cudaStat = cudaMalloc(&A_dev,M * sizeof(double)); //allocate space on the CUDA device for the projected pixel | |
1317 | - cudaStat = cudaMemset(A_dev, 0,M * sizeof(double)); //initialize the projected pixel to zero (0) | |
1288 | + double* A = (double*)malloc(sizeof(double) * M); //allocate space for the projected pixel on the host | |
1289 | + double* A_dev; //declare a device pointer that will store the projected pixel on the GPU | |
1290 | + HANDLE_ERROR(cudaMalloc(&A_dev,M * sizeof(double))); //allocate space on the CUDA device for the projected pixel | |
1291 | + HANDLE_ERROR(cudaMemset(A_dev, 0,M * sizeof(double))); //initialize the projected pixel to zero (0) | |
1318 | 1292 | |
1319 | 1293 | double axpy_alpha = -1; //multiplication factor for the center (in order to perform a subtraction) |
1320 | 1294 | double axpy_alpha2 = 1; //multiplication factor for the matrix-vector multiplication |
1321 | 1295 | double axpy_beta = 0; //multiplication factor for the matrix-vector multiplication (there is no second scalar) |
1322 | 1296 | |
1323 | - stat = cublasCreate(&handle); //create a cuBLAS instance | |
1324 | - if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid | |
1325 | - printf ("CUBLAS initialization failed\n"); | |
1326 | - return EXIT_FAILURE; | |
1327 | - } | |
1297 | + CUBLAS_HANDLE_ERROR(cublasCreate(&handle)); //create a cuBLAS instance | |
1328 | 1298 | |
1329 | 1299 | T* temp = (T*)malloc(sizeof(T) * M); //allocate space for the projected pixel to be written on the disc |
1330 | 1300 | size_t i; |
... | ... | @@ -1332,14 +1302,11 @@ public: |
1332 | 1302 | if (mask == NULL || mask[xy] != 0){ |
1333 | 1303 | pixeld(s, xy); //retrieve the spectrum at the current xy pixel location |
1334 | 1304 | |
1335 | - stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device | |
1336 | - stat = cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1); //subtract the center (average) | |
1337 | - stat = cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1); //performs the matrix-vector multiplication | |
1338 | - stat = cublasGetVector((int)M, sizeof(double), A_dev, 1, A, 1); //copy the projected pixel to the host (from GPU to CPU) | |
1339 | - | |
1340 | - //stat = cublasGetVector((int)B, sizeof(double), s_dev, 1, A, 1); //copy the projected pixel to the host (from GPU to CPU) | |
1341 | - | |
1342 | - //std::copy<double*, T*>(A, A + M, temp); | |
1305 | + CUBLAS_HANDLE_ERROR(cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1)); //copy the spectrum from the host to the device | |
1306 | + CUBLAS_HANDLE_ERROR(cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1)); //subtract the center (average) | |
1307 | + CUBLAS_HANDLE_ERROR(cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1)); //performs the matrix-vector multiplication | |
1308 | + CUBLAS_HANDLE_ERROR(cublasGetVector((int)M, sizeof(double), A_dev, 1, A, 1)); //copy the projected pixel to the host (from GPU to CPU) | |
1309 | + | |
1343 | 1310 | for(i = 0; i < M; i++) temp[i] = (T)A[i]; //casting projected pixel from double to whatever T is |
1344 | 1311 | } |
1345 | 1312 | else |
... | ... | @@ -1351,10 +1318,11 @@ public: |
1351 | 1318 | } |
1352 | 1319 | |
1353 | 1320 | //clean up allocated device memory |
1354 | - cudaFree(A_dev); | |
1355 | - cudaFree(s_dev); | |
1356 | - cudaFree(basis_dev); | |
1357 | - cudaFree(center_dev); | |
1321 | + HANDLE_ERROR(cudaFree(A_dev)); | |
1322 | + HANDLE_ERROR(cudaFree(s_dev)); | |
1323 | + HANDLE_ERROR(cudaFree(basis_dev)); | |
1324 | + HANDLE_ERROR(cudaFree(center_dev)); | |
1325 | + CUBLAS_HANDLE_ERROR(cublasDestroy(handle)); | |
1358 | 1326 | free(A); |
1359 | 1327 | free(s); |
1360 | 1328 | free(temp); |
... | ... | @@ -1370,29 +1338,19 @@ public: |
1370 | 1338 | /// @param M is the number of basis vectors |
1371 | 1339 | /// @param mask is a character mask used to limit processing to valid pixels |
1372 | 1340 | bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, int cuda_device = 0, bool PROGRESS = false){ |
1373 | - | |
1374 | 1341 | if (cuda_device >= 0) { //if a CUDA device is specified |
1375 | 1342 | int dev_count; |
1376 | 1343 | HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices |
1377 | - std::cout << "Number of CUDA devices: " << dev_count << std::endl; //output the number of CUDA devices | |
1378 | - cudaDeviceProp prop; | |
1379 | - //int best_device_id = 0; //stores the best CUDA device | |
1380 | - //float best_device_cc = 0.0f; //stores the compute capability of the best device | |
1381 | - std::cout << "CUDA devices----" << std::endl; | |
1382 | - for (int d = 0; d < dev_count; d++) { //for each CUDA device | |
1383 | - cudaGetDeviceProperties(&prop, d); //get the property of the first device | |
1384 | - //float cc = prop.major + prop.minor / 10.0f; //calculate the compute capability | |
1385 | - std::cout << d << ": [" << prop.major << "." << prop.minor << "] " << prop.name << std::endl; //display the device information | |
1386 | - } | |
1387 | 1344 | if (dev_count > 0 && dev_count > cuda_device) { //if the first device is not an emulator |
1345 | + cudaDeviceProp prop; | |
1388 | 1346 | cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device |
1389 | 1347 | if (prop.major != 9999) { |
1390 | - std::cout << "Using device " << cuda_device << std::endl; | |
1348 | + std::cout << "Using CUDA device [" << cuda_device << "] to perform a basis projection..." << std::endl; | |
1391 | 1349 | HANDLE_ERROR(cudaSetDevice(cuda_device)); |
1392 | - int status = project_cublas(outfile, center, basis, M, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | |
1393 | - if (status == 0) return true; //if the cuBLAS function returned correctly, we're done | |
1350 | + return project_cublas(outfile, center, basis, M, mask, PROGRESS); | |
1394 | 1351 | } |
1395 | - } //otherwise continue using the CPU | |
1352 | + } //otherwise continue using the CPU | |
1353 | + std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl; | |
1396 | 1354 | } |
1397 | 1355 | |
1398 | 1356 | std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file | ... | ... |
stim/math/matrix.h
... | ... | @@ -6,7 +6,7 @@ |
6 | 6 | #include <iostream> |
7 | 7 | #include <stim/math/vector.h> |
8 | 8 | #include <stim/math/vec3.h> |
9 | -#include <stim/cuda/cudatools/callable.h> | |
9 | +//#include <stim/cuda/cudatools/callable.h> | |
10 | 10 | |
11 | 11 | namespace stim{ |
12 | 12 | |
... | ... | @@ -56,7 +56,7 @@ public: |
56 | 56 | return C; |
57 | 57 | } |
58 | 58 | |
59 | - T& operator()(int row, int col) { | |
59 | + T& operator()(size_t row, size_t col) { | |
60 | 60 | return at(row, col); |
61 | 61 | } |
62 | 62 | |
... | ... | @@ -122,10 +122,11 @@ public: |
122 | 122 | |
123 | 123 | matrix<T> result(R, rhs.C); //create the output matrix |
124 | 124 | T inner; //stores the running inner product |
125 | - for(size_t c = 0; c < rhs.C; c++){ | |
126 | - for(size_t r = 0; r < R; r++){ | |
125 | + size_t c, r, i; | |
126 | + for(c = 0; c < rhs.C; c++){ | |
127 | + for(r = 0; r < R; r++){ | |
127 | 128 | inner = (T)0; |
128 | - for(size_t i = 0; i < C; i++){ | |
129 | + for(i = 0; i < C; i++){ | |
129 | 130 | inner += at(r, i) * rhs.at(i, c); |
130 | 131 | } |
131 | 132 | result.M[c * R + r] = inner; |
... | ... | @@ -151,23 +152,58 @@ public: |
151 | 152 | return result; |
152 | 153 | } |
153 | 154 | |
154 | - std::string toStr() | |
155 | - { | |
155 | + /// Sort rows of the matrix by the specified indices | |
156 | + matrix<T> sort_rows(size_t* idx) { | |
157 | + matrix<T> result(C, R); //create the output matrix | |
158 | + size_t r, c; | |
159 | + for (c = 0; c < C; c++) { //for each column | |
160 | + for (r = 0; r < R; r++) { //for each row element | |
161 | + result.M[c * R + r] = M[c * R + idx[r]]; //copy each element of the row into its new position | |
162 | + } | |
163 | + } | |
164 | + return result; | |
165 | + } | |
166 | + | |
167 | + /// Sort columns of the matrix by the specified indices | |
168 | + matrix<T> sort_cols(size_t* idx) { | |
169 | + matrix<T> result(C, R); | |
170 | + size_t c; | |
171 | + for (c = 0; c < C; c++) { //for each column | |
172 | + memcpy(&result.M[c * R], &M[idx[c] * R], sizeof(T) * R); //copy the entire column from this matrix to the appropriate location | |
173 | + } | |
174 | + return result; | |
175 | + } | |
176 | + | |
177 | + std::string toStr() { | |
156 | 178 | std::stringstream ss; |
157 | 179 | |
158 | - for(int r = 0; r < R; r++) | |
159 | - { | |
180 | + for(int r = 0; r < R; r++) { | |
160 | 181 | ss << "| "; |
161 | - for(int c=0; c<C; c++) | |
162 | - { | |
182 | + for(int c=0; c<C; c++) { | |
163 | 183 | ss << M[c * R + r] << " "; |
164 | 184 | } |
165 | 185 | ss << "|" << std::endl; |
166 | 186 | } |
167 | - | |
168 | 187 | return ss.str(); |
169 | 188 | } |
170 | 189 | |
190 | + std::string csv() { | |
191 | + std::stringstream csvss; | |
192 | + for (size_t i = 0; i < B; i++) { | |
193 | + csvss << M[i]; | |
194 | + for (size_t j = 0; j < B; j++) csvss << ", " << M[j * B + i]; | |
195 | + csvss << std::endl; | |
196 | + } | |
197 | + return csvss.str(); | |
198 | + } | |
199 | + | |
200 | + //save the data as a CSV file | |
201 | + void csv(std::string filename) { | |
202 | + ofstream basisfile(filename.c_str()); | |
203 | + basisfile << csv(); | |
204 | + basisfile.close(); | |
205 | + } | |
206 | + | |
171 | 207 | static matrix<T> I(size_t N) { |
172 | 208 | |
173 | 209 | matrix<T> result(N, N); //create the identity matrix | ... | ... |