Commit 1c92dffecd442c80bf3562f2958b9bd1aa775ac0

Authored by David Mayerich
1 parent f78ceb64

simplified CUDA implementation in covariance and projection calculations

Also updated stim::matrix to allow re-arranging of rows and columns, as well as saving to CSV files.
Showing 2 changed files with 95 additions and 101 deletions   Show diff stats
@@ -1015,7 +1015,7 @@ public:
1015 stat = cublasCreate(&handle); //create a cuBLAS instance 1015 stat = cublasCreate(&handle); //create a cuBLAS instance
1016 if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid 1016 if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid
1017 1017
1018 - else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl; 1018 + //else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
1019 for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel 1019 for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel
1020 if (mask == NULL || mask[xy] != 0){ 1020 if (mask == NULL || mask[xy] != 0){
1021 pixeld(s, xy); //retreive the spectrum at the current xy pixel location 1021 pixeld(s, xy); //retreive the spectrum at the current xy pixel location
@@ -1054,31 +1054,17 @@ public:
1054 if(cuda_device >= 0){ //if a CUDA device is specified 1054 if(cuda_device >= 0){ //if a CUDA device is specified
1055 int dev_count; 1055 int dev_count;
1056 HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices 1056 HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices
1057 - std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl; //output the number of CUDA devices  
1058 - cudaDeviceProp prop;  
1059 - //int best_device_id = 0; //stores the best CUDA device  
1060 - //float best_device_cc = 0.0f; //stores the compute capability of the best device  
1061 - std::cout<<"CUDA devices----"<<std::endl;  
1062 - for(int d = 0; d < dev_count; d++){ //for each CUDA device  
1063 - cudaGetDeviceProperties(&prop, d); //get the property of the first device  
1064 - //float cc = prop.major + prop.minor / 10.0f; //calculate the compute capability  
1065 - std::cout<<d<<": ["<<prop.major<<"."<<prop.minor<<"] "<<prop.name<<std::endl; //display the device information  
1066 - //if(cc > best_device_cc){  
1067 - // best_device_cc = cc; //if this is better than the previous device, use it  
1068 - // best_device_id = d;  
1069 - //}  
1070 - }  
1071 if(dev_count > 0 && dev_count > cuda_device){ //if the first device is not an emulator 1057 if(dev_count > 0 && dev_count > cuda_device){ //if the first device is not an emulator
  1058 + cudaDeviceProp prop;
1072 cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device 1059 cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device
1073 if (prop.major != 9999) { 1060 if (prop.major != 9999) {
1074 - std::cout << "Using device " << cuda_device << std::endl; 1061 + std::cout << "Using CUDA device [" << cuda_device << "] to calculate the mean covariance matrix..."<<std::endl;
1075 HANDLE_ERROR(cudaSetDevice(cuda_device)); 1062 HANDLE_ERROR(cudaSetDevice(cuda_device));
1076 int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix 1063 int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
1077 if (status == 0) return true; //if the cuBLAS function returned correctly, we're done 1064 if (status == 0) return true; //if the cuBLAS function returned correctly, we're done
1078 } 1065 }
1079 - } //otherwise continue using the CPU  
1080 -  
1081 - std::cout<<"No supported CUDA devices found or cuBLAS failed. Using CPU"<<std::endl; 1066 + } //otherwise continue using the CPU
  1067 + std::cout<<"WARNING: cuBLAS failed, using CPU"<<std::endl;
1082 } 1068 }
1083 //memory allocation 1069 //memory allocation
1084 unsigned long long XY = X() * Y(); 1070 unsigned long long XY = X() * Y();
@@ -1196,32 +1182,20 @@ public:
1196 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location 1182 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
1197 bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false){ 1183 bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false){
1198 1184
1199 - if(cuda_device >= 0){ 1185 + if (cuda_device >= 0) { //if a CUDA device is specified
1200 int dev_count; 1186 int dev_count;
1201 HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices 1187 HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices
1202 - std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl; //output the number of CUDA devices  
1203 - cudaDeviceProp prop;  
1204 - //int best_device_id = 0; //stores the best CUDA device  
1205 - //float best_device_cc = 0.0f; //stores the compute capability of the best device  
1206 - std::cout<<"CUDA devices:"<<std::endl;  
1207 - for(int d = 0; d < dev_count; d++){ //for each CUDA device  
1208 - cudaGetDeviceProperties(&prop, d); //get the property of the first device  
1209 - //float cc = prop.major + prop.minor / 10.0f; //calculate the compute capability  
1210 - std::cout<<d<<": ("<<prop.major<<"."<<prop.minor<<") "<<prop.name<<std::endl; //display the device information  
1211 - //if(cc > best_device_cc){  
1212 - // best_device_cc = cc; //if this is better than the previous device, use it  
1213 - // best_device_id = d;  
1214 - //}  
1215 - }  
1216 -  
1217 - if(dev_count > 0 && prop.major != 9999 && dev_count > cuda_device){ //if the first device is not an emulator  
1218 - std::cout<<"Using device "<< cuda_device <<std::endl;  
1219 - HANDLE_ERROR(cudaSetDevice(cuda_device));  
1220 - int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix  
1221 - if(status == 0) return true; //if the cuBLAS function returned correctly, we're done  
1222 - } //otherwise continue using the CPU  
1223 -  
1224 - std::cout<<"cuBLAS initialization failed - using CPU"<<std::endl; 1188 + if (dev_count > 0 && dev_count > cuda_device) { //if the first device is not an emulator
  1189 + cudaDeviceProp prop;
  1190 + cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device
  1191 + if (prop.major != 9999) {
  1192 + std::cout << "Using CUDA device [" << cuda_device << "] to calculate the noise covariance matrix..." << std::endl;
  1193 + HANDLE_ERROR(cudaSetDevice(cuda_device));
  1194 + int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
  1195 + if (status == 0) return true; //if the cuBLAS function returned correctly, we're done
  1196 + }
  1197 + } //otherwise continue using the CPU
  1198 + std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl;
1225 } 1199 }
1226 1200
1227 progress = 0; 1201 progress = 0;
@@ -1277,8 +1251,8 @@ public:
1277 /// @param mask is a character mask used to limit processing to valid pixels 1251 /// @param mask is a character mask used to limit processing to valid pixels
1278 bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){ 1252 bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){
1279 1253
1280 - cudaError_t cudaStat;  
1281 - cublasStatus_t stat; 1254 + //cudaError_t cudaStat;
  1255 + //cublasStatus_t stat;
1282 cublasHandle_t handle; 1256 cublasHandle_t handle;
1283 1257
1284 std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file 1258 std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file
@@ -1289,12 +1263,12 @@ public:
1289 1263
1290 double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file 1264 double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file
1291 double* s_dev; //declare a device pointer that will store the spectrum on the GPU 1265 double* s_dev; //declare a device pointer that will store the spectrum on the GPU
1292 - cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum 1266 + HANDLE_ERROR(cudaMalloc(&s_dev, B * sizeof(double))); //allocate space on the CUDA device for the spectrum
1293 1267
1294 1268
1295 double* basis_dev; // device pointer on the GPU 1269 double* basis_dev; // device pointer on the GPU
1296 - cudaStat = cudaMalloc(&basis_dev, M * B * sizeof(double)); // allocate space on the CUDA device  
1297 - cudaStat = cudaMemset(basis_dev, 0, M * B * sizeof(double)); // initialize basis_dev to zero (0) 1270 + HANDLE_ERROR(cudaMalloc(&basis_dev, M * B * sizeof(double))); // allocate space on the CUDA device
  1271 + HANDLE_ERROR(cudaMemset(basis_dev, 0, M * B * sizeof(double))); // initialize basis_dev to zero (0)
1298 1272
1299 1273
1300 /// transposing basis matrix (because cuBLAS is column-major) 1274 /// transposing basis matrix (because cuBLAS is column-major)
@@ -1303,28 +1277,24 @@ public:
1303 for (int i = 0; i<M; i++) 1277 for (int i = 0; i<M; i++)
1304 for (int j = 0; j<B; j++) 1278 for (int j = 0; j<B; j++)
1305 basis_Transposed[i+j*M] = basis[i*B+j]; 1279 basis_Transposed[i+j*M] = basis[i*B+j];
  1280 + //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format)
  1281 + CUBLAS_HANDLE_ERROR(cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M));
1306 1282
1307 - stat = cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M); //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format)  
1308 -  
1309 - double* center_dev; //declare a device pointer that will store the center (average)  
1310 - cudaStat = cudaMalloc(&center_dev, B * sizeof(double)); //allocate space on the CUDA device for the center (average)  
1311 - stat = cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1); //copy the center vector (average) to the CUDA device (from host to device) 1283 + double* center_dev; //declare a device pointer that will store the center (average)
  1284 + HANDLE_ERROR(cudaMalloc(&center_dev, B * sizeof(double))); //allocate space on the CUDA device for the center (average)
  1285 + CUBLAS_HANDLE_ERROR(cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1)); //copy the center vector (average) to the CUDA device (from host to device)
1312 1286
1313 1287
1314 - double* A = (double*)malloc(sizeof(double) * M); //allocate space for the projected pixel on the host  
1315 - double* A_dev; //declare a device pointer that will store the projected pixel on the GPU  
1316 - cudaStat = cudaMalloc(&A_dev,M * sizeof(double)); //allocate space on the CUDA device for the projected pixel  
1317 - cudaStat = cudaMemset(A_dev, 0,M * sizeof(double)); //initialize the projected pixel to zero (0) 1288 + double* A = (double*)malloc(sizeof(double) * M); //allocate space for the projected pixel on the host
  1289 + double* A_dev; //declare a device pointer that will store the projected pixel on the GPU
  1290 + HANDLE_ERROR(cudaMalloc(&A_dev,M * sizeof(double))); //allocate space on the CUDA device for the projected pixel
  1291 + HANDLE_ERROR(cudaMemset(A_dev, 0,M * sizeof(double))); //initialize the projected pixel to zero (0)
1318 1292
1319 double axpy_alpha = -1; //multiplication factor for the center (in order to perform a subtraction) 1293 double axpy_alpha = -1; //multiplication factor for the center (in order to perform a subtraction)
1320 double axpy_alpha2 = 1; //multiplication factor for the matrix-vector multiplication 1294 double axpy_alpha2 = 1; //multiplication factor for the matrix-vector multiplication
1321 double axpy_beta = 0; //multiplication factor for the matrix-vector multiplication (there is no second scalor) 1295 double axpy_beta = 0; //multiplication factor for the matrix-vector multiplication (there is no second scalor)
1322 1296
1323 - stat = cublasCreate(&handle); //create a cuBLAS instance  
1324 - if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid  
1325 - printf ("CUBLAS initialization failed\n");  
1326 - return EXIT_FAILURE;  
1327 - } 1297 + CUBLAS_HANDLE_ERROR(cublasCreate(&handle)); //create a cuBLAS instance
1328 1298
1329 T* temp = (T*)malloc(sizeof(T) * M); //allocate space for the projected pixel to be written on the disc 1299 T* temp = (T*)malloc(sizeof(T) * M); //allocate space for the projected pixel to be written on the disc
1330 size_t i; 1300 size_t i;
@@ -1332,14 +1302,11 @@ public:
1332 if (mask == NULL || mask[xy] != 0){ 1302 if (mask == NULL || mask[xy] != 0){
1333 pixeld(s, xy); //retreive the spectrum at the current xy pixel location 1303 pixeld(s, xy); //retreive the spectrum at the current xy pixel location
1334 1304
1335 - stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device  
1336 - stat = cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1); //subtract the center (average)  
1337 - stat = cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1); //performs the matrix-vector multiplication  
1338 - stat = cublasGetVector((int)M, sizeof(double), A_dev, 1, A, 1); //copy the projected pixel to the host (from GPU to CPU)  
1339 -  
1340 - //stat = cublasGetVector((int)B, sizeof(double), s_dev, 1, A, 1); //copy the projected pixel to the host (from GPU to CPU)  
1341 -  
1342 - //std::copy<double*, T*>(A, A + M, temp); 1305 + CUBLAS_HANDLE_ERROR(cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1)); //copy the spectrum from the host to the device
  1306 + CUBLAS_HANDLE_ERROR(cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1)); //subtract the center (average)
  1307 + CUBLAS_HANDLE_ERROR(cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1)); //performs the matrix-vector multiplication
  1308 + CUBLAS_HANDLE_ERROR(cublasGetVector((int)M, sizeof(double), A_dev, 1, A, 1)); //copy the projected pixel to the host (from GPU to CPU)
  1309 +
1343 for(i = 0; i < M; i++) temp[i] = (T)A[i]; //casting projected pixel from double to whatever T is 1310 for(i = 0; i < M; i++) temp[i] = (T)A[i]; //casting projected pixel from double to whatever T is
1344 } 1311 }
1345 else 1312 else
@@ -1351,10 +1318,11 @@ public:
1351 } 1318 }
1352 1319
1353 //clean up allocated device memory 1320 //clean up allocated device memory
1354 - cudaFree(A_dev);  
1355 - cudaFree(s_dev);  
1356 - cudaFree(basis_dev);  
1357 - cudaFree(center_dev); 1321 + HANDLE_ERROR(cudaFree(A_dev));
  1322 + HANDLE_ERROR(cudaFree(s_dev));
  1323 + HANDLE_ERROR(cudaFree(basis_dev));
  1324 + HANDLE_ERROR(cudaFree(center_dev));
  1325 + CUBLAS_HANDLE_ERROR(cublasDestroy(handle));
1358 free(A); 1326 free(A);
1359 free(s); 1327 free(s);
1360 free(temp); 1328 free(temp);
@@ -1370,29 +1338,19 @@ public:
1370 /// @param M is the number of basis vectors 1338 /// @param M is the number of basis vectors
1371 /// @param mask is a character mask used to limit processing to valid pixels 1339 /// @param mask is a character mask used to limit processing to valid pixels
1372 bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, int cuda_device = 0, bool PROGRESS = false){ 1340 bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, int cuda_device = 0, bool PROGRESS = false){
1373 -  
1374 if (cuda_device >= 0) { //if a CUDA device is specified 1341 if (cuda_device >= 0) { //if a CUDA device is specified
1375 int dev_count; 1342 int dev_count;
1376 HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices 1343 HANDLE_ERROR(cudaGetDeviceCount(&dev_count)); //get the number of CUDA devices
1377 - std::cout << "Number of CUDA devices: " << dev_count << std::endl; //output the number of CUDA devices  
1378 - cudaDeviceProp prop;  
1379 - //int best_device_id = 0; //stores the best CUDA device  
1380 - //float best_device_cc = 0.0f; //stores the compute capability of the best device  
1381 - std::cout << "CUDA devices----" << std::endl;  
1382 - for (int d = 0; d < dev_count; d++) { //for each CUDA device  
1383 - cudaGetDeviceProperties(&prop, d); //get the property of the first device  
1384 - //float cc = prop.major + prop.minor / 10.0f; //calculate the compute capability  
1385 - std::cout << d << ": [" << prop.major << "." << prop.minor << "] " << prop.name << std::endl; //display the device information  
1386 - }  
1387 if (dev_count > 0 && dev_count > cuda_device) { //if the first device is not an emulator 1344 if (dev_count > 0 && dev_count > cuda_device) { //if the first device is not an emulator
  1345 + cudaDeviceProp prop;
1388 cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device 1346 cudaGetDeviceProperties(&prop, cuda_device); //get the property of the requested CUDA device
1389 if (prop.major != 9999) { 1347 if (prop.major != 9999) {
1390 - std::cout << "Using device " << cuda_device << std::endl; 1348 + std::cout << "Using CUDA device [" << cuda_device << "] to perform a basis projection..." << std::endl;
1391 HANDLE_ERROR(cudaSetDevice(cuda_device)); 1349 HANDLE_ERROR(cudaSetDevice(cuda_device));
1392 - int status = project_cublas(outfile, center, basis, M, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix  
1393 - if (status == 0) return true; //if the cuBLAS function returned correctly, we're done 1350 + return project_cublas(outfile, center, basis, M, mask, PROGRESS);
1394 } 1351 }
1395 - } //otherwise continue using the CPU 1352 + } //otherwise continue using the CPU
  1353 + std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl;
1396 } 1354 }
1397 1355
1398 std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file 1356 std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file
stim/math/matrix.h
@@ -6,7 +6,7 @@
6 #include <iostream> 6 #include <iostream>
7 #include <stim/math/vector.h> 7 #include <stim/math/vector.h>
8 #include <stim/math/vec3.h> 8 #include <stim/math/vec3.h>
9 -#include <stim/cuda/cudatools/callable.h> 9 +//#include <stim/cuda/cudatools/callable.h>
10 10
11 namespace stim{ 11 namespace stim{
12 12
@@ -56,7 +56,7 @@ public:
56 return C; 56 return C;
57 } 57 }
58 58
59 - T& operator()(int row, int col) { 59 + T& operator()(size_t row, size_t col) {
60 return at(row, col); 60 return at(row, col);
61 } 61 }
62 62
@@ -122,10 +122,11 @@ public:
122 122
123 matrix<T> result(R, rhs.C); //create the output matrix 123 matrix<T> result(R, rhs.C); //create the output matrix
124 T inner; //stores the running inner product 124 T inner; //stores the running inner product
125 - for(size_t c = 0; c < rhs.C; c++){  
126 - for(size_t r = 0; r < R; r++){ 125 + size_t c, r, i;
  126 + for(c = 0; c < rhs.C; c++){
  127 + for(r = 0; r < R; r++){
127 inner = (T)0; 128 inner = (T)0;
128 - for(size_t i = 0; i < C; i++){ 129 + for(i = 0; i < C; i++){
129 inner += at(r, i) * rhs.at(i, c); 130 inner += at(r, i) * rhs.at(i, c);
130 } 131 }
131 result.M[c * R + r] = inner; 132 result.M[c * R + r] = inner;
@@ -151,23 +152,58 @@ public:
151 return result; 152 return result;
152 } 153 }
153 154
154 - std::string toStr()  
155 - { 155 + /// Sort rows of the matrix by the specified indices
  156 + matrix<T> sort_rows(size_t* idx) {
  157 + matrix<T> result(C, R); //create the output matrix
  158 + size_t r, c;
  159 + for (c = 0; c < C; c++) { //for each column
  160 + for (r = 0; r < R; r++) { //for each row element
  161 + result.M[c * R + r] = M[c * R + idx[r]]; //copy each element of the row into its new position
  162 + }
  163 + }
  164 + return result;
  165 + }
  166 +
  167 + /// Sort columns of the matrix by the specified indices
  168 + matrix<T> sort_cols(size_t* idx) {
  169 + matrix<T> result(C, R);
  170 + size_t c;
  171 + for (c = 0; c < C; c++) { //for each column
  172 + memcpy(&result.M[c * R], &M[idx[c] * R], sizeof(T) * R); //copy the entire column from this matrix to the appropriate location
  173 + }
  174 + return result;
  175 + }
  176 +
  177 + std::string toStr() {
156 std::stringstream ss; 178 std::stringstream ss;
157 179
158 - for(int r = 0; r < R; r++)  
159 - { 180 + for(int r = 0; r < R; r++) {
160 ss << "| "; 181 ss << "| ";
161 - for(int c=0; c<C; c++)  
162 - { 182 + for(int c=0; c<C; c++) {
163 ss << M[c * R + r] << " "; 183 ss << M[c * R + r] << " ";
164 } 184 }
165 ss << "|" << std::endl; 185 ss << "|" << std::endl;
166 } 186 }
167 -  
168 return ss.str(); 187 return ss.str();
169 } 188 }
170 189
  190 + std::string csv() {
  191 + std::stringstream csvss;
  192 + for (size_t i = 0; i < B; i++) {
  193 + csvss << M[i];
  194 + for (size_t j = 0; j < B; j++) csvss << ", " << M[j * B + i];
  195 + csvss << std::endl;
  196 + }
  197 + return csvss.str();
  198 + }
  199 +
  200 + //save the data as a CSV file
  201 + void csv(std::string filename) {
  202 + ofstream basisfile(filename.c_str());
  203 + basisfile << csv();
  204 + basisfile.close();
  205 + }
  206 +
171 static matrix<T> I(size_t N) { 207 static matrix<T> I(size_t N) {
172 208
173 matrix<T> result(N, N); //create the identity matrix 209 matrix<T> result(N, N); //create the identity matrix