Commit 334547d8a4ebc1de16f52a6d48451d11de0781b1
1 parent
65c0cc85
changes for mnf: adding covariance of noise function in cpu and gpu also a gpu …
…version of project function
Showing
2 changed files
with
266 additions
and
2 deletions
Show diff stats
stim/envi/bip.h
@@ -373,7 +373,7 @@ public: | @@ -373,7 +373,7 @@ public: | ||
373 | for(size_t xy = 0; xy < XY; xy++){ //for each pixel | 373 | for(size_t xy = 0; xy < XY; xy++){ //for each pixel |
374 | memset(spec, 0, Bb); //set the spectrum to zero | 374 | memset(spec, 0, Bb); //set the spectrum to zero |
375 | if(mask == NULL || mask[xy]){ //if the pixel is masked | 375 | if(mask == NULL || mask[xy]){ //if the pixel is masked |
376 | - len = 0; //initialize the | 376 | + len = 0; //initialize the |
377 | file.read((char*)spec, Bb); //read a spectrum | 377 | file.read((char*)spec, Bb); //read a spectrum |
378 | for(size_t b = 0; b < B; b++) //for each band | 378 | for(size_t b = 0; b < B; b++) //for each band |
379 | len += spec[b]*spec[b]; //add the square of the spectral band | 379 | len += spec[b]*spec[b]; //add the square of the spectral band |
@@ -385,7 +385,7 @@ public: | @@ -385,7 +385,7 @@ public: | ||
385 | file.seekg(Bb, std::ios::cur); //otherwise skip a spectrum | 385 | file.seekg(Bb, std::ios::cur); //otherwise skip a spectrum |
386 | target.write((char*)spec, Bb); //output the normalized spectrum | 386 | target.write((char*)spec, Bb); //output the normalized spectrum |
387 | if(PROGRESS) progress = (double)(xy + 1) / (double)XY * 100; //update the progress | 387 | if(PROGRESS) progress = (double)(xy + 1) / (double)XY * 100; //update the progress |
388 | - } | 388 | + } |
389 | } | 389 | } |
390 | 390 | ||
391 | 391 | ||
@@ -1088,6 +1088,233 @@ public: | @@ -1088,6 +1088,233 @@ public: | ||
1088 | return true; | 1088 | return true; |
1089 | } | 1089 | } |
1090 | 1090 | ||
1091 | + | ||
1092 | + #ifdef CUDA_FOUND | ||
1093 | + /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS | ||
1094 | + /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra | ||
1095 | + bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ | ||
1096 | + | ||
1097 | + cudaError_t cudaStat; | ||
1098 | + cublasStatus_t stat; | ||
1099 | + cublasHandle_t handle; | ||
1100 | + | ||
1101 | + progress = 0; //initialize the progress to zero (0) | ||
1102 | + unsigned long long XY = X() * Y(); //calculate the number of elements in a band image | ||
1103 | + unsigned long long B = Z(); //calculate the number of spectral elements | ||
1104 | + | ||
1105 | + double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file | ||
1106 | + double* s_dev; //declare a device pointer that will store the spectrum on the GPU | ||
1107 | + | ||
1108 | + double* s2_dev; // device pointer on the GPU | ||
1109 | + cudaStat = cudaMalloc(&s2_dev, B * sizeof(double)); // allocate space on the CUDA device | ||
1110 | + cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double)); // initialize s2_dev to zero (0) | ||
1111 | + | ||
1112 | + double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU | ||
1113 | + double* avg_dev; //declare a device pointer that will store the average spectrum | ||
1114 | + cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum | ||
1115 | + cudaStat = cudaMalloc(&A_dev, B * B * sizeof(double)); //allocate space on the CUDA device for the covariance matrix | ||
1116 | + cudaStat = cudaMemset(A_dev, 0, B * B * sizeof(double)); //initialize the covariance matrix to zero (0) | ||
1117 | + cudaStat = cudaMalloc(&avg_dev, B * sizeof(double)); //allocate space on the CUDA device for the average spectrum | ||
1118 | + stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device | ||
1119 | + | ||
1120 | + double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) | ||
1121 | + double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) | ||
1122 | + | ||
1123 | + stat = cublasCreate(&handle); //create a cuBLAS instance | ||
1124 | + if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid | ||
1125 | + printf ("CUBLAS initialization failed\n"); | ||
1126 | + return EXIT_FAILURE; | ||
1127 | + } | ||
1128 | + for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel | ||
1129 | + if (mask == NULL || mask[xy] != 0){ | ||
1130 | + pixeld(s, xy); //retreive the spectrum at the current xy pixel location | ||
1131 | + | ||
1132 | + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device | ||
1133 | + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1); //subtract the average spectrum | ||
1134 | + | ||
1135 | + cudaMemcpy(s2_dev, s_dev + 1 , (B-1) * sizeof(double), cudaMemcpyDeviceToDevice); //copy B-1 elements from shifted source data (s_dev) to device pointer (s2_dev ) | ||
1136 | + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1); //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them ) | ||
1137 | + | ||
1138 | + | ||
1139 | + stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B); //calculate the covariance matrix (symmetric outer product) | ||
1140 | + } | ||
1141 | + if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress | ||
1142 | + | ||
1143 | + } | ||
1144 | + | ||
1145 | + cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, coN, (int)B); //copy the result from the GPU to the CPU | ||
1146 | + | ||
1147 | + cudaFree(A_dev); //clean up allocated device memory | ||
1148 | + cudaFree(s_dev); | ||
1149 | + cudaFree(s2_dev); | ||
1150 | + cudaFree(avg_dev); | ||
1151 | + | ||
1152 | + for(unsigned long long i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion | ||
1153 | + for(unsigned long long j = i+1; j < B; j++){ | ||
1154 | + coN[B * i + j] = coN[B * j + i]; | ||
1155 | + } | ||
1156 | + } | ||
1157 | + | ||
1158 | + return true; | ||
1159 | + } | ||
1160 | +#endif | ||
1161 | + | ||
1162 | + /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision. | ||
1163 | + | ||
1164 | + /// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix | ||
1165 | + /// @param avg is a pointer to memory of size B that stores the average spectrum | ||
1166 | + /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location | ||
1167 | + bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ | ||
1168 | + | ||
1169 | +#ifdef CUDA_FOUND | ||
1170 | + int dev_count; | ||
1171 | + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices | ||
1172 | + cudaDeviceProp prop; | ||
1173 | + cudaGetDeviceProperties(&prop, 0); //get the property of the first device | ||
1174 | + if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator | ||
1175 | + return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | ||
1176 | +#endif | ||
1177 | + | ||
1178 | + | ||
1179 | + | ||
1180 | + progress = 0; | ||
1181 | + //memory allocation | ||
1182 | + unsigned long long XY = X() * Y(); | ||
1183 | + unsigned long long B = Z(); | ||
1184 | + T* temp = (T*)malloc(sizeof(T) * B); | ||
1185 | + | ||
1186 | + unsigned long long count = nnz(mask); //count the number of masked pixels | ||
1187 | + | ||
1188 | + //initialize covariance matrix of noise | ||
1189 | + memset(coN, 0, B * B * sizeof(double)); | ||
1190 | + | ||
1191 | + //calculate covariance matrix | ||
1192 | + double* coN_half = (double*) malloc(B * B * sizeof(double)); //allocate space for a higher-precision intermediate matrix | ||
1193 | + double* temp_precise = (double*) malloc(B * sizeof(double)); | ||
1194 | + memset(coN_half, 0, B * B * sizeof(double)); //initialize the high-precision matrix with zeros | ||
1195 | + unsigned long long idx; //stores i*B to speed indexing | ||
1196 | + for (unsigned long long xy = 0; xy < XY; xy++){ | ||
1197 | + if (mask == NULL || mask[xy] != 0){ | ||
1198 | + pixel(temp, xy); //retreive the spectrum at the current xy pixel location | ||
1199 | + for(unsigned long long b = 0; b < B; b++) //subtract the mean from this spectrum and increase the precision | ||
1200 | + temp_precise[b] = (double)temp[b] - (double)avg[b]; | ||
1201 | + | ||
1202 | + for(unsigned long long b2 = 0; b2 < B-1; b2++) //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them ) | ||
1203 | + temp_precise[b2] -= temp_precise[b2+1]; | ||
1204 | + | ||
1205 | + idx = 0; | ||
1206 | + for (unsigned long long b0 = 0; b0 < B; b0++){ //for each band | ||
1207 | + for (unsigned long long b1 = b0; b1 < B; b1++) | ||
1208 | + coN_half[idx++] += temp_precise[b0] * temp_precise[b1]; | ||
1209 | + } | ||
1210 | + } | ||
1211 | + if(PROGRESS) progress = (double)(xy+1) / XY * 100; | ||
1212 | + } | ||
1213 | + idx = 0; | ||
1214 | + for (unsigned long long i = 0; i < B; i++){ //copy the precision matrix to both halves of the output matrix | ||
1215 | + for (unsigned long long j = i; j < B; j++){ | ||
1216 | + coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double) count; | ||
1217 | + } | ||
1218 | + } | ||
1219 | + | ||
1220 | + free(temp); | ||
1221 | + free(temp_precise); | ||
1222 | + return true; | ||
1223 | + } | ||
1224 | + | ||
1225 | + #ifdef CUDA_FOUND | ||
1226 | + /// Project the spectra onto a set of basis functions | ||
1227 | + /// @param outfile is the name of the new binary output file that will be created | ||
1228 | + /// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering) | ||
1229 | + /// @param basis a set of basis vectors that the data set will be projected onto (after centering) | ||
1230 | + /// @param M is the number of basis vectors | ||
1231 | + /// @param mask is a character mask used to limit processing to valid pixels | ||
1232 | + bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){ | ||
1233 | + | ||
1234 | + cudaError_t cudaStat; | ||
1235 | + cublasStatus_t stat; | ||
1236 | + cublasHandle_t handle; | ||
1237 | + | ||
1238 | + std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file | ||
1239 | + //std::string headername = outfile + ".hdr"; //the header file name | ||
1240 | + | ||
1241 | + progress = 0; //initialize the progress to zero (0) | ||
1242 | + unsigned long long XY = X() * Y(); //calculate the number of elements in a band image | ||
1243 | + unsigned long long B = Z(); //calculate the number of spectral elements | ||
1244 | + | ||
1245 | + double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file | ||
1246 | + double* s_dev; //declare a device pointer that will store the spectrum on the GPU | ||
1247 | + cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum | ||
1248 | + | ||
1249 | + | ||
1250 | + double* basis_dev; // device pointer on the GPU | ||
1251 | + cudaStat = cudaMalloc(&basis_dev, M * B * sizeof(double)); // allocate space on the CUDA device | ||
1252 | + cudaStat = cudaMemset(basis_dev, 0, M * B * sizeof(double)); // initialize basis_dev to zero (0) | ||
1253 | + | ||
1254 | + | ||
1255 | + /// transposing basis matrix (because cuBLAS is column-major) | ||
1256 | + double *basis_Transposed = (double*)malloc(M * B * sizeof(double)); | ||
1257 | + memset(basis_Transposed, 0, M * B * sizeof(double)); | ||
1258 | + for (int i = 0; i<M; i++) | ||
1259 | + for (int j = 0; j<B; j++) | ||
1260 | + basis_Transposed[i+j*M] = basis[i*B+j]; | ||
1261 | + | ||
1262 | + stat = cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M); //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format) | ||
1263 | + | ||
1264 | + double* center_dev; //declare a device pointer that will store the center (average) | ||
1265 | + cudaStat = cudaMalloc(¢er_dev, B * sizeof(double)); //allocate space on the CUDA device for the center (average) | ||
1266 | + stat = cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1); //copy the center vector (average) to the CUDA device (from host to device) | ||
1267 | + | ||
1268 | + | ||
1269 | + double* A = (double*)malloc(sizeof(double) * M); //allocate space for the projected pixel on the host | ||
1270 | + double* A_dev; //declare a device pointer that will store the projected pixel on the GPU | ||
1271 | + cudaStat = cudaMalloc(&A_dev,M * sizeof(double)); //allocate space on the CUDA device for the projected pixel | ||
1272 | + cudaStat = cudaMemset(A_dev, 0,M * sizeof(double)); //initialize the projected pixel to zero (0) | ||
1273 | + | ||
1274 | + double axpy_alpha = -1; //multiplication factor for the center (in order to perform a subtraction) | ||
1275 | + double axpy_alpha2 = 1; //multiplication factor for the matrix-vector multiplication | ||
1276 | + double axpy_beta = 0; //multiplication factor for the matrix-vector multiplication (there is no second scalor) | ||
1277 | + | ||
1278 | + stat = cublasCreate(&handle); //create a cuBLAS instance | ||
1279 | + if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid | ||
1280 | + printf ("CUBLAS initialization failed\n"); | ||
1281 | + return EXIT_FAILURE; | ||
1282 | + } | ||
1283 | + | ||
1284 | + T* temp = (T*)malloc(sizeof(T) * M); //allocate space for the projected pixel to be written on the disc | ||
1285 | + | ||
1286 | + for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel | ||
1287 | + if (mask == NULL || mask[xy] != 0){ | ||
1288 | + pixeld(s, xy); //retreive the spectrum at the current xy pixel location | ||
1289 | + | ||
1290 | + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device | ||
1291 | + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1); //subtract the center (average) | ||
1292 | + stat = cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1); //performs the matrix-vector multiplication | ||
1293 | + stat = cublasGetVector((int)B, sizeof(double), A_dev, 1, A, 1); //copy the projected pixel to the host (from GPU to CPU) | ||
1294 | + | ||
1295 | + std::copy(A, A + M, temp); //casting projected pixel from double to whatever T is | ||
1296 | + | ||
1297 | + } | ||
1298 | + | ||
1299 | + target.write(reinterpret_cast<const char*>(temp), sizeof(T) * M); //write the projected vector | ||
1300 | + if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress | ||
1301 | + | ||
1302 | + } | ||
1303 | + | ||
1304 | + //clean up allocated device memory | ||
1305 | + cudaFree(A_dev); | ||
1306 | + cudaFree(s_dev); | ||
1307 | + cudaFree(basis_dev); | ||
1308 | + cudaFree(center_dev); | ||
1309 | + free(A); | ||
1310 | + free(s); | ||
1311 | + free(temp); | ||
1312 | + target.close(); //close the output file | ||
1313 | + | ||
1314 | + return true; | ||
1315 | + } | ||
1316 | +#endif | ||
1317 | + | ||
1091 | /// Project the spectra onto a set of basis functions | 1318 | /// Project the spectra onto a set of basis functions |
1092 | /// @param outfile is the name of the new binary output file that will be created | 1319 | /// @param outfile is the name of the new binary output file that will be created |
1093 | /// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering) | 1320 | /// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering) |
@@ -1096,6 +1323,14 @@ public: | @@ -1096,6 +1323,14 @@ public: | ||
1096 | /// @param mask is a character mask used to limit processing to valid pixels | 1323 | /// @param mask is a character mask used to limit processing to valid pixels |
1097 | bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){ | 1324 | bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){ |
1098 | 1325 | ||
1326 | +#ifdef CUDA_FOUND | ||
1327 | + int dev_count; | ||
1328 | + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices | ||
1329 | + cudaDeviceProp prop; | ||
1330 | + cudaGetDeviceProperties(&prop, 0); //get the property of the first device | ||
1331 | + if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator | ||
1332 | + return project_cublas(outfile,center,basis,M,mask,PROGRESS); //use cuBLAS to calculate the covariance matrix | ||
1333 | +#endif | ||
1099 | std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file | 1334 | std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file |
1100 | //std::string headername = outfile + ".hdr"; //the header file name | 1335 | //std::string headername = outfile + ".hdr"; //the header file name |
1101 | 1336 |
stim/envi/envi.h
@@ -1403,6 +1403,35 @@ public: | @@ -1403,6 +1403,35 @@ public: | ||
1403 | return false; | 1403 | return false; |
1404 | } | 1404 | } |
1405 | 1405 | ||
1406 | + /// Calculate the covariance of noise matrix for all masked pixels in the image. | ||
1407 | + | ||
1408 | + /// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix | ||
1409 | + /// @param avg is a pointer to memory of size B that stores the average spectrum | ||
1410 | + /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location | ||
1411 | + bool coNoise_matrix(double* coN, double* avg, unsigned char* mask, bool PROGRESS = false){ | ||
1412 | + if (header.interleave == envi_header::BSQ){ | ||
1413 | + std::cout<<"ERROR: calculating the covariance matrix of noise for a BSQ file is impractical; convert to BIP first"<<std::endl; | ||
1414 | + exit(1); | ||
1415 | + } | ||
1416 | + | ||
1417 | + | ||
1418 | + else if (header.interleave == envi_header::BIL){ | ||
1419 | + std::cout<<"ERROR: calculating the covariance matrix of noise for a BIL file is impractical; convert to BIP first"<<std::endl; | ||
1420 | + exit(1); | ||
1421 | + } | ||
1422 | + | ||
1423 | + else if (header.interleave == envi_header::BIP){ | ||
1424 | + if (header.data_type == envi_header::float32) | ||
1425 | + return ((bip<float>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS); | ||
1426 | + else if (header.data_type == envi_header::float64) | ||
1427 | + return ((bip<double>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS); | ||
1428 | + else{ | ||
1429 | + std::cout << "ERROR: unidentified data type" << std::endl; | ||
1430 | + exit(1); | ||
1431 | + } | ||
1432 | + } | ||
1433 | + return false; | ||
1434 | + } | ||
1406 | 1435 | ||
1407 | /// Crop a region of the image and save it to a new file. | 1436 | /// Crop a region of the image and save it to a new file. |
1408 | 1437 |