Commit 334547d8a4ebc1de16f52a6d48451d11de0781b1

Authored by sam
1 parent 65c0cc85

changes for mnf: adding covariance of noise function in cpu and gpu also a gpu …

…version of project function
Showing 2 changed files with 266 additions and 2 deletions   Show diff stats
stim/envi/bip.h
... ... @@ -373,7 +373,7 @@ public:
373 373 for(size_t xy = 0; xy < XY; xy++){ //for each pixel
374 374 memset(spec, 0, Bb); //set the spectrum to zero
375 375 if(mask == NULL || mask[xy]){ //if the pixel is masked
376   - len = 0; //initialize the
  376 + len = 0; //initialize the
377 377 file.read((char*)spec, Bb); //read a spectrum
378 378 for(size_t b = 0; b < B; b++) //for each band
379 379 len += spec[b]*spec[b]; //add the square of the spectral band
... ... @@ -385,7 +385,7 @@ public:
385 385 file.seekg(Bb, std::ios::cur); //otherwise skip a spectrum
386 386 target.write((char*)spec, Bb); //output the normalized spectrum
387 387 if(PROGRESS) progress = (double)(xy + 1) / (double)XY * 100; //update the progress
388   - }
  388 + }
389 389 }
390 390  
391 391  
... ... @@ -1088,6 +1088,233 @@ public:
1088 1088 return true;
1089 1089 }
1090 1090  
  1091 +
  1092 + #ifdef CUDA_FOUND
  1093 + /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS
  1094 + /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
  1095 + bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
  1096 +
  1097 + cudaError_t cudaStat;
  1098 + cublasStatus_t stat;
  1099 + cublasHandle_t handle;
  1100 +
  1101 + progress = 0; //initialize the progress to zero (0)
  1102 + unsigned long long XY = X() * Y(); //calculate the number of elements in a band image
  1103 + unsigned long long B = Z(); //calculate the number of spectral elements
  1104 +
  1105 + double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file
  1106 + double* s_dev; //declare a device pointer that will store the spectrum on the GPU
  1107 +
  1108 + double* s2_dev; // device pointer on the GPU
  1109 + cudaStat = cudaMalloc(&s2_dev, B * sizeof(double)); // allocate space on the CUDA device
  1110 + cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double)); // initialize s2_dev to zero (0)
  1111 +
  1112 + double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU
  1113 + double* avg_dev; //declare a device pointer that will store the average spectrum
  1114 + cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum
  1115 + cudaStat = cudaMalloc(&A_dev, B * B * sizeof(double)); //allocate space on the CUDA device for the covariance matrix
  1116 + cudaStat = cudaMemset(A_dev, 0, B * B * sizeof(double)); //initialize the covariance matrix to zero (0)
  1117 + cudaStat = cudaMalloc(&avg_dev, B * sizeof(double)); //allocate space on the CUDA device for the average spectrum
  1118 + stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device
  1119 +
  1120 + double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product)
  1121 + double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction)
  1122 +
  1123 + stat = cublasCreate(&handle); //create a cuBLAS instance
  1124 + if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid
  1125 + printf ("CUBLAS initialization failed\n");
  1126 + return EXIT_FAILURE;
  1127 + }
  1128 + for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel
  1129 + if (mask == NULL || mask[xy] != 0){
  1130 + pixeld(s, xy); //retreive the spectrum at the current xy pixel location
  1131 +
  1132 + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device
  1133 + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1); //subtract the average spectrum
  1134 +
  1135 + cudaMemcpy(s2_dev, s_dev + 1 , (B-1) * sizeof(double), cudaMemcpyDeviceToDevice); //copy B-1 elements from shifted source data (s_dev) to device pointer (s2_dev )
  1136 + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1); //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them )
  1137 +
  1138 +
  1139 + stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B); //calculate the covariance matrix (symmetric outer product)
  1140 + }
  1141 + if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress
  1142 +
  1143 + }
  1144 +
  1145 + cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, coN, (int)B); //copy the result from the GPU to the CPU
  1146 +
  1147 + cudaFree(A_dev); //clean up allocated device memory
  1148 + cudaFree(s_dev);
  1149 + cudaFree(s2_dev);
  1150 + cudaFree(avg_dev);
  1151 +
  1152 + for(unsigned long long i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion
  1153 + for(unsigned long long j = i+1; j < B; j++){
  1154 + coN[B * i + j] = coN[B * j + i];
  1155 + }
  1156 + }
  1157 +
  1158 + return true;
  1159 + }
  1160 +#endif
  1161 +
  1162 + /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision.
  1163 +
  1164 + /// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
  1165 + /// @param avg is a pointer to memory of size B that stores the average spectrum
  1166 + /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
  1167 + bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
  1168 +
  1169 +#ifdef CUDA_FOUND
  1170 + int dev_count;
  1171 + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices
  1172 + cudaDeviceProp prop;
  1173 + cudaGetDeviceProperties(&prop, 0); //get the property of the first device
  1174 + if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator
  1175 + return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
  1176 +#endif
  1177 +
  1178 +
  1179 +
  1180 + progress = 0;
  1181 + //memory allocation
  1182 + unsigned long long XY = X() * Y();
  1183 + unsigned long long B = Z();
  1184 + T* temp = (T*)malloc(sizeof(T) * B);
  1185 +
  1186 + unsigned long long count = nnz(mask); //count the number of masked pixels
  1187 +
  1188 + //initialize covariance matrix of noise
  1189 + memset(coN, 0, B * B * sizeof(double));
  1190 +
  1191 + //calculate covariance matrix
  1192 + double* coN_half = (double*) malloc(B * B * sizeof(double)); //allocate space for a higher-precision intermediate matrix
  1193 + double* temp_precise = (double*) malloc(B * sizeof(double));
  1194 + memset(coN_half, 0, B * B * sizeof(double)); //initialize the high-precision matrix with zeros
  1195 + unsigned long long idx; //stores i*B to speed indexing
  1196 + for (unsigned long long xy = 0; xy < XY; xy++){
  1197 + if (mask == NULL || mask[xy] != 0){
  1198 + pixel(temp, xy); //retreive the spectrum at the current xy pixel location
  1199 + for(unsigned long long b = 0; b < B; b++) //subtract the mean from this spectrum and increase the precision
  1200 + temp_precise[b] = (double)temp[b] - (double)avg[b];
  1201 +
  1202 + for(unsigned long long b2 = 0; b2 < B-1; b2++) //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them )
  1203 + temp_precise[b2] -= temp_precise[b2+1];
  1204 +
  1205 + idx = 0;
  1206 + for (unsigned long long b0 = 0; b0 < B; b0++){ //for each band
  1207 + for (unsigned long long b1 = b0; b1 < B; b1++)
  1208 + coN_half[idx++] += temp_precise[b0] * temp_precise[b1];
  1209 + }
  1210 + }
  1211 + if(PROGRESS) progress = (double)(xy+1) / XY * 100;
  1212 + }
  1213 + idx = 0;
  1214 + for (unsigned long long i = 0; i < B; i++){ //copy the precision matrix to both halves of the output matrix
  1215 + for (unsigned long long j = i; j < B; j++){
  1216 + coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double) count;
  1217 + }
  1218 + }
  1219 +
  1220 + free(temp);
  1221 + free(temp_precise);
  1222 + return true;
  1223 + }
  1224 +
  1225 + #ifdef CUDA_FOUND
  1226 + /// Project the spectra onto a set of basis functions
  1227 + /// @param outfile is the name of the new binary output file that will be created
  1228 + /// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering)
  1229 + /// @param basis a set of basis vectors that the data set will be projected onto (after centering)
  1230 + /// @param M is the number of basis vectors
  1231 + /// @param mask is a character mask used to limit processing to valid pixels
  1232 + bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){
  1233 +
  1234 + cudaError_t cudaStat;
  1235 + cublasStatus_t stat;
  1236 + cublasHandle_t handle;
  1237 +
  1238 + std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file
  1239 + //std::string headername = outfile + ".hdr"; //the header file name
  1240 +
  1241 + progress = 0; //initialize the progress to zero (0)
  1242 + unsigned long long XY = X() * Y(); //calculate the number of elements in a band image
  1243 + unsigned long long B = Z(); //calculate the number of spectral elements
  1244 +
  1245 + double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file
  1246 + double* s_dev; //declare a device pointer that will store the spectrum on the GPU
  1247 + cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum
  1248 +
  1249 +
  1250 + double* basis_dev; // device pointer on the GPU
  1251 + cudaStat = cudaMalloc(&basis_dev, M * B * sizeof(double)); // allocate space on the CUDA device
  1252 + cudaStat = cudaMemset(basis_dev, 0, M * B * sizeof(double)); // initialize basis_dev to zero (0)
  1253 +
  1254 +
  1255 + /// transposing basis matrix (because cuBLAS is column-major)
  1256 + double *basis_Transposed = (double*)malloc(M * B * sizeof(double));
  1257 + memset(basis_Transposed, 0, M * B * sizeof(double));
  1258 + for (int i = 0; i<M; i++)
  1259 + for (int j = 0; j<B; j++)
  1260 + basis_Transposed[i+j*M] = basis[i*B+j];
  1261 +
  1262 + stat = cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M); //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format)
  1263 +
  1264 + double* center_dev; //declare a device pointer that will store the center (average)
  1265 + cudaStat = cudaMalloc(&center_dev, B * sizeof(double)); //allocate space on the CUDA device for the center (average)
  1266 + stat = cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1); //copy the center vector (average) to the CUDA device (from host to device)
  1267 +
  1268 +
  1269 + double* A = (double*)malloc(sizeof(double) * M); //allocate space for the projected pixel on the host
  1270 + double* A_dev; //declare a device pointer that will store the projected pixel on the GPU
  1271 + cudaStat = cudaMalloc(&A_dev,M * sizeof(double)); //allocate space on the CUDA device for the projected pixel
  1272 + cudaStat = cudaMemset(A_dev, 0,M * sizeof(double)); //initialize the projected pixel to zero (0)
  1273 +
  1274 + double axpy_alpha = -1; //multiplication factor for the center (in order to perform a subtraction)
  1275 + double axpy_alpha2 = 1; //multiplication factor for the matrix-vector multiplication
  1276 + double axpy_beta = 0; //multiplication factor for the matrix-vector multiplication (there is no second scalor)
  1277 +
  1278 + stat = cublasCreate(&handle); //create a cuBLAS instance
  1279 + if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid
  1280 + printf ("CUBLAS initialization failed\n");
  1281 + return EXIT_FAILURE;
  1282 + }
  1283 +
  1284 + T* temp = (T*)malloc(sizeof(T) * M); //allocate space for the projected pixel to be written on the disc
  1285 +
  1286 + for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel
  1287 + if (mask == NULL || mask[xy] != 0){
  1288 + pixeld(s, xy); //retreive the spectrum at the current xy pixel location
  1289 +
  1290 + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device
  1291 + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1); //subtract the center (average)
  1292 + stat = cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1); //performs the matrix-vector multiplication
  1293 + stat = cublasGetVector((int)B, sizeof(double), A_dev, 1, A, 1); //copy the projected pixel to the host (from GPU to CPU)
  1294 +
  1295 + std::copy(A, A + M, temp); //casting projected pixel from double to whatever T is
  1296 +
  1297 + }
  1298 +
  1299 + target.write(reinterpret_cast<const char*>(temp), sizeof(T) * M); //write the projected vector
  1300 + if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress
  1301 +
  1302 + }
  1303 +
  1304 + //clean up allocated device memory
  1305 + cudaFree(A_dev);
  1306 + cudaFree(s_dev);
  1307 + cudaFree(basis_dev);
  1308 + cudaFree(center_dev);
  1309 + free(A);
  1310 + free(s);
  1311 + free(temp);
  1312 + target.close(); //close the output file
  1313 +
  1314 + return true;
  1315 + }
  1316 +#endif
  1317 +
1091 1318 /// Project the spectra onto a set of basis functions
1092 1319 /// @param outfile is the name of the new binary output file that will be created
1093 1320 /// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering)
... ... @@ -1096,6 +1323,14 @@ public:
1096 1323 /// @param mask is a character mask used to limit processing to valid pixels
1097 1324 bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){
1098 1325  
  1326 +#ifdef CUDA_FOUND
  1327 + int dev_count;
  1328 + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices
  1329 + cudaDeviceProp prop;
  1330 + cudaGetDeviceProperties(&prop, 0); //get the property of the first device
  1331 + if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator
  1332 + return project_cublas(outfile,center,basis,M,mask,PROGRESS); //use cuBLAS to calculate the covariance matrix
  1333 +#endif
1099 1334 std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file
1100 1335 //std::string headername = outfile + ".hdr"; //the header file name
1101 1336  
... ...
stim/envi/envi.h
... ... @@ -1403,6 +1403,35 @@ public:
1403 1403 return false;
1404 1404 }
1405 1405  
  1406 + /// Calculate the covariance of noise matrix for all masked pixels in the image.
  1407 +
  1408 + /// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
  1409 + /// @param avg is a pointer to memory of size B that stores the average spectrum
  1410 + /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
  1411 + bool coNoise_matrix(double* coN, double* avg, unsigned char* mask, bool PROGRESS = false){
  1412 + if (header.interleave == envi_header::BSQ){
  1413 + std::cout<<"ERROR: calculating the covariance matrix of noise for a BSQ file is impractical; convert to BIP first"<<std::endl;
  1414 + exit(1);
  1415 + }
  1416 +
  1417 +
  1418 + else if (header.interleave == envi_header::BIL){
  1419 + std::cout<<"ERROR: calculating the covariance matrix of noise for a BIL file is impractical; convert to BIP first"<<std::endl;
  1420 + exit(1);
  1421 + }
  1422 +
  1423 + else if (header.interleave == envi_header::BIP){
  1424 + if (header.data_type == envi_header::float32)
  1425 + return ((bip<float>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS);
  1426 + else if (header.data_type == envi_header::float64)
  1427 + return ((bip<double>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS);
  1428 + else{
  1429 + std::cout << "ERROR: unidentified data type" << std::endl;
  1430 + exit(1);
  1431 + }
  1432 + }
  1433 + return false;
  1434 + }
1406 1435  
1407 1436 /// Crop a region of the image and save it to a new file.
1408 1437  
... ...