Commit 6abbfbdc15fb4c36f1fbbc04d18e1ad419f25476
Merge branch 'sam'
Showing
2 changed files
with
266 additions
and
2 deletions
Show diff stats
stim/envi/bip.h
... | ... | @@ -373,7 +373,7 @@ public: |
373 | 373 | for(size_t xy = 0; xy < XY; xy++){ //for each pixel |
374 | 374 | memset(spec, 0, Bb); //set the spectrum to zero |
375 | 375 | if(mask == NULL || mask[xy]){ //if the pixel is masked |
376 | - len = 0; //initialize the | |
376 | + len = 0; //initialize the | |
377 | 377 | file.read((char*)spec, Bb); //read a spectrum |
378 | 378 | for(size_t b = 0; b < B; b++) //for each band |
379 | 379 | len += spec[b]*spec[b]; //add the square of the spectral band |
... | ... | @@ -385,7 +385,7 @@ public: |
385 | 385 | file.seekg(Bb, std::ios::cur); //otherwise skip a spectrum |
386 | 386 | target.write((char*)spec, Bb); //output the normalized spectrum |
387 | 387 | if(PROGRESS) progress = (double)(xy + 1) / (double)XY * 100; //update the progress |
388 | - } | |
388 | + } | |
389 | 389 | } |
390 | 390 | |
391 | 391 | |
... | ... | @@ -1088,6 +1088,233 @@ public: |
1088 | 1088 | return true; |
1089 | 1089 | } |
1090 | 1090 | |
1091 | + | |
1092 | + #ifdef CUDA_FOUND | |
1093 | + /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS | |
1094 | + /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra | |
1095 | + bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ | |
1096 | + | |
1097 | + cudaError_t cudaStat; | |
1098 | + cublasStatus_t stat; | |
1099 | + cublasHandle_t handle; | |
1100 | + | |
1101 | + progress = 0; //initialize the progress to zero (0) | |
1102 | + unsigned long long XY = X() * Y(); //calculate the number of elements in a band image | |
1103 | + unsigned long long B = Z(); //calculate the number of spectral elements | |
1104 | + | |
1105 | + double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file | |
1106 | + double* s_dev; //declare a device pointer that will store the spectrum on the GPU | |
1107 | + | |
1108 | + double* s2_dev; // device pointer on the GPU | |
1109 | + cudaStat = cudaMalloc(&s2_dev, B * sizeof(double)); // allocate space on the CUDA device | |
1110 | + cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double)); // initialize s2_dev to zero (0) | |
1111 | + | |
1112 | + double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU | |
1113 | + double* avg_dev; //declare a device pointer that will store the average spectrum | |
1114 | + cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum | |
1115 | + cudaStat = cudaMalloc(&A_dev, B * B * sizeof(double)); //allocate space on the CUDA device for the covariance matrix | |
1116 | + cudaStat = cudaMemset(A_dev, 0, B * B * sizeof(double)); //initialize the covariance matrix to zero (0) | |
1117 | + cudaStat = cudaMalloc(&avg_dev, B * sizeof(double)); //allocate space on the CUDA device for the average spectrum | |
1118 | + stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device | |
1119 | + | |
1120 | + double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) | |
1121 | + double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) | |
1122 | + | |
1123 | + stat = cublasCreate(&handle); //create a cuBLAS instance | |
1124 | + if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid | |
1125 | + printf ("CUBLAS initialization failed\n"); | |
1126 | + return EXIT_FAILURE; | |
1127 | + } | |
1128 | + for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel | |
1129 | + if (mask == NULL || mask[xy] != 0){ | |
1130 | + pixeld(s, xy); //retreive the spectrum at the current xy pixel location | |
1131 | + | |
1132 | + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device | |
1133 | + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1); //subtract the average spectrum | |
1134 | + | |
1135 | + cudaMemcpy(s2_dev, s_dev + 1 , (B-1) * sizeof(double), cudaMemcpyDeviceToDevice); //copy B-1 elements from shifted source data (s_dev) to device pointer (s2_dev ) | |
1136 | + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1); //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them ) | |
1137 | + | |
1138 | + | |
1139 | + stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B); //calculate the covariance matrix (symmetric outer product) | |
1140 | + } | |
1141 | + if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress | |
1142 | + | |
1143 | + } | |
1144 | + | |
1145 | + cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, coN, (int)B); //copy the result from the GPU to the CPU | |
1146 | + | |
1147 | + cudaFree(A_dev); //clean up allocated device memory | |
1148 | + cudaFree(s_dev); | |
1149 | + cudaFree(s2_dev); | |
1150 | + cudaFree(avg_dev); | |
1151 | + | |
1152 | + for(unsigned long long i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion | |
1153 | + for(unsigned long long j = i+1; j < B; j++){ | |
1154 | + coN[B * i + j] = coN[B * j + i]; | |
1155 | + } | |
1156 | + } | |
1157 | + | |
1158 | + return true; | |
1159 | + } | |
1160 | +#endif | |
1161 | + | |
1162 | + /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision. | |
1163 | + | |
1164 | + /// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix | |
1165 | + /// @param avg is a pointer to memory of size B that stores the average spectrum | |
1166 | + /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location | |
1167 | + bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ | |
1168 | + | |
1169 | +#ifdef CUDA_FOUND | |
1170 | + int dev_count; | |
1171 | + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices | |
1172 | + cudaDeviceProp prop; | |
1173 | + cudaGetDeviceProperties(&prop, 0); //get the property of the first device | |
1174 | + if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator | |
1175 | + return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | |
1176 | +#endif | |
1177 | + | |
1178 | + | |
1179 | + | |
1180 | + progress = 0; | |
1181 | + //memory allocation | |
1182 | + unsigned long long XY = X() * Y(); | |
1183 | + unsigned long long B = Z(); | |
1184 | + T* temp = (T*)malloc(sizeof(T) * B); | |
1185 | + | |
1186 | + unsigned long long count = nnz(mask); //count the number of masked pixels | |
1187 | + | |
1188 | + //initialize covariance matrix of noise | |
1189 | + memset(coN, 0, B * B * sizeof(double)); | |
1190 | + | |
1191 | + //calculate covariance matrix | |
1192 | + double* coN_half = (double*) malloc(B * B * sizeof(double)); //allocate space for a higher-precision intermediate matrix | |
1193 | + double* temp_precise = (double*) malloc(B * sizeof(double)); | |
1194 | + memset(coN_half, 0, B * B * sizeof(double)); //initialize the high-precision matrix with zeros | |
1195 | + unsigned long long idx; //stores i*B to speed indexing | |
1196 | + for (unsigned long long xy = 0; xy < XY; xy++){ | |
1197 | + if (mask == NULL || mask[xy] != 0){ | |
1198 | + pixel(temp, xy); //retreive the spectrum at the current xy pixel location | |
1199 | + for(unsigned long long b = 0; b < B; b++) //subtract the mean from this spectrum and increase the precision | |
1200 | + temp_precise[b] = (double)temp[b] - (double)avg[b]; | |
1201 | + | |
1202 | + for(unsigned long long b2 = 0; b2 < B-1; b2++) //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them ) | |
1203 | + temp_precise[b2] -= temp_precise[b2+1]; | |
1204 | + | |
1205 | + idx = 0; | |
1206 | + for (unsigned long long b0 = 0; b0 < B; b0++){ //for each band | |
1207 | + for (unsigned long long b1 = b0; b1 < B; b1++) | |
1208 | + coN_half[idx++] += temp_precise[b0] * temp_precise[b1]; | |
1209 | + } | |
1210 | + } | |
1211 | + if(PROGRESS) progress = (double)(xy+1) / XY * 100; | |
1212 | + } | |
1213 | + idx = 0; | |
1214 | + for (unsigned long long i = 0; i < B; i++){ //copy the precision matrix to both halves of the output matrix | |
1215 | + for (unsigned long long j = i; j < B; j++){ | |
1216 | + coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double) count; | |
1217 | + } | |
1218 | + } | |
1219 | + | |
1220 | + free(temp); | |
1221 | + free(temp_precise); | |
1222 | + return true; | |
1223 | + } | |
1224 | + | |
1225 | + #ifdef CUDA_FOUND | |
1226 | + /// Project the spectra onto a set of basis functions | |
1227 | + /// @param outfile is the name of the new binary output file that will be created | |
1228 | + /// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering) | |
1229 | + /// @param basis a set of basis vectors that the data set will be projected onto (after centering) | |
1230 | + /// @param M is the number of basis vectors | |
1231 | + /// @param mask is a character mask used to limit processing to valid pixels | |
1232 | + bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){ | |
1233 | + | |
1234 | + cudaError_t cudaStat; | |
1235 | + cublasStatus_t stat; | |
1236 | + cublasHandle_t handle; | |
1237 | + | |
1238 | + std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file | |
1239 | + //std::string headername = outfile + ".hdr"; //the header file name | |
1240 | + | |
1241 | + progress = 0; //initialize the progress to zero (0) | |
1242 | + unsigned long long XY = X() * Y(); //calculate the number of elements in a band image | |
1243 | + unsigned long long B = Z(); //calculate the number of spectral elements | |
1244 | + | |
1245 | + double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file | |
1246 | + double* s_dev; //declare a device pointer that will store the spectrum on the GPU | |
1247 | + cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum | |
1248 | + | |
1249 | + | |
1250 | + double* basis_dev; // device pointer on the GPU | |
1251 | + cudaStat = cudaMalloc(&basis_dev, M * B * sizeof(double)); // allocate space on the CUDA device | |
1252 | + cudaStat = cudaMemset(basis_dev, 0, M * B * sizeof(double)); // initialize basis_dev to zero (0) | |
1253 | + | |
1254 | + | |
1255 | + /// transposing basis matrix (because cuBLAS is column-major) | |
1256 | + double *basis_Transposed = (double*)malloc(M * B * sizeof(double)); | |
1257 | + memset(basis_Transposed, 0, M * B * sizeof(double)); | |
1258 | + for (int i = 0; i<M; i++) | |
1259 | + for (int j = 0; j<B; j++) | |
1260 | + basis_Transposed[i+j*M] = basis[i*B+j]; | |
1261 | + | |
1262 | + stat = cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M); //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format) | |
1263 | + | |
1264 | + double* center_dev; //declare a device pointer that will store the center (average) | |
1265 | + cudaStat = cudaMalloc(¢er_dev, B * sizeof(double)); //allocate space on the CUDA device for the center (average) | |
1266 | + stat = cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1); //copy the center vector (average) to the CUDA device (from host to device) | |
1267 | + | |
1268 | + | |
1269 | + double* A = (double*)malloc(sizeof(double) * M); //allocate space for the projected pixel on the host | |
1270 | + double* A_dev; //declare a device pointer that will store the projected pixel on the GPU | |
1271 | + cudaStat = cudaMalloc(&A_dev,M * sizeof(double)); //allocate space on the CUDA device for the projected pixel | |
1272 | + cudaStat = cudaMemset(A_dev, 0,M * sizeof(double)); //initialize the projected pixel to zero (0) | |
1273 | + | |
1274 | + double axpy_alpha = -1; //multiplication factor for the center (in order to perform a subtraction) | |
1275 | + double axpy_alpha2 = 1; //multiplication factor for the matrix-vector multiplication | |
1276 | + double axpy_beta = 0; //multiplication factor for the matrix-vector multiplication (there is no second scalor) | |
1277 | + | |
1278 | + stat = cublasCreate(&handle); //create a cuBLAS instance | |
1279 | + if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid | |
1280 | + printf ("CUBLAS initialization failed\n"); | |
1281 | + return EXIT_FAILURE; | |
1282 | + } | |
1283 | + | |
1284 | + T* temp = (T*)malloc(sizeof(T) * M); //allocate space for the projected pixel to be written on the disc | |
1285 | + | |
1286 | + for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel | |
1287 | + if (mask == NULL || mask[xy] != 0){ | |
1288 | + pixeld(s, xy); //retreive the spectrum at the current xy pixel location | |
1289 | + | |
1290 | + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device | |
1291 | + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1); //subtract the center (average) | |
1292 | + stat = cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1); //performs the matrix-vector multiplication | |
1293 | + stat = cublasGetVector((int)B, sizeof(double), A_dev, 1, A, 1); //copy the projected pixel to the host (from GPU to CPU) | |
1294 | + | |
1295 | + std::copy(A, A + M, temp); //casting projected pixel from double to whatever T is | |
1296 | + | |
1297 | + } | |
1298 | + | |
1299 | + target.write(reinterpret_cast<const char*>(temp), sizeof(T) * M); //write the projected vector | |
1300 | + if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress | |
1301 | + | |
1302 | + } | |
1303 | + | |
1304 | + //clean up allocated device memory | |
1305 | + cudaFree(A_dev); | |
1306 | + cudaFree(s_dev); | |
1307 | + cudaFree(basis_dev); | |
1308 | + cudaFree(center_dev); | |
1309 | + free(A); | |
1310 | + free(s); | |
1311 | + free(temp); | |
1312 | + target.close(); //close the output file | |
1313 | + | |
1314 | + return true; | |
1315 | + } | |
1316 | +#endif | |
1317 | + | |
1091 | 1318 | /// Project the spectra onto a set of basis functions |
1092 | 1319 | /// @param outfile is the name of the new binary output file that will be created |
1093 | 1320 | /// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering) |
... | ... | @@ -1096,6 +1323,14 @@ public: |
1096 | 1323 | /// @param mask is a character mask used to limit processing to valid pixels |
1097 | 1324 | bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){ |
1098 | 1325 | |
1326 | +#ifdef CUDA_FOUND | |
1327 | + int dev_count; | |
1328 | + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices | |
1329 | + cudaDeviceProp prop; | |
1330 | + cudaGetDeviceProperties(&prop, 0); //get the property of the first device | |
1331 | + if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator | |
1332 | + return project_cublas(outfile,center,basis,M,mask,PROGRESS); //use cuBLAS to calculate the covariance matrix | |
1333 | +#endif | |
1099 | 1334 | std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file |
1100 | 1335 | //std::string headername = outfile + ".hdr"; //the header file name |
1101 | 1336 | ... | ... |
stim/envi/envi.h
... | ... | @@ -1403,6 +1403,35 @@ public: |
1403 | 1403 | return false; |
1404 | 1404 | } |
1405 | 1405 | |
1406 | + /// Calculate the covariance of noise matrix for all masked pixels in the image. | |
1407 | + | |
1408 | + /// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix | |
1409 | + /// @param avg is a pointer to memory of size B that stores the average spectrum | |
1410 | + /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location | |
1411 | + bool coNoise_matrix(double* coN, double* avg, unsigned char* mask, bool PROGRESS = false){ | |
1412 | + if (header.interleave == envi_header::BSQ){ | |
1413 | + std::cout<<"ERROR: calculating the covariance matrix of noise for a BSQ file is impractical; convert to BIP first"<<std::endl; | |
1414 | + exit(1); | |
1415 | + } | |
1416 | + | |
1417 | + | |
1418 | + else if (header.interleave == envi_header::BIL){ | |
1419 | + std::cout<<"ERROR: calculating the covariance matrix of noise for a BIL file is impractical; convert to BIP first"<<std::endl; | |
1420 | + exit(1); | |
1421 | + } | |
1422 | + | |
1423 | + else if (header.interleave == envi_header::BIP){ | |
1424 | + if (header.data_type == envi_header::float32) | |
1425 | + return ((bip<float>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS); | |
1426 | + else if (header.data_type == envi_header::float64) | |
1427 | + return ((bip<double>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS); | |
1428 | + else{ | |
1429 | + std::cout << "ERROR: unidentified data type" << std::endl; | |
1430 | + exit(1); | |
1431 | + } | |
1432 | + } | |
1433 | + return false; | |
1434 | + } | |
1406 | 1435 | |
1407 | 1436 | /// Crop a region of the image and save it to a new file. |
1408 | 1437 | ... | ... |