Commit 1ee79b845d9875823391b942c0c99226e4b90d16

Authored by David Mayerich
1 parent f7cf19e4

more accurate noise covariance calculation with MNF

Showing 1 changed file with 51 additions and 35 deletions   Show diff stats
stim/envi/bip.h
... ... @@ -1107,10 +1107,7 @@ public:
1107 1107 }
1108 1108  
1109 1109  
1110   -//#ifdef CUDA_FOUND
1111   - /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS
1112   - /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
1113   - int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
  1110 + int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false) {
1114 1111  
1115 1112 cudaError_t cudaStat;
1116 1113 cublasStatus_t stat;
... ... @@ -1123,9 +1120,10 @@ public:
1123 1120 double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file
1124 1121 double* s_dev; //declare a device pointer that will store the spectrum on the GPU
1125 1122  
1126   - double* s2_dev; // device pointer on the GPU
1127   - cudaStat = cudaMalloc(&s2_dev, B * sizeof(double)); // allocate space on the CUDA device
1128   - cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double)); // initialize s2_dev to zero (0)
  1123 + double* s2 = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum of second pixel that will be pulled from the file
  1124 + double* s2_dev; // device pointer on the GPU
  1125 + cudaStat = cudaMalloc(&s2_dev, B * sizeof(double)); // allocate space on the CUDA device
  1126 + cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double)); // initialize s2_dev to zero (0)
1129 1127  
1130 1128 double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU
1131 1129 double* avg_dev; //declare a device pointer that will store the average spectrum
... ... @@ -1135,26 +1133,32 @@ public:
1135 1133 cudaStat = cudaMalloc(&avg_dev, B * sizeof(double)); //allocate space on the CUDA device for the average spectrum
1136 1134 stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device
1137 1135  
1138   - double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product)
  1136 + double ger_alpha = 1.0 / (double)XY; //scale the outer product by the inverse of the number of samples (mean outer product)
1139 1137 double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction)
1140 1138  
1141 1139 CUBLAS_HANDLE_ERROR(cublasCreate(&handle)); //create a cuBLAS instance
1142 1140 if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid
1143 1141  
1144   - for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel
1145   - if (mask == NULL || mask[xy] != 0){
1146   - pixeld(s, xy); //retreive the spectrum at the current xy pixel location
1147   -
1148   - stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device
  1142 + for (unsigned long long xy = 0; xy < XY; xy++) { //for each pixel
  1143 + if (mask == NULL || mask[xy] != 0) {
  1144 + pixeld(s, xy); //retreive the spectrum at the current xy pixel location
  1145 + if (xy < XY - X()) {
  1146 + pixeld(s2, xy + X()); //retreive the spectrum at the current xy+X pixel location, which is adjacent (bellow) to the pixel at xy location (in y direction)
  1147 + }
  1148 + else {
  1149 + pixeld(s2, xy - X()); //for the last row we consider the the adjacent pixel which is located above pixel xy
  1150 + }
  1151 + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum of first pixel from the host to the device
1149 1152 stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1); //subtract the average spectrum
1150 1153  
1151   - cudaMemcpy(s2_dev, s_dev + 1 , (B-1) * sizeof(double), cudaMemcpyDeviceToDevice); //copy B-1 elements from shifted source data (s_dev) to device pointer (s2_dev )
1152   - stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1); //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them )
  1154 + stat = cublasSetVector((int)B, sizeof(double), s2, 1, s2_dev, 1); //copy the spectrum of second pixel from the host to the device
  1155 + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s2_dev, 1); //subtract the average spectrum
1153 1156  
  1157 + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1); //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (in y direction)
1154 1158  
1155 1159 stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B); //calculate the covariance matrix (symmetric outer product)
1156 1160 }
1157   - if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress
  1161 + if (PROGRESS) progress = (double)(xy + 1) / XY * 100; //record the current progress
1158 1162  
1159 1163 }
1160 1164  
... ... @@ -1165,22 +1169,22 @@ public:
1165 1169 cudaFree(s2_dev);
1166 1170 cudaFree(avg_dev);
1167 1171  
1168   - for(unsigned long long i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion
1169   - for(unsigned long long j = i+1; j < B; j++){
  1172 + for (unsigned long long i = 0; i < B; i++) { //copy the upper triangular portion to the lower triangular portion
  1173 + for (unsigned long long j = i + 1; j < B; j++) {
1170 1174 coN[B * i + j] = coN[B * j + i];
1171 1175 }
1172 1176 }
1173 1177  
1174 1178 return 0;
1175 1179 }
1176   -//#endif
  1180 + //#endif
1177 1181  
1178 1182 /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision.
1179 1183  
1180 1184 /// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
1181 1185 /// @param avg is a pointer to memory of size B that stores the average spectrum
1182 1186 /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
1183   - bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false){
  1187 + bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false) {
1184 1188  
1185 1189 if (cuda_device >= 0) { //if a CUDA device is specified
1186 1190 int dev_count;
... ... @@ -1194,7 +1198,7 @@ public:
1194 1198 int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
1195 1199 if (status == 0) return true; //if the cuBLAS function returned correctly, we're done
1196 1200 }
1197   - } //otherwise continue using the CPU
  1201 + } //otherwise continue using the CPU
1198 1202 std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl;
1199 1203 }
1200 1204  
... ... @@ -1203,43 +1207,55 @@ public:
1203 1207 unsigned long long XY = X() * Y();
1204 1208 unsigned long long B = Z();
1205 1209 T* temp = (T*)malloc(sizeof(T) * B);
  1210 + T* temp2 = (T*)malloc(sizeof(T) * B);
1206 1211  
1207 1212 unsigned long long count = nnz(mask); //count the number of masked pixels
1208 1213  
1209   - //initialize covariance matrix of noise
  1214 + //initialize covariance matrix of noise
1210 1215 memset(coN, 0, B * B * sizeof(double));
1211 1216  
1212 1217 //calculate covariance matrix
1213   - double* coN_half = (double*) malloc(B * B * sizeof(double)); //allocate space for a higher-precision intermediate matrix
1214   - double* temp_precise = (double*) malloc(B * sizeof(double));
  1218 + double* coN_half = (double*)malloc(B * B * sizeof(double)); //allocate space for a higher-precision intermediate matrix
  1219 + double* temp_precise = (double*)malloc(B * sizeof(double));
  1220 + double* temp_precise2 = (double*)malloc(B * sizeof(double));
1215 1221 memset(coN_half, 0, B * B * sizeof(double)); //initialize the high-precision matrix with zeros
1216 1222 unsigned long long idx; //stores i*B to speed indexing
1217   - for (unsigned long long xy = 0; xy < XY; xy++){
1218   - if (mask == NULL || mask[xy] != 0){
1219   - pixel(temp, xy); //retreive the spectrum at the current xy pixel location
1220   - for(unsigned long long b = 0; b < B; b++) //subtract the mean from this spectrum and increase the precision
  1223 + for (unsigned long long xy = 0; xy < XY; xy++) {
  1224 + if (mask == NULL || mask[xy] != 0) {
  1225 + pixel(temp, xy); //retreive the spectrum at the current xy pixel location
  1226 + if (xy < XY - X()) {
  1227 + pixel(temp2, xy + X()); //retreive the spectrum at the current xy+X pixel location, which is adjacent (bellow) to the pixel at xy location (in y direction)
  1228 + }
  1229 + else {
  1230 + pixel(temp2, xy - X()); //for the last row we consider the the adjacent pixel which is located above pixel xy
  1231 + }
  1232 + for (unsigned long long b = 0; b < B; b++) { //subtract the mean from this spectrum and increase the precision
1221 1233 temp_precise[b] = (double)temp[b] - (double)avg[b];
  1234 + temp_precise2[b] = (double)temp2[b] - (double)avg[b];
  1235 + }
1222 1236  
1223   - for(unsigned long long b2 = 0; b2 < B-1; b2++) //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them )
1224   - temp_precise[b2] -= temp_precise[b2+1];
  1237 + for (unsigned long long b2 = 0; b2 < B; b2++) //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (in y direction)
  1238 + temp_precise[b2] -= temp_precise2[b2];
1225 1239  
1226 1240 idx = 0;
1227   - for (unsigned long long b0 = 0; b0 < B; b0++){ //for each band
  1241 + for (unsigned long long b0 = 0; b0 < B; b0++) { //for each band
1228 1242 for (unsigned long long b1 = b0; b1 < B; b1++)
1229 1243 coN_half[idx++] += temp_precise[b0] * temp_precise[b1];
1230 1244 }
1231 1245 }
1232   - if(PROGRESS) progress = (double)(xy+1) / XY * 100;
  1246 + if (PROGRESS) progress = (double)(xy + 1) / XY * 100;
1233 1247 }
1234 1248 idx = 0;
1235   - for (unsigned long long i = 0; i < B; i++){ //copy the precision matrix to both halves of the output matrix
1236   - for (unsigned long long j = i; j < B; j++){
1237   - coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double) count;
  1249 + for (unsigned long long i = 0; i < B; i++) { //copy the precision matrix to both halves of the output matrix
  1250 + for (unsigned long long j = i; j < B; j++) {
  1251 + coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double)count;
1238 1252 }
1239 1253 }
1240 1254  
1241 1255 free(temp);
  1256 + free(temp2);
1242 1257 free(temp_precise);
  1258 + free(temp_precise2);
1243 1259 return true;
1244 1260 }
1245 1261  
... ...