Commit 1ee79b845d9875823391b942c0c99226e4b90d16
1 parent
f7cf19e4
more accurate noise covariance calculation with MNF
Showing
1 changed file
with
51 additions
and
35 deletions
Show diff stats
stim/envi/bip.h
@@ -1107,10 +1107,7 @@ public: | @@ -1107,10 +1107,7 @@ public: | ||
1107 | } | 1107 | } |
1108 | 1108 | ||
1109 | 1109 | ||
1110 | -//#ifdef CUDA_FOUND | ||
1111 | - /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS | ||
1112 | - /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra | ||
1113 | - int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ | 1110 | + int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false) { |
1114 | 1111 | ||
1115 | cudaError_t cudaStat; | 1112 | cudaError_t cudaStat; |
1116 | cublasStatus_t stat; | 1113 | cublasStatus_t stat; |
@@ -1123,9 +1120,10 @@ public: | @@ -1123,9 +1120,10 @@ public: | ||
1123 | double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file | 1120 | double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file |
1124 | double* s_dev; //declare a device pointer that will store the spectrum on the GPU | 1121 | double* s_dev; //declare a device pointer that will store the spectrum on the GPU |
1125 | 1122 | ||
1126 | - double* s2_dev; // device pointer on the GPU | ||
1127 | - cudaStat = cudaMalloc(&s2_dev, B * sizeof(double)); // allocate space on the CUDA device | ||
1128 | - cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double)); // initialize s2_dev to zero (0) | 1123 | + double* s2 = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum of second pixel that will be pulled from the file |
1124 | + double* s2_dev; // device pointer on the GPU | ||
1125 | + cudaStat = cudaMalloc(&s2_dev, B * sizeof(double)); // allocate space on the CUDA device | ||
1126 | + cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double)); // initialize s2_dev to zero (0) | ||
1129 | 1127 | ||
1130 | double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU | 1128 | double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU |
1131 | double* avg_dev; //declare a device pointer that will store the average spectrum | 1129 | double* avg_dev; //declare a device pointer that will store the average spectrum |
@@ -1135,26 +1133,32 @@ public: | @@ -1135,26 +1133,32 @@ public: | ||
1135 | cudaStat = cudaMalloc(&avg_dev, B * sizeof(double)); //allocate space on the CUDA device for the average spectrum | 1133 | cudaStat = cudaMalloc(&avg_dev, B * sizeof(double)); //allocate space on the CUDA device for the average spectrum |
1136 | stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device | 1134 | stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device |
1137 | 1135 | ||
1138 | - double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) | 1136 | + double ger_alpha = 1.0 / (double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) |
1139 | double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) | 1137 | double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) |
1140 | 1138 | ||
1141 | CUBLAS_HANDLE_ERROR(cublasCreate(&handle)); //create a cuBLAS instance | 1139 | CUBLAS_HANDLE_ERROR(cublasCreate(&handle)); //create a cuBLAS instance |
1142 | if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid | 1140 | if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid |
1143 | 1141 | ||
1144 | - for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel | ||
1145 | - if (mask == NULL || mask[xy] != 0){ | ||
1146 | - pixeld(s, xy); //retreive the spectrum at the current xy pixel location | ||
1147 | - | ||
1148 | - stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device | 1142 | + for (unsigned long long xy = 0; xy < XY; xy++) { //for each pixel |
1143 | + if (mask == NULL || mask[xy] != 0) { | ||
1144 | + pixeld(s, xy); //retrieve the spectrum at the current xy pixel location | ||
1145 | + if (xy < XY - X()) { | ||
1146 | + pixeld(s2, xy + X()); //retrieve the spectrum at the current xy+X pixel location, which is adjacent (below) to the pixel at xy location (in y direction) | ||
1147 | + } | ||
1148 | + else { | ||
1149 | + pixeld(s2, xy - X()); //for the last row we consider the adjacent pixel which is located above pixel xy | ||
1150 | + } | ||
1151 | + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum of first pixel from the host to the device | ||
1149 | stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1); //subtract the average spectrum | 1152 | stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1); //subtract the average spectrum |
1150 | 1153 | ||
1151 | - cudaMemcpy(s2_dev, s_dev + 1 , (B-1) * sizeof(double), cudaMemcpyDeviceToDevice); //copy B-1 elements from shifted source data (s_dev) to device pointer (s2_dev ) | ||
1152 | - stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1); //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them ) | 1154 | + stat = cublasSetVector((int)B, sizeof(double), s2, 1, s2_dev, 1); //copy the spectrum of second pixel from the host to the device |
1155 | + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s2_dev, 1); //subtract the average spectrum | ||
1153 | 1156 | ||
1157 | + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1); //Minimum/Maximum Autocorrelation Factors (MAF) method : subtract the adjacent pixel (in y direction) from each pixel | ||
1154 | 1158 | ||
1155 | stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B); //calculate the covariance matrix (symmetric outer product) | 1159 | stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B); //calculate the covariance matrix (symmetric outer product) |
1156 | } | 1160 | } |
1157 | - if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress | 1161 | + if (PROGRESS) progress = (double)(xy + 1) / XY * 100; //record the current progress |
1158 | 1162 | ||
1159 | } | 1163 | } |
1160 | 1164 | ||
@@ -1165,22 +1169,22 @@ public: | @@ -1165,22 +1169,22 @@ public: | ||
1165 | cudaFree(s2_dev); | 1169 | cudaFree(s2_dev); |
1166 | cudaFree(avg_dev); | 1170 | cudaFree(avg_dev); |
1167 | 1171 | ||
1168 | - for(unsigned long long i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion | ||
1169 | - for(unsigned long long j = i+1; j < B; j++){ | 1172 | + for (unsigned long long i = 0; i < B; i++) { //copy the upper triangular portion to the lower triangular portion |
1173 | + for (unsigned long long j = i + 1; j < B; j++) { | ||
1170 | coN[B * i + j] = coN[B * j + i]; | 1174 | coN[B * i + j] = coN[B * j + i]; |
1171 | } | 1175 | } |
1172 | } | 1176 | } |
1173 | 1177 | ||
1174 | return 0; | 1178 | return 0; |
1175 | } | 1179 | } |
1176 | -//#endif | 1180 | + //#endif |
1177 | 1181 | ||
1178 | /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision. | 1182 | /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision. |
1179 | 1183 | ||
1180 | /// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix | 1184 | /// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix |
1181 | /// @param avg is a pointer to memory of size B that stores the average spectrum | 1185 | /// @param avg is a pointer to memory of size B that stores the average spectrum |
1182 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location | 1186 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location |
1183 | - bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false){ | 1187 | + bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false) { |
1184 | 1188 | ||
1185 | if (cuda_device >= 0) { //if a CUDA device is specified | 1189 | if (cuda_device >= 0) { //if a CUDA device is specified |
1186 | int dev_count; | 1190 | int dev_count; |
@@ -1194,7 +1198,7 @@ public: | @@ -1194,7 +1198,7 @@ public: | ||
1194 | int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | 1198 | int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix |
1195 | if (status == 0) return true; //if the cuBLAS function returned correctly, we're done | 1199 | if (status == 0) return true; //if the cuBLAS function returned correctly, we're done |
1196 | } | 1200 | } |
1197 | - } //otherwise continue using the CPU | 1201 | + } //otherwise continue using the CPU |
1198 | std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl; | 1202 | std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl; |
1199 | } | 1203 | } |
1200 | 1204 | ||
@@ -1203,43 +1207,55 @@ public: | @@ -1203,43 +1207,55 @@ public: | ||
1203 | unsigned long long XY = X() * Y(); | 1207 | unsigned long long XY = X() * Y(); |
1204 | unsigned long long B = Z(); | 1208 | unsigned long long B = Z(); |
1205 | T* temp = (T*)malloc(sizeof(T) * B); | 1209 | T* temp = (T*)malloc(sizeof(T) * B); |
1210 | + T* temp2 = (T*)malloc(sizeof(T) * B); | ||
1206 | 1211 | ||
1207 | unsigned long long count = nnz(mask); //count the number of masked pixels | 1212 | unsigned long long count = nnz(mask); //count the number of masked pixels |
1208 | 1213 | ||
1209 | - //initialize covariance matrix of noise | 1214 | + //initialize covariance matrix of noise |
1210 | memset(coN, 0, B * B * sizeof(double)); | 1215 | memset(coN, 0, B * B * sizeof(double)); |
1211 | 1216 | ||
1212 | //calculate covariance matrix | 1217 | //calculate covariance matrix |
1213 | - double* coN_half = (double*) malloc(B * B * sizeof(double)); //allocate space for a higher-precision intermediate matrix | ||
1214 | - double* temp_precise = (double*) malloc(B * sizeof(double)); | 1218 | + double* coN_half = (double*)malloc(B * B * sizeof(double)); //allocate space for a higher-precision intermediate matrix |
1219 | + double* temp_precise = (double*)malloc(B * sizeof(double)); | ||
1220 | + double* temp_precise2 = (double*)malloc(B * sizeof(double)); | ||
1215 | memset(coN_half, 0, B * B * sizeof(double)); //initialize the high-precision matrix with zeros | 1221 | memset(coN_half, 0, B * B * sizeof(double)); //initialize the high-precision matrix with zeros |
1216 | unsigned long long idx; //stores i*B to speed indexing | 1222 | unsigned long long idx; //stores i*B to speed indexing |
1217 | - for (unsigned long long xy = 0; xy < XY; xy++){ | ||
1218 | - if (mask == NULL || mask[xy] != 0){ | ||
1219 | - pixel(temp, xy); //retreive the spectrum at the current xy pixel location | ||
1220 | - for(unsigned long long b = 0; b < B; b++) //subtract the mean from this spectrum and increase the precision | 1223 | + for (unsigned long long xy = 0; xy < XY; xy++) { |
1224 | + if (mask == NULL || mask[xy] != 0) { | ||
1225 | + pixel(temp, xy); //retrieve the spectrum at the current xy pixel location | ||
1226 | + if (xy < XY - X()) { | ||
1227 | + pixel(temp2, xy + X()); //retrieve the spectrum at the current xy+X pixel location, which is adjacent (below) to the pixel at xy location (in y direction) | ||
1228 | + } | ||
1229 | + else { | ||
1230 | + pixel(temp2, xy - X()); //for the last row we consider the adjacent pixel which is located above pixel xy | ||
1231 | + } | ||
1232 | + for (unsigned long long b = 0; b < B; b++) { //subtract the mean from this spectrum and increase the precision | ||
1221 | temp_precise[b] = (double)temp[b] - (double)avg[b]; | 1233 | temp_precise[b] = (double)temp[b] - (double)avg[b]; |
1234 | + temp_precise2[b] = (double)temp2[b] - (double)avg[b]; | ||
1235 | + } | ||
1222 | 1236 | ||
1223 | - for(unsigned long long b2 = 0; b2 < B-1; b2++) //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them ) | ||
1224 | - temp_precise[b2] -= temp_precise[b2+1]; | 1237 | + for (unsigned long long b2 = 0; b2 < B; b2++) //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (in y direction) |
1238 | + temp_precise[b2] -= temp_precise2[b2]; | ||
1225 | 1239 | ||
1226 | idx = 0; | 1240 | idx = 0; |
1227 | - for (unsigned long long b0 = 0; b0 < B; b0++){ //for each band | 1241 | + for (unsigned long long b0 = 0; b0 < B; b0++) { //for each band |
1228 | for (unsigned long long b1 = b0; b1 < B; b1++) | 1242 | for (unsigned long long b1 = b0; b1 < B; b1++) |
1229 | coN_half[idx++] += temp_precise[b0] * temp_precise[b1]; | 1243 | coN_half[idx++] += temp_precise[b0] * temp_precise[b1]; |
1230 | } | 1244 | } |
1231 | } | 1245 | } |
1232 | - if(PROGRESS) progress = (double)(xy+1) / XY * 100; | 1246 | + if (PROGRESS) progress = (double)(xy + 1) / XY * 100; |
1233 | } | 1247 | } |
1234 | idx = 0; | 1248 | idx = 0; |
1235 | - for (unsigned long long i = 0; i < B; i++){ //copy the precision matrix to both halves of the output matrix | ||
1236 | - for (unsigned long long j = i; j < B; j++){ | ||
1237 | - coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double) count; | 1249 | + for (unsigned long long i = 0; i < B; i++) { //copy the precision matrix to both halves of the output matrix |
1250 | + for (unsigned long long j = i; j < B; j++) { | ||
1251 | + coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double)count; | ||
1238 | } | 1252 | } |
1239 | } | 1253 | } |
1240 | 1254 | ||
1241 | free(temp); | 1255 | free(temp); |
1256 | + free(temp2); | ||
1242 | free(temp_precise); | 1257 | free(temp_precise); |
1258 | + free(temp_precise2); | ||
1243 | return true; | 1259 | return true; |
1244 | } | 1260 | } |
1245 | 1261 |