Commit 1ee79b845d9875823391b942c0c99226e4b90d16
1 parent
f7cf19e4
more accurate noise covariance calculation with MNF
Showing
1 changed file
with
51 additions
and
35 deletions
Show diff stats
stim/envi/bip.h
... | ... | @@ -1107,10 +1107,7 @@ public: |
1107 | 1107 | } |
1108 | 1108 | |
1109 | 1109 | |
1110 | -//#ifdef CUDA_FOUND | |
1111 | - /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS | |
1112 | - /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra | |
1113 | - int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ | |
1110 | + int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false) { | |
1114 | 1111 | |
1115 | 1112 | cudaError_t cudaStat; |
1116 | 1113 | cublasStatus_t stat; |
... | ... | @@ -1123,9 +1120,10 @@ public: |
1123 | 1120 | double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file |
1124 | 1121 | double* s_dev; //declare a device pointer that will store the spectrum on the GPU |
1125 | 1122 | |
1126 | - double* s2_dev; // device pointer on the GPU | |
1127 | - cudaStat = cudaMalloc(&s2_dev, B * sizeof(double)); // allocate space on the CUDA device | |
1128 | - cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double)); // initialize s2_dev to zero (0) | |
1123 | + double* s2 = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum of second pixel that will be pulled from the file | |
1124 | + double* s2_dev; // device pointer on the GPU | |
1125 | + cudaStat = cudaMalloc(&s2_dev, B * sizeof(double)); // allocate space on the CUDA device | |
1126 | + cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double)); // initialize s2_dev to zero (0) | |
1129 | 1127 | |
1130 | 1128 | double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU |
1131 | 1129 | double* avg_dev; //declare a device pointer that will store the average spectrum |
... | ... | @@ -1135,26 +1133,32 @@ public: |
1135 | 1133 | cudaStat = cudaMalloc(&avg_dev, B * sizeof(double)); //allocate space on the CUDA device for the average spectrum |
1136 | 1134 | stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device |
1137 | 1135 | |
1138 | - double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) | |
1136 | + double ger_alpha = 1.0 / (double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) | |
1139 | 1137 | double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) |
1140 | 1138 | |
1141 | 1139 | CUBLAS_HANDLE_ERROR(cublasCreate(&handle)); //create a cuBLAS instance |
1142 | 1140 | if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid |
1143 | 1141 | |
1144 | - for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel | |
1145 | - if (mask == NULL || mask[xy] != 0){ | |
1146 | - pixeld(s, xy); //retreive the spectrum at the current xy pixel location | |
1147 | - | |
1148 | - stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device | |
1142 | + for (unsigned long long xy = 0; xy < XY; xy++) { //for each pixel | |
1143 | + if (mask == NULL || mask[xy] != 0) { | |
1144 | + pixeld(s, xy); //retreive the spectrum at the current xy pixel location | |
1145 | + if (xy < XY - X()) { | |
1146 | + pixeld(s2, xy + X()); //retreive the spectrum at the current xy+X pixel location, which is adjacent (bellow) to the pixel at xy location (in y direction) | |
1147 | + } | |
1148 | + else { | |
1149 | + pixeld(s2, xy - X()); //for the last row we consider the the adjacent pixel which is located above pixel xy | |
1150 | + } | |
1151 | + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum of first pixel from the host to the device | |
1149 | 1152 | stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1); //subtract the average spectrum |
1150 | 1153 | |
1151 | - cudaMemcpy(s2_dev, s_dev + 1 , (B-1) * sizeof(double), cudaMemcpyDeviceToDevice); //copy B-1 elements from shifted source data (s_dev) to device pointer (s2_dev ) | |
1152 | - stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1); //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them ) | |
1154 | + stat = cublasSetVector((int)B, sizeof(double), s2, 1, s2_dev, 1); //copy the spectrum of second pixel from the host to the device | |
1155 | + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s2_dev, 1); //subtract the average spectrum | |
1153 | 1156 | |
1157 | + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1); //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (in y direction) | |
1154 | 1158 | |
1155 | 1159 | stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B); //calculate the covariance matrix (symmetric outer product) |
1156 | 1160 | } |
1157 | - if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress | |
1161 | + if (PROGRESS) progress = (double)(xy + 1) / XY * 100; //record the current progress | |
1158 | 1162 | |
1159 | 1163 | } |
1160 | 1164 | |
... | ... | @@ -1165,22 +1169,22 @@ public: |
1165 | 1169 | cudaFree(s2_dev); |
1166 | 1170 | cudaFree(avg_dev); |
1167 | 1171 | |
1168 | - for(unsigned long long i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion | |
1169 | - for(unsigned long long j = i+1; j < B; j++){ | |
1172 | + for (unsigned long long i = 0; i < B; i++) { //copy the upper triangular portion to the lower triangular portion | |
1173 | + for (unsigned long long j = i + 1; j < B; j++) { | |
1170 | 1174 | coN[B * i + j] = coN[B * j + i]; |
1171 | 1175 | } |
1172 | 1176 | } |
1173 | 1177 | |
1174 | 1178 | return 0; |
1175 | 1179 | } |
1176 | -//#endif | |
1180 | + //#endif | |
1177 | 1181 | |
1178 | 1182 | /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision. |
1179 | 1183 | |
1180 | 1184 | /// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix |
1181 | 1185 | /// @param avg is a pointer to memory of size B that stores the average spectrum |
1182 | 1186 | /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location |
1183 | - bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false){ | |
1187 | + bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false) { | |
1184 | 1188 | |
1185 | 1189 | if (cuda_device >= 0) { //if a CUDA device is specified |
1186 | 1190 | int dev_count; |
... | ... | @@ -1194,7 +1198,7 @@ public: |
1194 | 1198 | int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix |
1195 | 1199 | if (status == 0) return true; //if the cuBLAS function returned correctly, we're done |
1196 | 1200 | } |
1197 | - } //otherwise continue using the CPU | |
1201 | + } //otherwise continue using the CPU | |
1198 | 1202 | std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl; |
1199 | 1203 | } |
1200 | 1204 | |
... | ... | @@ -1203,43 +1207,55 @@ public: |
1203 | 1207 | unsigned long long XY = X() * Y(); |
1204 | 1208 | unsigned long long B = Z(); |
1205 | 1209 | T* temp = (T*)malloc(sizeof(T) * B); |
1210 | + T* temp2 = (T*)malloc(sizeof(T) * B); | |
1206 | 1211 | |
1207 | 1212 | unsigned long long count = nnz(mask); //count the number of masked pixels |
1208 | 1213 | |
1209 | - //initialize covariance matrix of noise | |
1214 | + //initialize covariance matrix of noise | |
1210 | 1215 | memset(coN, 0, B * B * sizeof(double)); |
1211 | 1216 | |
1212 | 1217 | //calculate covariance matrix |
1213 | - double* coN_half = (double*) malloc(B * B * sizeof(double)); //allocate space for a higher-precision intermediate matrix | |
1214 | - double* temp_precise = (double*) malloc(B * sizeof(double)); | |
1218 | + double* coN_half = (double*)malloc(B * B * sizeof(double)); //allocate space for a higher-precision intermediate matrix | |
1219 | + double* temp_precise = (double*)malloc(B * sizeof(double)); | |
1220 | + double* temp_precise2 = (double*)malloc(B * sizeof(double)); | |
1215 | 1221 | memset(coN_half, 0, B * B * sizeof(double)); //initialize the high-precision matrix with zeros |
1216 | 1222 | unsigned long long idx; //stores i*B to speed indexing |
1217 | - for (unsigned long long xy = 0; xy < XY; xy++){ | |
1218 | - if (mask == NULL || mask[xy] != 0){ | |
1219 | - pixel(temp, xy); //retreive the spectrum at the current xy pixel location | |
1220 | - for(unsigned long long b = 0; b < B; b++) //subtract the mean from this spectrum and increase the precision | |
1223 | + for (unsigned long long xy = 0; xy < XY; xy++) { | |
1224 | + if (mask == NULL || mask[xy] != 0) { | |
1225 | + pixel(temp, xy); //retreive the spectrum at the current xy pixel location | |
1226 | + if (xy < XY - X()) { | |
1227 | + pixel(temp2, xy + X()); //retreive the spectrum at the current xy+X pixel location, which is adjacent (bellow) to the pixel at xy location (in y direction) | |
1228 | + } | |
1229 | + else { | |
1230 | + pixel(temp2, xy - X()); //for the last row we consider the the adjacent pixel which is located above pixel xy | |
1231 | + } | |
1232 | + for (unsigned long long b = 0; b < B; b++) { //subtract the mean from this spectrum and increase the precision | |
1221 | 1233 | temp_precise[b] = (double)temp[b] - (double)avg[b]; |
1234 | + temp_precise2[b] = (double)temp2[b] - (double)avg[b]; | |
1235 | + } | |
1222 | 1236 | |
1223 | - for(unsigned long long b2 = 0; b2 < B-1; b2++) //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them ) | |
1224 | - temp_precise[b2] -= temp_precise[b2+1]; | |
1237 | + for (unsigned long long b2 = 0; b2 < B; b2++) //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (in y direction) | |
1238 | + temp_precise[b2] -= temp_precise2[b2]; | |
1225 | 1239 | |
1226 | 1240 | idx = 0; |
1227 | - for (unsigned long long b0 = 0; b0 < B; b0++){ //for each band | |
1241 | + for (unsigned long long b0 = 0; b0 < B; b0++) { //for each band | |
1228 | 1242 | for (unsigned long long b1 = b0; b1 < B; b1++) |
1229 | 1243 | coN_half[idx++] += temp_precise[b0] * temp_precise[b1]; |
1230 | 1244 | } |
1231 | 1245 | } |
1232 | - if(PROGRESS) progress = (double)(xy+1) / XY * 100; | |
1246 | + if (PROGRESS) progress = (double)(xy + 1) / XY * 100; | |
1233 | 1247 | } |
1234 | 1248 | idx = 0; |
1235 | - for (unsigned long long i = 0; i < B; i++){ //copy the precision matrix to both halves of the output matrix | |
1236 | - for (unsigned long long j = i; j < B; j++){ | |
1237 | - coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double) count; | |
1249 | + for (unsigned long long i = 0; i < B; i++) { //copy the precision matrix to both halves of the output matrix | |
1250 | + for (unsigned long long j = i; j < B; j++) { | |
1251 | + coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double)count; | |
1238 | 1252 | } |
1239 | 1253 | } |
1240 | 1254 | |
1241 | 1255 | free(temp); |
1256 | + free(temp2); | |
1242 | 1257 | free(temp_precise); |
1258 | + free(temp_precise2); | |
1243 | 1259 | return true; |
1244 | 1260 | } |
1245 | 1261 | ... | ... |