diff --git a/stim/envi/bip.h b/stim/envi/bip.h index 8ff9ab5..215ff38 100644 --- a/stim/envi/bip.h +++ b/stim/envi/bip.h @@ -1107,10 +1107,7 @@ public: } -//#ifdef CUDA_FOUND - /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS - /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra - int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ + int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false) { cudaError_t cudaStat; cublasStatus_t stat; @@ -1123,9 +1120,10 @@ public: double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file double* s_dev; //declare a device pointer that will store the spectrum on the GPU - double* s2_dev; // device pointer on the GPU - cudaStat = cudaMalloc(&s2_dev, B * sizeof(double)); // allocate space on the CUDA device - cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double)); // initialize s2_dev to zero (0) + double* s2 = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum of second pixel that will be pulled from the file + double* s2_dev; // device pointer on the GPU + cudaStat = cudaMalloc(&s2_dev, B * sizeof(double)); // allocate space on the CUDA device + cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double)); // initialize s2_dev to zero (0) double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU double* avg_dev; //declare a device pointer that will store the average spectrum @@ -1135,26 +1133,32 @@ public: cudaStat = cudaMalloc(&avg_dev, B * sizeof(double)); //allocate space on the CUDA device for the average spectrum stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device - double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) + double ger_alpha = 1.0 / (double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) CUBLAS_HANDLE_ERROR(cublasCreate(&handle)); //create a cuBLAS instance if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid - for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel - if (mask == NULL || mask[xy] != 0){ - pixeld(s, xy); //retreive the spectrum at the current xy pixel location - - stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device + for (unsigned long long xy = 0; xy < XY; xy++) { //for each pixel + if (mask == NULL || mask[xy] != 0) { + pixeld(s, xy); //retreive the spectrum at the current xy pixel location + if (xy < XY - X()) { + pixeld(s2, xy + X()); //retreive the spectrum at the current xy+X pixel location, which is adjacent (bellow) to the pixel at xy location (in y direction) + } + else { + pixeld(s2, xy - X()); //for the last row we consider the the adjacent pixel which is located above pixel xy + } + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum of first pixel from the host to the device stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1); //subtract the average spectrum - cudaMemcpy(s2_dev, s_dev + 1 , (B-1) * sizeof(double), cudaMemcpyDeviceToDevice); //copy B-1 elements from shifted source data (s_dev) to device pointer (s2_dev ) - stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1); //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them ) + stat = cublasSetVector((int)B, sizeof(double), s2, 1, s2_dev, 1); //copy the spectrum of second pixel from the host to the device + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s2_dev, 1); //subtract the average spectrum + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1); //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (in y direction) stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B); //calculate the covariance matrix (symmetric outer product) } - if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress + if (PROGRESS) progress = (double)(xy + 1) / XY * 100; //record the current progress } @@ -1165,22 +1169,22 @@ public: cudaFree(s2_dev); cudaFree(avg_dev); - for(unsigned long long i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion - for(unsigned long long j = i+1; j < B; j++){ + for (unsigned long long i = 0; i < B; i++) { //copy the upper triangular portion to the lower triangular portion + for (unsigned long long j = i + 1; j < B; j++) { coN[B * i + j] = coN[B * j + i]; } } return 0; } -//#endif + //#endif /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision. /// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix /// @param avg is a pointer to memory of size B that stores the average spectrum /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location - bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false){ + bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false) { if (cuda_device >= 0) { //if a CUDA device is specified int dev_count; @@ -1194,7 +1198,7 @@ public: int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix if (status == 0) return true; //if the cuBLAS function returned correctly, we're done } - } //otherwise continue using the CPU + } //otherwise continue using the CPU std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl; } @@ -1203,43 +1207,55 @@ public: unsigned long long XY = X() * Y(); unsigned long long B = Z(); T* temp = (T*)malloc(sizeof(T) * B); + T* temp2 = (T*)malloc(sizeof(T) * B); unsigned long long count = nnz(mask); //count the number of masked pixels - //initialize covariance matrix of noise + //initialize covariance matrix of noise memset(coN, 0, B * B * sizeof(double)); //calculate covariance matrix - double* coN_half = (double*) malloc(B * B * sizeof(double)); //allocate space for a higher-precision intermediate matrix - double* temp_precise = (double*) malloc(B * sizeof(double)); + double* coN_half = (double*)malloc(B * B * sizeof(double)); //allocate space for a higher-precision intermediate matrix + double* temp_precise = (double*)malloc(B * sizeof(double)); + double* temp_precise2 = (double*)malloc(B * sizeof(double)); memset(coN_half, 0, B * B * sizeof(double)); //initialize the high-precision matrix with zeros unsigned long long idx; //stores i*B to speed indexing - for (unsigned long long xy = 0; xy < XY; xy++){ - if (mask == NULL || mask[xy] != 0){ - pixel(temp, xy); //retreive the spectrum at the current xy pixel location - for(unsigned long long b = 0; b < B; b++) //subtract the mean from this spectrum and increase the precision + for (unsigned long long xy = 0; xy < XY; xy++) { + if (mask == NULL || mask[xy] != 0) { + pixel(temp, xy); //retreive the spectrum at the current xy pixel location + if (xy < XY - X()) { + pixel(temp2, xy + X()); //retreive the spectrum at the current xy+X pixel location, which is adjacent (bellow) to the pixel at xy location (in y direction) + } + else { + pixel(temp2, xy - X()); //for the last row we consider the the adjacent pixel which is located above pixel xy + } + for (unsigned long long b = 0; b < B; b++) { //subtract the mean from this spectrum and increase the precision temp_precise[b] = (double)temp[b] - (double)avg[b]; + temp_precise2[b] = (double)temp2[b] - (double)avg[b]; + } - for(unsigned long long b2 = 0; b2 < B-1; b2++) //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them ) - temp_precise[b2] -= temp_precise[b2+1]; + for (unsigned long long b2 = 0; b2 < B; b2++) //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (in y direction) + temp_precise[b2] -= temp_precise2[b2]; idx = 0; - for (unsigned long long b0 = 0; b0 < B; b0++){ //for each band + for (unsigned long long b0 = 0; b0 < B; b0++) { //for each band for (unsigned long long b1 = b0; b1 < B; b1++) coN_half[idx++] += temp_precise[b0] * temp_precise[b1]; } } - if(PROGRESS) progress = (double)(xy+1) / XY * 100; + if (PROGRESS) progress = (double)(xy + 1) / XY * 100; } idx = 0; - for (unsigned long long i = 0; i < B; i++){ //copy the precision matrix to both halves of the output matrix - for (unsigned long long j = i; j < B; j++){ - coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double) count; + for (unsigned long long i = 0; i < B; i++) { //copy the precision matrix to both halves of the output matrix + for (unsigned long long j = i; j < B; j++) { + coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double)count; } } free(temp); + free(temp2); free(temp_precise); + free(temp_precise2); return true; } -- libgit2 0.21.4