diff --git a/stim/cuda/cudatools/error.h b/stim/cuda/cudatools/error.h index 738bbb4..ddce736 100644 --- a/stim/cuda/cudatools/error.h +++ b/stim/cuda/cudatools/error.h @@ -1,22 +1,24 @@ +#ifndef STIM_CUDA_ERROR_H +#define STIM_CUDA_ERROR_H + #include #include using namespace std; #include "cuda_runtime.h" #include "device_launch_parameters.h" #include "cufft.h" - -#ifndef CUDA_HANDLE_ERROR_H -#define CUDA_HANDLE_ERROR_H +#include "cublas_v2.h" //handle error macro -static void HandleError( cudaError_t err, const char *file, int line ) { +static void cuHandleError( cudaError_t err, const char *file, int line ) { if (err != cudaSuccess) { printf("%s in %s at line %d\n", cudaGetErrorString( err ), file, line ); + } } -#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) +#define HANDLE_ERROR( err ) (cuHandleError( err, __FILE__, __LINE__ )) -static void CufftError( cufftResult err ) +static void cufftHandleError( cufftResult err, const char*file, int line ) { if (err != CUFFT_SUCCESS) { @@ -37,7 +39,29 @@ static void CufftError( cufftResult err ) } } +#define CUFFT_HANDLE_ERROR( err ) (cufftHandleError( err, __FILE__, __LINE__ )) +static void cublasHandleError( cublasStatus_t err, const char*file, int line ){ + if(err != CUBLAS_STATUS_SUCCESS){ + if(err == CUBLAS_STATUS_NOT_INITIALIZED) + std::cout<<"CUBLAS_STATUS_NOT_INITIALIZED" <<" in file "< //CUDA -#ifdef CUDA_FOUND - #include - #include "cufft.h" - #include -#endif +//#ifdef CUDA_FOUND +#include +#include "cufft.h" +#include +//#endif namespace stim{ @@ -42,6 +42,10 @@ public: alloc(); } + size_t dim(size_t i){ + return R[i]; + } + /// Create a deep copy of an agileng_binary object void deep_copy(agilent_binary* dst, const agilent_binary* src){ dst->alloc(src->R[0], src->R[1], src->R[2]); //allocate memory @@ -169,7 +173,6 @@ public: void zeropad(){ size_t newZ = pow(2, ceil(log(R[2])/log(2))); //find the nearest power-of-two size_t n = newZ - R[2]; //calculate the number of bands to add - std::cout<<"band padding: "<ptr[i]); } -#ifdef CUDA_FOUND +//#ifdef CUDA_FOUND /// Perform an FFT and return a binary file with bands in the specified range agilent_binary fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2){ auto total_start = std::chrono::high_resolution_clock::now(); @@ -271,7 +274,7 @@ public: return result; } -#endif +//#endif }; diff --git a/stim/envi/bil.h b/stim/envi/bil.h index d68e6ca..cd6450f 100644 --- a/stim/envi/bil.h +++ b/stim/envi/bil.h @@ -4,6 +4,7 @@ #include "../envi/envi_header.h" #include "../envi/hsi.h" #include "../math/fd_coefficients.h" +#include #include #include #include @@ -224,10 +225,44 @@ public: } //given a Y ,return a XZ slice - bool read_plane_y(T * p, unsigned long long y){ + bool read_plane_xz(T * p, size_t y){ return binary::read_plane_2(p, y); } + //given a Y, return ZX slice (transposed such that the spectrum is the leading dimension) + int read_plane_zx(T* p, size_t y){ + T* temp = (T*) malloc(X() * Z() * sizeof(T)); //allocate space to store the temporary xz plane + binary::read_plane_2(temp, y); //load the plane from disk + size_t z, x; + for(z = 0; z < Z(); z++){ + for(x = 0; x <= z; x++){ + p[x * Z() + z] = temp[z * X() + x]; //copy to the destination frame + } + } + } + + //load a frame y into a pre-allocated double-precision array + int read_plane_xzd(double* f, size_t y){ + size_t XB = X() * B(); + T* temp = (T*) malloc(XB * sizeof(T)); //create a temporary location to store the plane at current precision + if(!read_plane_y(temp, y)) return 1; //read the plane in its native format, if it fails return a 1 + for(size_t i = 0; i < XB; i++) f[i] = temp[i]; //convert the plane to a double + return 0; + } + + //given a Y, return ZX slice (transposed such that the spectrum is the leading dimension) + int read_plane_zxd(double* p, size_t y){ + T* temp = (T*) malloc(X() * Z() * sizeof(T)); //allocate space to store the temporary xz plane + binary::read_plane_2(temp, y); //load the plane from disk + size_t z, x; + for(z = 0; z < Z(); z++){ + for(x = 0; x < X(); x++){ + p[x * Z() + z] = (double)temp[z * X() + x]; //copy to the destination frame + } + } + return 0; + } + /// Perform baseline correction given a list of baseline points and stores the result in a new BSQ file. @@ -268,7 +303,7 @@ public: for (unsigned long long k =0; k < Y(); k++) { //get the current y slice - read_plane_y(c, k); + read_plane_xz(c, k); //initialize lownum, highnum, low, high ai = w[0]; @@ -369,7 +404,7 @@ public: for(unsigned long long j = 0; j < Y(); j++) { - read_plane_y(c, j); + read_plane_xz(c, j); for(unsigned long long i = 0; i < B; i++) { for(unsigned long long m = 0; m < X(); m++) @@ -469,7 +504,7 @@ public: for ( unsigned long long i = 0; i < Y(); i++) { - read_plane_y(p, i); + read_plane_xz(p, i); for ( unsigned long long k = 0; k < Z(); k++) { unsigned long long ks = k * X(); @@ -863,7 +898,7 @@ public: for (unsigned long long i = 0; i < Y(); i++) //for each value in Y() (BIP should be X) { - read_plane_y(temp, i); //retrieve an ZX slice, stored in temp + read_plane_xz(temp, i); //retrieve an ZX slice, stored in temp for ( unsigned long long j = 0; j < Z(); j++) //for each Z() (Y) { for (unsigned long long k = 0; k < X(); k++) //for each band @@ -933,7 +968,7 @@ public: //for each slice along the y axis for (unsigned long long y = 0; y < Y(); y++) //Select a page by choosing Y coordinate, Y() { - read_plane_y(slice, y); //retrieve an ZX page, store in "slice" + read_plane_xz(slice, y); //retrieve an ZX page, store in "slice" //for each sample along X for (unsigned long long x = 0; x < X(); x++) //Select a pixel by choosing X coordinate in the page, X() @@ -1004,7 +1039,7 @@ public: double x; //create a register to store the pixel value for (unsigned long long k = 0; k < Y(); k++){ - read_plane_y(temp, k); + read_plane_xz(temp, k); unsigned long long kx = k * X(); for (unsigned long long i = 0; i < X(); i++){ if (mask == NULL || mask[kx + i] != 0){ @@ -1025,6 +1060,68 @@ public: return true; } + int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ + cudaError_t cudaStat; + cublasStatus_t stat; + cublasHandle_t handle; + + progress = 0; //initialize the progress to zero (0) + size_t XY = X() * Y(); //calculate the number of elements in a band image + size_t XB = X() * Z(); + size_t B = Z(); //calculate the number of spectral elements + + double* F = (double*)malloc(sizeof(double) * B * X()); //allocate space for the frame that will be pulled from the file + double* F_dev; + HANDLE_ERROR(cudaMalloc(&F_dev, X() * B * sizeof(double))); //allocate space for the frame on the GPU + double* s_dev; //declare a device pointer that will store the spectrum on the GPU + double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU + double* avg_dev; //declare a device pointer that will store the average spectrum + HANDLE_ERROR(cudaMalloc(&s_dev, B * sizeof(double))); //allocate space on the CUDA device for a spectrum + HANDLE_ERROR(cudaMalloc(&A_dev, B * B * sizeof(double))); //allocate space on the CUDA device for the covariance matrix + HANDLE_ERROR(cudaMemset(A_dev, 0, B * B * sizeof(double))); //initialize the covariance matrix to zero (0) + HANDLE_ERROR(cudaMalloc(&avg_dev, XB * sizeof(double))); //allocate space on the CUDA device for the average spectrum + for(size_t x = 0; x < X(); x++) //make multiple copies of the average spectrum in order to build a matrix + HANDLE_ERROR(cudaMemcpy(&avg_dev[x * B], avg, B * sizeof(double), cudaMemcpyHostToDevice)); + //stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device + + double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) + double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) + + CUBLAS_HANDLE_ERROR(stat = cublasCreate(&handle)); //create a cuBLAS instance + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid + + else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."< 0 && prop.major != 9999){ //if the first device is not an emulator + int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done + } //otherwise continue using the CPU + //memory allocation unsigned long long xy = X() * Y(); unsigned long long B = Z(); @@ -1328,7 +1435,7 @@ public: c = (T*)malloc( L ); //allocate space for the slice for(unsigned long long j = 0; j < Y(); j++){ //for each line - read_plane_y(c, j); //load the line into memory + read_plane_xz(c, j); //load the line into memory for(unsigned long long i = 0; i < B; i++){ //for each band for(unsigned long long m = 0; m < X(); m++){ //for each sample if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked @@ -1358,7 +1465,7 @@ public: c = (T*)malloc( L ); //allocate space for the slice for(unsigned long long j = 0; j < Y(); j++){ //for each line - read_plane_y(c, j); //load the line into memory + read_plane_xz(c, j); //load the line into memory for(unsigned long long i = 0; i < B; i++){ //for each band for(unsigned long long m = 0; m < X(); m++){ //for each sample if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked diff --git a/stim/envi/bip.h b/stim/envi/bip.h index 8c1925f..e88feaf 100644 --- a/stim/envi/bip.h +++ b/stim/envi/bip.h @@ -8,10 +8,11 @@ #include //CUDA -#ifdef CUDA_FOUND - #include - #include "cublas_v2.h" -#endif +//#ifdef CUDA_FOUND +#include +#include +#include "cublas_v2.h" +//#endif namespace stim{ @@ -257,7 +258,7 @@ public: } //given a Y ,return a ZX slice - bool read_plane_y(T * p, unsigned long long y){ + bool read_plane_y(T * p, size_t y){ return binary::read_plane_2(p, y); } @@ -982,16 +983,15 @@ public: free(temp); return true; } -#ifdef CUDA_FOUND +//#ifdef CUDA_FOUND /// Calculate the covariance matrix for masked pixels using cuBLAS /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra - bool co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ + int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){ cudaError_t cudaStat; cublasStatus_t stat; cublasHandle_t handle; - progress = 0; //initialize the progress to zero (0) unsigned long long XY = X() * Y(); //calculate the number of elements in a band image unsigned long long B = Z(); //calculate the number of spectral elements @@ -1009,10 +1009,9 @@ public: double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) stat = cublasCreate(&handle); //create a cuBLAS instance - if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid - printf ("CUBLAS initialization failed\n"); - return EXIT_FAILURE; - } + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid + + else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."< 0 && prop.major != 9999) //if the first device is not an emulator - return co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix -#endif - progress = 0; + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator + int status = co_matrix_cublas(co, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done + } //otherwise continue using the CPU +//#endif + //memory allocation unsigned long long XY = X() * Y(); unsigned long long B = Z(); @@ -1097,10 +1099,10 @@ public: } -#ifdef CUDA_FOUND +//#ifdef CUDA_FOUND /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra - bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ + int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ cudaError_t cudaStat; cublasStatus_t stat; @@ -1128,11 +1130,9 @@ public: double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) - stat = cublasCreate(&handle); //create a cuBLAS instance - if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid - printf ("CUBLAS initialization failed\n"); - return EXIT_FAILURE; - } + CUBLAS_HANDLE_ERROR(cublasCreate(&handle)); //create a cuBLAS instance + if (stat != CUBLAS_STATUS_SUCCESS) return 1; //test the cuBLAS instance to make sure it is valid + for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel if (mask == NULL || mask[xy] != 0){ pixeld(s, xy); //retreive the spectrum at the current xy pixel location @@ -1165,7 +1165,7 @@ public: return true; } -#endif +//#endif /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision. @@ -1174,14 +1174,18 @@ public: /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ -#ifdef CUDA_FOUND +//#ifdef CUDA_FOUND int dev_count; cudaGetDeviceCount(&dev_count); //get the number of CUDA devices cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); //get the property of the first device - if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator - return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix -#endif + if(dev_count > 0 && prop.major != 9999){ //if the first device is not an emulator + int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix + if(status == 0) return true; //if the cuBLAS function returned correctly, we're done + } //otherwise continue using the CPU + //if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator + // return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix +//#endif -- libgit2 0.21.4