realfield.cuh 7.7 KB
#ifndef	RTS_REALFIELD_H
#define RTS_REALFIELD_H

#include "../visualization/colormap.h"
#include "../envi/envi.h"
#include "../math/rect.h"
#include "../cuda/devices.h"
#include "cublas_v2.h"
#include <cuda_runtime.h>


namespace stim{

//multiply R = X * Y
template<typename T>
__global__ void gpu_realfield_multiply(T* R, T* X, T* Y, unsigned int r0, unsigned int r1){

	int iu = blockIdx.x * blockDim.x + threadIdx.x;
    int iv = blockIdx.y * blockDim.y + threadIdx.y;

    //make sure that the thread indices are in-bounds
    if(iu >= r0 || iv >= r1) return;

    //compute the index into the field
    int i = iv*r0 + iu;

    //calculate and store the result
    R[i] = X[i] * Y[i];

}

template<typename P, unsigned int N = 1, bool positive = false>
class realfield{

	P* X[N];		//an array of N gpu pointers for each field component
	int R[2];		//resolution of the slice
	rect<P> shape;

	void process_filename(std::string name, std::string &prefix, std::string &postfix, 
                          std::string &ext, unsigned int &digits)
    {
        std::stringstream ss(name);
        std::string item;
        std::vector<std::string> elems;
        while(std::getline(ss, item, '.'))      //split the string at the '.' character (filename and extension)
        {
            elems.push_back(item);
        }
        
        prefix = elems[0];                      //prefix contains the filename (with wildcard '?' characters)
        ext = elems[1];                         //file extension (ex. .bmp, .png)
        ext = std::string(".") + ext;           //add a period back into the extension

        size_t i0 = prefix.find_first_of("?");  //find the positions of the first and last wildcard ('?'')
        size_t i1 = prefix.find_last_of("?");

        postfix = prefix.substr(i1+1);
        prefix = prefix.substr(0, i0);

        digits = i1 - i0 + 1;                   //compute the number of wildcards
    }

	void init()
	{
		for(unsigned int n=0; n<N; n++)
			X[n] = NULL;
	}
	void destroy()
	{
		for(unsigned int n=0; n<N; n++)
			if(X[n] != NULL)
				HANDLE_ERROR(cudaFree(X[n]));
	}

public:
	//field constructor
	realfield()
	{
		R[0] = R[1] = 0;
		init();
		//std::cout<<"realfield CONSTRUCTOR"<<std::endl;
	}
	realfield(unsigned int x, unsigned int y)
	{
        //set the resolution
        R[0] = x;
        R[1] = y;
		//allocate memory on the GPU
		for(unsigned int n=0; n<N; n++)
		{
			HANDLE_ERROR(cudaMalloc( (void**)&X[n], sizeof(P) * R[0] * R[1] ));
		}
		//shape = rect<P>(vec<P>(-1, -1, 0), vec<P>(-1, 1, 0), vec<P>(1, 1, 0));	//default geometry
		clear();		//zero the field
		//std::cout<<"realfield CONSTRUCTOR"<<std::endl;
    }

	~realfield()
    {
		destroy();
		//std::cout<<"realfield DESTRUCTOR"<<std::endl;
    }

	P* ptr(unsigned int n = 0)
	{
		if(n < N)
			return X[n];
		else return NULL;
	}

	//set all components of the field to zero
	void clear()
    {
		for(unsigned int n=0; n<N; n++)
			if(X[n] != NULL)
				HANDLE_ERROR(cudaMemset(X[n], 0, sizeof(P) * R[0] * R[1]));
    }

	void toImage(std::string filename, unsigned int n, P vmin, P vmax, stim::colormapType cmap = stim::cmBrewer)
    {
		stim::gpu2image<P>(X[n], filename, R[0], R[1], vmin, vmax, cmap);
    }

	void toImages(std::string filename, bool global_max = true, stim::colormapType cmap = stim::cmBrewer)
	{
        std::string prefix, postfix, extension;
        unsigned int digits;
        process_filename(filename, prefix, postfix, extension, digits);      //process the filename for wild cards

        cublasStatus_t stat;
        cublasHandle_t handle;

        //create a CUBLAS handle
        stat = cublasCreate(&handle);
        if(stat != CUBLAS_STATUS_SUCCESS)
        {
            std::cout<<"CUBLAS Error: initialization failed"<<std::endl;
            exit(1);
        }

        int L = R[0] * R[1];    //compute the number of discrete points in a slice
        int result;             //result of the max operation

        P maxVal[N];            //array stores minimum and maximum values
        P maxAll = 0;           //largest value in the data set

        //compute the maximum value for each vector component
        for(int n=0; n<N; n++)
        {
            if(sizeof(P) == 4)
                stat = cublasIsamax(handle, L, (const float*)X[n], 1, &result);
            else
                stat = cublasIdamax(handle, L, (const double*)X[n], 1, &result);

            result -= 1;        //adjust for 1-based indexing

            if(stat != CUBLAS_STATUS_SUCCESS)   //if there was a GPU error, terminate
            {
                std::cout<<"CUBLAS Error: failure finding maximum value."<<std::endl;
                exit(1);
            }

            //retrieve the maximum value for this slice and store it in the maxVal array
            HANDLE_ERROR(cudaMemcpy(&maxVal[n], X[n] + result, sizeof(P), cudaMemcpyDeviceToHost));
            if(abs(maxVal[n]) > maxAll)          //if maxVal is larger, update the maxAll variable
                maxAll = maxVal[n];

        }
        
        cublasDestroy(handle);  //destroy the CUBLAS handle

        P outputMax = abs(maxAll);			//maximum value used for each output image
		for(int n=0; n<N; n++)          //for each image
		{
			if(!global_max) outputMax = maxVal[n];	//calculate the maximum value for this image

			stringstream ss;            //assemble the file name
			ss<<prefix<<std::setfill('0')<<std::setw(digits)<<n<<postfix<<extension;
			std::cout<<ss.str()<<std::endl;
			if(positive)                //if the image is positive
				toImage(ss.str(), n, 0, abs(outputMax), cmap);         //save the image using the global maximum
			else
				toImage(ss.str(), n, -abs(outputMax), abs(outputMax), cmap);   //save the image using the global maximum
		}
	}

	//assignment operator
	realfield & operator= (const realfield & rhs)
    {
        //de-allocate any existing GPU memory
        destroy();

        //copy the slice resolution
        R[0] = rhs.R[0];
        R[1] = rhs.R[1];

		for(unsigned int n=0; n<N; n++)
		{
			//allocate the necessary memory
			HANDLE_ERROR(cudaMalloc(&X[n], sizeof(P) * R[0] * R[1]));
			//copy the slice
			HANDLE_ERROR(cudaMemcpy(X[n], rhs.X[n], sizeof(P) * R[0] * R[1], cudaMemcpyDeviceToDevice));
		}
        //std::cout<<"Assignment operator."<<std::endl;

        return *this;
    }

    //multiply two fields (element-wise multiplication)
    realfield<P, N, positive> operator* (const realfield & rhs){

    	int maxThreads = stim::maxThreadsPerBlock(); //compute the optimal block size
        int SQRT_BLOCK = (int)std::sqrt((float)maxThreads);

        //create one thread for each detector pixel
        dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
        dim3 dimGrid((R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);

        //create a scalar field to store the result
        realfield<P, N, positive> result(R[0], R[1]);

        for(int n=0; n<N; n++)
        	stim::gpu_realfield_multiply <<<dimGrid, dimBlock>>> (result.X[n], X[n], rhs.X[n], R[0], R[1]);

        return result;
    }

	///copy constructor
	realfield(const realfield &rhs)
	{
		//first make a shallow copy
		R[0] = rhs.R[0];
		R[1] = rhs.R[1];

		for(unsigned int n=0; n<N; n++)
		{
			//do we have to make a deep copy?
			if(rhs.X[n] == NULL)
				X[n] = NULL;		//no
			else
			{
				//allocate the necessary memory
				HANDLE_ERROR(cudaMalloc(&X[n], sizeof(P) * R[0] * R[1]));

				//copy the slice
				HANDLE_ERROR(cudaMemcpy(X[n], rhs.X[n], sizeof(P) * R[0] * R[1], cudaMemcpyDeviceToDevice));
			}
		}


	}




};


}	//end namespace rts


#endif