stim/math/field.cuh

#ifndef RTS_FIELD_CUH
#define RTS_FIELD_CUH

#include <vector>
#include <string>
#include <sstream>

#include "cublas_v2.h"
#include <cuda_runtime.h>

#include "../math/rect.h"
#include "../cuda/threads.h"
#include "../cuda/error.h"
#include "../cuda/devices.h"
#include "../visualization/colormap.h"


namespace stim{

//multiply R = X * Y
template<typename T>
__global__ void gpu_field_multiply(T* R, T* X, T* Y, unsigned int r0, unsigned int r1){

	int iu = blockIdx.x * blockDim.x + threadIdx.x;
    int iv = blockIdx.y * blockDim.y + threadIdx.y;

    //make sure that the thread indices are in-bounds
    if(iu >= r0 || iv >= r1) return;

    //compute the index into the field
    int i = iv*r0 + iu;

    //calculate and store the result
    R[i] = X[i] * Y[i];
}

//assign a constant value to all points
template<typename T>
__global__ void gpu_field_assign(T* ptr, T val, unsigned int r0, unsigned int r1){

	int iu = blockIdx.x * blockDim.x + threadIdx.x;
	int iv = blockIdx.y * blockDim.y + threadIdx.y;

	//make sure that the thread indices are in-bounds
	if(iu >= r0 || iv >= r1) return;

	//compute the index into the field
	int i = iv*r0 + iu;

	//calculate and store the result
	ptr[i] = val;
}

//crop the field to the new dimensions (width x height)
template<typename T>
__global__ void gpu_field_crop(T* dest, T* source, 
								unsigned int r0, unsigned int r1, 
								unsigned int width, unsigned int height){

	int iu = blockIdx.x * blockDim.x + threadIdx.x;
    int iv = blockIdx.y * blockDim.y + threadIdx.y;

    //make sure that the thread indices are in-bounds
    if(iu >= width || iv >= height) return;

    //compute the index into the field
    int is = iv*r0 + iu;
    int id = iv*width + iu;

    //calculate and store the result
    dest[id] = source[is];
}

template<typename T, unsigned int D = 1>
class field{

protected:

	T* X[D];			//pointer to the field data
	unsigned int R[2];	//field resolution
	stim::rect<T> shape;		//position and shape of the field slice

	//calculates the optimal block and grid sizes using information from the GPU
	void cuda_params(dim3& grids, dim3& blocks){
		int maxThreads = stim::maxThreadsPerBlock(); //compute the optimal block size
		int SQRT_BLOCK = (int)std::sqrt((float)maxThreads);

		//create one thread for each detector pixel
		blocks = dim3(SQRT_BLOCK, SQRT_BLOCK);
		grids = dim3((R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
	}

	//find the maximum value of component n
	T find_max(unsigned int n){
		cublasStatus_t stat;
		cublasHandle_t handle;

		//create a CUBLAS handle
		stat = cublasCreate(&handle);
		if(stat != CUBLAS_STATUS_SUCCESS){
			std::cout<<"CUBLAS Error: initialization failed"<<std::endl;
			exit(1);
		}

		int L = R[0] * R[1];    //compute the number of discrete points in a slice
		int index;				//result of the max operation
		T result;

		if(sizeof(T) == 4)
			stat = cublasIsamax(handle, L, (const float*)X[n], 1, &index);
		else
			stat = cublasIdamax(handle, L, (const double*)X[n], 1, &index);

		index -= 1;        //adjust for 1-based indexing

		//if there was a GPU error, terminate
		if(stat != CUBLAS_STATUS_SUCCESS){
			std::cout<<"CUBLAS Error: failure finding maximum value."<<std::endl;
			exit(1);
		}

		//retrieve the maximum value for this slice and store it in the maxVal array
		HANDLE_ERROR(cudaMemcpy(&result, X[n] + index, sizeof(T), cudaMemcpyDeviceToHost));
		return result;
	}

public:

	//returns a list of file names given an input string with wild cards
	std::vector<std::string> process_filename(std::string name){
		std::stringstream ss(name);
		std::string item;
		std::vector<std::string> elems;
		while(std::getline(ss, item, '.'))      //split the string at the '.' character (filename and extension)
		{
		    elems.push_back(item);
		}

		std::string prefix = elems[0];                      //prefix contains the filename (with wildcard '?' characters)
		std::string ext = elems[1];                         //file extension (ex. .bmp, .png)
		ext = std::string(".") + ext;           //add a period back into the extension

		size_t i0 = prefix.find_first_of("?");  //find the positions of the first and last wildcard ('?'')
		size_t i1 = prefix.find_last_of("?");

		std::string postfix = prefix.substr(i1+1);
		prefix = prefix.substr(0, i0);

		unsigned int digits = i1 - i0 + 1;                   //compute the number of wildcards

		std::vector<std::string> flist;			//create a vector of file names
		//fill the list
		for(unsigned int d=0; d<D; d++){
			std::stringstream ss;            //assemble the file name
			ss<<prefix<<std::setfill('0')<<std::setw(digits)<<d<<postfix<<ext;
			flist.push_back(ss.str());
		}

		return flist;
	}

	void init(){
		for(unsigned int n=0; n<D; n++)
			X[n] = NULL;
	}
	void destroy(){
		for(unsigned int n=0; n<D; n++)
			if(X[n] != NULL)
				HANDLE_ERROR(cudaFree(X[n]));
	}

public:
	//field constructor
	field(){
		R[0] = R[1] = 0;
		init();
	}

	field(unsigned int x, unsigned int y){
        //set the resolution
        R[0] = x;
        R[1] = y;
		//allocate memory on the GPU
		for(unsigned int n=0; n<D; n++){
			HANDLE_ERROR(cudaMalloc( (void**)&X[n], sizeof(T) * R[0] * R[1] ));
		}
		clear();		//zero the field
    }

    ///copy constructor
	field(const field &rhs){
		//first make a shallow copy
		R[0] = rhs.R[0];
		R[1] = rhs.R[1];

		for(unsigned int n=0; n<D; n++){
			//do we have to make a deep copy?
			if(rhs.X[n] == NULL)
				X[n] = NULL;		//no
			else{
				//allocate the necessary memory
				HANDLE_ERROR(cudaMalloc(&X[n], sizeof(T) * R[0] * R[1]));

				//copy the slice
				HANDLE_ERROR(cudaMemcpy(X[n], rhs.X[n], sizeof(T) * R[0] * R[1], cudaMemcpyDeviceToDevice));
			}
		}
	}

	~field(){
		destroy();
    }

    //assignment operator
	field & operator= (const field & rhs){

        //de-allocate any existing GPU memory
        destroy();

        //copy the slice resolution
        R[0] = rhs.R[0];
        R[1] = rhs.R[1];

		for(unsigned int n=0; n<D; n++)
		{
			//allocate the necessary memory
			HANDLE_ERROR(cudaMalloc(&X[n], sizeof(T) * R[0] * R[1]));
			//copy the slice
			HANDLE_ERROR(cudaMemcpy(X[n], rhs.X[n], sizeof(T) * R[0] * R[1], cudaMemcpyDeviceToDevice));
		}
        return *this;
    }

    field & operator= (const T rhs){

    	int maxThreads = stim::maxThreadsPerBlock(); //compute the optimal block size
        int SQRT_BLOCK = (int)std::sqrt((float)maxThreads);

        //create one thread for each detector pixel
        dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
        dim3 dimGrid((R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);

        //assign the constant value to all positions and dimensions
        for(int n=0; n<D; n++)
        	stim::gpu_field_assign <<<dimGrid, dimBlock>>> (X[n], rhs, R[0], R[1]);

        return *this;
    }

    //assignment of vector component
    field & operator= (const vec<T, D> rhs){

    	int maxThreads = stim::maxThreadsPerBlock(); //compute the optimal block size
        int SQRT_BLOCK = (int)std::sqrt((float)maxThreads);

        //create one thread for each detector pixel
        dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
        dim3 dimGrid((R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);

        //assign the constant value to all positions and dimensions
        for(unsigned int n=0; n<D; n++)
        	stim::gpu_field_assign <<<dimGrid, dimBlock>>> (X[n], rhs.v[n], R[0], R[1]);

        return *this;

    }

    //multiply two fields (element-wise multiplication)
    field<T, D> operator* (const field & rhs){

    	int maxThreads = stim::maxThreadsPerBlock(); //compute the optimal block size
        int SQRT_BLOCK = (int)std::sqrt((float)maxThreads);

        //create one thread for each detector pixel
        dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
        dim3 dimGrid((R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);

        //create a scalar field to store the result
        field<T, D> result(R[0], R[1]);

        for(int n=0; n<D; n++)
        	stim::gpu_field_multiply <<<dimGrid, dimBlock>>> (result.X[n], X[n], rhs.X[n], R[0], R[1]);

        return result;
    }

	T* ptr(unsigned int n = 0){
		if(n < D)
			return X[n];
		else return NULL;
	}

	//return the vector component at position (u, v)
	vec<T, D> get(unsigned int u, unsigned int v){

		vec<T, D> result;
		for(unsigned int d=0; d<D; d++){
			HANDLE_ERROR(cudaMemcpy(&result[d], X[d] + v*R[0] + u, sizeof(T), cudaMemcpyDeviceToHost));
		}

		return result;
	}

	//set all components of the field to zero
	void clear(){
		for(unsigned int n=0; n<D; n++)
			if(X[n] != NULL)
				HANDLE_ERROR(cudaMemset(X[n], 0, sizeof(T) * R[0] * R[1]));
    }

    //crop the field
    field<T, D> crop(unsigned int width, unsigned int height){
    	int maxThreads = stim::maxThreadsPerBlock(); //compute the optimal block size
        int SQRT_BLOCK = (int)std::sqrt((float)maxThreads);

        //create one thread for each detector pixel
        dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
        dim3 dimGrid((width + SQRT_BLOCK -1)/SQRT_BLOCK, (height + SQRT_BLOCK - 1)/SQRT_BLOCK);

        //create a scalar field to store the result
        field<T, D> result(width, height);

        for(int n=0; n<D; n++)
        	stim::gpu_field_crop <<<dimGrid, dimBlock>>> (result.X[n], X[n], R[0], R[1], width, height);

        return result;
    }

    //save an image representing component n
    void toImage(std::string filename, unsigned int n = 0,
    			 bool positive = false, stim::colormapType cmap = stim::cmBrewer){
    	T max_val = find_max(n);	//find the maximum value

    	if(positive)				//if the field is positive, use the range [0 max_val]
    		stim::gpu2image<T>(X[n], filename, R[0], R[1], 0, max_val, cmap);
    	else
    		stim::gpu2image<T>(X[n], filename, R[0], R[1], -max_val, max_val, cmap);
    }

};

}		//end namespace rts
#endif