// cost.h 4.72 KB
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>
#include <stim/visualization/colormap.h>
#include <sstream>
#include <stim/math/vector.h>
#include <stim/cuda/devices.h>
#include <stim/cuda/threads.h>

///Cost function that works with the gl-spider class to find the index of the item with the minimum cost.
typedef unsigned char uchar;
///2D texture reference of unsigned bytes, read without normalization.
///initArray binds it to srcArray and get_diff samples it with tex2D.
texture<uchar, cudaTextureType2D, cudaReadModeElementType> texIn;
///Device buffer allocated in initArray and freed in cleanUP; get_cost passes it
///to get_diff as the output and then to get_sum one sample at a time.
float *result;
///Device scratch buffer of 20*10 floats (allocated in initArray) that get_sum
///fills with one sample before summing it.
float* v_dif;
///CUDA array retrieved from the mapped graphics resource in initArray.
cudaArray* srcArray;
///Not referenced by any code visible in this file; presumably a debugging
///switch used elsewhere -- TODO confirm.
bool testing = false;

///Queries the most recent CUDA runtime error and aborts the program if there is one.
///On failure the message and the CUDA error string are written to stderr and
///the process exits with status 1; on success the function simply returns.
///@param *msg, text that identifies the call site in the error report.
inline void checkCUDAerrors(const char *msg)
{
	const cudaError_t status = cudaGetLastError();

	//Nothing pending: carry on.
	if (status == cudaSuccess)
		return;

	fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status) );
	exit(1);
}

///Finds the sum of the absolute values of all the pixels in a given template element.
///The 20*10 values at diff are first copied into the device scratch buffer
///v_dif (which must already be allocated, see initArray) and then reduced
///with cublasSasum. Any CUDA or cuBLAS failure is reported on stderr and
///terminates the program with status 1.
///@param *diff, a pointer to the memory block that holds the pixel-differences.
///@return the sum of absolute values of the 20*10 elements starting at diff.
float get_sum(float *diff)
{
	const int n = 20*10;		//number of pixels in one template element

	cublasHandle_t handle;
	cublasStatus_t ret = cublasCreate(&handle);
	if (ret != CUBLAS_STATUS_SUCCESS){
		fprintf(stderr, "cuBLAS error: cublasCreate failed with status %d.\n", (int) ret);
		exit(1);
	}

	//cublasSetVector is specified as a host-to-device copy, but get_cost hands
	//this function a device pointer. cudaMemcpyDefault lets the runtime infer
	//the direction from the pointer itself, so both kinds of source are valid.
	cudaError_t err = cudaMemcpy(v_dif, diff, n*sizeof(*diff), cudaMemcpyDefault);
	if (err != cudaSuccess){
		fprintf(stderr, "Cuda error: %s: %s.\n", "get_sum copy", cudaGetErrorString(err) );
		exit(1);
	}

	float out = 0.0f;
	ret = cublasSasum(handle, n, v_dif, 1, &out);
	if (ret != CUBLAS_STATUS_SUCCESS){
		fprintf(stderr, "cuBLAS error: cublasSasum failed with status %d.\n", (int) ret);
		exit(1);
	}

	cublasDestroy(handle);
	return out;
}

///A virtual representation of a uniform template.
///The template is 20 pixels wide and equals 1 in three bands
///(x in 0..2, 7..12 and 17..19) and 0 in between. Values of x outside
///0..19 satisfy one of the outer-band comparisons and therefore yield 1.
///Callable from both host and device code, so the template can also be
///evaluated on the CPU when validating results.
///@param x, location of a pixel.
///@return 1.0f if x lies in a band, 0.0f otherwise.
__host__ __device__ float Template(int x)
{
	//Band edges, kept as integer divisions so they evaluate to the same
	//values as before: 20/6 = 3, 20*2/6 = 6, 20*4/6 = 13, 20*5/6 = 16.
	const int left_end	= 20/6;
	const int center_start	= 20*2/6;
	const int center_end	= 20*4/6;
	const int right_start	= 20*5/6;

	const bool in_outer	= (x < left_end) || (x > right_start);
	const bool in_center	= (x > center_start) && (x < center_end);

	return (in_outer || in_center) ? 1.0f : 0.0f;
}

///Find the difference of the given set of samples and the template
///using cuda acceleration.
///Each thread handles one pixel (x, y) of the texture bound to texIn: the
///texel is scaled to [0, 1], compared against Template(x), and the absolute
///difference is stored at result[y*20 + x].
///Launch layout: a 2D grid with one thread per pixel. Threads with x >= 20
///do nothing; there is no check on y, so the grid must not cover more rows
///than result has room for.
///@param *result, a pointer to the memory that stores the result.
__global__
void get_diff (float *result)
{
	const int width = 20;		//pixels per row of the sample image

	int x 	= threadIdx.x + blockIdx.x * blockDim.x;
	int y 	= threadIdx.y + blockIdx.y * blockDim.y;

	//A launch wider than the image would otherwise spill into the next row.
	if (x >= width)
		return;

	//Single-precision literal keeps the division out of double precision.
	float valIn		= tex2D(texIn, x, y)/255.0f;
	float valTemp		= Template(x);
	result[y*width + x]	= fabsf(valIn-valTemp);
}



///Initialization function: maps the shared OpenGL/Cuda texture, binds it to
///texIn and allocates the two device buffers used by the cost computation.
///@param src, cudaGraphicsResource that handles the shared OpenGL/Cuda Texture
///@param DIM_Y, integer controlling how much memory to allocate
///	(result receives 20*DIM_Y floats).
void initArray(cudaGraphicsResource_t src, int DIM_Y)
{
	//Map the resource, fetch its backing array and expose it through texIn.
	HANDLE_ERROR( cudaGraphicsMapResources(1, &src) );
	HANDLE_ERROR( cudaGraphicsSubResourceGetMappedArray(&srcArray, src, 0, 0) );
	HANDLE_ERROR( cudaBindTextureToArray(texIn, srcArray) );

	//Buffer for the per-pixel differences.
	const size_t resultBytes = 20*DIM_Y*sizeof(float);
	cudaMalloc( (void**) &result, resultBytes);
	checkCUDAerrors("Memory Allocation Issue 1");

	//Scratch buffer holding a single 20x10 sample.
	const size_t sampleBytes = 20*10*sizeof(float);
	cudaMalloc( (void**) &v_dif, sampleBytes);
	checkCUDAerrors("Memory Allocation Issue 2");
}
///Deinit function that frees the memory used and releases the texture resource
///back to OpenGL.
///The global pointers are reset to NULL once released, so nothing is left
///dangling and a repeated call does not free the same buffer twice.
///@param src, cudaGraphicsResource that handles the shared OpenGL/Cuda Texture
void cleanUP(cudaGraphicsResource_t src)
{
	HANDLE_ERROR(
		cudaFree(result)
	);
	result = NULL;

	HANDLE_ERROR(
		cudaGraphicsUnmapResources(1,&src)
	);
	//The array belongs to the resource that was just handed back.
	srcArray = NULL;

	HANDLE_ERROR(
		cudaFree(v_dif)
	);
	v_dif = NULL;

	HANDLE_ERROR(
		cudaUnbindTexture(texIn)
	);
}



///External access-point to the cuda function.
///Maps the shared texture, computes the per-pixel difference between every
///sample and the template, sums each 20x10 sample and reports the sample
///with the smallest sum. When several samples tie, the last one wins.
///@param src, cudaGraphicsResource that handles the shared OpenGL/Cuda Texture
///@param DIM_Y, the number of samples in the template.
///@return a two-element vector: [0] is the index of the minimum-cost sample,
///	[1] is that cost truncated to an integer. If DIM_Y <= 0 there is
///	nothing to evaluate and the result is (-1, 0).
extern "C"
stim::vec<int> get_cost(cudaGraphicsResource_t src, int DIM_Y)
{
	stim::vec<int> ret(0, 0);

	//Without samples there is no valid index; bail out before touching the
	//graphics resource or launching an empty grid.
	if (DIM_Y <= 0){
		ret[0] = -1;
		return ret;
	}

	initArray(src, DIM_Y*10);

	//One thread per pixel of the 20 x (DIM_Y*10) image. Both dimensions are
	//even, so 2x2 blocks tile the image exactly.
	dim3 block(2, 2);
	dim3 grid(20/2, DIM_Y*10/2);
	get_diff <<< grid, block >>> (result);
	checkCUDAerrors("get_diff kernel launch");

	//Seed the search with the first sample, then scan the rest.
	int idx = 0;
	float mini = get_sum(result);
	for (int i = 1; i < DIM_Y; i++){
		float cost = get_sum(result+(20*10*i));
		if(cost <= mini){
			mini = cost;
			idx = i;
		}
	}

	cleanUP(src);
	ret[0] = idx; ret[1] = (int) mini;
	return ret;
}