// cost.h 5.29 KB
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>
#include "../visualization/colormap.h"
#include <sstream>
#include "../math/vector.h"
#include "../cuda/devices.h"
#include "../cuda/threads.h"

///Cost function that works with the gl-spider class to find index of the item with min-cost.
typedef unsigned char uchar;
///2D texture reference of unsigned bytes, read back as raw element values;
///bound to srcArray in initArray and sampled by get_diff.
texture<uchar, cudaTextureType2D, cudaReadModeElementType> texIn;
///Device buffer that get_diff writes one cost per block into;
///allocated in initArray and freed in cleanUP.
float *result;
///Array obtained from the mapped graphics resource in initArray.
cudaArray* srcArray;
///Not referenced anywhere in the visible part of this file.
bool testing = false;
/*
struct SharedMemory
{
	__device__ inline operator float* ()
	{
		extern __shared__ float __smem[];
		return (float *)__smem;
	}
	
	__device__ inline operator const float* () const
	{
		extern __shared__ float __smem[];
		return (float *)__smem;
	}
};*/

///Terminates the program if the CUDA runtime reports a pending error.
///Prints the caller-supplied label followed by the runtime's description
///of the error to stderr, then exits with status 1.
///@param msg, label printed in front of the CUDA error description.
inline void checkCUDAerrors(const char *msg)
{
	const cudaError_t status = cudaGetLastError();

	//Nothing pending: the common case, return straight away.
	if (status == cudaSuccess)
		return;

	fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status) );
	exit(1);
}

///A virtual representation of a uniform template.
///Returns the value of the template pixel: 1.0 inside the two outer bands
///and the centre band of a 16-pixel row, 0.0 everywhere else.
///@param x, location of a pixel.
__device__ float Template(int x)
{
	//Band limits rely on integer division and evaluate to 2, 5, 10 and 13.
	const int outerLow  = 16/6;
	const int innerLow  = 16*2/6;
	const int innerHigh = 16*4/6;
	const int outerHigh = 16*5/6;

	const bool inOuterBand  = (x < outerLow) || (x > outerHigh);
	const bool inCentreBand = (x > innerLow) && (x < innerHigh);

	return (inOuterBand || inCentreBand) ? 1.0f : 0.0f;
}

///Find the difference of the given set of samples and the template
///using cuda acceleration.
///
///Every thread reads one texel of texIn, scales it to [0,1], and stores its
///absolute difference to the template in shared memory. The block then sums
///the tile with a tree reduction (first along x, then along y) and thread
///(0,0) writes the total to result[blockIdx.y].
///
///Launch requirements: blockDim must be (16, 8) because the shared tile is
///sized for exactly that shape, and the halving reduction needs power-of-two
///extents. One float of result is written per block row, so result must hold
///at least gridDim.y floats.
///@param *result, a pointer to the memory that stores the result.
__global__
void get_diff (float *result)
{
	//Per-block scratch tile, indexed [threadIdx.x][threadIdx.y].
	__shared__ float shared[16][8];

	const int x = threadIdx.x + blockIdx.x * blockDim.x;
	const int y = threadIdx.y + blockIdx.y * blockDim.y;
	//Unsigned so comparisons against the unsigned step below are sign-safe.
	const unsigned int x_t = threadIdx.x;
	const unsigned int y_t = threadIdx.y;
	const int g_idx = blockIdx.y;

	//Single-precision literal keeps the divide out of double arithmetic.
	const float valIn	= tex2D(texIn, x, y)/255.0f;
	const float valTemp	= Template(x);
	shared[x_t][y_t]	= fabsf(valIn-valTemp);

	//All differences must be in shared memory before any thread sums them.
	__syncthreads();

	//Fold the tile along x: afterwards shared[0][y_t] holds the sum of row y_t.
	for(unsigned int step = blockDim.x/2; step >= 1; step >>= 1)
	{
		if (x_t < step)
		{
			shared[x_t][y_t] += shared[x_t + step][y_t];
		}
		//One barrier per step: reads hit [step, 2*step), writes hit [0, step).
		__syncthreads();
	}

	//Fold along y: afterwards shared[0][0] holds the sum of the whole tile.
	for(unsigned int step = blockDim.y/2; step >= 1; step >>= 1)
	{
		if(y_t < step)
		{
			shared[x_t][y_t] += shared[x_t][y_t + step];
		}
		__syncthreads();
	}

	if(x_t == 0 && y_t == 0)
		result[g_idx] = shared[0][0];
}



///Initialization function: maps the shared graphics resource, binds the
///array behind it to the input texture, and allocates the device buffer
///that receives the results.
///@param src, cudaGraphicsResource that handles the shared OpenGL/Cuda Texture
///@param DIM_Y, integer controlling how much memory to allocate (in floats).
void initArray(cudaGraphicsResource_t src, int DIM_Y)
{
	//Map the resource for Cuda and fetch the array that backs it.
	HANDLE_ERROR( cudaGraphicsMapResources(1, &src) );
	HANDLE_ERROR( cudaGraphicsSubResourceGetMappedArray(&srcArray, src, 0, 0) );

	//Make that array readable from kernels through the texture reference.
	HANDLE_ERROR( cudaBindTextureToArray(texIn, srcArray) );

	//Result buffer; an allocation failure is picked up by the last-error check.
	const size_t resultBytes = DIM_Y*sizeof(float);
	cudaMalloc( (void**) &result, resultBytes );
	checkCUDAerrors("Memory Allocation Issue 1");
}
///Deinit function that frees the memory used and releases the texture resource
///back to OpenGL. Teardown runs in the reverse order of initArray: the result
///buffer is freed, the texture is unbound, and only then is the resource unmapped.
///@param src, cudaGraphicsResource that handles the shared OpenGL/Cuda Texture
void cleanUP(cudaGraphicsResource_t src)
{
	HANDLE_ERROR(
		cudaFree(result)
	);
	//Clear the global so the stale device pointer cannot be reused or freed twice.
	result = NULL;

	//Detach the texture while the array it was bound to is still mapped...
	HANDLE_ERROR(
		cudaUnbindTexture(texIn)
	);
	//...and only afterwards hand the resource back to OpenGL.
	HANDLE_ERROR(
		cudaGraphicsUnmapResources(1,&src)
	);
}



///External access-point to the cuda function.
///Maps the shared texture, launches one 16x8 block per sample to compute its
///cost against the template, copies the costs back to the host, and picks the
///sample with the lowest cost.
///@param src, cudaGraphicsResource that handles the shared OpenGL/Cuda Texture
///@param DIM_Y, the number of samples in the template.
///@return a two-element vector: [0] is the index of the minimum-cost sample and
///        [1] is that cost truncated to an int. Returns (0, 0) when DIM_Y <= 0.
extern "C"
stim::vec<int> get_cost(cudaGraphicsResource_t src, int DIM_Y)
{
	stim::vec<int> ret(0, 0);

	//Nothing to evaluate: a grid with zero rows is an invalid launch and
	//there would be no cost to read back afterwards.
	if(DIM_Y <= 0)
		return ret;

	const size_t bytes = DIM_Y*sizeof(float);
	float *output = (float* ) malloc(bytes);
	if(output == NULL){
		fprintf(stderr, "get_cost: unable to allocate %lu bytes of host memory.\n", (unsigned long) bytes);
		exit(1);
	}

	//NOTE(review): the device buffer is sized for DIM_Y*8 floats although
	//get_diff writes, and this function reads back, only DIM_Y of them.
	initArray(src, DIM_Y*8);
	dim3 numBlocks(1, DIM_Y);
	dim3 threadsPerBlock(16, 8);

	get_diff <<< numBlocks, threadsPerBlock >>> (result);
	//A launch does not return a status; a bad configuration only shows up here.
	checkCUDAerrors("get_diff kernel launch");

	//Blocking copy: also waits for the kernel to finish.
	HANDLE_ERROR(
		cudaMemcpy(output, result, bytes, cudaMemcpyDeviceToHost)
	);

	//Linear scan for the smallest cost; the first minimum wins on ties.
	float mini = 10000000000000000.0f;
	int idx = 0;
	for( int i = 0; i<DIM_Y; i++){
		if(output[i] < mini){
			mini = output[i];
			idx = i;
		}
	}

	cleanUP(src);
	ret[0] = idx; ret[1] = (int) output[idx];
	free(output);
	return ret;
}