// spider_cost.cuh 5.05 KB
#ifndef STIM_SPIDER_COST_H
#define STIM_SPIDER_COST_H

#include <assert.h>
#include <cuda.h>
//#include <cuda_runtime.h>
#include <stdio.h>
#include <stim/visualization/colormap.h>
#include <sstream>
#include <stim/math/vector.h>
#include <stim/cuda/cudatools/timer.h>
#include <stim/cuda/cudatools/devices.h>
#include <stim/cuda/cudatools/threads.h>
#include <stim/cuda/cuda_texture.cuh>
namespace stim{
	namespace cuda
	{
	
	//Texture wrapper shared by the functions below: get_cost() maps the OpenGL
	//texture through it, passes t.getTexture() to the kernel and unmaps it again.
	//NOTE(review): this is a non-inline variable definition in a header, so the
	//header can presumably only be included from one translation unit -- confirm.
	stim::cuda::cuda_texture t;  //texture object.
	//Device buffer allocated by initArray() and freed by cleanUP(); get_cost()
	//passes it to get_diff as the output array and copies it back to the host.
	float* result;
//	float* print;
	
	///Initialization function: allocates the device buffer that receives the
	///per-block results of get_diff.
	///The allocation is stored in the namespace-level pointer `result` and has
	///to be released with cleanUP().
	///@param DIM_Y			--number of floats to allocate.
	void initArray(int DIM_Y)
	{
		//Check the allocation through the same macro get_cost uses for its
		//copy-back: an unchecked failure would leave `result` holding a stale
		//or uninitialized pointer that the kernel would then write through.
		HANDLE_ERROR(
			cudaMalloc( (void**) &result, DIM_Y*sizeof(float))
			);
	}

	///Deinit function: frees the device buffer allocated by initArray().
	///The texture itself is not touched here; get_cost releases it separately
	///through t.UnmapCudaTexture().
	void cleanUP()
	{
		//Report a failing free instead of dropping the error code.
		HANDLE_ERROR( cudaFree(result) );
		//Forget the freed address so that an extra cleanUP() call degrades to
		//cudaFree(NULL), which is a no-op, rather than a double free.
		result = NULL;
	}

	///Procedural stand-in for a uniform, banded template image.
	///The band edges sit at max_x/6, max_x*2/6, max_x*4/6 and max_x*5/6 (all
	///integer divisions). A pixel is "on" when it lies strictly before the first
	///edge, strictly after the last edge, or strictly between the two middle edges.
	///@param x			--location of the pixel.
	///@param max_x		--extent from which the band edges are derived.
	///@return			1.0 when the template pixel is on, 0.0 otherwise.
	__device__
	float Template(int x, int max_x)
	{
		//leading band
		if(x < max_x/6)
			return 1.0;

		//trailing band
		if(x > max_x*5/6)
			return 1.0;

		//everything up to and including the lower middle edge is off
		if(x <= max_x*2/6)
			return 0.0;

		//central band ends strictly before the upper middle edge
		return (x < max_x*4/6) ? 1.0 : 0.0;
	}
		
	///Sum of absolute differences between a tile of the input texture and the
	///template, computed with one reduction per thread block.
	///
	///Every thread reads one texel (scaled by 1/255), subtracts the template
	///value for its column and stores the absolute difference in shared memory.
	///The block then sums each row along x and finally sums column 0 along y;
	///thread (0,0) writes the block total to result[blockIdx.y].
	///
	///Launch requirements visible from the indexing below:
	/// - dynamic shared memory must hold at least dx*blockDim.y floats, and
	///   blockDim.x must not exceed dx (dx is used as the row stride);
	/// - result must have at least gridDim.y elements;
	/// - blocks that differ only in blockIdx.x write to the same result slot,
	///   so the kernel is meant to be launched with gridDim.x == 1.
	///Block dimensions do not need to be powers of two.
	///@param texIn			--texture object the samples are fetched from.
	///@param result		--device pointer receiving one sum per blockIdx.y.
	///@param dx			--row stride of the shared tile and width handed to Template().
	///@param dy			--unused by the kernel; kept so existing launches keep compiling.
	__global__
	void get_diff (cudaTextureObject_t texIn, float *result, int dx, int dy)
	{
		extern __shared__ float shared[];

		const int x_t = threadIdx.x;
		const int y_t = threadIdx.y;
		const int x   = x_t + blockIdx.x * blockDim.x;
		const int y   = y_t + blockIdx.y * blockDim.y;
		const int idx = y_t*dx + x_t;

		//float literal and fabsf keep the arithmetic in single precision.
		const float valIn   = tex2D<unsigned char>(texIn, x, y)/255.0f;
		const float valTemp = Template(x, dx);

		shared[idx] = fabsf(valIn - valTemp);
		__syncthreads();

		//Row reduction. `n` is the number of live partial sums per row; each
		//pass folds the upper floor(n/2) entries onto the lower ones and keeps
		//ceil(n/2). For an odd n the middle entry is simply carried over, which
		//is what a plain blockDim.x/2 halving would lose.
		for(unsigned int n = blockDim.x; n > 1; )
		{
			const unsigned int half = (n + 1)/2;
			if(threadIdx.x + half < n)
				shared[idx] += shared[idx + (int)half];
			//the loop bounds depend on blockDim only, so every thread of the
			//block reaches this barrier the same number of times.
			__syncthreads();
			n = half;
		}

		//Column reduction with the same scheme; only column 0 is read at the
		//end, the other columns are reduced as well but their sums are unused.
		for(unsigned int n = blockDim.y; n > 1; )
		{
			const unsigned int half = (n + 1)/2;
			if(threadIdx.y + half < n)
				shared[idx] += shared[idx + (int)half*dx];
			__syncthreads();
			n = half;
		}

		if(x_t == 0 && y_t == 0)
			result[blockIdx.y] = shared[0];
	}


	///External access-point to the cuda function.
	///Maps the GL texture for CUDA, launches get_diff with DIM_Y blocks of
	///dx*dy threads, copies the DIM_Y block sums back and picks the smallest.
	///@param GLint texbufferID 	--GL texture (must be contained in a framebuffer object)
	///				  that holds the data that will be handed to cuda.
	///@param GLenum texType	--either GL_TEXTURE_1D, GL_TEXTURE_2D or GL_TEXTURE_3D
	///				  may work with other gl texture types, but untested.
	///@param DIM_Y			--the number of samples in the template (one thread block each).
	///@param dx			--threads per block along x.
	///@param dy			--threads per block along y.
	///@return			ret[0] is the index of the sample with the lowest cost,
	///				ret[1] is that cost truncated to int. (0, 0) is returned
	///				when DIM_Y <= 0 or the host buffer cannot be allocated.
	extern "C"
	stim::vec<int> get_cost(GLint texbufferID, GLenum texType, int DIM_Y,int dx = 16, int dy = 8)
	{
		stim::vec<int> ret(0, 0);

		//Without samples there is no minimum to look for; bail out before any
		//resource is acquired instead of reading output[0] of an empty buffer.
		if(DIM_Y <= 0)
			return ret;

		//host-side copy of the per-sample costs. Allocated before the texture
		//is mapped so that a failure here leaves nothing to undo.
		float* output = (float* ) malloc(DIM_Y*sizeof(float));
		if(output == NULL)
		{
			fprintf(stderr, "get_cost: unable to allocate a host buffer for %d costs\n", DIM_Y);
			return ret;
		}

		//Bind the Texture in GL and allow access to cuda.
		t.MapCudaTexture(texbufferID, texType);

		//allocate the device-side result array.
		initArray(DIM_Y);

		//cuda launch variables: one block per sample, stacked along y.
		dim3 numBlocks(1, DIM_Y);
		dim3 threadsPerBlock(dx, dy);

		get_diff <<< numBlocks, threadsPerBlock, dx*dy*sizeof(float) >>> (t.getTexture(), result, dx, dy);
		//a bad launch configuration (e.g. too many threads per block) is only
		//reported through cudaGetLastError, faults inside the kernel surface at
		//the synchronization.
		HANDLE_ERROR( cudaGetLastError() );
		HANDLE_ERROR( cudaDeviceSynchronize() );

		HANDLE_ERROR(
			cudaMemcpy(output, result, DIM_Y*sizeof(float), cudaMemcpyDeviceToHost)
			);

		//find the first sample with the minimum cost.
		float mini = 10000000000000000.0;
		int idx = 0;
		for( int i = 0; i<DIM_Y; i++){
			if(output[i] < mini){
				mini = output[i];
				idx = i;
			}
		}

		//hand the texture back to GL and release the device buffer.
		t.UnmapCudaTexture();
		cleanUP();

		ret[0] = idx; ret[1] = (int) output[idx];
		free(output);
		return ret;
	}

	}
}


#endif