// optics/stim/cuda/spider_cost.cuh

#ifndef STIM_SPIDER_COST_H
#define STIM_SPIDER_COST_H
#include <assert.h>
#include <cuda.h>
//#include <cuda_runtime.h>
#include <stdio.h>
#include <stim/visualization/colormap.h>
#include <sstream>
#include <stim/math/vector.h>
#include <stim/cuda/cudatools/devices.h>
#include <stim/cuda/cudatools/threads.h>
#include <stim/cuda/cuda_texture.cuh>
namespace stim{
	namespace cuda
	{
	
	// File-level state shared by the host functions in this header.
	// t      : texture wrapper; get_cost() maps the GL texture into it, hands
	//          t.getTexture() to the kernel, and unmaps it again.
	// result : buffer allocated with cudaMalloc in initArray() and released
	//          with cudaFree in cleanUP(); get_diff writes one float per block row.
	// print  : referenced only by commented-out debug code in this file.
	// NOTE(review): these are non-static definitions in a header -- including
	// this file from more than one translation unit would define them twice.
	stim::cuda::cuda_texture t;  //texture object.
	float* result;
	float* print;
	
	///Initialization function, allocates the buffer that receives the
	///per-sample cost computed by get_diff.
	///The allocation is stored in the file-level pointer `result` and must be
	///released with cleanUP().
	///@param DIM_Y			--number of floats to allocate (one per sample).
	void initArray(int DIM_Y)
	{
			//Report allocation failures through HANDLE_ERROR, the same way the
			//cudaMemcpy in get_cost is checked, instead of silently continuing
			//with an invalid pointer.
			HANDLE_ERROR(
				cudaMalloc( (void**) &result, DIM_Y*sizeof(float))
				);
	}
	///Deinit function that frees the memory allocated by initArray().
	///The texture resource itself is not touched here; get_cost() unmaps it
	///separately.
	void cleanUP()
	{
			//Check the free like every other checked CUDA call in this file.
			HANDLE_ERROR(
				cudaFree(result)
				);
			//Clear the pointer so a repeated cleanUP() becomes a harmless
			//cudaFree(NULL) instead of a double free.
			result = NULL;
	}  
	///Procedural stand-in for a uniform template image.
	///The template is 1 inside three bands and 0 everywhere else. The band
	///limits are computed with integer division, so they evaluate to
	///16/6 == 2, 16*2/6 == 5, 16*4/6 == 10 and 16*5/6 == 13.
	///@param int x			--location of a pixel.
	///@return 1.0 when x < 2, x > 13 or 5 < x < 10; 0.0 otherwise.
	__device__
	float Template(int x)
	{
		const int width      = 16;
		const int left_edge  = width / 6;
		const int mid_begin  = width * 2 / 6;
		const int mid_end    = width * 4 / 6;
		const int right_edge = width * 5 / 6;

		const bool in_left   = x < left_edge;
		const bool in_right  = x > right_edge;
		const bool in_middle = (x > mid_begin) && (x < mid_end);

		return (in_left || in_right || in_middle) ? 1.0 : 0.0;
	}
		
	///Find the difference of the given set of samples and the template
	///using cuda acceleration.
	///Every thread reads one texel, scales it by 1/255 and stores its absolute
	///difference from Template(x) in shared memory; the block then sums the
	///tile and thread (0,0) writes the total to result[blockIdx.y].
	///
	///Launch requirements:
	///  - blockDim.x <= 16 and blockDim.y <= 8 (size of the static shared tile);
	///  - blockDim.x and blockDim.y must be powers of two for the halving
	///    reduction to visit every element;
	///  - blocks that share a blockIdx.y write the same result slot, so
	///    gridDim.x is expected to be 1 (as launched by get_cost).
	///@param cudaTextureObject_t texIn	--texture object read with tex2D<unsigned char>.
	///@param float* result			--a pointer to the memory that stores the result,
	///					  one float per blockIdx.y.
	__global__
	void get_diff (cudaTextureObject_t texIn, float *result)
	{
		__shared__ float shared[16][8];
		//The tile is statically sized; a larger block would index past it.
		assert(blockDim.x <= 16 && blockDim.y <= 8);

		const int x            = threadIdx.x + blockIdx.x * blockDim.x;
		const int y            = threadIdx.y + blockIdx.y * blockDim.y;
		//Unsigned so the comparisons against `step` below do not mix signs.
		const unsigned int x_t = threadIdx.x;
		const unsigned int y_t = threadIdx.y;

		//Single-precision literal and fabsf keep the arithmetic in float.
		const float valIn      = tex2D<unsigned char>(texIn, x, y)/255.0f;
		const float valTemp    = Template(x);
		shared[x_t][y_t]       = fabsf(valIn-valTemp);
		__syncthreads();

		//Sum along x: afterwards shared[0][y_t] holds the total of row y_t.
		//One barrier per step is sufficient: a step only reads slots
		//(index >= step) that no thread writes during that same step.
		for(unsigned int step = blockDim.x/2; step >= 1; step >>= 1)
		{
			if (x_t < step)
			{
				shared[x_t][y_t] += shared[x_t + step][y_t];
			}
			__syncthreads();
		}

		//Sum the row totals along y. Only column 0 feeds the final value, so
		//the other columns idle; the barrier stays outside the branch so that
		//every thread of the block reaches it.
		for(unsigned int step = blockDim.y/2; step >= 1; step >>= 1)
		{
			if(x_t == 0 && y_t < step)
			{
				shared[0][y_t] += shared[0][y_t + step];
			}
			__syncthreads();
		}

		if(x_t == 0 && y_t == 0)
			result[blockIdx.y] = shared[0][0];
	}
	///External access-point to the cuda function.
	///Maps the GL texture for cuda, launches get_diff with one 16x8 block per
	///sample, copies the per-sample costs back to the host and returns the
	///sample with the smallest cost.
	///@param GLint texbufferID 	--GLtexture (must be contained in a framebuffer object)
	///				  that holds the data that will be handed to cuda.
	///@param GLenum texType	--either GL_TEXTURE_1D, GL_TEXTURE_2D or GL_TEXTURE_3D
	///				  may work with other gl texture types, but untested.
	///@param DIM_Y, the number of samples in the template.
	///@return a two-element vec: [0] is the index of the sample with the lowest
	///	   cost, [1] is that cost truncated to int. Returns (0, 0) without
	///	   touching the GPU when DIM_Y <= 0 or the host buffer cannot be
	///	   allocated.
	extern "C"
	stim::vec<int> get_cost(GLint texbufferID, GLenum texType, int DIM_Y)
	{
		stim::vec<int> ret(0, 0);

		//Nothing to evaluate: a non-positive count would otherwise produce an
		//invalid grid and an out-of-bounds read of output[0] below.
		if(DIM_Y <= 0)
			return ret;

		//Host-side copy of the per-sample costs. Allocated before the texture
		//is mapped so a failure here leaves no GL/CUDA state to undo.
		float* output = (float* ) malloc(DIM_Y*sizeof(float));
		if(output == NULL){
			fprintf(stderr, "get_cost: unable to allocate %d floats on the host\n", DIM_Y);
			return ret;
		}

		//Bind the Texture in GL and allow access to cuda.
		t.MapCudaTexture(texbufferID, texType);
		initArray(DIM_Y);

		//cuda launch variables: one block per sample; the block shape must
		//match the 16x8 shared tile declared in get_diff.
		dim3 numBlocks(1, DIM_Y);
		dim3 threadsPerBlock(16, 8);
		get_diff <<< numBlocks, threadsPerBlock >>> (t.getTexture(), result);
		//A kernel launch returns no status itself; configuration errors
		//(e.g. DIM_Y larger than the grid limit) surface only through here.
		HANDLE_ERROR(
			cudaGetLastError()
			);
		HANDLE_ERROR(
			cudaMemcpy(output, result, DIM_Y*sizeof(float), cudaMemcpyDeviceToHost)
			);

		//Find the first sample with the smallest cost.
		int idx = 0;
		for( int i = 1; i<DIM_Y; i++){
			if(output[i] < output[idx]){
				idx = i;
			}
		}

		t.UnmapCudaTexture();
		cleanUP();
		ret[0] = idx; ret[1] = (int) output[idx];
		free(output);
		return ret;
	}
	}
}
#endif