#ifndef STIM_SPIDER_COST_H
#define STIM_SPIDER_COST_H

// NOTE(review): the header names of the include directives below are missing
// from the copy under review; restore the original targets before building.
#include
#include
//#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace stim{
namespace cuda
{

// Disabled legacy code: file-scope device buffers and their init/cleanup helpers.
//
//	float* result;
//	float* print;

// Initialization function, allocates the memory and passes the necessary
// handles from OpenGL and Cuda.
// @param DIM_Y	--integer controlling how much memory to allocate.
//	void initArray(int DIM_Y)
//	{
//		cudaMalloc( (void**) &print, DIM_Y*16*sizeof(float));     ///temporary
//		cudaMalloc( (void**) &result, DIM_Y*sizeof(float));
//	}

// Deinit function that frees the memory used and releases the texture resource
// back to OpenGL.
//	void cleanUP()
//	{
//		cudaFree(result);
//		cudaFree(print);     ///temporary
//	}

///A virtual representation of a uniform template.
///The range [0, max_x) is compared against sixths of max_x (integer division):
///positions below max_x/6, above max_x*5/6, or strictly between max_x*2/6 and
///max_x*4/6 belong to the "on" bands of the template.
///@param x      --location of a pixel along the template.
///@param max_x  --extent the sixths are computed from (get_diff passes dx).
///@return 1.0f when x lies in one of the three "on" bands, 0.0f otherwise.
__device__
float Template(int x, int max_x)
{
	// Leading band.
	if (x < max_x / 6)
		return 1.0f;

	// Trailing band.
	if (x > max_x * 5 / 6)
		return 1.0f;

	// Centre band (both bounds exclusive).
	if (x > max_x * 2 / 6 && x < max_x * 4 / 6)
		return 1.0f;

	return 0.0f;
}

///Find the difference of the given set of samples and the template
///using cuda acceleration.
///@param stim::cuda::cuda_texture t	--stim texture that holds all the references
///					  to the data.
///@param float* result			--a pointer to the memory that stores the result.
// Kernel contract for get_diff:
//   * Each thread block sums |texel/255 - Template(x, dx)| over its dx*dy threads
//     and thread (0,0) writes the sum to result[blockIdx.y].
//   * Launch with blockDim = (dx, dy): dx is used both as the row stride of the
//     shared buffer and as the template extent passed to Template().
//   * Dynamic shared memory required: dx * dy * sizeof(float).
//   * result must be device memory with at least gridDim.y floats.
//   * dx and dy may be any positive size; the reduction no longer needs powers of two.
// @param texIn   --texture object the samples are fetched from at (x, y).
// @param result  --device buffer receiving one cost per block row.
// @param dx      --block width / template extent.
// @param dy      --block height (unused inside the kernel, kept for the interface).
__global__
//void get_diff (float *result)
void get_diff (cudaTextureObject_t texIn, float *result, int dx, int dy)
{
//	__shared__ float shared[32][16];
	extern __shared__ float shared[];

	const unsigned int x_t = threadIdx.x;
	const unsigned int y_t = threadIdx.y;
	const int x = x_t + blockIdx.x * blockDim.x;
	const int y = y_t + blockIdx.y * blockDim.y;
	const unsigned int idx = y_t * dx + x_t;
	const int g_idx = blockIdx.y;

	// NOTE(review): the template argument of tex2D was missing in the reviewed
	// copy; unsigned char is assumed from the /255 normalisation -- confirm.
	float valIn = tex2D<unsigned char>(texIn, x, y) / 255.0f;
	float valTemp = Template(x, dx);

//	print[idx] = abs(valIn);     ///temporary
	shared[idx] = fabsf(valIn - valTemp);
	__syncthreads();

	// Fold each row onto its first element. The active width is halved with
	// rounding up, so an odd-sized remainder is carried over instead of being
	// dropped. For power-of-two widths this performs exactly the additions of
	// the classic step = width/2, step >>= 1 scheme.
	for (unsigned int width = blockDim.x; width > 1; )
	{
		const unsigned int half = (width + 1) / 2;
		if (x_t + half < width)
		{
//			shared[x_t][y_t] += shared[x_t + step][y_t];
			shared[idx] += shared[idx + half];
		}
		__syncthreads();     // uniform loop bounds: every thread reaches this barrier
		width = half;
	}

	// Fold the rows onto row 0 in the same way.
	for (unsigned int height = blockDim.y; height > 1; )
	{
		const unsigned int half = (height + 1) / 2;
		if (y_t + half < height)
		{
//			shared[x_t][y_t] += shared[x_t][y_t + step];
			shared[idx] += shared[idx + half * dx];
		}
		__syncthreads();
		height = half;
	}

	// shared[0] now holds the sum over the whole block.
	if (x_t == 0 && y_t == 0)
		result[g_idx] = shared[0];

//	//result[idx] = abs(valIn);
}

///External access-point to the cuda function.
///Launches get_diff with one block per sample, copies the per-sample costs
///back to the host and picks the smallest one.
///@param tObj    --texture object handed to the kernel.
///@param result  --device buffer with room for at least DIM_Y floats; the kernel
///                 writes one cost per sample into it.
///@param DIM_Y   --the number of samples in the template (grid size along y).
///@param dx      --thread-block width, also the template extent (default 16).
///@param dy      --thread-block height (default 8).
///@return vector whose element [0] is the index of the sample with the smallest
///        cost and whose element [1] is that cost converted to int. When DIM_Y,
///        dx or dy is not positive nothing is launched and (0, 0) is returned.
/// NOTE(review): the template argument of stim::vec was missing in the reviewed
/// copy; int is assumed from the (int) cast on the returned cost -- confirm.
/// NOTE(review): the (int) cast truncates the fractional part of the cost.
extern "C"
//stim::vec get_cost(GLint texbufferID, GLenum texType, int DIM_Y,int dx = 16, int dy = 8)
stim::vec<int> get_cost(cudaTextureObject_t tObj, float* result, int DIM_Y,int dx = 16, int dy = 8)
{
	//Bind the Texture in GL and allow access to cuda.
//	#ifdef TIMING
//		gpuStartTimer();
//	#endif
//	t.MapCudaTexture(texbufferID, texType);
//	#ifdef TIMING
//		std::cout << " " << gpuStopTimer();
//	#endif

	//initialize the return arrays.
//	#ifdef TIMING
//		gpuStartTimer();
//	#endif
	stim::vec<int> ret(0, 0);

	// A non-positive size would give an invalid launch configuration and an
	// out-of-bounds read of the host buffer below.
	if (DIM_Y <= 0 || dx <= 0 || dy <= 0)
		return ret;

	float* output = (float* ) malloc(DIM_Y*sizeof(float));
//	initArray(DIM_Y);

	//variables for finding the min.
	float mini = 1.0e16f;
	int idx = 0;
//	#ifdef TIMING
//		std::cout << " " << gpuStopTimer();
//	#endif

	//cuda launch variables.
//	#ifdef TIMING
//		gpuStartTimer();
//	#endif
	dim3 numBlocks(1, DIM_Y);
	dim3 threadsPerBlock(dx, dy);

	get_diff <<< numBlocks, threadsPerBlock, dx*dy*sizeof(float) >>> (tObj, result, dx, dy);
	HANDLE_ERROR( cudaGetLastError() );          // launch-configuration errors
	HANDLE_ERROR( cudaDeviceSynchronize() );     // errors raised while the kernel ran
//	#ifdef TIMING
//		std::cout << " " << gpuStopTimer();
//	#endif

//	#ifdef TIMING
//		gpuStartTimer();
//	#endif
	HANDLE_ERROR(
		cudaMemcpy(output, result, DIM_Y*sizeof(float), cudaMemcpyDeviceToHost)
		);

	// NOTE(review): the loop condition and body were lost in the reviewed copy;
	// this is the minimum search implied by mini/idx and by ret[0] = idx below.
	for( int i = 0; i < DIM_Y; i++)
	{
		if(output[i] < mini)
		{
			mini = output[i];
			idx = i;
		}
	}

	// NOTE(review): a commented-out debug call ending in
	// "(print, name.str(),16,218,0,256);" followed the loop; its head was lost.
//	t.UnmapCudaTexture();
//	cleanUP();
	ret[0] = idx;
	ret[1] = (int) output[idx];
//	std::cout << "The cost is " << output[idx] << std::endl;
	free(output);
//	#ifdef TIMING
//		std::cout << " " << gpuStopTimer() << std::endl;
//	#endif
	return ret;
}

}	// namespace cuda
}	// namespace stim

#endif