stim/cuda/filter.cuh

#ifndef STIM_FILTER_H
#define STIM_FILTER_H
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stim/visualization/colormap.h>
#include <sstream>
#include <stim/math/constants.h>
#include <stim/cuda/cudatools/devices.h>
#include <stim/cuda/cudatools/threads.h>
#include <stim/cuda/cuda_texture.cuh>
#include <stim/cuda/ivote.cuh>
#include <stim/cuda/arraymath.cuh>
#define IMAD(a,b,c) ( __mul24((a), (b)) + (c) )
namespace stim
{
	namespace cuda
	{
	float* gpuLoG;
	float* LoG;
	float* res;
	float* centers;
//#ifdef DEBUG
	float* print;
//#endif
	stim::cuda::cuda_texture tx;
	void initArray(int DIM_X, int DIM_Y, int kl)
	{
		
			LoG =  (float*) malloc(kl*kl*sizeof(float));
		HANDLE_ERROR(
			cudaMalloc( (void**) &gpuLoG, kl*kl*sizeof(float))
		);
	//	checkCUDAerrors("Memory Allocation, LoG");
		HANDLE_ERROR(
			cudaMalloc( (void**) &res, DIM_Y*DIM_X*sizeof(float))
		);
		HANDLE_ERROR(
			cudaMalloc( (void**) &centers, DIM_Y*DIM_X*sizeof(float))
		);
//#ifdef DEBUG
		HANDLE_ERROR(
			cudaMalloc( (void**) &print, DIM_Y*DIM_X*sizeof(float))
		);
//#endif
	//	checkCUDAerrors("Memory Allocation, Result");
	}
	void cleanUP()
	{
		HANDLE_ERROR(
			cudaFree(gpuLoG)
		);
		HANDLE_ERROR(
			cudaFree(res)
		);
		HANDLE_ERROR(
			cudaFree(centers)
		);
//#ifdef DEBUG
		HANDLE_ERROR(
			cudaFree(print)
		);
//#endif
			free(LoG);
	}
	void
	filterKernel (float kl, float sigma, float *LoG)
	{
		float t = 0.0;
		float kr = kl/2; 
		float x, y;
		int idx;
		for(int i = 0; i < kl; i++){
			for(int j = 0; j < kl; j++){
				idx = j*kl+i;
				x = i - kr - 0.5;
				y = j - kr - 0.5;
				LoG[idx] = (-1.0/PI/powf(sigma, 4))* (1 - (powf(x,2)+powf(y,2))/2.0/powf(sigma, 2))
						*expf(-(powf(x,2)+powf(y,2))/2/powf(sigma,2));	
				t +=LoG[idx];
			}
		}
		
		for(int i = 0; i < kl*kl; i++)
		{
			LoG[i] = LoG[i]/t;
		}
		
	}
	//Shared memory would be better.
	__global__
	void
	applyFilter(cudaTextureObject_t texIn, unsigned int DIM_X, unsigned int DIM_Y, int kr, int kl, float *res, float* gpuLoG, float* p){
	//R = floor(size/2)
	//THIS IS A NAIVE WAY TO DO IT, and there is a better way)
		
		__shared__ float shared[7][7];
		int x = blockIdx.x;
		int y = blockIdx.y;
		int xi = threadIdx.x;
		int yi = threadIdx.y;
	//	float val = 0;
		float tu = (x-kr+xi)/(float)DIM_X;
		float tv = (y-kr+yi)/(float)DIM_Y;
		int idx = y*DIM_X+x;
		shared[xi][yi] = gpuLoG[yi*kl+xi]*(255.0-(float)tex2D<unsigned char>(texIn, tu, tv));
		__syncthreads();
		
		//x = max(0,x);
		//x = min(x, width-1);
		//y = max(y, 0);
		//y = min(y, height - 1);
	//	int k_idx;
                for(unsigned int step = blockDim.x/2; step >= 1; step >>= 1)
                {
                        __syncthreads();
                        if (xi < step)
                        {
                                shared[xi][yi] += shared[xi + step][yi];
                        }
                __syncthreads();
                }
                __syncthreads();
                for(unsigned int step = blockDim.y/2; step >= 1; step >>= 1)
                {
                        __syncthreads();
                        if(yi < step)
                        {
                                shared[xi][yi] += shared[xi][yi + step];
                        }
                __syncthreads();
                }
                __syncthreads();
                if(xi == 0 && yi == 0)
                        res[idx] = shared[0][0];
	}
	extern "C"
	float *
	get_centers(GLint texbufferID, GLenum texType, int DIM_X, int DIM_Y, int sizeK, float sigma, float conn, float threshold, int iter = 0)
	{
		tx.SetTextureCoordinates(1);
		tx.SetAddressMode(1, 3);
		tx.MapCudaTexture(texbufferID, texType);
		float* result =  (float*) malloc(DIM_X*DIM_Y*sizeof(float));
		
		initArray(DIM_X, DIM_Y, sizeK);
		filterKernel(sizeK, sigma, LoG);
		cudaMemcpy(gpuLoG, LoG, sizeK*sizeK*sizeof(float), cudaMemcpyHostToDevice);
		dim3 numBlocks(DIM_X, DIM_Y);
		dim3 threadsPerBlock(sizeK, sizeK);
		applyFilter <<< numBlocks, threadsPerBlock >>> (tx.getTexture(), DIM_X, DIM_Y, floor(sizeK/2), sizeK, res, gpuLoG, print);
		#ifdef DEBUG
			stringstream name;
			name.str("");
			name << "Fiber Cylinder " << iter << ".bmp";
			stim::gpu2image<float>(res, name.str(), DIM_X, DIM_Y, 0, 255);
		#endif
		stim::cuda::gpu_local_max<float>(centers, res, threshold, conn, DIM_X, DIM_Y);
		
		cudaDeviceSynchronize();
		cudaMemcpy(result, centers, DIM_X*DIM_Y*sizeof(float), cudaMemcpyDeviceToHost);
		tx.UnmapCudaTexture();
		cleanUP();
		return result;
	}
	}
}
#endif