#ifndef STIM_FILTER_H #define STIM_FILTER_H #include #include #include #include #include #include #include #include #include #include #include #include #define IMAD(a,b,c) ( __mul24((a), (b)) + (c) ) namespace stim { namespace cuda { float* gpuLoG; float* LoG; float* res; float* centers; stim::cuda::cuda_texture tx; void initArray(int DIM_X, int DIM_Y, int kl) { LoG = (float*) malloc(kl*kl*sizeof(float)); HANDLE_ERROR( cudaMalloc( (void**) &gpuLoG, kl*kl*sizeof(float)) ); // checkCUDAerrors("Memory Allocation, LoG"); HANDLE_ERROR( cudaMalloc( (void**) &res, DIM_Y*DIM_X*sizeof(float)) ); HANDLE_ERROR( cudaMalloc( (void**) ¢ers, DIM_Y*DIM_X*sizeof(float)) ); // checkCUDAerrors("Memory Allocation, Result"); } void cleanUP() { HANDLE_ERROR( cudaFree(gpuLoG) ); HANDLE_ERROR( cudaFree(res) ); HANDLE_ERROR( cudaFree(centers) ); free(LoG); } void filterKernel (float kl, float sigma, float *LoG) { float t = 0.0; float kr = kl/2; float x, y; int idx; for(int i = 0; i < kl; i++){ for(int j = 0; j < kl; j++){ idx = j*kl+i; x = i - kr - 0.5; y = j - kr - 0.5; LoG[idx] = (-1.0/PI/powf(sigma, 4))* (1 - (powf(x,2)+powf(y,2))/2.0/powf(sigma, 2)) *expf(-(powf(x,2)+powf(y,2))/2/powf(sigma,2)); t +=LoG[idx]; } } for(int i = 0; i < kl*kl; i++) { LoG[i] = LoG[i]/t; } } //Shared memory would be better. __global__ void applyFilter(cudaTextureObject_t texIn, unsigned int DIM_X, unsigned int DIM_Y, int kr, int kl, float *res, float* gpuLoG){ //R = floor(size/2) //THIS IS A NAIVE WAY TO DO IT, and there is a better way) __shared__ float shared[7][7]; int x = blockIdx.x; int y = blockIdx.y; int xi = threadIdx.x; int yi = threadIdx.y; // float val = 0; float tu = (x-kr+xi)/(float)DIM_X; float tv = (y-kr+yi)/(float)DIM_Y; shared[xi][yi] = gpuLoG[yi*kl+xi]*(255.0-(float)tex2D(texIn, tu, tv)); __syncthreads(); //x = max(0,x); //x = min(x, width-1); //y = max(y, 0); //y = min(y, height - 1); int idx = y*DIM_X+x; // int k_idx; for(unsigned int step = blockDim.x/2; step >= 1; step >>= 1) { __syncthreads(); if (xi < step) { shared[xi][yi] += shared[xi + step][yi]; } __syncthreads(); } __syncthreads(); for(unsigned int step = blockDim.y/2; step >= 1; step >>= 1) { __syncthreads(); if(yi < step) { shared[xi][yi] += shared[xi][yi + step]; } __syncthreads(); } __syncthreads(); if(xi == 0 && yi == 0) res[idx] = shared[0][0]; } extern "C" float * get_centers(GLint texbufferID, GLenum texType, int DIM_X, int DIM_Y, int sizeK, float sigma, float conn, float threshold) { tx.SetTextureCoordinates(1); tx.SetAddressMode(1, 3); tx.MapCudaTexture(texbufferID, texType); float* result = (float*) malloc(DIM_X*DIM_Y*sizeof(float)); initArray(DIM_X, DIM_Y, sizeK); filterKernel(sizeK, sigma, LoG); cudaMemcpy(gpuLoG, LoG, sizeK*sizeK*sizeof(float), cudaMemcpyHostToDevice); dim3 numBlocks(DIM_X, DIM_Y); dim3 threadsPerBlock(sizeK, sizeK); applyFilter <<< numBlocks, threadsPerBlock >>> (tx.getTexture(), DIM_X, DIM_Y, floor(sizeK/2), sizeK, res, gpuLoG); stim::cuda::gpu_local_max(centers, res, threshold, conn, DIM_X, DIM_Y); cudaDeviceSynchronize(); cudaMemcpy(result, centers, DIM_X*DIM_Y*sizeof(float), cudaMemcpyDeviceToHost); tx.UnmapCudaTexture(); cleanUP(); return result; } } } #endif