#ifndef STIM_CUDA_LOCAL_MAX_H #define STIM_CUDA_LOCAL_MAX_H # include # include #include namespace stim{ namespace cuda{ // this kernel calculates the local maximum for finding the cell centers template __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){ // calculate the 2D coordinates for this current thread. int xi = blockIdx.x * blockDim.x + threadIdx.x; int yi = blockIdx.y; if(xi >= x || yi >= y) return; // convert 2D coordinates to 1D int i = yi * x + xi; gpuCenters[i] = 0; //initialize the value at this location to zero T val = gpuVote[i]; //compare to the threshold if(val < final_t) return; //define an array to store indices with same vote value /*int * IdxEq; IdxEq = new int [2*conn]; int n = 0;*/ for(int xl = xi - conn; xl < xi + conn; xl++){ for(int yl = yi - conn; yl < yi + conn; yl++){ if(xl >= 0 && xl < x && yl >= 0 && yl < y){ int il = yl * x + xl; if(gpuVote[il] > val){ return; } if (gpuVote[il] == val){ /*IdxEq[n] = il; n = n+1;*/ if( il > i){ return; } } } } } /*if (n!=0){ if(IdxEq[n/2] !=i){ return; } } */ gpuCenters[i] = 1; } template void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(max_threads, 1); dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); //call the kernel to find the local maximum. cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); } template void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ //calculate the number of bytes in the array unsigned int bytes = x * y * sizeof(T); // allocate space on the GPU for the detected cell centes T* gpuCenters; cudaMalloc(&gpuCenters, bytes); //allocate space on the GPU for the input Vote Image T* gpuVote; cudaMalloc(&gpuVote, bytes); //copy the Vote image data to the GPU HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); //call the GPU version of the local max function gpu_local_max(gpuCenters, gpuVote, final_t, conn, x, y); //copy the cell centers data to the CPU cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ; //free allocated memory cudaFree(gpuCenters); cudaFree(gpuVote); } } } #endif