#ifndef STIM_CUDA_LOCAL_MAX_H
#define STIM_CUDA_LOCAL_MAX_H

// NOTE(review): the include targets and the template parameter lists were not
// legible in the reviewed copy of this file and have been reconstructed;
// confirm them against the repository. HANDLE_ERROR and
// stim::maxThreadsPerBlock() are presumably supplied by the stim header below.
#include <cmath>
#include <cstddef>
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>

namespace stim{
	namespace cuda{

		/// Kernel that keeps only the local maxima of a 2D image stored row by row.
		///
		/// One thread handles one pixel (xi, yi). The pixel's value is copied from
		/// gpuVote to gpuCenters when, inside the window
		/// [xi - conn, xi + conn] x [yi - conn, yi + conn] (clipped to the image),
		///   - no neighbour holds a strictly greater value, and
		///   - no neighbour with a larger linear index holds an equal value
		///     (so only the highest-indexed pixel of a plateau is kept).
		/// Every other pixel of gpuCenters is set to zero.
		///
		/// Expects a 2D launch whose grid covers at least x by y threads; threads
		/// that fall outside the image return immediately.
		///
		/// @param gpuCenters	output image, x * y elements
		/// @param gpuVote		input image, x * y elements
		/// @param conn			half-width of the search window, in pixels
		/// @param x			image width, in pixels
		/// @param y			image height, in pixels
		template<typename T>
		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x, int y){

			// 2D coordinates of the pixel handled by this thread
			const int xi = blockIdx.x * blockDim.x + threadIdx.x;
			const int yi = blockIdx.y * blockDim.y + threadIdx.y;

			// the grid may be larger than the image
			if(xi >= x || yi >= y) return;

			// linear index, computed in size_t so that large images do not overflow int
			const size_t i = (size_t)yi * (size_t)x + (size_t)xi;

			gpuCenters[i] = 0;							//assume "not a maximum" until proven otherwise
			const T val = gpuVote[i];

			// Clip the window to the image once instead of testing every neighbour.
			// Both ends are inclusive: the previous half-open range
			// [xi - conn, xi + conn) never looked at the right/bottom neighbours.
			const int x_first = (xi - conn < 0) ? 0 : xi - conn;
			const int y_first = (yi - conn < 0) ? 0 : yi - conn;
			const int x_last  = (xi + conn > x - 1) ? x - 1 : xi + conn;
			const int y_last  = (yi + conn > y - 1) ? y - 1 : yi + conn;

			for(int yl = y_first; yl <= y_last; yl++){
				const size_t row = (size_t)yl * (size_t)x;
				for(int xl = x_first; xl <= x_last; xl++){
					const size_t il = row + (size_t)xl;
					const T neighbour = gpuVote[il];

					// a larger neighbour exists: this pixel is not a maximum
					if(neighbour > val) return;

					// tie: only the pixel with the largest index survives
					if(neighbour == val && il > i) return;
				}
			}

			gpuCenters[i] = val;
		}

		/// Launches cuda_local_max on buffers that already live on the device.
		///
		/// The launch is checked with cudaGetLastError(); nothing is launched for
		/// an empty image.
		///
		/// @param gpuCenters	device pointer to the output image, x * y elements
		/// @param gpuVote		device pointer to the input image, x * y elements
		/// @param conn			half-width of the search window, in pixels
		/// @param x			image width, in pixels
		/// @param y			image height, in pixels
		template<typename T>
		void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, size_t x, size_t y){

			// nothing to do, and a grid dimension of zero is not a valid launch
			if(x == 0 || y == 0) return;

			const unsigned int max_threads = stim::maxThreadsPerBlock();

			// square thread block whose area does not exceed max_threads
			unsigned int side = (unsigned int)std::sqrt((double)max_threads);
			if(side == 0) side = 1;						//defensive: never divide by zero below

			const dim3 threads(side, side);
			// ceil-division: just enough blocks to cover the image
			const dim3 blocks((unsigned int)((x + side - 1) / side),
							  (unsigned int)((y + side - 1) / side));

			//call the kernel to find the local maximum (it takes int sizes)
			cuda_local_max<T><<< blocks, threads >>>(gpuCenters, gpuVote, (int)conn, (int)x, (int)y);

			// a kernel launch returns no status; query it explicitly
			HANDLE_ERROR(cudaGetLastError());
		}

		/// Host-side convenience wrapper: copies the vote image to the device,
		/// runs gpu_local_max and copies the result back to the host.
		///
		/// @param cpuCenters	host pointer receiving the output image, x * y elements
		/// @param cpuVote		host pointer to the input image, x * y elements
		/// @param conn			half-width of the search window, in pixels
		/// @param x			image width, in pixels
		/// @param y			image height, in pixels
		template<typename T>
		void cpu_local_max(T* cpuCenters, T* cpuVote, unsigned int conn, unsigned int x, unsigned int y){

			// number of bytes in one image; widened first so x * y cannot wrap at 32 bits
			const size_t bytes = (size_t)x * (size_t)y * sizeof(T);
			if(bytes == 0) return;						//empty image: nothing to copy or compute

			// allocate space on the GPU for the detected cell centers
			T* gpuCenters = NULL;
			HANDLE_ERROR(cudaMalloc(&gpuCenters, bytes));

			// allocate space on the GPU for the input vote image
			T* gpuVote = NULL;
			HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));

			// copy the vote image data to the GPU
			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));

			// call the GPU version of the local max function
			gpu_local_max<T>(gpuCenters, gpuVote, conn, x, y);

			// copy the cell centers data to the CPU
			HANDLE_ERROR(cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost));

			// free allocated memory
			HANDLE_ERROR(cudaFree(gpuCenters));
			HANDLE_ERROR(cudaFree(gpuVote));
		}

	}
}

#endif