#ifndef STIM_CUDA_LOCAL_MAX_H #define STIM_CUDA_LOCAL_MAX_H # include # include #include namespace stim{ namespace cuda{ // this kernel calculates the local maximum for finding the cell centers template __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ // calculate the 2D coordinates for this current thread. int xi = blockIdx.x * blockDim.x + threadIdx.x; int yi = blockIdx.y; // convert 2D coordinates to 1D int i = yi * x + xi; //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. int xl = xi - conn; int yl = yi - conn; // use zero for the lowest limits if the xi or yi is less than conn. if (xi <= conn) xl = 0; if (yi <= conn) yl = 0; //calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. int xh = xi + conn; int yh = yi + conn; // use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn. if (xi >= x - conn) xh = x; if (yi>= y - conn) yh = y; // calculate the limits for finding the local maximum location in the connected neighbors for the current pixel int n_l = yl * x + xl; int n_h = yh * x + xh; //initial the centers image to zero gpuCenters[i] = 0; int n = n_l; float l_value = 0; if (i < x * y) // check if the vote value for this pixel is greater than threshold, so this pixel may be a local max. if (gpuVote[i]>final_t){ // compare the vote value for this pixel with the vote values with its neighbors. while (n<=n_h){ // check if this vote value is a local max in its neighborhood. if (gpuVote[i] < gpuVote[n]){ l_value = 0; n =n_h+1; } else if (n == n_h){ l_value = 1; n = n+1; } // check if the current neighbor is the last one at the current row else if ((n - n_l - 2*conn)% x ==0){ n = n + x - 2*conn -1; n ++; } else n ++; } // set the center value for this pixel to high if it's a local max ,and to low if not. gpuCenters[i] = l_value ; } } template void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(max_threads, 1); dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); //call the kernel to find the local maximum. cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); } template void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ //calculate the number of bytes in the array unsigned int bytes = x * y * sizeof(T); // allocate space on the GPU for the detected cell centes T* gpuCenters; cudaMalloc(&gpuCenters, bytes); //allocate space on the GPU for the input Vote Image T* gpuVote; cudaMalloc(&gpuVote, bytes); //copy the Vote image data to the GPU HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); //call the GPU version of the local max function gpu_local_max(gpuCenters, gpuVote, final_t, conn, x, y); //copy the cell centers data to the CPU cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ; //free allocated memory cudaFree(gpuCenters); cudaFree(gpuVote); cudaFree(gpuGrad); } } } #endif