local_max.cuh 3.67 KB
#ifndef STIM_CUDA_LOCAL_MAX_H
#define STIM_CUDA_LOCAL_MAX_H


# include <iostream>
# include <cuda.h>
#include <stim/cuda/cudatools.h>

namespace stim{
	namespace cuda{


		// this kernel calculates the local maximum for finding the cell centers
		template<typename T>
		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){

			// calculate the 2D coordinates for this current thread.
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			int yi = blockIdx.y;
			// convert 2D coordinates to 1D
			int i = yi * x + xi;



			//calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
			int xl = xi - conn;
			int yl = yi - conn;
			
			// use zero for the lowest limits if the xi or yi is less than conn.
			if (xi <= conn)
				xl = 0;
			if (yi <= conn)
				yl = 0;

			//calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
			int xh = xi + conn;
			int yh = yi + conn;

			// use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn.
			if (xi >= x - conn)
				xh = x;
			if (yi>= y - conn)
				yh = y;

			// calculate the limits for finding the local maximum location in the connected neighbors for the current pixel
			int n_l = yl * x + xl;
			int n_h = yh * x + xh;
			
			//initial the centers image to zero
			gpuCenters[i] = 0;


			int n = n_l;

			float l_value = 0;

			if (i < x * y)

				// check if the vote value for this pixel is greater than threshold, so this pixel may be a local max.
				if (gpuVote[i]>final_t){

					// compare the vote value for this pixel with the vote values with its neighbors.
					while (n<=n_h){

						// check if this vote value is a local max in its neighborhood.
							if (gpuVote[i] < gpuVote[n]){
								l_value  = 0;
								n =n_h+1;
							}
							else if (n == n_h){
								l_value = 1;
								n = n+1;
							}
							// check if the current neighbor is the last one at the current row
							else if ((n - n_l - 2*conn)% x ==0){
								n = n + x - 2*conn -1;
								n ++;						
							}
							else
								n ++;
				}
					// set the center value for this pixel to high if it's a local max ,and to low if not.
					gpuCenters[i] = l_value ;
				}

		}



		template<typename T>
		void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){

			
			
			
			unsigned int max_threads = stim::maxThreadsPerBlock();
			dim3 threads(max_threads, 1);
			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
			
			
			
			//call the kernel to find the local maximum.
			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);


		}



		template<typename T>
		void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
		

			//calculate the number of bytes in the array
			unsigned int bytes = x * y * sizeof(T);

			// allocate space on the GPU for the detected cell centes
			T* gpuCenters;
			cudaMalloc(&gpuCenters, bytes);		


			//allocate space on the GPU for the input Vote Image
			T* gpuVote;
			cudaMalloc(&gpuVote, bytes);		

			
			//copy the Vote image data to the GPU
			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));

			
			//call the GPU version of the local max function
			gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y);

				
			//copy the cell centers data to the CPU
			cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ;

			
			//free allocated memory
			cudaFree(gpuCenters);
			cudaFree(gpuVote);
			cudaFree(gpuGrad);
		}
		
	}
}



#endif