stim/iVote/ivote2.cuh

#ifndef STIM_IVOTE2_CUH
#define STIM_IVOTE2_CUH
#include <iostream>
#include <fstream>
#include <stim/cuda/cudatools/error.h>
#include <stim/cuda/templates/gradient.cuh>
#include <stim/cuda/arraymath.cuh>
#include <stim/iVote/ivote2/iter_vote2.cuh>
#include <stim/iVote/ivote2/local_max.cuh>
#include <stim/math/constants.h>
#include <stim/math/vector.h>
#include <stim/visualization/colormap.h>
namespace stim {
	// this function precomputes the atan2 values
	template<typename T>
	void atan_2(T* cpuTable, unsigned int rmax) {
		int xsize = 2 * rmax + 1;						//initialize the width and height of the window which atan2 are computed in.
		int ysize = 2 * rmax + 1;
		int yi = rmax;									// assign the center coordinates of the atan2 window to yi and xi
		int xi = rmax;
		for (int xt = 0; xt < xsize; xt++) {			//for each element in the atan2 table
			for (int yt = 0; yt < ysize; yt++) {
				int id = yt * xsize + xt;				//convert the current 2D coordinates to 1D
				int xd = xi - xt;						// calculate the distance between the pixel and the center of the atan2 window
				int yd = yi - yt;
				T atan_2d = atan2((T)yd, (T)xd);	// calculate the angle between the pixel and the center of the atan2 window and store the result.
				cpuTable[id] = atan_2d;
			}
		}
	}
	//this kernel invert the 2D image
	template<typename T>
	__global__ void cuda_invert(T* gpuI, size_t x, size_t y) {
		// calculate the 2D coordinates for this current thread.
		size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
		size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
		if (xi >= x || yi >= y) return;
		size_t i = yi * x + xi;					// convert 2D coordinates to 1D
		gpuI[i] = 255 - gpuI[i];				//invert the pixel intensity
	}
	//this function calculate the threshold using OTSU method
	template<typename T>
	T th_otsu(T* pts, size_t pixels, unsigned int th_num = 20) {
		T Imax = pts[0];				//initialize the maximum value to the first one
		T Imin = pts[0];				//initialize the maximum value to the first on
		for (size_t n = 0; n < pixels; n++) {		//for every value
			if (pts[n] > Imax) {			//if the value is higher than the current max
				Imax = pts[n];
			}
		}
		for (size_t n = 0; n< pixels; n++) {		//for every value
			if (pts[n] < Imin) {			//if the value is higher than the current max
				Imin = pts[n];
			}
		}
		T th_step = ((Imax - Imin) / th_num);
		vector<T> var_b;
		for (unsigned int t0 = 0; t0 < th_num; t0++) {
			T th = t0 * th_step + Imin;
			unsigned int n_b(0), n_o(0);		//these variables save the number of elements that are below and over the threshold
			T m_b(0), m_o(0);				//these variables save the mean value for each cluster
			for (unsigned int idx = 0; idx < pixels; idx++) {
				if (pts[idx] <= th) {
					m_b += pts[idx];
					n_b += 1;
				}
				else {
					m_o += pts[idx];
					n_o += 1;
				}
			}
			m_b = m_b / n_b;		//calculate the mean value for the below threshold cluster
			m_o = m_o / n_o;		//calculate the mean value for the over threshold cluster
			var_b.push_back(n_b * n_o * pow((m_b - m_o), 2));
		}
		vector<float>::iterator max_var = std::max_element(var_b.begin(), var_b.end());	//finding maximum elements in the vector
		size_t th_idx = std::distance(var_b.begin(), max_var);
		T threshold = Imin + (T)(th_idx * th_step);
		return threshold;
	}
	//this function performs the 2D iterative voting algorithm on the image stored in the gpu 
	template<typename T>
	void gpu_ivote2(T* gpuI, unsigned int rmax, size_t x, size_t y, bool invert = false, T t = 0, int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8, bool debug = false) {
		size_t pixels = x * y;				//compute the size of input image
		//
		if (invert) {						//if inversion is required call the kernel to invert the image
			unsigned int max_threads = stim::maxThreadsPerBlock();
			dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads));
			dim3 blocks((unsigned int)x / threads.x + 1, (unsigned int)y / threads.y + 1);
			cuda_invert << <blocks, threads >> > (gpuI, x, y);
		}
		//
		size_t table_bytes = (size_t)(pow(2 * rmax + 1, 2) * sizeof(T));				// create the atan2 table
		T* cpuTable = (T*)malloc(table_bytes);											//assign memory on the cpu for atan2 table
		atan_2<T>(cpuTable, rmax);														//call the function to precompute the atan2 table
		T* gpuTable;  HANDLE_ERROR(cudaMalloc(&gpuTable, table_bytes));
		HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, table_bytes, cudaMemcpyHostToDevice));	//copy atan2 table to the gpu
		size_t bytes = pixels* sizeof(T);													//calculate the bytes of the input
		float dphi = phi / iter;															//change in phi for each iteration
		float* gpuGrad; HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes * 2));									//allocate space to store the 2D gradient
		float* gpuVote; HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));										//allocate space to store the vote image
		stim::cuda::gpu_gradient_2d<float>(gpuGrad, gpuI, x, y);			//calculate the 2D gradient
		stim::cuda::gpu_cart2polar<float>(gpuGrad, x, y);					//convert cartesian coordinate of gradient to the polar
		for (int i = 0; i < iter; i++) {														//for each iteration
			cudaMemset(gpuVote, 0, bytes);													//reset the vote image to 0
			stim::cuda::gpu_vote<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y, debug);		//perform voting
			stim::cuda::gpu_update_dir<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y, debug);	//update the voter directions
			phi = phi - dphi;																//decrement phi
		}
		stim::cuda::gpu_local_max<float>(gpuI, gpuVote, conn, x, y);				//calculate the local maxima
		if (t > 0) {
			T* pts = (T*)malloc(bytes);													//allocate memory on the cpu to store the output of iterative voting
			HANDLE_ERROR(cudaMemcpy(pts, gpuI, bytes, cudaMemcpyDeviceToHost));			//copy the output from gpu to the cpu memory
			T threshold;
			threshold = t;
			size_t ind;
			for (size_t ix = 0; ix < x; ix++) {
				for (size_t iy = 0; iy < y; iy++) {
					ind = iy * x + ix;
					if (pts[ind] > threshold) {
						pts[ind] = 1;
					}
					else pts[ind] = 0;
				}
			}
			HANDLE_ERROR(cudaMemcpy(gpuI, pts, bytes, cudaMemcpyHostToDevice));		//copy the points to the gpu
		}
				
	}
	template<typename T>
	void cpu_ivote2(T* cpuI, unsigned int rmax, size_t x, size_t y, bool invert = false, T t = 0, int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8, bool debug = false) {
		size_t bytes = x*y * sizeof(T);
		T* gpuI;						//allocate space on the gpu to save the input image
		HANDLE_ERROR(cudaMalloc(&gpuI, bytes));
		HANDLE_ERROR(cudaMemcpy(gpuI, cpuI, bytes, cudaMemcpyHostToDevice));		//copy the image to the gpu
		stim::gpu_ivote2<T>(gpuI, rmax, x, y, invert, t, iter, phi, conn, debug);				//call the gpu version of the ivote
		HANDLE_ERROR(cudaMemcpy(cpuI, gpuI, bytes, cudaMemcpyDeviceToHost));		//copy the output to the cpu
	}
}
#endif