stim/structures/kdtree.cuh

// change CUDA_STACK together with max_tree_levels in trial and error manner
// data should be stored in row-major
// x1,x2,x3,x4,x5......
// y1,y2,y3,y4,y5......
// ....................
// ....................
#ifndef KDTREE_H
#define KDTREE_H
#include "device_launch_parameters.h"
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "cuda_runtime.h"
#include <vector>
#include <float.h>
#include <iostream>
#include <algorithm>
/// using API called HADDLE_ERROR
static void HandleError(cudaError_t err, const char *file, int line) {
	if (err != cudaSuccess) {
	std::cout<<cudaGetErrorString(err)<<" in"<< file <<" at line "<<line<<std::endl;
	}
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
#define CUDA_STACK 2												// implementation "stacks" on CUDA as to do store the nodes information
namespace stim {
	namespace kdtree {
		template<typename T, int D>
		struct point {
			T coords[D];													// if we use size to measure a vector<point>, it will show the number of point structures
		};
		template<typename T>
		class KDNode {
		public:
			KDNode() {														// initialization
				parent = NULL;
				left = NULL;
				right = NULL;
				split_value = -1;
				_parent = -1;
				_left = -1;
				_right = -1;
			}
			int id;															// id for current node
			size_t level;
			KDNode *parent, *left, *right;
			int _parent, _left, _right;										// id for parent node
			T split_value;													// node value
			std::vector <size_t> indices;									// indices that indicate the data that current tree has
		};
	}
	template <typename T, int D = 3>
	class cpu_kdtree {
	protected:
		std::vector <kdtree::point<T, D>> *m_pts;
		kdtree::KDNode<T> *m_root;												// current node
		int m_current_axis;
		size_t m_levels;
		int m_cmps;														// count how many comparisons are to made in the tree for one query
		int m_id;														// level + 1
		static cpu_kdtree<T, D> *myself;
	public:
		cpu_kdtree() {														// initialization
			myself = this;
			m_id = 0;														// id = level + 1, level -> axis index while id -> node identifier
		}
		~cpu_kdtree() {														// destructor for deleting what was created by kdtree()
			std::vector <kdtree::KDNode<T>*> next_node;
			next_node.push_back(m_root);
			while (next_node.size()) {
				std::vector <kdtree::KDNode<T>*> next_search;
				while (next_node.size()) {
					kdtree::KDNode<T> *cur = next_node.back();
					next_node.pop_back();
					if (cur->left)
						next_search.push_back(cur->left);
					if (cur->right)
						next_search.push_back(cur->right);
					delete cur;
				}
				next_node = next_search;
			}
			m_root = NULL;
		}
		void Create(std::vector <kdtree::point<T, D>> &pts, size_t max_levels) {
			m_pts = &pts;												// create a pointer point to the input data
			m_levels = max_levels;										// stores max tree levels
			m_root = new kdtree::KDNode<T>();									// using KDNode() to initialize an ancestor node
			m_root->id = m_id++;										// id is 1 while level is 0 at the very beginning
			m_root->level = 0;											// to begin with level 0
			m_root->indices.resize(pts.size());							// initialize the size of whole indices
			for (size_t i = 0; i < pts.size(); i++) {
				m_root->indices[i] = i;									// like what we did on Keys in GPU-BF part
			}
			std::vector <kdtree::KDNode<T>*> next_node;							// next node
			next_node.push_back(m_root);								// new node
			while (next_node.size()) {
				std::vector <kdtree::KDNode<T>*> next_search;
				while (next_node.size()) {								// two same WHILE is because we need to make a new vector for searching
					kdtree::KDNode<T> *current_node = next_node.back();			// pointer point to current node (right first) 
					next_node.pop_back();								// pop out current node in order to store next node
					if (current_node->level < max_levels) {				// max_levels should be reasonably small compared with numbers of data
						if (current_node->indices.size() > 1) {
							kdtree::KDNode<T> *left = new kdtree::KDNode<T>();
							kdtree::KDNode<T> *right = new kdtree::KDNode<T>();
							left->id = m_id++;							// risky guessing but OK for large amount of data since max_level is small     
							right->id = m_id++;							// risky guessing but OK for large amount of data since max_level is small
							Split(current_node, left, right);			// split left and right and determine a node
							std::vector <size_t> temp;					// empty vecters of int
							//temp.resize(current_node->indices.size());
							current_node->indices.swap(temp);			// clean up current tree's indices
							current_node->left = left;
							current_node->right = right;
							current_node->_left = left->id;				// indicates it has left son node and gets its id
							current_node->_right = right->id;			// indicates it has right son node and gets its id
							if (left->indices.size())
								next_search.push_back(left);			// right first then left according to stack(first in last out), it can be done in parallel for left and right are independent
							if (right->indices.size())
								next_search.push_back(right);
						}
					}
				}
				next_node = next_search;
			}
		}
		static bool SortPoints(const size_t a, const size_t b) {
			std::vector <kdtree::point<T, D>> &pts = *myself->m_pts;
			return pts[a].coords[myself->m_current_axis] < pts[b].coords[myself->m_current_axis];
		}
		void Split(kdtree::KDNode<T> *cur, kdtree::KDNode<T> *left, kdtree::KDNode<T> *right) {
			/// assume both two sides are created and sure it was
			std::vector <kdtree::point<T, D>> &pts = *m_pts;
			m_current_axis = cur->level % D;											// indicate the judicative dimension or axis
			std::sort(cur->indices.begin(), cur->indices.end(), SortPoints);			// using SortPoints as comparison function to sort the data
			size_t mid = cur->indices[cur->indices.size() / 2];                         // odd in the mid, even take the floor
			cur->split_value = pts[mid].coords[m_current_axis];                         // get the mother node
			left->parent = cur;                                                         // set the parent to current node for the next nodes
			right->parent = cur;
			left->level = cur->level + 1;
			right->level = cur->level + 1;
			left->_parent = cur->id;                                                    // indicates it has mother node and gets its id
			right->_parent = cur->id;                                                   // indicates it has mother node and gets its id
			for (size_t i = 0; i < cur->indices.size(); i++) {							// split into left and right area one by one
				size_t idx = cur->indices[i];
				if (pts[idx].coords[m_current_axis] < cur->split_value)
					left->indices.push_back(idx);
				else
					right->indices.push_back(idx);
			}
		}
		int GetNumNodes() const { return m_id; }
		kdtree::KDNode<T>* GetRoot() const { return m_root; }
	};				//end class kdtree
	template <typename T, int D>
	cpu_kdtree<T, D>* cpu_kdtree<T, D>::myself = NULL;												// definition of myself pointer points to the specific class
	template <typename T>
	struct CUDA_KDNode {
		size_t level;
		int parent, left, right;														// indicates id of
		T split_value;
		size_t num_indices;																// number of indices it has
		int indices;																	// the beginning
	};
	template <typename T, int D>
	__device__ T Distance(kdtree::point<T, D> &a, kdtree::point<T, D> &b) {
		T dist = 0;
		for (size_t i = 0; i < D; i++) {
			T d = a.coords[i] - b.coords[i];
			dist += d*d;
		}
		return dist;
	}
	template <typename T, int D>
	__device__ void SearchAtNode(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, int cur, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist, int *ret_node) {
		/// finds the first possibility
		size_t best_idx = 0;
		T best_dist = FLT_MAX;
		while (true) {
			int split_axis = nodes[cur].level % D;
			if (nodes[cur].left == -1) {												// if it doesn't have left son node 
				*ret_node = cur;
				for (int i = 0; i < nodes[cur].num_indices; i++) {
					size_t idx = indices[nodes[cur].indices + i];
					T dist = Distance<T, D>(Query, pts[idx]);
					if (dist < best_dist) {
						best_dist = dist;
						best_idx = idx;
					}
				}
				break;
			}
			else if (Query.coords[split_axis] < nodes[cur].split_value) {
				cur = nodes[cur].left;
			}
			else {
				cur = nodes[cur].right;
			}
		}
		*ret_index = best_idx;
		*ret_dist = best_dist;
	}
	template <typename T, int D>
	__device__ void SearchAtNodeRange(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, int cur, T range, size_t *ret_index, T *ret_dist) {
		/// search throught all the nodes that are within one range
		size_t best_idx = 0;
		T best_dist = FLT_MAX;
		/// using fixed stack, increase it when need
		int next_node[CUDA_STACK];														// should be larger than 1
		int next_node_pos = 0;															// initialize pop out order index
		next_node[next_node_pos++] = cur;												// equals to next_node[next_node_pos] = cur, next_node_pos++
		while (next_node_pos) {
			int next_search[CUDA_STACK];												// for store next nodes
			int next_search_pos = 0;													// record push back order index
			while (next_node_pos) {
				cur = next_node[next_node_pos - 1];										// pop out the last in one and keep poping out
				next_node_pos--;
				int split_axis = nodes[cur].level % D;
				if (nodes[cur].left == -1) {
					for (int i = 0; i < nodes[cur].num_indices; i++) {
						int idx = indices[nodes[cur].indices + i];						// all indices are stored in one array, pick up from every node's beginning index
						T d = Distance<T>(Query, pts[idx]);
						if (d < best_dist) {
							best_dist = d;
							best_idx = idx;
						}
					}
				}
				else {
					T d = Query.coords[split_axis] - nodes[cur].split_value;
					if (fabs(d) > range) {
						if (d < 0)
							next_search[next_search_pos++] = nodes[cur].left;
						else
							next_search[next_search_pos++] = nodes[cur].right;
					}
					else {
						next_search[next_search_pos++] = nodes[cur].left;
						next_search[next_search_pos++] = nodes[cur].right;
					}
				}
			}
			for (int i = 0; i < next_search_pos; i++)
				next_node[i] = next_search[i];
			next_node_pos = next_search_pos;											// operation that really resemble STACK, namely first in last out
		}
		*ret_index = best_idx;
		*ret_dist = best_dist;
	}
	template <typename T, int D>
	__device__ void Search(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist) {
		/// find first nearest node
		int best_node = 0;
		size_t best_idx = 0;
		T best_dist = FLT_MAX;
		T radius = 0;
		SearchAtNode<T, D>(nodes, indices, pts, 0, Query, &best_idx, &best_dist, &best_node);
		radius = sqrt(best_dist);
		/// find other possibilities
		int cur = best_node;
		while (nodes[cur].parent != -1) {
			/// go up
			int parent = nodes[cur].parent;
			int split_axis = nodes[parent].level % D;
			/// search other node
			T tmp_dist = FLT_MAX;
			size_t tmp_idx;
			if (fabs(nodes[parent].split_value - Query.coords[split_axis]) <= radius) {
				/// search opposite node
				if (nodes[parent].left != cur)
					SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].left, radius, &tmp_idx, &tmp_dist);
				else
					SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].right, radius, &tmp_idx, &tmp_dist);
			}
			if (tmp_dist < best_dist) {
				best_dist = tmp_dist;
				best_idx = tmp_idx;
			}
			cur = parent;
		}
		*ret_index = best_idx;
		*ret_dist = sqrt(best_dist);
	}
	template <typename T, int D>
	__global__ void SearchBatch(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, size_t num_pts, kdtree::point<T, D> *Query, size_t num_Query, size_t *ret_index, T *ret_dist) {
		int idx = blockIdx.x * blockDim.x + threadIdx.x;
		if (idx >= num_Query) return;
		Search<T, D>(nodes, indices, pts, Query[idx], &ret_index[idx], &ret_dist[idx]);        // every query points are independent
	}
	template <typename T, int D = 3>
	class cuda_kdtree {
	protected:
		CUDA_KDNode<T> *m_gpu_nodes;                                                     // store nodes
		size_t *m_gpu_indices;
		kdtree::point<T, D>* m_gpu_points;
		size_t m_num_points;
	public:
		~cuda_kdtree() {
			HANDLE_ERROR(cudaFree(m_gpu_nodes));
			HANDLE_ERROR(cudaFree(m_gpu_indices));
			HANDLE_ERROR(cudaFree(m_gpu_points));
		}
		void CreateKDTree(T *ReferencePoints, size_t ReferenceCount, size_t ColCount, size_t max_levels) {
			std::vector < kdtree::point<T, D> > pts(ReferenceCount);															// create specific struct of reference data
			for (size_t j = 0; j < ReferenceCount; j++)
				for (size_t i = 0; i < ColCount; i++)
					pts[j].coords[i] = ReferencePoints[j * ColCount + i];
			cpu_kdtree<T, D> tree;																						// initialize a tree
			tree.Create(pts, max_levels);																		// create KD-Tree on host
			kdtree::KDNode<T> *root = tree.GetRoot();
			int num_nodes = tree.GetNumNodes();
			/// create the same on CPU
			m_num_points = pts.size();																			// number of points for creating tree = reference_count in the case
			HANDLE_ERROR(cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode<T>) * num_nodes));					// private variables for kdtree
			HANDLE_ERROR(cudaMalloc((void**)&m_gpu_indices, sizeof(size_t) * m_num_points));
			HANDLE_ERROR(cudaMalloc((void**)&m_gpu_points, sizeof(kdtree::point<T, D>) * m_num_points));
			std::vector <CUDA_KDNode<T>> cpu_nodes(num_nodes);													// from left to right, id of nodes
			std::vector <size_t> indices(m_num_points);
			std::vector < kdtree::KDNode<T>* > next_node;
			size_t cur_pos = 0;
			next_node.push_back(root);
			while (next_node.size()) {
				std::vector <typename kdtree::KDNode<T>* > next_search;
				while (next_node.size()) {
					kdtree::KDNode<T> *cur = next_node.back();
					next_node.pop_back();
					int id = cur->id;																			// the nodes at same level are independent
					cpu_nodes[id].level = cur->level;
					cpu_nodes[id].parent = cur->_parent;
					cpu_nodes[id].left = cur->_left;
					cpu_nodes[id].right = cur->_right;
					cpu_nodes[id].split_value = cur->split_value;
					cpu_nodes[id].num_indices = cur->indices.size();											// number of index
					if (cur->indices.size()) {
						for (size_t i = 0; i < cur->indices.size(); i++)
							indices[cur_pos + i] = cur->indices[i];
						cpu_nodes[id].indices = (int)cur_pos;													// beginning index that every bottom node has
						cur_pos += cur->indices.size();															// store indices continuously
					}
					else {
						cpu_nodes[id].indices = -1;
					}
					if (cur->left)
						next_search.push_back(cur->left);
					if (cur->right)
						next_search.push_back(cur->right);
				}
				next_node = next_search;
			}
			HANDLE_ERROR(cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode<T>) * cpu_nodes.size(), cudaMemcpyHostToDevice));
			HANDLE_ERROR(cudaMemcpy(m_gpu_indices, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
			HANDLE_ERROR(cudaMemcpy(m_gpu_points, &pts[0], sizeof(kdtree::point<T, D>) * pts.size(), cudaMemcpyHostToDevice));
		}
		void Search(T *QueryPoints, size_t QueryCount, size_t ColCount, T *dists, size_t *indices) {
			std::vector < kdtree::point<T, D> > query(QueryCount);
			for (size_t j = 0; j < QueryCount; j++)
				for (size_t i = 0; i < ColCount; i++)
					query[j].coords[i] = QueryPoints[j * ColCount + i];
			unsigned int threads = (unsigned int)(query.size() > 1024 ? 1024 : query.size());
			//unsigned int blocks = (unsigned int)ceil(query.size() / threads);
			unsigned int blocks = (unsigned int)(query.size() / threads + (query.size() % threads ? 1 : 0));
			kdtree::point<T, D> *gpu_Query;
			size_t *gpu_ret_indices;
			T *gpu_ret_dist;
			HANDLE_ERROR(cudaMalloc((void**)&gpu_Query, sizeof(T) * query.size() * D));
			HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_indices, sizeof(size_t) * query.size()));
			HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_dist, sizeof(T) * query.size()));
			HANDLE_ERROR(cudaMemcpy(gpu_Query, &query[0], sizeof(T) * query.size() * D, cudaMemcpyHostToDevice));
			SearchBatch << <threads, blocks >> > (m_gpu_nodes, m_gpu_indices, m_gpu_points, m_num_points, gpu_Query, query.size(), gpu_ret_indices, gpu_ret_dist);
			HANDLE_ERROR(cudaMemcpy(indices, gpu_ret_indices, sizeof(size_t) * query.size(), cudaMemcpyDeviceToHost));
			HANDLE_ERROR(cudaMemcpy(dists, gpu_ret_dist, sizeof(T) * query.size(), cudaMemcpyDeviceToHost));
			HANDLE_ERROR(cudaFree(gpu_Query));
			HANDLE_ERROR(cudaFree(gpu_ret_indices));
			HANDLE_ERROR(cudaFree(gpu_ret_dist));
		}
	};
}				//end namespace stim
#endif