kdtree.cuh 18.4 KB

Edit Raw Blame History

// right now the size of CUDA STACK is set to 1000, increase it if you mean to make deeper tree
// data should be stored in row-major
// x1,x2,x3,x4,x5......
// y1,y2,y3,y4,y5......
// ....................
// ....................

#ifndef KDTREE_H
#define KDTREE_H

#include "device_launch_parameters.h"
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "cuda_runtime.h"
#include <vector>
#include <float.h>
#include <iostream>
#include <algorithm>
#include <stim/cuda/cudatools/error.h>

namespace stim {
	namespace kdtree {
		template<typename T, int D>											// typename refers to float or double while D refers to dimension of points
		struct point {
			T dim[D];														// create a structure to store every one input point
		};

		template<typename T>
		class kdnode {
		public:
			kdnode() {														// constructor for initializing a kdnode
				parent = NULL;												// set every node's parent, left and right kdnode pointers to NULL
				left = NULL;
				right = NULL;
				parent_idx = -1;											// set parent node index to default -1
				left_idx = -1;
				right_idx = -1;
				split_value = -1;											// set split_value to default -1
			}
			int idx;														// index of current node
			int parent_idx, left_idx, right_idx;							// index of parent, left and right nodes
			kdnode *parent, *left, *right;									// parent, left and right kdnodes
			T split_value;													// splitting value of current node
			std::vector <size_t> indices;									// it indicates the points' indices that current node has
			size_t level;													// tree level of current node
		};
	}				// end of namespace kdtree

	template <typename T, int D = 3>										// set dimension of data to default 3
	class cpu_kdtree {
	protected:
		int current_axis;													// current judging axis
		int cmps;															// count how many time of comparisons (just for cpu-kdtree)
		int n_id;															// store the total number of nodes
		std::vector <kdtree::point<T, D>> *tmp_points;						// transfer or temp points
		kdtree::kdnode<T> *root;											// root node
		static cpu_kdtree<T, D> *cur_tree_ptr;
	public:
		cpu_kdtree() {														// constructor for creating a cpu_kdtree
			cur_tree_ptr = this;												// create  a class pointer points to the current class value
			n_id = 0;														// set total number of points to default 0
		}
		~cpu_kdtree() {											  			// destructor of cpu_kdtree
			std::vector <kdtree::kdnode<T>*> next_nodes;
			next_nodes.push_back(root);
			while (next_nodes.size()) {
				std::vector <kdtree::kdnode<T>*> next_search_nodes;
				while (next_nodes.size()) {
					kdtree::kdnode<T> *cur = next_nodes.back();
					next_nodes.pop_back();
					if (cur->left)
						next_search_nodes.push_back(cur->left);
					if (cur->right)
						next_search_nodes.push_back(cur->right);
					delete cur;
				}
				next_nodes = next_search_nodes;
			}
			root = NULL;
		}
		void Create(std::vector <kdtree::point<T, D>> &reference_points, size_t max_levels) {
			tmp_points = &reference_points;
			root = new kdtree::kdnode<T>();									// initializing the root node
			root->idx = n_id++;												// the index of root is 0
			root->level = 0;												// tree level begins at 0
			root->indices.resize(reference_points.size());					// get the number of points
			for (size_t i = 0; i < reference_points.size(); i++) {
				root->indices[i] = i;										// set indices of input points
			}
			std::vector <kdtree::kdnode<T>*> next_nodes;					// next nodes
			next_nodes.push_back(root);										// push back the root node
			while (next_nodes.size()) {
				std::vector <kdtree::kdnode<T>*> next_search_nodes;			// next search nodes
				while (next_nodes.size()) {									// two same WHILE is because we need to make a new vector to store nodes for search
					kdtree::kdnode<T> *current_node = next_nodes.back();	// handle node one by one (right first)
					next_nodes.pop_back();									// pop out current node in order to store next round of nodes
					if (current_node->level < max_levels) {
						if (current_node->indices.size() > 1) {				// split if the nonleaf node contains more than one point
							kdtree::kdnode<T> *left = new kdtree::kdnode<T>();
							kdtree::kdnode<T> *right = new kdtree::kdnode<T>();
							left->idx = n_id++;								// set the index of current node's left node
							right->idx = n_id++;
							Split(current_node, left, right);				// split left and right and determine a node
							std::vector <size_t> temp;						// empty vecters of int
							//temp.resize(current_node->indices.size());
							current_node->indices.swap(temp);				// clean up current node's indices
							current_node->left = left;
							current_node->right = right;
							current_node->left_idx = left->idx;
							current_node->right_idx = right->idx;
							if (right->indices.size())
								next_search_nodes.push_back(right);			// left pop out first
							if (left->indices.size())
								next_search_nodes.push_back(left);
						}
					}
				}
				next_nodes = next_search_nodes;								// go deeper within the tree
			}
		}
		static bool SortPoints(const size_t a, const size_t b) {						// create functor for std::sort
			std::vector <kdtree::point<T, D>> &pts = *cur_tree_ptr->tmp_points;			// put cur_tree_ptr to current input points' pointer
			return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis];
		}
		void Split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) {
			std::vector <kdtree::point<T, D>> &pts = *tmp_points;
			current_axis = cur->level % D;												// indicate the judicative dimension or axis
			std::sort(cur->indices.begin(), cur->indices.end(), SortPoints);			// using SortPoints as comparison function to sort the data
			size_t mid_value = cur->indices[cur->indices.size() / 2];                   // odd in the mid_value, even take the floor
			cur->split_value = pts[mid_value].dim[current_axis];						// get the parent node
			left->parent = cur;                                                         // set the parent of the next search nodes to current node
			right->parent = cur;
			left->level = cur->level + 1;												// level + 1
			right->level = cur->level + 1;
			left->parent_idx = cur->idx;                                                // set its parent node's index
			right->parent_idx = cur->idx;
			for (size_t i = 0; i < cur->indices.size(); i++) {							// split into left and right half-space one by one
				size_t idx = cur->indices[i];
				if (pts[idx].dim[current_axis] < cur->split_value)
					left->indices.push_back(idx);
				else
					right->indices.push_back(idx);
			}
		}
		int GetNumNodes() const {														// get the total number of nodes
			return n_id;
		}
		kdtree::kdnode<T>* GetRoot() const {											// get the root node of tree
			return root;
		}
	};				//end class kdtree

	template <typename T, int D>
	cpu_kdtree<T, D>* cpu_kdtree<T, D>::cur_tree_ptr = NULL;							// definition of cur_tree_ptr pointer points to the current class

	template <typename T>
	struct cuda_kdnode {
		int parent, left, right;
		T split_value;
		size_t num_index;																// number of indices it has
		int index;																		// the beginning index
		size_t level;
	};

	template <typename T, int D>
	__device__ T Distance(kdtree::point<T, D> &a, kdtree::point<T, D> &b) {
		T dist = 0;

		for (size_t i = 0; i < D; i++) {
			T d = a.dim[i] - b.dim[i];
			dist += d*d;
		}
		return dist;
	}
	template <typename T, int D>
	__device__ void SearchAtNode(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
		T best_distance = FLT_MAX;
		size_t best_index = 0;

		while (true) {																	// break until reach the bottom
			int split_axis = nodes[cur].level % D;
			if (nodes[cur].left == -1) {												// check whether it has left node or not
				*d_node = cur;
				for (int i = 0; i < nodes[cur].num_index; i++) {
					size_t idx = indices[nodes[cur].index + i];
					T dist = Distance<T, D>(d_query_point, d_reference_points[idx]);
					if (dist < best_distance) {
						best_distance = dist;
						best_index = idx;
					}
				}
			break;
			}
			else if (d_query_point.dim[split_axis] < nodes[cur].split_value) {			// jump into specific son node
				cur = nodes[cur].left;
			}
			else {
				cur = nodes[cur].right;
			}
		}
		*d_distance = best_distance;
		*d_index = best_index;
	}
	template <typename T, int D>
	__device__ void SearchAtNodeRange(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
		T best_distance = FLT_MAX;
		size_t best_index = 0;

		int next_nodes_pos = 0;															// initialize pop out order index
		next_nodes[id * 1000 + next_nodes_pos] = cur;									// find data that belongs to the very specific thread
		next_nodes_pos++;

		while (next_nodes_pos) {
			int next_search_nodes_pos = 0;												// record push back order index
			while (next_nodes_pos) {
				cur = next_nodes[id * 1000 + next_nodes_pos - 1];						// pop out the last push in one and keep poping out
				next_nodes_pos--;
				int split_axis = nodes[cur].level % D;

				if (nodes[cur].left == -1) {
					for (int i = 0; i < nodes[cur].num_index; i++) {
						int idx = indices[nodes[cur].index + i];						// all indices are stored in one array, pick up from every node's beginning index
						T d = Distance<T>(d_query_point, d_reference_points[idx]);
						if (d < best_distance) {
							best_distance = d;
							best_index = idx;
						}
					}
				}
				else {
					T d = d_query_point.dim[split_axis] - nodes[cur].split_value;

					if (fabs(d) > range) {
						if (d < 0) {
							next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left;
							next_search_nodes_pos++;
						}
						else {
							next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right;
							next_search_nodes_pos++;
						}
					}
					else {
						next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right;
						next_search_nodes_pos++;
						next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left;
						next_search_nodes_pos++;
						if (next_search_nodes_pos > 1000) {
							printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id);
							(*Judge)++;
						}
					}
				}
			}
			for (int i = 0; i < next_search_nodes_pos; i++)
				next_nodes[id * 1000 + i] = next_search_nodes[id * 1000 + i];
			next_nodes_pos = next_search_nodes_pos;
		}
		*d_distance = best_distance;
		*d_index = best_index;
	}
	template <typename T, int D>
	__device__ void Search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
		int best_node = 0;
		T best_distance = FLT_MAX;
		size_t best_index = 0;
		T radius = 0;

		SearchAtNode<T, D>(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node);
		radius = sqrt(best_distance);																															// get range
		int cur = best_node;

		while (nodes[cur].parent != -1) {
			int parent = nodes[cur].parent;
			int split_axis = nodes[parent].level % D;

			T tmp_dist = FLT_MAX;
			size_t tmp_idx;
			if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) {
				if (nodes[parent].left != cur)
					SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
				else
					SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
			}
			if (tmp_dist < best_distance) {
				best_distance = tmp_dist;
				best_index = tmp_idx;
			}
			cur = parent;
		}
		*d_distance = sqrt(best_distance);
		*d_index = best_index;
	}
	template <typename T, int D>
	__global__ void SearchBatch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) {
		size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
		if (idx >= d_query_count) return;																														 // avoid segfault

		Search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge);    // every query points are independent
	}

	template <typename T, int D = 3>
	class cuda_kdtree {
	protected:
		cuda_kdnode<T> *d_nodes;
		size_t *d_index;
		kdtree::point<T, D>* d_reference_points;
		size_t d_reference_count;
	public:
		~cuda_kdtree() {
			HANDLE_ERROR(cudaFree(d_nodes));
			HANDLE_ERROR(cudaFree(d_index));
			HANDLE_ERROR(cudaFree(d_reference_points));
		}
		void CreateKDTree(T *h_reference_points, size_t reference_count, size_t dim_count, size_t max_levels) {
			if (max_levels > 10) {
				std::cout<<"The max_tree_levels should be smaller!"<<std::endl;
				exit(1);
			}
			std::vector <kdtree::point<T, D>> reference_points(reference_count);																				// restore the reference points in particular way
			for (size_t j = 0; j < reference_count; j++)
				for (size_t i = 0; i < dim_count; i++)
					reference_points[j].dim[i] = h_reference_points[j * dim_count + i];
			cpu_kdtree<T, D> tree;																																// creating a tree on cpu
			tree.Create(reference_points, max_levels);																											// building a tree on cpu
			kdtree::kdnode<T> *d_root = tree.GetRoot();
			int num_nodes = tree.GetNumNodes();
			d_reference_count = reference_points.size();																										// also equals to reference_count

			HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes));																		// copy data from host to device
			HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * d_reference_count));
			HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(kdtree::point<T, D>) * d_reference_count));

			std::vector <cuda_kdnode<T>> tmp_nodes(num_nodes);
			std::vector <size_t> indices(d_reference_count);
			std::vector <kdtree::kdnode<T>*> next_nodes;
			size_t cur_pos = 0;
			next_nodes.push_back(d_root);
			while (next_nodes.size()) {
				std::vector <typename kdtree::kdnode<T>*> next_search_nodes;
				while (next_nodes.size()) {
					kdtree::kdnode<T> *cur = next_nodes.back();
					next_nodes.pop_back();
					int id = cur->idx;																															// the nodes at same level are independent
					tmp_nodes[id].level = cur->level;
					tmp_nodes[id].parent = cur->parent_idx;
					tmp_nodes[id].left = cur->left_idx;
					tmp_nodes[id].right = cur->right_idx;
					tmp_nodes[id].split_value = cur->split_value;
					tmp_nodes[id].num_index = cur->indices.size();																								// number of index
					if (cur->indices.size()) {
						for (size_t i = 0; i < cur->indices.size(); i++)
							indices[cur_pos + i] = cur->indices[i];

						tmp_nodes[id].index = (int)cur_pos;																										// beginning index of reference_points that every bottom node has
						cur_pos += cur->indices.size();																											// store indices continuously for every query_point
					}
					else {
						tmp_nodes[id].index = -1;
					}

					if (cur->left)
						next_search_nodes.push_back(cur->left);

					if (cur->right)
						next_search_nodes.push_back(cur->right);
				}
				next_nodes = next_search_nodes;
			}
			HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice));
			HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
			HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_points.size(), cudaMemcpyHostToDevice));
		}
		void Search(T *h_query_points, size_t query_count, size_t dim_count, T *dists, size_t *indices) {
			std::vector <kdtree::point<T, D>> query_points(query_count);
			for (size_t j = 0; j < query_count; j++)
				for (size_t i = 0; i < dim_count; i++)
					query_points[j].dim[i] = h_query_points[j * dim_count + i];

			unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size());
			unsigned int blocks = (unsigned int)(query_points.size() / threads + (query_points.size() % threads ? 1 : 0));

			kdtree::point<T, D> *d_query_points;																												// create a pointer pointing to query points on gpu
			size_t *d_indices;
			T *d_distances;

			int *next_nodes;																																	// create two STACK-like array
			int *next_search_nodes;

			int *Judge = NULL;																																	// judge variable to see whether one thread is overwrite another thread's memory

			HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_points.size() * D));
			HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size()));
			HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size()));
			HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * 1000 * sizeof(int)));																// STACK size right now is 1000, you can change it if you mean to
			HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * 1000 * sizeof(int)));
			HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_points.size() * D, cudaMemcpyHostToDevice));

			SearchBatch<<<threads, blocks>>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge);

			if (Judge == NULL) {																																// do the following work if the thread works safely
				HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost));
				HANDLE_ERROR(cudaMemcpy(dists, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost));
			}

			HANDLE_ERROR(cudaFree(next_nodes));
			HANDLE_ERROR(cudaFree(next_search_nodes));
			HANDLE_ERROR(cudaFree(d_query_points));
			HANDLE_ERROR(cudaFree(d_indices));
			HANDLE_ERROR(cudaFree(d_distances));
		}
	};
}				//end namespace stim
#endif