Commit cc09e435897de0d20bf3d141043f0b67ceadc62f

Authored by David Mayerich
1 parent 19a3973b

added Jack's KD-Tree code

Showing 1 changed file with 415 additions and 0 deletions   Show diff stats
stim/structures/kdtree.cuh 0 → 100644
// Tune CUDA_STACK together with max_tree_levels by trial and error.
// Input data must be stored in row-major order:
  3 +// x1,x2,x3,x4,x5......
  4 +// y1,y2,y3,y4,y5......
  5 +// ....................
  6 +// ....................
  7 +
  8 +#ifndef KDTREE_H
  9 +#define KDTREE_H
  10 +
  11 +#include "device_launch_parameters.h"
  12 +#include <cuda.h>
  13 +#include <cuda_runtime_api.h>
  14 +#include "cuda_runtime.h"
  15 +#include <vector>
  16 +#include <float.h>
  17 +#include <iostream>
  18 +#include <algorithm>
  19 +
  20 +/// using API called HADDLE_ERROR
  21 +static void HandleError(cudaError_t err, const char *file, int line) {
  22 + if (err != cudaSuccess) {
  23 + std::cout<<cudaGetErrorString(err)<<" in"<< file <<" at line "<<line<<std::endl;
  24 + }
  25 +}
  26 +#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
  27 +
  28 +#define CUDA_STACK 2 // implementation "stacks" on CUDA as to do store the nodes information
  29 +
  30 +namespace stim {
  31 + namespace kdtree {
  32 + template<typename T, int D>
  33 + struct point {
  34 + T coords[D]; // if we use size to measure a vector<point>, it will show the number of point structures
  35 + };
  36 +
  37 + template<typename T>
  38 + class KDNode {
  39 + public:
  40 + KDNode() { // initialization
  41 + parent = NULL;
  42 + left = NULL;
  43 + right = NULL;
  44 + split_value = -1;
  45 + _parent = -1;
  46 + _left = -1;
  47 + _right = -1;
  48 + }
  49 + int id; // id for current node
  50 + size_t level;
  51 + KDNode *parent, *left, *right;
  52 + int _parent, _left, _right; // id for parent node
  53 + T split_value; // node value
  54 + std::vector <size_t> indices; // indices that indicate the data that current tree has
  55 + };
  56 + }
  57 +
  58 + template <typename T, int D = 3>
  59 + class cpu_kdtree {
  60 +
  61 + protected:
  62 + std::vector <kdtree::point<T, D>> *m_pts;
  63 + kdtree::KDNode<T> *m_root; // current node
  64 + int m_current_axis;
  65 + size_t m_levels;
  66 + int m_cmps; // count how many comparisons are to made in the tree for one query
  67 + int m_id; // level + 1
  68 + static cpu_kdtree<T, D> *myself;
  69 + public:
  70 + cpu_kdtree() { // initialization
  71 + myself = this;
  72 + m_id = 0; // id = level + 1, level -> axis index while id -> node identifier
  73 + }
  74 + ~cpu_kdtree() { // destructor for deleting what was created by kdtree()
  75 + std::vector <kdtree::KDNode<T>*> next_node;
  76 + next_node.push_back(m_root);
  77 + while (next_node.size()) {
  78 + std::vector <kdtree::KDNode<T>*> next_search;
  79 + while (next_node.size()) {
  80 + kdtree::KDNode<T> *cur = next_node.back();
  81 + next_node.pop_back();
  82 + if (cur->left)
  83 + next_search.push_back(cur->left);
  84 + if (cur->right)
  85 + next_search.push_back(cur->right);
  86 + delete cur;
  87 + }
  88 + next_node = next_search;
  89 + }
  90 + m_root = NULL;
  91 + }
  92 + void Create(std::vector <kdtree::point<T, D>> &pts, size_t max_levels) {
  93 + m_pts = &pts; // create a pointer point to the input data
  94 + m_levels = max_levels; // stores max tree levels
  95 + m_root = new kdtree::KDNode<T>(); // using KDNode() to initialize an ancestor node
  96 + m_root->id = m_id++; // id is 1 while level is 0 at the very beginning
  97 + m_root->level = 0; // to begin with level 0
  98 + m_root->indices.resize(pts.size()); // initialize the size of whole indices
  99 + for (size_t i = 0; i < pts.size(); i++) {
  100 + m_root->indices[i] = i; // like what we did on Keys in GPU-BF part
  101 + }
  102 + std::vector <kdtree::KDNode<T>*> next_node; // next node
  103 + next_node.push_back(m_root); // new node
  104 + while (next_node.size()) {
  105 + std::vector <kdtree::KDNode<T>*> next_search;
  106 + while (next_node.size()) { // two same WHILE is because we need to make a new vector for searching
  107 + kdtree::KDNode<T> *current_node = next_node.back(); // pointer point to current node (right first)
  108 + next_node.pop_back(); // pop out current node in order to store next node
  109 + if (current_node->level < max_levels) { // max_levels should be reasonably small compared with numbers of data
  110 + if (current_node->indices.size() > 1) {
  111 + kdtree::KDNode<T> *left = new kdtree::KDNode<T>();
  112 + kdtree::KDNode<T> *right = new kdtree::KDNode<T>();
  113 + left->id = m_id++; // risky guessing but OK for large amount of data since max_level is small
  114 + right->id = m_id++; // risky guessing but OK for large amount of data since max_level is small
  115 + Split(current_node, left, right); // split left and right and determine a node
  116 + std::vector <size_t> temp; // empty vecters of int
  117 + //temp.resize(current_node->indices.size());
  118 + current_node->indices.swap(temp); // clean up current tree's indices
  119 + current_node->left = left;
  120 + current_node->right = right;
  121 + current_node->_left = left->id; // indicates it has left son node and gets its id
  122 + current_node->_right = right->id; // indicates it has right son node and gets its id
  123 + if (left->indices.size())
  124 + next_search.push_back(left); // right first then left according to stack(first in last out), it can be done in parallel for left and right are independent
  125 + if (right->indices.size())
  126 + next_search.push_back(right);
  127 + }
  128 + }
  129 + }
  130 + next_node = next_search;
  131 + }
  132 + }
  133 + static bool SortPoints(const size_t a, const size_t b) {
  134 + std::vector <kdtree::point<T, D>> &pts = *myself->m_pts;
  135 + return pts[a].coords[myself->m_current_axis] < pts[b].coords[myself->m_current_axis];
  136 + }
  137 + void Split(kdtree::KDNode<T> *cur, kdtree::KDNode<T> *left, kdtree::KDNode<T> *right) {
  138 + /// assume both two sides are created and sure it was
  139 + std::vector <kdtree::point<T, D>> &pts = *m_pts;
  140 + m_current_axis = cur->level % D; // indicate the judicative dimension or axis
  141 + std::sort(cur->indices.begin(), cur->indices.end(), SortPoints); // using SortPoints as comparison function to sort the data
  142 + size_t mid = cur->indices[cur->indices.size() / 2]; // odd in the mid, even take the floor
  143 + cur->split_value = pts[mid].coords[m_current_axis]; // get the mother node
  144 + left->parent = cur; // set the parent to current node for the next nodes
  145 + right->parent = cur;
  146 + left->level = cur->level + 1;
  147 + right->level = cur->level + 1;
  148 + left->_parent = cur->id; // indicates it has mother node and gets its id
  149 + right->_parent = cur->id; // indicates it has mother node and gets its id
  150 + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right area one by one
  151 + size_t idx = cur->indices[i];
  152 + if (pts[idx].coords[m_current_axis] < cur->split_value)
  153 + left->indices.push_back(idx);
  154 + else
  155 + right->indices.push_back(idx);
  156 + }
  157 + }
  158 + int GetNumNodes() const { return m_id; }
  159 + kdtree::KDNode<T>* GetRoot() const { return m_root; }
  160 + }; //end class kdtree
  161 +
  162 + template <typename T, int D>
  163 + cpu_kdtree<T, D>* cpu_kdtree<T, D>::myself = NULL; // definition of myself pointer points to the specific class
  164 +
  165 + template <typename T>
  166 + struct CUDA_KDNode {
  167 + size_t level;
  168 + int parent, left, right; // indicates id of
  169 + T split_value;
  170 + size_t num_indices; // number of indices it has
  171 + int indices; // the beginning
  172 + };
  173 +
  174 + template <typename T, int D>
  175 + __device__ T Distance(kdtree::point<T, D> &a, kdtree::point<T, D> &b) {
  176 + T dist = 0;
  177 +
  178 + for (size_t i = 0; i < D; i++) {
  179 + T d = a.coords[i] - b.coords[i];
  180 + dist += d*d;
  181 + }
  182 + return dist;
  183 + }
  184 + template <typename T, int D>
  185 + __device__ void SearchAtNode(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, int cur, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist, int *ret_node) {
  186 + /// finds the first possibility
  187 + size_t best_idx = 0;
  188 + T best_dist = FLT_MAX;
  189 +
  190 + while (true) {
  191 + int split_axis = nodes[cur].level % D;
  192 + if (nodes[cur].left == -1) { // if it doesn't have left son node
  193 + *ret_node = cur;
  194 + for (int i = 0; i < nodes[cur].num_indices; i++) {
  195 + size_t idx = indices[nodes[cur].indices + i];
  196 + T dist = Distance<T, D>(Query, pts[idx]);
  197 + if (dist < best_dist) {
  198 + best_dist = dist;
  199 + best_idx = idx;
  200 + }
  201 + }
  202 + break;
  203 + }
  204 + else if (Query.coords[split_axis] < nodes[cur].split_value) {
  205 + cur = nodes[cur].left;
  206 + }
  207 + else {
  208 + cur = nodes[cur].right;
  209 + }
  210 + }
  211 + *ret_index = best_idx;
  212 + *ret_dist = best_dist;
  213 + }
  214 + template <typename T, int D>
  215 + __device__ void SearchAtNodeRange(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, int cur, T range, size_t *ret_index, T *ret_dist) {
  216 + /// search throught all the nodes that are within one range
  217 + size_t best_idx = 0;
  218 + T best_dist = FLT_MAX;
  219 + /// using fixed stack, increase it when need
  220 + int next_node[CUDA_STACK]; // should be larger than 1
  221 + int next_node_pos = 0; // initialize pop out order index
  222 + next_node[next_node_pos++] = cur; // equals to next_node[next_node_pos] = cur, next_node_pos++
  223 +
  224 + while (next_node_pos) {
  225 + int next_search[CUDA_STACK]; // for store next nodes
  226 + int next_search_pos = 0; // record push back order index
  227 + while (next_node_pos) {
  228 + cur = next_node[next_node_pos - 1]; // pop out the last in one and keep poping out
  229 + next_node_pos--;
  230 + int split_axis = nodes[cur].level % D;
  231 +
  232 + if (nodes[cur].left == -1) {
  233 + for (int i = 0; i < nodes[cur].num_indices; i++) {
  234 + int idx = indices[nodes[cur].indices + i]; // all indices are stored in one array, pick up from every node's beginning index
  235 + T d = Distance<T>(Query, pts[idx]);
  236 + if (d < best_dist) {
  237 + best_dist = d;
  238 + best_idx = idx;
  239 + }
  240 + }
  241 + }
  242 + else {
  243 + T d = Query.coords[split_axis] - nodes[cur].split_value;
  244 +
  245 + if (fabs(d) > range) {
  246 + if (d < 0)
  247 + next_search[next_search_pos++] = nodes[cur].left;
  248 + else
  249 + next_search[next_search_pos++] = nodes[cur].right;
  250 + }
  251 + else {
  252 + next_search[next_search_pos++] = nodes[cur].left;
  253 + next_search[next_search_pos++] = nodes[cur].right;
  254 + }
  255 + }
  256 + }
  257 +
  258 + for (int i = 0; i < next_search_pos; i++)
  259 + next_node[i] = next_search[i];
  260 + next_node_pos = next_search_pos; // operation that really resemble STACK, namely first in last out
  261 + }
  262 + *ret_index = best_idx;
  263 + *ret_dist = best_dist;
  264 + }
  265 + template <typename T, int D>
  266 + __device__ void Search(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist) {
  267 + /// find first nearest node
  268 + int best_node = 0;
  269 + size_t best_idx = 0;
  270 + T best_dist = FLT_MAX;
  271 + T radius = 0;
  272 + SearchAtNode<T, D>(nodes, indices, pts, 0, Query, &best_idx, &best_dist, &best_node);
  273 + radius = sqrt(best_dist);
  274 + /// find other possibilities
  275 + int cur = best_node;
  276 +
  277 + while (nodes[cur].parent != -1) {
  278 + /// go up
  279 + int parent = nodes[cur].parent;
  280 + int split_axis = nodes[parent].level % D;
  281 + /// search other node
  282 + T tmp_dist = FLT_MAX;
  283 + size_t tmp_idx;
  284 + if (fabs(nodes[parent].split_value - Query.coords[split_axis]) <= radius) {
  285 + /// search opposite node
  286 + if (nodes[parent].left != cur)
  287 + SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].left, radius, &tmp_idx, &tmp_dist);
  288 + else
  289 + SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].right, radius, &tmp_idx, &tmp_dist);
  290 + }
  291 + if (tmp_dist < best_dist) {
  292 + best_dist = tmp_dist;
  293 + best_idx = tmp_idx;
  294 + }
  295 + cur = parent;
  296 + }
  297 + *ret_index = best_idx;
  298 + *ret_dist = sqrt(best_dist);
  299 + }
  300 + template <typename T, int D>
  301 + __global__ void SearchBatch(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, size_t num_pts, kdtree::point<T, D> *Query, size_t num_Query, size_t *ret_index, T *ret_dist) {
  302 + int idx = blockIdx.x * blockDim.x + threadIdx.x;
  303 + if (idx >= num_Query) return;
  304 +
  305 + Search<T, D>(nodes, indices, pts, Query[idx], &ret_index[idx], &ret_dist[idx]); // every query points are independent
  306 + }
  307 +
  308 + template <typename T, int D = 3>
  309 + class cuda_kdtree {
  310 + protected:
  311 + CUDA_KDNode<T> *m_gpu_nodes; // store nodes
  312 + size_t *m_gpu_indices;
  313 + kdtree::point<T, D>* m_gpu_points;
  314 + size_t m_num_points;
  315 + public:
  316 + ~cuda_kdtree() {
  317 + HANDLE_ERROR(cudaFree(m_gpu_nodes));
  318 + HANDLE_ERROR(cudaFree(m_gpu_indices));
  319 + HANDLE_ERROR(cudaFree(m_gpu_points));
  320 + }
  321 + void CreateKDTree(T *ReferencePoints, size_t ReferenceCount, size_t ColCount, size_t max_levels) {
  322 + std::vector < kdtree::point<T, D> > pts(ReferenceCount); // create specific struct of reference data
  323 + for (size_t j = 0; j < ReferenceCount; j++)
  324 + for (size_t i = 0; i < ColCount; i++)
  325 + pts[j].coords[i] = ReferencePoints[j * ColCount + i];
  326 + cpu_kdtree<T, D> tree; // initialize a tree
  327 + tree.Create(pts, max_levels); // create KD-Tree on host
  328 + kdtree::KDNode<T> *root = tree.GetRoot();
  329 + int num_nodes = tree.GetNumNodes();
  330 + /// create the same on CPU
  331 + m_num_points = pts.size(); // number of points for creating tree = reference_count in the case
  332 +
  333 + HANDLE_ERROR(cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode<T>) * num_nodes)); // private variables for kdtree
  334 + HANDLE_ERROR(cudaMalloc((void**)&m_gpu_indices, sizeof(size_t) * m_num_points));
  335 + HANDLE_ERROR(cudaMalloc((void**)&m_gpu_points, sizeof(kdtree::point<T, D>) * m_num_points));
  336 +
  337 + std::vector <CUDA_KDNode<T>> cpu_nodes(num_nodes); // from left to right, id of nodes
  338 + std::vector <size_t> indices(m_num_points);
  339 + std::vector < kdtree::KDNode<T>* > next_node;
  340 +
  341 + size_t cur_pos = 0;
  342 +
  343 + next_node.push_back(root);
  344 +
  345 + while (next_node.size()) {
  346 + std::vector <typename kdtree::KDNode<T>* > next_search;
  347 +
  348 + while (next_node.size()) {
  349 + kdtree::KDNode<T> *cur = next_node.back();
  350 + next_node.pop_back();
  351 +
  352 + int id = cur->id; // the nodes at same level are independent
  353 +
  354 + cpu_nodes[id].level = cur->level;
  355 + cpu_nodes[id].parent = cur->_parent;
  356 + cpu_nodes[id].left = cur->_left;
  357 + cpu_nodes[id].right = cur->_right;
  358 + cpu_nodes[id].split_value = cur->split_value;
  359 + cpu_nodes[id].num_indices = cur->indices.size(); // number of index
  360 +
  361 + if (cur->indices.size()) {
  362 + for (size_t i = 0; i < cur->indices.size(); i++)
  363 + indices[cur_pos + i] = cur->indices[i];
  364 +
  365 + cpu_nodes[id].indices = (int)cur_pos; // beginning index that every bottom node has
  366 + cur_pos += cur->indices.size(); // store indices continuously
  367 + }
  368 + else {
  369 + cpu_nodes[id].indices = -1;
  370 + }
  371 +
  372 + if (cur->left)
  373 + next_search.push_back(cur->left);
  374 +
  375 + if (cur->right)
  376 + next_search.push_back(cur->right);
  377 + }
  378 + next_node = next_search;
  379 + }
  380 +
  381 + HANDLE_ERROR(cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode<T>) * cpu_nodes.size(), cudaMemcpyHostToDevice));
  382 + HANDLE_ERROR(cudaMemcpy(m_gpu_indices, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
  383 + HANDLE_ERROR(cudaMemcpy(m_gpu_points, &pts[0], sizeof(kdtree::point<T, D>) * pts.size(), cudaMemcpyHostToDevice));
  384 + }
  385 + void Search(T *QueryPoints, size_t QueryCount, size_t ColCount, T *dists, size_t *indices) {
  386 + std::vector < kdtree::point<T, D> > query(QueryCount);
  387 + for (size_t j = 0; j < QueryCount; j++)
  388 + for (size_t i = 0; i < ColCount; i++)
  389 + query[j].coords[i] = QueryPoints[j * ColCount + i];
  390 +
  391 + unsigned int threads = (unsigned int)(query.size() > 1024 ? 1024 : query.size());
  392 + //unsigned int blocks = (unsigned int)ceil(query.size() / threads);
  393 + unsigned int blocks = (unsigned int)(query.size() / threads + (query.size() % threads ? 1 : 0));
  394 +
  395 + kdtree::point<T, D> *gpu_Query;
  396 + size_t *gpu_ret_indices;
  397 + T *gpu_ret_dist;
  398 +
  399 + HANDLE_ERROR(cudaMalloc((void**)&gpu_Query, sizeof(T) * query.size() * D));
  400 + HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_indices, sizeof(size_t) * query.size()));
  401 + HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_dist, sizeof(T) * query.size()));
  402 + HANDLE_ERROR(cudaMemcpy(gpu_Query, &query[0], sizeof(T) * query.size() * D, cudaMemcpyHostToDevice));
  403 +
  404 + SearchBatch << <threads, blocks >> > (m_gpu_nodes, m_gpu_indices, m_gpu_points, m_num_points, gpu_Query, query.size(), gpu_ret_indices, gpu_ret_dist);
  405 +
  406 + HANDLE_ERROR(cudaMemcpy(indices, gpu_ret_indices, sizeof(size_t) * query.size(), cudaMemcpyDeviceToHost));
  407 + HANDLE_ERROR(cudaMemcpy(dists, gpu_ret_dist, sizeof(T) * query.size(), cudaMemcpyDeviceToHost));
  408 +
  409 + HANDLE_ERROR(cudaFree(gpu_Query));
  410 + HANDLE_ERROR(cudaFree(gpu_ret_indices));
  411 + HANDLE_ERROR(cudaFree(gpu_ret_dist));
  412 + }
  413 + };
  414 +} //end namespace stim
  415 +#endif
0 416 \ No newline at end of file
... ...