Commit cc09e435897de0d20bf3d141043f0b67ceadc62f

Authored by David Mayerich
1 parent 19a3973b

added Jack's KD-Tree code

Showing 1 changed file with 415 additions and 0 deletions   Show diff stats
stim/structures/kdtree.cuh 0 → 100644
// Tune CUDA_STACK together with max_tree_levels by trial and error.
// Input data must be stored in row-major order:
  3 +// x1,x2,x3,x4,x5......
  4 +// y1,y2,y3,y4,y5......
  5 +// ....................
  6 +// ....................
  7 +
  8 +#ifndef KDTREE_H
  9 +#define KDTREE_H
  10 +
  11 +#include "device_launch_parameters.h"
  12 +#include <cuda.h>
  13 +#include <cuda_runtime_api.h>
  14 +#include "cuda_runtime.h"
  15 +#include <vector>
  16 +#include <float.h>
  17 +#include <iostream>
  18 +#include <algorithm>
  19 +
  20 +/// using API called HADDLE_ERROR
  21 +static void HandleError(cudaError_t err, const char *file, int line) {
  22 + if (err != cudaSuccess) {
  23 + std::cout<<cudaGetErrorString(err)<<" in"<< file <<" at line "<<line<<std::endl;
  24 + }
  25 +}
  26 +#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
  27 +
  28 +#define CUDA_STACK 2 // implementation "stacks" on CUDA as to do store the nodes information
  29 +
  30 +namespace stim {
  31 + namespace kdtree {
  32 + template<typename T, int D>
  33 + struct point {
  34 + T coords[D]; // if we use size to measure a vector<point>, it will show the number of point structures
  35 + };
  36 +
  37 + template<typename T>
  38 + class KDNode {
  39 + public:
  40 + KDNode() { // initialization
  41 + parent = NULL;
  42 + left = NULL;
  43 + right = NULL;
  44 + split_value = -1;
  45 + _parent = -1;
  46 + _left = -1;
  47 + _right = -1;
  48 + }
  49 + int id; // id for current node
  50 + size_t level;
  51 + KDNode *parent, *left, *right;
  52 + int _parent, _left, _right; // id for parent node
  53 + T split_value; // node value
  54 + std::vector <size_t> indices; // indices that indicate the data that current tree has
  55 + };
  56 + }
  57 +
  58 + template <typename T, int D = 3>
  59 + class cpu_kdtree {
  60 +
  61 + protected:
  62 + std::vector <kdtree::point<T, D>> *m_pts;
  63 + kdtree::KDNode<T> *m_root; // current node
  64 + int m_current_axis;
  65 + size_t m_levels;
  66 + int m_cmps; // count how many comparisons are to made in the tree for one query
  67 + int m_id; // level + 1
  68 + static cpu_kdtree<T, D> *myself;
  69 + public:
  70 + cpu_kdtree() { // initialization
  71 + myself = this;
  72 + m_id = 0; // id = level + 1, level -> axis index while id -> node identifier
  73 + }
  74 + ~cpu_kdtree() { // destructor for deleting what was created by kdtree()
  75 + std::vector <kdtree::KDNode<T>*> next_node;
  76 + next_node.push_back(m_root);
  77 + while (next_node.size()) {
  78 + std::vector <kdtree::KDNode<T>*> next_search;
  79 + while (next_node.size()) {
  80 + kdtree::KDNode<T> *cur = next_node.back();
  81 + next_node.pop_back();
  82 + if (cur->left)
  83 + next_search.push_back(cur->left);
  84 + if (cur->right)
  85 + next_search.push_back(cur->right);
  86 + delete cur;
  87 + }
  88 + next_node = next_search;
  89 + }
  90 + m_root = NULL;
  91 + }
  92 + void Create(std::vector <kdtree::point<T, D>> &pts, size_t max_levels) {
  93 + m_pts = &pts; // create a pointer point to the input data
  94 + m_levels = max_levels; // stores max tree levels
  95 + m_root = new kdtree::KDNode<T>(); // using KDNode() to initialize an ancestor node
  96 + m_root->id = m_id++; // id is 1 while level is 0 at the very beginning
  97 + m_root->level = 0; // to begin with level 0
  98 + m_root->indices.resize(pts.size()); // initialize the size of whole indices
  99 + for (size_t i = 0; i < pts.size(); i++) {
  100 + m_root->indices[i] = i; // like what we did on Keys in GPU-BF part
  101 + }
  102 + std::vector <kdtree::KDNode<T>*> next_node; // next node
  103 + next_node.push_back(m_root); // new node
  104 + while (next_node.size()) {
  105 + std::vector <kdtree::KDNode<T>*> next_search;
  106 + while (next_node.size()) { // two same WHILE is because we need to make a new vector for searching
  107 + kdtree::KDNode<T> *current_node = next_node.back(); // pointer point to current node (right first)
  108 + next_node.pop_back(); // pop out current node in order to store next node
  109 + if (current_node->level < max_levels) { // max_levels should be reasonably small compared with numbers of data
  110 + if (current_node->indices.size() > 1) {
  111 + kdtree::KDNode<T> *left = new kdtree::KDNode<T>();
  112 + kdtree::KDNode<T> *right = new kdtree::KDNode<T>();
  113 + left->id = m_id++; // risky guessing but OK for large amount of data since max_level is small
  114 + right->id = m_id++; // risky guessing but OK for large amount of data since max_level is small
  115 + Split(current_node, left, right); // split left and right and determine a node
  116 + std::vector <size_t> temp; // empty vecters of int
  117 + //temp.resize(current_node->indices.size());
  118 + current_node->indices.swap(temp); // clean up current tree's indices
  119 + current_node->left = left;
  120 + current_node->right = right;
  121 + current_node->_left = left->id; // indicates it has left son node and gets its id
  122 + current_node->_right = right->id; // indicates it has right son node and gets its id
  123 + if (left->indices.size())
  124 + next_search.push_back(left); // right first then left according to stack(first in last out), it can be done in parallel for left and right are independent
  125 + if (right->indices.size())
  126 + next_search.push_back(right);
  127 + }
  128 + }
  129 + }
  130 + next_node = next_search;
  131 + }
  132 + }
  133 + static bool SortPoints(const size_t a, const size_t b) {
  134 + std::vector <kdtree::point<T, D>> &pts = *myself->m_pts;
  135 + return pts[a].coords[myself->m_current_axis] < pts[b].coords[myself->m_current_axis];
  136 + }
  137 + void Split(kdtree::KDNode<T> *cur, kdtree::KDNode<T> *left, kdtree::KDNode<T> *right) {
  138 + /// assume both two sides are created and sure it was
  139 + std::vector <kdtree::point<T, D>> &pts = *m_pts;
  140 + m_current_axis = cur->level % D; // indicate the judicative dimension or axis
  141 + std::sort(cur->indices.begin(), cur->indices.end(), SortPoints); // using SortPoints as comparison function to sort the data
  142 + size_t mid = cur->indices[cur->indices.size() / 2]; // odd in the mid, even take the floor
  143 + cur->split_value = pts[mid].coords[m_current_axis]; // get the mother node
  144 + left->parent = cur; // set the parent to current node for the next nodes
  145 + right->parent = cur;
  146 + left->level = cur->level + 1;
  147 + right->level = cur->level + 1;
  148 + left->_parent = cur->id; // indicates it has mother node and gets its id
  149 + right->_parent = cur->id; // indicates it has mother node and gets its id
  150 + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right area one by one
  151 + size_t idx = cur->indices[i];
  152 + if (pts[idx].coords[m_current_axis] < cur->split_value)
  153 + left->indices.push_back(idx);
  154 + else
  155 + right->indices.push_back(idx);
  156 + }
  157 + }
  158 + int GetNumNodes() const { return m_id; }
  159 + kdtree::KDNode<T>* GetRoot() const { return m_root; }
  160 + }; //end class kdtree
  161 +
  162 + template <typename T, int D>
  163 + cpu_kdtree<T, D>* cpu_kdtree<T, D>::myself = NULL; // definition of myself pointer points to the specific class
  164 +
  165 + template <typename T>
  166 + struct CUDA_KDNode {
  167 + size_t level;
  168 + int parent, left, right; // indicates id of
  169 + T split_value;
  170 + size_t num_indices; // number of indices it has
  171 + int indices; // the beginning
  172 + };
  173 +
  174 + template <typename T, int D>
  175 + __device__ T Distance(kdtree::point<T, D> &a, kdtree::point<T, D> &b) {
  176 + T dist = 0;
  177 +
  178 + for (size_t i = 0; i < D; i++) {
  179 + T d = a.coords[i] - b.coords[i];
  180 + dist += d*d;
  181 + }
  182 + return dist;
  183 + }
  184 + template <typename T, int D>
  185 + __device__ void SearchAtNode(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, int cur, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist, int *ret_node) {
  186 + /// finds the first possibility
  187 + size_t best_idx = 0;
  188 + T best_dist = FLT_MAX;
  189 +
  190 + while (true) {
  191 + int split_axis = nodes[cur].level % D;
  192 + if (nodes[cur].left == -1) { // if it doesn't have left son node
  193 + *ret_node = cur;
  194 + for (int i = 0; i < nodes[cur].num_indices; i++) {
  195 + size_t idx = indices[nodes[cur].indices + i];
  196 + T dist = Distance<T, D>(Query, pts[idx]);
  197 + if (dist < best_dist) {
  198 + best_dist = dist;
  199 + best_idx = idx;
  200 + }
  201 + }
  202 + break;
  203 + }
  204 + else if (Query.coords[split_axis] < nodes[cur].split_value) {
  205 + cur = nodes[cur].left;
  206 + }
  207 + else {
  208 + cur = nodes[cur].right;
  209 + }
  210 + }
  211 + *ret_index = best_idx;
  212 + *ret_dist = best_dist;
  213 + }
  214 + template <typename T, int D>
  215 + __device__ void SearchAtNodeRange(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, int cur, T range, size_t *ret_index, T *ret_dist) {
  216 + /// search throught all the nodes that are within one range
  217 + size_t best_idx = 0;
  218 + T best_dist = FLT_MAX;
  219 + /// using fixed stack, increase it when need
  220 + int next_node[CUDA_STACK]; // should be larger than 1
  221 + int next_node_pos = 0; // initialize pop out order index
  222 + next_node[next_node_pos++] = cur; // equals to next_node[next_node_pos] = cur, next_node_pos++
  223 +
  224 + while (next_node_pos) {
  225 + int next_search[CUDA_STACK]; // for store next nodes
  226 + int next_search_pos = 0; // record push back order index
  227 + while (next_node_pos) {
  228 + cur = next_node[next_node_pos - 1]; // pop out the last in one and keep poping out
  229 + next_node_pos--;
  230 + int split_axis = nodes[cur].level % D;
  231 +
  232 + if (nodes[cur].left == -1) {
  233 + for (int i = 0; i < nodes[cur].num_indices; i++) {
  234 + int idx = indices[nodes[cur].indices + i]; // all indices are stored in one array, pick up from every node's beginning index
  235 + T d = Distance<T>(Query, pts[idx]);
  236 + if (d < best_dist) {
  237 + best_dist = d;
  238 + best_idx = idx;
  239 + }
  240 + }
  241 + }
  242 + else {
  243 + T d = Query.coords[split_axis] - nodes[cur].split_value;
  244 +
  245 + if (fabs(d) > range) {
  246 + if (d < 0)
  247 + next_search[next_search_pos++] = nodes[cur].left;
  248 + else
  249 + next_search[next_search_pos++] = nodes[cur].right;
  250 + }
  251 + else {
  252 + next_search[next_search_pos++] = nodes[cur].left;
  253 + next_search[next_search_pos++] = nodes[cur].right;
  254 + }
  255 + }
  256 + }
  257 +
  258 + for (int i = 0; i < next_search_pos; i++)
  259 + next_node[i] = next_search[i];
  260 + next_node_pos = next_search_pos; // operation that really resemble STACK, namely first in last out
  261 + }
  262 + *ret_index = best_idx;
  263 + *ret_dist = best_dist;
  264 + }
  265 + template <typename T, int D>
  266 + __device__ void Search(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist) {
  267 + /// find first nearest node
  268 + int best_node = 0;
  269 + size_t best_idx = 0;
  270 + T best_dist = FLT_MAX;
  271 + T radius = 0;
  272 + SearchAtNode<T, D>(nodes, indices, pts, 0, Query, &best_idx, &best_dist, &best_node);
  273 + radius = sqrt(best_dist);
  274 + /// find other possibilities
  275 + int cur = best_node;
  276 +
  277 + while (nodes[cur].parent != -1) {
  278 + /// go up
  279 + int parent = nodes[cur].parent;
  280 + int split_axis = nodes[parent].level % D;
  281 + /// search other node
  282 + T tmp_dist = FLT_MAX;
  283 + size_t tmp_idx;
  284 + if (fabs(nodes[parent].split_value - Query.coords[split_axis]) <= radius) {
  285 + /// search opposite node
  286 + if (nodes[parent].left != cur)
  287 + SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].left, radius, &tmp_idx, &tmp_dist);
  288 + else
  289 + SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].right, radius, &tmp_idx, &tmp_dist);
  290 + }
  291 + if (tmp_dist < best_dist) {
  292 + best_dist = tmp_dist;
  293 + best_idx = tmp_idx;
  294 + }
  295 + cur = parent;
  296 + }
  297 + *ret_index = best_idx;
  298 + *ret_dist = sqrt(best_dist);
  299 + }
  300 + template <typename T, int D>
  301 + __global__ void SearchBatch(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, size_t num_pts, kdtree::point<T, D> *Query, size_t num_Query, size_t *ret_index, T *ret_dist) {
  302 + int idx = blockIdx.x * blockDim.x + threadIdx.x;
  303 + if (idx >= num_Query) return;
  304 +
  305 + Search<T, D>(nodes, indices, pts, Query[idx], &ret_index[idx], &ret_dist[idx]); // every query points are independent
  306 + }
  307 +
  308 + template <typename T, int D = 3>
  309 + class cuda_kdtree {
  310 + protected:
  311 + CUDA_KDNode<T> *m_gpu_nodes; // store nodes
  312 + size_t *m_gpu_indices;
  313 + kdtree::point<T, D>* m_gpu_points;
  314 + size_t m_num_points;
  315 + public:
  316 + ~cuda_kdtree() {
  317 + HANDLE_ERROR(cudaFree(m_gpu_nodes));
  318 + HANDLE_ERROR(cudaFree(m_gpu_indices));
  319 + HANDLE_ERROR(cudaFree(m_gpu_points));
  320 + }
  321 + void CreateKDTree(T *ReferencePoints, size_t ReferenceCount, size_t ColCount, size_t max_levels) {
  322 + std::vector < kdtree::point<T, D> > pts(ReferenceCount); // create specific struct of reference data
  323 + for (size_t j = 0; j < ReferenceCount; j++)
  324 + for (size_t i = 0; i < ColCount; i++)
  325 + pts[j].coords[i] = ReferencePoints[j * ColCount + i];
  326 + cpu_kdtree<T, D> tree; // initialize a tree
  327 + tree.Create(pts, max_levels); // create KD-Tree on host
  328 + kdtree::KDNode<T> *root = tree.GetRoot();
  329 + int num_nodes = tree.GetNumNodes();
  330 + /// create the same on CPU
  331 + m_num_points = pts.size(); // number of points for creating tree = reference_count in the case
  332 +
  333 + HANDLE_ERROR(cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode<T>) * num_nodes)); // private variables for kdtree
  334 + HANDLE_ERROR(cudaMalloc((void**)&m_gpu_indices, sizeof(size_t) * m_num_points));
  335 + HANDLE_ERROR(cudaMalloc((void**)&m_gpu_points, sizeof(kdtree::point<T, D>) * m_num_points));
  336 +
  337 + std::vector <CUDA_KDNode<T>> cpu_nodes(num_nodes); // from left to right, id of nodes
  338 + std::vector <size_t> indices(m_num_points);
  339 + std::vector < kdtree::KDNode<T>* > next_node;
  340 +
  341 + size_t cur_pos = 0;
  342 +
  343 + next_node.push_back(root);
  344 +
  345 + while (next_node.size()) {
  346 + std::vector <typename kdtree::KDNode<T>* > next_search;
  347 +
  348 + while (next_node.size()) {
  349 + kdtree::KDNode<T> *cur = next_node.back();
  350 + next_node.pop_back();
  351 +
  352 + int id = cur->id; // the nodes at same level are independent
  353 +
  354 + cpu_nodes[id].level = cur->level;
  355 + cpu_nodes[id].parent = cur->_parent;
  356 + cpu_nodes[id].left = cur->_left;
  357 + cpu_nodes[id].right = cur->_right;
  358 + cpu_nodes[id].split_value = cur->split_value;
  359 + cpu_nodes[id].num_indices = cur->indices.size(); // number of index
  360 +
  361 + if (cur->indices.size()) {
  362 + for (size_t i = 0; i < cur->indices.size(); i++)
  363 + indices[cur_pos + i] = cur->indices[i];
  364 +
  365 + cpu_nodes[id].indices = (int)cur_pos; // beginning index that every bottom node has
  366 + cur_pos += cur->indices.size(); // store indices continuously
  367 + }
  368 + else {
  369 + cpu_nodes[id].indices = -1;
  370 + }
  371 +
  372 + if (cur->left)
  373 + next_search.push_back(cur->left);
  374 +
  375 + if (cur->right)
  376 + next_search.push_back(cur->right);
  377 + }
  378 + next_node = next_search;
  379 + }
  380 +
  381 + HANDLE_ERROR(cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode<T>) * cpu_nodes.size(), cudaMemcpyHostToDevice));
  382 + HANDLE_ERROR(cudaMemcpy(m_gpu_indices, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
  383 + HANDLE_ERROR(cudaMemcpy(m_gpu_points, &pts[0], sizeof(kdtree::point<T, D>) * pts.size(), cudaMemcpyHostToDevice));
  384 + }
  385 + void Search(T *QueryPoints, size_t QueryCount, size_t ColCount, T *dists, size_t *indices) {
  386 + std::vector < kdtree::point<T, D> > query(QueryCount);
  387 + for (size_t j = 0; j < QueryCount; j++)
  388 + for (size_t i = 0; i < ColCount; i++)
  389 + query[j].coords[i] = QueryPoints[j * ColCount + i];
  390 +
  391 + unsigned int threads = (unsigned int)(query.size() > 1024 ? 1024 : query.size());
  392 + //unsigned int blocks = (unsigned int)ceil(query.size() / threads);
  393 + unsigned int blocks = (unsigned int)(query.size() / threads + (query.size() % threads ? 1 : 0));
  394 +
  395 + kdtree::point<T, D> *gpu_Query;
  396 + size_t *gpu_ret_indices;
  397 + T *gpu_ret_dist;
  398 +
  399 + HANDLE_ERROR(cudaMalloc((void**)&gpu_Query, sizeof(T) * query.size() * D));
  400 + HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_indices, sizeof(size_t) * query.size()));
  401 + HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_dist, sizeof(T) * query.size()));
  402 + HANDLE_ERROR(cudaMemcpy(gpu_Query, &query[0], sizeof(T) * query.size() * D, cudaMemcpyHostToDevice));
  403 +
  404 + SearchBatch << <threads, blocks >> > (m_gpu_nodes, m_gpu_indices, m_gpu_points, m_num_points, gpu_Query, query.size(), gpu_ret_indices, gpu_ret_dist);
  405 +
  406 + HANDLE_ERROR(cudaMemcpy(indices, gpu_ret_indices, sizeof(size_t) * query.size(), cudaMemcpyDeviceToHost));
  407 + HANDLE_ERROR(cudaMemcpy(dists, gpu_ret_dist, sizeof(T) * query.size(), cudaMemcpyDeviceToHost));
  408 +
  409 + HANDLE_ERROR(cudaFree(gpu_Query));
  410 + HANDLE_ERROR(cudaFree(gpu_ret_indices));
  411 + HANDLE_ERROR(cudaFree(gpu_ret_dist));
  412 + }
  413 + };
  414 +} //end namespace stim
  415 +#endif
0 416 \ No newline at end of file
... ...