Commit 6a54a547256f32bceb7afb81e456d9c5fc699bee

Authored by Jiaming Guo
1 parent 42bb075c

new version of kdtree

Showing 1 changed file with 627 additions and 0 deletions   Show diff stats
stim/structures/kdtreenew.cuh 0 → 100644
  1 +// The per-thread CUDA traversal stack is currently sized at 50 entries (stack_size); increase it if you intend to build a deeper tree.
  2 +// data should be stored in row-major
  3 +// x1,x2,x3,x4,x5......
  4 +// y1,y2,y3,y4,y5......
  5 +// ....................
  6 +// ....................
  7 +
  8 +#ifndef KDTREE_H
  9 +#define KDTREE_H
  10 +#define stack_size 50
  11 +
  12 +#include "device_launch_parameters.h"
  13 +#include <cuda.h>
  14 +#include <cuda_runtime_api.h>
  15 +#include "cuda_runtime.h"
  16 +#include <vector>
  17 +#include <cstring>
  18 +#include <float.h>
  19 +#include <iostream>
  20 +#include <algorithm>
  21 +#include <stim/cuda/cudatools/error.h>
  22 +#include <stim/visualization/aabbn.h>
  23 +
  24 +namespace stim {
  25 + namespace cpu_kdtree {
		template<typename T, int D> // T: floating-point coordinate type; D: dimensionality of the points
		struct point {
			T dim[D]; // coordinates of one input point (row-major input is repacked into these)
		};
  30 +
  31 + template<typename T>
  32 + class cpu_kdnode {
  33 + public:
  34 + cpu_kdnode() { // constructor for initializing a kdnode
  35 + parent = NULL; // set every node's parent, left and right kdnode pointers to NULL
  36 + left = NULL;
  37 + right = NULL;
  38 + parent_idx = -1; // set parent node index to default -1
  39 + left_idx = -1;
  40 + right_idx = -1;
  41 + split_value = -1; // set split_value to default -1
  42 + }
  43 + int idx; // index of current node
  44 + int parent_idx, left_idx, right_idx; // index of parent, left and right nodes
  45 + cpu_kdnode *parent, *left, *right; // parent, left and right kdnodes
  46 + T split_value; // splitting value of current node
  47 + std::vector <size_t> indices; // it indicates the points' indices that current node has
  48 + size_t level; // tree level of current node
  49 + };
  50 + } // end of namespace cpu_kdtree
  51 +
	// Flattened, GPU-friendly tree node: links are indices into the node array
	// (-1 = none) rather than pointers. Member order/layout must match the
	// host-side fill in create(), which memcpys a vector of these to the device.
	template <typename T>
	struct cuda_kdnode {
		int parent, left, right; // indices of parent/left/right nodes (-1 = none; left == -1 marks a leaf)
		T split_value; // splitting coordinate value of this node
		size_t num_index; // number of point indices this node owns
		int index; // offset of this node's first index in the flat index array (-1 if it owns none)
		size_t level; // depth in the tree (root = 0)
	};
  60 +
  61 + template <typename T, int D>
  62 + __device__ T gpu_distance(cpu_kdtree::point<T, D> &a, cpu_kdtree::point<T, D> &b) {
  63 + T distance = 0;
  64 +
  65 + for (size_t i = 0; i < D; i++) {
  66 + T d = a.dim[i] - b.dim[i];
  67 + distance += d*d;
  68 + }
  69 + return distance;
  70 + }
  71 +
  72 + template <typename T, int D>
  73 + __device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, int cur, cpu_kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
  74 + T best_distance = FLT_MAX;
  75 + size_t best_index = 0;
  76 +
  77 + while (true) { // break until reach the bottom
  78 + int split_axis = nodes[cur].level % D;
  79 + if (nodes[cur].left == -1) { // check whether it has left node or not
  80 + *d_node = cur;
  81 + for (int i = 0; i < nodes[cur].num_index; i++) {
  82 + size_t idx = indices[nodes[cur].index + i];
  83 + T dist = gpu_distance<T, D>(d_query_point, d_reference_points[idx]);
  84 + if (dist < best_distance) {
  85 + best_distance = dist;
  86 + best_index = idx;
  87 + }
  88 + }
  89 + break;
  90 + }
  91 + else if (d_query_point.dim[split_axis] < nodes[cur].split_value) { // jump into specific son node
  92 + cur = nodes[cur].left;
  93 + }
  94 + else {
  95 + cur = nodes[cur].right;
  96 + }
  97 + }
  98 + *d_distance = best_distance;
  99 + *d_index = best_index;
  100 + }
  101 +
  102 + template <typename T, int D>
  103 + __device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
  104 + T best_distance = FLT_MAX;
  105 + size_t best_index = 0;
  106 +
  107 + int next_nodes_pos = 0; // initialize pop out order index
  108 + next_nodes[id * stack_size + next_nodes_pos] = cur; // find data that belongs to the very specific thread
  109 + next_nodes_pos++;
  110 +
  111 + while (next_nodes_pos) {
  112 + int next_search_nodes_pos = 0; // record push back order index
  113 + while (next_nodes_pos) {
  114 + cur = next_nodes[id * stack_size + next_nodes_pos - 1]; // pop out the last push in one and keep poping out
  115 + next_nodes_pos--;
  116 + int split_axis = nodes[cur].level % D;
  117 +
  118 + if (nodes[cur].left == -1) {
  119 + for (int i = 0; i < nodes[cur].num_index; i++) {
  120 + int idx = indices[nodes[cur].index + i]; // all indices are stored in one array, pick up from every node's beginning index
  121 + T d = gpu_distance<T>(d_query_point, d_reference_points[idx]);
  122 + if (d < best_distance) {
  123 + best_distance = d;
  124 + best_index = idx;
  125 + }
  126 + }
  127 + }
  128 + else {
  129 + T d = d_query_point.dim[split_axis] - nodes[cur].split_value;
  130 +
  131 + if (fabs(d) > range) {
  132 + if (d < 0) {
  133 + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left;
  134 + next_search_nodes_pos++;
  135 + }
  136 + else {
  137 + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right;
  138 + next_search_nodes_pos++;
  139 + }
  140 + }
  141 + else {
  142 + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right;
  143 + next_search_nodes_pos++;
  144 + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left;
  145 + next_search_nodes_pos++;
  146 + if (next_search_nodes_pos > stack_size) {
  147 + printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id);
  148 + (*Judge)++;
  149 + }
  150 + }
  151 + }
  152 + }
  153 + for (int i = 0; i < next_search_nodes_pos; i++)
  154 + next_nodes[id * stack_size + i] = next_search_nodes[id * stack_size + i];
  155 + next_nodes_pos = next_search_nodes_pos;
  156 + }
  157 + *d_distance = best_distance;
  158 + *d_index = best_index;
  159 + }
  160 +
  161 + template <typename T, int D>
  162 + __device__ void search(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
  163 + int best_node = 0;
  164 + T best_distance = FLT_MAX;
  165 + size_t best_index = 0;
  166 + T radius = 0;
  167 +
  168 + search_at_node<T, D>(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node);
  169 + radius = sqrt(best_distance); // get range
  170 + int cur = best_node;
  171 +
  172 + while (nodes[cur].parent != -1) {
  173 + int parent = nodes[cur].parent;
  174 + int split_axis = nodes[parent].level % D;
  175 +
  176 + T tmp_dist = FLT_MAX;
  177 + size_t tmp_idx;
  178 + if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) {
  179 + if (nodes[parent].left != cur)
  180 + search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
  181 + else
  182 + search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
  183 + }
  184 + if (tmp_dist < best_distance) {
  185 + best_distance = tmp_dist;
  186 + best_index = tmp_idx;
  187 + }
  188 + cur = parent;
  189 + }
  190 + *d_distance = sqrt(best_distance);
  191 + *d_index = best_index;
  192 + }
  193 +
  194 + template <typename T, int D>
  195 + __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) {
  196 + size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  197 + if (idx >= d_query_count) return; // avoid segfault
  198 +
  199 + search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent
  200 + }
  201 +
  202 + template <typename T, int D>
  203 + void search_stream(cuda_kdnode<T> *d_nodes, size_t *d_index, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> *query_stream_points, size_t stream_count, size_t *indices, T *distances) {
  204 + unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count);
  205 + unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 1 : 0));
  206 +
  207 + cpu_kdtree::point<T, D> *d_query_points;
  208 + size_t *d_indices;
  209 + T *d_distances;
  210 +
  211 + int *next_nodes;
  212 + int *next_search_nodes;
  213 +
  214 + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * stream_count * D));
  215 + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * stream_count));
  216 + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * stream_count));
  217 + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int)));
  218 + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));
  219 + HANDLE_ERROR(cudaMemcpy(d_query_points, query_stream_points, sizeof(T) * stream_count * D, cudaMemcpyHostToDevice));
  220 +
  221 + int *Judge = NULL;
  222 +
  223 + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, stream_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge);
  224 +
  225 + if(Judge == NULL) {
  226 + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * stream_count, cudaMemcpyDeviceToHost));
  227 + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * stream_count, cudaMemcpyDeviceToHost));
  228 + }
  229 +
  230 + HANDLE_ERROR(cudaFree(next_nodes));
  231 + HANDLE_ERROR(cudaFree(next_search_nodes));
  232 + HANDLE_ERROR(cudaFree(d_query_points));
  233 + HANDLE_ERROR(cudaFree(d_indices));
  234 + HANDLE_ERROR(cudaFree(d_distances));
  235 + }
  236 +
  237 + template <typename T, int D = 3> // set dimension of data to default 3
  238 + class kdtree {
  239 + protected:
  240 + int current_axis; // current judging axis
  241 + int n_id; // store the total number of nodes
  242 + std::vector < typename cpu_kdtree::point<T, D> > *tmp_points; // transfer or temperary points
  243 + std::vector < typename cpu_kdtree::point<T, D> > cpu_tmp_points; // for cpu searching
  244 + cpu_kdtree::cpu_kdnode<T> *root; // root node
  245 + static kdtree<T, D> *cur_tree_ptr;
  246 + #ifdef __CUDACC__
  247 + cuda_kdnode<T> *d_nodes;
  248 + size_t *d_index;
  249 + cpu_kdtree::point<T, D>* d_reference_points;
  250 + size_t npts;
  251 + int num_nodes;
  252 + #endif
  253 + public:
  254 + kdtree() { // constructor for creating a cpu_kdtree
  255 + cur_tree_ptr = this; // create a class pointer points to the current class value
  256 + n_id = 0; // set total number of points to default 0
  257 + }
  258 +
  259 + ~kdtree() { // destructor of cpu_kdtree
  260 + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes;
  261 + next_nodes.push_back(root);
  262 + while (next_nodes.size()) {
  263 + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_search_nodes;
  264 + while (next_nodes.size()) {
  265 + cpu_kdtree::cpu_kdnode<T> *cur = next_nodes.back();
  266 + next_nodes.pop_back();
  267 + if (cur->left)
  268 + next_search_nodes.push_back(cur->left);
  269 + if (cur->right)
  270 + next_search_nodes.push_back(cur->right);
  271 + delete cur;
  272 + }
  273 + next_nodes = next_search_nodes;
  274 + }
  275 + root = NULL;
  276 + #ifdef __CUDACC__
  277 + HANDLE_ERROR(cudaFree(d_nodes));
  278 + HANDLE_ERROR(cudaFree(d_index));
  279 + HANDLE_ERROR(cudaFree(d_reference_points));
  280 + #endif
  281 + }
  282 +
		// Build the CPU kd-tree by breadth-first splitting: every node holding
		// more than one point is split until max_levels is reached. The order
		// in which n_id is handed out defines each node's idx, which later
		// becomes its slot in the flattened GPU node array.
		// NOTE(review): tmp_points keeps a pointer to `reference_points`; the
		// caller must keep that vector alive while the tree is being built/used.
		void cpu_create(std::vector < typename cpu_kdtree::point<T, D> > &reference_points, size_t max_levels) {
			tmp_points = &reference_points;
			root = new cpu_kdtree::cpu_kdnode<T>(); // initialize the root node
			root->idx = n_id++; // the index of root is 0
			root->level = 0; // tree level begins at 0
			root->indices.resize(reference_points.size());
			for (size_t i = 0; i < reference_points.size(); i++) {
				root->indices[i] = i; // initially the root owns every input point
			}
			std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes; // nodes of the current level
			next_nodes.push_back(root);
			while (next_nodes.size()) {
				std::vector <cpu_kdtree::cpu_kdnode<T>*> next_search_nodes; // nodes of the next level
				while (next_nodes.size()) { // inner loop drains the current level into the next
					cpu_kdtree::cpu_kdnode<T> *current_node = next_nodes.back(); // handle nodes one by one (right child first)
					next_nodes.pop_back();
					if (current_node->level < max_levels) {
						if (current_node->indices.size() > 1) { // split any node holding more than one point
							cpu_kdtree::cpu_kdnode<T> *left = new cpu_kdtree::cpu_kdnode<T>();
							cpu_kdtree::cpu_kdnode<T> *right = new cpu_kdtree::cpu_kdnode<T>();
							left->idx = n_id++; // children get consecutive indices
							right->idx = n_id++;
							split(current_node, left, right); // sort on the split axis and partition the indices
							std::vector <size_t> temp; // empty vector used to release storage
							//temp.resize(current_node->indices.size());
							current_node->indices.swap(temp); // interior nodes do not keep their indices
							current_node->left = left;
							current_node->right = right;
							current_node->left_idx = left->idx;
							current_node->right_idx = right->idx;
							if (right->indices.size())
								next_search_nodes.push_back(right); // pushed first so left pops (and splits) first
							if (left->indices.size())
								next_search_nodes.push_back(left);
						}
					}
				}
				next_nodes = next_search_nodes; // descend one level
			}
		}
  323 +
  324 + static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort
  325 + std::vector < typename cpu_kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer
  326 + return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis];
  327 + }
  328 +
  329 + void split(cpu_kdtree::cpu_kdnode<T> *cur, cpu_kdtree::cpu_kdnode<T> *left, cpu_kdtree::cpu_kdnode<T> *right) {
  330 + std::vector < typename cpu_kdtree::point<T, D> > &pts = *tmp_points;
  331 + current_axis = cur->level % D; // indicate the judicative dimension or axis
  332 + std::sort(cur->indices.begin(), cur->indices.end(), sort_points); // using SortPoints as comparison function to sort the data
  333 + size_t mid_value = cur->indices[cur->indices.size() / 2]; // odd in the mid_value, even take the floor
  334 + cur->split_value = pts[mid_value].dim[current_axis]; // get the parent node
  335 + left->parent = cur; // set the parent of the next search nodes to current node
  336 + right->parent = cur;
  337 + left->level = cur->level + 1; // level + 1
  338 + right->level = cur->level + 1;
  339 + left->parent_idx = cur->idx; // set its parent node's index
  340 + right->parent_idx = cur->idx;
  341 + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one
  342 + size_t idx = cur->indices[i];
  343 + if (pts[idx].dim[current_axis] < cur->split_value)
  344 + left->indices.push_back(idx);
  345 + else
  346 + right->indices.push_back(idx);
  347 + }
  348 + }
  349 +
		// Build the tree from `reference_count` points stored row-major in
		// h_reference_points (D consecutive values per point). Under CUDA the
		// CPU tree is built first and then flattened into device arrays
		// (d_nodes, d_index, d_reference_points); without CUDA only the CPU
		// tree is kept, plus a copy of the points for cpu_search.
		void create(T *h_reference_points, size_t reference_count, size_t max_levels) {
#ifdef __CUDACC__
			// Depth is capped so worst-case traversals fit the fixed-size
			// per-thread stacks (stack_size slots each).
			if (max_levels > 10) {
				std::cout<<"The max_tree_levels should be smaller!"<<std::endl;
				exit(1);
			}
			//bb.init(&h_reference_points[0]);
			//aaboundingboxing<T, D>(bb, h_reference_points, reference_count);

			// Repack the raw row-major array into point structs.
			std::vector < typename cpu_kdtree::point<T, D>> reference_points(reference_count);
			for (size_t j = 0; j < reference_count; j++)
				for (size_t i = 0; i < D; i++)
					reference_points[j].dim[i] = h_reference_points[j * D + i];
			(*this).cpu_create(reference_points, max_levels); // build the pointer-based tree on the CPU first
			cpu_kdtree::cpu_kdnode<T> *d_root = (*this).get_root();
			num_nodes = (*this).get_num_nodes();
			npts = reference_count;

			HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes)); // device copies of nodes, indices and points
			HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * npts));
			HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(cpu_kdtree::point<T, D>) * npts));

			// Flatten the pointer-based tree into index-based cuda_kdnode
			// records plus one contiguous array of point indices.
			std::vector < cuda_kdnode<T> > tmp_nodes(num_nodes);
			std::vector <size_t> indices(npts);
			std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes;
			size_t cur_pos = 0; // next free slot in the flat index array
			next_nodes.push_back(d_root);
			while (next_nodes.size()) {
				std::vector <typename cpu_kdtree::cpu_kdnode<T>*> next_search_nodes;
				while (next_nodes.size()) {
					cpu_kdtree::cpu_kdnode<T> *cur = next_nodes.back();
					next_nodes.pop_back();
					int id = cur->idx; // records are keyed by node index, so traversal order does not matter
					tmp_nodes[id].level = cur->level;
					tmp_nodes[id].parent = cur->parent_idx;
					tmp_nodes[id].left = cur->left_idx;
					tmp_nodes[id].right = cur->right_idx;
					tmp_nodes[id].split_value = cur->split_value;
					tmp_nodes[id].num_index = cur->indices.size(); // number of indices this node owns
					if (cur->indices.size()) {
						for (size_t i = 0; i < cur->indices.size(); i++)
							indices[cur_pos + i] = cur->indices[i];

						tmp_nodes[id].index = (int)cur_pos; // where this node's indices start in the flat array
						cur_pos += cur->indices.size(); // every node's indices are stored contiguously
					}
					else {
						tmp_nodes[id].index = -1; // this node owns no points
					}

					if (cur->left)
						next_search_nodes.push_back(cur->left);

					if (cur->right)
						next_search_nodes.push_back(cur->right);
				}
				next_nodes = next_search_nodes;
			}
			HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice));
			HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
			HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(cpu_kdtree::point<T, D>) * reference_count, cudaMemcpyHostToDevice));

#else
			// CPU-only build: repack the input and keep a copy of the points in
			// cpu_tmp_points for cpu_search (tmp_points would otherwise dangle,
			// as it references the local vector below).
			std::vector < typename cpu_kdtree::point<T, D> > reference_points(reference_count);
			for (size_t j = 0; j < reference_count; j++)
				for (size_t i = 0; i < D; i++)
					reference_points[j].dim[i] = h_reference_points[j * D + i];
			cpu_create(reference_points, max_levels);
			cpu_tmp_points = *tmp_points;

#endif
		}
  422 +
		// Total number of nodes allocated so far (the root is assigned idx 0).
		int get_num_nodes() const {
			return n_id;
		}

		// Root node of the CPU tree; unset until cpu_create()/create() runs.
		cpu_kdtree::cpu_kdnode<T>* get_root() const {
			return root;
		}
  430 +
  431 + T cpu_distance(const cpu_kdtree::point<T, D> &a, const cpu_kdtree::point<T, D> &b) {
  432 + T distance = 0;
  433 +
  434 + for (size_t i = 0; i < D; i++) {
  435 + T d = a.dim[i] - b.dim[i];
  436 + distance += d*d;
  437 + }
  438 + return distance;
  439 + }
  440 +
  441 + void cpu_search_at_node(cpu_kdtree::cpu_kdnode<T> *cur, const cpu_kdtree::point<T, D> &query, size_t *index, T *distance, cpu_kdtree::cpu_kdnode<T> **node) {
  442 + T best_distance = FLT_MAX; // initialize the best distance to max of floating point
  443 + size_t best_index = 0;
  444 + std::vector < typename cpu_kdtree::point<T, D> > pts = cpu_tmp_points;
  445 + while (true) {
  446 + size_t split_axis = cur->level % D;
  447 + if (cur->left == NULL) { // risky but acceptable, same goes for right because left and right are in same pace
  448 + *node = cur; // pointer points to a pointer
  449 + for (size_t i = 0; i < cur->indices.size(); i++) {
  450 + size_t idx = cur->indices[i];
  451 + T d = cpu_distance(query, pts[idx]); // compute distances
  452 + /// if we want to compute k nearest neighbor, we can input the last resul
  453 + /// (last_best_dist < dist < best_dist) to select the next point until reaching to k
  454 + if (d < best_distance) {
  455 + best_distance = d;
  456 + best_index = idx; // record the nearest neighbor index
  457 + }
  458 + }
  459 + break; // find the target point then break the loop
  460 + }
  461 + else if (query.dim[split_axis] < cur->split_value) { // if it has son node, visit the next node on either left side or right side
  462 + cur = cur->left;
  463 + }
  464 + else {
  465 + cur = cur->right;
  466 + }
  467 + }
  468 + *index = best_index;
  469 + *distance = best_distance;
  470 + }
  471 +
  472 + void cpu_search_at_node_range(cpu_kdtree::cpu_kdnode<T> *cur, const cpu_kdtree::point<T, D> &query, T range, size_t *index, T *distance) {
  473 + T best_distance = FLT_MAX; // initialize the best distance to max of floating point
  474 + size_t best_index = 0;
  475 + std::vector < typename cpu_kdtree::point<T, D> > pts = cpu_tmp_points;
  476 + std::vector < typename cpu_kdtree::cpu_kdnode<T>*> next_node;
  477 + next_node.push_back(cur);
  478 + while (next_node.size()) {
  479 + std::vector<typename cpu_kdtree::cpu_kdnode<T>*> next_search;
  480 + while (next_node.size()) {
  481 + cur = next_node.back();
  482 + next_node.pop_back();
  483 + size_t split_axis = cur->level % D;
  484 + if (cur->left == NULL) {
  485 + for (size_t i = 0; i < cur->indices.size(); i++) {
  486 + size_t idx = cur->indices[i];
  487 + T d = cpu_distance(query, pts[idx]);
  488 + if (d < best_distance) {
  489 + best_distance = d;
  490 + best_index = idx;
  491 + }
  492 + }
  493 + }
  494 + else {
  495 + T d = query.dim[split_axis] - cur->split_value; // computer distance along specific axis or dimension
  496 + /// there are three possibilities: on either left or right, and on both left and right
  497 + if (fabs(d) > range) { // absolute value of floating point to see if distance will be larger that best_dist
  498 + if (d < 0)
  499 + next_search.push_back(cur->left); // every left[split_axis] is less and equal to cur->split_value, so it is possible to find the nearest point in this region
  500 + else
  501 + next_search.push_back(cur->right);
  502 + }
  503 + else { // it is possible that nereast neighbor will appear on both left and right
  504 + next_search.push_back(cur->left);
  505 + next_search.push_back(cur->right);
  506 + }
  507 + }
  508 + }
  509 + next_node = next_search; // pop out at least one time
  510 + }
  511 + *index = best_index;
  512 + *distance = best_distance;
  513 + }
  514 +
  515 + void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) {
  516 + /// first convert the input query point into specific type
  517 + cpu_kdtree::point<T, D> query;
  518 + for (size_t j = 0; j < query_count; j++) {
  519 + for (size_t i = 0; i < D; i++)
  520 + query.dim[i] = h_query_points[j * D + i];
  521 + /// find the nearest node, this will be the upper bound for the next time searching
  522 + cpu_kdtree::cpu_kdnode<T> *best_node = NULL;
  523 + T best_distance = FLT_MAX;
  524 + size_t best_index = 0;
  525 + T radius = 0; // radius for range
  526 + cpu_search_at_node(root, query, &best_index, &best_distance, &best_node); // simple search to rougly determine a result for next search step
  527 + radius = sqrt(best_distance); // It is possible that nearest will appear in another region
  528 + /// find other possibilities
  529 + cpu_kdtree::cpu_kdnode<T> *cur = best_node;
  530 + while (cur->parent != NULL) { // every node that you pass will be possible to be the best node
  531 + /// go up
  532 + cpu_kdtree::cpu_kdnode<T> *parent = cur->parent; // travel back to every node that we pass through
  533 + size_t split_axis = (parent->level) % D;
  534 + /// search other nodes
  535 + size_t tmp_index;
  536 + T tmp_distance = FLT_MAX;
  537 + if (fabs(parent->split_value - query.dim[split_axis]) <= radius) {
  538 + /// search opposite node
  539 + if (parent->left != cur)
  540 + cpu_search_at_node_range(parent->left, query, radius, &tmp_index, &tmp_distance); // to see whether it is its mother node's left son node
  541 + else
  542 + cpu_search_at_node_range(parent->right, query, radius, &tmp_index, &tmp_distance);
  543 + }
  544 + if (tmp_distance < best_distance) {
  545 + best_distance = tmp_distance;
  546 + best_index = tmp_index;
  547 + }
  548 + cur = parent;
  549 + }
  550 + h_indices[j] = best_index;
  551 + h_distances[j] = best_distance;
  552 + }
  553 + }
  554 +
  555 + void search(T *h_query_points, size_t query_count, size_t *indices, T *distances) {
  556 + #ifdef __CUDACC__
  557 + std::vector < typename cpu_kdtree::point<T, D> > query_points(query_count);
  558 + for (size_t j = 0; j < query_count; j++)
  559 + for (size_t i = 0; i < D; i++)
  560 + query_points[j].dim[i] = h_query_points[j * D + i];
  561 +
  562 + cudaDeviceProp prop;
  563 + cudaGetDeviceProperties(&prop, 0);
  564 +
  565 + size_t query_memory = D * sizeof(T) * query_count;
  566 + size_t N = 3 * query_memory / prop.totalGlobalMem; //consider index and distance, roughly 3 times
  567 + if (N > 1) {
  568 + N++;
  569 + size_t stream_count = query_count / N;
  570 + for (size_t n = 0; n < N; n++) {
  571 + size_t query_stream_start = n * stream_count;
  572 + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_stream_start], stream_count, &indices[query_stream_start], &distances[query_stream_start]);
  573 + }
  574 + size_t stream_remain_count = query_count - N * stream_count;
  575 + if (stream_remain_count > 0) {
  576 + size_t query_remain_start = N * stream_count;
  577 + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_remain_start], stream_remain_count, &indices[query_remain_start], &distances[query_remain_start]);
  578 + }
  579 + }
  580 + else {
  581 + unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count);
  582 + unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0));
  583 +
  584 + cpu_kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu
  585 + size_t *d_indices;
  586 + T *d_distances;
  587 +
  588 + int *next_nodes; // create two STACK-like array
  589 + int *next_search_nodes;
  590 +
  591 + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory
  592 +
  593 + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_count * D));
  594 + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_count));
  595 + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_count));
  596 + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to
  597 + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));
  598 + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_count * D, cudaMemcpyHostToDevice));
  599 +
  600 + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge);
  601 +
  602 + if (Judge == NULL) { // do the following work if the thread works safely
  603 + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_count, cudaMemcpyDeviceToHost));
  604 + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_count, cudaMemcpyDeviceToHost));
  605 + }
  606 +
  607 + HANDLE_ERROR(cudaFree(next_nodes));
  608 + HANDLE_ERROR(cudaFree(next_search_nodes));
  609 + HANDLE_ERROR(cudaFree(d_query_points));
  610 + HANDLE_ERROR(cudaFree(d_indices));
  611 + HANDLE_ERROR(cudaFree(d_distances));
  612 + }
  613 +
  614 + #else
  615 + cpu_search(h_query_points, query_count, indices, distances);
  616 +
  617 + #endif
  618 +
  619 + }
  620 +
  621 + }; //end class kdtree
  622 +
	// Out-of-class definition of the static "active tree" pointer; sort_points
	// (a static comparator) reads the current tree's state through it.
	template <typename T, int D>
	kdtree<T, D>* kdtree<T, D>::cur_tree_ptr = NULL;
  625 +
  626 +} //end namespace stim
  627 +#endif
0 628 \ No newline at end of file
... ...