Commit 6d247a6cdfd13b3037ca88c8cb5cd260506d10e4

Authored by David Mayerich
2 parents e6d6e994 268037bc

Merge branch 'kdtree' of git.stim.ee.uh.edu:codebase/stimlib into kdtree

Showing 1 changed file with 257 additions and 258 deletions   Show diff stats
stim/structures/kdtree.cuh
1   -// change CUDA_STACK together with max_tree_levels in trial and error manner
  1 +// right now the size of CUDA STACK is set to 1000; increase it if you intend to build a deeper tree
2 2 // data should be stored in row-major
3 3 // x1,x2,x3,x4,x5......
4 4 // y1,y2,y3,y4,y5......
... ... @@ -16,159 +16,150 @@
16 16 #include <float.h>
17 17 #include <iostream>
18 18 #include <algorithm>
19   -
20   -/// using API called HADDLE_ERROR
21   -static void HandleError(cudaError_t err, const char *file, int line) {
22   - if (err != cudaSuccess) {
23   - std::cout<<cudaGetErrorString(err)<<" in"<< file <<" at line "<<line<<std::endl;
24   - }
25   -}
26   -#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
27   -
28   -#define CUDA_STACK 2 // implementation "stacks" on CUDA as to do store the nodes information
  19 +#include <stim/cuda/cudatools/error.h>
29 20  
30 21 namespace stim {
31 22 namespace kdtree {
32   - template<typename T, int D>
  23 + template<typename T, int D> // typename refers to float or double while D refers to dimension of points
33 24 struct point {
34   - T coords[D]; // if we use size to measure a vector<point>, it will show the number of point structures
  25 + T dim[D]; // create a structure to store every one input point
35 26 };
36 27  
37 28 template<typename T>
38   - class KDNode {
  29 + class kdnode {
39 30 public:
40   - KDNode() { // initialization
41   - parent = NULL;
  31 + kdnode() { // constructor for initializing a kdnode
  32 + parent = NULL; // set every node's parent, left and right kdnode pointers to NULL
42 33 left = NULL;
43 34 right = NULL;
44   - split_value = -1;
45   - _parent = -1;
46   - _left = -1;
47   - _right = -1;
  35 + parent_idx = -1; // set parent node index to default -1
  36 + left_idx = -1;
  37 + right_idx = -1;
  38 + split_value = -1; // set split_value to default -1
48 39 }
49   - int id; // id for current node
50   - size_t level;
51   - KDNode *parent, *left, *right;
52   - int _parent, _left, _right; // id for parent node
53   - T split_value; // node value
54   - std::vector <size_t> indices; // indices that indicate the data that current tree has
  40 + int idx; // index of current node
  41 + int parent_idx, left_idx, right_idx; // index of parent, left and right nodes
  42 + kdnode *parent, *left, *right; // parent, left and right kdnodes
  43 + T split_value; // splitting value of current node
  44 + std::vector <size_t> indices; // it indicates the points' indices that current node has
  45 + size_t level; // tree level of current node
55 46 };
56   - }
  47 + } // end of namespace kdtree
57 48  
58   - template <typename T, int D = 3>
  49 + template <typename T, int D = 3> // set dimension of data to default 3
59 50 class cpu_kdtree {
60   -
61 51 protected:
62   - std::vector <kdtree::point<T, D>> *m_pts;
63   - kdtree::KDNode<T> *m_root; // current node
64   - int m_current_axis;
65   - size_t m_levels;
66   - int m_cmps; // count how many comparisons are to made in the tree for one query
67   - int m_id; // level + 1
68   - static cpu_kdtree<T, D> *myself;
  52 + int current_axis; // current judging axis
  53 + int cmps; // count how many time of comparisons (just for cpu-kdtree)
  54 + int n_id; // store the total number of nodes
  55 + std::vector <kdtree::point<T, D>> *tmp_points; // transfer or temp points
  56 + kdtree::kdnode<T> *root; // root node
  57 + static cpu_kdtree<T, D> *cur_tree_ptr;
69 58 public:
70   - cpu_kdtree() { // initialization
71   - myself = this;
72   - m_id = 0; // id = level + 1, level -> axis index while id -> node identifier
  59 + cpu_kdtree() { // constructor for creating a cpu_kdtree
  60 + cur_tree_ptr = this; // create a class pointer points to the current class value
  61 + n_id = 0; // set total number of points to default 0
73 62 }
74   - ~cpu_kdtree() { // destructor for deleting what was created by kdtree()
75   - std::vector <kdtree::KDNode<T>*> next_node;
76   - next_node.push_back(m_root);
77   - while (next_node.size()) {
78   - std::vector <kdtree::KDNode<T>*> next_search;
79   - while (next_node.size()) {
80   - kdtree::KDNode<T> *cur = next_node.back();
81   - next_node.pop_back();
  63 + ~cpu_kdtree() { // destructor of cpu_kdtree
  64 + std::vector <kdtree::kdnode<T>*> next_nodes;
  65 + next_nodes.push_back(root);
  66 + while (next_nodes.size()) {
  67 + std::vector <kdtree::kdnode<T>*> next_search_nodes;
  68 + while (next_nodes.size()) {
  69 + kdtree::kdnode<T> *cur = next_nodes.back();
  70 + next_nodes.pop_back();
82 71 if (cur->left)
83   - next_search.push_back(cur->left);
  72 + next_search_nodes.push_back(cur->left);
84 73 if (cur->right)
85   - next_search.push_back(cur->right);
  74 + next_search_nodes.push_back(cur->right);
86 75 delete cur;
87 76 }
88   - next_node = next_search;
  77 + next_nodes = next_search_nodes;
89 78 }
90   - m_root = NULL;
  79 + root = NULL;
91 80 }
92   - void Create(std::vector <kdtree::point<T, D>> &pts, size_t max_levels) {
93   - m_pts = &pts; // create a pointer point to the input data
94   - m_levels = max_levels; // stores max tree levels
95   - m_root = new kdtree::KDNode<T>(); // using KDNode() to initialize an ancestor node
96   - m_root->id = m_id++; // id is 1 while level is 0 at the very beginning
97   - m_root->level = 0; // to begin with level 0
98   - m_root->indices.resize(pts.size()); // initialize the size of whole indices
99   - for (size_t i = 0; i < pts.size(); i++) {
100   - m_root->indices[i] = i; // like what we did on Keys in GPU-BF part
  81 + void Create(std::vector <kdtree::point<T, D>> &reference_points, size_t max_levels) {
  82 + tmp_points = &reference_points;
  83 + root = new kdtree::kdnode<T>(); // initializing the root node
  84 + root->idx = n_id++; // the index of root is 0
  85 + root->level = 0; // tree level begins at 0
  86 + root->indices.resize(reference_points.size()); // get the number of points
  87 + for (size_t i = 0; i < reference_points.size(); i++) {
  88 + root->indices[i] = i; // set indices of input points
101 89 }
102   - std::vector <kdtree::KDNode<T>*> next_node; // next node
103   - next_node.push_back(m_root); // new node
104   - while (next_node.size()) {
105   - std::vector <kdtree::KDNode<T>*> next_search;
106   - while (next_node.size()) { // two same WHILE is because we need to make a new vector for searching
107   - kdtree::KDNode<T> *current_node = next_node.back(); // pointer point to current node (right first)
108   - next_node.pop_back(); // pop out current node in order to store next node
109   - if (current_node->level < max_levels) { // max_levels should be reasonably small compared with numbers of data
110   - if (current_node->indices.size() > 1) {
111   - kdtree::KDNode<T> *left = new kdtree::KDNode<T>();
112   - kdtree::KDNode<T> *right = new kdtree::KDNode<T>();
113   - left->id = m_id++; // risky guessing but OK for large amount of data since max_level is small
114   - right->id = m_id++; // risky guessing but OK for large amount of data since max_level is small
115   - Split(current_node, left, right); // split left and right and determine a node
116   - std::vector <size_t> temp; // empty vecters of int
  90 + std::vector <kdtree::kdnode<T>*> next_nodes; // next nodes
  91 + next_nodes.push_back(root); // push back the root node
  92 + while (next_nodes.size()) {
  93 + std::vector <kdtree::kdnode<T>*> next_search_nodes; // next search nodes
  94 + while (next_nodes.size()) { // two identical WHILE loops are used because we need a separate vector to store the nodes for the next round of searching
  95 + kdtree::kdnode<T> *current_node = next_nodes.back(); // handle node one by one (right first)
  96 + next_nodes.pop_back(); // pop out current node in order to store next round of nodes
  97 + if (current_node->level < max_levels) {
  98 + if (current_node->indices.size() > 1) { // split if the nonleaf node contains more than one point
  99 + kdtree::kdnode<T> *left = new kdtree::kdnode<T>();
  100 + kdtree::kdnode<T> *right = new kdtree::kdnode<T>();
  101 + left->idx = n_id++; // set the index of current node's left node
  102 + right->idx = n_id++;
  103 + Split(current_node, left, right); // split left and right and determine a node
  104 + std::vector <size_t> temp; // empty vector of size_t
117 105 //temp.resize(current_node->indices.size());
118   - current_node->indices.swap(temp); // clean up current tree's indices
  106 + current_node->indices.swap(temp); // clean up current node's indices
119 107 current_node->left = left;
120 108 current_node->right = right;
121   - current_node->_left = left->id; // indicates it has left son node and gets its id
122   - current_node->_right = right->id; // indicates it has right son node and gets its id
123   - if (left->indices.size())
124   - next_search.push_back(left); // right first then left according to stack(first in last out), it can be done in parallel for left and right are independent
  109 + current_node->left_idx = left->idx;
  110 + current_node->right_idx = right->idx;
125 111 if (right->indices.size())
126   - next_search.push_back(right);
  112 + next_search_nodes.push_back(right); // left pop out first
  113 + if (left->indices.size())
  114 + next_search_nodes.push_back(left);
127 115 }
128 116 }
129 117 }
130   - next_node = next_search;
  118 + next_nodes = next_search_nodes; // go deeper within the tree
131 119 }
132 120 }
133   - static bool SortPoints(const size_t a, const size_t b) {
134   - std::vector <kdtree::point<T, D>> &pts = *myself->m_pts;
135   - return pts[a].coords[myself->m_current_axis] < pts[b].coords[myself->m_current_axis];
  121 + static bool SortPoints(const size_t a, const size_t b) { // create functor for std::sort
  122 + std::vector <kdtree::point<T, D>> &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer
  123 + return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis];
136 124 }
137   - void Split(kdtree::KDNode<T> *cur, kdtree::KDNode<T> *left, kdtree::KDNode<T> *right) {
138   - /// assume both two sides are created and sure it was
139   - std::vector <kdtree::point<T, D>> &pts = *m_pts;
140   - m_current_axis = cur->level % D; // indicate the judicative dimension or axis
  125 + void Split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) {
  126 + std::vector <kdtree::point<T, D>> &pts = *tmp_points;
  127 + current_axis = cur->level % D; // indicate the judicative dimension or axis
141 128 std::sort(cur->indices.begin(), cur->indices.end(), SortPoints); // using SortPoints as comparison function to sort the data
142   - size_t mid = cur->indices[cur->indices.size() / 2]; // odd in the mid, even take the floor
143   - cur->split_value = pts[mid].coords[m_current_axis]; // get the mother node
144   - left->parent = cur; // set the parent to current node for the next nodes
  129 + size_t mid_value = cur->indices[cur->indices.size() / 2]; // odd count: exact middle index; even count: take the floor
  130 + cur->split_value = pts[mid_value].dim[current_axis]; // get the parent node
  131 + left->parent = cur; // set the parent of the next search nodes to current node
145 132 right->parent = cur;
146   - left->level = cur->level + 1;
  133 + left->level = cur->level + 1; // level + 1
147 134 right->level = cur->level + 1;
148   - left->_parent = cur->id; // indicates it has mother node and gets its id
149   - right->_parent = cur->id; // indicates it has mother node and gets its id
150   - for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right area one by one
  135 + left->parent_idx = cur->idx; // set its parent node's index
  136 + right->parent_idx = cur->idx;
  137 + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one
151 138 size_t idx = cur->indices[i];
152   - if (pts[idx].coords[m_current_axis] < cur->split_value)
  139 + if (pts[idx].dim[current_axis] < cur->split_value)
153 140 left->indices.push_back(idx);
154 141 else
155 142 right->indices.push_back(idx);
156 143 }
157 144 }
158   - int GetNumNodes() const { return m_id; }
159   - kdtree::KDNode<T>* GetRoot() const { return m_root; }
  145 + int GetNumNodes() const { // get the total number of nodes
  146 + return n_id;
  147 + }
  148 + kdtree::kdnode<T>* GetRoot() const { // get the root node of tree
  149 + return root;
  150 + }
160 151 }; //end class kdtree
161 152  
162 153 template <typename T, int D>
163   - cpu_kdtree<T, D>* cpu_kdtree<T, D>::myself = NULL; // definition of myself pointer points to the specific class
  154 + cpu_kdtree<T, D>* cpu_kdtree<T, D>::cur_tree_ptr = NULL; // definition of cur_tree_ptr pointer points to the current class
164 155  
165 156 template <typename T>
166   - struct CUDA_KDNode {
167   - size_t level;
168   - int parent, left, right; // indicates id of
  157 + struct cuda_kdnode {
  158 + int parent, left, right;
169 159 T split_value;
170   - size_t num_indices; // number of indices it has
171   - int indices; // the beginning
  160 + size_t num_index; // number of indices it has
  161 + int index; // the beginning index
  162 + size_t level;
172 163 };
173 164  
174 165 template <typename T, int D>
... ... @@ -176,239 +167,247 @@ namespace stim {
176 167 T dist = 0;
177 168  
178 169 for (size_t i = 0; i < D; i++) {
179   - T d = a.coords[i] - b.coords[i];
  170 + T d = a.dim[i] - b.dim[i];
180 171 dist += d*d;
181 172 }
182 173 return dist;
183 174 }
184 175 template <typename T, int D>
185   - __device__ void SearchAtNode(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, int cur, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist, int *ret_node) {
186   - /// finds the first possibility
187   - size_t best_idx = 0;
188   - T best_dist = FLT_MAX;
  176 + __device__ void SearchAtNode(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
  177 + T best_distance = FLT_MAX;
  178 + size_t best_index = 0;
189 179  
190   - while (true) {
  180 + while (true) { // break until reach the bottom
191 181 int split_axis = nodes[cur].level % D;
192   - if (nodes[cur].left == -1) { // if it doesn't have left son node
193   - *ret_node = cur;
194   - for (int i = 0; i < nodes[cur].num_indices; i++) {
195   - size_t idx = indices[nodes[cur].indices + i];
196   - T dist = Distance<T, D>(Query, pts[idx]);
197   - if (dist < best_dist) {
198   - best_dist = dist;
199   - best_idx = idx;
  182 + if (nodes[cur].left == -1) { // check whether it has left node or not
  183 + *d_node = cur;
  184 + for (int i = 0; i < nodes[cur].num_index; i++) {
  185 + size_t idx = indices[nodes[cur].index + i];
  186 + T dist = Distance<T, D>(d_query_point, d_reference_points[idx]);
  187 + if (dist < best_distance) {
  188 + best_distance = dist;
  189 + best_index = idx;
200 190 }
201 191 }
202   - break;
  192 + break;
203 193 }
204   - else if (Query.coords[split_axis] < nodes[cur].split_value) {
  194 + else if (d_query_point.dim[split_axis] < nodes[cur].split_value) { // jump into specific son node
205 195 cur = nodes[cur].left;
206 196 }
207 197 else {
208 198 cur = nodes[cur].right;
209 199 }
210 200 }
211   - *ret_index = best_idx;
212   - *ret_dist = best_dist;
  201 + *d_distance = best_distance;
  202 + *d_index = best_index;
213 203 }
214 204 template <typename T, int D>
215   - __device__ void SearchAtNodeRange(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, int cur, T range, size_t *ret_index, T *ret_dist) {
216   - /// search throught all the nodes that are within one range
217   - size_t best_idx = 0;
218   - T best_dist = FLT_MAX;
219   - /// using fixed stack, increase it when need
220   - int next_node[CUDA_STACK]; // should be larger than 1
221   - int next_node_pos = 0; // initialize pop out order index
222   - next_node[next_node_pos++] = cur; // equals to next_node[next_node_pos] = cur, next_node_pos++
223   -
224   - while (next_node_pos) {
225   - int next_search[CUDA_STACK]; // for store next nodes
226   - int next_search_pos = 0; // record push back order index
227   - while (next_node_pos) {
228   - cur = next_node[next_node_pos - 1]; // pop out the last in one and keep poping out
229   - next_node_pos--;
  205 + __device__ void SearchAtNodeRange(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
  206 + T best_distance = FLT_MAX;
  207 + size_t best_index = 0;
  208 +
  209 + int next_nodes_pos = 0; // initialize pop out order index
  210 + next_nodes[id * 1000 + next_nodes_pos] = cur; // find data that belongs to the very specific thread
  211 + next_nodes_pos++;
  212 +
  213 + while (next_nodes_pos) {
  214 + int next_search_nodes_pos = 0; // record push back order index
  215 + while (next_nodes_pos) {
  216 + cur = next_nodes[id * 1000 + next_nodes_pos - 1]; // pop out the last push in one and keep poping out
  217 + next_nodes_pos--;
230 218 int split_axis = nodes[cur].level % D;
231 219  
232 220 if (nodes[cur].left == -1) {
233   - for (int i = 0; i < nodes[cur].num_indices; i++) {
234   - int idx = indices[nodes[cur].indices + i]; // all indices are stored in one array, pick up from every node's beginning index
235   - T d = Distance<T>(Query, pts[idx]);
236   - if (d < best_dist) {
237   - best_dist = d;
238   - best_idx = idx;
  221 + for (int i = 0; i < nodes[cur].num_index; i++) {
  222 + int idx = indices[nodes[cur].index + i]; // all indices are stored in one array, pick up from every node's beginning index
  223 + T d = Distance<T>(d_query_point, d_reference_points[idx]);
  224 + if (d < best_distance) {
  225 + best_distance = d;
  226 + best_index = idx;
239 227 }
240 228 }
241 229 }
242 230 else {
243   - T d = Query.coords[split_axis] - nodes[cur].split_value;
  231 + T d = d_query_point.dim[split_axis] - nodes[cur].split_value;
244 232  
245 233 if (fabs(d) > range) {
246   - if (d < 0)
247   - next_search[next_search_pos++] = nodes[cur].left;
248   - else
249   - next_search[next_search_pos++] = nodes[cur].right;
  234 + if (d < 0) {
  235 + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left;
  236 + next_search_nodes_pos++;
  237 + }
  238 + else {
  239 + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right;
  240 + next_search_nodes_pos++;
  241 + }
250 242 }
251 243 else {
252   - next_search[next_search_pos++] = nodes[cur].left;
253   - next_search[next_search_pos++] = nodes[cur].right;
  244 + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right;
  245 + next_search_nodes_pos++;
  246 + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left;
  247 + next_search_nodes_pos++;
  248 + if (next_search_nodes_pos > 1000) {
  249 + printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id);
  250 + (*Judge)++;
  251 + }
254 252 }
255 253 }
256 254 }
257   -
258   - for (int i = 0; i < next_search_pos; i++)
259   - next_node[i] = next_search[i];
260   - next_node_pos = next_search_pos; // operation that really resemble STACK, namely first in last out
  255 + for (int i = 0; i < next_search_nodes_pos; i++)
  256 + next_nodes[id * 1000 + i] = next_search_nodes[id * 1000 + i];
  257 + next_nodes_pos = next_search_nodes_pos;
261 258 }
262   - *ret_index = best_idx;
263   - *ret_dist = best_dist;
  259 + *d_distance = best_distance;
  260 + *d_index = best_index;
264 261 }
265 262 template <typename T, int D>
266   - __device__ void Search(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist) {
267   - /// find first nearest node
  263 + __device__ void Search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
268 264 int best_node = 0;
269   - size_t best_idx = 0;
270   - T best_dist = FLT_MAX;
  265 + T best_distance = FLT_MAX;
  266 + size_t best_index = 0;
271 267 T radius = 0;
272   - SearchAtNode<T, D>(nodes, indices, pts, 0, Query, &best_idx, &best_dist, &best_node);
273   - radius = sqrt(best_dist);
274   - /// find other possibilities
  268 +
  269 + SearchAtNode<T, D>(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node);
  270 + radius = sqrt(best_distance); // get range
275 271 int cur = best_node;
276 272  
277 273 while (nodes[cur].parent != -1) {
278   - /// go up
279 274 int parent = nodes[cur].parent;
280 275 int split_axis = nodes[parent].level % D;
281   - /// search other node
  276 +
282 277 T tmp_dist = FLT_MAX;
283 278 size_t tmp_idx;
284   - if (fabs(nodes[parent].split_value - Query.coords[split_axis]) <= radius) {
285   - /// search opposite node
  279 + if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) {
286 280 if (nodes[parent].left != cur)
287   - SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].left, radius, &tmp_idx, &tmp_dist);
  281 + SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
288 282 else
289   - SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].right, radius, &tmp_idx, &tmp_dist);
  283 + SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
290 284 }
291   - if (tmp_dist < best_dist) {
292   - best_dist = tmp_dist;
293   - best_idx = tmp_idx;
  285 + if (tmp_dist < best_distance) {
  286 + best_distance = tmp_dist;
  287 + best_index = tmp_idx;
294 288 }
295 289 cur = parent;
296 290 }
297   - *ret_index = best_idx;
298   - *ret_dist = sqrt(best_dist);
  291 + *d_distance = sqrt(best_distance);
  292 + *d_index = best_index;
299 293 }
300 294 template <typename T, int D>
301   - __global__ void SearchBatch(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, size_t num_pts, kdtree::point<T, D> *Query, size_t num_Query, size_t *ret_index, T *ret_dist) {
302   - int idx = blockIdx.x * blockDim.x + threadIdx.x;
303   - if (idx >= num_Query) return;
  295 + __global__ void SearchBatch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) {
  296 + size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  297 + if (idx >= d_query_count) return; // avoid segfault
304 298  
305   - Search<T, D>(nodes, indices, pts, Query[idx], &ret_index[idx], &ret_dist[idx]); // every query points are independent
  299 + Search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent
306 300 }
307 301  
308 302 template <typename T, int D = 3>
309 303 class cuda_kdtree {
310 304 protected:
311   - CUDA_KDNode<T> *m_gpu_nodes; // store nodes
312   - size_t *m_gpu_indices;
313   - kdtree::point<T, D>* m_gpu_points;
314   - size_t m_num_points;
  305 + cuda_kdnode<T> *d_nodes;
  306 + size_t *d_index;
  307 + kdtree::point<T, D>* d_reference_points;
  308 + size_t d_reference_count;
315 309 public:
316 310 ~cuda_kdtree() {
317   - HANDLE_ERROR(cudaFree(m_gpu_nodes));
318   - HANDLE_ERROR(cudaFree(m_gpu_indices));
319   - HANDLE_ERROR(cudaFree(m_gpu_points));
  311 + HANDLE_ERROR(cudaFree(d_nodes));
  312 + HANDLE_ERROR(cudaFree(d_index));
  313 + HANDLE_ERROR(cudaFree(d_reference_points));
320 314 }
321   - void CreateKDTree(T *ReferencePoints, size_t ReferenceCount, size_t ColCount, size_t max_levels) {
322   - std::vector < kdtree::point<T, D> > pts(ReferenceCount); // create specific struct of reference data
323   - for (size_t j = 0; j < ReferenceCount; j++)
324   - for (size_t i = 0; i < ColCount; i++)
325   - pts[j].coords[i] = ReferencePoints[j * ColCount + i];
326   - cpu_kdtree<T, D> tree; // initialize a tree
327   - tree.Create(pts, max_levels); // create KD-Tree on host
328   - kdtree::KDNode<T> *root = tree.GetRoot();
  315 + void CreateKDTree(T *h_reference_points, size_t reference_count, size_t dim_count, size_t max_levels) {
  316 + if (max_levels > 10) {
  317 + std::cout<<"The max_tree_levels should be smaller!"<<std::endl;
  318 + exit(1);
  319 + }
  320 + std::vector <kdtree::point<T, D>> reference_points(reference_count); // store the reference points in a particular layout
  321 + for (size_t j = 0; j < reference_count; j++)
  322 + for (size_t i = 0; i < dim_count; i++)
  323 + reference_points[j].dim[i] = h_reference_points[j * dim_count + i];
  324 + cpu_kdtree<T, D> tree; // creating a tree on cpu
  325 + tree.Create(reference_points, max_levels); // building a tree on cpu
  326 + kdtree::kdnode<T> *d_root = tree.GetRoot();
329 327 int num_nodes = tree.GetNumNodes();
330   - /// create the same on CPU
331   - m_num_points = pts.size(); // number of points for creating tree = reference_count in the case
  328 + d_reference_count = reference_points.size(); // also equals to reference_count
332 329  
333   - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode<T>) * num_nodes)); // private variables for kdtree
334   - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_indices, sizeof(size_t) * m_num_points));
335   - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_points, sizeof(kdtree::point<T, D>) * m_num_points));
336   -
337   - std::vector <CUDA_KDNode<T>> cpu_nodes(num_nodes); // from left to right, id of nodes
338   - std::vector <size_t> indices(m_num_points);
339   - std::vector < kdtree::KDNode<T>* > next_node;
  330 + HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes)); // copy data from host to device
  331 + HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * d_reference_count));
  332 + HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(kdtree::point<T, D>) * d_reference_count));
340 333  
  334 + std::vector <cuda_kdnode<T>> tmp_nodes(num_nodes);
  335 + std::vector <size_t> indices(d_reference_count);
  336 + std::vector <kdtree::kdnode<T>*> next_nodes;
341 337 size_t cur_pos = 0;
342   -
343   - next_node.push_back(root);
344   -
345   - while (next_node.size()) {
346   - std::vector <typename kdtree::KDNode<T>* > next_search;
347   -
348   - while (next_node.size()) {
349   - kdtree::KDNode<T> *cur = next_node.back();
350   - next_node.pop_back();
351   -
352   - int id = cur->id; // the nodes at same level are independent
353   -
354   - cpu_nodes[id].level = cur->level;
355   - cpu_nodes[id].parent = cur->_parent;
356   - cpu_nodes[id].left = cur->_left;
357   - cpu_nodes[id].right = cur->_right;
358   - cpu_nodes[id].split_value = cur->split_value;
359   - cpu_nodes[id].num_indices = cur->indices.size(); // number of index
360   -
  338 + next_nodes.push_back(d_root);
  339 + while (next_nodes.size()) {
  340 + std::vector <typename kdtree::kdnode<T>*> next_search_nodes;
  341 + while (next_nodes.size()) {
  342 + kdtree::kdnode<T> *cur = next_nodes.back();
  343 + next_nodes.pop_back();
  344 + int id = cur->idx; // the nodes at same level are independent
  345 + tmp_nodes[id].level = cur->level;
  346 + tmp_nodes[id].parent = cur->parent_idx;
  347 + tmp_nodes[id].left = cur->left_idx;
  348 + tmp_nodes[id].right = cur->right_idx;
  349 + tmp_nodes[id].split_value = cur->split_value;
  350 + tmp_nodes[id].num_index = cur->indices.size(); // number of index
361 351 if (cur->indices.size()) {
362 352 for (size_t i = 0; i < cur->indices.size(); i++)
363 353 indices[cur_pos + i] = cur->indices[i];
364 354  
365   - cpu_nodes[id].indices = (int)cur_pos; // beginning index that every bottom node has
366   - cur_pos += cur->indices.size(); // store indices continuously
  355 + tmp_nodes[id].index = (int)cur_pos; // beginning index of reference_points that every bottom node has
  356 + cur_pos += cur->indices.size(); // store indices continuously for every query_point
367 357 }
368 358 else {
369   - cpu_nodes[id].indices = -1;
  359 + tmp_nodes[id].index = -1;
370 360 }
371 361  
372 362 if (cur->left)
373   - next_search.push_back(cur->left);
  363 + next_search_nodes.push_back(cur->left);
374 364  
375 365 if (cur->right)
376   - next_search.push_back(cur->right);
  366 + next_search_nodes.push_back(cur->right);
377 367 }
378   - next_node = next_search;
  368 + next_nodes = next_search_nodes;
379 369 }
380   -
381   - HANDLE_ERROR(cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode<T>) * cpu_nodes.size(), cudaMemcpyHostToDevice));
382   - HANDLE_ERROR(cudaMemcpy(m_gpu_indices, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
383   - HANDLE_ERROR(cudaMemcpy(m_gpu_points, &pts[0], sizeof(kdtree::point<T, D>) * pts.size(), cudaMemcpyHostToDevice));
  370 + HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice));
  371 + HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
  372 + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_points.size(), cudaMemcpyHostToDevice));
384 373 }
385   - void Search(T *QueryPoints, size_t QueryCount, size_t ColCount, T *dists, size_t *indices) {
386   - std::vector < kdtree::point<T, D> > query(QueryCount);
387   - for (size_t j = 0; j < QueryCount; j++)
388   - for (size_t i = 0; i < ColCount; i++)
389   - query[j].coords[i] = QueryPoints[j * ColCount + i];
390   -
391   - unsigned int threads = (unsigned int)(query.size() > 1024 ? 1024 : query.size());
392   - //unsigned int blocks = (unsigned int)ceil(query.size() / threads);
393   - unsigned int blocks = (unsigned int)(query.size() / threads + (query.size() % threads ? 1 : 0));
394   -
395   - kdtree::point<T, D> *gpu_Query;
396   - size_t *gpu_ret_indices;
397   - T *gpu_ret_dist;
398   -
399   - HANDLE_ERROR(cudaMalloc((void**)&gpu_Query, sizeof(T) * query.size() * D));
400   - HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_indices, sizeof(size_t) * query.size()));
401   - HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_dist, sizeof(T) * query.size()));
402   - HANDLE_ERROR(cudaMemcpy(gpu_Query, &query[0], sizeof(T) * query.size() * D, cudaMemcpyHostToDevice));
403   -
404   - SearchBatch << <threads, blocks >> > (m_gpu_nodes, m_gpu_indices, m_gpu_points, m_num_points, gpu_Query, query.size(), gpu_ret_indices, gpu_ret_dist);
405   -
406   - HANDLE_ERROR(cudaMemcpy(indices, gpu_ret_indices, sizeof(size_t) * query.size(), cudaMemcpyDeviceToHost));
407   - HANDLE_ERROR(cudaMemcpy(dists, gpu_ret_dist, sizeof(T) * query.size(), cudaMemcpyDeviceToHost));
  374 + void Search(T *h_query_points, size_t query_count, size_t dim_count, T *dists, size_t *indices) {
  375 + std::vector <kdtree::point<T, D>> query_points(query_count);
  376 + for (size_t j = 0; j < query_count; j++)
  377 + for (size_t i = 0; i < dim_count; i++)
  378 + query_points[j].dim[i] = h_query_points[j * dim_count + i];
  379 +
  380 + unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size());
  381 + unsigned int blocks = (unsigned int)(query_points.size() / threads + (query_points.size() % threads ? 1 : 0));
  382 +
  383 + kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu
  384 + size_t *d_indices;
  385 + T *d_distances;
  386 +
  387 + int *next_nodes; // create two STACK-like array
  388 + int *next_search_nodes;
  389 +
  390 + int *Judge = NULL; // judge variable to see whether one thread is overwriting another thread's memory
  391 +
  392 + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_points.size() * D));
  393 + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size()));
  394 + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size()));
  395 + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * 1000 * sizeof(int))); // STACK size right now is 1000, you can change it if you mean to
  396 + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * 1000 * sizeof(int)));
  397 + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_points.size() * D, cudaMemcpyHostToDevice));
  398 +
  399 + SearchBatch<<<threads, blocks>>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge);
  400 +
  401 + if (Judge == NULL) { // do the following work if the thread works safely
  402 + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost));
  403 + HANDLE_ERROR(cudaMemcpy(dists, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost));
  404 + }
408 405  
409   - HANDLE_ERROR(cudaFree(gpu_Query));
410   - HANDLE_ERROR(cudaFree(gpu_ret_indices));
411   - HANDLE_ERROR(cudaFree(gpu_ret_dist));
  406 + HANDLE_ERROR(cudaFree(next_nodes));
  407 + HANDLE_ERROR(cudaFree(next_search_nodes));
  408 + HANDLE_ERROR(cudaFree(d_query_points));
  409 + HANDLE_ERROR(cudaFree(d_indices));
  410 + HANDLE_ERROR(cudaFree(d_distances));
412 411 }
413 412 };
414 413 } //end namespace stim
... ...