Commit 6d247a6cdfd13b3037ca88c8cb5cd260506d10e4

Authored by David Mayerich
2 parents e6d6e994 268037bc

Merge branch 'kdtree' of git.stim.ee.uh.edu:codebase/stimlib into kdtree

Showing 1 changed file with 257 additions and 258 deletions   Show diff stats
stim/structures/kdtree.cuh
1 -// change CUDA_STACK together with max_tree_levels in trial and error manner 1 +// right now the size of CUDA STACK is set to 1000, increase it if you mean to make deeper tree
2 // data should be stored in row-major 2 // data should be stored in row-major
3 // x1,x2,x3,x4,x5...... 3 // x1,x2,x3,x4,x5......
4 // y1,y2,y3,y4,y5...... 4 // y1,y2,y3,y4,y5......
@@ -16,159 +16,150 @@ @@ -16,159 +16,150 @@
16 #include <float.h> 16 #include <float.h>
17 #include <iostream> 17 #include <iostream>
18 #include <algorithm> 18 #include <algorithm>
19 -  
20 -/// using API called HADDLE_ERROR  
21 -static void HandleError(cudaError_t err, const char *file, int line) {  
22 - if (err != cudaSuccess) {  
23 - std::cout<<cudaGetErrorString(err)<<" in"<< file <<" at line "<<line<<std::endl;  
24 - }  
25 -}  
26 -#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))  
27 -  
28 -#define CUDA_STACK 2 // implementation "stacks" on CUDA as to do store the nodes information 19 +#include <stim/cuda/cudatools/error.h>
29 20
30 namespace stim { 21 namespace stim {
31 namespace kdtree { 22 namespace kdtree {
32 - template<typename T, int D> 23 + template<typename T, int D> // typename refers to float or double while D refers to dimension of points
33 struct point { 24 struct point {
34 - T coords[D]; // if we use size to measure a vector<point>, it will show the number of point structures 25 + T dim[D]; // create a structure to store every one input point
35 }; 26 };
36 27
37 template<typename T> 28 template<typename T>
38 - class KDNode { 29 + class kdnode {
39 public: 30 public:
40 - KDNode() { // initialization  
41 - parent = NULL; 31 + kdnode() { // constructor for initializing a kdnode
  32 + parent = NULL; // set every node's parent, left and right kdnode pointers to NULL
42 left = NULL; 33 left = NULL;
43 right = NULL; 34 right = NULL;
44 - split_value = -1;  
45 - _parent = -1;  
46 - _left = -1;  
47 - _right = -1; 35 + parent_idx = -1; // set parent node index to default -1
  36 + left_idx = -1;
  37 + right_idx = -1;
  38 + split_value = -1; // set split_value to default -1
48 } 39 }
49 - int id; // id for current node  
50 - size_t level;  
51 - KDNode *parent, *left, *right;  
52 - int _parent, _left, _right; // id for parent node  
53 - T split_value; // node value  
54 - std::vector <size_t> indices; // indices that indicate the data that current tree has 40 + int idx; // index of current node
  41 + int parent_idx, left_idx, right_idx; // index of parent, left and right nodes
  42 + kdnode *parent, *left, *right; // parent, left and right kdnodes
  43 + T split_value; // splitting value of current node
  44 + std::vector <size_t> indices; // it indicates the points' indices that current node has
  45 + size_t level; // tree level of current node
55 }; 46 };
56 - } 47 + } // end of namespace kdtree
57 48
58 - template <typename T, int D = 3> 49 + template <typename T, int D = 3> // set dimension of data to default 3
59 class cpu_kdtree { 50 class cpu_kdtree {
60 -  
61 protected: 51 protected:
62 - std::vector <kdtree::point<T, D>> *m_pts;  
63 - kdtree::KDNode<T> *m_root; // current node  
64 - int m_current_axis;  
65 - size_t m_levels;  
66 - int m_cmps; // count how many comparisons are to made in the tree for one query  
67 - int m_id; // level + 1  
68 - static cpu_kdtree<T, D> *myself; 52 + int current_axis; // current judging axis
  53 + int cmps; // count how many time of comparisons (just for cpu-kdtree)
  54 + int n_id; // store the total number of nodes
  55 + std::vector <kdtree::point<T, D>> *tmp_points; // transfer or temp points
  56 + kdtree::kdnode<T> *root; // root node
  57 + static cpu_kdtree<T, D> *cur_tree_ptr;
69 public: 58 public:
70 - cpu_kdtree() { // initialization  
71 - myself = this;  
72 - m_id = 0; // id = level + 1, level -> axis index while id -> node identifier 59 + cpu_kdtree() { // constructor for creating a cpu_kdtree
  60 + cur_tree_ptr = this; // create a class pointer points to the current class value
  61 + n_id = 0; // set total number of points to default 0
73 } 62 }
74 - ~cpu_kdtree() { // destructor for deleting what was created by kdtree()  
75 - std::vector <kdtree::KDNode<T>*> next_node;  
76 - next_node.push_back(m_root);  
77 - while (next_node.size()) {  
78 - std::vector <kdtree::KDNode<T>*> next_search;  
79 - while (next_node.size()) {  
80 - kdtree::KDNode<T> *cur = next_node.back();  
81 - next_node.pop_back(); 63 + ~cpu_kdtree() { // destructor of cpu_kdtree
  64 + std::vector <kdtree::kdnode<T>*> next_nodes;
  65 + next_nodes.push_back(root);
  66 + while (next_nodes.size()) {
  67 + std::vector <kdtree::kdnode<T>*> next_search_nodes;
  68 + while (next_nodes.size()) {
  69 + kdtree::kdnode<T> *cur = next_nodes.back();
  70 + next_nodes.pop_back();
82 if (cur->left) 71 if (cur->left)
83 - next_search.push_back(cur->left); 72 + next_search_nodes.push_back(cur->left);
84 if (cur->right) 73 if (cur->right)
85 - next_search.push_back(cur->right); 74 + next_search_nodes.push_back(cur->right);
86 delete cur; 75 delete cur;
87 } 76 }
88 - next_node = next_search; 77 + next_nodes = next_search_nodes;
89 } 78 }
90 - m_root = NULL; 79 + root = NULL;
91 } 80 }
92 - void Create(std::vector <kdtree::point<T, D>> &pts, size_t max_levels) {  
93 - m_pts = &pts; // create a pointer point to the input data  
94 - m_levels = max_levels; // stores max tree levels  
95 - m_root = new kdtree::KDNode<T>(); // using KDNode() to initialize an ancestor node  
96 - m_root->id = m_id++; // id is 1 while level is 0 at the very beginning  
97 - m_root->level = 0; // to begin with level 0  
98 - m_root->indices.resize(pts.size()); // initialize the size of whole indices  
99 - for (size_t i = 0; i < pts.size(); i++) {  
100 - m_root->indices[i] = i; // like what we did on Keys in GPU-BF part 81 + void Create(std::vector <kdtree::point<T, D>> &reference_points, size_t max_levels) {
  82 + tmp_points = &reference_points;
  83 + root = new kdtree::kdnode<T>(); // initializing the root node
  84 + root->idx = n_id++; // the index of root is 0
  85 + root->level = 0; // tree level begins at 0
  86 + root->indices.resize(reference_points.size()); // get the number of points
  87 + for (size_t i = 0; i < reference_points.size(); i++) {
  88 + root->indices[i] = i; // set indices of input points
101 } 89 }
102 - std::vector <kdtree::KDNode<T>*> next_node; // next node  
103 - next_node.push_back(m_root); // new node  
104 - while (next_node.size()) {  
105 - std::vector <kdtree::KDNode<T>*> next_search;  
106 - while (next_node.size()) { // two same WHILE is because we need to make a new vector for searching  
107 - kdtree::KDNode<T> *current_node = next_node.back(); // pointer point to current node (right first)  
108 - next_node.pop_back(); // pop out current node in order to store next node  
109 - if (current_node->level < max_levels) { // max_levels should be reasonably small compared with numbers of data  
110 - if (current_node->indices.size() > 1) {  
111 - kdtree::KDNode<T> *left = new kdtree::KDNode<T>();  
112 - kdtree::KDNode<T> *right = new kdtree::KDNode<T>();  
113 - left->id = m_id++; // risky guessing but OK for large amount of data since max_level is small  
114 - right->id = m_id++; // risky guessing but OK for large amount of data since max_level is small  
115 - Split(current_node, left, right); // split left and right and determine a node  
116 - std::vector <size_t> temp; // empty vecters of int 90 + std::vector <kdtree::kdnode<T>*> next_nodes; // next nodes
  91 + next_nodes.push_back(root); // push back the root node
  92 + while (next_nodes.size()) {
  93 + std::vector <kdtree::kdnode<T>*> next_search_nodes; // next search nodes
  94 + while (next_nodes.size()) { // two same WHILE is because we need to make a new vector to store nodes for search
  95 + kdtree::kdnode<T> *current_node = next_nodes.back(); // handle node one by one (right first)
  96 + next_nodes.pop_back(); // pop out current node in order to store next round of nodes
  97 + if (current_node->level < max_levels) {
  98 + if (current_node->indices.size() > 1) { // split if the nonleaf node contains more than one point
  99 + kdtree::kdnode<T> *left = new kdtree::kdnode<T>();
  100 + kdtree::kdnode<T> *right = new kdtree::kdnode<T>();
  101 + left->idx = n_id++; // set the index of current node's left node
  102 + right->idx = n_id++;
  103 + Split(current_node, left, right); // split left and right and determine a node
  104 + std::vector <size_t> temp; // empty vecters of int
117 //temp.resize(current_node->indices.size()); 105 //temp.resize(current_node->indices.size());
118 - current_node->indices.swap(temp); // clean up current tree's indices 106 + current_node->indices.swap(temp); // clean up current node's indices
119 current_node->left = left; 107 current_node->left = left;
120 current_node->right = right; 108 current_node->right = right;
121 - current_node->_left = left->id; // indicates it has left son node and gets its id  
122 - current_node->_right = right->id; // indicates it has right son node and gets its id  
123 - if (left->indices.size())  
124 - next_search.push_back(left); // right first then left according to stack(first in last out), it can be done in parallel for left and right are independent 109 + current_node->left_idx = left->idx;
  110 + current_node->right_idx = right->idx;
125 if (right->indices.size()) 111 if (right->indices.size())
126 - next_search.push_back(right); 112 + next_search_nodes.push_back(right); // left pop out first
  113 + if (left->indices.size())
  114 + next_search_nodes.push_back(left);
127 } 115 }
128 } 116 }
129 } 117 }
130 - next_node = next_search; 118 + next_nodes = next_search_nodes; // go deeper within the tree
131 } 119 }
132 } 120 }
133 - static bool SortPoints(const size_t a, const size_t b) {  
134 - std::vector <kdtree::point<T, D>> &pts = *myself->m_pts;  
135 - return pts[a].coords[myself->m_current_axis] < pts[b].coords[myself->m_current_axis]; 121 + static bool SortPoints(const size_t a, const size_t b) { // create functor for std::sort
  122 + std::vector <kdtree::point<T, D>> &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer
  123 + return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis];
136 } 124 }
137 - void Split(kdtree::KDNode<T> *cur, kdtree::KDNode<T> *left, kdtree::KDNode<T> *right) {  
138 - /// assume both two sides are created and sure it was  
139 - std::vector <kdtree::point<T, D>> &pts = *m_pts;  
140 - m_current_axis = cur->level % D; // indicate the judicative dimension or axis 125 + void Split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) {
  126 + std::vector <kdtree::point<T, D>> &pts = *tmp_points;
  127 + current_axis = cur->level % D; // indicate the judicative dimension or axis
141 std::sort(cur->indices.begin(), cur->indices.end(), SortPoints); // using SortPoints as comparison function to sort the data 128 std::sort(cur->indices.begin(), cur->indices.end(), SortPoints); // using SortPoints as comparison function to sort the data
142 - size_t mid = cur->indices[cur->indices.size() / 2]; // odd in the mid, even take the floor  
143 - cur->split_value = pts[mid].coords[m_current_axis]; // get the mother node  
144 - left->parent = cur; // set the parent to current node for the next nodes 129 + size_t mid_value = cur->indices[cur->indices.size() / 2]; // odd in the mid_value, even take the floor
  130 + cur->split_value = pts[mid_value].dim[current_axis]; // get the parent node
  131 + left->parent = cur; // set the parent of the next search nodes to current node
145 right->parent = cur; 132 right->parent = cur;
146 - left->level = cur->level + 1; 133 + left->level = cur->level + 1; // level + 1
147 right->level = cur->level + 1; 134 right->level = cur->level + 1;
148 - left->_parent = cur->id; // indicates it has mother node and gets its id  
149 - right->_parent = cur->id; // indicates it has mother node and gets its id  
150 - for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right area one by one 135 + left->parent_idx = cur->idx; // set its parent node's index
  136 + right->parent_idx = cur->idx;
  137 + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one
151 size_t idx = cur->indices[i]; 138 size_t idx = cur->indices[i];
152 - if (pts[idx].coords[m_current_axis] < cur->split_value) 139 + if (pts[idx].dim[current_axis] < cur->split_value)
153 left->indices.push_back(idx); 140 left->indices.push_back(idx);
154 else 141 else
155 right->indices.push_back(idx); 142 right->indices.push_back(idx);
156 } 143 }
157 } 144 }
158 - int GetNumNodes() const { return m_id; }  
159 - kdtree::KDNode<T>* GetRoot() const { return m_root; } 145 + int GetNumNodes() const { // get the total number of nodes
  146 + return n_id;
  147 + }
  148 + kdtree::kdnode<T>* GetRoot() const { // get the root node of tree
  149 + return root;
  150 + }
160 }; //end class kdtree 151 }; //end class kdtree
161 152
162 template <typename T, int D> 153 template <typename T, int D>
163 - cpu_kdtree<T, D>* cpu_kdtree<T, D>::myself = NULL; // definition of myself pointer points to the specific class 154 + cpu_kdtree<T, D>* cpu_kdtree<T, D>::cur_tree_ptr = NULL; // definition of cur_tree_ptr pointer points to the current class
164 155
165 template <typename T> 156 template <typename T>
166 - struct CUDA_KDNode {  
167 - size_t level;  
168 - int parent, left, right; // indicates id of 157 + struct cuda_kdnode {
  158 + int parent, left, right;
169 T split_value; 159 T split_value;
170 - size_t num_indices; // number of indices it has  
171 - int indices; // the beginning 160 + size_t num_index; // number of indices it has
  161 + int index; // the beginning index
  162 + size_t level;
172 }; 163 };
173 164
174 template <typename T, int D> 165 template <typename T, int D>
@@ -176,239 +167,247 @@ namespace stim { @@ -176,239 +167,247 @@ namespace stim {
176 T dist = 0; 167 T dist = 0;
177 168
178 for (size_t i = 0; i < D; i++) { 169 for (size_t i = 0; i < D; i++) {
179 - T d = a.coords[i] - b.coords[i]; 170 + T d = a.dim[i] - b.dim[i];
180 dist += d*d; 171 dist += d*d;
181 } 172 }
182 return dist; 173 return dist;
183 } 174 }
184 template <typename T, int D> 175 template <typename T, int D>
185 - __device__ void SearchAtNode(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, int cur, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist, int *ret_node) {  
186 - /// finds the first possibility  
187 - size_t best_idx = 0;  
188 - T best_dist = FLT_MAX; 176 + __device__ void SearchAtNode(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
  177 + T best_distance = FLT_MAX;
  178 + size_t best_index = 0;
189 179
190 - while (true) { 180 + while (true) { // break until reach the bottom
191 int split_axis = nodes[cur].level % D; 181 int split_axis = nodes[cur].level % D;
192 - if (nodes[cur].left == -1) { // if it doesn't have left son node  
193 - *ret_node = cur;  
194 - for (int i = 0; i < nodes[cur].num_indices; i++) {  
195 - size_t idx = indices[nodes[cur].indices + i];  
196 - T dist = Distance<T, D>(Query, pts[idx]);  
197 - if (dist < best_dist) {  
198 - best_dist = dist;  
199 - best_idx = idx; 182 + if (nodes[cur].left == -1) { // check whether it has left node or not
  183 + *d_node = cur;
  184 + for (int i = 0; i < nodes[cur].num_index; i++) {
  185 + size_t idx = indices[nodes[cur].index + i];
  186 + T dist = Distance<T, D>(d_query_point, d_reference_points[idx]);
  187 + if (dist < best_distance) {
  188 + best_distance = dist;
  189 + best_index = idx;
200 } 190 }
201 } 191 }
202 - break; 192 + break;
203 } 193 }
204 - else if (Query.coords[split_axis] < nodes[cur].split_value) { 194 + else if (d_query_point.dim[split_axis] < nodes[cur].split_value) { // jump into specific son node
205 cur = nodes[cur].left; 195 cur = nodes[cur].left;
206 } 196 }
207 else { 197 else {
208 cur = nodes[cur].right; 198 cur = nodes[cur].right;
209 } 199 }
210 } 200 }
211 - *ret_index = best_idx;  
212 - *ret_dist = best_dist; 201 + *d_distance = best_distance;
  202 + *d_index = best_index;
213 } 203 }
214 template <typename T, int D> 204 template <typename T, int D>
215 - __device__ void SearchAtNodeRange(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, int cur, T range, size_t *ret_index, T *ret_dist) {  
216 - /// search throught all the nodes that are within one range  
217 - size_t best_idx = 0;  
218 - T best_dist = FLT_MAX;  
219 - /// using fixed stack, increase it when need  
220 - int next_node[CUDA_STACK]; // should be larger than 1  
221 - int next_node_pos = 0; // initialize pop out order index  
222 - next_node[next_node_pos++] = cur; // equals to next_node[next_node_pos] = cur, next_node_pos++  
223 -  
224 - while (next_node_pos) {  
225 - int next_search[CUDA_STACK]; // for store next nodes  
226 - int next_search_pos = 0; // record push back order index  
227 - while (next_node_pos) {  
228 - cur = next_node[next_node_pos - 1]; // pop out the last in one and keep poping out  
229 - next_node_pos--; 205 + __device__ void SearchAtNodeRange(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
  206 + T best_distance = FLT_MAX;
  207 + size_t best_index = 0;
  208 +
  209 + int next_nodes_pos = 0; // initialize pop out order index
  210 + next_nodes[id * 1000 + next_nodes_pos] = cur; // find data that belongs to the very specific thread
  211 + next_nodes_pos++;
  212 +
  213 + while (next_nodes_pos) {
  214 + int next_search_nodes_pos = 0; // record push back order index
  215 + while (next_nodes_pos) {
  216 + cur = next_nodes[id * 1000 + next_nodes_pos - 1]; // pop out the last push in one and keep poping out
  217 + next_nodes_pos--;
230 int split_axis = nodes[cur].level % D; 218 int split_axis = nodes[cur].level % D;
231 219
232 if (nodes[cur].left == -1) { 220 if (nodes[cur].left == -1) {
233 - for (int i = 0; i < nodes[cur].num_indices; i++) {  
234 - int idx = indices[nodes[cur].indices + i]; // all indices are stored in one array, pick up from every node's beginning index  
235 - T d = Distance<T>(Query, pts[idx]);  
236 - if (d < best_dist) {  
237 - best_dist = d;  
238 - best_idx = idx; 221 + for (int i = 0; i < nodes[cur].num_index; i++) {
  222 + int idx = indices[nodes[cur].index + i]; // all indices are stored in one array, pick up from every node's beginning index
  223 + T d = Distance<T>(d_query_point, d_reference_points[idx]);
  224 + if (d < best_distance) {
  225 + best_distance = d;
  226 + best_index = idx;
239 } 227 }
240 } 228 }
241 } 229 }
242 else { 230 else {
243 - T d = Query.coords[split_axis] - nodes[cur].split_value; 231 + T d = d_query_point.dim[split_axis] - nodes[cur].split_value;
244 232
245 if (fabs(d) > range) { 233 if (fabs(d) > range) {
246 - if (d < 0)  
247 - next_search[next_search_pos++] = nodes[cur].left;  
248 - else  
249 - next_search[next_search_pos++] = nodes[cur].right; 234 + if (d < 0) {
  235 + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left;
  236 + next_search_nodes_pos++;
  237 + }
  238 + else {
  239 + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right;
  240 + next_search_nodes_pos++;
  241 + }
250 } 242 }
251 else { 243 else {
252 - next_search[next_search_pos++] = nodes[cur].left;  
253 - next_search[next_search_pos++] = nodes[cur].right; 244 + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right;
  245 + next_search_nodes_pos++;
  246 + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left;
  247 + next_search_nodes_pos++;
  248 + if (next_search_nodes_pos > 1000) {
  249 + printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id);
  250 + (*Judge)++;
  251 + }
254 } 252 }
255 } 253 }
256 } 254 }
257 -  
258 - for (int i = 0; i < next_search_pos; i++)  
259 - next_node[i] = next_search[i];  
260 - next_node_pos = next_search_pos; // operation that really resemble STACK, namely first in last out 255 + for (int i = 0; i < next_search_nodes_pos; i++)
  256 + next_nodes[id * 1000 + i] = next_search_nodes[id * 1000 + i];
  257 + next_nodes_pos = next_search_nodes_pos;
261 } 258 }
262 - *ret_index = best_idx;  
263 - *ret_dist = best_dist; 259 + *d_distance = best_distance;
  260 + *d_index = best_index;
264 } 261 }
265 template <typename T, int D> 262 template <typename T, int D>
266 - __device__ void Search(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist) {  
267 - /// find first nearest node 263 + __device__ void Search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
268 int best_node = 0; 264 int best_node = 0;
269 - size_t best_idx = 0;  
270 - T best_dist = FLT_MAX; 265 + T best_distance = FLT_MAX;
  266 + size_t best_index = 0;
271 T radius = 0; 267 T radius = 0;
272 - SearchAtNode<T, D>(nodes, indices, pts, 0, Query, &best_idx, &best_dist, &best_node);  
273 - radius = sqrt(best_dist);  
274 - /// find other possibilities 268 +
  269 + SearchAtNode<T, D>(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node);
  270 + radius = sqrt(best_distance); // get range
275 int cur = best_node; 271 int cur = best_node;
276 272
277 while (nodes[cur].parent != -1) { 273 while (nodes[cur].parent != -1) {
278 - /// go up  
279 int parent = nodes[cur].parent; 274 int parent = nodes[cur].parent;
280 int split_axis = nodes[parent].level % D; 275 int split_axis = nodes[parent].level % D;
281 - /// search other node 276 +
282 T tmp_dist = FLT_MAX; 277 T tmp_dist = FLT_MAX;
283 size_t tmp_idx; 278 size_t tmp_idx;
284 - if (fabs(nodes[parent].split_value - Query.coords[split_axis]) <= radius) {  
285 - /// search opposite node 279 + if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) {
286 if (nodes[parent].left != cur) 280 if (nodes[parent].left != cur)
287 - SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].left, radius, &tmp_idx, &tmp_dist); 281 + SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
288 else 282 else
289 - SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].right, radius, &tmp_idx, &tmp_dist); 283 + SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
290 } 284 }
291 - if (tmp_dist < best_dist) {  
292 - best_dist = tmp_dist;  
293 - best_idx = tmp_idx; 285 + if (tmp_dist < best_distance) {
  286 + best_distance = tmp_dist;
  287 + best_index = tmp_idx;
294 } 288 }
295 cur = parent; 289 cur = parent;
296 } 290 }
297 - *ret_index = best_idx;  
298 - *ret_dist = sqrt(best_dist); 291 + *d_distance = sqrt(best_distance);
  292 + *d_index = best_index;
299 } 293 }
300 template <typename T, int D> 294 template <typename T, int D>
301 - __global__ void SearchBatch(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, size_t num_pts, kdtree::point<T, D> *Query, size_t num_Query, size_t *ret_index, T *ret_dist) {  
302 - int idx = blockIdx.x * blockDim.x + threadIdx.x;  
303 - if (idx >= num_Query) return; 295 + __global__ void SearchBatch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) {
  296 + size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  297 + if (idx >= d_query_count) return; // avoid segfault
304 298
305 - Search<T, D>(nodes, indices, pts, Query[idx], &ret_index[idx], &ret_dist[idx]); // every query points are independent 299 + Search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent
306 } 300 }
307 301
308 template <typename T, int D = 3> 302 template <typename T, int D = 3>
309 class cuda_kdtree { 303 class cuda_kdtree {
310 protected: 304 protected:
311 - CUDA_KDNode<T> *m_gpu_nodes; // store nodes  
312 - size_t *m_gpu_indices;  
313 - kdtree::point<T, D>* m_gpu_points;  
314 - size_t m_num_points; 305 + cuda_kdnode<T> *d_nodes;
  306 + size_t *d_index;
  307 + kdtree::point<T, D>* d_reference_points;
  308 + size_t d_reference_count;
315 public: 309 public:
316 ~cuda_kdtree() { 310 ~cuda_kdtree() {
317 - HANDLE_ERROR(cudaFree(m_gpu_nodes));  
318 - HANDLE_ERROR(cudaFree(m_gpu_indices));  
319 - HANDLE_ERROR(cudaFree(m_gpu_points)); 311 + HANDLE_ERROR(cudaFree(d_nodes));
  312 + HANDLE_ERROR(cudaFree(d_index));
  313 + HANDLE_ERROR(cudaFree(d_reference_points));
320 } 314 }
321 - void CreateKDTree(T *ReferencePoints, size_t ReferenceCount, size_t ColCount, size_t max_levels) {  
322 - std::vector < kdtree::point<T, D> > pts(ReferenceCount); // create specific struct of reference data  
323 - for (size_t j = 0; j < ReferenceCount; j++)  
324 - for (size_t i = 0; i < ColCount; i++)  
325 - pts[j].coords[i] = ReferencePoints[j * ColCount + i];  
326 - cpu_kdtree<T, D> tree; // initialize a tree  
327 - tree.Create(pts, max_levels); // create KD-Tree on host  
328 - kdtree::KDNode<T> *root = tree.GetRoot(); 315 + void CreateKDTree(T *h_reference_points, size_t reference_count, size_t dim_count, size_t max_levels) {
  316 + if (max_levels > 10) {
  317 + std::cout<<"The max_tree_levels should be smaller!"<<std::endl;
  318 + exit(1);
  319 + }
  320 + std::vector <kdtree::point<T, D>> reference_points(reference_count); // restore the reference points in particular way
  321 + for (size_t j = 0; j < reference_count; j++)
  322 + for (size_t i = 0; i < dim_count; i++)
  323 + reference_points[j].dim[i] = h_reference_points[j * dim_count + i];
  324 + cpu_kdtree<T, D> tree; // creating a tree on cpu
  325 + tree.Create(reference_points, max_levels); // building a tree on cpu
  326 + kdtree::kdnode<T> *d_root = tree.GetRoot();
329 int num_nodes = tree.GetNumNodes(); 327 int num_nodes = tree.GetNumNodes();
330 - /// create the same on CPU  
331 - m_num_points = pts.size(); // number of points for creating tree = reference_count in the case 328 + d_reference_count = reference_points.size(); // also equals to reference_count
332 329
333 - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode<T>) * num_nodes)); // private variables for kdtree  
334 - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_indices, sizeof(size_t) * m_num_points));  
335 - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_points, sizeof(kdtree::point<T, D>) * m_num_points));  
336 -  
337 - std::vector <CUDA_KDNode<T>> cpu_nodes(num_nodes); // from left to right, id of nodes  
338 - std::vector <size_t> indices(m_num_points);  
339 - std::vector < kdtree::KDNode<T>* > next_node; 330 + HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes)); // copy data from host to device
  331 + HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * d_reference_count));
  332 + HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(kdtree::point<T, D>) * d_reference_count));
340 333
  334 + std::vector <cuda_kdnode<T>> tmp_nodes(num_nodes);
  335 + std::vector <size_t> indices(d_reference_count);
  336 + std::vector <kdtree::kdnode<T>*> next_nodes;
341 size_t cur_pos = 0; 337 size_t cur_pos = 0;
342 -  
343 - next_node.push_back(root);  
344 -  
345 - while (next_node.size()) {  
346 - std::vector <typename kdtree::KDNode<T>* > next_search;  
347 -  
348 - while (next_node.size()) {  
349 - kdtree::KDNode<T> *cur = next_node.back();  
350 - next_node.pop_back();  
351 -  
352 - int id = cur->id; // the nodes at same level are independent  
353 -  
354 - cpu_nodes[id].level = cur->level;  
355 - cpu_nodes[id].parent = cur->_parent;  
356 - cpu_nodes[id].left = cur->_left;  
357 - cpu_nodes[id].right = cur->_right;  
358 - cpu_nodes[id].split_value = cur->split_value;  
359 - cpu_nodes[id].num_indices = cur->indices.size(); // number of index  
360 - 338 + next_nodes.push_back(d_root);
  339 + while (next_nodes.size()) {
  340 + std::vector <typename kdtree::kdnode<T>*> next_search_nodes;
  341 + while (next_nodes.size()) {
  342 + kdtree::kdnode<T> *cur = next_nodes.back();
  343 + next_nodes.pop_back();
  344 + int id = cur->idx; // the nodes at same level are independent
  345 + tmp_nodes[id].level = cur->level;
  346 + tmp_nodes[id].parent = cur->parent_idx;
  347 + tmp_nodes[id].left = cur->left_idx;
  348 + tmp_nodes[id].right = cur->right_idx;
  349 + tmp_nodes[id].split_value = cur->split_value;
  350 + tmp_nodes[id].num_index = cur->indices.size(); // number of index
361 if (cur->indices.size()) { 351 if (cur->indices.size()) {
362 for (size_t i = 0; i < cur->indices.size(); i++) 352 for (size_t i = 0; i < cur->indices.size(); i++)
363 indices[cur_pos + i] = cur->indices[i]; 353 indices[cur_pos + i] = cur->indices[i];
364 354
365 - cpu_nodes[id].indices = (int)cur_pos; // beginning index that every bottom node has  
366 - cur_pos += cur->indices.size(); // store indices continuously 355 + tmp_nodes[id].index = (int)cur_pos; // beginning index of reference_points that every bottom node has
  356 + cur_pos += cur->indices.size(); // store indices continuously for every query_point
367 } 357 }
368 else { 358 else {
369 - cpu_nodes[id].indices = -1; 359 + tmp_nodes[id].index = -1;
370 } 360 }
371 361
372 if (cur->left) 362 if (cur->left)
373 - next_search.push_back(cur->left); 363 + next_search_nodes.push_back(cur->left);
374 364
375 if (cur->right) 365 if (cur->right)
376 - next_search.push_back(cur->right); 366 + next_search_nodes.push_back(cur->right);
377 } 367 }
378 - next_node = next_search; 368 + next_nodes = next_search_nodes;
379 } 369 }
380 -  
381 - HANDLE_ERROR(cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode<T>) * cpu_nodes.size(), cudaMemcpyHostToDevice));  
382 - HANDLE_ERROR(cudaMemcpy(m_gpu_indices, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));  
383 - HANDLE_ERROR(cudaMemcpy(m_gpu_points, &pts[0], sizeof(kdtree::point<T, D>) * pts.size(), cudaMemcpyHostToDevice)); 370 + HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice));
  371 + HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
  372 + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_points.size(), cudaMemcpyHostToDevice));
384 } 373 }
385 - void Search(T *QueryPoints, size_t QueryCount, size_t ColCount, T *dists, size_t *indices) {  
386 - std::vector < kdtree::point<T, D> > query(QueryCount);  
387 - for (size_t j = 0; j < QueryCount; j++)  
388 - for (size_t i = 0; i < ColCount; i++)  
389 - query[j].coords[i] = QueryPoints[j * ColCount + i];  
390 -  
391 - unsigned int threads = (unsigned int)(query.size() > 1024 ? 1024 : query.size());  
392 - //unsigned int blocks = (unsigned int)ceil(query.size() / threads);  
393 - unsigned int blocks = (unsigned int)(query.size() / threads + (query.size() % threads ? 1 : 0));  
394 -  
395 - kdtree::point<T, D> *gpu_Query;  
396 - size_t *gpu_ret_indices;  
397 - T *gpu_ret_dist;  
398 -  
399 - HANDLE_ERROR(cudaMalloc((void**)&gpu_Query, sizeof(T) * query.size() * D));  
400 - HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_indices, sizeof(size_t) * query.size()));  
401 - HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_dist, sizeof(T) * query.size()));  
402 - HANDLE_ERROR(cudaMemcpy(gpu_Query, &query[0], sizeof(T) * query.size() * D, cudaMemcpyHostToDevice));  
403 -  
404 - SearchBatch << <threads, blocks >> > (m_gpu_nodes, m_gpu_indices, m_gpu_points, m_num_points, gpu_Query, query.size(), gpu_ret_indices, gpu_ret_dist);  
405 -  
406 - HANDLE_ERROR(cudaMemcpy(indices, gpu_ret_indices, sizeof(size_t) * query.size(), cudaMemcpyDeviceToHost));  
407 - HANDLE_ERROR(cudaMemcpy(dists, gpu_ret_dist, sizeof(T) * query.size(), cudaMemcpyDeviceToHost)); 374 + void Search(T *h_query_points, size_t query_count, size_t dim_count, T *dists, size_t *indices) {
  375 + std::vector <kdtree::point<T, D>> query_points(query_count);
  376 + for (size_t j = 0; j < query_count; j++)
  377 + for (size_t i = 0; i < dim_count; i++)
  378 + query_points[j].dim[i] = h_query_points[j * dim_count + i];
  379 +
  380 + unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size());
  381 + unsigned int blocks = (unsigned int)(query_points.size() / threads + (query_points.size() % threads ? 1 : 0));
  382 +
  383 + kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu
  384 + size_t *d_indices;
  385 + T *d_distances;
  386 +
  387 + int *next_nodes; // create two STACK-like array
  388 + int *next_search_nodes;
  389 +
  390 + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory
  391 +
  392 + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_points.size() * D));
  393 + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size()));
  394 + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size()));
  395 + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * 1000 * sizeof(int))); // STACK size right now is 1000, you can change it if you mean to
  396 + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * 1000 * sizeof(int)));
  397 + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_points.size() * D, cudaMemcpyHostToDevice));
  398 +
  399 + SearchBatch<<<threads, blocks>>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge);
  400 +
  401 + if (Judge == NULL) { // do the following work if the thread works safely
  402 + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost));
  403 + HANDLE_ERROR(cudaMemcpy(dists, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost));
  404 + }
408 405
409 - HANDLE_ERROR(cudaFree(gpu_Query));  
410 - HANDLE_ERROR(cudaFree(gpu_ret_indices));  
411 - HANDLE_ERROR(cudaFree(gpu_ret_dist)); 406 + HANDLE_ERROR(cudaFree(next_nodes));
  407 + HANDLE_ERROR(cudaFree(next_search_nodes));
  408 + HANDLE_ERROR(cudaFree(d_query_points));
  409 + HANDLE_ERROR(cudaFree(d_indices));
  410 + HANDLE_ERROR(cudaFree(d_distances));
412 } 411 }
413 }; 412 };
414 } //end namespace stim 413 } //end namespace stim