Commit 6d247a6cdfd13b3037ca88c8cb5cd260506d10e4
Merge branch 'kdtree' of git.stim.ee.uh.edu:codebase/stimlib into kdtree
Showing
1 changed file
with
257 additions
and
258 deletions
Show diff stats
stim/structures/kdtree.cuh
1 | -// change CUDA_STACK together with max_tree_levels in trial and error manner | 1 | +// right now the size of CUDA STACK is set to 1000, increase it if you mean to make deeper tree |
2 | // data should be stored in row-major | 2 | // data should be stored in row-major |
3 | // x1,x2,x3,x4,x5...... | 3 | // x1,x2,x3,x4,x5...... |
4 | // y1,y2,y3,y4,y5...... | 4 | // y1,y2,y3,y4,y5...... |
@@ -16,159 +16,150 @@ | @@ -16,159 +16,150 @@ | ||
16 | #include <float.h> | 16 | #include <float.h> |
17 | #include <iostream> | 17 | #include <iostream> |
18 | #include <algorithm> | 18 | #include <algorithm> |
19 | - | ||
20 | -/// using API called HADDLE_ERROR | ||
21 | -static void HandleError(cudaError_t err, const char *file, int line) { | ||
22 | - if (err != cudaSuccess) { | ||
23 | - std::cout<<cudaGetErrorString(err)<<" in"<< file <<" at line "<<line<<std::endl; | ||
24 | - } | ||
25 | -} | ||
26 | -#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) | ||
27 | - | ||
28 | -#define CUDA_STACK 2 // implementation "stacks" on CUDA as to do store the nodes information | 19 | +#include <stim/cuda/cudatools/error.h> |
29 | 20 | ||
30 | namespace stim { | 21 | namespace stim { |
31 | namespace kdtree { | 22 | namespace kdtree { |
32 | - template<typename T, int D> | 23 | + template<typename T, int D> // typename refers to float or double while D refers to dimension of points |
33 | struct point { | 24 | struct point { |
34 | - T coords[D]; // if we use size to measure a vector<point>, it will show the number of point structures | 25 | + T dim[D]; // create a structure to store every one input point |
35 | }; | 26 | }; |
36 | 27 | ||
37 | template<typename T> | 28 | template<typename T> |
38 | - class KDNode { | 29 | + class kdnode { |
39 | public: | 30 | public: |
40 | - KDNode() { // initialization | ||
41 | - parent = NULL; | 31 | + kdnode() { // constructor for initializing a kdnode |
32 | + parent = NULL; // set every node's parent, left and right kdnode pointers to NULL | ||
42 | left = NULL; | 33 | left = NULL; |
43 | right = NULL; | 34 | right = NULL; |
44 | - split_value = -1; | ||
45 | - _parent = -1; | ||
46 | - _left = -1; | ||
47 | - _right = -1; | 35 | + parent_idx = -1; // set parent node index to default -1 |
36 | + left_idx = -1; | ||
37 | + right_idx = -1; | ||
38 | + split_value = -1; // set split_value to default -1 | ||
48 | } | 39 | } |
49 | - int id; // id for current node | ||
50 | - size_t level; | ||
51 | - KDNode *parent, *left, *right; | ||
52 | - int _parent, _left, _right; // id for parent node | ||
53 | - T split_value; // node value | ||
54 | - std::vector <size_t> indices; // indices that indicate the data that current tree has | 40 | + int idx; // index of current node |
41 | + int parent_idx, left_idx, right_idx; // index of parent, left and right nodes | ||
42 | + kdnode *parent, *left, *right; // parent, left and right kdnodes | ||
43 | + T split_value; // splitting value of current node | ||
44 | + std::vector <size_t> indices; // it indicates the points' indices that current node has | ||
45 | + size_t level; // tree level of current node | ||
55 | }; | 46 | }; |
56 | - } | 47 | + } // end of namespace kdtree |
57 | 48 | ||
58 | - template <typename T, int D = 3> | 49 | + template <typename T, int D = 3> // set dimension of data to default 3 |
59 | class cpu_kdtree { | 50 | class cpu_kdtree { |
60 | - | ||
61 | protected: | 51 | protected: |
62 | - std::vector <kdtree::point<T, D>> *m_pts; | ||
63 | - kdtree::KDNode<T> *m_root; // current node | ||
64 | - int m_current_axis; | ||
65 | - size_t m_levels; | ||
66 | - int m_cmps; // count how many comparisons are to made in the tree for one query | ||
67 | - int m_id; // level + 1 | ||
68 | - static cpu_kdtree<T, D> *myself; | 52 | + int current_axis; // current judging axis |
53 | + int cmps; // count how many time of comparisons (just for cpu-kdtree) | ||
54 | + int n_id; // store the total number of nodes | ||
55 | + std::vector <kdtree::point<T, D>> *tmp_points; // transfer or temp points | ||
56 | + kdtree::kdnode<T> *root; // root node | ||
57 | + static cpu_kdtree<T, D> *cur_tree_ptr; | ||
69 | public: | 58 | public: |
70 | - cpu_kdtree() { // initialization | ||
71 | - myself = this; | ||
72 | - m_id = 0; // id = level + 1, level -> axis index while id -> node identifier | 59 | + cpu_kdtree() { // constructor for creating a cpu_kdtree |
60 | + cur_tree_ptr = this; // create a class pointer points to the current class value | ||
61 | + n_id = 0; // set total number of points to default 0 | ||
73 | } | 62 | } |
74 | - ~cpu_kdtree() { // destructor for deleting what was created by kdtree() | ||
75 | - std::vector <kdtree::KDNode<T>*> next_node; | ||
76 | - next_node.push_back(m_root); | ||
77 | - while (next_node.size()) { | ||
78 | - std::vector <kdtree::KDNode<T>*> next_search; | ||
79 | - while (next_node.size()) { | ||
80 | - kdtree::KDNode<T> *cur = next_node.back(); | ||
81 | - next_node.pop_back(); | 63 | + ~cpu_kdtree() { // destructor of cpu_kdtree |
64 | + std::vector <kdtree::kdnode<T>*> next_nodes; | ||
65 | + next_nodes.push_back(root); | ||
66 | + while (next_nodes.size()) { | ||
67 | + std::vector <kdtree::kdnode<T>*> next_search_nodes; | ||
68 | + while (next_nodes.size()) { | ||
69 | + kdtree::kdnode<T> *cur = next_nodes.back(); | ||
70 | + next_nodes.pop_back(); | ||
82 | if (cur->left) | 71 | if (cur->left) |
83 | - next_search.push_back(cur->left); | 72 | + next_search_nodes.push_back(cur->left); |
84 | if (cur->right) | 73 | if (cur->right) |
85 | - next_search.push_back(cur->right); | 74 | + next_search_nodes.push_back(cur->right); |
86 | delete cur; | 75 | delete cur; |
87 | } | 76 | } |
88 | - next_node = next_search; | 77 | + next_nodes = next_search_nodes; |
89 | } | 78 | } |
90 | - m_root = NULL; | 79 | + root = NULL; |
91 | } | 80 | } |
92 | - void Create(std::vector <kdtree::point<T, D>> &pts, size_t max_levels) { | ||
93 | - m_pts = &pts; // create a pointer point to the input data | ||
94 | - m_levels = max_levels; // stores max tree levels | ||
95 | - m_root = new kdtree::KDNode<T>(); // using KDNode() to initialize an ancestor node | ||
96 | - m_root->id = m_id++; // id is 1 while level is 0 at the very beginning | ||
97 | - m_root->level = 0; // to begin with level 0 | ||
98 | - m_root->indices.resize(pts.size()); // initialize the size of whole indices | ||
99 | - for (size_t i = 0; i < pts.size(); i++) { | ||
100 | - m_root->indices[i] = i; // like what we did on Keys in GPU-BF part | 81 | + void Create(std::vector <kdtree::point<T, D>> &reference_points, size_t max_levels) { |
82 | + tmp_points = &reference_points; | ||
83 | + root = new kdtree::kdnode<T>(); // initializing the root node | ||
84 | + root->idx = n_id++; // the index of root is 0 | ||
85 | + root->level = 0; // tree level begins at 0 | ||
86 | + root->indices.resize(reference_points.size()); // get the number of points | ||
87 | + for (size_t i = 0; i < reference_points.size(); i++) { | ||
88 | + root->indices[i] = i; // set indices of input points | ||
101 | } | 89 | } |
102 | - std::vector <kdtree::KDNode<T>*> next_node; // next node | ||
103 | - next_node.push_back(m_root); // new node | ||
104 | - while (next_node.size()) { | ||
105 | - std::vector <kdtree::KDNode<T>*> next_search; | ||
106 | - while (next_node.size()) { // two same WHILE is because we need to make a new vector for searching | ||
107 | - kdtree::KDNode<T> *current_node = next_node.back(); // pointer point to current node (right first) | ||
108 | - next_node.pop_back(); // pop out current node in order to store next node | ||
109 | - if (current_node->level < max_levels) { // max_levels should be reasonably small compared with numbers of data | ||
110 | - if (current_node->indices.size() > 1) { | ||
111 | - kdtree::KDNode<T> *left = new kdtree::KDNode<T>(); | ||
112 | - kdtree::KDNode<T> *right = new kdtree::KDNode<T>(); | ||
113 | - left->id = m_id++; // risky guessing but OK for large amount of data since max_level is small | ||
114 | - right->id = m_id++; // risky guessing but OK for large amount of data since max_level is small | ||
115 | - Split(current_node, left, right); // split left and right and determine a node | ||
116 | - std::vector <size_t> temp; // empty vecters of int | 90 | + std::vector <kdtree::kdnode<T>*> next_nodes; // next nodes |
91 | + next_nodes.push_back(root); // push back the root node | ||
92 | + while (next_nodes.size()) { | ||
93 | + std::vector <kdtree::kdnode<T>*> next_search_nodes; // next search nodes | ||
94 | + while (next_nodes.size()) { // two same WHILE is because we need to make a new vector to store nodes for search | ||
95 | + kdtree::kdnode<T> *current_node = next_nodes.back(); // handle node one by one (right first) | ||
96 | + next_nodes.pop_back(); // pop out current node in order to store next round of nodes | ||
97 | + if (current_node->level < max_levels) { | ||
98 | + if (current_node->indices.size() > 1) { // split if the nonleaf node contains more than one point | ||
99 | + kdtree::kdnode<T> *left = new kdtree::kdnode<T>(); | ||
100 | + kdtree::kdnode<T> *right = new kdtree::kdnode<T>(); | ||
101 | + left->idx = n_id++; // set the index of current node's left node | ||
102 | + right->idx = n_id++; | ||
103 | + Split(current_node, left, right); // split left and right and determine a node | ||
104 | + std::vector <size_t> temp; // empty vecters of int | ||
117 | //temp.resize(current_node->indices.size()); | 105 | //temp.resize(current_node->indices.size()); |
118 | - current_node->indices.swap(temp); // clean up current tree's indices | 106 | + current_node->indices.swap(temp); // clean up current node's indices |
119 | current_node->left = left; | 107 | current_node->left = left; |
120 | current_node->right = right; | 108 | current_node->right = right; |
121 | - current_node->_left = left->id; // indicates it has left son node and gets its id | ||
122 | - current_node->_right = right->id; // indicates it has right son node and gets its id | ||
123 | - if (left->indices.size()) | ||
124 | - next_search.push_back(left); // right first then left according to stack(first in last out), it can be done in parallel for left and right are independent | 109 | + current_node->left_idx = left->idx; |
110 | + current_node->right_idx = right->idx; | ||
125 | if (right->indices.size()) | 111 | if (right->indices.size()) |
126 | - next_search.push_back(right); | 112 | + next_search_nodes.push_back(right); // left pop out first |
113 | + if (left->indices.size()) | ||
114 | + next_search_nodes.push_back(left); | ||
127 | } | 115 | } |
128 | } | 116 | } |
129 | } | 117 | } |
130 | - next_node = next_search; | 118 | + next_nodes = next_search_nodes; // go deeper within the tree |
131 | } | 119 | } |
132 | } | 120 | } |
133 | - static bool SortPoints(const size_t a, const size_t b) { | ||
134 | - std::vector <kdtree::point<T, D>> &pts = *myself->m_pts; | ||
135 | - return pts[a].coords[myself->m_current_axis] < pts[b].coords[myself->m_current_axis]; | 121 | + static bool SortPoints(const size_t a, const size_t b) { // create functor for std::sort |
122 | + std::vector <kdtree::point<T, D>> &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer | ||
123 | + return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis]; | ||
136 | } | 124 | } |
137 | - void Split(kdtree::KDNode<T> *cur, kdtree::KDNode<T> *left, kdtree::KDNode<T> *right) { | ||
138 | - /// assume both two sides are created and sure it was | ||
139 | - std::vector <kdtree::point<T, D>> &pts = *m_pts; | ||
140 | - m_current_axis = cur->level % D; // indicate the judicative dimension or axis | 125 | + void Split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) { |
126 | + std::vector <kdtree::point<T, D>> &pts = *tmp_points; | ||
127 | + current_axis = cur->level % D; // indicate the judicative dimension or axis | ||
141 | std::sort(cur->indices.begin(), cur->indices.end(), SortPoints); // using SortPoints as comparison function to sort the data | 128 | std::sort(cur->indices.begin(), cur->indices.end(), SortPoints); // using SortPoints as comparison function to sort the data |
142 | - size_t mid = cur->indices[cur->indices.size() / 2]; // odd in the mid, even take the floor | ||
143 | - cur->split_value = pts[mid].coords[m_current_axis]; // get the mother node | ||
144 | - left->parent = cur; // set the parent to current node for the next nodes | 129 | + size_t mid_value = cur->indices[cur->indices.size() / 2]; // odd in the mid_value, even take the floor |
130 | + cur->split_value = pts[mid_value].dim[current_axis]; // get the parent node | ||
131 | + left->parent = cur; // set the parent of the next search nodes to current node | ||
145 | right->parent = cur; | 132 | right->parent = cur; |
146 | - left->level = cur->level + 1; | 133 | + left->level = cur->level + 1; // level + 1 |
147 | right->level = cur->level + 1; | 134 | right->level = cur->level + 1; |
148 | - left->_parent = cur->id; // indicates it has mother node and gets its id | ||
149 | - right->_parent = cur->id; // indicates it has mother node and gets its id | ||
150 | - for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right area one by one | 135 | + left->parent_idx = cur->idx; // set its parent node's index |
136 | + right->parent_idx = cur->idx; | ||
137 | + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one | ||
151 | size_t idx = cur->indices[i]; | 138 | size_t idx = cur->indices[i]; |
152 | - if (pts[idx].coords[m_current_axis] < cur->split_value) | 139 | + if (pts[idx].dim[current_axis] < cur->split_value) |
153 | left->indices.push_back(idx); | 140 | left->indices.push_back(idx); |
154 | else | 141 | else |
155 | right->indices.push_back(idx); | 142 | right->indices.push_back(idx); |
156 | } | 143 | } |
157 | } | 144 | } |
158 | - int GetNumNodes() const { return m_id; } | ||
159 | - kdtree::KDNode<T>* GetRoot() const { return m_root; } | 145 | + int GetNumNodes() const { // get the total number of nodes |
146 | + return n_id; | ||
147 | + } | ||
148 | + kdtree::kdnode<T>* GetRoot() const { // get the root node of tree | ||
149 | + return root; | ||
150 | + } | ||
160 | }; //end class kdtree | 151 | }; //end class kdtree |
161 | 152 | ||
162 | template <typename T, int D> | 153 | template <typename T, int D> |
163 | - cpu_kdtree<T, D>* cpu_kdtree<T, D>::myself = NULL; // definition of myself pointer points to the specific class | 154 | + cpu_kdtree<T, D>* cpu_kdtree<T, D>::cur_tree_ptr = NULL; // definition of cur_tree_ptr pointer points to the current class |
164 | 155 | ||
165 | template <typename T> | 156 | template <typename T> |
166 | - struct CUDA_KDNode { | ||
167 | - size_t level; | ||
168 | - int parent, left, right; // indicates id of | 157 | + struct cuda_kdnode { |
158 | + int parent, left, right; | ||
169 | T split_value; | 159 | T split_value; |
170 | - size_t num_indices; // number of indices it has | ||
171 | - int indices; // the beginning | 160 | + size_t num_index; // number of indices it has |
161 | + int index; // the beginning index | ||
162 | + size_t level; | ||
172 | }; | 163 | }; |
173 | 164 | ||
174 | template <typename T, int D> | 165 | template <typename T, int D> |
@@ -176,239 +167,247 @@ namespace stim { | @@ -176,239 +167,247 @@ namespace stim { | ||
176 | T dist = 0; | 167 | T dist = 0; |
177 | 168 | ||
178 | for (size_t i = 0; i < D; i++) { | 169 | for (size_t i = 0; i < D; i++) { |
179 | - T d = a.coords[i] - b.coords[i]; | 170 | + T d = a.dim[i] - b.dim[i]; |
180 | dist += d*d; | 171 | dist += d*d; |
181 | } | 172 | } |
182 | return dist; | 173 | return dist; |
183 | } | 174 | } |
184 | template <typename T, int D> | 175 | template <typename T, int D> |
185 | - __device__ void SearchAtNode(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, int cur, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist, int *ret_node) { | ||
186 | - /// finds the first possibility | ||
187 | - size_t best_idx = 0; | ||
188 | - T best_dist = FLT_MAX; | 176 | + __device__ void SearchAtNode(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) { |
177 | + T best_distance = FLT_MAX; | ||
178 | + size_t best_index = 0; | ||
189 | 179 | ||
190 | - while (true) { | 180 | + while (true) { // break until reach the bottom |
191 | int split_axis = nodes[cur].level % D; | 181 | int split_axis = nodes[cur].level % D; |
192 | - if (nodes[cur].left == -1) { // if it doesn't have left son node | ||
193 | - *ret_node = cur; | ||
194 | - for (int i = 0; i < nodes[cur].num_indices; i++) { | ||
195 | - size_t idx = indices[nodes[cur].indices + i]; | ||
196 | - T dist = Distance<T, D>(Query, pts[idx]); | ||
197 | - if (dist < best_dist) { | ||
198 | - best_dist = dist; | ||
199 | - best_idx = idx; | 182 | + if (nodes[cur].left == -1) { // check whether it has left node or not |
183 | + *d_node = cur; | ||
184 | + for (int i = 0; i < nodes[cur].num_index; i++) { | ||
185 | + size_t idx = indices[nodes[cur].index + i]; | ||
186 | + T dist = Distance<T, D>(d_query_point, d_reference_points[idx]); | ||
187 | + if (dist < best_distance) { | ||
188 | + best_distance = dist; | ||
189 | + best_index = idx; | ||
200 | } | 190 | } |
201 | } | 191 | } |
202 | - break; | 192 | + break; |
203 | } | 193 | } |
204 | - else if (Query.coords[split_axis] < nodes[cur].split_value) { | 194 | + else if (d_query_point.dim[split_axis] < nodes[cur].split_value) { // jump into specific son node |
205 | cur = nodes[cur].left; | 195 | cur = nodes[cur].left; |
206 | } | 196 | } |
207 | else { | 197 | else { |
208 | cur = nodes[cur].right; | 198 | cur = nodes[cur].right; |
209 | } | 199 | } |
210 | } | 200 | } |
211 | - *ret_index = best_idx; | ||
212 | - *ret_dist = best_dist; | 201 | + *d_distance = best_distance; |
202 | + *d_index = best_index; | ||
213 | } | 203 | } |
214 | template <typename T, int D> | 204 | template <typename T, int D> |
215 | - __device__ void SearchAtNodeRange(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, int cur, T range, size_t *ret_index, T *ret_dist) { | ||
216 | - /// search throught all the nodes that are within one range | ||
217 | - size_t best_idx = 0; | ||
218 | - T best_dist = FLT_MAX; | ||
219 | - /// using fixed stack, increase it when need | ||
220 | - int next_node[CUDA_STACK]; // should be larger than 1 | ||
221 | - int next_node_pos = 0; // initialize pop out order index | ||
222 | - next_node[next_node_pos++] = cur; // equals to next_node[next_node_pos] = cur, next_node_pos++ | ||
223 | - | ||
224 | - while (next_node_pos) { | ||
225 | - int next_search[CUDA_STACK]; // for store next nodes | ||
226 | - int next_search_pos = 0; // record push back order index | ||
227 | - while (next_node_pos) { | ||
228 | - cur = next_node[next_node_pos - 1]; // pop out the last in one and keep poping out | ||
229 | - next_node_pos--; | 205 | + __device__ void SearchAtNodeRange(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { |
206 | + T best_distance = FLT_MAX; | ||
207 | + size_t best_index = 0; | ||
208 | + | ||
209 | + int next_nodes_pos = 0; // initialize pop out order index | ||
210 | + next_nodes[id * 1000 + next_nodes_pos] = cur; // find data that belongs to the very specific thread | ||
211 | + next_nodes_pos++; | ||
212 | + | ||
213 | + while (next_nodes_pos) { | ||
214 | + int next_search_nodes_pos = 0; // record push back order index | ||
215 | + while (next_nodes_pos) { | ||
216 | + cur = next_nodes[id * 1000 + next_nodes_pos - 1]; // pop out the last push in one and keep poping out | ||
217 | + next_nodes_pos--; | ||
230 | int split_axis = nodes[cur].level % D; | 218 | int split_axis = nodes[cur].level % D; |
231 | 219 | ||
232 | if (nodes[cur].left == -1) { | 220 | if (nodes[cur].left == -1) { |
233 | - for (int i = 0; i < nodes[cur].num_indices; i++) { | ||
234 | - int idx = indices[nodes[cur].indices + i]; // all indices are stored in one array, pick up from every node's beginning index | ||
235 | - T d = Distance<T>(Query, pts[idx]); | ||
236 | - if (d < best_dist) { | ||
237 | - best_dist = d; | ||
238 | - best_idx = idx; | 221 | + for (int i = 0; i < nodes[cur].num_index; i++) { |
222 | + int idx = indices[nodes[cur].index + i]; // all indices are stored in one array, pick up from every node's beginning index | ||
223 | + T d = Distance<T>(d_query_point, d_reference_points[idx]); | ||
224 | + if (d < best_distance) { | ||
225 | + best_distance = d; | ||
226 | + best_index = idx; | ||
239 | } | 227 | } |
240 | } | 228 | } |
241 | } | 229 | } |
242 | else { | 230 | else { |
243 | - T d = Query.coords[split_axis] - nodes[cur].split_value; | 231 | + T d = d_query_point.dim[split_axis] - nodes[cur].split_value; |
244 | 232 | ||
245 | if (fabs(d) > range) { | 233 | if (fabs(d) > range) { |
246 | - if (d < 0) | ||
247 | - next_search[next_search_pos++] = nodes[cur].left; | ||
248 | - else | ||
249 | - next_search[next_search_pos++] = nodes[cur].right; | 234 | + if (d < 0) { |
235 | + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left; | ||
236 | + next_search_nodes_pos++; | ||
237 | + } | ||
238 | + else { | ||
239 | + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right; | ||
240 | + next_search_nodes_pos++; | ||
241 | + } | ||
250 | } | 242 | } |
251 | else { | 243 | else { |
252 | - next_search[next_search_pos++] = nodes[cur].left; | ||
253 | - next_search[next_search_pos++] = nodes[cur].right; | 244 | + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right; |
245 | + next_search_nodes_pos++; | ||
246 | + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left; | ||
247 | + next_search_nodes_pos++; | ||
248 | + if (next_search_nodes_pos > 1000) { | ||
249 | + printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id); | ||
250 | + (*Judge)++; | ||
251 | + } | ||
254 | } | 252 | } |
255 | } | 253 | } |
256 | } | 254 | } |
257 | - | ||
258 | - for (int i = 0; i < next_search_pos; i++) | ||
259 | - next_node[i] = next_search[i]; | ||
260 | - next_node_pos = next_search_pos; // operation that really resemble STACK, namely first in last out | 255 | + for (int i = 0; i < next_search_nodes_pos; i++) |
256 | + next_nodes[id * 1000 + i] = next_search_nodes[id * 1000 + i]; | ||
257 | + next_nodes_pos = next_search_nodes_pos; | ||
261 | } | 258 | } |
262 | - *ret_index = best_idx; | ||
263 | - *ret_dist = best_dist; | 259 | + *d_distance = best_distance; |
260 | + *d_index = best_index; | ||
264 | } | 261 | } |
265 | template <typename T, int D> | 262 | template <typename T, int D> |
266 | - __device__ void Search(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist) { | ||
267 | - /// find first nearest node | 263 | + __device__ void Search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { |
268 | int best_node = 0; | 264 | int best_node = 0; |
269 | - size_t best_idx = 0; | ||
270 | - T best_dist = FLT_MAX; | 265 | + T best_distance = FLT_MAX; |
266 | + size_t best_index = 0; | ||
271 | T radius = 0; | 267 | T radius = 0; |
272 | - SearchAtNode<T, D>(nodes, indices, pts, 0, Query, &best_idx, &best_dist, &best_node); | ||
273 | - radius = sqrt(best_dist); | ||
274 | - /// find other possibilities | 268 | + |
269 | + SearchAtNode<T, D>(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node); | ||
270 | + radius = sqrt(best_distance); // get range | ||
275 | int cur = best_node; | 271 | int cur = best_node; |
276 | 272 | ||
277 | while (nodes[cur].parent != -1) { | 273 | while (nodes[cur].parent != -1) { |
278 | - /// go up | ||
279 | int parent = nodes[cur].parent; | 274 | int parent = nodes[cur].parent; |
280 | int split_axis = nodes[parent].level % D; | 275 | int split_axis = nodes[parent].level % D; |
281 | - /// search other node | 276 | + |
282 | T tmp_dist = FLT_MAX; | 277 | T tmp_dist = FLT_MAX; |
283 | size_t tmp_idx; | 278 | size_t tmp_idx; |
284 | - if (fabs(nodes[parent].split_value - Query.coords[split_axis]) <= radius) { | ||
285 | - /// search opposite node | 279 | + if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) { |
286 | if (nodes[parent].left != cur) | 280 | if (nodes[parent].left != cur) |
287 | - SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].left, radius, &tmp_idx, &tmp_dist); | 281 | + SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge); |
288 | else | 282 | else |
289 | - SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].right, radius, &tmp_idx, &tmp_dist); | 283 | + SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge); |
290 | } | 284 | } |
291 | - if (tmp_dist < best_dist) { | ||
292 | - best_dist = tmp_dist; | ||
293 | - best_idx = tmp_idx; | 285 | + if (tmp_dist < best_distance) { |
286 | + best_distance = tmp_dist; | ||
287 | + best_index = tmp_idx; | ||
294 | } | 288 | } |
295 | cur = parent; | 289 | cur = parent; |
296 | } | 290 | } |
297 | - *ret_index = best_idx; | ||
298 | - *ret_dist = sqrt(best_dist); | 291 | + *d_distance = sqrt(best_distance); |
292 | + *d_index = best_index; | ||
299 | } | 293 | } |
300 | template <typename T, int D> | 294 | template <typename T, int D> |
301 | - __global__ void SearchBatch(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, size_t num_pts, kdtree::point<T, D> *Query, size_t num_Query, size_t *ret_index, T *ret_dist) { | ||
302 | - int idx = blockIdx.x * blockDim.x + threadIdx.x; | ||
303 | - if (idx >= num_Query) return; | 295 | + __global__ void SearchBatch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { |
296 | + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; | ||
297 | + if (idx >= d_query_count) return; // avoid segfault | ||
304 | 298 | ||
305 | - Search<T, D>(nodes, indices, pts, Query[idx], &ret_index[idx], &ret_dist[idx]); // every query points are independent | 299 | + Search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent |
306 | } | 300 | } |
307 | 301 | ||
308 | template <typename T, int D = 3> | 302 | template <typename T, int D = 3> |
309 | class cuda_kdtree { | 303 | class cuda_kdtree { |
310 | protected: | 304 | protected: |
311 | - CUDA_KDNode<T> *m_gpu_nodes; // store nodes | ||
312 | - size_t *m_gpu_indices; | ||
313 | - kdtree::point<T, D>* m_gpu_points; | ||
314 | - size_t m_num_points; | 305 | + cuda_kdnode<T> *d_nodes; |
306 | + size_t *d_index; | ||
307 | + kdtree::point<T, D>* d_reference_points; | ||
308 | + size_t d_reference_count; | ||
315 | public: | 309 | public: |
316 | ~cuda_kdtree() { | 310 | ~cuda_kdtree() { |
317 | - HANDLE_ERROR(cudaFree(m_gpu_nodes)); | ||
318 | - HANDLE_ERROR(cudaFree(m_gpu_indices)); | ||
319 | - HANDLE_ERROR(cudaFree(m_gpu_points)); | 311 | + HANDLE_ERROR(cudaFree(d_nodes)); |
312 | + HANDLE_ERROR(cudaFree(d_index)); | ||
313 | + HANDLE_ERROR(cudaFree(d_reference_points)); | ||
320 | } | 314 | } |
321 | - void CreateKDTree(T *ReferencePoints, size_t ReferenceCount, size_t ColCount, size_t max_levels) { | ||
322 | - std::vector < kdtree::point<T, D> > pts(ReferenceCount); // create specific struct of reference data | ||
323 | - for (size_t j = 0; j < ReferenceCount; j++) | ||
324 | - for (size_t i = 0; i < ColCount; i++) | ||
325 | - pts[j].coords[i] = ReferencePoints[j * ColCount + i]; | ||
326 | - cpu_kdtree<T, D> tree; // initialize a tree | ||
327 | - tree.Create(pts, max_levels); // create KD-Tree on host | ||
328 | - kdtree::KDNode<T> *root = tree.GetRoot(); | 315 | + void CreateKDTree(T *h_reference_points, size_t reference_count, size_t dim_count, size_t max_levels) { |
316 | + if (max_levels > 10) { | ||
317 | + std::cout<<"The max_tree_levels should be smaller!"<<std::endl; | ||
318 | + exit(1); | ||
319 | + } | ||
320 | + std::vector <kdtree::point<T, D>> reference_points(reference_count); // restore the reference points in particular way | ||
321 | + for (size_t j = 0; j < reference_count; j++) | ||
322 | + for (size_t i = 0; i < dim_count; i++) | ||
323 | + reference_points[j].dim[i] = h_reference_points[j * dim_count + i]; | ||
324 | + cpu_kdtree<T, D> tree; // creating a tree on cpu | ||
325 | + tree.Create(reference_points, max_levels); // building a tree on cpu | ||
326 | + kdtree::kdnode<T> *d_root = tree.GetRoot(); | ||
329 | int num_nodes = tree.GetNumNodes(); | 327 | int num_nodes = tree.GetNumNodes(); |
330 | - /// create the same on CPU | ||
331 | - m_num_points = pts.size(); // number of points for creating tree = reference_count in the case | 328 | + d_reference_count = reference_points.size(); // also equals to reference_count |
332 | 329 | ||
333 | - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode<T>) * num_nodes)); // private variables for kdtree | ||
334 | - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_indices, sizeof(size_t) * m_num_points)); | ||
335 | - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_points, sizeof(kdtree::point<T, D>) * m_num_points)); | ||
336 | - | ||
337 | - std::vector <CUDA_KDNode<T>> cpu_nodes(num_nodes); // from left to right, id of nodes | ||
338 | - std::vector <size_t> indices(m_num_points); | ||
339 | - std::vector < kdtree::KDNode<T>* > next_node; | 330 | + HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes)); // copy data from host to device |
331 | + HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * d_reference_count)); | ||
332 | + HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(kdtree::point<T, D>) * d_reference_count)); | ||
340 | 333 | ||
334 | + std::vector <cuda_kdnode<T>> tmp_nodes(num_nodes); | ||
335 | + std::vector <size_t> indices(d_reference_count); | ||
336 | + std::vector <kdtree::kdnode<T>*> next_nodes; | ||
341 | size_t cur_pos = 0; | 337 | size_t cur_pos = 0; |
342 | - | ||
343 | - next_node.push_back(root); | ||
344 | - | ||
345 | - while (next_node.size()) { | ||
346 | - std::vector <typename kdtree::KDNode<T>* > next_search; | ||
347 | - | ||
348 | - while (next_node.size()) { | ||
349 | - kdtree::KDNode<T> *cur = next_node.back(); | ||
350 | - next_node.pop_back(); | ||
351 | - | ||
352 | - int id = cur->id; // the nodes at same level are independent | ||
353 | - | ||
354 | - cpu_nodes[id].level = cur->level; | ||
355 | - cpu_nodes[id].parent = cur->_parent; | ||
356 | - cpu_nodes[id].left = cur->_left; | ||
357 | - cpu_nodes[id].right = cur->_right; | ||
358 | - cpu_nodes[id].split_value = cur->split_value; | ||
359 | - cpu_nodes[id].num_indices = cur->indices.size(); // number of index | ||
360 | - | 338 | + next_nodes.push_back(d_root); |
339 | + while (next_nodes.size()) { | ||
340 | + std::vector <typename kdtree::kdnode<T>*> next_search_nodes; | ||
341 | + while (next_nodes.size()) { | ||
342 | + kdtree::kdnode<T> *cur = next_nodes.back(); | ||
343 | + next_nodes.pop_back(); | ||
344 | + int id = cur->idx; // the nodes at same level are independent | ||
345 | + tmp_nodes[id].level = cur->level; | ||
346 | + tmp_nodes[id].parent = cur->parent_idx; | ||
347 | + tmp_nodes[id].left = cur->left_idx; | ||
348 | + tmp_nodes[id].right = cur->right_idx; | ||
349 | + tmp_nodes[id].split_value = cur->split_value; | ||
350 | + tmp_nodes[id].num_index = cur->indices.size(); // number of index | ||
361 | if (cur->indices.size()) { | 351 | if (cur->indices.size()) { |
362 | for (size_t i = 0; i < cur->indices.size(); i++) | 352 | for (size_t i = 0; i < cur->indices.size(); i++) |
363 | indices[cur_pos + i] = cur->indices[i]; | 353 | indices[cur_pos + i] = cur->indices[i]; |
364 | 354 | ||
365 | - cpu_nodes[id].indices = (int)cur_pos; // beginning index that every bottom node has | ||
366 | - cur_pos += cur->indices.size(); // store indices continuously | 355 | + tmp_nodes[id].index = (int)cur_pos; // beginning index of reference_points that every bottom node has |
356 | + cur_pos += cur->indices.size(); // store indices continuously for every query_point | ||
367 | } | 357 | } |
368 | else { | 358 | else { |
369 | - cpu_nodes[id].indices = -1; | 359 | + tmp_nodes[id].index = -1; |
370 | } | 360 | } |
371 | 361 | ||
372 | if (cur->left) | 362 | if (cur->left) |
373 | - next_search.push_back(cur->left); | 363 | + next_search_nodes.push_back(cur->left); |
374 | 364 | ||
375 | if (cur->right) | 365 | if (cur->right) |
376 | - next_search.push_back(cur->right); | 366 | + next_search_nodes.push_back(cur->right); |
377 | } | 367 | } |
378 | - next_node = next_search; | 368 | + next_nodes = next_search_nodes; |
379 | } | 369 | } |
380 | - | ||
381 | - HANDLE_ERROR(cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode<T>) * cpu_nodes.size(), cudaMemcpyHostToDevice)); | ||
382 | - HANDLE_ERROR(cudaMemcpy(m_gpu_indices, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); | ||
383 | - HANDLE_ERROR(cudaMemcpy(m_gpu_points, &pts[0], sizeof(kdtree::point<T, D>) * pts.size(), cudaMemcpyHostToDevice)); | 370 | + HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice)); |
371 | + HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); | ||
372 | + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_points.size(), cudaMemcpyHostToDevice)); | ||
384 | } | 373 | } |
385 | - void Search(T *QueryPoints, size_t QueryCount, size_t ColCount, T *dists, size_t *indices) { | ||
386 | - std::vector < kdtree::point<T, D> > query(QueryCount); | ||
387 | - for (size_t j = 0; j < QueryCount; j++) | ||
388 | - for (size_t i = 0; i < ColCount; i++) | ||
389 | - query[j].coords[i] = QueryPoints[j * ColCount + i]; | ||
390 | - | ||
391 | - unsigned int threads = (unsigned int)(query.size() > 1024 ? 1024 : query.size()); | ||
392 | - //unsigned int blocks = (unsigned int)ceil(query.size() / threads); | ||
393 | - unsigned int blocks = (unsigned int)(query.size() / threads + (query.size() % threads ? 1 : 0)); | ||
394 | - | ||
395 | - kdtree::point<T, D> *gpu_Query; | ||
396 | - size_t *gpu_ret_indices; | ||
397 | - T *gpu_ret_dist; | ||
398 | - | ||
399 | - HANDLE_ERROR(cudaMalloc((void**)&gpu_Query, sizeof(T) * query.size() * D)); | ||
400 | - HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_indices, sizeof(size_t) * query.size())); | ||
401 | - HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_dist, sizeof(T) * query.size())); | ||
402 | - HANDLE_ERROR(cudaMemcpy(gpu_Query, &query[0], sizeof(T) * query.size() * D, cudaMemcpyHostToDevice)); | ||
403 | - | ||
404 | - SearchBatch << <threads, blocks >> > (m_gpu_nodes, m_gpu_indices, m_gpu_points, m_num_points, gpu_Query, query.size(), gpu_ret_indices, gpu_ret_dist); | ||
405 | - | ||
406 | - HANDLE_ERROR(cudaMemcpy(indices, gpu_ret_indices, sizeof(size_t) * query.size(), cudaMemcpyDeviceToHost)); | ||
407 | - HANDLE_ERROR(cudaMemcpy(dists, gpu_ret_dist, sizeof(T) * query.size(), cudaMemcpyDeviceToHost)); | 374 | + void Search(T *h_query_points, size_t query_count, size_t dim_count, T *dists, size_t *indices) { |
375 | + std::vector <kdtree::point<T, D>> query_points(query_count); | ||
376 | + for (size_t j = 0; j < query_count; j++) | ||
377 | + for (size_t i = 0; i < dim_count; i++) | ||
378 | + query_points[j].dim[i] = h_query_points[j * dim_count + i]; | ||
379 | + | ||
380 | + unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size()); | ||
381 | + unsigned int blocks = (unsigned int)(query_points.size() / threads + (query_points.size() % threads ? 1 : 0)); | ||
382 | + | ||
383 | + kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu | ||
384 | + size_t *d_indices; | ||
385 | + T *d_distances; | ||
386 | + | ||
387 | + int *next_nodes; // create two STACK-like array | ||
388 | + int *next_search_nodes; | ||
389 | + | ||
390 | + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory | ||
391 | + | ||
392 | + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_points.size() * D)); | ||
393 | + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size())); | ||
394 | + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size())); | ||
395 | + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * 1000 * sizeof(int))); // STACK size right now is 1000, you can change it if you mean to | ||
396 | + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * 1000 * sizeof(int))); | ||
397 | + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_points.size() * D, cudaMemcpyHostToDevice)); | ||
398 | + | ||
399 | + SearchBatch<<<threads, blocks>>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge); | ||
400 | + | ||
401 | + if (Judge == NULL) { // do the following work if the thread works safely | ||
402 | + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost)); | ||
403 | + HANDLE_ERROR(cudaMemcpy(dists, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost)); | ||
404 | + } | ||
408 | 405 | ||
409 | - HANDLE_ERROR(cudaFree(gpu_Query)); | ||
410 | - HANDLE_ERROR(cudaFree(gpu_ret_indices)); | ||
411 | - HANDLE_ERROR(cudaFree(gpu_ret_dist)); | 406 | + HANDLE_ERROR(cudaFree(next_nodes)); |
407 | + HANDLE_ERROR(cudaFree(next_search_nodes)); | ||
408 | + HANDLE_ERROR(cudaFree(d_query_points)); | ||
409 | + HANDLE_ERROR(cudaFree(d_indices)); | ||
410 | + HANDLE_ERROR(cudaFree(d_distances)); | ||
412 | } | 411 | } |
413 | }; | 412 | }; |
414 | } //end namespace stim | 413 | } //end namespace stim |