Commit 6d247a6cdfd13b3037ca88c8cb5cd260506d10e4
Merge branch 'kdtree' of git.stim.ee.uh.edu:codebase/stimlib into kdtree
Showing
1 changed file
with
257 additions
and
258 deletions
Show diff stats
stim/structures/kdtree.cuh
1 | -// change CUDA_STACK together with max_tree_levels in trial and error manner | |
1 | +// right now the size of CUDA STACK is set to 1000; increase it if you intend to build a deeper tree | 
2 | 2 | // data should be stored in row-major |
3 | 3 | // x1,x2,x3,x4,x5...... |
4 | 4 | // y1,y2,y3,y4,y5...... |
... | ... | @@ -16,159 +16,150 @@ |
16 | 16 | #include <float.h> |
17 | 17 | #include <iostream> |
18 | 18 | #include <algorithm> |
19 | - | |
20 | -/// using API called HADDLE_ERROR | |
21 | -static void HandleError(cudaError_t err, const char *file, int line) { | |
22 | - if (err != cudaSuccess) { | |
23 | - std::cout<<cudaGetErrorString(err)<<" in"<< file <<" at line "<<line<<std::endl; | |
24 | - } | |
25 | -} | |
26 | -#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) | |
27 | - | |
28 | -#define CUDA_STACK 2 // implementation "stacks" on CUDA as to do store the nodes information | |
19 | +#include <stim/cuda/cudatools/error.h> | |
29 | 20 | |
30 | 21 | namespace stim { |
31 | 22 | namespace kdtree { |
32 | - template<typename T, int D> | |
23 | + template<typename T, int D> // typename refers to float or double while D refers to dimension of points | |
33 | 24 | struct point { |
34 | - T coords[D]; // if we use size to measure a vector<point>, it will show the number of point structures | |
25 | + T dim[D]; // create a structure to store every one input point | |
35 | 26 | }; |
36 | 27 | |
37 | 28 | template<typename T> |
38 | - class KDNode { | |
29 | + class kdnode { | |
39 | 30 | public: |
40 | - KDNode() { // initialization | |
41 | - parent = NULL; | |
31 | + kdnode() { // constructor for initializing a kdnode | |
32 | + parent = NULL; // set every node's parent, left and right kdnode pointers to NULL | |
42 | 33 | left = NULL; |
43 | 34 | right = NULL; |
44 | - split_value = -1; | |
45 | - _parent = -1; | |
46 | - _left = -1; | |
47 | - _right = -1; | |
35 | + parent_idx = -1; // set parent node index to default -1 | |
36 | + left_idx = -1; | |
37 | + right_idx = -1; | |
38 | + split_value = -1; // set split_value to default -1 | |
48 | 39 | } |
49 | - int id; // id for current node | |
50 | - size_t level; | |
51 | - KDNode *parent, *left, *right; | |
52 | - int _parent, _left, _right; // id for parent node | |
53 | - T split_value; // node value | |
54 | - std::vector <size_t> indices; // indices that indicate the data that current tree has | |
40 | + int idx; // index of current node | |
41 | + int parent_idx, left_idx, right_idx; // index of parent, left and right nodes | |
42 | + kdnode *parent, *left, *right; // parent, left and right kdnodes | |
43 | + T split_value; // splitting value of current node | |
44 | + std::vector <size_t> indices; // it indicates the points' indices that current node has | |
45 | + size_t level; // tree level of current node | |
55 | 46 | }; |
56 | - } | |
47 | + } // end of namespace kdtree | |
57 | 48 | |
58 | - template <typename T, int D = 3> | |
49 | + template <typename T, int D = 3> // set dimension of data to default 3 | |
59 | 50 | class cpu_kdtree { |
60 | - | |
61 | 51 | protected: |
62 | - std::vector <kdtree::point<T, D>> *m_pts; | |
63 | - kdtree::KDNode<T> *m_root; // current node | |
64 | - int m_current_axis; | |
65 | - size_t m_levels; | |
66 | - int m_cmps; // count how many comparisons are to made in the tree for one query | |
67 | - int m_id; // level + 1 | |
68 | - static cpu_kdtree<T, D> *myself; | |
52 | + int current_axis; // current judging axis | |
53 | + int cmps; // count the number of comparisons made (only used by cpu_kdtree) | 
54 | + int n_id; // store the total number of nodes | |
55 | + std::vector <kdtree::point<T, D>> *tmp_points; // transfer or temp points | |
56 | + kdtree::kdnode<T> *root; // root node | |
57 | + static cpu_kdtree<T, D> *cur_tree_ptr; | |
69 | 58 | public: |
70 | - cpu_kdtree() { // initialization | |
71 | - myself = this; | |
72 | - m_id = 0; // id = level + 1, level -> axis index while id -> node identifier | |
59 | + cpu_kdtree() { // constructor for creating a cpu_kdtree | |
60 | + cur_tree_ptr = this; // create a class pointer points to the current class value | |
61 | + n_id = 0; // set total number of points to default 0 | |
73 | 62 | } |
74 | - ~cpu_kdtree() { // destructor for deleting what was created by kdtree() | |
75 | - std::vector <kdtree::KDNode<T>*> next_node; | |
76 | - next_node.push_back(m_root); | |
77 | - while (next_node.size()) { | |
78 | - std::vector <kdtree::KDNode<T>*> next_search; | |
79 | - while (next_node.size()) { | |
80 | - kdtree::KDNode<T> *cur = next_node.back(); | |
81 | - next_node.pop_back(); | |
63 | + ~cpu_kdtree() { // destructor of cpu_kdtree | |
64 | + std::vector <kdtree::kdnode<T>*> next_nodes; | |
65 | + next_nodes.push_back(root); | |
66 | + while (next_nodes.size()) { | |
67 | + std::vector <kdtree::kdnode<T>*> next_search_nodes; | |
68 | + while (next_nodes.size()) { | |
69 | + kdtree::kdnode<T> *cur = next_nodes.back(); | |
70 | + next_nodes.pop_back(); | |
82 | 71 | if (cur->left) |
83 | - next_search.push_back(cur->left); | |
72 | + next_search_nodes.push_back(cur->left); | |
84 | 73 | if (cur->right) |
85 | - next_search.push_back(cur->right); | |
74 | + next_search_nodes.push_back(cur->right); | |
86 | 75 | delete cur; |
87 | 76 | } |
88 | - next_node = next_search; | |
77 | + next_nodes = next_search_nodes; | |
89 | 78 | } |
90 | - m_root = NULL; | |
79 | + root = NULL; | |
91 | 80 | } |
92 | - void Create(std::vector <kdtree::point<T, D>> &pts, size_t max_levels) { | |
93 | - m_pts = &pts; // create a pointer point to the input data | |
94 | - m_levels = max_levels; // stores max tree levels | |
95 | - m_root = new kdtree::KDNode<T>(); // using KDNode() to initialize an ancestor node | |
96 | - m_root->id = m_id++; // id is 1 while level is 0 at the very beginning | |
97 | - m_root->level = 0; // to begin with level 0 | |
98 | - m_root->indices.resize(pts.size()); // initialize the size of whole indices | |
99 | - for (size_t i = 0; i < pts.size(); i++) { | |
100 | - m_root->indices[i] = i; // like what we did on Keys in GPU-BF part | |
81 | + void Create(std::vector <kdtree::point<T, D>> &reference_points, size_t max_levels) { | |
82 | + tmp_points = &reference_points; | |
83 | + root = new kdtree::kdnode<T>(); // initializing the root node | |
84 | + root->idx = n_id++; // the index of root is 0 | |
85 | + root->level = 0; // tree level begins at 0 | |
86 | + root->indices.resize(reference_points.size()); // get the number of points | |
87 | + for (size_t i = 0; i < reference_points.size(); i++) { | |
88 | + root->indices[i] = i; // set indices of input points | |
101 | 89 | } |
102 | - std::vector <kdtree::KDNode<T>*> next_node; // next node | |
103 | - next_node.push_back(m_root); // new node | |
104 | - while (next_node.size()) { | |
105 | - std::vector <kdtree::KDNode<T>*> next_search; | |
106 | - while (next_node.size()) { // two same WHILE is because we need to make a new vector for searching | |
107 | - kdtree::KDNode<T> *current_node = next_node.back(); // pointer point to current node (right first) | |
108 | - next_node.pop_back(); // pop out current node in order to store next node | |
109 | - if (current_node->level < max_levels) { // max_levels should be reasonably small compared with numbers of data | |
110 | - if (current_node->indices.size() > 1) { | |
111 | - kdtree::KDNode<T> *left = new kdtree::KDNode<T>(); | |
112 | - kdtree::KDNode<T> *right = new kdtree::KDNode<T>(); | |
113 | - left->id = m_id++; // risky guessing but OK for large amount of data since max_level is small | |
114 | - right->id = m_id++; // risky guessing but OK for large amount of data since max_level is small | |
115 | - Split(current_node, left, right); // split left and right and determine a node | |
116 | - std::vector <size_t> temp; // empty vecters of int | |
90 | + std::vector <kdtree::kdnode<T>*> next_nodes; // next nodes | |
91 | + next_nodes.push_back(root); // push back the root node | |
92 | + while (next_nodes.size()) { | |
93 | + std::vector <kdtree::kdnode<T>*> next_search_nodes; // next search nodes | |
94 | + while (next_nodes.size()) { // two same WHILE is because we need to make a new vector to store nodes for search | |
95 | + kdtree::kdnode<T> *current_node = next_nodes.back(); // handle node one by one (right first) | |
96 | + next_nodes.pop_back(); // pop out current node in order to store next round of nodes | |
97 | + if (current_node->level < max_levels) { | |
98 | + if (current_node->indices.size() > 1) { // split if the nonleaf node contains more than one point | |
99 | + kdtree::kdnode<T> *left = new kdtree::kdnode<T>(); | |
100 | + kdtree::kdnode<T> *right = new kdtree::kdnode<T>(); | |
101 | + left->idx = n_id++; // set the index of current node's left node | |
102 | + right->idx = n_id++; | |
103 | + Split(current_node, left, right); // split left and right and determine a node | |
116 | + std::vector <size_t> temp; // empty vector of size_t indices | 
117 | 105 | //temp.resize(current_node->indices.size()); |
118 | - current_node->indices.swap(temp); // clean up current tree's indices | |
106 | + current_node->indices.swap(temp); // clean up current node's indices | |
119 | 107 | current_node->left = left; |
120 | 108 | current_node->right = right; |
121 | - current_node->_left = left->id; // indicates it has left son node and gets its id | |
122 | - current_node->_right = right->id; // indicates it has right son node and gets its id | |
123 | - if (left->indices.size()) | |
124 | - next_search.push_back(left); // right first then left according to stack(first in last out), it can be done in parallel for left and right are independent | |
109 | + current_node->left_idx = left->idx; | |
110 | + current_node->right_idx = right->idx; | |
125 | 111 | if (right->indices.size()) |
126 | - next_search.push_back(right); | |
112 | + next_search_nodes.push_back(right); // left pop out first | |
113 | + if (left->indices.size()) | |
114 | + next_search_nodes.push_back(left); | |
127 | 115 | } |
128 | 116 | } |
129 | 117 | } |
130 | - next_node = next_search; | |
118 | + next_nodes = next_search_nodes; // go deeper within the tree | |
131 | 119 | } |
132 | 120 | } |
133 | - static bool SortPoints(const size_t a, const size_t b) { | |
134 | - std::vector <kdtree::point<T, D>> &pts = *myself->m_pts; | |
135 | - return pts[a].coords[myself->m_current_axis] < pts[b].coords[myself->m_current_axis]; | |
121 | + static bool SortPoints(const size_t a, const size_t b) { // create functor for std::sort | |
122 | + std::vector <kdtree::point<T, D>> &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer | |
123 | + return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis]; | |
136 | 124 | } |
137 | - void Split(kdtree::KDNode<T> *cur, kdtree::KDNode<T> *left, kdtree::KDNode<T> *right) { | |
138 | - /// assume both two sides are created and sure it was | |
139 | - std::vector <kdtree::point<T, D>> &pts = *m_pts; | |
140 | - m_current_axis = cur->level % D; // indicate the judicative dimension or axis | |
125 | + void Split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) { | |
126 | + std::vector <kdtree::point<T, D>> &pts = *tmp_points; | |
127 | + current_axis = cur->level % D; // indicate the judicative dimension or axis | |
141 | 128 | std::sort(cur->indices.begin(), cur->indices.end(), SortPoints); // using SortPoints as comparison function to sort the data |
142 | - size_t mid = cur->indices[cur->indices.size() / 2]; // odd in the mid, even take the floor | |
143 | - cur->split_value = pts[mid].coords[m_current_axis]; // get the mother node | |
144 | - left->parent = cur; // set the parent to current node for the next nodes | |
129 | + size_t mid_value = cur->indices[cur->indices.size() / 2]; // odd in the mid_value, even take the floor | |
130 | + cur->split_value = pts[mid_value].dim[current_axis]; // get the parent node | |
131 | + left->parent = cur; // set the parent of the next search nodes to current node | |
145 | 132 | right->parent = cur; |
146 | - left->level = cur->level + 1; | |
133 | + left->level = cur->level + 1; // level + 1 | |
147 | 134 | right->level = cur->level + 1; |
148 | - left->_parent = cur->id; // indicates it has mother node and gets its id | |
149 | - right->_parent = cur->id; // indicates it has mother node and gets its id | |
150 | - for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right area one by one | |
135 | + left->parent_idx = cur->idx; // set its parent node's index | |
136 | + right->parent_idx = cur->idx; | |
137 | + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one | |
151 | 138 | size_t idx = cur->indices[i]; |
152 | - if (pts[idx].coords[m_current_axis] < cur->split_value) | |
139 | + if (pts[idx].dim[current_axis] < cur->split_value) | |
153 | 140 | left->indices.push_back(idx); |
154 | 141 | else |
155 | 142 | right->indices.push_back(idx); |
156 | 143 | } |
157 | 144 | } |
158 | - int GetNumNodes() const { return m_id; } | |
159 | - kdtree::KDNode<T>* GetRoot() const { return m_root; } | |
145 | + int GetNumNodes() const { // get the total number of nodes | |
146 | + return n_id; | |
147 | + } | |
148 | + kdtree::kdnode<T>* GetRoot() const { // get the root node of tree | |
149 | + return root; | |
150 | + } | |
160 | 151 | }; //end class kdtree |
161 | 152 | |
162 | 153 | template <typename T, int D> |
163 | - cpu_kdtree<T, D>* cpu_kdtree<T, D>::myself = NULL; // definition of myself pointer points to the specific class | |
154 | + cpu_kdtree<T, D>* cpu_kdtree<T, D>::cur_tree_ptr = NULL; // definition of cur_tree_ptr pointer points to the current class | |
164 | 155 | |
165 | 156 | template <typename T> |
166 | - struct CUDA_KDNode { | |
167 | - size_t level; | |
168 | - int parent, left, right; // indicates id of | |
157 | + struct cuda_kdnode { | |
158 | + int parent, left, right; | |
169 | 159 | T split_value; |
170 | - size_t num_indices; // number of indices it has | |
171 | - int indices; // the beginning | |
160 | + size_t num_index; // number of indices it has | |
161 | + int index; // the beginning index | |
162 | + size_t level; | |
172 | 163 | }; |
173 | 164 | |
174 | 165 | template <typename T, int D> |
... | ... | @@ -176,239 +167,247 @@ namespace stim { |
176 | 167 | T dist = 0; |
177 | 168 | |
178 | 169 | for (size_t i = 0; i < D; i++) { |
179 | - T d = a.coords[i] - b.coords[i]; | |
170 | + T d = a.dim[i] - b.dim[i]; | |
180 | 171 | dist += d*d; |
181 | 172 | } |
182 | 173 | return dist; |
183 | 174 | } |
184 | 175 | template <typename T, int D> |
185 | - __device__ void SearchAtNode(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, int cur, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist, int *ret_node) { | |
186 | - /// finds the first possibility | |
187 | - size_t best_idx = 0; | |
188 | - T best_dist = FLT_MAX; | |
176 | + __device__ void SearchAtNode(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) { | |
177 | + T best_distance = FLT_MAX; | |
178 | + size_t best_index = 0; | |
189 | 179 | |
190 | - while (true) { | |
180 | + while (true) { // break until reach the bottom | |
191 | 181 | int split_axis = nodes[cur].level % D; |
192 | - if (nodes[cur].left == -1) { // if it doesn't have left son node | |
193 | - *ret_node = cur; | |
194 | - for (int i = 0; i < nodes[cur].num_indices; i++) { | |
195 | - size_t idx = indices[nodes[cur].indices + i]; | |
196 | - T dist = Distance<T, D>(Query, pts[idx]); | |
197 | - if (dist < best_dist) { | |
198 | - best_dist = dist; | |
199 | - best_idx = idx; | |
182 | + if (nodes[cur].left == -1) { // check whether it has left node or not | |
183 | + *d_node = cur; | |
184 | + for (int i = 0; i < nodes[cur].num_index; i++) { | |
185 | + size_t idx = indices[nodes[cur].index + i]; | |
186 | + T dist = Distance<T, D>(d_query_point, d_reference_points[idx]); | |
187 | + if (dist < best_distance) { | |
188 | + best_distance = dist; | |
189 | + best_index = idx; | |
200 | 190 | } |
201 | 191 | } |
202 | - break; | |
192 | + break; | |
203 | 193 | } |
204 | - else if (Query.coords[split_axis] < nodes[cur].split_value) { | |
194 | + else if (d_query_point.dim[split_axis] < nodes[cur].split_value) { // jump into specific son node | |
205 | 195 | cur = nodes[cur].left; |
206 | 196 | } |
207 | 197 | else { |
208 | 198 | cur = nodes[cur].right; |
209 | 199 | } |
210 | 200 | } |
211 | - *ret_index = best_idx; | |
212 | - *ret_dist = best_dist; | |
201 | + *d_distance = best_distance; | |
202 | + *d_index = best_index; | |
213 | 203 | } |
214 | 204 | template <typename T, int D> |
215 | - __device__ void SearchAtNodeRange(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, int cur, T range, size_t *ret_index, T *ret_dist) { | |
216 | - /// search throught all the nodes that are within one range | |
217 | - size_t best_idx = 0; | |
218 | - T best_dist = FLT_MAX; | |
219 | - /// using fixed stack, increase it when need | |
220 | - int next_node[CUDA_STACK]; // should be larger than 1 | |
221 | - int next_node_pos = 0; // initialize pop out order index | |
222 | - next_node[next_node_pos++] = cur; // equals to next_node[next_node_pos] = cur, next_node_pos++ | |
223 | - | |
224 | - while (next_node_pos) { | |
225 | - int next_search[CUDA_STACK]; // for store next nodes | |
226 | - int next_search_pos = 0; // record push back order index | |
227 | - while (next_node_pos) { | |
228 | - cur = next_node[next_node_pos - 1]; // pop out the last in one and keep poping out | |
229 | - next_node_pos--; | |
205 | + __device__ void SearchAtNodeRange(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { | |
206 | + T best_distance = FLT_MAX; | |
207 | + size_t best_index = 0; | |
208 | + | |
209 | + int next_nodes_pos = 0; // initialize pop out order index | |
210 | + next_nodes[id * 1000 + next_nodes_pos] = cur; // find data that belongs to the very specific thread | |
211 | + next_nodes_pos++; | |
212 | + | |
213 | + while (next_nodes_pos) { | |
214 | + int next_search_nodes_pos = 0; // record push back order index | |
215 | + while (next_nodes_pos) { | |
216 | + cur = next_nodes[id * 1000 + next_nodes_pos - 1]; // pop out the last pushed node and keep popping | 
217 | + next_nodes_pos--; | |
230 | 218 | int split_axis = nodes[cur].level % D; |
231 | 219 | |
232 | 220 | if (nodes[cur].left == -1) { |
233 | - for (int i = 0; i < nodes[cur].num_indices; i++) { | |
234 | - int idx = indices[nodes[cur].indices + i]; // all indices are stored in one array, pick up from every node's beginning index | |
235 | - T d = Distance<T>(Query, pts[idx]); | |
236 | - if (d < best_dist) { | |
237 | - best_dist = d; | |
238 | - best_idx = idx; | |
221 | + for (int i = 0; i < nodes[cur].num_index; i++) { | |
222 | + int idx = indices[nodes[cur].index + i]; // all indices are stored in one array, pick up from every node's beginning index | |
223 | + T d = Distance<T>(d_query_point, d_reference_points[idx]); | |
224 | + if (d < best_distance) { | |
225 | + best_distance = d; | |
226 | + best_index = idx; | |
239 | 227 | } |
240 | 228 | } |
241 | 229 | } |
242 | 230 | else { |
243 | - T d = Query.coords[split_axis] - nodes[cur].split_value; | |
231 | + T d = d_query_point.dim[split_axis] - nodes[cur].split_value; | |
244 | 232 | |
245 | 233 | if (fabs(d) > range) { |
246 | - if (d < 0) | |
247 | - next_search[next_search_pos++] = nodes[cur].left; | |
248 | - else | |
249 | - next_search[next_search_pos++] = nodes[cur].right; | |
234 | + if (d < 0) { | |
235 | + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left; | |
236 | + next_search_nodes_pos++; | |
237 | + } | |
238 | + else { | |
239 | + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right; | |
240 | + next_search_nodes_pos++; | |
241 | + } | |
250 | 242 | } |
251 | 243 | else { |
252 | - next_search[next_search_pos++] = nodes[cur].left; | |
253 | - next_search[next_search_pos++] = nodes[cur].right; | |
244 | + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right; | |
245 | + next_search_nodes_pos++; | |
246 | + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left; | |
247 | + next_search_nodes_pos++; | |
248 | + if (next_search_nodes_pos > 1000) { | |
249 | + printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id); | |
250 | + (*Judge)++; | |
251 | + } | |
254 | 252 | } |
255 | 253 | } |
256 | 254 | } |
257 | - | |
258 | - for (int i = 0; i < next_search_pos; i++) | |
259 | - next_node[i] = next_search[i]; | |
260 | - next_node_pos = next_search_pos; // operation that really resemble STACK, namely first in last out | |
255 | + for (int i = 0; i < next_search_nodes_pos; i++) | |
256 | + next_nodes[id * 1000 + i] = next_search_nodes[id * 1000 + i]; | |
257 | + next_nodes_pos = next_search_nodes_pos; | |
261 | 258 | } |
262 | - *ret_index = best_idx; | |
263 | - *ret_dist = best_dist; | |
259 | + *d_distance = best_distance; | |
260 | + *d_index = best_index; | |
264 | 261 | } |
265 | 262 | template <typename T, int D> |
266 | - __device__ void Search(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist) { | |
267 | - /// find first nearest node | |
263 | + __device__ void Search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { | |
268 | 264 | int best_node = 0; |
269 | - size_t best_idx = 0; | |
270 | - T best_dist = FLT_MAX; | |
265 | + T best_distance = FLT_MAX; | |
266 | + size_t best_index = 0; | |
271 | 267 | T radius = 0; |
272 | - SearchAtNode<T, D>(nodes, indices, pts, 0, Query, &best_idx, &best_dist, &best_node); | |
273 | - radius = sqrt(best_dist); | |
274 | - /// find other possibilities | |
268 | + | |
269 | + SearchAtNode<T, D>(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node); | |
270 | + radius = sqrt(best_distance); // get range | |
275 | 271 | int cur = best_node; |
276 | 272 | |
277 | 273 | while (nodes[cur].parent != -1) { |
278 | - /// go up | |
279 | 274 | int parent = nodes[cur].parent; |
280 | 275 | int split_axis = nodes[parent].level % D; |
281 | - /// search other node | |
276 | + | |
282 | 277 | T tmp_dist = FLT_MAX; |
283 | 278 | size_t tmp_idx; |
284 | - if (fabs(nodes[parent].split_value - Query.coords[split_axis]) <= radius) { | |
285 | - /// search opposite node | |
279 | + if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) { | |
286 | 280 | if (nodes[parent].left != cur) |
287 | - SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].left, radius, &tmp_idx, &tmp_dist); | |
281 | + SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge); | |
288 | 282 | else |
289 | - SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].right, radius, &tmp_idx, &tmp_dist); | |
283 | + SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge); | |
290 | 284 | } |
291 | - if (tmp_dist < best_dist) { | |
292 | - best_dist = tmp_dist; | |
293 | - best_idx = tmp_idx; | |
285 | + if (tmp_dist < best_distance) { | |
286 | + best_distance = tmp_dist; | |
287 | + best_index = tmp_idx; | |
294 | 288 | } |
295 | 289 | cur = parent; |
296 | 290 | } |
297 | - *ret_index = best_idx; | |
298 | - *ret_dist = sqrt(best_dist); | |
291 | + *d_distance = sqrt(best_distance); | |
292 | + *d_index = best_index; | |
299 | 293 | } |
300 | 294 | template <typename T, int D> |
301 | - __global__ void SearchBatch(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, size_t num_pts, kdtree::point<T, D> *Query, size_t num_Query, size_t *ret_index, T *ret_dist) { | |
302 | - int idx = blockIdx.x * blockDim.x + threadIdx.x; | |
303 | - if (idx >= num_Query) return; | |
295 | + __global__ void SearchBatch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { | |
296 | + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; | |
297 | + if (idx >= d_query_count) return; // avoid segfault | |
304 | 298 | |
305 | - Search<T, D>(nodes, indices, pts, Query[idx], &ret_index[idx], &ret_dist[idx]); // every query points are independent | |
299 | + Search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent | |
306 | 300 | } |
307 | 301 | |
308 | 302 | template <typename T, int D = 3> |
309 | 303 | class cuda_kdtree { |
310 | 304 | protected: |
311 | - CUDA_KDNode<T> *m_gpu_nodes; // store nodes | |
312 | - size_t *m_gpu_indices; | |
313 | - kdtree::point<T, D>* m_gpu_points; | |
314 | - size_t m_num_points; | |
305 | + cuda_kdnode<T> *d_nodes; | |
306 | + size_t *d_index; | |
307 | + kdtree::point<T, D>* d_reference_points; | |
308 | + size_t d_reference_count; | |
315 | 309 | public: |
316 | 310 | ~cuda_kdtree() { |
317 | - HANDLE_ERROR(cudaFree(m_gpu_nodes)); | |
318 | - HANDLE_ERROR(cudaFree(m_gpu_indices)); | |
319 | - HANDLE_ERROR(cudaFree(m_gpu_points)); | |
311 | + HANDLE_ERROR(cudaFree(d_nodes)); | |
312 | + HANDLE_ERROR(cudaFree(d_index)); | |
313 | + HANDLE_ERROR(cudaFree(d_reference_points)); | |
320 | 314 | } |
321 | - void CreateKDTree(T *ReferencePoints, size_t ReferenceCount, size_t ColCount, size_t max_levels) { | |
322 | - std::vector < kdtree::point<T, D> > pts(ReferenceCount); // create specific struct of reference data | |
323 | - for (size_t j = 0; j < ReferenceCount; j++) | |
324 | - for (size_t i = 0; i < ColCount; i++) | |
325 | - pts[j].coords[i] = ReferencePoints[j * ColCount + i]; | |
326 | - cpu_kdtree<T, D> tree; // initialize a tree | |
327 | - tree.Create(pts, max_levels); // create KD-Tree on host | |
328 | - kdtree::KDNode<T> *root = tree.GetRoot(); | |
315 | + void CreateKDTree(T *h_reference_points, size_t reference_count, size_t dim_count, size_t max_levels) { | |
316 | + if (max_levels > 10) { | |
317 | + std::cout<<"The max_tree_levels should be smaller!"<<std::endl; | |
318 | + exit(1); | |
319 | + } | |
320 | + std::vector <kdtree::point<T, D>> reference_points(reference_count); // store the reference points in a structured format | 
321 | + for (size_t j = 0; j < reference_count; j++) | |
322 | + for (size_t i = 0; i < dim_count; i++) | |
323 | + reference_points[j].dim[i] = h_reference_points[j * dim_count + i]; | |
324 | + cpu_kdtree<T, D> tree; // creating a tree on cpu | |
325 | + tree.Create(reference_points, max_levels); // building a tree on cpu | |
326 | + kdtree::kdnode<T> *d_root = tree.GetRoot(); | |
329 | 327 | int num_nodes = tree.GetNumNodes(); |
330 | - /// create the same on CPU | |
331 | - m_num_points = pts.size(); // number of points for creating tree = reference_count in the case | |
328 | + d_reference_count = reference_points.size(); // also equals to reference_count | |
332 | 329 | |
333 | - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode<T>) * num_nodes)); // private variables for kdtree | |
334 | - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_indices, sizeof(size_t) * m_num_points)); | |
335 | - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_points, sizeof(kdtree::point<T, D>) * m_num_points)); | |
336 | - | |
337 | - std::vector <CUDA_KDNode<T>> cpu_nodes(num_nodes); // from left to right, id of nodes | |
338 | - std::vector <size_t> indices(m_num_points); | |
339 | - std::vector < kdtree::KDNode<T>* > next_node; | |
330 | + HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes)); // copy data from host to device | |
331 | + HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * d_reference_count)); | |
332 | + HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(kdtree::point<T, D>) * d_reference_count)); | |
340 | 333 | |
334 | + std::vector <cuda_kdnode<T>> tmp_nodes(num_nodes); | |
335 | + std::vector <size_t> indices(d_reference_count); | |
336 | + std::vector <kdtree::kdnode<T>*> next_nodes; | |
341 | 337 | size_t cur_pos = 0; |
342 | - | |
343 | - next_node.push_back(root); | |
344 | - | |
345 | - while (next_node.size()) { | |
346 | - std::vector <typename kdtree::KDNode<T>* > next_search; | |
347 | - | |
348 | - while (next_node.size()) { | |
349 | - kdtree::KDNode<T> *cur = next_node.back(); | |
350 | - next_node.pop_back(); | |
351 | - | |
352 | - int id = cur->id; // the nodes at same level are independent | |
353 | - | |
354 | - cpu_nodes[id].level = cur->level; | |
355 | - cpu_nodes[id].parent = cur->_parent; | |
356 | - cpu_nodes[id].left = cur->_left; | |
357 | - cpu_nodes[id].right = cur->_right; | |
358 | - cpu_nodes[id].split_value = cur->split_value; | |
359 | - cpu_nodes[id].num_indices = cur->indices.size(); // number of index | |
360 | - | |
338 | + next_nodes.push_back(d_root); | |
339 | + while (next_nodes.size()) { | |
340 | + std::vector <typename kdtree::kdnode<T>*> next_search_nodes; | |
341 | + while (next_nodes.size()) { | |
342 | + kdtree::kdnode<T> *cur = next_nodes.back(); | |
343 | + next_nodes.pop_back(); | |
344 | + int id = cur->idx; // the nodes at same level are independent | |
345 | + tmp_nodes[id].level = cur->level; | |
346 | + tmp_nodes[id].parent = cur->parent_idx; | |
347 | + tmp_nodes[id].left = cur->left_idx; | |
348 | + tmp_nodes[id].right = cur->right_idx; | |
349 | + tmp_nodes[id].split_value = cur->split_value; | |
350 | + tmp_nodes[id].num_index = cur->indices.size(); // number of index | |
361 | 351 | if (cur->indices.size()) { |
362 | 352 | for (size_t i = 0; i < cur->indices.size(); i++) |
363 | 353 | indices[cur_pos + i] = cur->indices[i]; |
364 | 354 | |
365 | - cpu_nodes[id].indices = (int)cur_pos; // beginning index that every bottom node has | |
366 | - cur_pos += cur->indices.size(); // store indices continuously | |
355 | + tmp_nodes[id].index = (int)cur_pos; // beginning index of reference_points that every bottom node has | |
356 | + cur_pos += cur->indices.size(); // store indices continuously for every query_point | |
367 | 357 | } |
368 | 358 | else { |
369 | - cpu_nodes[id].indices = -1; | |
359 | + tmp_nodes[id].index = -1; | |
370 | 360 | } |
371 | 361 | |
372 | 362 | if (cur->left) |
373 | - next_search.push_back(cur->left); | |
363 | + next_search_nodes.push_back(cur->left); | |
374 | 364 | |
375 | 365 | if (cur->right) |
376 | - next_search.push_back(cur->right); | |
366 | + next_search_nodes.push_back(cur->right); | |
377 | 367 | } |
378 | - next_node = next_search; | |
368 | + next_nodes = next_search_nodes; | |
379 | 369 | } |
380 | - | |
381 | - HANDLE_ERROR(cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode<T>) * cpu_nodes.size(), cudaMemcpyHostToDevice)); | |
382 | - HANDLE_ERROR(cudaMemcpy(m_gpu_indices, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); | |
383 | - HANDLE_ERROR(cudaMemcpy(m_gpu_points, &pts[0], sizeof(kdtree::point<T, D>) * pts.size(), cudaMemcpyHostToDevice)); | |
370 | + HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice)); | |
371 | + HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); | |
372 | + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_points.size(), cudaMemcpyHostToDevice)); | |
384 | 373 | } |
/// Find the nearest reference point for every query point using the GPU kd-tree.
/// One CUDA thread handles one query point.
/// @param h_query_points  host array of query coordinates, row-major packed:
///                        [q0_d0, q0_d1, ..., q1_d0, q1_d1, ...]
/// @param query_count     number of query points
/// @param dim_count       dimensionality of each query point (must equal the template parameter D)
/// @param dists           output host array (query_count entries) receiving the distance to the
///                        nearest reference point (metric — squared vs. Euclidean — is defined by
///                        the SearchBatch kernel; confirm against its implementation)
/// @param indices         output host array (query_count entries) receiving the index of the
///                        nearest reference point
void Search(T *h_query_points, size_t query_count, size_t dim_count, T *dists, size_t *indices) {
	if (query_count == 0) return;											// guard: avoids a divide-by-zero computing the launch config below

	// re-pack the flat host array into point<T, D> structures for the kernel
	std::vector <kdtree::point<T, D>> query_points(query_count);
	for (size_t j = 0; j < query_count; j++)
		for (size_t i = 0; i < dim_count; i++)
			query_points[j].dim[i] = h_query_points[j * dim_count + i];

	// one thread per query point; cap the block size at the 1024-thread hardware limit
	unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size());
	unsigned int blocks = (unsigned int)((query_points.size() + threads - 1) / threads);	// ceiling division

	kdtree::point<T, D> *d_query_points;									// device copy of the query points
	size_t *d_indices;														// device output: nearest-neighbor index per query
	T *d_distances;															// device output: nearest-neighbor distance per query

	int *next_nodes;														// per-thread traversal stacks (fixed depth 1000)
	int *next_search_nodes;

	int *Judge = NULL;														// NOTE(review): never allocated — the kernel receives NULL and the
																			// host test below is therefore always true; confirm the intended
																			// stack-overwrite detection actually reports back to the host

	HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_points.size() * D));
	HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size()));
	HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size()));
	HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * 1000 * sizeof(int)));			// stack depth is 1000; increase for deeper trees
	HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * 1000 * sizeof(int)));
	HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_points.size() * D, cudaMemcpyHostToDevice));

	// BUG FIX: the launch was written <<<threads, blocks>>>, but CUDA's execution
	// configuration is <<<gridDim (blocks), blockDim (threads)>>>.  The swapped form
	// fails outright once blocks > 1024 (more than ~1M query points); the flat index
	// blockIdx.x * blockDim.x + threadIdx.x covers the same range either way, so
	// correcting the order preserves results for smaller inputs.
	SearchBatch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge);
	HANDLE_ERROR(cudaGetLastError());										// surface launch-configuration errors immediately

	if (Judge == NULL) {													// copy results back if no thread overwrote another's stack
		HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost));
		HANDLE_ERROR(cudaMemcpy(dists, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost));
	}

	HANDLE_ERROR(cudaFree(next_nodes));
	HANDLE_ERROR(cudaFree(next_search_nodes));
	HANDLE_ERROR(cudaFree(d_query_points));
	HANDLE_ERROR(cudaFree(d_indices));
	HANDLE_ERROR(cudaFree(d_distances));
}
413 | 412 | }; |
414 | 413 | } //end namespace stim | ... | ... |