Commit cc09e435897de0d20bf3d141043f0b67ceadc62f
1 parent
19a3973b
added Jack's KD-Tree code
Showing
1 changed file
with
415 additions
and
0 deletions
Show diff stats
1 | +// change CUDA_STACK together with max_tree_levels in trial and error manner | |
2 | +// data should be stored in row-major | |
3 | +// x1,x2,x3,x4,x5...... | |
4 | +// y1,y2,y3,y4,y5...... | |
5 | +// .................... | |
6 | +// .................... | |
7 | + | |
8 | +#ifndef KDTREE_H | |
9 | +#define KDTREE_H | |
10 | + | |
11 | +#include "device_launch_parameters.h" | |
12 | +#include <cuda.h> | |
13 | +#include <cuda_runtime_api.h> | |
14 | +#include "cuda_runtime.h" | |
15 | +#include <vector> | |
16 | +#include <float.h> | |
17 | +#include <iostream> | |
18 | +#include <algorithm> | |
19 | + | |
/// Prints a readable diagnostic when a CUDA runtime call fails.
/// Use through the HANDLE_ERROR macro so file/line are filled in automatically.
static void HandleError(cudaError_t err, const char *file, int line) {
	if (err != cudaSuccess) {
		// report on stderr so failures are visible even when stdout is redirected;
		// fixed: original printed " in" with no space before the file name
		std::cerr << cudaGetErrorString(err) << " in " << file << " at line " << line << std::endl;
	}
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))

// Fixed capacity of the device-side traversal stacks in SearchAtNodeRange.
// Must be large enough for the tree's branching at one level; grow it together
// with max_tree_levels (by trial and error, as noted at the top of this file).
#define CUDA_STACK 2
29 | + | |
30 | +namespace stim { | |
31 | + namespace kdtree { | |
// A single D-dimensional point; coords[k] is the coordinate on axis k.
// Note: vector<point>::size() counts whole point structures, not scalars.
template<typename T, int D>
struct point {
	T coords[D]; // fixed-size coordinate array, one slot per axis
};
36 | + | |
// One host-side KD-tree node.  Carries both pointer links (for host
// traversal) and integer-id links (used to flatten the tree for the GPU).
template<typename T>
class KDNode {
public:
	// Default-construct an unlinked node.  Fixed: the original left `id`
	// and `level` uninitialized, which is undefined behavior if either is
	// read before the tree builder assigns it.
	KDNode() {
		id = 0;
		level = 0;
		parent = NULL;
		left = NULL;
		right = NULL;
		split_value = -1;
		_parent = -1;
		_left = -1;
		_right = -1;
	}
	int id;                        // unique node identifier (assigned by cpu_kdtree in build order)
	size_t level;                  // depth of the node; level % D selects the split axis
	KDNode *parent, *left, *right; // host-side pointer links (NULL = none)
	int _parent, _left, _right;    // ids of the linked nodes (-1 = none); GPU-friendly links
	T split_value;                 // coordinate value that splits this node's points
	std::vector <size_t> indices;  // indices into the point array owned by this node (leaves only, after build)
};
56 | + } | |
57 | + | |
// Host-side KD-tree over D-dimensional points.  Build with Create(), then
// hand the node graph to cuda_kdtree via GetRoot()/GetNumNodes().
template <typename T, int D = 3>
class cpu_kdtree {

protected:
	std::vector <kdtree::point<T, D>> *m_pts;  // points being indexed (not owned; must outlive the tree)
	kdtree::KDNode<T> *m_root;                 // root of the tree (owned; freed in the destructor)
	int m_current_axis;                        // axis SortPoints compares on during the current Split
	size_t m_levels;                           // maximum depth allowed for the tree
	int m_cmps;                                // comparison counter for one query (currently unused)
	int m_id;                                  // next node id to assign (root gets 0)
	static cpu_kdtree<T, D> *myself;           // lets the static SortPoints callback reach instance state
public:
	cpu_kdtree() {
		// NOTE: `myself` makes each <T, D> instantiation effectively a
		// singleton; building two such trees concurrently would race on it.
		myself = this;
		m_pts = NULL;
		m_root = NULL;        // fixed: was uninitialized, so destroying a never-built tree crashed
		m_current_axis = 0;
		m_levels = 0;
		m_cmps = 0;
		m_id = 0;
	}
	~cpu_kdtree() {
		// Level-by-level traversal that deletes every node allocated by Create().
		if (m_root == NULL)   // Create() never ran; nothing to free
			return;
		std::vector <kdtree::KDNode<T>*> next_node;
		next_node.push_back(m_root);
		while (next_node.size()) {
			std::vector <kdtree::KDNode<T>*> next_search;
			while (next_node.size()) {
				kdtree::KDNode<T> *cur = next_node.back();
				next_node.pop_back();
				if (cur->left)
					next_search.push_back(cur->left);
				if (cur->right)
					next_search.push_back(cur->right);
				delete cur;
			}
			next_node = next_search;
		}
		m_root = NULL;
	}
	// Builds the tree over `pts` with at most `max_levels` levels.
	// `pts` is captured by pointer and must stay alive while the tree is used.
	void Create(std::vector <kdtree::point<T, D>> &pts, size_t max_levels) {
		m_pts = &pts;
		m_levels = max_levels;
		m_root = new kdtree::KDNode<T>();
		m_root->id = m_id++;                 // root gets id 0
		m_root->level = 0;
		m_root->indices.resize(pts.size());
		for (size_t i = 0; i < pts.size(); i++) {
			m_root->indices[i] = i;          // root initially owns every point
		}
		// Breadth-first construction: split every node of the current level,
		// collect the children, repeat until max_levels or single-point nodes.
		std::vector <kdtree::KDNode<T>*> next_node;
		next_node.push_back(m_root);
		while (next_node.size()) {
			std::vector <kdtree::KDNode<T>*> next_search;
			while (next_node.size()) {
				kdtree::KDNode<T> *current_node = next_node.back();
				next_node.pop_back();
				if (current_node->level < max_levels) {       // depth cap
					if (current_node->indices.size() > 1) {   // nothing to split otherwise
						kdtree::KDNode<T> *left = new kdtree::KDNode<T>();
						kdtree::KDNode<T> *right = new kdtree::KDNode<T>();
						left->id = m_id++;
						right->id = m_id++;
						Split(current_node, left, right);     // partition indices between the children
						std::vector <size_t> temp;
						current_node->indices.swap(temp);     // interior nodes keep no indices (frees memory)
						current_node->left = left;
						current_node->right = right;
						current_node->_left = left->id;       // integer links used when flattening for the GPU
						current_node->_right = right->id;
						if (left->indices.size())
							next_search.push_back(left);
						if (right->indices.size())
							next_search.push_back(right);
					}
				}
			}
			next_node = next_search;
		}
	}
	// Comparison callback for std::sort: orders point indices by their
	// coordinate on the axis currently being split (via the `myself` singleton).
	static bool SortPoints(const size_t a, const size_t b) {
		std::vector <kdtree::point<T, D>> &pts = *myself->m_pts;
		return pts[a].coords[myself->m_current_axis] < pts[b].coords[myself->m_current_axis];
	}
	// Splits `cur`'s indices between `left` and `right` around the median
	// coordinate on axis (cur->level % D), recording that coordinate as the
	// node's split_value.  Points equal to the split value go right.
	void Split(kdtree::KDNode<T> *cur, kdtree::KDNode<T> *left, kdtree::KDNode<T> *right) {
		// both child nodes are assumed already allocated by the caller
		std::vector <kdtree::point<T, D>> &pts = *m_pts;
		m_current_axis = cur->level % D;                                 // cycle through axes by depth
		std::sort(cur->indices.begin(), cur->indices.end(), SortPoints); // order indices along that axis
		size_t mid = cur->indices[cur->indices.size() / 2];              // median element (upper median for even counts)
		cur->split_value = pts[mid].coords[m_current_axis];
		left->parent = cur;
		right->parent = cur;
		left->level = cur->level + 1;
		right->level = cur->level + 1;
		left->_parent = cur->id;
		right->_parent = cur->id;
		for (size_t i = 0; i < cur->indices.size(); i++) {
			size_t idx = cur->indices[i];
			if (pts[idx].coords[m_current_axis] < cur->split_value)
				left->indices.push_back(idx);
			else
				right->indices.push_back(idx);
		}
	}
	// Total number of nodes allocated so far.
	int GetNumNodes() const { return m_id; }
	// Root of the host tree (NULL before Create()).
	kdtree::KDNode<T>* GetRoot() const { return m_root; }
}; // end class cpu_kdtree
161 | + | |
// Out-of-class definition of the static singleton pointer used by
// cpu_kdtree::SortPoints (it is set to `this` in the constructor).
template <typename T, int D>
cpu_kdtree<T, D>* cpu_kdtree<T, D>::myself = NULL;
164 | + | |
// Flattened, GPU-friendly node layout: links are integer node ids instead of
// pointers, and every node's point indices live in one shared packed array.
template <typename T>
struct CUDA_KDNode {
	size_t level;              // depth of the node; level % D is the split axis
	int parent, left, right;   // ids of the linked nodes (-1 = none)
	T split_value;             // splitting coordinate on this node's axis
	size_t num_indices;        // number of point indices belonging to this node
	int indices;               // offset of this node's first index in the packed array (-1 if it has none)
};
173 | + | |
// Squared Euclidean distance between two D-dimensional points.
// (The square root is deferred to the callers that need a true distance.)
template <typename T, int D>
__device__ T Distance(kdtree::point<T, D> &a, kdtree::point<T, D> &b) {
	T sq_sum = 0;
	for (int axis = 0; axis < D; axis++) {
		T diff = a.coords[axis] - b.coords[axis];
		sq_sum += diff * diff;
	}
	return sq_sum;
}
// Descends from node `cur` to the leaf whose region contains `Query`
// (choosing left/right by the split value at each level), then linearly
// scans that leaf's points for the closest one.
// Outputs: the best point index, its SQUARED distance, and the id of the
// leaf reached (`ret_node`, used by Search() as the backtracking start).
template <typename T, int D>
__device__ void SearchAtNode(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, int cur, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist, int *ret_node) {
	/// finds the first candidate leaf
	size_t best_idx = 0;
	T best_dist = FLT_MAX;

	while (true) {
		int split_axis = nodes[cur].level % D;   // split axis cycles with depth
		if (nodes[cur].left == -1) { // leaf: children are always created in pairs, so no left implies no right
			*ret_node = cur;
			// scan every point stored at this leaf
			for (int i = 0; i < nodes[cur].num_indices; i++) {
				size_t idx = indices[nodes[cur].indices + i];
				T dist = Distance<T, D>(Query, pts[idx]);
				if (dist < best_dist) {
					best_dist = dist;
					best_idx = idx;
				}
			}
			break;
		}
		else if (Query.coords[split_axis] < nodes[cur].split_value) {
			cur = nodes[cur].left;   // query lies in the left half-space
		}
		else {
			cur = nodes[cur].right;  // query lies in the right half-space
		}
	}
	*ret_index = best_idx;
	*ret_dist = best_dist;           // squared distance; callers take sqrt
}
// Exhaustively searches the subtree rooted at `cur`, pruning any branch whose
// splitting plane is farther than `range` from the query along the split axis.
// Outputs the nearest index found and its SQUARED distance.
// Uses fixed-size software stacks of CUDA_STACK entries; CUDA_STACK must be
// sized to the tree's per-level fan-out (see the note at the top of the file).
template <typename T, int D>
__device__ void SearchAtNodeRange(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, int cur, T range, size_t *ret_index, T *ret_dist) {
	size_t best_idx = 0;
	T best_dist = FLT_MAX;
	int next_node[CUDA_STACK];   // nodes still to visit at the current depth
	int next_node_pos = 0;
	next_node[next_node_pos++] = cur;

	while (next_node_pos) {
		int next_search[CUDA_STACK];   // nodes queued for the next depth
		int next_search_pos = 0;
		while (next_node_pos) {
			cur = next_node[next_node_pos - 1];   // pop (LIFO)
			next_node_pos--;
			int split_axis = nodes[cur].level % D;

			if (nodes[cur].left == -1) {
				// leaf: linear scan over its points
				for (size_t i = 0; i < nodes[cur].num_indices; i++) {
					size_t idx = indices[nodes[cur].indices + i];
					T d = Distance<T, D>(Query, pts[idx]);
					if (d < best_dist) {
						best_dist = d;
						best_idx = idx;
					}
				}
			}
			else {
				T d = Query.coords[split_axis] - nodes[cur].split_value;

				// fixed: every push is now bounds-checked — the original wrote
				// past next_search[CUDA_STACK] and corrupted memory once the
				// fan-out exceeded the stack size
				if (fabs(d) > range) {
					// query is farther than `range` from the plane:
					// only the near side can contain a match
					int near_side = (d < 0) ? nodes[cur].left : nodes[cur].right;
					if (next_search_pos < CUDA_STACK)
						next_search[next_search_pos++] = near_side;
				}
				else {
					// plane lies within `range`: both sides may hold the answer
					if (next_search_pos < CUDA_STACK)
						next_search[next_search_pos++] = nodes[cur].left;
					if (next_search_pos < CUDA_STACK)
						next_search[next_search_pos++] = nodes[cur].right;
				}
			}
		}

		// the next level's queue becomes the working stack
		for (int i = 0; i < next_search_pos; i++)
			next_node[i] = next_search[i];
		next_node_pos = next_search_pos;
	}
	*ret_index = best_idx;
	*ret_dist = best_dist;   // squared distance; callers take sqrt
}
// Full nearest-neighbor query: descend to the leaf containing the query
// first, then walk back up toward the root, probing the sibling subtree
// whenever the splitting plane lies within the current search radius.
// Outputs the nearest point's index and its Euclidean (non-squared) distance.
template <typename T, int D>
__device__ void Search(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist) {
	/// find the first candidate by descending to the query's leaf
	int best_node = 0;
	size_t best_idx = 0;
	T best_dist = FLT_MAX;
	T radius = 0;
	SearchAtNode<T, D>(nodes, indices, pts, 0, Query, &best_idx, &best_dist, &best_node);
	radius = sqrt(best_dist);   // SearchAtNode returns a squared distance
	/// backtrack from the leaf, checking the other side of each split
	int cur = best_node;

	while (nodes[cur].parent != -1) {
		/// go up one level
		int parent = nodes[cur].parent;
		int split_axis = nodes[parent].level % D;
		/// probe the sibling subtree if the splitting plane is within radius
		T tmp_dist = FLT_MAX;
		size_t tmp_idx;   // only read below when tmp_dist was lowered, so never used uninitialized
		if (fabs(nodes[parent].split_value - Query.coords[split_axis]) <= radius) {
			/// search whichever child we did NOT come from
			if (nodes[parent].left != cur)
				SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].left, radius, &tmp_idx, &tmp_dist);
			else
				SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].right, radius, &tmp_idx, &tmp_dist);
		}
		if (tmp_dist < best_dist) {
			best_dist = tmp_dist;
			best_idx = tmp_idx;
		}
		cur = parent;
	}
	// NOTE(review): `radius` is never shrunk as better matches are found, so
	// some sibling probes are wider than necessary — correct, just extra work.
	*ret_index = best_idx;
	*ret_dist = sqrt(best_dist);   // convert squared distance to Euclidean
}
// One thread per query point: finds the nearest reference point to Query[idx]
// and writes its index and distance to ret_index[idx] / ret_dist[idx].
// Expected launch: 1-D grid with blockDim.x * gridDim.x >= num_Query.
template <typename T, int D>
__global__ void SearchBatch(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, size_t num_pts, kdtree::point<T, D> *Query, size_t num_Query, size_t *ret_index, T *ret_dist) {
	// compute the flat index in size_t so the product cannot overflow a 32-bit
	// int on huge batches, and the guard is not a signed/unsigned comparison
	size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
	if (idx >= num_Query) return;   // tail guard: grid may overshoot num_Query

	Search<T, D>(nodes, indices, pts, Query[idx], &ret_index[idx], &ret_dist[idx]); // queries are independent
}
307 | + | |
// GPU wrapper: builds the tree on the host with cpu_kdtree, flattens it into
// CUDA_KDNode records, uploads everything, and answers batched nearest-
// neighbor queries with the SearchBatch kernel.
template <typename T, int D = 3>
class cuda_kdtree {
protected:
	CUDA_KDNode<T> *m_gpu_nodes;        // flattened nodes on the device
	size_t *m_gpu_indices;              // every node's point indices, packed into one device array
	kdtree::point<T, D>* m_gpu_points;  // device copy of the reference points
	size_t m_num_points;                // number of reference points uploaded
public:
	cuda_kdtree() {
		// fixed: pointers start at NULL so the destructor is safe even when
		// CreateKDTree() was never called (cudaFree(NULL) is a defined no-op)
		m_gpu_nodes = NULL;
		m_gpu_indices = NULL;
		m_gpu_points = NULL;
		m_num_points = 0;
	}
	~cuda_kdtree() {
		HANDLE_ERROR(cudaFree(m_gpu_nodes));
		HANDLE_ERROR(cudaFree(m_gpu_indices));
		HANDLE_ERROR(cudaFree(m_gpu_points));
	}
	// Builds the tree from row-major ReferencePoints (ReferenceCount rows of
	// ColCount coordinates; ColCount must equal D), flattens it, and uploads
	// nodes, packed indices, and points to the device.
	void CreateKDTree(T *ReferencePoints, size_t ReferenceCount, size_t ColCount, size_t max_levels) {
		std::vector < kdtree::point<T, D> > pts(ReferenceCount);   // repack row-major input into point structs
		for (size_t j = 0; j < ReferenceCount; j++)
			for (size_t i = 0; i < ColCount; i++)
				pts[j].coords[i] = ReferencePoints[j * ColCount + i];
		cpu_kdtree<T, D> tree;          // host-side build
		tree.Create(pts, max_levels);
		kdtree::KDNode<T> *root = tree.GetRoot();
		int num_nodes = tree.GetNumNodes();
		m_num_points = pts.size();

		HANDLE_ERROR(cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode<T>) * num_nodes));
		HANDLE_ERROR(cudaMalloc((void**)&m_gpu_indices, sizeof(size_t) * m_num_points));
		HANDLE_ERROR(cudaMalloc((void**)&m_gpu_points, sizeof(kdtree::point<T, D>) * m_num_points));

		std::vector <CUDA_KDNode<T>> cpu_nodes(num_nodes);   // flattened copy, addressed by node id
		std::vector <size_t> indices(m_num_points);
		std::vector < kdtree::KDNode<T>* > next_node;

		size_t cur_pos = 0;   // next free slot in the packed index array

		next_node.push_back(root);

		// level-order walk over the host tree, converting pointer links to ids
		while (next_node.size()) {
			std::vector < kdtree::KDNode<T>* > next_search;

			while (next_node.size()) {
				kdtree::KDNode<T> *cur = next_node.back();
				next_node.pop_back();

				int id = cur->id;

				cpu_nodes[id].level = cur->level;
				cpu_nodes[id].parent = cur->_parent;
				cpu_nodes[id].left = cur->_left;
				cpu_nodes[id].right = cur->_right;
				cpu_nodes[id].split_value = cur->split_value;
				cpu_nodes[id].num_indices = cur->indices.size();

				if (cur->indices.size()) {
					// copy this node's indices into the packed array and
					// remember where they start
					for (size_t i = 0; i < cur->indices.size(); i++)
						indices[cur_pos + i] = cur->indices[i];

					cpu_nodes[id].indices = (int)cur_pos;
					cur_pos += cur->indices.size();
				}
				else {
					cpu_nodes[id].indices = -1;   // interior node: no indices of its own
				}

				if (cur->left)
					next_search.push_back(cur->left);

				if (cur->right)
					next_search.push_back(cur->right);
			}
			next_node = next_search;
		}

		HANDLE_ERROR(cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode<T>) * cpu_nodes.size(), cudaMemcpyHostToDevice));
		if (indices.size())   // &indices[0] on an empty vector is undefined
			HANDLE_ERROR(cudaMemcpy(m_gpu_indices, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
		if (pts.size())
			HANDLE_ERROR(cudaMemcpy(m_gpu_points, &pts[0], sizeof(kdtree::point<T, D>) * pts.size(), cudaMemcpyHostToDevice));
	}
	// Nearest-neighbor search for QueryCount row-major query points.
	// On return, dists[i] is the Euclidean distance from query i to its
	// nearest reference point, and indices[i] is that point's row index.
	void Search(T *QueryPoints, size_t QueryCount, size_t ColCount, T *dists, size_t *indices) {
		if (QueryCount == 0)   // avoids a zero-thread launch (and division by zero below)
			return;
		std::vector < kdtree::point<T, D> > query(QueryCount);
		for (size_t j = 0; j < QueryCount; j++)
			for (size_t i = 0; i < ColCount; i++)
				query[j].coords[i] = QueryPoints[j * ColCount + i];

		unsigned int threads = (unsigned int)(query.size() > 1024 ? 1024 : query.size());
		unsigned int blocks = (unsigned int)(query.size() / threads + (query.size() % threads ? 1 : 0)); // ceil-div

		kdtree::point<T, D> *gpu_Query;
		size_t *gpu_ret_indices;
		T *gpu_ret_dist;

		HANDLE_ERROR(cudaMalloc((void**)&gpu_Query, sizeof(T) * query.size() * D));
		HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_indices, sizeof(size_t) * query.size()));
		HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_dist, sizeof(T) * query.size()));
		HANDLE_ERROR(cudaMemcpy(gpu_Query, &query[0], sizeof(T) * query.size() * D, cudaMemcpyHostToDevice));

		// fixed: the original launched <<<threads, blocks>>>, i.e. grid and
		// block arguments swapped — beyond 1024*1024 queries that requests a
		// block larger than the 1024-thread hardware limit and the launch fails
		SearchBatch<<<blocks, threads>>>(m_gpu_nodes, m_gpu_indices, m_gpu_points, m_num_points, gpu_Query, query.size(), gpu_ret_indices, gpu_ret_dist);
		HANDLE_ERROR(cudaGetLastError());   // surface launch-configuration errors

		// these blocking copies also synchronize with the kernel
		HANDLE_ERROR(cudaMemcpy(indices, gpu_ret_indices, sizeof(size_t) * query.size(), cudaMemcpyDeviceToHost));
		HANDLE_ERROR(cudaMemcpy(dists, gpu_ret_dist, sizeof(T) * query.size(), cudaMemcpyDeviceToHost));

		HANDLE_ERROR(cudaFree(gpu_Query));
		HANDLE_ERROR(cudaFree(gpu_ret_indices));
		HANDLE_ERROR(cudaFree(gpu_ret_dist));
	}
};
414 | +} //end namespace stim | |
415 | +#endif | |
0 | 416 | \ No newline at end of file | ... | ... |