Commit cc09e435897de0d20bf3d141043f0b67ceadc62f
1 parent
19a3973b
added Jack's KD-Tree code
Showing 1 changed file with 415 additions and 0 deletions.
Show diff stats
1 | +// change CUDA_STACK together with max_tree_levels in trial and error manner | ||
2 | +// data should be stored in row-major | ||
3 | +// x1,x2,x3,x4,x5...... | ||
4 | +// y1,y2,y3,y4,y5...... | ||
5 | +// .................... | ||
6 | +// .................... | ||
7 | + | ||
8 | +#ifndef KDTREE_H | ||
9 | +#define KDTREE_H | ||
10 | + | ||
11 | +#include "device_launch_parameters.h" | ||
12 | +#include <cuda.h> | ||
13 | +#include <cuda_runtime_api.h> | ||
14 | +#include "cuda_runtime.h" | ||
15 | +#include <vector> | ||
16 | +#include <float.h> | ||
17 | +#include <iostream> | ||
18 | +#include <algorithm> | ||
19 | + | ||
/// Error-checking helper used by the HANDLE_ERROR macro: prints a readable
/// message (error string, file, line) when a CUDA API call fails.
static void HandleError(cudaError_t err, const char *file, int line) {
	if (err != cudaSuccess) {
		// fix: report to stderr (not stdout) and restore the missing space that
		// previously glued "in" to the file name
		std::cerr << cudaGetErrorString(err) << " in " << file << " at line " << line << std::endl;
	}
}
// Wraps a CUDA call so the failure site (__FILE__/__LINE__) is captured automatically.
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
27 | + | ||
28 | +#define CUDA_STACK 2 // implementation "stacks" on CUDA as to do store the nodes information | ||
29 | + | ||
30 | +namespace stim { | ||
31 | + namespace kdtree { | ||
// One D-dimensional point. Note: vector<point>::size() counts points (structures),
// not individual scalar coordinates.
template<typename T, int D>
struct point {
	T coords[D]; // coordinates of this point, one per dimension
};
36 | + | ||
// Host-side tree node. Carries both pointer links (for building/walking on the
// host) and id links (for flattening into the GPU's array representation).
template<typename T>
class KDNode {
public:
	// Initialize an empty, unlinked node; -1 is the "absent" sentinel for id links.
	KDNode() {
		id = -1;            // fix: was left uninitialized; the tree assigns the real id on creation
		level = 0;          // fix: was left uninitialized
		parent = NULL;
		left = NULL;
		right = NULL;
		split_value = -1;
		_parent = -1;
		_left = -1;
		_right = -1;
	}
	int id;                          // unique node identifier within the tree
	size_t level;                    // depth in the tree (root = 0); selects the split axis
	KDNode *parent, *left, *right;   // host-side pointer links
	int _parent, _left, _right;      // id-based links (-1 = none), used when flattening for the GPU
	T split_value;                   // splitting threshold on this node's axis
	std::vector <size_t> indices;    // indices of the points this node owns (emptied once split)
};
56 | + } | ||
57 | + | ||
// Host-side KD-tree builder: Create() builds the tree over the caller's points,
// which cuda_kdtree then flattens and uploads to the GPU.
// NOTE(review): the static 'myself' hook required by the std::sort comparator
// makes building single-instance and non-thread-safe; kept for compatibility.
template <typename T, int D = 3>
class cpu_kdtree {
protected:
	std::vector <kdtree::point<T, D>> *m_pts;   // non-owning pointer to the caller's point set
	kdtree::KDNode<T> *m_root;                  // root node (NULL until Create() runs)
	int m_current_axis;                         // axis SortPoints() currently compares on
	size_t m_levels;                            // maximum depth requested by Create()
	int m_cmps;                                 // comparison counter for one query (diagnostic)
	int m_id;                                   // next node id to hand out
	static cpu_kdtree<T, D> *myself;            // instance hook for the static comparator
public:
	cpu_kdtree() {
		myself = this;
		// fix: m_root and the other members were uninitialized, so the destructor
		// walked a garbage pointer whenever Create() had never been called
		m_pts = NULL;
		m_root = NULL;
		m_current_axis = 0;
		m_levels = 0;
		m_cmps = 0;
		m_id = 0;           // id = level + 1 convention: 0 means "no nodes yet"
	}
	// Free every node with an iterative level-order sweep (no recursion, so deep
	// trees cannot overflow the call stack).
	~cpu_kdtree() {
		if (m_root == NULL)
			return;         // fix: nothing was ever built — do not walk a null root
		std::vector <kdtree::KDNode<T>*> next_node;
		next_node.push_back(m_root);
		while (next_node.size()) {
			std::vector <kdtree::KDNode<T>*> next_search;
			while (next_node.size()) {
				kdtree::KDNode<T> *cur = next_node.back();
				next_node.pop_back();
				if (cur->left)
					next_search.push_back(cur->left);
				if (cur->right)
					next_search.push_back(cur->right);
				delete cur;
			}
			next_node = next_search;
		}
		m_root = NULL;
	}
	// Build the tree over 'pts' (held by reference — it must outlive this object)
	// down to at most 'max_levels' levels, splitting breadth-first so node ids
	// are assigned level by level.
	void Create(std::vector <kdtree::point<T, D>> &pts, size_t max_levels) {
		m_pts = &pts;
		m_levels = max_levels;
		m_root = new kdtree::KDNode<T>();
		m_root->id = m_id++;
		m_root->level = 0;
		m_root->indices.resize(pts.size());
		for (size_t i = 0; i < pts.size(); i++) {
			m_root->indices[i] = i;             // root initially owns every point
		}
		std::vector <kdtree::KDNode<T>*> next_node;
		next_node.push_back(m_root);
		while (next_node.size()) {
			std::vector <kdtree::KDNode<T>*> next_search;   // nodes of the next level
			while (next_node.size()) {
				kdtree::KDNode<T> *current_node = next_node.back();
				next_node.pop_back();
				if (current_node->level < max_levels) {     // max_levels should stay small relative to the data size
					if (current_node->indices.size() > 1) { // only split nodes holding more than one point
						kdtree::KDNode<T> *left = new kdtree::KDNode<T>();
						kdtree::KDNode<T> *right = new kdtree::KDNode<T>();
						left->id = m_id++;
						right->id = m_id++;
						Split(current_node, left, right);   // partitions indices between the children
						std::vector <size_t> temp;
						current_node->indices.swap(temp);   // interior nodes keep no indices
						current_node->left = left;
						current_node->right = right;
						current_node->_left = left->id;
						current_node->_right = right->id;
						if (left->indices.size())
							next_search.push_back(left);
						if (right->indices.size())
							next_search.push_back(right);
					}
				}
			}
			next_node = next_search;
		}
	}
	// std::sort comparator: orders point indices by their coordinate on the axis
	// currently being split (reached through the static instance pointer).
	static bool SortPoints(const size_t a, const size_t b) {
		std::vector <kdtree::point<T, D>> &pts = *myself->m_pts;
		return pts[a].coords[myself->m_current_axis] < pts[b].coords[myself->m_current_axis];
	}
	// Split 'cur' at the median of the current axis: indices strictly below the
	// split value go left, the rest (including the median point) go right.
	void Split(kdtree::KDNode<T> *cur, kdtree::KDNode<T> *left, kdtree::KDNode<T> *right) {
		std::vector <kdtree::point<T, D>> &pts = *m_pts;
		m_current_axis = cur->level % D;        // axis cycles with depth
		std::sort(cur->indices.begin(), cur->indices.end(), SortPoints);
		size_t mid = cur->indices[cur->indices.size() / 2];     // median (floor for even counts)
		cur->split_value = pts[mid].coords[m_current_axis];
		left->parent = cur;
		right->parent = cur;
		left->level = cur->level + 1;
		right->level = cur->level + 1;
		left->_parent = cur->id;
		right->_parent = cur->id;
		for (size_t i = 0; i < cur->indices.size(); i++) {
			size_t idx = cur->indices[i];
			if (pts[idx].coords[m_current_axis] < cur->split_value)
				left->indices.push_back(idx);
			else
				right->indices.push_back(idx);
		}
	}
	int GetNumNodes() const { return m_id; }
	kdtree::KDNode<T>* GetRoot() const { return m_root; }
}; // end class cpu_kdtree

// Definition of the static instance pointer used by SortPoints().
template <typename T, int D>
cpu_kdtree<T, D>* cpu_kdtree<T, D>::myself = NULL;
164 | + | ||
// Flattened, GPU-friendly node. An array of these is memcpy'd to the device,
// so the field layout must stay in sync with the host-side fill in CreateKDTree().
template <typename T>
struct CUDA_KDNode {
	size_t level;            // depth in the tree; split axis is level % D
	int parent, left, right; // node ids (-1 = absent), indexing the flat node array
	T split_value;           // splitting threshold on this node's axis
	size_t num_indices;      // number of point indices this node owns
	int indices;             // offset of this node's first entry in the flat index array (-1 if none)
};
173 | + | ||
// Squared Euclidean distance between two D-dimensional points.
// The square root is deliberately omitted — callers compare squared values.
template <typename T, int D>
__device__ T Distance(kdtree::point<T, D> &a, kdtree::point<T, D> &b) {
	T sum = 0;
	for (int k = 0; k < D; ++k) {
		const T diff = a.coords[k] - b.coords[k];
		sum += diff * diff;
	}
	return sum;
}
// Descend from node 'cur' to the leaf that would contain Query, then scan that
// leaf's points. Writes the best point index and squared distance found, plus
// the leaf's node id (*ret_node), which the caller uses to start backtracking.
template <typename T, int D>
__device__ void SearchAtNode(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, int cur, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist, int *ret_node) {
	size_t best_idx = 0;
	T best_dist = FLT_MAX;

	while (true) {
		if (nodes[cur].left == -1) {            // leaf: brute-force its points
			*ret_node = cur;
			// fix: loop index was 'int', a signed/unsigned mismatch against size_t num_indices
			for (size_t i = 0; i < nodes[cur].num_indices; i++) {
				size_t idx = indices[nodes[cur].indices + i];
				T dist = Distance<T, D>(Query, pts[idx]);
				if (dist < best_dist) {
					best_dist = dist;
					best_idx = idx;
				}
			}
			break;
		}
		int split_axis = nodes[cur].level % D;  // axis cycles with depth
		if (Query.coords[split_axis] < nodes[cur].split_value)
			cur = nodes[cur].left;              // query lies on the left of the plane
		else
			cur = nodes[cur].right;
	}
	*ret_index = best_idx;
	*ret_dist = best_dist;
}
// Search the subtree rooted at 'cur' for the point nearest to Query, visiting
// only branches whose splitting plane lies within 'range' of the query.
// Uses a small fixed-size explicit stack (CUDA_STACK entries); when the stack
// is full, further branches are skipped instead of written out of bounds —
// increase CUDA_STACK (together with max_tree_levels) if that happens.
template <typename T, int D>
__device__ void SearchAtNodeRange(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, int cur, T range, size_t *ret_index, T *ret_dist) {
	size_t best_idx = 0;
	T best_dist = FLT_MAX;

	int next_node[CUDA_STACK];      // current level's pending nodes
	int next_node_pos = 0;
	next_node[next_node_pos++] = cur;

	while (next_node_pos) {
		int next_search[CUDA_STACK];    // nodes to visit at the next level
		int next_search_pos = 0;
		while (next_node_pos) {
			cur = next_node[--next_node_pos];   // pop the most recently pushed node
			int split_axis = nodes[cur].level % D;

			if (nodes[cur].left == -1) {
				// leaf: scan its contiguous slice of the flat index array
				// fix: index was 'int', mismatched with size_t num_indices
				for (size_t i = 0; i < nodes[cur].num_indices; i++) {
					size_t idx = indices[nodes[cur].indices + i];
					T d = Distance<T, D>(Query, pts[idx]);
					if (d < best_dist) {
						best_dist = d;
						best_idx = idx;
					}
				}
			}
			else {
				T d = Query.coords[split_axis] - nodes[cur].split_value;
				// fix: every push is now bounds-checked — the original wrote past
				// next_search[CUDA_STACK] whenever more than CUDA_STACK nodes were
				// pending (two pushes per node with CUDA_STACK == 2), corrupting the stack
				if (fabs(d) > range) {
					// plane is farther than 'range': only the query's own side can matter
					if (next_search_pos < CUDA_STACK)
						next_search[next_search_pos++] = (d < 0) ? nodes[cur].left : nodes[cur].right;
				}
				else {
					// plane is within 'range': candidates may sit on both sides
					if (next_search_pos < CUDA_STACK)
						next_search[next_search_pos++] = nodes[cur].left;
					if (next_search_pos < CUDA_STACK)
						next_search[next_search_pos++] = nodes[cur].right;
				}
			}
		}

		for (int i = 0; i < next_search_pos; i++)
			next_node[i] = next_search[i];      // the next level becomes the working stack
		next_node_pos = next_search_pos;
	}
	*ret_index = best_idx;
	*ret_dist = best_dist;
}
// Nearest-neighbor query: descend to the query's leaf for a first candidate,
// then backtrack toward the root, re-searching any sibling subtree whose
// splitting plane lies within the candidate radius.
// Outputs the nearest point's index and its (non-squared) Euclidean distance.
template <typename T, int D>
__device__ void Search(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist) {
	/// find the first candidate by walking down to the query's leaf
	int best_node = 0;
	size_t best_idx = 0;
	T best_dist = FLT_MAX;
	T radius = 0;
	SearchAtNode<T, D>(nodes, indices, pts, 0, Query, &best_idx, &best_dist, &best_node);
	radius = sqrt(best_dist);   // SearchAtNode returns a squared distance
	// NOTE(review): 'radius' is never shrunk when backtracking finds a closer
	// point, so more subtrees may be searched than strictly necessary — confirm
	// whether that is intentional before tightening it.
	/// backtrack from the leaf, checking sibling subtrees near the split planes
	int cur = best_node;

	while (nodes[cur].parent != -1) {
		/// go up one level
		int parent = nodes[cur].parent;
		int split_axis = nodes[parent].level % D;
		/// tmp_dist stays FLT_MAX when the sibling is skipped, so the update below is a no-op
		T tmp_dist = FLT_MAX;
		size_t tmp_idx;
		if (fabs(nodes[parent].split_value - Query.coords[split_axis]) <= radius) {
			/// the split plane is within reach: search the opposite child
			if (nodes[parent].left != cur)
				SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].left, radius, &tmp_idx, &tmp_dist);
			else
				SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].right, radius, &tmp_idx, &tmp_dist);
		}
		if (tmp_dist < best_dist) {
			best_dist = tmp_dist;
			best_idx = tmp_idx;
		}
		cur = parent;
	}
	*ret_index = best_idx;
	*ret_dist = sqrt(best_dist);    // convert squared distance to Euclidean for the caller
}
// One thread per query point: each finds its nearest neighbor in the flattened
// KD-tree. Launch with a 1-D grid and 1-D blocks such that the total thread
// count covers num_Query; the tail guard handles uneven division.
template <typename T, int D>
__global__ void SearchBatch(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, size_t num_pts, kdtree::point<T, D> *Query, size_t num_Query, size_t *ret_index, T *ret_dist) {
	// fix: was 'int' — overflows for very large grids and compared signed
	// against the unsigned num_Query
	size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
	if (idx >= num_Query) return;   // tail guard: grid rarely divides num_Query evenly

	Search<T, D>(nodes, indices, pts, Query[idx], &ret_index[idx], &ret_dist[idx]);  // queries are independent
}
307 | + | ||
// GPU-resident KD-tree: CreateKDTree() builds on the host and uploads the
// flattened tree; Search() answers batched nearest-neighbor queries on device.
template <typename T, int D = 3>
class cuda_kdtree {
protected:
	CUDA_KDNode<T> *m_gpu_nodes;        // flattened tree nodes on the device
	size_t *m_gpu_indices;              // all leaf point-indices, packed contiguously
	kdtree::point<T, D>* m_gpu_points;  // reference points on the device
	size_t m_num_points;                // number of reference points uploaded
public:
	cuda_kdtree() {
		// fix: the pointers were uninitialized, so the destructor freed garbage
		// whenever CreateKDTree() was never called (cudaFree(NULL) is a safe no-op)
		m_gpu_nodes = NULL;
		m_gpu_indices = NULL;
		m_gpu_points = NULL;
		m_num_points = 0;
	}
	~cuda_kdtree() {
		HANDLE_ERROR(cudaFree(m_gpu_nodes));
		HANDLE_ERROR(cudaFree(m_gpu_indices));
		HANDLE_ERROR(cudaFree(m_gpu_points));
	}
	// Build the tree on the host (cpu_kdtree), flatten it by node id, and upload
	// nodes/indices/points to the GPU. ReferencePoints is row-major with
	// ColCount coordinates per point (ColCount should not exceed D).
	void CreateKDTree(T *ReferencePoints, size_t ReferenceCount, size_t ColCount, size_t max_levels) {
		std::vector < kdtree::point<T, D> > pts(ReferenceCount);    // repack into the point struct
		for (size_t j = 0; j < ReferenceCount; j++)
			for (size_t i = 0; i < ColCount; i++)
				pts[j].coords[i] = ReferencePoints[j * ColCount + i];
		cpu_kdtree<T, D> tree;          // host-side build
		tree.Create(pts, max_levels);
		kdtree::KDNode<T> *root = tree.GetRoot();
		int num_nodes = tree.GetNumNodes();
		/// mirror the host tree into flat, id-indexed device arrays
		m_num_points = pts.size();

		HANDLE_ERROR(cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode<T>) * num_nodes));
		HANDLE_ERROR(cudaMalloc((void**)&m_gpu_indices, sizeof(size_t) * m_num_points));
		HANDLE_ERROR(cudaMalloc((void**)&m_gpu_points, sizeof(kdtree::point<T, D>) * m_num_points));

		std::vector <CUDA_KDNode<T>> cpu_nodes(num_nodes);  // staging array, indexed by node id
		std::vector <size_t> indices(m_num_points);
		std::vector < kdtree::KDNode<T>* > next_node;

		size_t cur_pos = 0;     // running offset into the flat index array

		next_node.push_back(root);

		// breadth-first sweep: record each node's links and payload by its id
		while (next_node.size()) {
			std::vector < kdtree::KDNode<T>* > next_search;

			while (next_node.size()) {
				kdtree::KDNode<T> *cur = next_node.back();
				next_node.pop_back();

				int id = cur->id;   // nodes on the same level are independent

				cpu_nodes[id].level = cur->level;
				cpu_nodes[id].parent = cur->_parent;
				cpu_nodes[id].left = cur->_left;
				cpu_nodes[id].right = cur->_right;
				cpu_nodes[id].split_value = cur->split_value;
				cpu_nodes[id].num_indices = cur->indices.size();

				if (cur->indices.size()) {
					// leaves keep their point indices; pack them contiguously
					for (size_t i = 0; i < cur->indices.size(); i++)
						indices[cur_pos + i] = cur->indices[i];

					cpu_nodes[id].indices = (int)cur_pos;   // this node's starting offset
					cur_pos += cur->indices.size();
				}
				else {
					cpu_nodes[id].indices = -1;     // interior node: no direct points
				}

				if (cur->left)
					next_search.push_back(cur->left);

				if (cur->right)
					next_search.push_back(cur->right);
			}
			next_node = next_search;
		}

		HANDLE_ERROR(cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode<T>) * cpu_nodes.size(), cudaMemcpyHostToDevice));
		HANDLE_ERROR(cudaMemcpy(m_gpu_indices, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
		HANDLE_ERROR(cudaMemcpy(m_gpu_points, &pts[0], sizeof(kdtree::point<T, D>) * pts.size(), cudaMemcpyHostToDevice));
	}
	// Upload the queries, launch one thread per query, and copy back each
	// query's nearest reference-point index and Euclidean distance.
	void Search(T *QueryPoints, size_t QueryCount, size_t ColCount, T *dists, size_t *indices) {
		if (QueryCount == 0) return;    // fix: empty input previously divided by zero (threads == 0)

		std::vector < kdtree::point<T, D> > query(QueryCount);
		for (size_t j = 0; j < QueryCount; j++)
			for (size_t i = 0; i < ColCount; i++)
				query[j].coords[i] = QueryPoints[j * ColCount + i];

		unsigned int threads = (unsigned int)(query.size() > 1024 ? 1024 : query.size());
		unsigned int blocks = (unsigned int)((query.size() + threads - 1) / threads);   // ceil-div

		kdtree::point<T, D> *gpu_Query;
		size_t *gpu_ret_indices;
		T *gpu_ret_dist;

		HANDLE_ERROR(cudaMalloc((void**)&gpu_Query, sizeof(T) * query.size() * D));
		HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_indices, sizeof(size_t) * query.size()));
		HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_dist, sizeof(T) * query.size()));
		HANDLE_ERROR(cudaMemcpy(gpu_Query, &query[0], sizeof(T) * query.size() * D, cudaMemcpyHostToDevice));

		// fix: the original launched <<<threads, blocks>>> — the execution
		// configuration takes <<<gridDim, blockDim>>>, so the arguments were
		// swapped; that fails outright once the block count exceeds 1024
		SearchBatch<<<blocks, threads>>>(m_gpu_nodes, m_gpu_indices, m_gpu_points, m_num_points, gpu_Query, query.size(), gpu_ret_indices, gpu_ret_dist);
		HANDLE_ERROR(cudaGetLastError());   // surface launch-configuration errors

		// blocking memcpys on the default stream also synchronize with the kernel
		HANDLE_ERROR(cudaMemcpy(indices, gpu_ret_indices, sizeof(size_t) * query.size(), cudaMemcpyDeviceToHost));
		HANDLE_ERROR(cudaMemcpy(dists, gpu_ret_dist, sizeof(T) * query.size(), cudaMemcpyDeviceToHost));

		HANDLE_ERROR(cudaFree(gpu_Query));
		HANDLE_ERROR(cudaFree(gpu_ret_indices));
		HANDLE_ERROR(cudaFree(gpu_ret_dist));
	}
};
414 | +} //end namespace stim | ||
415 | +#endif | ||
0 | \ No newline at end of file | 416 | \ No newline at end of file |