Commit 6a54a547256f32bceb7afb81e456d9c5fc699bee
1 parent
42bb075c
new version of kdtree
Showing
1 changed file
with
627 additions
and
0 deletions
Show diff stats
1 | +// right now the size of CUDA STACK is set to 50, increase it if you mean to make deeper tree | |
2 | +// data should be stored in row-major | |
3 | +// x1,x2,x3,x4,x5...... | |
4 | +// y1,y2,y3,y4,y5...... | |
5 | +// .................... | |
6 | +// .................... | |
7 | + | |
8 | +#ifndef KDTREE_H | |
9 | +#define KDTREE_H | |
10 | +#define stack_size 50 | |
11 | + | |
12 | +#include "device_launch_parameters.h" | |
13 | +#include <cuda.h> | |
14 | +#include <cuda_runtime_api.h> | |
15 | +#include "cuda_runtime.h" | |
16 | +#include <vector> | |
17 | +#include <cstring> | |
18 | +#include <float.h> | |
19 | +#include <iostream> | |
20 | +#include <algorithm> | |
21 | +#include <stim/cuda/cudatools/error.h> | |
22 | +#include <stim/visualization/aabbn.h> | |
23 | + | |
24 | +namespace stim { | |
25 | + namespace cpu_kdtree { | |
		template<typename T, int D>	// T: scalar type of a coordinate (e.g. float or double); D: dimension of the points
		struct point {
			T dim[D];	// coordinates of a single point, one entry per dimension
		};
30 | + | |
31 | + template<typename T> | |
32 | + class cpu_kdnode { | |
33 | + public: | |
34 | + cpu_kdnode() { // constructor for initializing a kdnode | |
35 | + parent = NULL; // set every node's parent, left and right kdnode pointers to NULL | |
36 | + left = NULL; | |
37 | + right = NULL; | |
38 | + parent_idx = -1; // set parent node index to default -1 | |
39 | + left_idx = -1; | |
40 | + right_idx = -1; | |
41 | + split_value = -1; // set split_value to default -1 | |
42 | + } | |
43 | + int idx; // index of current node | |
44 | + int parent_idx, left_idx, right_idx; // index of parent, left and right nodes | |
45 | + cpu_kdnode *parent, *left, *right; // parent, left and right kdnodes | |
46 | + T split_value; // splitting value of current node | |
47 | + std::vector <size_t> indices; // it indicates the points' indices that current node has | |
48 | + size_t level; // tree level of current node | |
49 | + }; | |
50 | + } // end of namespace cpu_kdtree | |
51 | + | |
	template <typename T>
	struct cuda_kdnode {
		// Flattened, pointer-free version of cpu_kdtree::cpu_kdnode for device use:
		// links are indices into a single node array.
		int parent, left, right;	// node-array indices; -1 means "no such node" (left == -1 marks a leaf)
		T split_value;	// splitting coordinate along this node's axis (axis = level % D)
		size_t num_index;	// number of point indices this node owns
		int index;	// start offset of this node's indices in the shared index array (-1 if it owns none)
		size_t level;	// depth of this node within the tree
	};
60 | + | |
61 | + template <typename T, int D> | |
62 | + __device__ T gpu_distance(cpu_kdtree::point<T, D> &a, cpu_kdtree::point<T, D> &b) { | |
63 | + T distance = 0; | |
64 | + | |
65 | + for (size_t i = 0; i < D; i++) { | |
66 | + T d = a.dim[i] - b.dim[i]; | |
67 | + distance += d*d; | |
68 | + } | |
69 | + return distance; | |
70 | + } | |
71 | + | |
72 | + template <typename T, int D> | |
73 | + __device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, int cur, cpu_kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) { | |
74 | + T best_distance = FLT_MAX; | |
75 | + size_t best_index = 0; | |
76 | + | |
77 | + while (true) { // break until reach the bottom | |
78 | + int split_axis = nodes[cur].level % D; | |
79 | + if (nodes[cur].left == -1) { // check whether it has left node or not | |
80 | + *d_node = cur; | |
81 | + for (int i = 0; i < nodes[cur].num_index; i++) { | |
82 | + size_t idx = indices[nodes[cur].index + i]; | |
83 | + T dist = gpu_distance<T, D>(d_query_point, d_reference_points[idx]); | |
84 | + if (dist < best_distance) { | |
85 | + best_distance = dist; | |
86 | + best_index = idx; | |
87 | + } | |
88 | + } | |
89 | + break; | |
90 | + } | |
91 | + else if (d_query_point.dim[split_axis] < nodes[cur].split_value) { // jump into specific son node | |
92 | + cur = nodes[cur].left; | |
93 | + } | |
94 | + else { | |
95 | + cur = nodes[cur].right; | |
96 | + } | |
97 | + } | |
98 | + *d_distance = best_distance; | |
99 | + *d_index = best_index; | |
100 | + } | |
101 | + | |
102 | + template <typename T, int D> | |
103 | + __device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { | |
104 | + T best_distance = FLT_MAX; | |
105 | + size_t best_index = 0; | |
106 | + | |
107 | + int next_nodes_pos = 0; // initialize pop out order index | |
108 | + next_nodes[id * stack_size + next_nodes_pos] = cur; // find data that belongs to the very specific thread | |
109 | + next_nodes_pos++; | |
110 | + | |
111 | + while (next_nodes_pos) { | |
112 | + int next_search_nodes_pos = 0; // record push back order index | |
113 | + while (next_nodes_pos) { | |
114 | + cur = next_nodes[id * stack_size + next_nodes_pos - 1]; // pop out the last push in one and keep poping out | |
115 | + next_nodes_pos--; | |
116 | + int split_axis = nodes[cur].level % D; | |
117 | + | |
118 | + if (nodes[cur].left == -1) { | |
119 | + for (int i = 0; i < nodes[cur].num_index; i++) { | |
120 | + int idx = indices[nodes[cur].index + i]; // all indices are stored in one array, pick up from every node's beginning index | |
121 | + T d = gpu_distance<T>(d_query_point, d_reference_points[idx]); | |
122 | + if (d < best_distance) { | |
123 | + best_distance = d; | |
124 | + best_index = idx; | |
125 | + } | |
126 | + } | |
127 | + } | |
128 | + else { | |
129 | + T d = d_query_point.dim[split_axis] - nodes[cur].split_value; | |
130 | + | |
131 | + if (fabs(d) > range) { | |
132 | + if (d < 0) { | |
133 | + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left; | |
134 | + next_search_nodes_pos++; | |
135 | + } | |
136 | + else { | |
137 | + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right; | |
138 | + next_search_nodes_pos++; | |
139 | + } | |
140 | + } | |
141 | + else { | |
142 | + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right; | |
143 | + next_search_nodes_pos++; | |
144 | + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left; | |
145 | + next_search_nodes_pos++; | |
146 | + if (next_search_nodes_pos > stack_size) { | |
147 | + printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id); | |
148 | + (*Judge)++; | |
149 | + } | |
150 | + } | |
151 | + } | |
152 | + } | |
153 | + for (int i = 0; i < next_search_nodes_pos; i++) | |
154 | + next_nodes[id * stack_size + i] = next_search_nodes[id * stack_size + i]; | |
155 | + next_nodes_pos = next_search_nodes_pos; | |
156 | + } | |
157 | + *d_distance = best_distance; | |
158 | + *d_index = best_index; | |
159 | + } | |
160 | + | |
	// Full nearest-neighbor query for one point: descend to the query's own leaf
	// for a first candidate, then backtrack toward the root, range-searching any
	// sibling subtree whose half-space intersects the current search radius.
	// Writes the Euclidean (sqrt-ed) distance to *d_distance.
	template <typename T, int D>
	__device__ void search(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
		int best_node = 0;
		T best_distance = FLT_MAX;
		size_t best_index = 0;
		T radius = 0;

		search_at_node<T, D>(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node);	// first candidate from the query's leaf (squared distance)
		radius = sqrt(best_distance);	// search radius; NOTE(review): not shrunk when a closer point is found below — still correct, just searches more than necessary
		int cur = best_node;

		while (nodes[cur].parent != -1) {	// backtrack until the root is passed
			int parent = nodes[cur].parent;
			int split_axis = nodes[parent].level % D;

			T tmp_dist = FLT_MAX;
			size_t tmp_idx;	// only read below when tmp_dist was actually set by search_at_node_range
			if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) {	// sibling half-space may contain a closer point
				if (nodes[parent].left != cur)	// search whichever child we did NOT come from
					search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
				else
					search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
			}
			if (tmp_dist < best_distance) {
				best_distance = tmp_dist;
				best_index = tmp_idx;
			}
			cur = parent;
		}
		*d_distance = sqrt(best_distance);	// convert squared distance to Euclidean
		*d_index = best_index;
	}
193 | + | |
194 | + template <typename T, int D> | |
195 | + __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { | |
196 | + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; | |
197 | + if (idx >= d_query_count) return; // avoid segfault | |
198 | + | |
199 | + search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent | |
200 | + } | |
201 | + | |
202 | + template <typename T, int D> | |
203 | + void search_stream(cuda_kdnode<T> *d_nodes, size_t *d_index, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> *query_stream_points, size_t stream_count, size_t *indices, T *distances) { | |
204 | + unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count); | |
205 | + unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 1 : 0)); | |
206 | + | |
207 | + cpu_kdtree::point<T, D> *d_query_points; | |
208 | + size_t *d_indices; | |
209 | + T *d_distances; | |
210 | + | |
211 | + int *next_nodes; | |
212 | + int *next_search_nodes; | |
213 | + | |
214 | + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * stream_count * D)); | |
215 | + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * stream_count)); | |
216 | + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * stream_count)); | |
217 | + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); | |
218 | + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); | |
219 | + HANDLE_ERROR(cudaMemcpy(d_query_points, query_stream_points, sizeof(T) * stream_count * D, cudaMemcpyHostToDevice)); | |
220 | + | |
221 | + int *Judge = NULL; | |
222 | + | |
223 | + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, stream_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); | |
224 | + | |
225 | + if(Judge == NULL) { | |
226 | + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * stream_count, cudaMemcpyDeviceToHost)); | |
227 | + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * stream_count, cudaMemcpyDeviceToHost)); | |
228 | + } | |
229 | + | |
230 | + HANDLE_ERROR(cudaFree(next_nodes)); | |
231 | + HANDLE_ERROR(cudaFree(next_search_nodes)); | |
232 | + HANDLE_ERROR(cudaFree(d_query_points)); | |
233 | + HANDLE_ERROR(cudaFree(d_indices)); | |
234 | + HANDLE_ERROR(cudaFree(d_distances)); | |
235 | + } | |
236 | + | |
237 | + template <typename T, int D = 3> // set dimension of data to default 3 | |
238 | + class kdtree { | |
239 | + protected: | |
240 | + int current_axis; // current judging axis | |
241 | + int n_id; // store the total number of nodes | |
242 | + std::vector < typename cpu_kdtree::point<T, D> > *tmp_points; // transfer or temperary points | |
243 | + std::vector < typename cpu_kdtree::point<T, D> > cpu_tmp_points; // for cpu searching | |
244 | + cpu_kdtree::cpu_kdnode<T> *root; // root node | |
245 | + static kdtree<T, D> *cur_tree_ptr; | |
246 | + #ifdef __CUDACC__ | |
247 | + cuda_kdnode<T> *d_nodes; | |
248 | + size_t *d_index; | |
249 | + cpu_kdtree::point<T, D>* d_reference_points; | |
250 | + size_t npts; | |
251 | + int num_nodes; | |
252 | + #endif | |
253 | + public: | |
254 | + kdtree() { // constructor for creating a cpu_kdtree | |
255 | + cur_tree_ptr = this; // create a class pointer points to the current class value | |
256 | + n_id = 0; // set total number of points to default 0 | |
257 | + } | |
258 | + | |
259 | + ~kdtree() { // destructor of cpu_kdtree | |
260 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes; | |
261 | + next_nodes.push_back(root); | |
262 | + while (next_nodes.size()) { | |
263 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_search_nodes; | |
264 | + while (next_nodes.size()) { | |
265 | + cpu_kdtree::cpu_kdnode<T> *cur = next_nodes.back(); | |
266 | + next_nodes.pop_back(); | |
267 | + if (cur->left) | |
268 | + next_search_nodes.push_back(cur->left); | |
269 | + if (cur->right) | |
270 | + next_search_nodes.push_back(cur->right); | |
271 | + delete cur; | |
272 | + } | |
273 | + next_nodes = next_search_nodes; | |
274 | + } | |
275 | + root = NULL; | |
276 | + #ifdef __CUDACC__ | |
277 | + HANDLE_ERROR(cudaFree(d_nodes)); | |
278 | + HANDLE_ERROR(cudaFree(d_index)); | |
279 | + HANDLE_ERROR(cudaFree(d_reference_points)); | |
280 | + #endif | |
281 | + } | |
282 | + | |
283 | + void cpu_create(std::vector < typename cpu_kdtree::point<T, D> > &reference_points, size_t max_levels) { | |
284 | + tmp_points = &reference_points; | |
285 | + root = new cpu_kdtree::cpu_kdnode<T>(); // initializing the root node | |
286 | + root->idx = n_id++; // the index of root is 0 | |
287 | + root->level = 0; // tree level begins at 0 | |
288 | + root->indices.resize(reference_points.size()); // get the number of points | |
289 | + for (size_t i = 0; i < reference_points.size(); i++) { | |
290 | + root->indices[i] = i; // set indices of input points | |
291 | + } | |
292 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes; // next nodes | |
293 | + next_nodes.push_back(root); // push back the root node | |
294 | + while (next_nodes.size()) { | |
295 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_search_nodes; // next search nodes | |
296 | + while (next_nodes.size()) { // two same WHILE is because we need to make a new vector to store nodes for search | |
297 | + cpu_kdtree::cpu_kdnode<T> *current_node = next_nodes.back(); // handle node one by one (right first) | |
298 | + next_nodes.pop_back(); // pop out current node in order to store next round of nodes | |
299 | + if (current_node->level < max_levels) { | |
300 | + if (current_node->indices.size() > 1) { // split if the nonleaf node contains more than one point | |
301 | + cpu_kdtree::cpu_kdnode<T> *left = new cpu_kdtree::cpu_kdnode<T>(); | |
302 | + cpu_kdtree::cpu_kdnode<T> *right = new cpu_kdtree::cpu_kdnode<T>(); | |
303 | + left->idx = n_id++; // set the index of current node's left node | |
304 | + right->idx = n_id++; | |
305 | + split(current_node, left, right); // split left and right and determine a node | |
306 | + std::vector <size_t> temp; // empty vecters of int | |
307 | + //temp.resize(current_node->indices.size()); | |
308 | + current_node->indices.swap(temp); // clean up current node's indices | |
309 | + current_node->left = left; | |
310 | + current_node->right = right; | |
311 | + current_node->left_idx = left->idx; | |
312 | + current_node->right_idx = right->idx; | |
313 | + if (right->indices.size()) | |
314 | + next_search_nodes.push_back(right); // left pop out first | |
315 | + if (left->indices.size()) | |
316 | + next_search_nodes.push_back(left); | |
317 | + } | |
318 | + } | |
319 | + } | |
320 | + next_nodes = next_search_nodes; // go deeper within the tree | |
321 | + } | |
322 | + } | |
323 | + | |
324 | + static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort | |
325 | + std::vector < typename cpu_kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer | |
326 | + return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis]; | |
327 | + } | |
328 | + | |
329 | + void split(cpu_kdtree::cpu_kdnode<T> *cur, cpu_kdtree::cpu_kdnode<T> *left, cpu_kdtree::cpu_kdnode<T> *right) { | |
330 | + std::vector < typename cpu_kdtree::point<T, D> > &pts = *tmp_points; | |
331 | + current_axis = cur->level % D; // indicate the judicative dimension or axis | |
332 | + std::sort(cur->indices.begin(), cur->indices.end(), sort_points); // using SortPoints as comparison function to sort the data | |
333 | + size_t mid_value = cur->indices[cur->indices.size() / 2]; // odd in the mid_value, even take the floor | |
334 | + cur->split_value = pts[mid_value].dim[current_axis]; // get the parent node | |
335 | + left->parent = cur; // set the parent of the next search nodes to current node | |
336 | + right->parent = cur; | |
337 | + left->level = cur->level + 1; // level + 1 | |
338 | + right->level = cur->level + 1; | |
339 | + left->parent_idx = cur->idx; // set its parent node's index | |
340 | + right->parent_idx = cur->idx; | |
341 | + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one | |
342 | + size_t idx = cur->indices[i]; | |
343 | + if (pts[idx].dim[current_axis] < cur->split_value) | |
344 | + left->indices.push_back(idx); | |
345 | + else | |
346 | + right->indices.push_back(idx); | |
347 | + } | |
348 | + } | |
349 | + | |
350 | + void create(T *h_reference_points, size_t reference_count, size_t max_levels) { | |
351 | + #ifdef __CUDACC__ | |
352 | + if (max_levels > 10) { | |
353 | + std::cout<<"The max_tree_levels should be smaller!"<<std::endl; | |
354 | + exit(1); | |
355 | + } | |
356 | + //bb.init(&h_reference_points[0]); | |
357 | + //aaboundingboxing<T, D>(bb, h_reference_points, reference_count); | |
358 | + | |
359 | + std::vector < typename cpu_kdtree::point<T, D>> reference_points(reference_count); // restore the reference points in particular way | |
360 | + for (size_t j = 0; j < reference_count; j++) | |
361 | + for (size_t i = 0; i < D; i++) | |
362 | + reference_points[j].dim[i] = h_reference_points[j * D + i]; // creating a tree on cpu | |
363 | + (*this).cpu_create(reference_points, max_levels); // building a tree on cpu | |
364 | + cpu_kdtree::cpu_kdnode<T> *d_root = (*this).get_root(); | |
365 | + num_nodes = (*this).get_num_nodes(); | |
366 | + npts = reference_count; // also equals to reference_count | |
367 | + | |
368 | + HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes)); // copy data from host to device | |
369 | + HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * npts)); | |
370 | + HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(cpu_kdtree::point<T, D>) * npts)); | |
371 | + | |
372 | + std::vector < cuda_kdnode<T> > tmp_nodes(num_nodes); | |
373 | + std::vector <size_t> indices(npts); | |
374 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes; | |
375 | + size_t cur_pos = 0; | |
376 | + next_nodes.push_back(d_root); | |
377 | + while (next_nodes.size()) { | |
378 | + std::vector <typename cpu_kdtree::cpu_kdnode<T>*> next_search_nodes; | |
379 | + while (next_nodes.size()) { | |
380 | + cpu_kdtree::cpu_kdnode<T> *cur = next_nodes.back(); | |
381 | + next_nodes.pop_back(); | |
382 | + int id = cur->idx; // the nodes at same level are independent | |
383 | + tmp_nodes[id].level = cur->level; | |
384 | + tmp_nodes[id].parent = cur->parent_idx; | |
385 | + tmp_nodes[id].left = cur->left_idx; | |
386 | + tmp_nodes[id].right = cur->right_idx; | |
387 | + tmp_nodes[id].split_value = cur->split_value; | |
388 | + tmp_nodes[id].num_index = cur->indices.size(); // number of index | |
389 | + if (cur->indices.size()) { | |
390 | + for (size_t i = 0; i < cur->indices.size(); i++) | |
391 | + indices[cur_pos + i] = cur->indices[i]; | |
392 | + | |
393 | + tmp_nodes[id].index = (int)cur_pos; // beginning index of reference_points that every bottom node has | |
394 | + cur_pos += cur->indices.size(); // store indices continuously for every query_point | |
395 | + } | |
396 | + else { | |
397 | + tmp_nodes[id].index = -1; | |
398 | + } | |
399 | + | |
400 | + if (cur->left) | |
401 | + next_search_nodes.push_back(cur->left); | |
402 | + | |
403 | + if (cur->right) | |
404 | + next_search_nodes.push_back(cur->right); | |
405 | + } | |
406 | + next_nodes = next_search_nodes; | |
407 | + } | |
408 | + HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice)); | |
409 | + HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); | |
410 | + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(cpu_kdtree::point<T, D>) * reference_count, cudaMemcpyHostToDevice)); | |
411 | + | |
412 | + #else | |
413 | + std::vector < typename cpu_kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way | |
414 | + for (size_t j = 0; j < reference_count; j++) | |
415 | + for (size_t i = 0; i < D; i++) | |
416 | + reference_points[j].dim[i] = h_reference_points[j * D + i]; | |
417 | + cpu_create(reference_points, max_levels); | |
418 | + cpu_tmp_points = *tmp_points; | |
419 | + | |
420 | + #endif | |
421 | + } | |
422 | + | |
423 | + int get_num_nodes() const { // get the total number of nodes | |
424 | + return n_id; | |
425 | + } | |
426 | + | |
427 | + cpu_kdtree::cpu_kdnode<T>* get_root() const { // get the root node of tree | |
428 | + return root; | |
429 | + } | |
430 | + | |
431 | + T cpu_distance(const cpu_kdtree::point<T, D> &a, const cpu_kdtree::point<T, D> &b) { | |
432 | + T distance = 0; | |
433 | + | |
434 | + for (size_t i = 0; i < D; i++) { | |
435 | + T d = a.dim[i] - b.dim[i]; | |
436 | + distance += d*d; | |
437 | + } | |
438 | + return distance; | |
439 | + } | |
440 | + | |
441 | + void cpu_search_at_node(cpu_kdtree::cpu_kdnode<T> *cur, const cpu_kdtree::point<T, D> &query, size_t *index, T *distance, cpu_kdtree::cpu_kdnode<T> **node) { | |
442 | + T best_distance = FLT_MAX; // initialize the best distance to max of floating point | |
443 | + size_t best_index = 0; | |
444 | + std::vector < typename cpu_kdtree::point<T, D> > pts = cpu_tmp_points; | |
445 | + while (true) { | |
446 | + size_t split_axis = cur->level % D; | |
447 | + if (cur->left == NULL) { // risky but acceptable, same goes for right because left and right are in same pace | |
448 | + *node = cur; // pointer points to a pointer | |
449 | + for (size_t i = 0; i < cur->indices.size(); i++) { | |
450 | + size_t idx = cur->indices[i]; | |
451 | + T d = cpu_distance(query, pts[idx]); // compute distances | |
452 | + /// if we want to compute k nearest neighbor, we can input the last resul | |
453 | + /// (last_best_dist < dist < best_dist) to select the next point until reaching to k | |
454 | + if (d < best_distance) { | |
455 | + best_distance = d; | |
456 | + best_index = idx; // record the nearest neighbor index | |
457 | + } | |
458 | + } | |
459 | + break; // find the target point then break the loop | |
460 | + } | |
461 | + else if (query.dim[split_axis] < cur->split_value) { // if it has son node, visit the next node on either left side or right side | |
462 | + cur = cur->left; | |
463 | + } | |
464 | + else { | |
465 | + cur = cur->right; | |
466 | + } | |
467 | + } | |
468 | + *index = best_index; | |
469 | + *distance = best_distance; | |
470 | + } | |
471 | + | |
472 | + void cpu_search_at_node_range(cpu_kdtree::cpu_kdnode<T> *cur, const cpu_kdtree::point<T, D> &query, T range, size_t *index, T *distance) { | |
473 | + T best_distance = FLT_MAX; // initialize the best distance to max of floating point | |
474 | + size_t best_index = 0; | |
475 | + std::vector < typename cpu_kdtree::point<T, D> > pts = cpu_tmp_points; | |
476 | + std::vector < typename cpu_kdtree::cpu_kdnode<T>*> next_node; | |
477 | + next_node.push_back(cur); | |
478 | + while (next_node.size()) { | |
479 | + std::vector<typename cpu_kdtree::cpu_kdnode<T>*> next_search; | |
480 | + while (next_node.size()) { | |
481 | + cur = next_node.back(); | |
482 | + next_node.pop_back(); | |
483 | + size_t split_axis = cur->level % D; | |
484 | + if (cur->left == NULL) { | |
485 | + for (size_t i = 0; i < cur->indices.size(); i++) { | |
486 | + size_t idx = cur->indices[i]; | |
487 | + T d = cpu_distance(query, pts[idx]); | |
488 | + if (d < best_distance) { | |
489 | + best_distance = d; | |
490 | + best_index = idx; | |
491 | + } | |
492 | + } | |
493 | + } | |
494 | + else { | |
495 | + T d = query.dim[split_axis] - cur->split_value; // computer distance along specific axis or dimension | |
496 | + /// there are three possibilities: on either left or right, and on both left and right | |
497 | + if (fabs(d) > range) { // absolute value of floating point to see if distance will be larger that best_dist | |
498 | + if (d < 0) | |
499 | + next_search.push_back(cur->left); // every left[split_axis] is less and equal to cur->split_value, so it is possible to find the nearest point in this region | |
500 | + else | |
501 | + next_search.push_back(cur->right); | |
502 | + } | |
503 | + else { // it is possible that nereast neighbor will appear on both left and right | |
504 | + next_search.push_back(cur->left); | |
505 | + next_search.push_back(cur->right); | |
506 | + } | |
507 | + } | |
508 | + } | |
509 | + next_node = next_search; // pop out at least one time | |
510 | + } | |
511 | + *index = best_index; | |
512 | + *distance = best_distance; | |
513 | + } | |
514 | + | |
515 | + void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) { | |
516 | + /// first convert the input query point into specific type | |
517 | + cpu_kdtree::point<T, D> query; | |
518 | + for (size_t j = 0; j < query_count; j++) { | |
519 | + for (size_t i = 0; i < D; i++) | |
520 | + query.dim[i] = h_query_points[j * D + i]; | |
521 | + /// find the nearest node, this will be the upper bound for the next time searching | |
522 | + cpu_kdtree::cpu_kdnode<T> *best_node = NULL; | |
523 | + T best_distance = FLT_MAX; | |
524 | + size_t best_index = 0; | |
525 | + T radius = 0; // radius for range | |
526 | + cpu_search_at_node(root, query, &best_index, &best_distance, &best_node); // simple search to rougly determine a result for next search step | |
527 | + radius = sqrt(best_distance); // It is possible that nearest will appear in another region | |
528 | + /// find other possibilities | |
529 | + cpu_kdtree::cpu_kdnode<T> *cur = best_node; | |
530 | + while (cur->parent != NULL) { // every node that you pass will be possible to be the best node | |
531 | + /// go up | |
532 | + cpu_kdtree::cpu_kdnode<T> *parent = cur->parent; // travel back to every node that we pass through | |
533 | + size_t split_axis = (parent->level) % D; | |
534 | + /// search other nodes | |
535 | + size_t tmp_index; | |
536 | + T tmp_distance = FLT_MAX; | |
537 | + if (fabs(parent->split_value - query.dim[split_axis]) <= radius) { | |
538 | + /// search opposite node | |
539 | + if (parent->left != cur) | |
540 | + cpu_search_at_node_range(parent->left, query, radius, &tmp_index, &tmp_distance); // to see whether it is its mother node's left son node | |
541 | + else | |
542 | + cpu_search_at_node_range(parent->right, query, radius, &tmp_index, &tmp_distance); | |
543 | + } | |
544 | + if (tmp_distance < best_distance) { | |
545 | + best_distance = tmp_distance; | |
546 | + best_index = tmp_index; | |
547 | + } | |
548 | + cur = parent; | |
549 | + } | |
550 | + h_indices[j] = best_index; | |
551 | + h_distances[j] = best_distance; | |
552 | + } | |
553 | + } | |
554 | + | |
555 | + void search(T *h_query_points, size_t query_count, size_t *indices, T *distances) { | |
556 | + #ifdef __CUDACC__ | |
557 | + std::vector < typename cpu_kdtree::point<T, D> > query_points(query_count); | |
558 | + for (size_t j = 0; j < query_count; j++) | |
559 | + for (size_t i = 0; i < D; i++) | |
560 | + query_points[j].dim[i] = h_query_points[j * D + i]; | |
561 | + | |
562 | + cudaDeviceProp prop; | |
563 | + cudaGetDeviceProperties(&prop, 0); | |
564 | + | |
565 | + size_t query_memory = D * sizeof(T) * query_count; | |
566 | + size_t N = 3 * query_memory / prop.totalGlobalMem; //consider index and distance, roughly 3 times | |
567 | + if (N > 1) { | |
568 | + N++; | |
569 | + size_t stream_count = query_count / N; | |
570 | + for (size_t n = 0; n < N; n++) { | |
571 | + size_t query_stream_start = n * stream_count; | |
572 | + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_stream_start], stream_count, &indices[query_stream_start], &distances[query_stream_start]); | |
573 | + } | |
574 | + size_t stream_remain_count = query_count - N * stream_count; | |
575 | + if (stream_remain_count > 0) { | |
576 | + size_t query_remain_start = N * stream_count; | |
577 | + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_remain_start], stream_remain_count, &indices[query_remain_start], &distances[query_remain_start]); | |
578 | + } | |
579 | + } | |
580 | + else { | |
581 | + unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count); | |
582 | + unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0)); | |
583 | + | |
584 | + cpu_kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu | |
585 | + size_t *d_indices; | |
586 | + T *d_distances; | |
587 | + | |
588 | + int *next_nodes; // create two STACK-like array | |
589 | + int *next_search_nodes; | |
590 | + | |
591 | + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory | |
592 | + | |
593 | + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_count * D)); | |
594 | + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_count)); | |
595 | + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_count)); | |
596 | + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to | |
597 | + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); | |
598 | + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_count * D, cudaMemcpyHostToDevice)); | |
599 | + | |
600 | + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); | |
601 | + | |
602 | + if (Judge == NULL) { // do the following work if the thread works safely | |
603 | + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_count, cudaMemcpyDeviceToHost)); | |
604 | + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_count, cudaMemcpyDeviceToHost)); | |
605 | + } | |
606 | + | |
607 | + HANDLE_ERROR(cudaFree(next_nodes)); | |
608 | + HANDLE_ERROR(cudaFree(next_search_nodes)); | |
609 | + HANDLE_ERROR(cudaFree(d_query_points)); | |
610 | + HANDLE_ERROR(cudaFree(d_indices)); | |
611 | + HANDLE_ERROR(cudaFree(d_distances)); | |
612 | + } | |
613 | + | |
614 | + #else | |
615 | + cpu_search(h_query_points, query_count, indices, distances); | |
616 | + | |
617 | + #endif | |
618 | + | |
619 | + } | |
620 | + | |
621 | + }; //end class kdtree | |
622 | + | |
623 | + template <typename T, int D> | |
624 | + kdtree<T, D>* kdtree<T, D>::cur_tree_ptr = NULL; // definition of cur_tree_ptr pointer points to the current class | |
625 | + | |
626 | +} //end namespace stim | |
627 | +#endif | |
0 | 628 | \ No newline at end of file | ... | ... |