Commit 6a54a547256f32bceb7afb81e456d9c5fc699bee
1 parent
42bb075c
new version of kdtree
Showing
1 changed file
with
627 additions
and
0 deletions
Show diff stats
1 | +// right now the size of CUDA STACK is set to 50, increase it if you mean to make deeper tree | ||
2 | +// data should be stored in row-major | ||
3 | +// x1,x2,x3,x4,x5...... | ||
4 | +// y1,y2,y3,y4,y5...... | ||
5 | +// .................... | ||
6 | +// .................... | ||
7 | + | ||
8 | +#ifndef KDTREE_H | ||
9 | +#define KDTREE_H | ||
10 | +#define stack_size 50 | ||
11 | + | ||
12 | +#include "device_launch_parameters.h" | ||
13 | +#include <cuda.h> | ||
14 | +#include <cuda_runtime_api.h> | ||
15 | +#include "cuda_runtime.h" | ||
16 | +#include <vector> | ||
17 | +#include <cstring> | ||
18 | +#include <float.h> | ||
19 | +#include <iostream> | ||
20 | +#include <algorithm> | ||
21 | +#include <stim/cuda/cudatools/error.h> | ||
22 | +#include <stim/visualization/aabbn.h> | ||
23 | + | ||
24 | +namespace stim { | ||
25 | + namespace cpu_kdtree { | ||
26 | + template<typename T, int D> // typename refers to float or double while D refers to dimension of points | ||
27 | + struct point { | ||
28 | + T dim[D]; // create a structure to store every one input point | ||
29 | + }; | ||
30 | + | ||
31 | + template<typename T> | ||
32 | + class cpu_kdnode { | ||
33 | + public: | ||
34 | + cpu_kdnode() { // constructor for initializing a kdnode | ||
35 | + parent = NULL; // set every node's parent, left and right kdnode pointers to NULL | ||
36 | + left = NULL; | ||
37 | + right = NULL; | ||
38 | + parent_idx = -1; // set parent node index to default -1 | ||
39 | + left_idx = -1; | ||
40 | + right_idx = -1; | ||
41 | + split_value = -1; // set split_value to default -1 | ||
42 | + } | ||
43 | + int idx; // index of current node | ||
44 | + int parent_idx, left_idx, right_idx; // index of parent, left and right nodes | ||
45 | + cpu_kdnode *parent, *left, *right; // parent, left and right kdnodes | ||
46 | + T split_value; // splitting value of current node | ||
47 | + std::vector <size_t> indices; // it indicates the points' indices that current node has | ||
48 | + size_t level; // tree level of current node | ||
49 | + }; | ||
50 | + } // end of namespace cpu_kdtree | ||
51 | + | ||
	// Device-side (flattened) tree node: host pointers are replaced by integer
	// node ids so the whole tree can be shipped to the GPU with one cudaMemcpy.
	// NOTE: do not reorder members -- an array of these structs is built on the
	// host and memcpy'd to the device, so the layout must match on both sides.
	template <typename T>
	struct cuda_kdnode {
		int parent, left, right;	// node ids of parent / left / right (-1 when absent; left == -1 marks a leaf)
		T split_value;				// splitting coordinate along this node's axis (level % D)
		size_t num_index; // number of point indices this (leaf) node holds
		int index; // start offset of this node's indices in the global index array (-1 if it holds none)
		size_t level;				// depth in the tree; determines the splitting axis
	};
60 | + | ||
61 | + template <typename T, int D> | ||
62 | + __device__ T gpu_distance(cpu_kdtree::point<T, D> &a, cpu_kdtree::point<T, D> &b) { | ||
63 | + T distance = 0; | ||
64 | + | ||
65 | + for (size_t i = 0; i < D; i++) { | ||
66 | + T d = a.dim[i] - b.dim[i]; | ||
67 | + distance += d*d; | ||
68 | + } | ||
69 | + return distance; | ||
70 | + } | ||
71 | + | ||
	// Device-side greedy descent: starting at node 'cur', walk down to the leaf
	// whose half-space contains d_query_point, then brute-force that leaf's
	// points. Outputs: *d_index / *d_distance get the best match found in the
	// leaf (distance is SQUARED -- see gpu_distance), and *d_node gets the leaf
	// node id so the caller can backtrack from it.
	template <typename T, int D>
	__device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, int cur, cpu_kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
		T best_distance = FLT_MAX;
		size_t best_index = 0;

		while (true) { // loop ends when a leaf is reached
			int split_axis = nodes[cur].level % D;	// splitting axis cycles through the D dimensions with depth
			if (nodes[cur].left == -1) { // leaf test: the builder creates either both children or none
				*d_node = cur;
				for (int i = 0; i < nodes[cur].num_index; i++) {	// scan every point stored in this leaf
					size_t idx = indices[nodes[cur].index + i];		// leaf indices are stored contiguously starting at .index
					T dist = gpu_distance<T, D>(d_query_point, d_reference_points[idx]);
					if (dist < best_distance) {
						best_distance = dist;
						best_index = idx;
					}
				}
				break;
			}
			else if (d_query_point.dim[split_axis] < nodes[cur].split_value) { // descend into the half-space containing the query
				cur = nodes[cur].left;
			}
			else {
				cur = nodes[cur].right;
			}
		}
		*d_distance = best_distance;
		*d_index = best_index;
	}
101 | + | ||
	// Device-side pruned subtree search: visits every descendant of 'cur' whose
	// region could contain a point within 'range' (a non-squared radius) of the
	// query, and returns the best SQUARED distance / point index found.
	// Because device code here avoids recursion, traversal uses two per-thread
	// scratch arrays in global memory -- next_nodes (current frontier) and
	// next_search_nodes (next frontier) -- each owning stack_size slots at
	// offset id * stack_size. *Judge is incremented when a frontier overflows
	// its stack_size slots so the host can detect corrupted results.
	template <typename T, int D>
	__device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
		T best_distance = FLT_MAX;
		size_t best_index = 0;

		int next_nodes_pos = 0; // number of entries currently in this thread's frontier
		next_nodes[id * stack_size + next_nodes_pos] = cur; // seed the frontier with the subtree root
		next_nodes_pos++;

		while (next_nodes_pos) {
			int next_search_nodes_pos = 0; // entries pushed for the next round
			while (next_nodes_pos) {
				cur = next_nodes[id * stack_size + next_nodes_pos - 1]; // pop the most recently pushed node
				next_nodes_pos--;
				int split_axis = nodes[cur].level % D;

				if (nodes[cur].left == -1) { // leaf: brute-force its points
					for (int i = 0; i < nodes[cur].num_index; i++) {
						int idx = indices[nodes[cur].index + i]; // leaf indices are contiguous starting at .index
						T d = gpu_distance<T>(d_query_point, d_reference_points[idx]);
						if (d < best_distance) {
							best_distance = d;
							best_index = idx;
						}
					}
				}
				else {
					// Signed distance from the query to this node's splitting plane.
					T d = d_query_point.dim[split_axis] - nodes[cur].split_value;

					if (fabs(d) > range) { // plane farther than the radius: only one side can qualify
						if (d < 0) {
							next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left;
							next_search_nodes_pos++;
						}
						else {
							next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right;
							next_search_nodes_pos++;
						}
					}
					else { // plane within the radius: both sides may contain the nearest point
						next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right;
						next_search_nodes_pos++;
						next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left;
						next_search_nodes_pos++;
						// NOTE(review): this overflow check runs only in the two-child
						// branch and only AFTER the pushes, so an overflow has already
						// written one slot past this thread's stack region; the
						// single-child pushes above are not checked at all. Consider
						// checking before each push -- TODO confirm intended behavior.
						if (next_search_nodes_pos > stack_size) {
							printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id);
							(*Judge)++;
						}
					}
				}
			}
			// Promote the next frontier to the current one and loop again.
			for (int i = 0; i < next_search_nodes_pos; i++)
				next_nodes[id * stack_size + i] = next_search_nodes[id * stack_size + i];
			next_nodes_pos = next_search_nodes_pos;
		}
		*d_distance = best_distance;
		*d_index = best_index;
	}
160 | + | ||
	// Device-side nearest-neighbor search for one query point:
	//   1) greedy descent to the query's leaf (search_at_node) gives an initial
	//      best match and a starting radius;
	//   2) backtrack from that leaf to the root; whenever a parent's splitting
	//      plane lies within the radius, the opposite subtree may hold a closer
	//      point, so search it with search_at_node_range.
	// Intermediate distances are squared; the final *d_distance is sqrt'ed.
	template <typename T, int D>
	__device__ void search(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
		int best_node = 0;
		T best_distance = FLT_MAX;
		size_t best_index = 0;
		T radius = 0;

		// Initial guess from the leaf containing the query (node 0 is the root).
		search_at_node<T, D>(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node);
		radius = sqrt(best_distance); // convert squared distance to a radius for plane tests
		int cur = best_node;

		while (nodes[cur].parent != -1) { // walk back up to the root
			int parent = nodes[cur].parent;
			int split_axis = nodes[parent].level % D;

			T tmp_dist = FLT_MAX; // stays FLT_MAX when the opposite subtree is pruned, so the compare below is safe
			size_t tmp_idx;
			// NOTE(review): radius is not tightened when best_distance improves
			// below; this only costs extra pruning opportunities, not correctness.
			if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) {
				// The splitting plane is within reach: search the sibling subtree.
				if (nodes[parent].left != cur)
					search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
				else
					search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
			}
			if (tmp_dist < best_distance) {
				best_distance = tmp_dist;
				best_index = tmp_idx;
			}
			cur = parent;
		}
		*d_distance = sqrt(best_distance); // report the true (non-squared) distance
		*d_index = best_index;
	}
193 | + | ||
194 | + template <typename T, int D> | ||
195 | + __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { | ||
196 | + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; | ||
197 | + if (idx >= d_query_count) return; // avoid segfault | ||
198 | + | ||
199 | + search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent | ||
200 | + } | ||
201 | + | ||
202 | + template <typename T, int D> | ||
203 | + void search_stream(cuda_kdnode<T> *d_nodes, size_t *d_index, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> *query_stream_points, size_t stream_count, size_t *indices, T *distances) { | ||
204 | + unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count); | ||
205 | + unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 1 : 0)); | ||
206 | + | ||
207 | + cpu_kdtree::point<T, D> *d_query_points; | ||
208 | + size_t *d_indices; | ||
209 | + T *d_distances; | ||
210 | + | ||
211 | + int *next_nodes; | ||
212 | + int *next_search_nodes; | ||
213 | + | ||
214 | + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * stream_count * D)); | ||
215 | + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * stream_count)); | ||
216 | + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * stream_count)); | ||
217 | + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); | ||
218 | + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); | ||
219 | + HANDLE_ERROR(cudaMemcpy(d_query_points, query_stream_points, sizeof(T) * stream_count * D, cudaMemcpyHostToDevice)); | ||
220 | + | ||
221 | + int *Judge = NULL; | ||
222 | + | ||
223 | + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, stream_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); | ||
224 | + | ||
225 | + if(Judge == NULL) { | ||
226 | + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * stream_count, cudaMemcpyDeviceToHost)); | ||
227 | + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * stream_count, cudaMemcpyDeviceToHost)); | ||
228 | + } | ||
229 | + | ||
230 | + HANDLE_ERROR(cudaFree(next_nodes)); | ||
231 | + HANDLE_ERROR(cudaFree(next_search_nodes)); | ||
232 | + HANDLE_ERROR(cudaFree(d_query_points)); | ||
233 | + HANDLE_ERROR(cudaFree(d_indices)); | ||
234 | + HANDLE_ERROR(cudaFree(d_distances)); | ||
235 | + } | ||
236 | + | ||
237 | + template <typename T, int D = 3> // set dimension of data to default 3 | ||
238 | + class kdtree { | ||
239 | + protected: | ||
240 | + int current_axis; // current judging axis | ||
241 | + int n_id; // store the total number of nodes | ||
242 | + std::vector < typename cpu_kdtree::point<T, D> > *tmp_points; // transfer or temperary points | ||
243 | + std::vector < typename cpu_kdtree::point<T, D> > cpu_tmp_points; // for cpu searching | ||
244 | + cpu_kdtree::cpu_kdnode<T> *root; // root node | ||
245 | + static kdtree<T, D> *cur_tree_ptr; | ||
246 | + #ifdef __CUDACC__ | ||
247 | + cuda_kdnode<T> *d_nodes; | ||
248 | + size_t *d_index; | ||
249 | + cpu_kdtree::point<T, D>* d_reference_points; | ||
250 | + size_t npts; | ||
251 | + int num_nodes; | ||
252 | + #endif | ||
253 | + public: | ||
254 | + kdtree() { // constructor for creating a cpu_kdtree | ||
255 | + cur_tree_ptr = this; // create a class pointer points to the current class value | ||
256 | + n_id = 0; // set total number of points to default 0 | ||
257 | + } | ||
258 | + | ||
259 | + ~kdtree() { // destructor of cpu_kdtree | ||
260 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes; | ||
261 | + next_nodes.push_back(root); | ||
262 | + while (next_nodes.size()) { | ||
263 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_search_nodes; | ||
264 | + while (next_nodes.size()) { | ||
265 | + cpu_kdtree::cpu_kdnode<T> *cur = next_nodes.back(); | ||
266 | + next_nodes.pop_back(); | ||
267 | + if (cur->left) | ||
268 | + next_search_nodes.push_back(cur->left); | ||
269 | + if (cur->right) | ||
270 | + next_search_nodes.push_back(cur->right); | ||
271 | + delete cur; | ||
272 | + } | ||
273 | + next_nodes = next_search_nodes; | ||
274 | + } | ||
275 | + root = NULL; | ||
276 | + #ifdef __CUDACC__ | ||
277 | + HANDLE_ERROR(cudaFree(d_nodes)); | ||
278 | + HANDLE_ERROR(cudaFree(d_index)); | ||
279 | + HANDLE_ERROR(cudaFree(d_reference_points)); | ||
280 | + #endif | ||
281 | + } | ||
282 | + | ||
283 | + void cpu_create(std::vector < typename cpu_kdtree::point<T, D> > &reference_points, size_t max_levels) { | ||
284 | + tmp_points = &reference_points; | ||
285 | + root = new cpu_kdtree::cpu_kdnode<T>(); // initializing the root node | ||
286 | + root->idx = n_id++; // the index of root is 0 | ||
287 | + root->level = 0; // tree level begins at 0 | ||
288 | + root->indices.resize(reference_points.size()); // get the number of points | ||
289 | + for (size_t i = 0; i < reference_points.size(); i++) { | ||
290 | + root->indices[i] = i; // set indices of input points | ||
291 | + } | ||
292 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes; // next nodes | ||
293 | + next_nodes.push_back(root); // push back the root node | ||
294 | + while (next_nodes.size()) { | ||
295 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_search_nodes; // next search nodes | ||
296 | + while (next_nodes.size()) { // two same WHILE is because we need to make a new vector to store nodes for search | ||
297 | + cpu_kdtree::cpu_kdnode<T> *current_node = next_nodes.back(); // handle node one by one (right first) | ||
298 | + next_nodes.pop_back(); // pop out current node in order to store next round of nodes | ||
299 | + if (current_node->level < max_levels) { | ||
300 | + if (current_node->indices.size() > 1) { // split if the nonleaf node contains more than one point | ||
301 | + cpu_kdtree::cpu_kdnode<T> *left = new cpu_kdtree::cpu_kdnode<T>(); | ||
302 | + cpu_kdtree::cpu_kdnode<T> *right = new cpu_kdtree::cpu_kdnode<T>(); | ||
303 | + left->idx = n_id++; // set the index of current node's left node | ||
304 | + right->idx = n_id++; | ||
305 | + split(current_node, left, right); // split left and right and determine a node | ||
306 | + std::vector <size_t> temp; // empty vecters of int | ||
307 | + //temp.resize(current_node->indices.size()); | ||
308 | + current_node->indices.swap(temp); // clean up current node's indices | ||
309 | + current_node->left = left; | ||
310 | + current_node->right = right; | ||
311 | + current_node->left_idx = left->idx; | ||
312 | + current_node->right_idx = right->idx; | ||
313 | + if (right->indices.size()) | ||
314 | + next_search_nodes.push_back(right); // left pop out first | ||
315 | + if (left->indices.size()) | ||
316 | + next_search_nodes.push_back(left); | ||
317 | + } | ||
318 | + } | ||
319 | + } | ||
320 | + next_nodes = next_search_nodes; // go deeper within the tree | ||
321 | + } | ||
322 | + } | ||
323 | + | ||
324 | + static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort | ||
325 | + std::vector < typename cpu_kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer | ||
326 | + return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis]; | ||
327 | + } | ||
328 | + | ||
329 | + void split(cpu_kdtree::cpu_kdnode<T> *cur, cpu_kdtree::cpu_kdnode<T> *left, cpu_kdtree::cpu_kdnode<T> *right) { | ||
330 | + std::vector < typename cpu_kdtree::point<T, D> > &pts = *tmp_points; | ||
331 | + current_axis = cur->level % D; // indicate the judicative dimension or axis | ||
332 | + std::sort(cur->indices.begin(), cur->indices.end(), sort_points); // using SortPoints as comparison function to sort the data | ||
333 | + size_t mid_value = cur->indices[cur->indices.size() / 2]; // odd in the mid_value, even take the floor | ||
334 | + cur->split_value = pts[mid_value].dim[current_axis]; // get the parent node | ||
335 | + left->parent = cur; // set the parent of the next search nodes to current node | ||
336 | + right->parent = cur; | ||
337 | + left->level = cur->level + 1; // level + 1 | ||
338 | + right->level = cur->level + 1; | ||
339 | + left->parent_idx = cur->idx; // set its parent node's index | ||
340 | + right->parent_idx = cur->idx; | ||
341 | + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one | ||
342 | + size_t idx = cur->indices[i]; | ||
343 | + if (pts[idx].dim[current_axis] < cur->split_value) | ||
344 | + left->indices.push_back(idx); | ||
345 | + else | ||
346 | + right->indices.push_back(idx); | ||
347 | + } | ||
348 | + } | ||
349 | + | ||
350 | + void create(T *h_reference_points, size_t reference_count, size_t max_levels) { | ||
351 | + #ifdef __CUDACC__ | ||
352 | + if (max_levels > 10) { | ||
353 | + std::cout<<"The max_tree_levels should be smaller!"<<std::endl; | ||
354 | + exit(1); | ||
355 | + } | ||
356 | + //bb.init(&h_reference_points[0]); | ||
357 | + //aaboundingboxing<T, D>(bb, h_reference_points, reference_count); | ||
358 | + | ||
359 | + std::vector < typename cpu_kdtree::point<T, D>> reference_points(reference_count); // restore the reference points in particular way | ||
360 | + for (size_t j = 0; j < reference_count; j++) | ||
361 | + for (size_t i = 0; i < D; i++) | ||
362 | + reference_points[j].dim[i] = h_reference_points[j * D + i]; // creating a tree on cpu | ||
363 | + (*this).cpu_create(reference_points, max_levels); // building a tree on cpu | ||
364 | + cpu_kdtree::cpu_kdnode<T> *d_root = (*this).get_root(); | ||
365 | + num_nodes = (*this).get_num_nodes(); | ||
366 | + npts = reference_count; // also equals to reference_count | ||
367 | + | ||
368 | + HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes)); // copy data from host to device | ||
369 | + HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * npts)); | ||
370 | + HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(cpu_kdtree::point<T, D>) * npts)); | ||
371 | + | ||
372 | + std::vector < cuda_kdnode<T> > tmp_nodes(num_nodes); | ||
373 | + std::vector <size_t> indices(npts); | ||
374 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes; | ||
375 | + size_t cur_pos = 0; | ||
376 | + next_nodes.push_back(d_root); | ||
377 | + while (next_nodes.size()) { | ||
378 | + std::vector <typename cpu_kdtree::cpu_kdnode<T>*> next_search_nodes; | ||
379 | + while (next_nodes.size()) { | ||
380 | + cpu_kdtree::cpu_kdnode<T> *cur = next_nodes.back(); | ||
381 | + next_nodes.pop_back(); | ||
382 | + int id = cur->idx; // the nodes at same level are independent | ||
383 | + tmp_nodes[id].level = cur->level; | ||
384 | + tmp_nodes[id].parent = cur->parent_idx; | ||
385 | + tmp_nodes[id].left = cur->left_idx; | ||
386 | + tmp_nodes[id].right = cur->right_idx; | ||
387 | + tmp_nodes[id].split_value = cur->split_value; | ||
388 | + tmp_nodes[id].num_index = cur->indices.size(); // number of index | ||
389 | + if (cur->indices.size()) { | ||
390 | + for (size_t i = 0; i < cur->indices.size(); i++) | ||
391 | + indices[cur_pos + i] = cur->indices[i]; | ||
392 | + | ||
393 | + tmp_nodes[id].index = (int)cur_pos; // beginning index of reference_points that every bottom node has | ||
394 | + cur_pos += cur->indices.size(); // store indices continuously for every query_point | ||
395 | + } | ||
396 | + else { | ||
397 | + tmp_nodes[id].index = -1; | ||
398 | + } | ||
399 | + | ||
400 | + if (cur->left) | ||
401 | + next_search_nodes.push_back(cur->left); | ||
402 | + | ||
403 | + if (cur->right) | ||
404 | + next_search_nodes.push_back(cur->right); | ||
405 | + } | ||
406 | + next_nodes = next_search_nodes; | ||
407 | + } | ||
408 | + HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice)); | ||
409 | + HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); | ||
410 | + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(cpu_kdtree::point<T, D>) * reference_count, cudaMemcpyHostToDevice)); | ||
411 | + | ||
412 | + #else | ||
413 | + std::vector < typename cpu_kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way | ||
414 | + for (size_t j = 0; j < reference_count; j++) | ||
415 | + for (size_t i = 0; i < D; i++) | ||
416 | + reference_points[j].dim[i] = h_reference_points[j * D + i]; | ||
417 | + cpu_create(reference_points, max_levels); | ||
418 | + cpu_tmp_points = *tmp_points; | ||
419 | + | ||
420 | + #endif | ||
421 | + } | ||
422 | + | ||
423 | + int get_num_nodes() const { // get the total number of nodes | ||
424 | + return n_id; | ||
425 | + } | ||
426 | + | ||
427 | + cpu_kdtree::cpu_kdnode<T>* get_root() const { // get the root node of tree | ||
428 | + return root; | ||
429 | + } | ||
430 | + | ||
431 | + T cpu_distance(const cpu_kdtree::point<T, D> &a, const cpu_kdtree::point<T, D> &b) { | ||
432 | + T distance = 0; | ||
433 | + | ||
434 | + for (size_t i = 0; i < D; i++) { | ||
435 | + T d = a.dim[i] - b.dim[i]; | ||
436 | + distance += d*d; | ||
437 | + } | ||
438 | + return distance; | ||
439 | + } | ||
440 | + | ||
441 | + void cpu_search_at_node(cpu_kdtree::cpu_kdnode<T> *cur, const cpu_kdtree::point<T, D> &query, size_t *index, T *distance, cpu_kdtree::cpu_kdnode<T> **node) { | ||
442 | + T best_distance = FLT_MAX; // initialize the best distance to max of floating point | ||
443 | + size_t best_index = 0; | ||
444 | + std::vector < typename cpu_kdtree::point<T, D> > pts = cpu_tmp_points; | ||
445 | + while (true) { | ||
446 | + size_t split_axis = cur->level % D; | ||
447 | + if (cur->left == NULL) { // risky but acceptable, same goes for right because left and right are in same pace | ||
448 | + *node = cur; // pointer points to a pointer | ||
449 | + for (size_t i = 0; i < cur->indices.size(); i++) { | ||
450 | + size_t idx = cur->indices[i]; | ||
451 | + T d = cpu_distance(query, pts[idx]); // compute distances | ||
452 | + /// if we want to compute k nearest neighbor, we can input the last resul | ||
453 | + /// (last_best_dist < dist < best_dist) to select the next point until reaching to k | ||
454 | + if (d < best_distance) { | ||
455 | + best_distance = d; | ||
456 | + best_index = idx; // record the nearest neighbor index | ||
457 | + } | ||
458 | + } | ||
459 | + break; // find the target point then break the loop | ||
460 | + } | ||
461 | + else if (query.dim[split_axis] < cur->split_value) { // if it has son node, visit the next node on either left side or right side | ||
462 | + cur = cur->left; | ||
463 | + } | ||
464 | + else { | ||
465 | + cur = cur->right; | ||
466 | + } | ||
467 | + } | ||
468 | + *index = best_index; | ||
469 | + *distance = best_distance; | ||
470 | + } | ||
471 | + | ||
472 | + void cpu_search_at_node_range(cpu_kdtree::cpu_kdnode<T> *cur, const cpu_kdtree::point<T, D> &query, T range, size_t *index, T *distance) { | ||
473 | + T best_distance = FLT_MAX; // initialize the best distance to max of floating point | ||
474 | + size_t best_index = 0; | ||
475 | + std::vector < typename cpu_kdtree::point<T, D> > pts = cpu_tmp_points; | ||
476 | + std::vector < typename cpu_kdtree::cpu_kdnode<T>*> next_node; | ||
477 | + next_node.push_back(cur); | ||
478 | + while (next_node.size()) { | ||
479 | + std::vector<typename cpu_kdtree::cpu_kdnode<T>*> next_search; | ||
480 | + while (next_node.size()) { | ||
481 | + cur = next_node.back(); | ||
482 | + next_node.pop_back(); | ||
483 | + size_t split_axis = cur->level % D; | ||
484 | + if (cur->left == NULL) { | ||
485 | + for (size_t i = 0; i < cur->indices.size(); i++) { | ||
486 | + size_t idx = cur->indices[i]; | ||
487 | + T d = cpu_distance(query, pts[idx]); | ||
488 | + if (d < best_distance) { | ||
489 | + best_distance = d; | ||
490 | + best_index = idx; | ||
491 | + } | ||
492 | + } | ||
493 | + } | ||
494 | + else { | ||
495 | + T d = query.dim[split_axis] - cur->split_value; // computer distance along specific axis or dimension | ||
496 | + /// there are three possibilities: on either left or right, and on both left and right | ||
497 | + if (fabs(d) > range) { // absolute value of floating point to see if distance will be larger that best_dist | ||
498 | + if (d < 0) | ||
499 | + next_search.push_back(cur->left); // every left[split_axis] is less and equal to cur->split_value, so it is possible to find the nearest point in this region | ||
500 | + else | ||
501 | + next_search.push_back(cur->right); | ||
502 | + } | ||
503 | + else { // it is possible that nereast neighbor will appear on both left and right | ||
504 | + next_search.push_back(cur->left); | ||
505 | + next_search.push_back(cur->right); | ||
506 | + } | ||
507 | + } | ||
508 | + } | ||
509 | + next_node = next_search; // pop out at least one time | ||
510 | + } | ||
511 | + *index = best_index; | ||
512 | + *distance = best_distance; | ||
513 | + } | ||
514 | + | ||
515 | + void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) { | ||
516 | + /// first convert the input query point into specific type | ||
517 | + cpu_kdtree::point<T, D> query; | ||
518 | + for (size_t j = 0; j < query_count; j++) { | ||
519 | + for (size_t i = 0; i < D; i++) | ||
520 | + query.dim[i] = h_query_points[j * D + i]; | ||
521 | + /// find the nearest node, this will be the upper bound for the next time searching | ||
522 | + cpu_kdtree::cpu_kdnode<T> *best_node = NULL; | ||
523 | + T best_distance = FLT_MAX; | ||
524 | + size_t best_index = 0; | ||
525 | + T radius = 0; // radius for range | ||
526 | + cpu_search_at_node(root, query, &best_index, &best_distance, &best_node); // simple search to rougly determine a result for next search step | ||
527 | + radius = sqrt(best_distance); // It is possible that nearest will appear in another region | ||
528 | + /// find other possibilities | ||
529 | + cpu_kdtree::cpu_kdnode<T> *cur = best_node; | ||
530 | + while (cur->parent != NULL) { // every node that you pass will be possible to be the best node | ||
531 | + /// go up | ||
532 | + cpu_kdtree::cpu_kdnode<T> *parent = cur->parent; // travel back to every node that we pass through | ||
533 | + size_t split_axis = (parent->level) % D; | ||
534 | + /// search other nodes | ||
535 | + size_t tmp_index; | ||
536 | + T tmp_distance = FLT_MAX; | ||
537 | + if (fabs(parent->split_value - query.dim[split_axis]) <= radius) { | ||
538 | + /// search opposite node | ||
539 | + if (parent->left != cur) | ||
540 | + cpu_search_at_node_range(parent->left, query, radius, &tmp_index, &tmp_distance); // to see whether it is its mother node's left son node | ||
541 | + else | ||
542 | + cpu_search_at_node_range(parent->right, query, radius, &tmp_index, &tmp_distance); | ||
543 | + } | ||
544 | + if (tmp_distance < best_distance) { | ||
545 | + best_distance = tmp_distance; | ||
546 | + best_index = tmp_index; | ||
547 | + } | ||
548 | + cur = parent; | ||
549 | + } | ||
550 | + h_indices[j] = best_index; | ||
551 | + h_distances[j] = best_distance; | ||
552 | + } | ||
553 | + } | ||
554 | + | ||
555 | + void search(T *h_query_points, size_t query_count, size_t *indices, T *distances) { | ||
556 | + #ifdef __CUDACC__ | ||
557 | + std::vector < typename cpu_kdtree::point<T, D> > query_points(query_count); | ||
558 | + for (size_t j = 0; j < query_count; j++) | ||
559 | + for (size_t i = 0; i < D; i++) | ||
560 | + query_points[j].dim[i] = h_query_points[j * D + i]; | ||
561 | + | ||
562 | + cudaDeviceProp prop; | ||
563 | + cudaGetDeviceProperties(&prop, 0); | ||
564 | + | ||
565 | + size_t query_memory = D * sizeof(T) * query_count; | ||
566 | + size_t N = 3 * query_memory / prop.totalGlobalMem; //consider index and distance, roughly 3 times | ||
567 | + if (N > 1) { | ||
568 | + N++; | ||
569 | + size_t stream_count = query_count / N; | ||
570 | + for (size_t n = 0; n < N; n++) { | ||
571 | + size_t query_stream_start = n * stream_count; | ||
572 | + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_stream_start], stream_count, &indices[query_stream_start], &distances[query_stream_start]); | ||
573 | + } | ||
574 | + size_t stream_remain_count = query_count - N * stream_count; | ||
575 | + if (stream_remain_count > 0) { | ||
576 | + size_t query_remain_start = N * stream_count; | ||
577 | + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_remain_start], stream_remain_count, &indices[query_remain_start], &distances[query_remain_start]); | ||
578 | + } | ||
579 | + } | ||
580 | + else { | ||
581 | + unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count); | ||
582 | + unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0)); | ||
583 | + | ||
584 | + cpu_kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu | ||
585 | + size_t *d_indices; | ||
586 | + T *d_distances; | ||
587 | + | ||
588 | + int *next_nodes; // create two STACK-like array | ||
589 | + int *next_search_nodes; | ||
590 | + | ||
591 | + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory | ||
592 | + | ||
593 | + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_count * D)); | ||
594 | + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_count)); | ||
595 | + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_count)); | ||
596 | + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to | ||
597 | + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); | ||
598 | + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_count * D, cudaMemcpyHostToDevice)); | ||
599 | + | ||
600 | + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); | ||
601 | + | ||
602 | + if (Judge == NULL) { // do the following work if the thread works safely | ||
603 | + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_count, cudaMemcpyDeviceToHost)); | ||
604 | + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_count, cudaMemcpyDeviceToHost)); | ||
605 | + } | ||
606 | + | ||
607 | + HANDLE_ERROR(cudaFree(next_nodes)); | ||
608 | + HANDLE_ERROR(cudaFree(next_search_nodes)); | ||
609 | + HANDLE_ERROR(cudaFree(d_query_points)); | ||
610 | + HANDLE_ERROR(cudaFree(d_indices)); | ||
611 | + HANDLE_ERROR(cudaFree(d_distances)); | ||
612 | + } | ||
613 | + | ||
614 | + #else | ||
615 | + cpu_search(h_query_points, query_count, indices, distances); | ||
616 | + | ||
617 | + #endif | ||
618 | + | ||
619 | + } | ||
620 | + | ||
621 | + }; //end class kdtree | ||
622 | + | ||
	// Out-of-class definition of the static member: points at the kdtree
	// instance currently being built so the static sort_points comparator can
	// reach its state (set in the kdtree constructor).
	template <typename T, int D>
	kdtree<T, D>* kdtree<T, D>::cur_tree_ptr = NULL;
625 | + | ||
626 | +} //end namespace stim | ||
627 | +#endif | ||
0 | \ No newline at end of file | 628 | \ No newline at end of file |