Commit f36421245ddcb0ad90020be734c268eb60b3ab4b
Merge branch 'JACK' into 'master'
add stream to kdtree::search. I ran the benchmark test with my data; please run the test with your own data to see whether it works. If you still encounter the same problem, please let me know! See merge request !16
Showing 1 changed file with 99 additions and 29 deletions.
Show diff stats
stim/structures/kdtree.cuh
@@ -63,6 +63,7 @@ namespace stim { | @@ -63,6 +63,7 @@ namespace stim { | ||
63 | cur_tree_ptr = this; // create a class pointer points to the current class value | 63 | cur_tree_ptr = this; // create a class pointer points to the current class value |
64 | n_id = 0; // set total number of points to default 0 | 64 | n_id = 0; // set total number of points to default 0 |
65 | } | 65 | } |
66 | + | ||
66 | ~cpu_kdtree() { // destructor of cpu_kdtree | 67 | ~cpu_kdtree() { // destructor of cpu_kdtree |
67 | std::vector <kdtree::kdnode<T>*> next_nodes; | 68 | std::vector <kdtree::kdnode<T>*> next_nodes; |
68 | next_nodes.push_back(root); | 69 | next_nodes.push_back(root); |
@@ -81,6 +82,7 @@ namespace stim { | @@ -81,6 +82,7 @@ namespace stim { | ||
81 | } | 82 | } |
82 | root = NULL; | 83 | root = NULL; |
83 | } | 84 | } |
85 | + | ||
84 | void cpu_create(std::vector < typename kdtree::point<T, D> > &reference_points, size_t max_levels) { | 86 | void cpu_create(std::vector < typename kdtree::point<T, D> > &reference_points, size_t max_levels) { |
85 | tmp_points = &reference_points; | 87 | tmp_points = &reference_points; |
86 | root = new kdtree::kdnode<T>(); // initializing the root node | 88 | root = new kdtree::kdnode<T>(); // initializing the root node |
@@ -121,10 +123,12 @@ namespace stim { | @@ -121,10 +123,12 @@ namespace stim { | ||
121 | next_nodes = next_search_nodes; // go deeper within the tree | 123 | next_nodes = next_search_nodes; // go deeper within the tree |
122 | } | 124 | } |
123 | } | 125 | } |
126 | + | ||
124 | static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort | 127 | static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort |
125 | std::vector < typename kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer | 128 | std::vector < typename kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer |
126 | return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis]; | 129 | return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis]; |
127 | } | 130 | } |
131 | + | ||
128 | void split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) { | 132 | void split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) { |
129 | std::vector < typename kdtree::point<T, D> > &pts = *tmp_points; | 133 | std::vector < typename kdtree::point<T, D> > &pts = *tmp_points; |
130 | current_axis = cur->level % D; // indicate the judicative dimension or axis | 134 | current_axis = cur->level % D; // indicate the judicative dimension or axis |
@@ -145,6 +149,7 @@ namespace stim { | @@ -145,6 +149,7 @@ namespace stim { | ||
145 | right->indices.push_back(idx); | 149 | right->indices.push_back(idx); |
146 | } | 150 | } |
147 | } | 151 | } |
152 | + | ||
148 | void create(T *h_reference_points, size_t reference_count, size_t max_levels) { | 153 | void create(T *h_reference_points, size_t reference_count, size_t max_levels) { |
149 | std::vector < typename kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way | 154 | std::vector < typename kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way |
150 | for (size_t j = 0; j < reference_count; j++) | 155 | for (size_t j = 0; j < reference_count; j++) |
@@ -153,13 +158,16 @@ namespace stim { | @@ -153,13 +158,16 @@ namespace stim { | ||
153 | cpu_create(reference_points, max_levels); | 158 | cpu_create(reference_points, max_levels); |
154 | cpu_tmp_points = *tmp_points; | 159 | cpu_tmp_points = *tmp_points; |
155 | } | 160 | } |
161 | + | ||
156 | int get_num_nodes() const { // get the total number of nodes | 162 | int get_num_nodes() const { // get the total number of nodes |
157 | return n_id; | 163 | return n_id; |
158 | } | 164 | } |
165 | + | ||
159 | kdtree::kdnode<T>* get_root() const { // get the root node of tree | 166 | kdtree::kdnode<T>* get_root() const { // get the root node of tree |
160 | return root; | 167 | return root; |
161 | } | 168 | } |
162 | - T cpu_distance(const kdtree::point<T, D> &a, const kdtree::point<T, D> &b) { | 169 | + |
170 | + T cpu_distance(const kdtree::point<T, D> &a, const kdtree::point<T, D> &b) { | ||
163 | T distance = 0; | 171 | T distance = 0; |
164 | 172 | ||
165 | for (size_t i = 0; i < D; i++) { | 173 | for (size_t i = 0; i < D; i++) { |
@@ -168,6 +176,7 @@ namespace stim { | @@ -168,6 +176,7 @@ namespace stim { | ||
168 | } | 176 | } |
169 | return distance; | 177 | return distance; |
170 | } | 178 | } |
179 | + | ||
171 | void cpu_search_at_node(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, size_t *index, T *distance, kdtree::kdnode<T> **node) { | 180 | void cpu_search_at_node(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, size_t *index, T *distance, kdtree::kdnode<T> **node) { |
172 | T best_distance = FLT_MAX; // initialize the best distance to max of floating point | 181 | T best_distance = FLT_MAX; // initialize the best distance to max of floating point |
173 | size_t best_index = 0; | 182 | size_t best_index = 0; |
@@ -198,6 +207,7 @@ namespace stim { | @@ -198,6 +207,7 @@ namespace stim { | ||
198 | *index = best_index; | 207 | *index = best_index; |
199 | *distance = best_distance; | 208 | *distance = best_distance; |
200 | } | 209 | } |
210 | + | ||
201 | void cpu_search_at_node_range(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, T range, size_t *index, T *distance) { | 211 | void cpu_search_at_node_range(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, T range, size_t *index, T *distance) { |
202 | T best_distance = FLT_MAX; // initialize the best distance to max of floating point | 212 | T best_distance = FLT_MAX; // initialize the best distance to max of floating point |
203 | size_t best_index = 0; | 213 | size_t best_index = 0; |
@@ -240,6 +250,7 @@ namespace stim { | @@ -240,6 +250,7 @@ namespace stim { | ||
240 | *index = best_index; | 250 | *index = best_index; |
241 | *distance = best_distance; | 251 | *distance = best_distance; |
242 | } | 252 | } |
253 | + | ||
243 | void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) { | 254 | void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) { |
244 | /// first convert the input query point into specific type | 255 | /// first convert the input query point into specific type |
245 | kdtree::point<T, D> query; | 256 | kdtree::point<T, D> query; |
@@ -303,6 +314,7 @@ namespace stim { | @@ -303,6 +314,7 @@ namespace stim { | ||
303 | } | 314 | } |
304 | return distance; | 315 | return distance; |
305 | } | 316 | } |
317 | + | ||
306 | template <typename T, int D> | 318 | template <typename T, int D> |
307 | __device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) { | 319 | __device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) { |
308 | T best_distance = FLT_MAX; | 320 | T best_distance = FLT_MAX; |
@@ -332,6 +344,7 @@ namespace stim { | @@ -332,6 +344,7 @@ namespace stim { | ||
332 | *d_distance = best_distance; | 344 | *d_distance = best_distance; |
333 | *d_index = best_index; | 345 | *d_index = best_index; |
334 | } | 346 | } |
347 | + | ||
335 | template <typename T, int D> | 348 | template <typename T, int D> |
336 | __device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { | 349 | __device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { |
337 | T best_distance = FLT_MAX; | 350 | T best_distance = FLT_MAX; |
@@ -390,6 +403,7 @@ namespace stim { | @@ -390,6 +403,7 @@ namespace stim { | ||
390 | *d_distance = best_distance; | 403 | *d_distance = best_distance; |
391 | *d_index = best_index; | 404 | *d_index = best_index; |
392 | } | 405 | } |
406 | + | ||
393 | template <typename T, int D> | 407 | template <typename T, int D> |
394 | __device__ void search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { | 408 | __device__ void search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { |
395 | int best_node = 0; | 409 | int best_node = 0; |
@@ -422,6 +436,7 @@ namespace stim { | @@ -422,6 +436,7 @@ namespace stim { | ||
422 | *d_distance = sqrt(best_distance); | 436 | *d_distance = sqrt(best_distance); |
423 | *d_index = best_index; | 437 | *d_index = best_index; |
424 | } | 438 | } |
439 | + | ||
425 | template <typename T, int D> | 440 | template <typename T, int D> |
426 | __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { | 441 | __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { |
427 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x; | 442 | size_t idx = blockIdx.x * blockDim.x + threadIdx.x; |
@@ -429,6 +444,41 @@ namespace stim { | @@ -429,6 +444,41 @@ namespace stim { | ||
429 | 444 | ||
430 | search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent | 445 | search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent |
431 | } | 446 | } |
447 | + | ||
448 | + template <typename T, int D> | ||
449 | + void search_stream(cuda_kdnode<T> *d_nodes, size_t *d_index, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *query_stream_points, size_t stream_count, size_t *indices, T *distances) { | ||
450 | + unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count); | ||
451 | + unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 1 : 0)); | ||
452 | + | ||
453 | + kdtree::point<T, D> *d_query_points; | ||
454 | + size_t *d_indices; | ||
455 | + T *d_distances; | ||
456 | + | ||
457 | + int *next_nodes; | ||
458 | + int *next_search_nodes; | ||
459 | + | ||
460 | + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * stream_count * D)); | ||
461 | + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * stream_count)); | ||
462 | + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * stream_count)); | ||
463 | + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); | ||
464 | + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); | ||
465 | + HANDLE_ERROR(cudaMemcpy(d_query_points, query_stream_points, sizeof(T) * stream_count * D, cudaMemcpyHostToDevice)); | ||
466 | + | ||
467 | + int *Judge = NULL; | ||
468 | + | ||
469 | + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, stream_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); | ||
470 | + | ||
471 | + if(Judge == NULL) { | ||
472 | + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * stream_count, cudaMemcpyDeviceToHost)); | ||
473 | + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * stream_count, cudaMemcpyDeviceToHost)); | ||
474 | + } | ||
475 | + | ||
476 | + HANDLE_ERROR(cudaFree(next_nodes)); | ||
477 | + HANDLE_ERROR(cudaFree(next_search_nodes)); | ||
478 | + HANDLE_ERROR(cudaFree(d_query_points)); | ||
479 | + HANDLE_ERROR(cudaFree(d_indices)); | ||
480 | + HANDLE_ERROR(cudaFree(d_distances)); | ||
481 | + } | ||
432 | 482 | ||
433 | template <typename T, int D = 3> | 483 | template <typename T, int D = 3> |
434 | class cuda_kdtree { | 484 | class cuda_kdtree { |
@@ -457,7 +507,7 @@ namespace stim { | @@ -457,7 +507,7 @@ namespace stim { | ||
457 | //bb.init(&h_reference_points[0]); | 507 | //bb.init(&h_reference_points[0]); |
458 | //aaboundingboxing<T, D>(bb, h_reference_points, reference_count); | 508 | //aaboundingboxing<T, D>(bb, h_reference_points, reference_count); |
459 | 509 | ||
460 | - std::vector < typename kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way | 510 | + std::vector < typename kdtree::point<T, D>> reference_points(reference_count); // restore the reference points in particular way |
461 | for (size_t j = 0; j < reference_count; j++) | 511 | for (size_t j = 0; j < reference_count; j++) |
462 | for (size_t i = 0; i < D; i++) | 512 | for (size_t i = 0; i < D; i++) |
463 | reference_points[j].dim[i] = h_reference_points[j * D + i]; | 513 | reference_points[j].dim[i] = h_reference_points[j * D + i]; |
@@ -509,7 +559,7 @@ namespace stim { | @@ -509,7 +559,7 @@ namespace stim { | ||
509 | } | 559 | } |
510 | HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice)); | 560 | HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice)); |
511 | HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); | 561 | HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); |
512 | - HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_points.size(), cudaMemcpyHostToDevice)); | 562 | + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_count, cudaMemcpyHostToDevice)); |
513 | } | 563 | } |
514 | 564 | ||
515 | /// Search the KD tree for nearest neighbors to a set of specified query points | 565 | /// Search the KD tree for nearest neighbors to a set of specified query points |
@@ -523,37 +573,57 @@ namespace stim { | @@ -523,37 +573,57 @@ namespace stim { | ||
523 | for (size_t i = 0; i < D; i++) | 573 | for (size_t i = 0; i < D; i++) |
524 | query_points[j].dim[i] = h_query_points[j * D + i]; | 574 | query_points[j].dim[i] = h_query_points[j * D + i]; |
525 | 575 | ||
526 | - unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size()); | ||
527 | - unsigned int blocks = (unsigned int)(query_points.size() / threads + (query_points.size() % threads ? 1 : 0)); | 576 | + cudaDeviceProp prop; |
577 | + cudaGetDeviceProperties(&prop, 0); | ||
578 | + | ||
579 | + size_t query_memory = D * sizeof(T) * query_count; | ||
580 | + size_t N = 3 * query_memory / prop.totalGlobalMem; //consider index and distance, roughly 3 times | ||
581 | + if (N > 1) { | ||
582 | + N++; | ||
583 | + size_t stream_count = query_count / N; | ||
584 | + for (size_t n = 0; n < N; n++) { | ||
585 | + size_t query_stream_start = n * stream_count; | ||
586 | + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_stream_start], stream_count, &indices[query_stream_start], &distances[query_stream_start]); | ||
587 | + } | ||
588 | + size_t stream_remain_count = query_count - N * stream_count; | ||
589 | + if (stream_remain_count > 0) { | ||
590 | + size_t query_remain_start = N * stream_count; | ||
591 | + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_remain_start], stream_remain_count, &indices[query_remain_start], &distances[query_remain_start]); | ||
592 | + } | ||
593 | + } | ||
594 | + else { | ||
595 | + unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count); | ||
596 | + unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0)); | ||
528 | 597 | ||
529 | - kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu | ||
530 | - size_t *d_indices; | ||
531 | - T *d_distances; | 598 | + kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu |
599 | + size_t *d_indices; | ||
600 | + T *d_distances; | ||
532 | 601 | ||
533 | - int *next_nodes; // create two STACK-like array | ||
534 | - int *next_search_nodes; | 602 | + int *next_nodes; // create two STACK-like array |
603 | + int *next_search_nodes; | ||
535 | 604 | ||
536 | - int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory | 605 | + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory |
537 | 606 | ||
538 | - HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_points.size() * D)); | ||
539 | - HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size())); | ||
540 | - HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size())); | ||
541 | - HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to | ||
542 | - HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); | ||
543 | - HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_points.size() * D, cudaMemcpyHostToDevice)); | ||
544 | - | ||
545 | - search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge); | ||
546 | - | ||
547 | - if (Judge == NULL) { // do the following work if the thread works safely | ||
548 | - HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost)); | ||
549 | - HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost)); | ||
550 | - } | 607 | + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_count * D)); |
608 | + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_count)); | ||
609 | + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_count)); | ||
610 | + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to | ||
611 | + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); | ||
612 | + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_count * D, cudaMemcpyHostToDevice)); | ||
613 | + | ||
614 | + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); | ||
615 | + | ||
616 | + if (Judge == NULL) { // do the following work if the thread works safely | ||
617 | + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_count, cudaMemcpyDeviceToHost)); | ||
618 | + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_count, cudaMemcpyDeviceToHost)); | ||
619 | + } | ||
551 | 620 | ||
552 | - HANDLE_ERROR(cudaFree(next_nodes)); | ||
553 | - HANDLE_ERROR(cudaFree(next_search_nodes)); | ||
554 | - HANDLE_ERROR(cudaFree(d_query_points)); | ||
555 | - HANDLE_ERROR(cudaFree(d_indices)); | ||
556 | - HANDLE_ERROR(cudaFree(d_distances)); | 621 | + HANDLE_ERROR(cudaFree(next_nodes)); |
622 | + HANDLE_ERROR(cudaFree(next_search_nodes)); | ||
623 | + HANDLE_ERROR(cudaFree(d_query_points)); | ||
624 | + HANDLE_ERROR(cudaFree(d_indices)); | ||
625 | + HANDLE_ERROR(cudaFree(d_distances)); | ||
626 | + } | ||
557 | } | 627 | } |
558 | 628 | ||
559 | /// Return the number of points in the KD tree | 629 | /// Return the number of points in the KD tree |