Commit bd574305b41cb1a39409ca8c4101dd365e50089d

Authored by Jiaming Guo
1 parent 9b563709

add stream to kdtree::search in order to stream the query points

Showing 1 changed file with 99 additions and 29 deletions   Show diff stats
stim/structures/kdtree.cuh
@@ -63,6 +63,7 @@ namespace stim {
63 cur_tree_ptr = this; // create a class pointer points to the current class value 63 cur_tree_ptr = this; // create a class pointer points to the current class value
64 n_id = 0; // set total number of points to default 0 64 n_id = 0; // set total number of points to default 0
65 } 65 }
  66 +
66 ~cpu_kdtree() { // destructor of cpu_kdtree 67 ~cpu_kdtree() { // destructor of cpu_kdtree
67 std::vector <kdtree::kdnode<T>*> next_nodes; 68 std::vector <kdtree::kdnode<T>*> next_nodes;
68 next_nodes.push_back(root); 69 next_nodes.push_back(root);
@@ -81,6 +82,7 @@ namespace stim {
81 } 82 }
82 root = NULL; 83 root = NULL;
83 } 84 }
  85 +
84 void cpu_create(std::vector < typename kdtree::point<T, D> > &reference_points, size_t max_levels) { 86 void cpu_create(std::vector < typename kdtree::point<T, D> > &reference_points, size_t max_levels) {
85 tmp_points = &reference_points; 87 tmp_points = &reference_points;
86 root = new kdtree::kdnode<T>(); // initializing the root node 88 root = new kdtree::kdnode<T>(); // initializing the root node
@@ -121,10 +123,12 @@ namespace stim {
121 next_nodes = next_search_nodes; // go deeper within the tree 123 next_nodes = next_search_nodes; // go deeper within the tree
122 } 124 }
123 } 125 }
  126 +
124 static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort 127 static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort
125 std::vector < typename kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer 128 std::vector < typename kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer
126 return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis]; 129 return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis];
127 } 130 }
  131 +
128 void split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) { 132 void split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) {
129 std::vector < typename kdtree::point<T, D> > &pts = *tmp_points; 133 std::vector < typename kdtree::point<T, D> > &pts = *tmp_points;
130 current_axis = cur->level % D; // indicate the judicative dimension or axis 134 current_axis = cur->level % D; // indicate the judicative dimension or axis
@@ -145,6 +149,7 @@ namespace stim {
145 right->indices.push_back(idx); 149 right->indices.push_back(idx);
146 } 150 }
147 } 151 }
  152 +
148 void create(T *h_reference_points, size_t reference_count, size_t max_levels) { 153 void create(T *h_reference_points, size_t reference_count, size_t max_levels) {
149 std::vector < typename kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way 154 std::vector < typename kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way
150 for (size_t j = 0; j < reference_count; j++) 155 for (size_t j = 0; j < reference_count; j++)
@@ -153,13 +158,16 @@ namespace stim {
153 cpu_create(reference_points, max_levels); 158 cpu_create(reference_points, max_levels);
154 cpu_tmp_points = *tmp_points; 159 cpu_tmp_points = *tmp_points;
155 } 160 }
  161 +
156 int get_num_nodes() const { // get the total number of nodes 162 int get_num_nodes() const { // get the total number of nodes
157 return n_id; 163 return n_id;
158 } 164 }
  165 +
159 kdtree::kdnode<T>* get_root() const { // get the root node of tree 166 kdtree::kdnode<T>* get_root() const { // get the root node of tree
160 return root; 167 return root;
161 } 168 }
162 - T cpu_distance(const kdtree::point<T, D> &a, const kdtree::point<T, D> &b) { 169 +
  170 + T cpu_distance(const kdtree::point<T, D> &a, const kdtree::point<T, D> &b) {
163 T distance = 0; 171 T distance = 0;
164 172
165 for (size_t i = 0; i < D; i++) { 173 for (size_t i = 0; i < D; i++) {
@@ -168,6 +176,7 @@ namespace stim {
168 } 176 }
169 return distance; 177 return distance;
170 } 178 }
  179 +
171 void cpu_search_at_node(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, size_t *index, T *distance, kdtree::kdnode<T> **node) { 180 void cpu_search_at_node(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, size_t *index, T *distance, kdtree::kdnode<T> **node) {
172 T best_distance = FLT_MAX; // initialize the best distance to max of floating point 181 T best_distance = FLT_MAX; // initialize the best distance to max of floating point
173 size_t best_index = 0; 182 size_t best_index = 0;
@@ -198,6 +207,7 @@ namespace stim {
198 *index = best_index; 207 *index = best_index;
199 *distance = best_distance; 208 *distance = best_distance;
200 } 209 }
  210 +
201 void cpu_search_at_node_range(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, T range, size_t *index, T *distance) { 211 void cpu_search_at_node_range(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, T range, size_t *index, T *distance) {
202 T best_distance = FLT_MAX; // initialize the best distance to max of floating point 212 T best_distance = FLT_MAX; // initialize the best distance to max of floating point
203 size_t best_index = 0; 213 size_t best_index = 0;
@@ -240,6 +250,7 @@ namespace stim {
240 *index = best_index; 250 *index = best_index;
241 *distance = best_distance; 251 *distance = best_distance;
242 } 252 }
  253 +
243 void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) { 254 void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) {
244 /// first convert the input query point into specific type 255 /// first convert the input query point into specific type
245 kdtree::point<T, D> query; 256 kdtree::point<T, D> query;
@@ -303,6 +314,7 @@ namespace stim {
303 } 314 }
304 return distance; 315 return distance;
305 } 316 }
  317 +
306 template <typename T, int D> 318 template <typename T, int D>
307 __device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) { 319 __device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
308 T best_distance = FLT_MAX; 320 T best_distance = FLT_MAX;
@@ -332,6 +344,7 @@ namespace stim {
332 *d_distance = best_distance; 344 *d_distance = best_distance;
333 *d_index = best_index; 345 *d_index = best_index;
334 } 346 }
  347 +
335 template <typename T, int D> 348 template <typename T, int D>
336 __device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { 349 __device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
337 T best_distance = FLT_MAX; 350 T best_distance = FLT_MAX;
@@ -390,6 +403,7 @@ namespace stim {
390 *d_distance = best_distance; 403 *d_distance = best_distance;
391 *d_index = best_index; 404 *d_index = best_index;
392 } 405 }
  406 +
393 template <typename T, int D> 407 template <typename T, int D>
394 __device__ void search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { 408 __device__ void search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
395 int best_node = 0; 409 int best_node = 0;
@@ -422,6 +436,7 @@ namespace stim {
422 *d_distance = sqrt(best_distance); 436 *d_distance = sqrt(best_distance);
423 *d_index = best_index; 437 *d_index = best_index;
424 } 438 }
  439 +
425 template <typename T, int D> 440 template <typename T, int D>
426 __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { 441 __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) {
427 size_t idx = blockIdx.x * blockDim.x + threadIdx.x; 442 size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -429,6 +444,41 @@ namespace stim {
429 444
430 search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent 445 search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent
431 } 446 }
  447 +
  448 + template <typename T, int D>
  449 + void search_stream(cuda_kdnode<T> *d_nodes, size_t *d_index, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *query_stream_points, size_t stream_count, size_t *indices, T *distances) {
  450 + unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count);
  451 + unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 1 : 0));
  452 +
  453 + kdtree::point<T, D> *d_query_points;
  454 + size_t *d_indices;
  455 + T *d_distances;
  456 +
  457 + int *next_nodes;
  458 + int *next_search_nodes;
  459 +
  460 + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * stream_count * D));
  461 + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * stream_count));
  462 + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * stream_count));
  463 + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int)));
  464 + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));
  465 + HANDLE_ERROR(cudaMemcpy(d_query_points, query_stream_points, sizeof(T) * stream_count * D, cudaMemcpyHostToDevice));
  466 +
  467 + int *Judge = NULL;
  468 +
  469 + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, stream_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge);
  470 +
  471 + if(Judge == NULL) {
  472 + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * stream_count, cudaMemcpyDeviceToHost));
  473 + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * stream_count, cudaMemcpyDeviceToHost));
  474 + }
  475 +
  476 + HANDLE_ERROR(cudaFree(next_nodes));
  477 + HANDLE_ERROR(cudaFree(next_search_nodes));
  478 + HANDLE_ERROR(cudaFree(d_query_points));
  479 + HANDLE_ERROR(cudaFree(d_indices));
  480 + HANDLE_ERROR(cudaFree(d_distances));
  481 + }
432 482
433 template <typename T, int D = 3> 483 template <typename T, int D = 3>
434 class cuda_kdtree { 484 class cuda_kdtree {
@@ -457,7 +507,7 @@ namespace stim {
457 //bb.init(&h_reference_points[0]); 507 //bb.init(&h_reference_points[0]);
458 //aaboundingboxing<T, D>(bb, h_reference_points, reference_count); 508 //aaboundingboxing<T, D>(bb, h_reference_points, reference_count);
459 509
460 - std::vector < typename kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way 510 + std::vector <kdtree::point<T, D>> reference_points(reference_count); // restore the reference points in particular way
461 for (size_t j = 0; j < reference_count; j++) 511 for (size_t j = 0; j < reference_count; j++)
462 for (size_t i = 0; i < D; i++) 512 for (size_t i = 0; i < D; i++)
463 reference_points[j].dim[i] = h_reference_points[j * D + i]; 513 reference_points[j].dim[i] = h_reference_points[j * D + i];
@@ -509,7 +559,7 @@ namespace stim {
509 } 559 }
510 HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice)); 560 HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice));
511 HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); 561 HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
512 - HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_points.size(), cudaMemcpyHostToDevice)); 562 + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_count, cudaMemcpyHostToDevice));
513 } 563 }
514 564
515 /// Search the KD tree for nearest neighbors to a set of specified query points 565 /// Search the KD tree for nearest neighbors to a set of specified query points
@@ -523,37 +573,57 @@ namespace stim {
523 for (size_t i = 0; i < D; i++) 573 for (size_t i = 0; i < D; i++)
524 query_points[j].dim[i] = h_query_points[j * D + i]; 574 query_points[j].dim[i] = h_query_points[j * D + i];
525 575
526 - unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size());  
527 - unsigned int blocks = (unsigned int)(query_points.size() / threads + (query_points.size() % threads ? 1 : 0)); 576 + cudaDeviceProp prop;
  577 + cudaGetDeviceProperties(&prop, 0);
  578 +
  579 + size_t query_memory = D * sizeof(T) * query_count;
  580 + size_t N = 3 * query_memory / prop.totalGlobalMem; //consider index and distance, roughly 3 times
  581 + if (N > 1) {
  582 + N++;
  583 + size_t stream_count = query_count / N;
  584 + for (size_t n = 0; n < N; n++) {
  585 + size_t query_stream_start = n * stream_count;
  586 + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_stream_start], stream_count, &indices[query_stream_start], &distances[query_stream_start]);
  587 + }
  588 + size_t stream_remain_count = query_count - N * stream_count;
  589 + if (stream_remain_count > 0) {
  590 + size_t query_remain_start = N * stream_count;
  591 + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_remain_start], stream_remain_count, &indices[query_remain_start], &distances[query_remain_start]);
  592 + }
  593 + }
  594 + else {
  595 + unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count);
  596 + unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0));
528 597
529 - kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu  
530 - size_t *d_indices;  
531 - T *d_distances; 598 + kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu
  599 + size_t *d_indices;
  600 + T *d_distances;
532 601
533 - int *next_nodes; // create two STACK-like array  
534 - int *next_search_nodes; 602 + int *next_nodes; // create two STACK-like array
  603 + int *next_search_nodes;
535 604
536 - int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory 605 + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory
537 606
538 - HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_points.size() * D));  
539 - HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size()));  
540 - HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size()));  
541 - HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to  
542 - HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));  
543 - HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_points.size() * D, cudaMemcpyHostToDevice));  
544 -  
545 - search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge);  
546 -  
547 - if (Judge == NULL) { // do the following work if the thread works safely  
548 - HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost));  
549 - HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost));  
550 - } 607 + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_count * D));
  608 + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_count));
  609 + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_count));
  610 + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to
  611 + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));
  612 + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_count * D, cudaMemcpyHostToDevice));
  613 +
  614 + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge);
  615 +
  616 + if (Judge == NULL) { // do the following work if the thread works safely
  617 + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_count, cudaMemcpyDeviceToHost));
  618 + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_count, cudaMemcpyDeviceToHost));
  619 + }
551 620
552 - HANDLE_ERROR(cudaFree(next_nodes));  
553 - HANDLE_ERROR(cudaFree(next_search_nodes));  
554 - HANDLE_ERROR(cudaFree(d_query_points));  
555 - HANDLE_ERROR(cudaFree(d_indices));  
556 - HANDLE_ERROR(cudaFree(d_distances)); 621 + HANDLE_ERROR(cudaFree(next_nodes));
  622 + HANDLE_ERROR(cudaFree(next_search_nodes));
  623 + HANDLE_ERROR(cudaFree(d_query_points));
  624 + HANDLE_ERROR(cudaFree(d_indices));
  625 + HANDLE_ERROR(cudaFree(d_distances));
  626 + }
557 } 627 }
558 628
559 /// Return the number of points in the KD tree 629 /// Return the number of points in the KD tree