Commit f36421245ddcb0ad90020be734c268eb60b3ab4b

Authored by David Mayerich
2 parents 9b563709 9dc73a42

Merge branch 'JACK' into 'master'

add stream to kdtree::search

I ran the benchmark test with my data.
Please run the test with your data to see whether it works.
If you still encounter the same problem, please let me know!

See merge request !16
Showing 1 changed file with 99 additions and 29 deletions   Show diff stats
stim/structures/kdtree.cuh
@@ -63,6 +63,7 @@ namespace stim {
63 cur_tree_ptr = this; // create a class pointer points to the current class value 63 cur_tree_ptr = this; // create a class pointer points to the current class value
64 n_id = 0; // set total number of points to default 0 64 n_id = 0; // set total number of points to default 0
65 } 65 }
  66 +
66 ~cpu_kdtree() { // destructor of cpu_kdtree 67 ~cpu_kdtree() { // destructor of cpu_kdtree
67 std::vector <kdtree::kdnode<T>*> next_nodes; 68 std::vector <kdtree::kdnode<T>*> next_nodes;
68 next_nodes.push_back(root); 69 next_nodes.push_back(root);
@@ -81,6 +82,7 @@ namespace stim {
81 } 82 }
82 root = NULL; 83 root = NULL;
83 } 84 }
  85 +
84 void cpu_create(std::vector < typename kdtree::point<T, D> > &reference_points, size_t max_levels) { 86 void cpu_create(std::vector < typename kdtree::point<T, D> > &reference_points, size_t max_levels) {
85 tmp_points = &reference_points; 87 tmp_points = &reference_points;
86 root = new kdtree::kdnode<T>(); // initializing the root node 88 root = new kdtree::kdnode<T>(); // initializing the root node
@@ -121,10 +123,12 @@ namespace stim {
121 next_nodes = next_search_nodes; // go deeper within the tree 123 next_nodes = next_search_nodes; // go deeper within the tree
122 } 124 }
123 } 125 }
  126 +
124 static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort 127 static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort
125 std::vector < typename kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer 128 std::vector < typename kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer
126 return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis]; 129 return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis];
127 } 130 }
  131 +
128 void split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) { 132 void split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) {
129 std::vector < typename kdtree::point<T, D> > &pts = *tmp_points; 133 std::vector < typename kdtree::point<T, D> > &pts = *tmp_points;
130 current_axis = cur->level % D; // indicate the judicative dimension or axis 134 current_axis = cur->level % D; // indicate the judicative dimension or axis
@@ -145,6 +149,7 @@ namespace stim {
145 right->indices.push_back(idx); 149 right->indices.push_back(idx);
146 } 150 }
147 } 151 }
  152 +
148 void create(T *h_reference_points, size_t reference_count, size_t max_levels) { 153 void create(T *h_reference_points, size_t reference_count, size_t max_levels) {
149 std::vector < typename kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way 154 std::vector < typename kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way
150 for (size_t j = 0; j < reference_count; j++) 155 for (size_t j = 0; j < reference_count; j++)
@@ -153,13 +158,16 @@ namespace stim {
153 cpu_create(reference_points, max_levels); 158 cpu_create(reference_points, max_levels);
154 cpu_tmp_points = *tmp_points; 159 cpu_tmp_points = *tmp_points;
155 } 160 }
  161 +
156 int get_num_nodes() const { // get the total number of nodes 162 int get_num_nodes() const { // get the total number of nodes
157 return n_id; 163 return n_id;
158 } 164 }
  165 +
159 kdtree::kdnode<T>* get_root() const { // get the root node of tree 166 kdtree::kdnode<T>* get_root() const { // get the root node of tree
160 return root; 167 return root;
161 } 168 }
162 - T cpu_distance(const kdtree::point<T, D> &a, const kdtree::point<T, D> &b) { 169 +
  170 + T cpu_distance(const kdtree::point<T, D> &a, const kdtree::point<T, D> &b) {
163 T distance = 0; 171 T distance = 0;
164 172
165 for (size_t i = 0; i < D; i++) { 173 for (size_t i = 0; i < D; i++) {
@@ -168,6 +176,7 @@ namespace stim {
168 } 176 }
169 return distance; 177 return distance;
170 } 178 }
  179 +
171 void cpu_search_at_node(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, size_t *index, T *distance, kdtree::kdnode<T> **node) { 180 void cpu_search_at_node(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, size_t *index, T *distance, kdtree::kdnode<T> **node) {
172 T best_distance = FLT_MAX; // initialize the best distance to max of floating point 181 T best_distance = FLT_MAX; // initialize the best distance to max of floating point
173 size_t best_index = 0; 182 size_t best_index = 0;
@@ -198,6 +207,7 @@ namespace stim {
198 *index = best_index; 207 *index = best_index;
199 *distance = best_distance; 208 *distance = best_distance;
200 } 209 }
  210 +
201 void cpu_search_at_node_range(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, T range, size_t *index, T *distance) { 211 void cpu_search_at_node_range(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, T range, size_t *index, T *distance) {
202 T best_distance = FLT_MAX; // initialize the best distance to max of floating point 212 T best_distance = FLT_MAX; // initialize the best distance to max of floating point
203 size_t best_index = 0; 213 size_t best_index = 0;
@@ -240,6 +250,7 @@ namespace stim {
240 *index = best_index; 250 *index = best_index;
241 *distance = best_distance; 251 *distance = best_distance;
242 } 252 }
  253 +
243 void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) { 254 void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) {
244 /// first convert the input query point into specific type 255 /// first convert the input query point into specific type
245 kdtree::point<T, D> query; 256 kdtree::point<T, D> query;
@@ -303,6 +314,7 @@ namespace stim {
303 } 314 }
304 return distance; 315 return distance;
305 } 316 }
  317 +
306 template <typename T, int D> 318 template <typename T, int D>
307 __device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) { 319 __device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
308 T best_distance = FLT_MAX; 320 T best_distance = FLT_MAX;
@@ -332,6 +344,7 @@ namespace stim {
332 *d_distance = best_distance; 344 *d_distance = best_distance;
333 *d_index = best_index; 345 *d_index = best_index;
334 } 346 }
  347 +
335 template <typename T, int D> 348 template <typename T, int D>
336 __device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { 349 __device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
337 T best_distance = FLT_MAX; 350 T best_distance = FLT_MAX;
@@ -390,6 +403,7 @@ namespace stim {
390 *d_distance = best_distance; 403 *d_distance = best_distance;
391 *d_index = best_index; 404 *d_index = best_index;
392 } 405 }
  406 +
393 template <typename T, int D> 407 template <typename T, int D>
394 __device__ void search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { 408 __device__ void search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
395 int best_node = 0; 409 int best_node = 0;
@@ -422,6 +436,7 @@ namespace stim {
422 *d_distance = sqrt(best_distance); 436 *d_distance = sqrt(best_distance);
423 *d_index = best_index; 437 *d_index = best_index;
424 } 438 }
  439 +
425 template <typename T, int D> 440 template <typename T, int D>
426 __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { 441 __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) {
427 size_t idx = blockIdx.x * blockDim.x + threadIdx.x; 442 size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -429,6 +444,41 @@ namespace stim {
429 444
430 search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent 445 search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent
431 } 446 }
  447 +
  448 + template <typename T, int D>
  449 + void search_stream(cuda_kdnode<T> *d_nodes, size_t *d_index, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *query_stream_points, size_t stream_count, size_t *indices, T *distances) {
  450 + unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count);
  451 + unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 1 : 0));
  452 +
  453 + kdtree::point<T, D> *d_query_points;
  454 + size_t *d_indices;
  455 + T *d_distances;
  456 +
  457 + int *next_nodes;
  458 + int *next_search_nodes;
  459 +
  460 + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * stream_count * D));
  461 + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * stream_count));
  462 + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * stream_count));
  463 + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int)));
  464 + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));
  465 + HANDLE_ERROR(cudaMemcpy(d_query_points, query_stream_points, sizeof(T) * stream_count * D, cudaMemcpyHostToDevice));
  466 +
  467 + int *Judge = NULL;
  468 +
  469 + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, stream_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge);
  470 +
  471 + if(Judge == NULL) {
  472 + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * stream_count, cudaMemcpyDeviceToHost));
  473 + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * stream_count, cudaMemcpyDeviceToHost));
  474 + }
  475 +
  476 + HANDLE_ERROR(cudaFree(next_nodes));
  477 + HANDLE_ERROR(cudaFree(next_search_nodes));
  478 + HANDLE_ERROR(cudaFree(d_query_points));
  479 + HANDLE_ERROR(cudaFree(d_indices));
  480 + HANDLE_ERROR(cudaFree(d_distances));
  481 + }
432 482
433 template <typename T, int D = 3> 483 template <typename T, int D = 3>
434 class cuda_kdtree { 484 class cuda_kdtree {
@@ -457,7 +507,7 @@ namespace stim {
457 //bb.init(&h_reference_points[0]); 507 //bb.init(&h_reference_points[0]);
458 //aaboundingboxing<T, D>(bb, h_reference_points, reference_count); 508 //aaboundingboxing<T, D>(bb, h_reference_points, reference_count);
459 509
460 - std::vector < typename kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way 510 + std::vector < typename kdtree::point<T, D>> reference_points(reference_count); // restore the reference points in particular way
461 for (size_t j = 0; j < reference_count; j++) 511 for (size_t j = 0; j < reference_count; j++)
462 for (size_t i = 0; i < D; i++) 512 for (size_t i = 0; i < D; i++)
463 reference_points[j].dim[i] = h_reference_points[j * D + i]; 513 reference_points[j].dim[i] = h_reference_points[j * D + i];
@@ -509,7 +559,7 @@ namespace stim {
509 } 559 }
510 HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice)); 560 HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice));
511 HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); 561 HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
512 - HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_points.size(), cudaMemcpyHostToDevice)); 562 + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_count, cudaMemcpyHostToDevice));
513 } 563 }
514 564
515 /// Search the KD tree for nearest neighbors to a set of specified query points 565 /// Search the KD tree for nearest neighbors to a set of specified query points
@@ -523,37 +573,57 @@ namespace stim {
523 for (size_t i = 0; i < D; i++) 573 for (size_t i = 0; i < D; i++)
524 query_points[j].dim[i] = h_query_points[j * D + i]; 574 query_points[j].dim[i] = h_query_points[j * D + i];
525 575
526 - unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size());  
527 - unsigned int blocks = (unsigned int)(query_points.size() / threads + (query_points.size() % threads ? 1 : 0)); 576 + cudaDeviceProp prop;
  577 + cudaGetDeviceProperties(&prop, 0);
  578 +
  579 + size_t query_memory = D * sizeof(T) * query_count;
  580 + size_t N = 3 * query_memory / prop.totalGlobalMem; //consider index and distance, roughly 3 times
  581 + if (N > 1) {
  582 + N++;
  583 + size_t stream_count = query_count / N;
  584 + for (size_t n = 0; n < N; n++) {
  585 + size_t query_stream_start = n * stream_count;
  586 + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_stream_start], stream_count, &indices[query_stream_start], &distances[query_stream_start]);
  587 + }
  588 + size_t stream_remain_count = query_count - N * stream_count;
  589 + if (stream_remain_count > 0) {
  590 + size_t query_remain_start = N * stream_count;
  591 + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_remain_start], stream_remain_count, &indices[query_remain_start], &distances[query_remain_start]);
  592 + }
  593 + }
  594 + else {
  595 + unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count);
  596 + unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0));
528 597
529 - kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu  
530 - size_t *d_indices;  
531 - T *d_distances; 598 + kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu
  599 + size_t *d_indices;
  600 + T *d_distances;
532 601
533 - int *next_nodes; // create two STACK-like array  
534 - int *next_search_nodes; 602 + int *next_nodes; // create two STACK-like array
  603 + int *next_search_nodes;
535 604
536 - int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory 605 + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory
537 606
538 - HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_points.size() * D));  
539 - HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size()));  
540 - HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size()));  
541 - HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to  
542 - HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));  
543 - HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_points.size() * D, cudaMemcpyHostToDevice));  
544 -  
545 - search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge);  
546 -  
547 - if (Judge == NULL) { // do the following work if the thread works safely  
548 - HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost));  
549 - HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost));  
550 - } 607 + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_count * D));
  608 + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_count));
  609 + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_count));
  610 + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to
  611 + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));
  612 + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_count * D, cudaMemcpyHostToDevice));
  613 +
  614 + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge);
  615 +
  616 + if (Judge == NULL) { // do the following work if the thread works safely
  617 + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_count, cudaMemcpyDeviceToHost));
  618 + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_count, cudaMemcpyDeviceToHost));
  619 + }
551 620
552 - HANDLE_ERROR(cudaFree(next_nodes));  
553 - HANDLE_ERROR(cudaFree(next_search_nodes));  
554 - HANDLE_ERROR(cudaFree(d_query_points));  
555 - HANDLE_ERROR(cudaFree(d_indices));  
556 - HANDLE_ERROR(cudaFree(d_distances)); 621 + HANDLE_ERROR(cudaFree(next_nodes));
  622 + HANDLE_ERROR(cudaFree(next_search_nodes));
  623 + HANDLE_ERROR(cudaFree(d_query_points));
  624 + HANDLE_ERROR(cudaFree(d_indices));
  625 + HANDLE_ERROR(cudaFree(d_distances));
  626 + }
557 } 627 }
558 628
559 /// Return the number of points in the KD tree 629 /// Return the number of points in the KD tree