From bd574305b41cb1a39409ca8c4101dd365e50089d Mon Sep 17 00:00:00 2001 From: Jiaming Guo Date: Fri, 9 Dec 2016 15:48:12 -0600 Subject: [PATCH] add stream to kdtree::search in order to stream the query points --- stim/structures/kdtree.cuh | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 99 insertions(+), 29 deletions(-) diff --git a/stim/structures/kdtree.cuh b/stim/structures/kdtree.cuh index 9d401ce..b0e34df 100644 --- a/stim/structures/kdtree.cuh +++ b/stim/structures/kdtree.cuh @@ -63,6 +63,7 @@ namespace stim { cur_tree_ptr = this; // create a class pointer points to the current class value n_id = 0; // set total number of points to default 0 } + ~cpu_kdtree() { // destructor of cpu_kdtree std::vector *> next_nodes; next_nodes.push_back(root); @@ -81,6 +82,7 @@ namespace stim { } root = NULL; } + void cpu_create(std::vector < typename kdtree::point > &reference_points, size_t max_levels) { tmp_points = &reference_points; root = new kdtree::kdnode(); // initializing the root node @@ -121,10 +123,12 @@ namespace stim { next_nodes = next_search_nodes; // go deeper within the tree } } + static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort std::vector < typename kdtree::point > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis]; } + void split(kdtree::kdnode *cur, kdtree::kdnode *left, kdtree::kdnode *right) { std::vector < typename kdtree::point > &pts = *tmp_points; current_axis = cur->level % D; // indicate the judicative dimension or axis @@ -145,6 +149,7 @@ namespace stim { right->indices.push_back(idx); } } + void create(T *h_reference_points, size_t reference_count, size_t max_levels) { std::vector < typename kdtree::point > reference_points(reference_count); // restore the reference points in particular way for (size_t j = 0; j < reference_count; j++) @@ -153,13 +158,16 @@ namespace stim { cpu_create(reference_points, max_levels); cpu_tmp_points = *tmp_points; } + int get_num_nodes() const { // get the total number of nodes return n_id; } + kdtree::kdnode* get_root() const { // get the root node of tree return root; } - T cpu_distance(const kdtree::point &a, const kdtree::point &b) { + + T cpu_distance(const kdtree::point &a, const kdtree::point &b) { T distance = 0; for (size_t i = 0; i < D; i++) { @@ -168,6 +176,7 @@ namespace stim { } return distance; } + void cpu_search_at_node(kdtree::kdnode *cur, const kdtree::point &query, size_t *index, T *distance, kdtree::kdnode **node) { T best_distance = FLT_MAX; // initialize the best distance to max of floating point size_t best_index = 0; @@ -198,6 +207,7 @@ namespace stim { *index = best_index; *distance = best_distance; } + void cpu_search_at_node_range(kdtree::kdnode *cur, const kdtree::point &query, T range, size_t *index, T *distance) { T best_distance = FLT_MAX; // initialize the best distance to max of floating point size_t best_index = 0; @@ -240,6 +250,7 @@ namespace stim { *index = best_index; *distance = best_distance; } + void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) { /// first convert the input query point into specific type kdtree::point query; @@ -303,6 +314,7 @@ namespace stim { } return distance; } + template __device__ void search_at_node(cuda_kdnode *nodes, size_t *indices, kdtree::point *d_reference_points, int cur, kdtree::point &d_query_point, size_t *d_index, T *d_distance, int *d_node) { T best_distance = FLT_MAX; @@ -332,6 +344,7 @@ namespace stim { *d_distance = best_distance; *d_index = best_index; } + template __device__ void search_at_node_range(cuda_kdnode *nodes, size_t *indices, kdtree::point *d_reference_points, kdtree::point &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { T best_distance = FLT_MAX; @@ -390,6 +403,7 @@ namespace stim { *d_distance = best_distance; *d_index = best_index; } + template __device__ void search(cuda_kdnode *nodes, size_t *indices, kdtree::point *d_reference_points, kdtree::point &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { int best_node = 0; @@ -422,6 +436,7 @@ namespace stim { *d_distance = sqrt(best_distance); *d_index = best_index; } + template __global__ void search_batch(cuda_kdnode *nodes, size_t *indices, kdtree::point *d_reference_points, kdtree::point *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -429,6 +444,41 @@ namespace stim { search(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent } + + template + void search_stream(cuda_kdnode *d_nodes, size_t *d_index, kdtree::point *d_reference_points, kdtree::point *query_stream_points, size_t stream_count, size_t *indices, T *distances) { + unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count); + unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 1 : 0)); + + kdtree::point *d_query_points; + size_t *d_indices; + T *d_distances; + + int *next_nodes; + int *next_search_nodes; + + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * stream_count * D)); + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * stream_count)); + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * stream_count)); + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); + HANDLE_ERROR(cudaMemcpy(d_query_points, query_stream_points, sizeof(T) * stream_count * D, cudaMemcpyHostToDevice)); + + int *Judge = NULL; + + search_batch<<>> (d_nodes, d_index, d_reference_points, d_query_points, stream_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); + + if(Judge == NULL) { + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * stream_count, cudaMemcpyDeviceToHost)); + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * stream_count, cudaMemcpyDeviceToHost)); + } + + HANDLE_ERROR(cudaFree(next_nodes)); + HANDLE_ERROR(cudaFree(next_search_nodes)); + HANDLE_ERROR(cudaFree(d_query_points)); + HANDLE_ERROR(cudaFree(d_indices)); + HANDLE_ERROR(cudaFree(d_distances)); + } template class cuda_kdtree { @@ -457,7 +507,7 @@ namespace stim { //bb.init(&h_reference_points[0]); //aaboundingboxing(bb, h_reference_points, reference_count); - std::vector < typename kdtree::point > reference_points(reference_count); // restore the reference points in particular way + std::vector > reference_points(reference_count); // restore the reference points in particular way for (size_t j = 0; j < reference_count; j++) for (size_t i = 0; i < D; i++) reference_points[j].dim[i] = h_reference_points[j * D + i]; @@ -509,7 +559,7 @@ namespace stim { } HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode) * tmp_nodes.size(), cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); - HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point) * reference_points.size(), cudaMemcpyHostToDevice)); + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point) * reference_count, cudaMemcpyHostToDevice)); } /// Search the KD tree for nearest neighbors to a set of specified query points @@ -523,37 +573,57 @@ namespace stim { for (size_t i = 0; i < D; i++) query_points[j].dim[i] = h_query_points[j * D + i]; - unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size()); - unsigned int blocks = (unsigned int)(query_points.size() / threads + (query_points.size() % threads ? 1 : 0)); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + + size_t query_memory = D * sizeof(T) * query_count; + size_t N = 3 * query_memory / prop.totalGlobalMem; //consider index and distance, roughly 3 times + if (N > 1) { + N++; + size_t stream_count = query_count / N; + for (size_t n = 0; n < N; n++) { + size_t query_stream_start = n * stream_count; + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_stream_start], stream_count, &indices[query_stream_start], &distances[query_stream_start]); + } + size_t stream_remain_count = query_count - N * stream_count; + if (stream_remain_count > 0) { + size_t query_remain_start = N * stream_count; + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_remain_start], stream_remain_count, &indices[query_remain_start], &distances[query_remain_start]); + } + } + else { + unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count); + unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0)); - kdtree::point *d_query_points; // create a pointer pointing to query points on gpu - size_t *d_indices; - T *d_distances; + kdtree::point *d_query_points; // create a pointer pointing to query points on gpu + size_t *d_indices; + T *d_distances; - int *next_nodes; // create two STACK-like array - int *next_search_nodes; + int *next_nodes; // create two STACK-like array + int *next_search_nodes; - int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory - HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_points.size() * D)); - HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size())); - HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size())); - HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to - HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); - HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_points.size() * D, cudaMemcpyHostToDevice)); - - search_batch<<>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge); - - if (Judge == NULL) { // do the following work if the thread works safely - HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost)); - HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost)); - } + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_count * D)); + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_count)); + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_count)); + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_count * D, cudaMemcpyHostToDevice)); + + search_batch<<>> (d_nodes, d_index, d_reference_points, d_query_points, query_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); + + if (Judge == NULL) { // do the following work if the thread works safely + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_count, cudaMemcpyDeviceToHost)); + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_count, cudaMemcpyDeviceToHost)); + } - HANDLE_ERROR(cudaFree(next_nodes)); - HANDLE_ERROR(cudaFree(next_search_nodes)); - HANDLE_ERROR(cudaFree(d_query_points)); - HANDLE_ERROR(cudaFree(d_indices)); - HANDLE_ERROR(cudaFree(d_distances)); + HANDLE_ERROR(cudaFree(next_nodes)); + HANDLE_ERROR(cudaFree(next_search_nodes)); + HANDLE_ERROR(cudaFree(d_query_points)); + HANDLE_ERROR(cudaFree(d_indices)); + HANDLE_ERROR(cudaFree(d_distances)); + } } /// Return the number of points in the KD tree -- libgit2 0.21.4