From 57914be592c16fa94a93e78a5eee4b1da56b0be3 Mon Sep 17 00:00:00 2001
From: Jiaming Guo
Date: Thu, 15 Dec 2016 12:04:01 -0600
Subject: [PATCH] new kdtree is ready.

---
 stim/structures/kdtree_dep.cuh | 627 ++++++++++++++++++++++++++++++++++++++++
 stim/structures/kdtreenew.cuh  | 627 ----------------------------------------
 2 files changed, 627 insertions(+), 627 deletions(-)
 create mode 100644 stim/structures/kdtree_dep.cuh
 delete mode 100644 stim/structures/kdtreenew.cuh

diff --git a/stim/structures/kdtree_dep.cuh b/stim/structures/kdtree_dep.cuh
new file mode 100644
index 0000000..093c71a
--- /dev/null
+++ b/stim/structures/kdtree_dep.cuh
@@ -0,0 +1,627 @@
+// the CUDA search stack size is currently set to 50 (see stack_size below); increase it if you intend to build a deeper tree
+// data should be stored in row-major order:
+// x1,x2,x3,x4,x5......
+// y1,y2,y3,y4,y5......
+// ....................
+// ....................
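// A minimal sketch of the flat point buffer that create() and search() below consume;
// the array name, size, and values here are hypothetical. The indexing
// h_reference_points[j * D + i] used later in this file reads the D coordinates of
// point j contiguously.
const int D_example = 3;                         // dimensionality used in this sketch only
float h_reference_points_example[4 * D_example] = {
    0.0f, 0.1f, 0.2f,                            // point 0: x, y, z
    1.0f, 1.1f, 1.2f,                            // point 1
    2.0f, 2.1f, 2.2f,                            // point 2
    3.0f, 3.1f, 3.2f                             // point 3
};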
+ +#ifndef KDTREE_H +#define KDTREE_H +#define stack_size 50 + +#include "device_launch_parameters.h" +#include +#include +#include "cuda_runtime.h" +#include +#include +#include +#include +#include +#include +#include + +namespace stim { + namespace cpu_kdtree { + template // typename refers to float or double while D refers to dimension of points + struct point { + T dim[D]; // create a structure to store every one input point + }; + + template + class cpu_kdnode { + public: + cpu_kdnode() { // constructor for initializing a kdnode + parent = NULL; // set every node's parent, left and right kdnode pointers to NULL + left = NULL; + right = NULL; + parent_idx = -1; // set parent node index to default -1 + left_idx = -1; + right_idx = -1; + split_value = -1; // set split_value to default -1 + } + int idx; // index of current node + int parent_idx, left_idx, right_idx; // index of parent, left and right nodes + cpu_kdnode *parent, *left, *right; // parent, left and right kdnodes + T split_value; // splitting value of current node + std::vector indices; // it indicates the points' indices that current node has + size_t level; // tree level of current node + }; + } // end of namespace cpu_kdtree + + template + struct cuda_kdnode { + int parent, left, right; + T split_value; + size_t num_index; // number of indices it has + int index; // the beginning index + size_t level; + }; + + template + __device__ T gpu_distance(cpu_kdtree::point &a, cpu_kdtree::point &b) { + T distance = 0; + + for (size_t i = 0; i < D; i++) { + T d = a.dim[i] - b.dim[i]; + distance += d*d; + } + return distance; + } + + template + __device__ void search_at_node(cuda_kdnode *nodes, size_t *indices, cpu_kdtree::point *d_reference_points, int cur, cpu_kdtree::point &d_query_point, size_t *d_index, T *d_distance, int *d_node) { + T best_distance = FLT_MAX; + size_t best_index = 0; + + while (true) { // break until reach the bottom + int split_axis = nodes[cur].level % D; + if (nodes[cur].left == -1) { // check whether it has left node or not + *d_node = cur; + for (int i = 0; i < nodes[cur].num_index; i++) { + size_t idx = indices[nodes[cur].index + i]; + T dist = gpu_distance(d_query_point, d_reference_points[idx]); + if (dist < best_distance) { + best_distance = dist; + best_index = idx; + } + } + break; + } + else if (d_query_point.dim[split_axis] < nodes[cur].split_value) { // jump into specific son node + cur = nodes[cur].left; + } + else { + cur = nodes[cur].right; + } + } + *d_distance = best_distance; + *d_index = best_index; + } + + template + __device__ void search_at_node_range(cuda_kdnode *nodes, size_t *indices, cpu_kdtree::point *d_reference_points, cpu_kdtree::point &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { + T best_distance = FLT_MAX; + size_t best_index = 0; + + int next_nodes_pos = 0; // initialize pop out order index + next_nodes[id * stack_size + next_nodes_pos] = cur; // find data that belongs to the very specific thread + next_nodes_pos++; + + while (next_nodes_pos) { + int next_search_nodes_pos = 0; // record push back order index + while (next_nodes_pos) { + cur = next_nodes[id * stack_size + next_nodes_pos - 1]; // pop out the last push in one and keep poping out + next_nodes_pos--; + int split_axis = nodes[cur].level % D; + + if (nodes[cur].left == -1) { + for (int i = 0; i < nodes[cur].num_index; i++) { + int idx = indices[nodes[cur].index + i]; // all indices are stored in one array, pick up from every 
node's beginning index + T d = gpu_distance(d_query_point, d_reference_points[idx]); + if (d < best_distance) { + best_distance = d; + best_index = idx; + } + } + } + else { + T d = d_query_point.dim[split_axis] - nodes[cur].split_value; + + if (fabs(d) > range) { + if (d < 0) { + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left; + next_search_nodes_pos++; + } + else { + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right; + next_search_nodes_pos++; + } + } + else { + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right; + next_search_nodes_pos++; + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left; + next_search_nodes_pos++; + if (next_search_nodes_pos > stack_size) { + printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id); + (*Judge)++; + } + } + } + } + for (int i = 0; i < next_search_nodes_pos; i++) + next_nodes[id * stack_size + i] = next_search_nodes[id * stack_size + i]; + next_nodes_pos = next_search_nodes_pos; + } + *d_distance = best_distance; + *d_index = best_index; + } + + template + __device__ void search(cuda_kdnode *nodes, size_t *indices, cpu_kdtree::point *d_reference_points, cpu_kdtree::point &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { + int best_node = 0; + T best_distance = FLT_MAX; + size_t best_index = 0; + T radius = 0; + + search_at_node(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node); + radius = sqrt(best_distance); // get range + int cur = best_node; + + while (nodes[cur].parent != -1) { + int parent = nodes[cur].parent; + int split_axis = nodes[parent].level % D; + + T tmp_dist = FLT_MAX; + size_t tmp_idx; + if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) { + if (nodes[parent].left != cur) + search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge); + else + search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge); + } + if (tmp_dist < best_distance) { + best_distance = tmp_dist; + best_index = tmp_idx; + } + cur = parent; + } + *d_distance = sqrt(best_distance); + *d_index = best_index; + } + + template + __global__ void search_batch(cuda_kdnode *nodes, size_t *indices, cpu_kdtree::point *d_reference_points, cpu_kdtree::point *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= d_query_count) return; // avoid segfault + + search(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent + } + + template + void search_stream(cuda_kdnode *d_nodes, size_t *d_index, cpu_kdtree::point *d_reference_points, cpu_kdtree::point *query_stream_points, size_t stream_count, size_t *indices, T *distances) { + unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count); + unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 
1 : 0)); + + cpu_kdtree::point *d_query_points; + size_t *d_indices; + T *d_distances; + + int *next_nodes; + int *next_search_nodes; + + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * stream_count * D)); + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * stream_count)); + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * stream_count)); + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); + HANDLE_ERROR(cudaMemcpy(d_query_points, query_stream_points, sizeof(T) * stream_count * D, cudaMemcpyHostToDevice)); + + int *Judge = NULL; + + search_batch<<>> (d_nodes, d_index, d_reference_points, d_query_points, stream_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); + + if(Judge == NULL) { + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * stream_count, cudaMemcpyDeviceToHost)); + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * stream_count, cudaMemcpyDeviceToHost)); + } + + HANDLE_ERROR(cudaFree(next_nodes)); + HANDLE_ERROR(cudaFree(next_search_nodes)); + HANDLE_ERROR(cudaFree(d_query_points)); + HANDLE_ERROR(cudaFree(d_indices)); + HANDLE_ERROR(cudaFree(d_distances)); + } + + template // set dimension of data to default 3 + class kdtree { + protected: + int current_axis; // current judging axis + int n_id; // store the total number of nodes + std::vector < typename cpu_kdtree::point > *tmp_points; // transfer or temperary points + std::vector < typename cpu_kdtree::point > cpu_tmp_points; // for cpu searching + cpu_kdtree::cpu_kdnode *root; // root node + static kdtree *cur_tree_ptr; + #ifdef __CUDACC__ + cuda_kdnode *d_nodes; + size_t *d_index; + cpu_kdtree::point* d_reference_points; + size_t npts; + int num_nodes; + #endif + public: + kdtree() { // constructor for creating a cpu_kdtree + cur_tree_ptr = this; // create a class pointer points to the current class value + n_id = 0; // set total number of points to default 0 + } + + ~kdtree() { // destructor of cpu_kdtree + std::vector *> next_nodes; + next_nodes.push_back(root); + while (next_nodes.size()) { + std::vector *> next_search_nodes; + while (next_nodes.size()) { + cpu_kdtree::cpu_kdnode *cur = next_nodes.back(); + next_nodes.pop_back(); + if (cur->left) + next_search_nodes.push_back(cur->left); + if (cur->right) + next_search_nodes.push_back(cur->right); + delete cur; + } + next_nodes = next_search_nodes; + } + root = NULL; + #ifdef __CUDACC__ + HANDLE_ERROR(cudaFree(d_nodes)); + HANDLE_ERROR(cudaFree(d_index)); + HANDLE_ERROR(cudaFree(d_reference_points)); + #endif + } + + void cpu_create(std::vector < typename cpu_kdtree::point > &reference_points, size_t max_levels) { + tmp_points = &reference_points; + root = new cpu_kdtree::cpu_kdnode(); // initializing the root node + root->idx = n_id++; // the index of root is 0 + root->level = 0; // tree level begins at 0 + root->indices.resize(reference_points.size()); // get the number of points + for (size_t i = 0; i < reference_points.size(); i++) { + root->indices[i] = i; // set indices of input points + } + std::vector *> next_nodes; // next nodes + next_nodes.push_back(root); // push back the root node + while (next_nodes.size()) { + std::vector *> next_search_nodes; // next search nodes + while (next_nodes.size()) { // two same WHILE is because we need to make a new vector to store nodes for search + cpu_kdtree::cpu_kdnode *current_node = next_nodes.back(); // handle 
node one by one (right first) + next_nodes.pop_back(); // pop out current node in order to store next round of nodes + if (current_node->level < max_levels) { + if (current_node->indices.size() > 1) { // split if the nonleaf node contains more than one point + cpu_kdtree::cpu_kdnode *left = new cpu_kdtree::cpu_kdnode(); + cpu_kdtree::cpu_kdnode *right = new cpu_kdtree::cpu_kdnode(); + left->idx = n_id++; // set the index of current node's left node + right->idx = n_id++; + split(current_node, left, right); // split left and right and determine a node + std::vector temp; // empty vecters of int + //temp.resize(current_node->indices.size()); + current_node->indices.swap(temp); // clean up current node's indices + current_node->left = left; + current_node->right = right; + current_node->left_idx = left->idx; + current_node->right_idx = right->idx; + if (right->indices.size()) + next_search_nodes.push_back(right); // left pop out first + if (left->indices.size()) + next_search_nodes.push_back(left); + } + } + } + next_nodes = next_search_nodes; // go deeper within the tree + } + } + + static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort + std::vector < typename cpu_kdtree::point > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer + return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis]; + } + + void split(cpu_kdtree::cpu_kdnode *cur, cpu_kdtree::cpu_kdnode *left, cpu_kdtree::cpu_kdnode *right) { + std::vector < typename cpu_kdtree::point > &pts = *tmp_points; + current_axis = cur->level % D; // indicate the judicative dimension or axis + std::sort(cur->indices.begin(), cur->indices.end(), sort_points); // using SortPoints as comparison function to sort the data + size_t mid_value = cur->indices[cur->indices.size() / 2]; // odd in the mid_value, even take the floor + cur->split_value = pts[mid_value].dim[current_axis]; // get the parent node + left->parent = cur; // set the parent of the next search nodes to current node + right->parent = cur; + left->level = cur->level + 1; // level + 1 + right->level = cur->level + 1; + left->parent_idx = cur->idx; // set its parent node's index + right->parent_idx = cur->idx; + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one + size_t idx = cur->indices[i]; + if (pts[idx].dim[current_axis] < cur->split_value) + left->indices.push_back(idx); + else + right->indices.push_back(idx); + } + } + + void create(T *h_reference_points, size_t reference_count, size_t max_levels) { + #ifdef __CUDACC__ + if (max_levels > 10) { + std::cout<<"The max_tree_levels should be smaller!"<(bb, h_reference_points, reference_count); + + std::vector < typename cpu_kdtree::point> reference_points(reference_count); // restore the reference points in particular way + for (size_t j = 0; j < reference_count; j++) + for (size_t i = 0; i < D; i++) + reference_points[j].dim[i] = h_reference_points[j * D + i]; // creating a tree on cpu + (*this).cpu_create(reference_points, max_levels); // building a tree on cpu + cpu_kdtree::cpu_kdnode *d_root = (*this).get_root(); + num_nodes = (*this).get_num_nodes(); + npts = reference_count; // also equals to reference_count + + HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode) * num_nodes)); // copy data from host to device + HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * npts)); + HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(cpu_kdtree::point) * npts)); + + 
std::vector < cuda_kdnode > tmp_nodes(num_nodes); + std::vector indices(npts); + std::vector *> next_nodes; + size_t cur_pos = 0; + next_nodes.push_back(d_root); + while (next_nodes.size()) { + std::vector *> next_search_nodes; + while (next_nodes.size()) { + cpu_kdtree::cpu_kdnode *cur = next_nodes.back(); + next_nodes.pop_back(); + int id = cur->idx; // the nodes at same level are independent + tmp_nodes[id].level = cur->level; + tmp_nodes[id].parent = cur->parent_idx; + tmp_nodes[id].left = cur->left_idx; + tmp_nodes[id].right = cur->right_idx; + tmp_nodes[id].split_value = cur->split_value; + tmp_nodes[id].num_index = cur->indices.size(); // number of index + if (cur->indices.size()) { + for (size_t i = 0; i < cur->indices.size(); i++) + indices[cur_pos + i] = cur->indices[i]; + + tmp_nodes[id].index = (int)cur_pos; // beginning index of reference_points that every bottom node has + cur_pos += cur->indices.size(); // store indices continuously for every query_point + } + else { + tmp_nodes[id].index = -1; + } + + if (cur->left) + next_search_nodes.push_back(cur->left); + + if (cur->right) + next_search_nodes.push_back(cur->right); + } + next_nodes = next_search_nodes; + } + HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode) * tmp_nodes.size(), cudaMemcpyHostToDevice)); + HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(cpu_kdtree::point) * reference_count, cudaMemcpyHostToDevice)); + + #else + std::vector < typename cpu_kdtree::point > reference_points(reference_count); // restore the reference points in particular way + for (size_t j = 0; j < reference_count; j++) + for (size_t i = 0; i < D; i++) + reference_points[j].dim[i] = h_reference_points[j * D + i]; + cpu_create(reference_points, max_levels); + cpu_tmp_points = *tmp_points; + + #endif + } + + int get_num_nodes() const { // get the total number of nodes + return n_id; + } + + cpu_kdtree::cpu_kdnode* get_root() const { // get the root node of tree + return root; + } + + T cpu_distance(const cpu_kdtree::point &a, const cpu_kdtree::point &b) { + T distance = 0; + + for (size_t i = 0; i < D; i++) { + T d = a.dim[i] - b.dim[i]; + distance += d*d; + } + return distance; + } + + void cpu_search_at_node(cpu_kdtree::cpu_kdnode *cur, const cpu_kdtree::point &query, size_t *index, T *distance, cpu_kdtree::cpu_kdnode **node) { + T best_distance = FLT_MAX; // initialize the best distance to max of floating point + size_t best_index = 0; + std::vector < typename cpu_kdtree::point > pts = cpu_tmp_points; + while (true) { + size_t split_axis = cur->level % D; + if (cur->left == NULL) { // risky but acceptable, same goes for right because left and right are in same pace + *node = cur; // pointer points to a pointer + for (size_t i = 0; i < cur->indices.size(); i++) { + size_t idx = cur->indices[i]; + T d = cpu_distance(query, pts[idx]); // compute distances + /// if we want to compute k nearest neighbor, we can input the last resul + /// (last_best_dist < dist < best_dist) to select the next point until reaching to k + if (d < best_distance) { + best_distance = d; + best_index = idx; // record the nearest neighbor index + } + } + break; // find the target point then break the loop + } + else if (query.dim[split_axis] < cur->split_value) { // if it has son node, visit the next node on either left side or right side + cur = cur->left; + } + else { + cur = cur->right; + } + } + *index = best_index; + 
*distance = best_distance; + } + + void cpu_search_at_node_range(cpu_kdtree::cpu_kdnode *cur, const cpu_kdtree::point &query, T range, size_t *index, T *distance) { + T best_distance = FLT_MAX; // initialize the best distance to max of floating point + size_t best_index = 0; + std::vector < typename cpu_kdtree::point > pts = cpu_tmp_points; + std::vector < typename cpu_kdtree::cpu_kdnode*> next_node; + next_node.push_back(cur); + while (next_node.size()) { + std::vector*> next_search; + while (next_node.size()) { + cur = next_node.back(); + next_node.pop_back(); + size_t split_axis = cur->level % D; + if (cur->left == NULL) { + for (size_t i = 0; i < cur->indices.size(); i++) { + size_t idx = cur->indices[i]; + T d = cpu_distance(query, pts[idx]); + if (d < best_distance) { + best_distance = d; + best_index = idx; + } + } + } + else { + T d = query.dim[split_axis] - cur->split_value; // computer distance along specific axis or dimension + /// there are three possibilities: on either left or right, and on both left and right + if (fabs(d) > range) { // absolute value of floating point to see if distance will be larger that best_dist + if (d < 0) + next_search.push_back(cur->left); // every left[split_axis] is less and equal to cur->split_value, so it is possible to find the nearest point in this region + else + next_search.push_back(cur->right); + } + else { // it is possible that nereast neighbor will appear on both left and right + next_search.push_back(cur->left); + next_search.push_back(cur->right); + } + } + } + next_node = next_search; // pop out at least one time + } + *index = best_index; + *distance = best_distance; + } + + void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) { + /// first convert the input query point into specific type + cpu_kdtree::point query; + for (size_t j = 0; j < query_count; j++) { + for (size_t i = 0; i < D; i++) + query.dim[i] = h_query_points[j * D + i]; + /// find the nearest node, this will be the upper bound for the next time searching + cpu_kdtree::cpu_kdnode *best_node = NULL; + T best_distance = FLT_MAX; + size_t best_index = 0; + T radius = 0; // radius for range + cpu_search_at_node(root, query, &best_index, &best_distance, &best_node); // simple search to rougly determine a result for next search step + radius = sqrt(best_distance); // It is possible that nearest will appear in another region + /// find other possibilities + cpu_kdtree::cpu_kdnode *cur = best_node; + while (cur->parent != NULL) { // every node that you pass will be possible to be the best node + /// go up + cpu_kdtree::cpu_kdnode *parent = cur->parent; // travel back to every node that we pass through + size_t split_axis = (parent->level) % D; + /// search other nodes + size_t tmp_index; + T tmp_distance = FLT_MAX; + if (fabs(parent->split_value - query.dim[split_axis]) <= radius) { + /// search opposite node + if (parent->left != cur) + cpu_search_at_node_range(parent->left, query, radius, &tmp_index, &tmp_distance); // to see whether it is its mother node's left son node + else + cpu_search_at_node_range(parent->right, query, radius, &tmp_index, &tmp_distance); + } + if (tmp_distance < best_distance) { + best_distance = tmp_distance; + best_index = tmp_index; + } + cur = parent; + } + h_indices[j] = best_index; + h_distances[j] = best_distance; + } + } + + void search(T *h_query_points, size_t query_count, size_t *indices, T *distances) { + #ifdef __CUDACC__ + std::vector < typename cpu_kdtree::point > query_points(query_count); + for 
(size_t j = 0; j < query_count; j++) + for (size_t i = 0; i < D; i++) + query_points[j].dim[i] = h_query_points[j * D + i]; + + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + + size_t query_memory = D * sizeof(T) * query_count; + size_t N = 3 * query_memory / prop.totalGlobalMem; //consider index and distance, roughly 3 times + if (N > 1) { + N++; + size_t stream_count = query_count / N; + for (size_t n = 0; n < N; n++) { + size_t query_stream_start = n * stream_count; + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_stream_start], stream_count, &indices[query_stream_start], &distances[query_stream_start]); + } + size_t stream_remain_count = query_count - N * stream_count; + if (stream_remain_count > 0) { + size_t query_remain_start = N * stream_count; + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_remain_start], stream_remain_count, &indices[query_remain_start], &distances[query_remain_start]); + } + } + else { + unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count); + unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0)); + + cpu_kdtree::point *d_query_points; // create a pointer pointing to query points on gpu + size_t *d_indices; + T *d_distances; + + int *next_nodes; // create two STACK-like array + int *next_search_nodes; + + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory + + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_count * D)); + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_count)); + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_count)); + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_count * D, cudaMemcpyHostToDevice)); + + search_batch<<>> (d_nodes, d_index, d_reference_points, d_query_points, query_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); + + if (Judge == NULL) { // do the following work if the thread works safely + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_count, cudaMemcpyDeviceToHost)); + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_count, cudaMemcpyDeviceToHost)); + } + + HANDLE_ERROR(cudaFree(next_nodes)); + HANDLE_ERROR(cudaFree(next_search_nodes)); + HANDLE_ERROR(cudaFree(d_query_points)); + HANDLE_ERROR(cudaFree(d_indices)); + HANDLE_ERROR(cudaFree(d_distances)); + } + + #else + cpu_search(h_query_points, query_count, indices, distances); + + #endif + + } + + }; //end class kdtree + + template + kdtree* kdtree::cur_tree_ptr = NULL; // definition of cur_tree_ptr pointer points to the current class + +} //end namespace stim +#endif \ No newline at end of file diff --git a/stim/structures/kdtreenew.cuh b/stim/structures/kdtreenew.cuh deleted file mode 100644 index 093c71a..0000000 --- a/stim/structures/kdtreenew.cuh +++ /dev/null @@ -1,627 +0,0 @@ -// right now the size of CUDA STACK is set to 50, increase it if you mean to make deeper tree -// data should be stored in row-major -// x1,x2,x3,x4,x5...... -// y1,y2,y3,y4,y5...... -// .................... -// .................... 
-- 
libgit2 0.21.4
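For reference, a minimal usage sketch of the stim::kdtree class added by this patch. The angle-bracket template arguments and include names do not survive in the listing above, so the <float, D> parameters, the header path, and all data values below are assumptions; the sketch also assumes the HANDLE_ERROR macro used by the header is supplied by the rest of the stim library and that the GPU path is compiled with nvcc. Per the code in the patch, create() builds the tree on the CPU and mirrors it to the GPU, and search() returns the nearest reference-point index and distance for each query point, falling back to cpu_search() when __CUDACC__ is not defined.

#include <vector>
#include "stim/structures/kdtree_dep.cuh"       // path taken from the diff header above

int main() {
    const int D = 3;                                              // dimension; the class defaults to 3
    std::vector<float> ref = { 0,0,0,  1,0,0,  0,1,0,  0,0,1 };   // 4 reference points, D values per point
    std::vector<float> qry = { 0.9f, 0.1f, 0.0f };                // 1 query point

    stim::kdtree<float, D> tree;                  // <float, D> arguments are assumed, not shown in the patch
    tree.create(ref.data(), ref.size() / D, 5);   // max_levels = 5; the GPU path rejects values above 10
    size_t index;                                 // index of the nearest reference point
    float  distance;                              // distance to that point
    tree.search(qry.data(), qry.size() / D, &index, &distance);
    return 0;
}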