From 268037bc9457a41c60d0e74b78284fa8aac5d537 Mon Sep 17 00:00:00 2001 From: Jiaming Guo Date: Thu, 17 Nov 2016 17:49:31 -0600 Subject: [PATCH] made code more elegant. --- stim/structures/kdtree.cuh | 515 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 1 file changed, 257 insertions(+), 258 deletions(-) diff --git a/stim/structures/kdtree.cuh b/stim/structures/kdtree.cuh index 63d530f..8e80163 100644 --- a/stim/structures/kdtree.cuh +++ b/stim/structures/kdtree.cuh @@ -1,4 +1,4 @@ -// change CUDA_STACK together with max_tree_levels in trial and error manner +// right now the size of CUDA STACK is set to 1000, increase it if you mean to make deeper tree // data should be stored in row-major // x1,x2,x3,x4,x5...... // y1,y2,y3,y4,y5...... @@ -16,159 +16,150 @@ #include #include #include - -/// using API called HADDLE_ERROR -static void HandleError(cudaError_t err, const char *file, int line) { - if (err != cudaSuccess) { - std::cout< namespace stim { namespace kdtree { - template + template // typename refers to float or double while D refers to dimension of points struct point { - T coords[D]; // if we use size to measure a vector, it will show the number of point structures + T dim[D]; // create a structure to store every one input point }; template - class KDNode { + class kdnode { public: - KDNode() { // initialization - parent = NULL; + kdnode() { // constructor for initializing a kdnode + parent = NULL; // set every node's parent, left and right kdnode pointers to NULL left = NULL; right = NULL; - split_value = -1; - _parent = -1; - _left = -1; - _right = -1; + parent_idx = -1; // set parent node index to default -1 + left_idx = -1; + right_idx = -1; + split_value = -1; // set split_value to default -1 } - int id; // id for current node - size_t level; - KDNode *parent, *left, *right; - int _parent, _left, _right; // id for parent node - T split_value; // node value - std::vector indices; // indices that indicate the data that current tree has + int idx; // index of current node + int parent_idx, left_idx, right_idx; // index of parent, left and right nodes + kdnode *parent, *left, *right; // parent, left and right kdnodes + T split_value; // splitting value of current node + std::vector indices; // it indicates the points' indices that current node has + size_t level; // tree level of current node }; - } + } // end of namespace kdtree - template + template // set dimension of data to default 3 class cpu_kdtree { - protected: - std::vector > *m_pts; - kdtree::KDNode *m_root; // current node - int m_current_axis; - size_t m_levels; - int m_cmps; // count how many comparisons are to made in the tree for one query - int m_id; // level + 1 - static cpu_kdtree *myself; + int current_axis; // current judging axis + int cmps; // count how many time of comparisons (just for cpu-kdtree) + int n_id; // store the total number of nodes + std::vector > *tmp_points; // transfer or temp points + kdtree::kdnode *root; // root node + static cpu_kdtree *cur_tree_ptr; public: - cpu_kdtree() { // initialization - myself = this; - m_id = 0; // id = level + 1, level -> axis index while id -> node identifier + cpu_kdtree() { // constructor for creating a cpu_kdtree + cur_tree_ptr = this; // create a class pointer points to the current class value + n_id = 0; // set total number of points to default 0 } - ~cpu_kdtree() { // destructor for deleting what was created by kdtree() - std::vector *> next_node; - next_node.push_back(m_root); - while (next_node.size()) { - std::vector *> next_search; - while (next_node.size()) { - kdtree::KDNode *cur = next_node.back(); - next_node.pop_back(); + ~cpu_kdtree() { // destructor of cpu_kdtree + std::vector *> next_nodes; + next_nodes.push_back(root); + while (next_nodes.size()) { + std::vector *> next_search_nodes; + while (next_nodes.size()) { + kdtree::kdnode *cur = next_nodes.back(); + next_nodes.pop_back(); if (cur->left) - next_search.push_back(cur->left); + next_search_nodes.push_back(cur->left); if (cur->right) - next_search.push_back(cur->right); + next_search_nodes.push_back(cur->right); delete cur; } - next_node = next_search; + next_nodes = next_search_nodes; } - m_root = NULL; + root = NULL; } - void Create(std::vector > &pts, size_t max_levels) { - m_pts = &pts; // create a pointer point to the input data - m_levels = max_levels; // stores max tree levels - m_root = new kdtree::KDNode(); // using KDNode() to initialize an ancestor node - m_root->id = m_id++; // id is 1 while level is 0 at the very beginning - m_root->level = 0; // to begin with level 0 - m_root->indices.resize(pts.size()); // initialize the size of whole indices - for (size_t i = 0; i < pts.size(); i++) { - m_root->indices[i] = i; // like what we did on Keys in GPU-BF part + void Create(std::vector > &reference_points, size_t max_levels) { + tmp_points = &reference_points; + root = new kdtree::kdnode(); // initializing the root node + root->idx = n_id++; // the index of root is 0 + root->level = 0; // tree level begins at 0 + root->indices.resize(reference_points.size()); // get the number of points + for (size_t i = 0; i < reference_points.size(); i++) { + root->indices[i] = i; // set indices of input points } - std::vector *> next_node; // next node - next_node.push_back(m_root); // new node - while (next_node.size()) { - std::vector *> next_search; - while (next_node.size()) { // two same WHILE is because we need to make a new vector for searching - kdtree::KDNode *current_node = next_node.back(); // pointer point to current node (right first) - next_node.pop_back(); // pop out current node in order to store next node - if (current_node->level < max_levels) { // max_levels should be reasonably small compared with numbers of data - if (current_node->indices.size() > 1) { - kdtree::KDNode *left = new kdtree::KDNode(); - kdtree::KDNode *right = new kdtree::KDNode(); - left->id = m_id++; // risky guessing but OK for large amount of data since max_level is small - right->id = m_id++; // risky guessing but OK for large amount of data since max_level is small - Split(current_node, left, right); // split left and right and determine a node - std::vector temp; // empty vecters of int + std::vector *> next_nodes; // next nodes + next_nodes.push_back(root); // push back the root node + while (next_nodes.size()) { + std::vector *> next_search_nodes; // next search nodes + while (next_nodes.size()) { // two same WHILE is because we need to make a new vector to store nodes for search + kdtree::kdnode *current_node = next_nodes.back(); // handle node one by one (right first) + next_nodes.pop_back(); // pop out current node in order to store next round of nodes + if (current_node->level < max_levels) { + if (current_node->indices.size() > 1) { // split if the nonleaf node contains more than one point + kdtree::kdnode *left = new kdtree::kdnode(); + kdtree::kdnode *right = new kdtree::kdnode(); + left->idx = n_id++; // set the index of current node's left node + right->idx = n_id++; + Split(current_node, left, right); // split left and right and determine a node + std::vector temp; // empty vecters of int //temp.resize(current_node->indices.size()); - current_node->indices.swap(temp); // clean up current tree's indices + current_node->indices.swap(temp); // clean up current node's indices current_node->left = left; current_node->right = right; - current_node->_left = left->id; // indicates it has left son node and gets its id - current_node->_right = right->id; // indicates it has right son node and gets its id - if (left->indices.size()) - next_search.push_back(left); // right first then left according to stack(first in last out), it can be done in parallel for left and right are independent + current_node->left_idx = left->idx; + current_node->right_idx = right->idx; if (right->indices.size()) - next_search.push_back(right); + next_search_nodes.push_back(right); // left pop out first + if (left->indices.size()) + next_search_nodes.push_back(left); } } } - next_node = next_search; + next_nodes = next_search_nodes; // go deeper within the tree } } - static bool SortPoints(const size_t a, const size_t b) { - std::vector > &pts = *myself->m_pts; - return pts[a].coords[myself->m_current_axis] < pts[b].coords[myself->m_current_axis]; + static bool SortPoints(const size_t a, const size_t b) { // create functor for std::sort + std::vector > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer + return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis]; } - void Split(kdtree::KDNode *cur, kdtree::KDNode *left, kdtree::KDNode *right) { - /// assume both two sides are created and sure it was - std::vector > &pts = *m_pts; - m_current_axis = cur->level % D; // indicate the judicative dimension or axis + void Split(kdtree::kdnode *cur, kdtree::kdnode *left, kdtree::kdnode *right) { + std::vector > &pts = *tmp_points; + current_axis = cur->level % D; // indicate the judicative dimension or axis std::sort(cur->indices.begin(), cur->indices.end(), SortPoints); // using SortPoints as comparison function to sort the data - size_t mid = cur->indices[cur->indices.size() / 2]; // odd in the mid, even take the floor - cur->split_value = pts[mid].coords[m_current_axis]; // get the mother node - left->parent = cur; // set the parent to current node for the next nodes + size_t mid_value = cur->indices[cur->indices.size() / 2]; // odd in the mid_value, even take the floor + cur->split_value = pts[mid_value].dim[current_axis]; // get the parent node + left->parent = cur; // set the parent of the next search nodes to current node right->parent = cur; - left->level = cur->level + 1; + left->level = cur->level + 1; // level + 1 right->level = cur->level + 1; - left->_parent = cur->id; // indicates it has mother node and gets its id - right->_parent = cur->id; // indicates it has mother node and gets its id - for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right area one by one + left->parent_idx = cur->idx; // set its parent node's index + right->parent_idx = cur->idx; + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one size_t idx = cur->indices[i]; - if (pts[idx].coords[m_current_axis] < cur->split_value) + if (pts[idx].dim[current_axis] < cur->split_value) left->indices.push_back(idx); else right->indices.push_back(idx); } } - int GetNumNodes() const { return m_id; } - kdtree::KDNode* GetRoot() const { return m_root; } + int GetNumNodes() const { // get the total number of nodes + return n_id; + } + kdtree::kdnode* GetRoot() const { // get the root node of tree + return root; + } }; //end class kdtree template - cpu_kdtree* cpu_kdtree::myself = NULL; // definition of myself pointer points to the specific class + cpu_kdtree* cpu_kdtree::cur_tree_ptr = NULL; // definition of cur_tree_ptr pointer points to the current class template - struct CUDA_KDNode { - size_t level; - int parent, left, right; // indicates id of + struct cuda_kdnode { + int parent, left, right; T split_value; - size_t num_indices; // number of indices it has - int indices; // the beginning + size_t num_index; // number of indices it has + int index; // the beginning index + size_t level; }; template @@ -176,239 +167,247 @@ namespace stim { T dist = 0; for (size_t i = 0; i < D; i++) { - T d = a.coords[i] - b.coords[i]; + T d = a.dim[i] - b.dim[i]; dist += d*d; } return dist; } template - __device__ void SearchAtNode(CUDA_KDNode *nodes, size_t *indices, kdtree::point *pts, int cur, kdtree::point &Query, size_t *ret_index, T *ret_dist, int *ret_node) { - /// finds the first possibility - size_t best_idx = 0; - T best_dist = FLT_MAX; + __device__ void SearchAtNode(cuda_kdnode *nodes, size_t *indices, kdtree::point *d_reference_points, int cur, kdtree::point &d_query_point, size_t *d_index, T *d_distance, int *d_node) { + T best_distance = FLT_MAX; + size_t best_index = 0; - while (true) { + while (true) { // break until reach the bottom int split_axis = nodes[cur].level % D; - if (nodes[cur].left == -1) { // if it doesn't have left son node - *ret_node = cur; - for (int i = 0; i < nodes[cur].num_indices; i++) { - size_t idx = indices[nodes[cur].indices + i]; - T dist = Distance(Query, pts[idx]); - if (dist < best_dist) { - best_dist = dist; - best_idx = idx; + if (nodes[cur].left == -1) { // check whether it has left node or not + *d_node = cur; + for (int i = 0; i < nodes[cur].num_index; i++) { + size_t idx = indices[nodes[cur].index + i]; + T dist = Distance(d_query_point, d_reference_points[idx]); + if (dist < best_distance) { + best_distance = dist; + best_index = idx; } } - break; + break; } - else if (Query.coords[split_axis] < nodes[cur].split_value) { + else if (d_query_point.dim[split_axis] < nodes[cur].split_value) { // jump into specific son node cur = nodes[cur].left; } else { cur = nodes[cur].right; } } - *ret_index = best_idx; - *ret_dist = best_dist; + *d_distance = best_distance; + *d_index = best_index; } template - __device__ void SearchAtNodeRange(CUDA_KDNode *nodes, size_t *indices, kdtree::point *pts, kdtree::point &Query, int cur, T range, size_t *ret_index, T *ret_dist) { - /// search throught all the nodes that are within one range - size_t best_idx = 0; - T best_dist = FLT_MAX; - /// using fixed stack, increase it when need - int next_node[CUDA_STACK]; // should be larger than 1 - int next_node_pos = 0; // initialize pop out order index - next_node[next_node_pos++] = cur; // equals to next_node[next_node_pos] = cur, next_node_pos++ - - while (next_node_pos) { - int next_search[CUDA_STACK]; // for store next nodes - int next_search_pos = 0; // record push back order index - while (next_node_pos) { - cur = next_node[next_node_pos - 1]; // pop out the last in one and keep poping out - next_node_pos--; + __device__ void SearchAtNodeRange(cuda_kdnode *nodes, size_t *indices, kdtree::point *d_reference_points, kdtree::point &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { + T best_distance = FLT_MAX; + size_t best_index = 0; + + int next_nodes_pos = 0; // initialize pop out order index + next_nodes[id * 1000 + next_nodes_pos] = cur; // find data that belongs to the very specific thread + next_nodes_pos++; + + while (next_nodes_pos) { + int next_search_nodes_pos = 0; // record push back order index + while (next_nodes_pos) { + cur = next_nodes[id * 1000 + next_nodes_pos - 1]; // pop out the last push in one and keep poping out + next_nodes_pos--; int split_axis = nodes[cur].level % D; if (nodes[cur].left == -1) { - for (int i = 0; i < nodes[cur].num_indices; i++) { - int idx = indices[nodes[cur].indices + i]; // all indices are stored in one array, pick up from every node's beginning index - T d = Distance(Query, pts[idx]); - if (d < best_dist) { - best_dist = d; - best_idx = idx; + for (int i = 0; i < nodes[cur].num_index; i++) { + int idx = indices[nodes[cur].index + i]; // all indices are stored in one array, pick up from every node's beginning index + T d = Distance(d_query_point, d_reference_points[idx]); + if (d < best_distance) { + best_distance = d; + best_index = idx; } } } else { - T d = Query.coords[split_axis] - nodes[cur].split_value; + T d = d_query_point.dim[split_axis] - nodes[cur].split_value; if (fabs(d) > range) { - if (d < 0) - next_search[next_search_pos++] = nodes[cur].left; - else - next_search[next_search_pos++] = nodes[cur].right; + if (d < 0) { + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left; + next_search_nodes_pos++; + } + else { + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right; + next_search_nodes_pos++; + } } else { - next_search[next_search_pos++] = nodes[cur].left; - next_search[next_search_pos++] = nodes[cur].right; + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right; + next_search_nodes_pos++; + next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left; + next_search_nodes_pos++; + if (next_search_nodes_pos > 1000) { + printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id); + (*Judge)++; + } } } } - - for (int i = 0; i < next_search_pos; i++) - next_node[i] = next_search[i]; - next_node_pos = next_search_pos; // operation that really resemble STACK, namely first in last out + for (int i = 0; i < next_search_nodes_pos; i++) + next_nodes[id * 1000 + i] = next_search_nodes[id * 1000 + i]; + next_nodes_pos = next_search_nodes_pos; } - *ret_index = best_idx; - *ret_dist = best_dist; + *d_distance = best_distance; + *d_index = best_index; } template - __device__ void Search(CUDA_KDNode *nodes, size_t *indices, kdtree::point *pts, kdtree::point &Query, size_t *ret_index, T *ret_dist) { - /// find first nearest node + __device__ void Search(cuda_kdnode *nodes, size_t *indices, kdtree::point *d_reference_points, kdtree::point &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { int best_node = 0; - size_t best_idx = 0; - T best_dist = FLT_MAX; + T best_distance = FLT_MAX; + size_t best_index = 0; T radius = 0; - SearchAtNode(nodes, indices, pts, 0, Query, &best_idx, &best_dist, &best_node); - radius = sqrt(best_dist); - /// find other possibilities + + SearchAtNode(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node); + radius = sqrt(best_distance); // get range int cur = best_node; while (nodes[cur].parent != -1) { - /// go up int parent = nodes[cur].parent; int split_axis = nodes[parent].level % D; - /// search other node + T tmp_dist = FLT_MAX; size_t tmp_idx; - if (fabs(nodes[parent].split_value - Query.coords[split_axis]) <= radius) { - /// search opposite node + if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) { if (nodes[parent].left != cur) - SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].left, radius, &tmp_idx, &tmp_dist); + SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge); else - SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].right, radius, &tmp_idx, &tmp_dist); + SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge); } - if (tmp_dist < best_dist) { - best_dist = tmp_dist; - best_idx = tmp_idx; + if (tmp_dist < best_distance) { + best_distance = tmp_dist; + best_index = tmp_idx; } cur = parent; } - *ret_index = best_idx; - *ret_dist = sqrt(best_dist); + *d_distance = sqrt(best_distance); + *d_index = best_index; } template - __global__ void SearchBatch(CUDA_KDNode *nodes, size_t *indices, kdtree::point *pts, size_t num_pts, kdtree::point *Query, size_t num_Query, size_t *ret_index, T *ret_dist) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= num_Query) return; + __global__ void SearchBatch(cuda_kdnode *nodes, size_t *indices, kdtree::point *d_reference_points, kdtree::point *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= d_query_count) return; // avoid segfault - Search(nodes, indices, pts, Query[idx], &ret_index[idx], &ret_dist[idx]); // every query points are independent + Search(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent } template class cuda_kdtree { protected: - CUDA_KDNode *m_gpu_nodes; // store nodes - size_t *m_gpu_indices; - kdtree::point* m_gpu_points; - size_t m_num_points; + cuda_kdnode *d_nodes; + size_t *d_index; + kdtree::point* d_reference_points; + size_t d_reference_count; public: ~cuda_kdtree() { - HANDLE_ERROR(cudaFree(m_gpu_nodes)); - HANDLE_ERROR(cudaFree(m_gpu_indices)); - HANDLE_ERROR(cudaFree(m_gpu_points)); + HANDLE_ERROR(cudaFree(d_nodes)); + HANDLE_ERROR(cudaFree(d_index)); + HANDLE_ERROR(cudaFree(d_reference_points)); } - void CreateKDTree(T *ReferencePoints, size_t ReferenceCount, size_t ColCount, size_t max_levels) { - std::vector < kdtree::point > pts(ReferenceCount); // create specific struct of reference data - for (size_t j = 0; j < ReferenceCount; j++) - for (size_t i = 0; i < ColCount; i++) - pts[j].coords[i] = ReferencePoints[j * ColCount + i]; - cpu_kdtree tree; // initialize a tree - tree.Create(pts, max_levels); // create KD-Tree on host - kdtree::KDNode *root = tree.GetRoot(); + void CreateKDTree(T *h_reference_points, size_t reference_count, size_t dim_count, size_t max_levels) { + if (max_levels > 10) { + std::cout<<"The max_tree_levels should be smaller!"<> reference_points(reference_count); // restore the reference points in particular way + for (size_t j = 0; j < reference_count; j++) + for (size_t i = 0; i < dim_count; i++) + reference_points[j].dim[i] = h_reference_points[j * dim_count + i]; + cpu_kdtree tree; // creating a tree on cpu + tree.Create(reference_points, max_levels); // building a tree on cpu + kdtree::kdnode *d_root = tree.GetRoot(); int num_nodes = tree.GetNumNodes(); - /// create the same on CPU - m_num_points = pts.size(); // number of points for creating tree = reference_count in the case + d_reference_count = reference_points.size(); // also equals to reference_count - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode) * num_nodes)); // private variables for kdtree - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_indices, sizeof(size_t) * m_num_points)); - HANDLE_ERROR(cudaMalloc((void**)&m_gpu_points, sizeof(kdtree::point) * m_num_points)); - - std::vector > cpu_nodes(num_nodes); // from left to right, id of nodes - std::vector indices(m_num_points); - std::vector < kdtree::KDNode* > next_node; + HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode) * num_nodes)); // copy data from host to device + HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * d_reference_count)); + HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(kdtree::point) * d_reference_count)); + std::vector > tmp_nodes(num_nodes); + std::vector indices(d_reference_count); + std::vector *> next_nodes; size_t cur_pos = 0; - - next_node.push_back(root); - - while (next_node.size()) { - std::vector * > next_search; - - while (next_node.size()) { - kdtree::KDNode *cur = next_node.back(); - next_node.pop_back(); - - int id = cur->id; // the nodes at same level are independent - - cpu_nodes[id].level = cur->level; - cpu_nodes[id].parent = cur->_parent; - cpu_nodes[id].left = cur->_left; - cpu_nodes[id].right = cur->_right; - cpu_nodes[id].split_value = cur->split_value; - cpu_nodes[id].num_indices = cur->indices.size(); // number of index - + next_nodes.push_back(d_root); + while (next_nodes.size()) { + std::vector *> next_search_nodes; + while (next_nodes.size()) { + kdtree::kdnode *cur = next_nodes.back(); + next_nodes.pop_back(); + int id = cur->idx; // the nodes at same level are independent + tmp_nodes[id].level = cur->level; + tmp_nodes[id].parent = cur->parent_idx; + tmp_nodes[id].left = cur->left_idx; + tmp_nodes[id].right = cur->right_idx; + tmp_nodes[id].split_value = cur->split_value; + tmp_nodes[id].num_index = cur->indices.size(); // number of index if (cur->indices.size()) { for (size_t i = 0; i < cur->indices.size(); i++) indices[cur_pos + i] = cur->indices[i]; - cpu_nodes[id].indices = (int)cur_pos; // beginning index that every bottom node has - cur_pos += cur->indices.size(); // store indices continuously + tmp_nodes[id].index = (int)cur_pos; // beginning index of reference_points that every bottom node has + cur_pos += cur->indices.size(); // store indices continuously for every query_point } else { - cpu_nodes[id].indices = -1; + tmp_nodes[id].index = -1; } if (cur->left) - next_search.push_back(cur->left); + next_search_nodes.push_back(cur->left); if (cur->right) - next_search.push_back(cur->right); + next_search_nodes.push_back(cur->right); } - next_node = next_search; + next_nodes = next_search_nodes; } - - HANDLE_ERROR(cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode) * cpu_nodes.size(), cudaMemcpyHostToDevice)); - HANDLE_ERROR(cudaMemcpy(m_gpu_indices, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); - HANDLE_ERROR(cudaMemcpy(m_gpu_points, &pts[0], sizeof(kdtree::point) * pts.size(), cudaMemcpyHostToDevice)); + HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode) * tmp_nodes.size(), cudaMemcpyHostToDevice)); + HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point) * reference_points.size(), cudaMemcpyHostToDevice)); } - void Search(T *QueryPoints, size_t QueryCount, size_t ColCount, T *dists, size_t *indices) { - std::vector < kdtree::point > query(QueryCount); - for (size_t j = 0; j < QueryCount; j++) - for (size_t i = 0; i < ColCount; i++) - query[j].coords[i] = QueryPoints[j * ColCount + i]; - - unsigned int threads = (unsigned int)(query.size() > 1024 ? 1024 : query.size()); - //unsigned int blocks = (unsigned int)ceil(query.size() / threads); - unsigned int blocks = (unsigned int)(query.size() / threads + (query.size() % threads ? 1 : 0)); - - kdtree::point *gpu_Query; - size_t *gpu_ret_indices; - T *gpu_ret_dist; - - HANDLE_ERROR(cudaMalloc((void**)&gpu_Query, sizeof(T) * query.size() * D)); - HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_indices, sizeof(size_t) * query.size())); - HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_dist, sizeof(T) * query.size())); - HANDLE_ERROR(cudaMemcpy(gpu_Query, &query[0], sizeof(T) * query.size() * D, cudaMemcpyHostToDevice)); - - SearchBatch << > > (m_gpu_nodes, m_gpu_indices, m_gpu_points, m_num_points, gpu_Query, query.size(), gpu_ret_indices, gpu_ret_dist); - - HANDLE_ERROR(cudaMemcpy(indices, gpu_ret_indices, sizeof(size_t) * query.size(), cudaMemcpyDeviceToHost)); - HANDLE_ERROR(cudaMemcpy(dists, gpu_ret_dist, sizeof(T) * query.size(), cudaMemcpyDeviceToHost)); + void Search(T *h_query_points, size_t query_count, size_t dim_count, T *dists, size_t *indices) { + std::vector > query_points(query_count); + for (size_t j = 0; j < query_count; j++) + for (size_t i = 0; i < dim_count; i++) + query_points[j].dim[i] = h_query_points[j * dim_count + i]; + + unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size()); + unsigned int blocks = (unsigned int)(query_points.size() / threads + (query_points.size() % threads ? 1 : 0)); + + kdtree::point *d_query_points; // create a pointer pointing to query points on gpu + size_t *d_indices; + T *d_distances; + + int *next_nodes; // create two STACK-like array + int *next_search_nodes; + + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory + + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_points.size() * D)); + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size())); + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size())); + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * 1000 * sizeof(int))); // STACK size right now is 1000, you can change it if you mean to + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * 1000 * sizeof(int))); + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_points.size() * D, cudaMemcpyHostToDevice)); + + SearchBatch<<>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge); + + if (Judge == NULL) { // do the following work if the thread works safely + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost)); + HANDLE_ERROR(cudaMemcpy(dists, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost)); + } - HANDLE_ERROR(cudaFree(gpu_Query)); - HANDLE_ERROR(cudaFree(gpu_ret_indices)); - HANDLE_ERROR(cudaFree(gpu_ret_dist)); + HANDLE_ERROR(cudaFree(next_nodes)); + HANDLE_ERROR(cudaFree(next_search_nodes)); + HANDLE_ERROR(cudaFree(d_query_points)); + HANDLE_ERROR(cudaFree(d_indices)); + HANDLE_ERROR(cudaFree(d_distances)); } }; } //end namespace stim -- libgit2 0.21.4