diff --git a/stim/structures/kdtree.cuh b/stim/structures/kdtree.cuh
new file mode 100644
index 0000000..63d530f
--- /dev/null
+++ b/stim/structures/kdtree.cuh
@@ -0,0 +1,415 @@
+// change CUDA_STACK together with max_tree_levels by trial and error
+// data should be stored in row-major order:
+// x1,x2,x3,x4,x5......
+// y1,y2,y3,y4,y5......
+// ....................
+// ....................
+
+#ifndef KDTREE_H
+#define KDTREE_H
+
+#include "device_launch_parameters.h"
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include "cuda_runtime.h"
+#include <vector>
+#include <float.h>
+#include <iostream>
+#include <algorithm>
+
+/// error-checking wrapper, used through the HANDLE_ERROR macro
+static void HandleError(cudaError_t err, const char *file, int line) {
+	if (err != cudaSuccess) {
+		std::cout << cudaGetErrorString(err) << " in " << file << " at line " << line << std::endl;
+	}
+}
+#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
+
+#define CUDA_STACK 50								// fixed per-thread stack size for the range search; tune together with max_tree_levels
+
+namespace stim {
+	namespace kdtree {
+		template <typename T, int D>
+		struct point {
+			T coords[D];							// if we use size() on a vector of points, it returns the number of point structures
+		};
+
+		template <typename T>
+		class KDNode {
+		public:
+			KDNode() {								// initialization
+				parent = NULL;
+				left = NULL;
+				right = NULL;
+				split_value = -1;
+				_parent = -1;
+				_left = -1;
+				_right = -1;
+			}
+			int id;									// id of the current node
+			size_t level;
+			KDNode *parent, *left, *right;
+			int _parent, _left, _right;				// ids of the parent and child nodes (-1 if absent)
+			T split_value;							// splitting value of the node
+			std::vector<size_t> indices;			// indices of the data points held by the current node
+		};
+	}
+
+	template <typename T, int D>
+	class cpu_kdtree {
+
+	protected:
+		std::vector<kdtree::point<T, D> > *m_pts;
+		kdtree::KDNode<T> *m_root;					// root node
+		int m_current_axis;
+		size_t m_levels;
+		int m_cmps;									// counts how many comparisons are made in the tree for one query
+		int m_id;									// running node id counter
+		static cpu_kdtree<T, D> *myself;
+	public:
+		cpu_kdtree() {								// initialization
+			myself = this;
+			m_id = 0;								// level is the axis index, id is the node identifier
+			m_root = NULL;							// so the destructor is safe if Create() is never called
+		}
+		~cpu_kdtree() {								// destructor: delete every node created by Create()
+			if (m_root == NULL) return;
+			std::vector<kdtree::KDNode<T>*> next_node;
+			next_node.push_back(m_root);
+			while (next_node.size()) {
+				std::vector<kdtree::KDNode<T>*> next_search;
+				while (next_node.size()) {
+					kdtree::KDNode<T> *cur = next_node.back();
+					next_node.pop_back();
+					if (cur->left)
+						next_search.push_back(cur->left);
+					if (cur->right)
+						next_search.push_back(cur->right);
+					delete cur;
+				}
+				next_node = next_search;
+			}
+			m_root = NULL;
+		}
+		void Create(std::vector<kdtree::point<T, D> > &pts, size_t max_levels) {
+			m_pts = &pts;															// pointer to the input data
+			m_levels = max_levels;													// store the maximum number of tree levels
+			m_root = new kdtree::KDNode<T>();										// allocate the root node
+			m_root->id = m_id++;													// the root gets id 0 and level 0
+			m_root->level = 0;
+			m_root->indices.resize(pts.size());										// the root owns every point index
+			for (size_t i = 0; i < pts.size(); i++) {
+				m_root->indices[i] = i;												// like what we did with the Keys in the GPU-BF part
+			}
+			std::vector<kdtree::KDNode<T>*> next_node;								// nodes of the current level
+			next_node.push_back(m_root);
+			while (next_node.size()) {
+				std::vector<kdtree::KDNode<T>*> next_search;						// two nested WHILE loops because the next level needs its own vector
+				while (next_node.size()) {
+					kdtree::KDNode<T> *current_node = next_node.back();			// pointer to the current node (right child is popped first)
+					next_node.pop_back();											// pop the current node to make room for the next one
+					if (current_node->level < max_levels) {							// max_levels should be small compared with the number of points
+						if (current_node->indices.size() > 1) {
+							kdtree::KDNode<T> *left = new kdtree::KDNode<T>();
+							kdtree::KDNode<T> *right = new kdtree::KDNode<T>();
+							left->id = m_id++;										// ids are assigned in traversal order; fine for large data sets since max_levels is small
+							right->id = m_id++;
+							Split(current_node, left, right);						// split the data into left and right and set the current node's split value
+							std::vector<size_t> temp;								// empty vector of indices
+							//temp.resize(current_node->indices.size());
+							current_node->indices.swap(temp);						// release the current node's indices
+							current_node->left = left;
+							current_node->right = right;
+							current_node->_left = left->id;							// record that it has a left child and store its id
+							current_node->_right = right->id;						// record that it has a right child and store its id
+							if (left->indices.size())
+								next_search.push_back(left);						// the stack pops right first (last in, first out); left and right are independent and could be processed in parallel
+							if (right->indices.size())
+								next_search.push_back(right);
+						}
+					}
+				}
+				next_node = next_search;
+			}
+		}
+		static bool SortPoints(const size_t a, const size_t b) {
+			std::vector<kdtree::point<T, D> > &pts = *myself->m_pts;
+			return pts[a].coords[myself->m_current_axis] < pts[b].coords[myself->m_current_axis];
+		}
+		void Split(kdtree::KDNode<T> *cur, kdtree::KDNode<T> *left, kdtree::KDNode<T> *right) {
+			/// assumes both child nodes have already been created
+			std::vector<kdtree::point<T, D> > &pts = *m_pts;
+			m_current_axis = cur->level % D;										// the splitting axis for this level
+			std::sort(cur->indices.begin(), cur->indices.end(), SortPoints);		// sort the indices along the current axis, using SortPoints as the comparison function
+			size_t mid = cur->indices[cur->indices.size() / 2];						// index of the median point (upper median for an even count)
+			cur->split_value = pts[mid].coords[m_current_axis];						// the median coordinate becomes the split value
+			left->parent = cur;														// link both children back to the current node
+			right->parent = cur;
+			left->level = cur->level + 1;
+			right->level = cur->level + 1;
+			left->_parent = cur->id;												// record the parent's id
+			right->_parent = cur->id;
+			for (size_t i = 0; i < cur->indices.size(); i++) {						// distribute the points into the left and right children one by one
+				size_t idx = cur->indices[i];
+				if (pts[idx].coords[m_current_axis] < cur->split_value)
+					left->indices.push_back(idx);
+				else
+					right->indices.push_back(idx);
+			}
+		}
+		int GetNumNodes() const { return m_id; }
+		kdtree::KDNode<T>* GetRoot() const { return m_root; }
+	};	//end class cpu_kdtree
+
+	template <typename T, int D>
+	cpu_kdtree<T, D>* cpu_kdtree<T, D>::myself = NULL;		// definition of the static self-pointer used by SortPoints
+
+	template <typename T>
+	struct CUDA_KDNode {
+		size_t level;
+		int parent, left, right;							// ids of the parent and child nodes (-1 if absent)
+		T split_value;
+		size_t num_indices;									// number of indices this node holds
+		int indices;										// offset of this node's first index in the flat index array (-1 if none)
+	};
+
+	template <typename T, int D>
+	__device__ T Distance(kdtree::point<T, D> &a, kdtree::point<T, D> &b) {
+		T dist = 0;											// squared Euclidean distance
+
+		for (size_t i = 0; i < D; i++) {
+			T d = a.coords[i] - b.coords[i];
+			dist += d * d;
+		}
+		return dist;
+	}
+	template <typename T, int D>
+	__device__ void SearchAtNode(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, int cur, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist, int *ret_node) {
+		/// find the first candidate by descending to a leaf
+		size_t best_idx = 0;
+		T best_dist = FLT_MAX;
+
+		while (true) {
+			int split_axis = nodes[cur].level % D;
+			if (nodes[cur].left == -1) {					// leaf node: scan its points
+				*ret_node = cur;
+				for (size_t i = 0; i < nodes[cur].num_indices; i++) {
+					size_t idx = indices[nodes[cur].indices + i];
+					T dist = Distance(Query, pts[idx]);
+					if (dist < best_dist) {
+						best_dist = dist;
+						best_idx = idx;
+					}
+				}
+				break;
+			}
+			else if (Query.coords[split_axis] < nodes[cur].split_value) {
+				cur = nodes[cur].left;
+			}
+			else {
+				cur = nodes[cur].right;
+			}
+		}
+		*ret_index = best_idx;
+		*ret_dist = best_dist;
+	}
+	template <typename T, int D>
+	__device__ void SearchAtNodeRange(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, int cur, T range, size_t *ret_index, T *ret_dist) {
+		/// search through all the nodes that lie within the given range
+		size_t best_idx = 0;
+		T best_dist = FLT_MAX;
+		/// fixed-size stack; increase CUDA_STACK when needed
+		int next_node[CUDA_STACK];							// must be larger than 1
+		int next_node_pos = 0;								// stack pointer
+		next_node[next_node_pos++] = cur;					// push the starting node (same as next_node[next_node_pos] = cur; next_node_pos++)
+
+		while (next_node_pos) {
+			int next_search[CUDA_STACK];					// stores the nodes to visit in the next round
+			int next_search_pos = 0;						// push index for the next round
+			while (next_node_pos) {
+				cur = next_node[next_node_pos - 1];			// keep popping the most recently pushed node
+				next_node_pos--;
+				int split_axis = nodes[cur].level % D;
+
+				if (nodes[cur].left == -1) {				// leaf node: scan its points
+					for (size_t i = 0; i < nodes[cur].num_indices; i++) {
+						size_t idx = indices[nodes[cur].indices + i];		// all indices live in one flat array; start from this node's offset
+						T d = Distance(Query, pts[idx]);
+						if (d < best_dist) {
+							best_dist = d;
+							best_idx = idx;
+						}
+					}
+				}
+				else {
+					T d = Query.coords[split_axis] - nodes[cur].split_value;
+
+					if (fabs(d) > range) {					// the split plane is farther than range: only one side can contain a closer point
+						if (d < 0)
+							next_search[next_search_pos++] = nodes[cur].left;
+						else
+							next_search[next_search_pos++] = nodes[cur].right;
+					}
+					else {									// the split plane is within range: both sides must be visited
+						next_search[next_search_pos++] = nodes[cur].left;
+						next_search[next_search_pos++] = nodes[cur].right;
+					}
+				}
+			}
+
+			for (int i = 0; i < next_search_pos; i++)
+				next_node[i] = next_search[i];
+			next_node_pos = next_search_pos;				// swap in the next round, stack-style (last in, first out)
+		}
+		*ret_index = best_idx;
+		*ret_dist = best_dist;
+	}
+	template <typename T, int D>
+	__device__ void Search(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist) {
+		/// find the first nearest leaf
+		int best_node = 0;
+		size_t best_idx = 0;
+		T best_dist = FLT_MAX;
+		T radius = 0;
+		SearchAtNode(nodes, indices, pts, 0, Query, &best_idx, &best_dist, &best_node);
+		radius = sqrt(best_dist);
+		/// check the other possibilities by walking back up the tree
+		int cur = best_node;
+
+		while (nodes[cur].parent != -1) {
+			/// go up one level
+			int parent = nodes[cur].parent;
+			int split_axis = nodes[parent].level % D;
+			/// search the sibling subtree if the split plane lies within the current radius
+			T tmp_dist = FLT_MAX;
+			size_t tmp_idx = 0;
+			if (fabs(nodes[parent].split_value - Query.coords[split_axis]) <= radius) {
+				/// search the opposite subtree
+				if (nodes[parent].left != cur)
+					SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].left, radius, &tmp_idx, &tmp_dist);
+				else
+					SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].right, radius, &tmp_idx, &tmp_dist);
+			}
+			if (tmp_dist < best_dist) {
+				best_dist = tmp_dist;
+				best_idx = tmp_idx;
+			}
+			cur = parent;
+		}
+		*ret_index = best_idx;
+		*ret_dist = sqrt(best_dist);
+	}
+	template <typename T, int D>
+	__global__ void SearchBatch(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, size_t num_pts, kdtree::point<T, D> *Query, size_t num_Query, size_t *ret_index, T *ret_dist) {
+		size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+		if (idx >= num_Query) return;
+
+		Search(nodes, indices, pts, Query[idx], &ret_index[idx], &ret_dist[idx]);	// every query point is independent
+	}
+
+	template <typename T, int D>
+	class cuda_kdtree {
+	protected:
+		CUDA_KDNode<T> *m_gpu_nodes;						// nodes stored on the GPU
+		size_t *m_gpu_indices;
+		kdtree::point<T, D>* m_gpu_points;
+		size_t m_num_points;
+	public:
+		~cuda_kdtree() {
+			HANDLE_ERROR(cudaFree(m_gpu_nodes));
+			HANDLE_ERROR(cudaFree(m_gpu_indices));
+			HANDLE_ERROR(cudaFree(m_gpu_points));
+		}
+		void CreateKDTree(T *ReferencePoints, size_t ReferenceCount, size_t ColCount, size_t max_levels) {
+			std::vector<kdtree::point<T, D> > pts(ReferenceCount);					// convert the raw reference data into point structures
+			for (size_t j = 0; j < ReferenceCount; j++)
+				for (size_t i = 0; i < ColCount; i++)
+					pts[j].coords[i] = ReferencePoints[j * ColCount + i];
+			cpu_kdtree<T, D> tree;													// initialize a tree
+			tree.Create(pts, max_levels);											// create the KD-tree on the host
+			kdtree::KDNode<T> *root = tree.GetRoot();
+			int num_nodes = tree.GetNumNodes();
+			/// flatten the host tree and copy the same structure to the GPU
+			m_num_points = pts.size();												// number of points used to build the tree (= ReferenceCount here)
+
+			HANDLE_ERROR(cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode<T>) * num_nodes));		// persistent buffers owned by this kd-tree object
+			HANDLE_ERROR(cudaMalloc((void**)&m_gpu_indices, sizeof(size_t) * m_num_points));
+			HANDLE_ERROR(cudaMalloc((void**)&m_gpu_points, sizeof(kdtree::point<T, D>) * m_num_points));
+
+			std::vector<CUDA_KDNode<T> > cpu_nodes(num_nodes);						// flattened nodes, addressed by node id from left to right
+			std::vector<size_t> indices(m_num_points);
+			std::vector<kdtree::KDNode<T>*> next_node;
+
+			size_t cur_pos = 0;
+
+			next_node.push_back(root);
+
+			while (next_node.size()) {
+				std::vector<kdtree::KDNode<T>*> next_search;
+
+				while (next_node.size()) {
+					kdtree::KDNode<T> *cur = next_node.back();
+					next_node.pop_back();
+
+					int id = cur->id;												// nodes at the same level are independent
+
+					cpu_nodes[id].level = cur->level;
+					cpu_nodes[id].parent = cur->_parent;
+					cpu_nodes[id].left = cur->_left;
+					cpu_nodes[id].right = cur->_right;
+					cpu_nodes[id].split_value = cur->split_value;
+					cpu_nodes[id].num_indices = cur->indices.size();				// number of indices this node holds
+
+					if (cur->indices.size()) {
+						for (size_t i = 0; i < cur->indices.size(); i++)
+							indices[cur_pos + i] = cur->indices[i];
+
+						cpu_nodes[id].indices = (int)cur_pos;						// offset of this leaf's first index in the flat array
+						cur_pos += cur->indices.size();								// indices are stored contiguously
+					}
+					else {
+						cpu_nodes[id].indices = -1;
+					}
+
+					if (cur->left)
+						next_search.push_back(cur->left);
+
+					if (cur->right)
+						next_search.push_back(cur->right);
+				}
+				next_node = next_search;
+			}
+
+			HANDLE_ERROR(cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode<T>) * cpu_nodes.size(), cudaMemcpyHostToDevice));
+			HANDLE_ERROR(cudaMemcpy(m_gpu_indices, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
+			HANDLE_ERROR(cudaMemcpy(m_gpu_points, &pts[0], sizeof(kdtree::point<T, D>) * pts.size(), cudaMemcpyHostToDevice));
+		}
+		void Search(T *QueryPoints, size_t QueryCount, size_t ColCount, T *dists, size_t *indices) {
+			std::vector<kdtree::point<T, D> > query(QueryCount);
+			for (size_t j = 0; j < QueryCount; j++)
+				for (size_t i = 0; i < ColCount; i++)
+					query[j].coords[i] = QueryPoints[j * ColCount + i];
+
+			unsigned int threads = (unsigned int)(query.size() > 1024 ? 1024 : query.size());
+			//unsigned int blocks = (unsigned int)ceil(query.size() / threads);
+			unsigned int blocks = (unsigned int)(query.size() / threads + (query.size() % threads ? 1 : 0));
1 : 0)); + + kdtree::point *gpu_Query; + size_t *gpu_ret_indices; + T *gpu_ret_dist; + + HANDLE_ERROR(cudaMalloc((void**)&gpu_Query, sizeof(T) * query.size() * D)); + HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_indices, sizeof(size_t) * query.size())); + HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_dist, sizeof(T) * query.size())); + HANDLE_ERROR(cudaMemcpy(gpu_Query, &query[0], sizeof(T) * query.size() * D, cudaMemcpyHostToDevice)); + + SearchBatch << > > (m_gpu_nodes, m_gpu_indices, m_gpu_points, m_num_points, gpu_Query, query.size(), gpu_ret_indices, gpu_ret_dist); + + HANDLE_ERROR(cudaMemcpy(indices, gpu_ret_indices, sizeof(size_t) * query.size(), cudaMemcpyDeviceToHost)); + HANDLE_ERROR(cudaMemcpy(dists, gpu_ret_dist, sizeof(T) * query.size(), cudaMemcpyDeviceToHost)); + + HANDLE_ERROR(cudaFree(gpu_Query)); + HANDLE_ERROR(cudaFree(gpu_ret_indices)); + HANDLE_ERROR(cudaFree(gpu_ret_dist)); + } + }; +} //end namespace stim +#endif \ No newline at end of file -- libgit2 0.21.4