Commit 6a54a547256f32bceb7afb81e456d9c5fc699bee

Authored by Jiaming Guo
1 parent 42bb075c

new version of kdtree

Showing 1 changed file with 627 additions and 0 deletions   Show diff stats
stim/structures/kdtreenew.cuh 0 → 100644
  1 +// The per-thread CUDA traversal stack is currently sized at 50 entries (stack_size); increase it if you intend to build a deeper tree.
  2 +// data should be stored in row-major
  3 +// x1,x2,x3,x4,x5......
  4 +// y1,y2,y3,y4,y5......
  5 +// ....................
  6 +// ....................
  7 +
  8 +#ifndef KDTREE_H
  9 +#define KDTREE_H
  10 +#define stack_size 50
  11 +
  12 +#include "device_launch_parameters.h"
  13 +#include <cuda.h>
  14 +#include <cuda_runtime_api.h>
  15 +#include "cuda_runtime.h"
  16 +#include <vector>
  17 +#include <cstring>
  18 +#include <float.h>
  19 +#include <iostream>
  20 +#include <algorithm>
  21 +#include <stim/cuda/cudatools/error.h>
  22 +#include <stim/visualization/aabbn.h>
  23 +
  24 +namespace stim {
  25 + namespace cpu_kdtree {
		template<typename T, int D> // T: floating-point coordinate type; D: dimensionality of the points
		struct point {
			T dim[D]; // coordinates of one input point (row-major input is repacked into these)
		};
  30 +
  31 + template<typename T>
  32 + class cpu_kdnode {
  33 + public:
  34 + cpu_kdnode() { // constructor for initializing a kdnode
  35 + parent = NULL; // set every node's parent, left and right kdnode pointers to NULL
  36 + left = NULL;
  37 + right = NULL;
  38 + parent_idx = -1; // set parent node index to default -1
  39 + left_idx = -1;
  40 + right_idx = -1;
  41 + split_value = -1; // set split_value to default -1
  42 + }
  43 + int idx; // index of current node
  44 + int parent_idx, left_idx, right_idx; // index of parent, left and right nodes
  45 + cpu_kdnode *parent, *left, *right; // parent, left and right kdnodes
  46 + T split_value; // splitting value of current node
  47 + std::vector <size_t> indices; // it indicates the points' indices that current node has
  48 + size_t level; // tree level of current node
  49 + };
  50 + } // end of namespace cpu_kdtree
  51 +
	// Flattened, GPU-friendly tree node: links are indices into the node array
	// (-1 = none) rather than pointers. Member order/layout must match the
	// host-side fill in create(), which memcpys a vector of these to the device.
	template <typename T>
	struct cuda_kdnode {
		int parent, left, right; // indices of parent/left/right nodes (-1 = none; left == -1 marks a leaf)
		T split_value; // splitting coordinate value of this node
		size_t num_index; // number of point indices this node owns
		int index; // offset of this node's first index in the flat index array (-1 if it owns none)
		size_t level; // depth in the tree (root = 0)
	};
  60 +
  61 + template <typename T, int D>
  62 + __device__ T gpu_distance(cpu_kdtree::point<T, D> &a, cpu_kdtree::point<T, D> &b) {
  63 + T distance = 0;
  64 +
  65 + for (size_t i = 0; i < D; i++) {
  66 + T d = a.dim[i] - b.dim[i];
  67 + distance += d*d;
  68 + }
  69 + return distance;
  70 + }
  71 +
  72 + template <typename T, int D>
  73 + __device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, int cur, cpu_kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
  74 + T best_distance = FLT_MAX;
  75 + size_t best_index = 0;
  76 +
  77 + while (true) { // break until reach the bottom
  78 + int split_axis = nodes[cur].level % D;
  79 + if (nodes[cur].left == -1) { // check whether it has left node or not
  80 + *d_node = cur;
  81 + for (int i = 0; i < nodes[cur].num_index; i++) {
  82 + size_t idx = indices[nodes[cur].index + i];
  83 + T dist = gpu_distance<T, D>(d_query_point, d_reference_points[idx]);
  84 + if (dist < best_distance) {
  85 + best_distance = dist;
  86 + best_index = idx;
  87 + }
  88 + }
  89 + break;
  90 + }
  91 + else if (d_query_point.dim[split_axis] < nodes[cur].split_value) { // jump into specific son node
  92 + cur = nodes[cur].left;
  93 + }
  94 + else {
  95 + cur = nodes[cur].right;
  96 + }
  97 + }
  98 + *d_distance = best_distance;
  99 + *d_index = best_index;
  100 + }
  101 +
  102 + template <typename T, int D>
  103 + __device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
  104 + T best_distance = FLT_MAX;
  105 + size_t best_index = 0;
  106 +
  107 + int next_nodes_pos = 0; // initialize pop out order index
  108 + next_nodes[id * stack_size + next_nodes_pos] = cur; // find data that belongs to the very specific thread
  109 + next_nodes_pos++;
  110 +
  111 + while (next_nodes_pos) {
  112 + int next_search_nodes_pos = 0; // record push back order index
  113 + while (next_nodes_pos) {
  114 + cur = next_nodes[id * stack_size + next_nodes_pos - 1]; // pop out the last push in one and keep poping out
  115 + next_nodes_pos--;
  116 + int split_axis = nodes[cur].level % D;
  117 +
  118 + if (nodes[cur].left == -1) {
  119 + for (int i = 0; i < nodes[cur].num_index; i++) {
  120 + int idx = indices[nodes[cur].index + i]; // all indices are stored in one array, pick up from every node's beginning index
  121 + T d = gpu_distance<T>(d_query_point, d_reference_points[idx]);
  122 + if (d < best_distance) {
  123 + best_distance = d;
  124 + best_index = idx;
  125 + }
  126 + }
  127 + }
  128 + else {
  129 + T d = d_query_point.dim[split_axis] - nodes[cur].split_value;
  130 +
  131 + if (fabs(d) > range) {
  132 + if (d < 0) {
  133 + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left;
  134 + next_search_nodes_pos++;
  135 + }
  136 + else {
  137 + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right;
  138 + next_search_nodes_pos++;
  139 + }
  140 + }
  141 + else {
  142 + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right;
  143 + next_search_nodes_pos++;
  144 + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left;
  145 + next_search_nodes_pos++;
  146 + if (next_search_nodes_pos > stack_size) {
  147 + printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id);
  148 + (*Judge)++;
  149 + }
  150 + }
  151 + }
  152 + }
  153 + for (int i = 0; i < next_search_nodes_pos; i++)
  154 + next_nodes[id * stack_size + i] = next_search_nodes[id * stack_size + i];
  155 + next_nodes_pos = next_search_nodes_pos;
  156 + }
  157 + *d_distance = best_distance;
  158 + *d_index = best_index;
  159 + }
  160 +
  161 + template <typename T, int D>
  162 + __device__ void search(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
  163 + int best_node = 0;
  164 + T best_distance = FLT_MAX;
  165 + size_t best_index = 0;
  166 + T radius = 0;
  167 +
  168 + search_at_node<T, D>(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node);
  169 + radius = sqrt(best_distance); // get range
  170 + int cur = best_node;
  171 +
  172 + while (nodes[cur].parent != -1) {
  173 + int parent = nodes[cur].parent;
  174 + int split_axis = nodes[parent].level % D;
  175 +
  176 + T tmp_dist = FLT_MAX;
  177 + size_t tmp_idx;
  178 + if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) {
  179 + if (nodes[parent].left != cur)
  180 + search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
  181 + else
  182 + search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
  183 + }
  184 + if (tmp_dist < best_distance) {
  185 + best_distance = tmp_dist;
  186 + best_index = tmp_idx;
  187 + }
  188 + cur = parent;
  189 + }
  190 + *d_distance = sqrt(best_distance);
  191 + *d_index = best_index;
  192 + }
  193 +
  194 + template <typename T, int D>
  195 + __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) {
  196 + size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  197 + if (idx >= d_query_count) return; // avoid segfault
  198 +
  199 + search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent
  200 + }
  201 +
  202 + template <typename T, int D>
  203 + void search_stream(cuda_kdnode<T> *d_nodes, size_t *d_index, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> *query_stream_points, size_t stream_count, size_t *indices, T *distances) {
  204 + unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count);
  205 + unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 1 : 0));
  206 +
  207 + cpu_kdtree::point<T, D> *d_query_points;
  208 + size_t *d_indices;
  209 + T *d_distances;
  210 +
  211 + int *next_nodes;
  212 + int *next_search_nodes;
  213 +
  214 + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * stream_count * D));
  215 + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * stream_count));
  216 + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * stream_count));
  217 + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int)));
  218 + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));
  219 + HANDLE_ERROR(cudaMemcpy(d_query_points, query_stream_points, sizeof(T) * stream_count * D, cudaMemcpyHostToDevice));
  220 +
  221 + int *Judge = NULL;
  222 +
  223 + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, stream_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge);
  224 +
  225 + if(Judge == NULL) {
  226 + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * stream_count, cudaMemcpyDeviceToHost));
  227 + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * stream_count, cudaMemcpyDeviceToHost));
  228 + }
  229 +
  230 + HANDLE_ERROR(cudaFree(next_nodes));
  231 + HANDLE_ERROR(cudaFree(next_search_nodes));
  232 + HANDLE_ERROR(cudaFree(d_query_points));
  233 + HANDLE_ERROR(cudaFree(d_indices));
  234 + HANDLE_ERROR(cudaFree(d_distances));
  235 + }
  236 +
  237 + template <typename T, int D = 3> // set dimension of data to default 3
  238 + class kdtree {
  239 + protected:
  240 + int current_axis; // current judging axis
  241 + int n_id; // store the total number of nodes
  242 + std::vector < typename cpu_kdtree::point<T, D> > *tmp_points; // transfer or temperary points
  243 + std::vector < typename cpu_kdtree::point<T, D> > cpu_tmp_points; // for cpu searching
  244 + cpu_kdtree::cpu_kdnode<T> *root; // root node
  245 + static kdtree<T, D> *cur_tree_ptr;
  246 + #ifdef __CUDACC__
  247 + cuda_kdnode<T> *d_nodes;
  248 + size_t *d_index;
  249 + cpu_kdtree::point<T, D>* d_reference_points;
  250 + size_t npts;
  251 + int num_nodes;
  252 + #endif
  253 + public:
  254 + kdtree() { // constructor for creating a cpu_kdtree
  255 + cur_tree_ptr = this; // create a class pointer points to the current class value
  256 + n_id = 0; // set total number of points to default 0
  257 + }
  258 +
  259 + ~kdtree() { // destructor of cpu_kdtree
  260 + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes;
  261 + next_nodes.push_back(root);
  262 + while (next_nodes.size()) {
  263 + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_search_nodes;
  264 + while (next_nodes.size()) {
  265 + cpu_kdtree::cpu_kdnode<T> *cur = next_nodes.back();
  266 + next_nodes.pop_back();
  267 + if (cur->left)
  268 + next_search_nodes.push_back(cur->left);
  269 + if (cur->right)
  270 + next_search_nodes.push_back(cur->right);
  271 + delete cur;
  272 + }
  273 + next_nodes = next_search_nodes;
  274 + }
  275 + root = NULL;
  276 + #ifdef __CUDACC__
  277 + HANDLE_ERROR(cudaFree(d_nodes));
  278 + HANDLE_ERROR(cudaFree(d_index));
  279 + HANDLE_ERROR(cudaFree(d_reference_points));
  280 + #endif
  281 + }
  282 +
		// Build the CPU kd-tree by breadth-first splitting: every node holding
		// more than one point is split until max_levels is reached. The order
		// in which n_id is handed out defines each node's idx, which later
		// becomes its slot in the flattened GPU node array.
		// NOTE(review): tmp_points keeps a pointer to `reference_points`; the
		// caller must keep that vector alive while the tree is being built/used.
		void cpu_create(std::vector < typename cpu_kdtree::point<T, D> > &reference_points, size_t max_levels) {
			tmp_points = &reference_points;
			root = new cpu_kdtree::cpu_kdnode<T>(); // initialize the root node
			root->idx = n_id++; // the index of root is 0
			root->level = 0; // tree level begins at 0
			root->indices.resize(reference_points.size());
			for (size_t i = 0; i < reference_points.size(); i++) {
				root->indices[i] = i; // initially the root owns every input point
			}
			std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes; // nodes of the current level
			next_nodes.push_back(root);
			while (next_nodes.size()) {
				std::vector <cpu_kdtree::cpu_kdnode<T>*> next_search_nodes; // nodes of the next level
				while (next_nodes.size()) { // inner loop drains the current level into the next
					cpu_kdtree::cpu_kdnode<T> *current_node = next_nodes.back(); // handle nodes one by one (right child first)
					next_nodes.pop_back();
					if (current_node->level < max_levels) {
						if (current_node->indices.size() > 1) { // split any node holding more than one point
							cpu_kdtree::cpu_kdnode<T> *left = new cpu_kdtree::cpu_kdnode<T>();
							cpu_kdtree::cpu_kdnode<T> *right = new cpu_kdtree::cpu_kdnode<T>();
							left->idx = n_id++; // children get consecutive indices
							right->idx = n_id++;
							split(current_node, left, right); // sort on the split axis and partition the indices
							std::vector <size_t> temp; // empty vector used to release storage
							//temp.resize(current_node->indices.size());
							current_node->indices.swap(temp); // interior nodes do not keep their indices
							current_node->left = left;
							current_node->right = right;
							current_node->left_idx = left->idx;
							current_node->right_idx = right->idx;
							if (right->indices.size())
								next_search_nodes.push_back(right); // pushed first so left pops (and splits) first
							if (left->indices.size())
								next_search_nodes.push_back(left);
						}
					}
				}
				next_nodes = next_search_nodes; // descend one level
			}
		}
  323 +
  324 + static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort
  325 + std::vector < typename cpu_kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer
  326 + return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis];
  327 + }
  328 +
  329 + void split(cpu_kdtree::cpu_kdnode<T> *cur, cpu_kdtree::cpu_kdnode<T> *left, cpu_kdtree::cpu_kdnode<T> *right) {
  330 + std::vector < typename cpu_kdtree::point<T, D> > &pts = *tmp_points;
  331 + current_axis = cur->level % D; // indicate the judicative dimension or axis
  332 + std::sort(cur->indices.begin(), cur->indices.end(), sort_points); // using SortPoints as comparison function to sort the data
  333 + size_t mid_value = cur->indices[cur->indices.size() / 2]; // odd in the mid_value, even take the floor
  334 + cur->split_value = pts[mid_value].dim[current_axis]; // get the parent node
  335 + left->parent = cur; // set the parent of the next search nodes to current node
  336 + right->parent = cur;
  337 + left->level = cur->level + 1; // level + 1
  338 + right->level = cur->level + 1;
  339 + left->parent_idx = cur->idx; // set its parent node's index
  340 + right->parent_idx = cur->idx;
  341 + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one
  342 + size_t idx = cur->indices[i];
  343 + if (pts[idx].dim[current_axis] < cur->split_value)
  344 + left->indices.push_back(idx);
  345 + else
  346 + right->indices.push_back(idx);
  347 + }
  348 + }
  349 +
		// Build the tree from `reference_count` points stored row-major in
		// h_reference_points (D consecutive values per point). Under CUDA the
		// CPU tree is built first and then flattened into device arrays
		// (d_nodes, d_index, d_reference_points); without CUDA only the CPU
		// tree is kept, plus a copy of the points for cpu_search.
		void create(T *h_reference_points, size_t reference_count, size_t max_levels) {
#ifdef __CUDACC__
			// Depth is capped so worst-case traversals fit the fixed-size
			// per-thread stacks (stack_size slots each).
			if (max_levels > 10) {
				std::cout<<"The max_tree_levels should be smaller!"<<std::endl;
				exit(1);
			}
			//bb.init(&h_reference_points[0]);
			//aaboundingboxing<T, D>(bb, h_reference_points, reference_count);

			// Repack the raw row-major array into point structs.
			std::vector < typename cpu_kdtree::point<T, D>> reference_points(reference_count);
			for (size_t j = 0; j < reference_count; j++)
				for (size_t i = 0; i < D; i++)
					reference_points[j].dim[i] = h_reference_points[j * D + i];
			(*this).cpu_create(reference_points, max_levels); // build the pointer-based tree on the CPU first
			cpu_kdtree::cpu_kdnode<T> *d_root = (*this).get_root();
			num_nodes = (*this).get_num_nodes();
			npts = reference_count;

			HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes)); // device copies of nodes, indices and points
			HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * npts));
			HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(cpu_kdtree::point<T, D>) * npts));

			// Flatten the pointer-based tree into index-based cuda_kdnode
			// records plus one contiguous array of point indices.
			std::vector < cuda_kdnode<T> > tmp_nodes(num_nodes);
			std::vector <size_t> indices(npts);
			std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes;
			size_t cur_pos = 0; // next free slot in the flat index array
			next_nodes.push_back(d_root);
			while (next_nodes.size()) {
				std::vector <typename cpu_kdtree::cpu_kdnode<T>*> next_search_nodes;
				while (next_nodes.size()) {
					cpu_kdtree::cpu_kdnode<T> *cur = next_nodes.back();
					next_nodes.pop_back();
					int id = cur->idx; // records are keyed by node index, so traversal order does not matter
					tmp_nodes[id].level = cur->level;
					tmp_nodes[id].parent = cur->parent_idx;
					tmp_nodes[id].left = cur->left_idx;
					tmp_nodes[id].right = cur->right_idx;
					tmp_nodes[id].split_value = cur->split_value;
					tmp_nodes[id].num_index = cur->indices.size(); // number of indices this node owns
					if (cur->indices.size()) {
						for (size_t i = 0; i < cur->indices.size(); i++)
							indices[cur_pos + i] = cur->indices[i];

						tmp_nodes[id].index = (int)cur_pos; // where this node's indices start in the flat array
						cur_pos += cur->indices.size(); // every node's indices are stored contiguously
					}
					else {
						tmp_nodes[id].index = -1; // this node owns no points
					}

					if (cur->left)
						next_search_nodes.push_back(cur->left);

					if (cur->right)
						next_search_nodes.push_back(cur->right);
				}
				next_nodes = next_search_nodes;
			}
			HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice));
			HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
			HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(cpu_kdtree::point<T, D>) * reference_count, cudaMemcpyHostToDevice));

#else
			// CPU-only build: repack the input and keep a copy of the points in
			// cpu_tmp_points for cpu_search (tmp_points would otherwise dangle,
			// as it references the local vector below).
			std::vector < typename cpu_kdtree::point<T, D> > reference_points(reference_count);
			for (size_t j = 0; j < reference_count; j++)
				for (size_t i = 0; i < D; i++)
					reference_points[j].dim[i] = h_reference_points[j * D + i];
			cpu_create(reference_points, max_levels);
			cpu_tmp_points = *tmp_points;

#endif
		}
  422 +
		// Total number of nodes allocated so far (the root is assigned idx 0).
		int get_num_nodes() const {
			return n_id;
		}

		// Root node of the CPU tree; unset until cpu_create()/create() runs.
		cpu_kdtree::cpu_kdnode<T>* get_root() const {
			return root;
		}
  430 +
  431 + T cpu_distance(const cpu_kdtree::point<T, D> &a, const cpu_kdtree::point<T, D> &b) {
  432 + T distance = 0;
  433 +
  434 + for (size_t i = 0; i < D; i++) {
  435 + T d = a.dim[i] - b.dim[i];
  436 + distance += d*d;
  437 + }
  438 + return distance;
  439 + }
  440 +
  441 + void cpu_search_at_node(cpu_kdtree::cpu_kdnode<T> *cur, const cpu_kdtree::point<T, D> &query, size_t *index, T *distance, cpu_kdtree::cpu_kdnode<T> **node) {
  442 + T best_distance = FLT_MAX; // initialize the best distance to max of floating point
  443 + size_t best_index = 0;
  444 + std::vector < typename cpu_kdtree::point<T, D> > pts = cpu_tmp_points;
  445 + while (true) {
  446 + size_t split_axis = cur->level % D;
  447 + if (cur->left == NULL) { // risky but acceptable, same goes for right because left and right are in same pace
  448 + *node = cur; // pointer points to a pointer
  449 + for (size_t i = 0; i < cur->indices.size(); i++) {
  450 + size_t idx = cur->indices[i];
  451 + T d = cpu_distance(query, pts[idx]); // compute distances
  452 + /// if we want to compute k nearest neighbor, we can input the last resul
  453 + /// (last_best_dist < dist < best_dist) to select the next point until reaching to k
  454 + if (d < best_distance) {
  455 + best_distance = d;
  456 + best_index = idx; // record the nearest neighbor index
  457 + }
  458 + }
  459 + break; // find the target point then break the loop
  460 + }
  461 + else if (query.dim[split_axis] < cur->split_value) { // if it has son node, visit the next node on either left side or right side
  462 + cur = cur->left;
  463 + }
  464 + else {
  465 + cur = cur->right;
  466 + }
  467 + }
  468 + *index = best_index;
  469 + *distance = best_distance;
  470 + }
  471 +
  472 + void cpu_search_at_node_range(cpu_kdtree::cpu_kdnode<T> *cur, const cpu_kdtree::point<T, D> &query, T range, size_t *index, T *distance) {
  473 + T best_distance = FLT_MAX; // initialize the best distance to max of floating point
  474 + size_t best_index = 0;
  475 + std::vector < typename cpu_kdtree::point<T, D> > pts = cpu_tmp_points;
  476 + std::vector < typename cpu_kdtree::cpu_kdnode<T>*> next_node;
  477 + next_node.push_back(cur);
  478 + while (next_node.size()) {
  479 + std::vector<typename cpu_kdtree::cpu_kdnode<T>*> next_search;
  480 + while (next_node.size()) {
  481 + cur = next_node.back();
  482 + next_node.pop_back();
  483 + size_t split_axis = cur->level % D;
  484 + if (cur->left == NULL) {
  485 + for (size_t i = 0; i < cur->indices.size(); i++) {
  486 + size_t idx = cur->indices[i];
  487 + T d = cpu_distance(query, pts[idx]);
  488 + if (d < best_distance) {
  489 + best_distance = d;
  490 + best_index = idx;
  491 + }
  492 + }
  493 + }
  494 + else {
  495 + T d = query.dim[split_axis] - cur->split_value; // computer distance along specific axis or dimension
  496 + /// there are three possibilities: on either left or right, and on both left and right
  497 + if (fabs(d) > range) { // absolute value of floating point to see if distance will be larger that best_dist
  498 + if (d < 0)
  499 + next_search.push_back(cur->left); // every left[split_axis] is less and equal to cur->split_value, so it is possible to find the nearest point in this region
  500 + else
  501 + next_search.push_back(cur->right);
  502 + }
  503 + else { // it is possible that nereast neighbor will appear on both left and right
  504 + next_search.push_back(cur->left);
  505 + next_search.push_back(cur->right);
  506 + }
  507 + }
  508 + }
  509 + next_node = next_search; // pop out at least one time
  510 + }
  511 + *index = best_index;
  512 + *distance = best_distance;
  513 + }
  514 +
  515 + void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) {
  516 + /// first convert the input query point into specific type
  517 + cpu_kdtree::point<T, D> query;
  518 + for (size_t j = 0; j < query_count; j++) {
  519 + for (size_t i = 0; i < D; i++)
  520 + query.dim[i] = h_query_points[j * D + i];
  521 + /// find the nearest node, this will be the upper bound for the next time searching
  522 + cpu_kdtree::cpu_kdnode<T> *best_node = NULL;
  523 + T best_distance = FLT_MAX;
  524 + size_t best_index = 0;
  525 + T radius = 0; // radius for range
  526 + cpu_search_at_node(root, query, &best_index, &best_distance, &best_node); // simple search to rougly determine a result for next search step
  527 + radius = sqrt(best_distance); // It is possible that nearest will appear in another region
  528 + /// find other possibilities
  529 + cpu_kdtree::cpu_kdnode<T> *cur = best_node;
  530 + while (cur->parent != NULL) { // every node that you pass will be possible to be the best node
  531 + /// go up
  532 + cpu_kdtree::cpu_kdnode<T> *parent = cur->parent; // travel back to every node that we pass through
  533 + size_t split_axis = (parent->level) % D;
  534 + /// search other nodes
  535 + size_t tmp_index;
  536 + T tmp_distance = FLT_MAX;
  537 + if (fabs(parent->split_value - query.dim[split_axis]) <= radius) {
  538 + /// search opposite node
  539 + if (parent->left != cur)
  540 + cpu_search_at_node_range(parent->left, query, radius, &tmp_index, &tmp_distance); // to see whether it is its mother node's left son node
  541 + else
  542 + cpu_search_at_node_range(parent->right, query, radius, &tmp_index, &tmp_distance);
  543 + }
  544 + if (tmp_distance < best_distance) {
  545 + best_distance = tmp_distance;
  546 + best_index = tmp_index;
  547 + }
  548 + cur = parent;
  549 + }
  550 + h_indices[j] = best_index;
  551 + h_distances[j] = best_distance;
  552 + }
  553 + }
  554 +
  555 + void search(T *h_query_points, size_t query_count, size_t *indices, T *distances) {
  556 + #ifdef __CUDACC__
  557 + std::vector < typename cpu_kdtree::point<T, D> > query_points(query_count);
  558 + for (size_t j = 0; j < query_count; j++)
  559 + for (size_t i = 0; i < D; i++)
  560 + query_points[j].dim[i] = h_query_points[j * D + i];
  561 +
  562 + cudaDeviceProp prop;
  563 + cudaGetDeviceProperties(&prop, 0);
  564 +
  565 + size_t query_memory = D * sizeof(T) * query_count;
  566 + size_t N = 3 * query_memory / prop.totalGlobalMem; //consider index and distance, roughly 3 times
  567 + if (N > 1) {
  568 + N++;
  569 + size_t stream_count = query_count / N;
  570 + for (size_t n = 0; n < N; n++) {
  571 + size_t query_stream_start = n * stream_count;
  572 + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_stream_start], stream_count, &indices[query_stream_start], &distances[query_stream_start]);
  573 + }
  574 + size_t stream_remain_count = query_count - N * stream_count;
  575 + if (stream_remain_count > 0) {
  576 + size_t query_remain_start = N * stream_count;
  577 + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_remain_start], stream_remain_count, &indices[query_remain_start], &distances[query_remain_start]);
  578 + }
  579 + }
  580 + else {
  581 + unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count);
  582 + unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0));
  583 +
  584 + cpu_kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu
  585 + size_t *d_indices;
  586 + T *d_distances;
  587 +
  588 + int *next_nodes; // create two STACK-like array
  589 + int *next_search_nodes;
  590 +
  591 + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory
  592 +
  593 + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_count * D));
  594 + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_count));
  595 + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_count));
  596 + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to
  597 + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));
  598 + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_count * D, cudaMemcpyHostToDevice));
  599 +
  600 + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge);
  601 +
  602 + if (Judge == NULL) { // do the following work if the thread works safely
  603 + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_count, cudaMemcpyDeviceToHost));
  604 + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_count, cudaMemcpyDeviceToHost));
  605 + }
  606 +
  607 + HANDLE_ERROR(cudaFree(next_nodes));
  608 + HANDLE_ERROR(cudaFree(next_search_nodes));
  609 + HANDLE_ERROR(cudaFree(d_query_points));
  610 + HANDLE_ERROR(cudaFree(d_indices));
  611 + HANDLE_ERROR(cudaFree(d_distances));
  612 + }
  613 +
  614 + #else
  615 + cpu_search(h_query_points, query_count, indices, distances);
  616 +
  617 + #endif
  618 +
  619 + }
  620 +
  621 + }; //end class kdtree
  622 +
	// Out-of-class definition of the static "active tree" pointer; sort_points
	// (a static comparator) reads the current tree's state through it.
	template <typename T, int D>
	kdtree<T, D>* kdtree<T, D>::cur_tree_ptr = NULL;
  625 +
  626 +} //end namespace stim
  627 +#endif
0 628 \ No newline at end of file
... ...