From 57914be592c16fa94a93e78a5eee4b1da56b0be3 Mon Sep 17 00:00:00 2001
From: Jiaming Guo
Date: Thu, 15 Dec 2016 12:04:01 -0600
Subject: [PATCH] new kdtree is ready.

---
 stim/structures/kdtree_dep.cuh | 627 ++++++++++++++++++++++++++++++++++++++++
 stim/structures/kdtreenew.cuh  | 627 ----------------------------------------
 2 files changed, 627 insertions(+), 627 deletions(-)
 create mode 100644 stim/structures/kdtree_dep.cuh
 delete mode 100644 stim/structures/kdtreenew.cuh

diff --git a/stim/structures/kdtree_dep.cuh b/stim/structures/kdtree_dep.cuh
new file mode 100644
index 0000000..093c71a
--- /dev/null
+++ b/stim/structures/kdtree_dep.cuh
@@ -0,0 +1,627 @@
+// the CUDA search stack size is currently set to 50 (see stack_size below); increase it if you intend to build a deeper tree
+// data should be stored in row-major order:
+// x1,x2,x3,x4,x5......
+// y1,y2,y3,y4,y5......
+// ....................
+// ....................
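// A minimal sketch of the flat point buffer that create() and search() below consume;
// the array name, size, and values here are hypothetical. The indexing
// h_reference_points[j * D + i] used later in this file reads the D coordinates of
// point j contiguously.
const int D_example = 3;                         // dimensionality used in this sketch only
float h_reference_points_example[4 * D_example] = {
    0.0f, 0.1f, 0.2f,                            // point 0: x, y, z
    1.0f, 1.1f, 1.2f,                            // point 1
    2.0f, 2.1f, 2.2f,                            // point 2
    3.0f, 3.1f, 3.2f                             // point 3
};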
+ +#ifndef KDTREE_H +#define KDTREE_H +#define stack_size 50 + +#include "device_launch_parameters.h" +#include +#include +#include "cuda_runtime.h" +#include +#include +#include +#include +#include +#include +#include + +namespace stim { + namespace cpu_kdtree { + template // typename refers to float or double while D refers to dimension of points + struct point { + T dim[D]; // create a structure to store every one input point + }; + + template + class cpu_kdnode { + public: + cpu_kdnode() { // constructor for initializing a kdnode + parent = NULL; // set every node's parent, left and right kdnode pointers to NULL + left = NULL; + right = NULL; + parent_idx = -1; // set parent node index to default -1 + left_idx = -1; + right_idx = -1; + split_value = -1; // set split_value to default -1 + } + int idx; // index of current node + int parent_idx, left_idx, right_idx; // index of parent, left and right nodes + cpu_kdnode *parent, *left, *right; // parent, left and right kdnodes + T split_value; // splitting value of current node + std::vector indices; // it indicates the points' indices that current node has + size_t level; // tree level of current node + }; + } // end of namespace cpu_kdtree + + template + struct cuda_kdnode { + int parent, left, right; + T split_value; + size_t num_index; // number of indices it has + int index; // the beginning index + size_t level; + }; + + template + __device__ T gpu_distance(cpu_kdtree::point &a, cpu_kdtree::point &b) { + T distance = 0; + + for (size_t i = 0; i < D; i++) { + T d = a.dim[i] - b.dim[i]; + distance += d*d; + } + return distance; + } + + template + __device__ void search_at_node(cuda_kdnode *nodes, size_t *indices, cpu_kdtree::point *d_reference_points, int cur, cpu_kdtree::point &d_query_point, size_t *d_index, T *d_distance, int *d_node) { + T best_distance = FLT_MAX; + size_t best_index = 0; + + while (true) { // break until reach the bottom + int split_axis = nodes[cur].level % D; + if (nodes[cur].left == -1) { // check whether it has left node or not + *d_node = cur; + for (int i = 0; i < nodes[cur].num_index; i++) { + size_t idx = indices[nodes[cur].index + i]; + T dist = gpu_distance(d_query_point, d_reference_points[idx]); + if (dist < best_distance) { + best_distance = dist; + best_index = idx; + } + } + break; + } + else if (d_query_point.dim[split_axis] < nodes[cur].split_value) { // jump into specific son node + cur = nodes[cur].left; + } + else { + cur = nodes[cur].right; + } + } + *d_distance = best_distance; + *d_index = best_index; + } + + template + __device__ void search_at_node_range(cuda_kdnode *nodes, size_t *indices, cpu_kdtree::point *d_reference_points, cpu_kdtree::point &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { + T best_distance = FLT_MAX; + size_t best_index = 0; + + int next_nodes_pos = 0; // initialize pop out order index + next_nodes[id * stack_size + next_nodes_pos] = cur; // find data that belongs to the very specific thread + next_nodes_pos++; + + while (next_nodes_pos) { + int next_search_nodes_pos = 0; // record push back order index + while (next_nodes_pos) { + cur = next_nodes[id * stack_size + next_nodes_pos - 1]; // pop out the last push in one and keep poping out + next_nodes_pos--; + int split_axis = nodes[cur].level % D; + + if (nodes[cur].left == -1) { + for (int i = 0; i < nodes[cur].num_index; i++) { + int idx = indices[nodes[cur].index + i]; // all indices are stored in one array, pick up from every 
node's beginning index + T d = gpu_distance(d_query_point, d_reference_points[idx]); + if (d < best_distance) { + best_distance = d; + best_index = idx; + } + } + } + else { + T d = d_query_point.dim[split_axis] - nodes[cur].split_value; + + if (fabs(d) > range) { + if (d < 0) { + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left; + next_search_nodes_pos++; + } + else { + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right; + next_search_nodes_pos++; + } + } + else { + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right; + next_search_nodes_pos++; + next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left; + next_search_nodes_pos++; + if (next_search_nodes_pos > stack_size) { + printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id); + (*Judge)++; + } + } + } + } + for (int i = 0; i < next_search_nodes_pos; i++) + next_nodes[id * stack_size + i] = next_search_nodes[id * stack_size + i]; + next_nodes_pos = next_search_nodes_pos; + } + *d_distance = best_distance; + *d_index = best_index; + } + + template + __device__ void search(cuda_kdnode *nodes, size_t *indices, cpu_kdtree::point *d_reference_points, cpu_kdtree::point &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { + int best_node = 0; + T best_distance = FLT_MAX; + size_t best_index = 0; + T radius = 0; + + search_at_node(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node); + radius = sqrt(best_distance); // get range + int cur = best_node; + + while (nodes[cur].parent != -1) { + int parent = nodes[cur].parent; + int split_axis = nodes[parent].level % D; + + T tmp_dist = FLT_MAX; + size_t tmp_idx; + if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) { + if (nodes[parent].left != cur) + search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge); + else + search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge); + } + if (tmp_dist < best_distance) { + best_distance = tmp_dist; + best_index = tmp_idx; + } + cur = parent; + } + *d_distance = sqrt(best_distance); + *d_index = best_index; + } + + template + __global__ void search_batch(cuda_kdnode *nodes, size_t *indices, cpu_kdtree::point *d_reference_points, cpu_kdtree::point *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= d_query_count) return; // avoid segfault + + search(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent + } + + template + void search_stream(cuda_kdnode *d_nodes, size_t *d_index, cpu_kdtree::point *d_reference_points, cpu_kdtree::point *query_stream_points, size_t stream_count, size_t *indices, T *distances) { + unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count); + unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 
1 : 0)); + + cpu_kdtree::point *d_query_points; + size_t *d_indices; + T *d_distances; + + int *next_nodes; + int *next_search_nodes; + + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * stream_count * D)); + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * stream_count)); + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * stream_count)); + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); + HANDLE_ERROR(cudaMemcpy(d_query_points, query_stream_points, sizeof(T) * stream_count * D, cudaMemcpyHostToDevice)); + + int *Judge = NULL; + + search_batch<<>> (d_nodes, d_index, d_reference_points, d_query_points, stream_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); + + if(Judge == NULL) { + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * stream_count, cudaMemcpyDeviceToHost)); + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * stream_count, cudaMemcpyDeviceToHost)); + } + + HANDLE_ERROR(cudaFree(next_nodes)); + HANDLE_ERROR(cudaFree(next_search_nodes)); + HANDLE_ERROR(cudaFree(d_query_points)); + HANDLE_ERROR(cudaFree(d_indices)); + HANDLE_ERROR(cudaFree(d_distances)); + } + + template // set dimension of data to default 3 + class kdtree { + protected: + int current_axis; // current judging axis + int n_id; // store the total number of nodes + std::vector < typename cpu_kdtree::point > *tmp_points; // transfer or temperary points + std::vector < typename cpu_kdtree::point > cpu_tmp_points; // for cpu searching + cpu_kdtree::cpu_kdnode *root; // root node + static kdtree *cur_tree_ptr; + #ifdef __CUDACC__ + cuda_kdnode *d_nodes; + size_t *d_index; + cpu_kdtree::point* d_reference_points; + size_t npts; + int num_nodes; + #endif + public: + kdtree() { // constructor for creating a cpu_kdtree + cur_tree_ptr = this; // create a class pointer points to the current class value + n_id = 0; // set total number of points to default 0 + } + + ~kdtree() { // destructor of cpu_kdtree + std::vector *> next_nodes; + next_nodes.push_back(root); + while (next_nodes.size()) { + std::vector *> next_search_nodes; + while (next_nodes.size()) { + cpu_kdtree::cpu_kdnode *cur = next_nodes.back(); + next_nodes.pop_back(); + if (cur->left) + next_search_nodes.push_back(cur->left); + if (cur->right) + next_search_nodes.push_back(cur->right); + delete cur; + } + next_nodes = next_search_nodes; + } + root = NULL; + #ifdef __CUDACC__ + HANDLE_ERROR(cudaFree(d_nodes)); + HANDLE_ERROR(cudaFree(d_index)); + HANDLE_ERROR(cudaFree(d_reference_points)); + #endif + } + + void cpu_create(std::vector < typename cpu_kdtree::point > &reference_points, size_t max_levels) { + tmp_points = &reference_points; + root = new cpu_kdtree::cpu_kdnode(); // initializing the root node + root->idx = n_id++; // the index of root is 0 + root->level = 0; // tree level begins at 0 + root->indices.resize(reference_points.size()); // get the number of points + for (size_t i = 0; i < reference_points.size(); i++) { + root->indices[i] = i; // set indices of input points + } + std::vector *> next_nodes; // next nodes + next_nodes.push_back(root); // push back the root node + while (next_nodes.size()) { + std::vector *> next_search_nodes; // next search nodes + while (next_nodes.size()) { // two same WHILE is because we need to make a new vector to store nodes for search + cpu_kdtree::cpu_kdnode *current_node = next_nodes.back(); // handle 
node one by one (right first) + next_nodes.pop_back(); // pop out current node in order to store next round of nodes + if (current_node->level < max_levels) { + if (current_node->indices.size() > 1) { // split if the nonleaf node contains more than one point + cpu_kdtree::cpu_kdnode *left = new cpu_kdtree::cpu_kdnode(); + cpu_kdtree::cpu_kdnode *right = new cpu_kdtree::cpu_kdnode(); + left->idx = n_id++; // set the index of current node's left node + right->idx = n_id++; + split(current_node, left, right); // split left and right and determine a node + std::vector temp; // empty vecters of int + //temp.resize(current_node->indices.size()); + current_node->indices.swap(temp); // clean up current node's indices + current_node->left = left; + current_node->right = right; + current_node->left_idx = left->idx; + current_node->right_idx = right->idx; + if (right->indices.size()) + next_search_nodes.push_back(right); // left pop out first + if (left->indices.size()) + next_search_nodes.push_back(left); + } + } + } + next_nodes = next_search_nodes; // go deeper within the tree + } + } + + static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort + std::vector < typename cpu_kdtree::point > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer + return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis]; + } + + void split(cpu_kdtree::cpu_kdnode *cur, cpu_kdtree::cpu_kdnode *left, cpu_kdtree::cpu_kdnode *right) { + std::vector < typename cpu_kdtree::point > &pts = *tmp_points; + current_axis = cur->level % D; // indicate the judicative dimension or axis + std::sort(cur->indices.begin(), cur->indices.end(), sort_points); // using SortPoints as comparison function to sort the data + size_t mid_value = cur->indices[cur->indices.size() / 2]; // odd in the mid_value, even take the floor + cur->split_value = pts[mid_value].dim[current_axis]; // get the parent node + left->parent = cur; // set the parent of the next search nodes to current node + right->parent = cur; + left->level = cur->level + 1; // level + 1 + right->level = cur->level + 1; + left->parent_idx = cur->idx; // set its parent node's index + right->parent_idx = cur->idx; + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one + size_t idx = cur->indices[i]; + if (pts[idx].dim[current_axis] < cur->split_value) + left->indices.push_back(idx); + else + right->indices.push_back(idx); + } + } + + void create(T *h_reference_points, size_t reference_count, size_t max_levels) { + #ifdef __CUDACC__ + if (max_levels > 10) { + std::cout<<"The max_tree_levels should be smaller!"<(bb, h_reference_points, reference_count); + + std::vector < typename cpu_kdtree::point> reference_points(reference_count); // restore the reference points in particular way + for (size_t j = 0; j < reference_count; j++) + for (size_t i = 0; i < D; i++) + reference_points[j].dim[i] = h_reference_points[j * D + i]; // creating a tree on cpu + (*this).cpu_create(reference_points, max_levels); // building a tree on cpu + cpu_kdtree::cpu_kdnode *d_root = (*this).get_root(); + num_nodes = (*this).get_num_nodes(); + npts = reference_count; // also equals to reference_count + + HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode) * num_nodes)); // copy data from host to device + HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * npts)); + HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(cpu_kdtree::point) * npts)); + + 
std::vector < cuda_kdnode > tmp_nodes(num_nodes); + std::vector indices(npts); + std::vector *> next_nodes; + size_t cur_pos = 0; + next_nodes.push_back(d_root); + while (next_nodes.size()) { + std::vector *> next_search_nodes; + while (next_nodes.size()) { + cpu_kdtree::cpu_kdnode *cur = next_nodes.back(); + next_nodes.pop_back(); + int id = cur->idx; // the nodes at same level are independent + tmp_nodes[id].level = cur->level; + tmp_nodes[id].parent = cur->parent_idx; + tmp_nodes[id].left = cur->left_idx; + tmp_nodes[id].right = cur->right_idx; + tmp_nodes[id].split_value = cur->split_value; + tmp_nodes[id].num_index = cur->indices.size(); // number of index + if (cur->indices.size()) { + for (size_t i = 0; i < cur->indices.size(); i++) + indices[cur_pos + i] = cur->indices[i]; + + tmp_nodes[id].index = (int)cur_pos; // beginning index of reference_points that every bottom node has + cur_pos += cur->indices.size(); // store indices continuously for every query_point + } + else { + tmp_nodes[id].index = -1; + } + + if (cur->left) + next_search_nodes.push_back(cur->left); + + if (cur->right) + next_search_nodes.push_back(cur->right); + } + next_nodes = next_search_nodes; + } + HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode) * tmp_nodes.size(), cudaMemcpyHostToDevice)); + HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(cpu_kdtree::point) * reference_count, cudaMemcpyHostToDevice)); + + #else + std::vector < typename cpu_kdtree::point > reference_points(reference_count); // restore the reference points in particular way + for (size_t j = 0; j < reference_count; j++) + for (size_t i = 0; i < D; i++) + reference_points[j].dim[i] = h_reference_points[j * D + i]; + cpu_create(reference_points, max_levels); + cpu_tmp_points = *tmp_points; + + #endif + } + + int get_num_nodes() const { // get the total number of nodes + return n_id; + } + + cpu_kdtree::cpu_kdnode* get_root() const { // get the root node of tree + return root; + } + + T cpu_distance(const cpu_kdtree::point &a, const cpu_kdtree::point &b) { + T distance = 0; + + for (size_t i = 0; i < D; i++) { + T d = a.dim[i] - b.dim[i]; + distance += d*d; + } + return distance; + } + + void cpu_search_at_node(cpu_kdtree::cpu_kdnode *cur, const cpu_kdtree::point &query, size_t *index, T *distance, cpu_kdtree::cpu_kdnode **node) { + T best_distance = FLT_MAX; // initialize the best distance to max of floating point + size_t best_index = 0; + std::vector < typename cpu_kdtree::point > pts = cpu_tmp_points; + while (true) { + size_t split_axis = cur->level % D; + if (cur->left == NULL) { // risky but acceptable, same goes for right because left and right are in same pace + *node = cur; // pointer points to a pointer + for (size_t i = 0; i < cur->indices.size(); i++) { + size_t idx = cur->indices[i]; + T d = cpu_distance(query, pts[idx]); // compute distances + /// if we want to compute k nearest neighbor, we can input the last resul + /// (last_best_dist < dist < best_dist) to select the next point until reaching to k + if (d < best_distance) { + best_distance = d; + best_index = idx; // record the nearest neighbor index + } + } + break; // find the target point then break the loop + } + else if (query.dim[split_axis] < cur->split_value) { // if it has son node, visit the next node on either left side or right side + cur = cur->left; + } + else { + cur = cur->right; + } + } + *index = best_index; + 
*distance = best_distance; + } + + void cpu_search_at_node_range(cpu_kdtree::cpu_kdnode *cur, const cpu_kdtree::point &query, T range, size_t *index, T *distance) { + T best_distance = FLT_MAX; // initialize the best distance to max of floating point + size_t best_index = 0; + std::vector < typename cpu_kdtree::point > pts = cpu_tmp_points; + std::vector < typename cpu_kdtree::cpu_kdnode*> next_node; + next_node.push_back(cur); + while (next_node.size()) { + std::vector*> next_search; + while (next_node.size()) { + cur = next_node.back(); + next_node.pop_back(); + size_t split_axis = cur->level % D; + if (cur->left == NULL) { + for (size_t i = 0; i < cur->indices.size(); i++) { + size_t idx = cur->indices[i]; + T d = cpu_distance(query, pts[idx]); + if (d < best_distance) { + best_distance = d; + best_index = idx; + } + } + } + else { + T d = query.dim[split_axis] - cur->split_value; // computer distance along specific axis or dimension + /// there are three possibilities: on either left or right, and on both left and right + if (fabs(d) > range) { // absolute value of floating point to see if distance will be larger that best_dist + if (d < 0) + next_search.push_back(cur->left); // every left[split_axis] is less and equal to cur->split_value, so it is possible to find the nearest point in this region + else + next_search.push_back(cur->right); + } + else { // it is possible that nereast neighbor will appear on both left and right + next_search.push_back(cur->left); + next_search.push_back(cur->right); + } + } + } + next_node = next_search; // pop out at least one time + } + *index = best_index; + *distance = best_distance; + } + + void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) { + /// first convert the input query point into specific type + cpu_kdtree::point query; + for (size_t j = 0; j < query_count; j++) { + for (size_t i = 0; i < D; i++) + query.dim[i] = h_query_points[j * D + i]; + /// find the nearest node, this will be the upper bound for the next time searching + cpu_kdtree::cpu_kdnode *best_node = NULL; + T best_distance = FLT_MAX; + size_t best_index = 0; + T radius = 0; // radius for range + cpu_search_at_node(root, query, &best_index, &best_distance, &best_node); // simple search to rougly determine a result for next search step + radius = sqrt(best_distance); // It is possible that nearest will appear in another region + /// find other possibilities + cpu_kdtree::cpu_kdnode *cur = best_node; + while (cur->parent != NULL) { // every node that you pass will be possible to be the best node + /// go up + cpu_kdtree::cpu_kdnode *parent = cur->parent; // travel back to every node that we pass through + size_t split_axis = (parent->level) % D; + /// search other nodes + size_t tmp_index; + T tmp_distance = FLT_MAX; + if (fabs(parent->split_value - query.dim[split_axis]) <= radius) { + /// search opposite node + if (parent->left != cur) + cpu_search_at_node_range(parent->left, query, radius, &tmp_index, &tmp_distance); // to see whether it is its mother node's left son node + else + cpu_search_at_node_range(parent->right, query, radius, &tmp_index, &tmp_distance); + } + if (tmp_distance < best_distance) { + best_distance = tmp_distance; + best_index = tmp_index; + } + cur = parent; + } + h_indices[j] = best_index; + h_distances[j] = best_distance; + } + } + + void search(T *h_query_points, size_t query_count, size_t *indices, T *distances) { + #ifdef __CUDACC__ + std::vector < typename cpu_kdtree::point > query_points(query_count); + for 
(size_t j = 0; j < query_count; j++) + for (size_t i = 0; i < D; i++) + query_points[j].dim[i] = h_query_points[j * D + i]; + + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + + size_t query_memory = D * sizeof(T) * query_count; + size_t N = 3 * query_memory / prop.totalGlobalMem; //consider index and distance, roughly 3 times + if (N > 1) { + N++; + size_t stream_count = query_count / N; + for (size_t n = 0; n < N; n++) { + size_t query_stream_start = n * stream_count; + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_stream_start], stream_count, &indices[query_stream_start], &distances[query_stream_start]); + } + size_t stream_remain_count = query_count - N * stream_count; + if (stream_remain_count > 0) { + size_t query_remain_start = N * stream_count; + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_remain_start], stream_remain_count, &indices[query_remain_start], &distances[query_remain_start]); + } + } + else { + unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count); + unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0)); + + cpu_kdtree::point *d_query_points; // create a pointer pointing to query points on gpu + size_t *d_indices; + T *d_distances; + + int *next_nodes; // create two STACK-like array + int *next_search_nodes; + + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory + + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_count * D)); + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_count)); + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_count)); + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_count * D, cudaMemcpyHostToDevice)); + + search_batch<<>> (d_nodes, d_index, d_reference_points, d_query_points, query_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); + + if (Judge == NULL) { // do the following work if the thread works safely + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_count, cudaMemcpyDeviceToHost)); + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_count, cudaMemcpyDeviceToHost)); + } + + HANDLE_ERROR(cudaFree(next_nodes)); + HANDLE_ERROR(cudaFree(next_search_nodes)); + HANDLE_ERROR(cudaFree(d_query_points)); + HANDLE_ERROR(cudaFree(d_indices)); + HANDLE_ERROR(cudaFree(d_distances)); + } + + #else + cpu_search(h_query_points, query_count, indices, distances); + + #endif + + } + + }; //end class kdtree + + template + kdtree* kdtree::cur_tree_ptr = NULL; // definition of cur_tree_ptr pointer points to the current class + +} //end namespace stim +#endif \ No newline at end of file diff --git a/stim/structures/kdtreenew.cuh b/stim/structures/kdtreenew.cuh deleted file mode 100644 index 093c71a..0000000 --- a/stim/structures/kdtreenew.cuh +++ /dev/null @@ -1,627 +0,0 @@ -// right now the size of CUDA STACK is set to 50, increase it if you mean to make deeper tree -// data should be stored in row-major -// x1,x2,x3,x4,x5...... -// y1,y2,y3,y4,y5...... -// .................... -// .................... 
-- 
libgit2 0.21.4
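For reference, a minimal usage sketch of the stim::kdtree class added by this patch. The angle-bracket template arguments and include names do not survive in the listing above, so the <float, D> parameters, the header path, and all data values below are assumptions; the sketch also assumes the HANDLE_ERROR macro used by the header is supplied by the rest of the stim library and that the GPU path is compiled with nvcc. Per the code in the patch, create() builds the tree on the CPU and mirrors it to the GPU, and search() returns the nearest reference-point index and distance for each query point, falling back to cpu_search() when __CUDACC__ is not defined.

#include <vector>
#include "stim/structures/kdtree_dep.cuh"       // path taken from the diff header above

int main() {
    const int D = 3;                                              // dimension; the class defaults to 3
    std::vector<float> ref = { 0,0,0,  1,0,0,  0,1,0,  0,0,1 };   // 4 reference points, D values per point
    std::vector<float> qry = { 0.9f, 0.1f, 0.0f };                // 1 query point

    stim::kdtree<float, D> tree;                  // <float, D> arguments are assumed, not shown in the patch
    tree.create(ref.data(), ref.size() / D, 5);   // max_levels = 5; the GPU path rejects values above 10
    size_t index;                                 // index of the nearest reference point
    float  distance;                              // distance to that point
    tree.search(qry.data(), qry.size() / D, &index, &distance);
    return 0;
}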