Merge branch 'kdtree' of git.stim.ee.uh.edu:codebase/stimlib into kdtree

David Mayerich
2 parents e6d6e994 268037bc
Showing 1 changed file with 257 additions and 258 deletions Show diff stats
stim/structures/kdtree.cuh
-// change CUDA_STACK together with max_tree_levels in trial and error manner
+// right now the size of CUDA STACK is set to 1000, increase it if you mean to make deeper tree
 // data should be stored in row-major
 // x1,x2,x3,x4,x5......
 // y1,y2,y3,y4,y5......
@@ -16,159 +16,150 @@
 #include <float.h>
 #include <iostream>
 #include <algorithm>
-
-/// using API called HADDLE_ERROR
-static void HandleError(cudaError_t err, const char *file, int line) {
-	if (err != cudaSuccess) {
-	std::cout<<cudaGetErrorString(err)<<" in"<< file <<" at line "<<line<<std::endl;
-	}
-}
-#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
-
-#define CUDA_STACK 2												// implementation "stacks" on CUDA as to do store the nodes information
+#include <stim/cuda/cudatools/error.h>
 namespace stim {
 	namespace kdtree {
-		template<typename T, int D>
+		template<typename T, int D>											// typename refers to float or double while D refers to dimension of points
 		struct point {
-			T coords[D];													// if we use size to measure a vector<point>, it will show the number of point structures
+			T dim[D];														// create a structure to store every one input point
 		};
 		template<typename T>
-		class KDNode {
+		class kdnode {
 		public:
-			KDNode() {														// initialization
-				parent = NULL;
+			kdnode() {														// constructor for initializing a kdnode
+				parent = NULL;												// set every node's parent, left and right kdnode pointers to NULL
 				left = NULL;
 				right = NULL;
-				split_value = -1;
-				_parent = -1;
-				_left = -1;
-				_right = -1;
+				parent_idx = -1;											// set parent node index to default -1
+				left_idx = -1;
+				right_idx = -1;
+				split_value = -1;											// set split_value to default -1
 			}
-			int id;															// id for current node
-			size_t level;
-			KDNode *parent, *left, *right;
-			int _parent, _left, _right;										// id for parent node
-			T split_value;													// node value
-			std::vector <size_t> indices;									// indices that indicate the data that current tree has
+			int idx;														// index of current node
+			int parent_idx, left_idx, right_idx;							// index of parent, left and right nodes
+			kdnode *parent, *left, *right;									// parent, left and right kdnodes
+			T split_value;													// splitting value of current node
+			std::vector <size_t> indices;									// it indicates the points' indices that current node has 
+			size_t level;													// tree level of current node
 		};
-	}
+	}				// end of namespace kdtree
-	template <typename T, int D = 3>
+	template <typename T, int D = 3>										// set dimension of data to default 3
 	class cpu_kdtree {
-
 	protected:
-		std::vector <kdtree::point<T, D>> *m_pts;
-		kdtree::KDNode<T> *m_root;												// current node
-		int m_current_axis;
-		size_t m_levels;
-		int m_cmps;														// count how many comparisons are to made in the tree for one query
-		int m_id;														// level + 1
-		static cpu_kdtree<T, D> *myself;
+		int current_axis;													// current judging axis
+		int cmps;															// count how many time of comparisons (just for cpu-kdtree)
+		int n_id;															// store the total number of nodes
+		std::vector <kdtree::point<T, D>> *tmp_points;						// transfer or temp points
+		kdtree::kdnode<T> *root;											// root node
+		static cpu_kdtree<T, D> *cur_tree_ptr;
 	public:
-		cpu_kdtree() {														// initialization
-			myself = this;
-			m_id = 0;														// id = level + 1, level -> axis index while id -> node identifier
+		cpu_kdtree() {														// constructor for creating a cpu_kdtree
+			cur_tree_ptr = this;												// create  a class pointer points to the current class value
+			n_id = 0;														// set total number of points to default 0
 		}
-		~cpu_kdtree() {														// destructor for deleting what was created by kdtree()
-			std::vector <kdtree::KDNode<T>*> next_node;
-			next_node.push_back(m_root);
-			while (next_node.size()) {
-				std::vector <kdtree::KDNode<T>*> next_search;
-				while (next_node.size()) {
-					kdtree::KDNode<T> *cur = next_node.back();
-					next_node.pop_back();
+		~cpu_kdtree() {											  			// destructor of cpu_kdtree
+			std::vector <kdtree::kdnode<T>*> next_nodes;
+			next_nodes.push_back(root);
+			while (next_nodes.size()) {
+				std::vector <kdtree::kdnode<T>*> next_search_nodes;
+				while (next_nodes.size()) {
+					kdtree::kdnode<T> *cur = next_nodes.back();
+					next_nodes.pop_back();
 					if (cur->left)
-						next_search.push_back(cur->left);
+						next_search_nodes.push_back(cur->left);
 					if (cur->right)
-						next_search.push_back(cur->right);
+						next_search_nodes.push_back(cur->right);
 					delete cur;
 				}
-				next_node = next_search;
+				next_nodes = next_search_nodes;
 			}
-			m_root = NULL;
+			root = NULL;
 		}
-		void Create(std::vector <kdtree::point<T, D>> &pts, size_t max_levels) {
-			m_pts = &pts;												// create a pointer point to the input data
-			m_levels = max_levels;										// stores max tree levels
-			m_root = new kdtree::KDNode<T>();									// using KDNode() to initialize an ancestor node
-			m_root->id = m_id++;										// id is 1 while level is 0 at the very beginning
-			m_root->level = 0;											// to begin with level 0
-			m_root->indices.resize(pts.size());							// initialize the size of whole indices
-			for (size_t i = 0; i < pts.size(); i++) {
-				m_root->indices[i] = i;									// like what we did on Keys in GPU-BF part
+		void Create(std::vector <kdtree::point<T, D>> &reference_points, size_t max_levels) {
+			tmp_points = &reference_points;												
+			root = new kdtree::kdnode<T>();									// initializing the root node
+			root->idx = n_id++;												// the index of root is 0
+			root->level = 0;												// tree level begins at 0
+			root->indices.resize(reference_points.size());					// get the number of points
+			for (size_t i = 0; i < reference_points.size(); i++) {
+				root->indices[i] = i;										// set indices of input points
 			}
-			std::vector <kdtree::KDNode<T>*> next_node;							// next node
-			next_node.push_back(m_root);								// new node
-			while (next_node.size()) {
-				std::vector <kdtree::KDNode<T>*> next_search;
-				while (next_node.size()) {								// two same WHILE is because we need to make a new vector for searching
-					kdtree::KDNode<T> *current_node = next_node.back();			// pointer point to current node (right first) 
-					next_node.pop_back();								// pop out current node in order to store next node
-					if (current_node->level < max_levels) {				// max_levels should be reasonably small compared with numbers of data
-						if (current_node->indices.size() > 1) {
-							kdtree::KDNode<T> *left = new kdtree::KDNode<T>();
-							kdtree::KDNode<T> *right = new kdtree::KDNode<T>();
-							left->id = m_id++;							// risky guessing but OK for large amount of data since max_level is small     
-							right->id = m_id++;							// risky guessing but OK for large amount of data since max_level is small
-							Split(current_node, left, right);			// split left and right and determine a node
-							std::vector <size_t> temp;					// empty vecters of int
+			std::vector <kdtree::kdnode<T>*> next_nodes;					// next nodes
+			next_nodes.push_back(root);										// push back the root node
+			while (next_nodes.size()) {
+				std::vector <kdtree::kdnode<T>*> next_search_nodes;			// next search nodes
+				while (next_nodes.size()) {									// two same WHILE is because we need to make a new vector to store nodes for search
+					kdtree::kdnode<T> *current_node = next_nodes.back();	// handle node one by one (right first) 
+					next_nodes.pop_back();									// pop out current node in order to store next round of nodes
+					if (current_node->level < max_levels) {					
+						if (current_node->indices.size() > 1) {				// split if the nonleaf node contains more than one point
+							kdtree::kdnode<T> *left = new kdtree::kdnode<T>();
+							kdtree::kdnode<T> *right = new kdtree::kdnode<T>();
+							left->idx = n_id++;								// set the index of current node's left node
+							right->idx = n_id++;							
+							Split(current_node, left, right);				// split left and right and determine a node
+							std::vector <size_t> temp;						// empty vecters of int
 							//temp.resize(current_node->indices.size());
-							current_node->indices.swap(temp);			// clean up current tree's indices
+							current_node->indices.swap(temp);				// clean up current node's indices
 							current_node->left = left;
 							current_node->right = right;
-							current_node->_left = left->id;				// indicates it has left son node and gets its id
-							current_node->_right = right->id;			// indicates it has right son node and gets its id
-							if (left->indices.size())
-								next_search.push_back(left);			// right first then left according to stack(first in last out), it can be done in parallel for left and right are independent
+							current_node->left_idx = left->idx;				
+							current_node->right_idx = right->idx;					
 							if (right->indices.size())
-								next_search.push_back(right);
+								next_search_nodes.push_back(right);			// left pop out first
+							if (left->indices.size())
+								next_search_nodes.push_back(left);	
 						}
 					}
 				}
-				next_node = next_search;
+				next_nodes = next_search_nodes;								// go deeper within the tree
 			}
 		}
-		static bool SortPoints(const size_t a, const size_t b) {
-			std::vector <kdtree::point<T, D>> &pts = *myself->m_pts;
-			return pts[a].coords[myself->m_current_axis] < pts[b].coords[myself->m_current_axis];
+		static bool SortPoints(const size_t a, const size_t b) {						// create functor for std::sort
+			std::vector <kdtree::point<T, D>> &pts = *cur_tree_ptr->tmp_points;			// put cur_tree_ptr to current input points' pointer
+			return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis];
 		}
-		void Split(kdtree::KDNode<T> *cur, kdtree::KDNode<T> *left, kdtree::KDNode<T> *right) {
-			/// assume both two sides are created and sure it was
-			std::vector <kdtree::point<T, D>> &pts = *m_pts;
-			m_current_axis = cur->level % D;											// indicate the judicative dimension or axis
+		void Split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) {
+			std::vector <kdtree::point<T, D>> &pts = *tmp_points;
+			current_axis = cur->level % D;												// indicate the judicative dimension or axis
 			std::sort(cur->indices.begin(), cur->indices.end(), SortPoints);			// using SortPoints as comparison function to sort the data
-			size_t mid = cur->indices[cur->indices.size() / 2];                         // odd in the mid, even take the floor
-			cur->split_value = pts[mid].coords[m_current_axis];                         // get the mother node
-			left->parent = cur;                                                         // set the parent to current node for the next nodes
+			size_t mid_value = cur->indices[cur->indices.size() / 2];                   // odd in the mid_value, even take the floor
+			cur->split_value = pts[mid_value].dim[current_axis];						// get the parent node
+			left->parent = cur;                                                         // set the parent of the next search nodes to current node
 			right->parent = cur;
-			left->level = cur->level + 1;
+			left->level = cur->level + 1;												// level + 1
 			right->level = cur->level + 1;
-			left->_parent = cur->id;                                                    // indicates it has mother node and gets its id
-			right->_parent = cur->id;                                                   // indicates it has mother node and gets its id
-			for (size_t i = 0; i < cur->indices.size(); i++) {							// split into left and right area one by one
+			left->parent_idx = cur->idx;                                                // set its parent node's index
+			right->parent_idx = cur->idx;                                            
+			for (size_t i = 0; i < cur->indices.size(); i++) {							// split into left and right half-space one by one
 				size_t idx = cur->indices[i];
-				if (pts[idx].coords[m_current_axis] < cur->split_value)
+				if (pts[idx].dim[current_axis] < cur->split_value)
 					left->indices.push_back(idx);
 				else
 					right->indices.push_back(idx);
 			}
 		}
-		int GetNumNodes() const { return m_id; }
-		kdtree::KDNode<T>* GetRoot() const { return m_root; }
+		int GetNumNodes() const {														// get the total number of nodes
+			return n_id; 
+		}
+		kdtree::kdnode<T>* GetRoot() const {											// get the root node of tree
+			return root; 
+		}
 	};				//end class kdtree
 	template <typename T, int D>
-	cpu_kdtree<T, D>* cpu_kdtree<T, D>::myself = NULL;												// definition of myself pointer points to the specific class
+	cpu_kdtree<T, D>* cpu_kdtree<T, D>::cur_tree_ptr = NULL;							// definition of cur_tree_ptr pointer points to the current class
 	template <typename T>
-	struct CUDA_KDNode {
-		size_t level;
-		int parent, left, right;														// indicates id of
+	struct cuda_kdnode {
+		int parent, left, right;														
 		T split_value;
-		size_t num_indices;																// number of indices it has
-		int indices;																	// the beginning
+		size_t num_index;																// number of indices it has
+		int index;																		// the beginning index
+		size_t level;
 	};
 	template <typename T, int D>
@@ -176,239 +167,247 @@ namespace stim {
 		T dist = 0;
 		for (size_t i = 0; i < D; i++) {
-			T d = a.coords[i] - b.coords[i];
+			T d = a.dim[i] - b.dim[i];
 			dist += d*d;
 		}
 		return dist;
 	}
 	template <typename T, int D>
-	__device__ void SearchAtNode(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, int cur, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist, int *ret_node) {
-		/// finds the first possibility
-		size_t best_idx = 0;
-		T best_dist = FLT_MAX;
+	__device__ void SearchAtNode(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
+		T best_distance = FLT_MAX;
+		size_t best_index = 0;
-		while (true) {
+		while (true) {																	// break until reach the bottom
 			int split_axis = nodes[cur].level % D;
-			if (nodes[cur].left == -1) {												// if it doesn't have left son node 
-				*ret_node = cur;
-				for (int i = 0; i < nodes[cur].num_indices; i++) {
-					size_t idx = indices[nodes[cur].indices + i];
-					T dist = Distance<T, D>(Query, pts[idx]);
-					if (dist < best_dist) {
-						best_dist = dist;
-						best_idx = idx;
+			if (nodes[cur].left == -1) {												// check whether it has left node or not
+				*d_node = cur;
+				for (int i = 0; i < nodes[cur].num_index; i++) {
+					size_t idx = indices[nodes[cur].index + i];
+					T dist = Distance<T, D>(d_query_point, d_reference_points[idx]);
+					if (dist < best_distance) {
+						best_distance = dist;
+						best_index = idx;
 					}
 				}
-				break;
+			break;
 			}
-			else if (Query.coords[split_axis] < nodes[cur].split_value) {
+			else if (d_query_point.dim[split_axis] < nodes[cur].split_value) {			// jump into specific son node
 				cur = nodes[cur].left;
 			}
 			else {
 				cur = nodes[cur].right;
 			}
 		}
-		*ret_index = best_idx;
-		*ret_dist = best_dist;
+		*d_distance = best_distance;
+		*d_index = best_index;
 	}
 	template <typename T, int D>
-	__device__ void SearchAtNodeRange(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, int cur, T range, size_t *ret_index, T *ret_dist) {
-		/// search throught all the nodes that are within one range
-		size_t best_idx = 0;
-		T best_dist = FLT_MAX;
-		/// using fixed stack, increase it when need
-		int next_node[CUDA_STACK];														// should be larger than 1
-		int next_node_pos = 0;															// initialize pop out order index
-		next_node[next_node_pos++] = cur;												// equals to next_node[next_node_pos] = cur, next_node_pos++
-
-		while (next_node_pos) {
-			int next_search[CUDA_STACK];												// for store next nodes
-			int next_search_pos = 0;													// record push back order index
-			while (next_node_pos) {
-				cur = next_node[next_node_pos - 1];										// pop out the last in one and keep poping out
-				next_node_pos--;
+	__device__ void SearchAtNodeRange(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
+		T best_distance = FLT_MAX;
+		size_t best_index = 0;
+
+		int next_nodes_pos = 0;															// initialize pop out order index
+		next_nodes[id * 1000 + next_nodes_pos] = cur;									// find data that belongs to the very specific thread
+		next_nodes_pos++;
+
+		while (next_nodes_pos) {
+			int next_search_nodes_pos = 0;												// record push back order index
+			while (next_nodes_pos) {
+				cur = next_nodes[id * 1000 + next_nodes_pos - 1];						// pop out the last push in one and keep poping out
+				next_nodes_pos--;
 				int split_axis = nodes[cur].level % D;
 				if (nodes[cur].left == -1) {
-					for (int i = 0; i < nodes[cur].num_indices; i++) {
-						int idx = indices[nodes[cur].indices + i];						// all indices are stored in one array, pick up from every node's beginning index
-						T d = Distance<T>(Query, pts[idx]);
-						if (d < best_dist) {
-							best_dist = d;
-							best_idx = idx;
+					for (int i = 0; i < nodes[cur].num_index; i++) {
+						int idx = indices[nodes[cur].index + i];						// all indices are stored in one array, pick up from every node's beginning index
+						T d = Distance<T>(d_query_point, d_reference_points[idx]);
+						if (d < best_distance) {
+							best_distance = d;
+							best_index = idx;
 						}
 					}
 				}
 				else {
-					T d = Query.coords[split_axis] - nodes[cur].split_value;
+					T d = d_query_point.dim[split_axis] - nodes[cur].split_value;
 					if (fabs(d) > range) {
-						if (d < 0)
-							next_search[next_search_pos++] = nodes[cur].left;
-						else
-							next_search[next_search_pos++] = nodes[cur].right;
+						if (d < 0) {
+							next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left;
+							next_search_nodes_pos++;
+						}
+						else {
+							next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right;
+							next_search_nodes_pos++;
+						}
 					}
 					else {
-						next_search[next_search_pos++] = nodes[cur].left;
-						next_search[next_search_pos++] = nodes[cur].right;
+						next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].right;
+						next_search_nodes_pos++;
+						next_search_nodes[id * 1000 + next_search_nodes_pos] = nodes[cur].left;
+						next_search_nodes_pos++;
+						if (next_search_nodes_pos > 1000) {
+							printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id);
+							(*Judge)++;
+						}
 					}
 				}
 			}
-
-			for (int i = 0; i < next_search_pos; i++)
-				next_node[i] = next_search[i];
-			next_node_pos = next_search_pos;											// operation that really resemble STACK, namely first in last out
+			for (int i = 0; i < next_search_nodes_pos; i++)
+				next_nodes[id * 1000 + i] = next_search_nodes[id * 1000 + i];
+			next_nodes_pos = next_search_nodes_pos;										
 		}
-		*ret_index = best_idx;
-		*ret_dist = best_dist;
+		*d_distance = best_distance;
+		*d_index = best_index;
 	}
 	template <typename T, int D>
-	__device__ void Search(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist) {
-		/// find first nearest node
+	__device__ void Search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
 		int best_node = 0;
-		size_t best_idx = 0;
-		T best_dist = FLT_MAX;
+		T best_distance = FLT_MAX;
+		size_t best_index = 0;
 		T radius = 0;
-		SearchAtNode<T, D>(nodes, indices, pts, 0, Query, &best_idx, &best_dist, &best_node);
-		radius = sqrt(best_dist);
-		/// find other possibilities
+
+		SearchAtNode<T, D>(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node);
+		radius = sqrt(best_distance);																															// get range
 		int cur = best_node;
 		while (nodes[cur].parent != -1) {
-			/// go up
 			int parent = nodes[cur].parent;
 			int split_axis = nodes[parent].level % D;
-			/// search other node
+
 			T tmp_dist = FLT_MAX;
 			size_t tmp_idx;
-			if (fabs(nodes[parent].split_value - Query.coords[split_axis]) <= radius) {
-				/// search opposite node
+			if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) {
 				if (nodes[parent].left != cur)
-					SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].left, radius, &tmp_idx, &tmp_dist);
+					SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
 				else
-					SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].right, radius, &tmp_idx, &tmp_dist);
+					SearchAtNodeRange(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
 			}
-			if (tmp_dist < best_dist) {
-				best_dist = tmp_dist;
-				best_idx = tmp_idx;
+			if (tmp_dist < best_distance) {
+				best_distance = tmp_dist;
+				best_index = tmp_idx;
 			}
 			cur = parent;
 		}
-		*ret_index = best_idx;
-		*ret_dist = sqrt(best_dist);
+		*d_distance = sqrt(best_distance);
+		*d_index = best_index;
 	}
 	template <typename T, int D>
-	__global__ void SearchBatch(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, size_t num_pts, kdtree::point<T, D> *Query, size_t num_Query, size_t *ret_index, T *ret_dist) {
-		int idx = blockIdx.x * blockDim.x + threadIdx.x;
-		if (idx >= num_Query) return;
+	__global__ void SearchBatch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) {
+		size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+		if (idx >= d_query_count) return;																														 // avoid segfault
-		Search<T, D>(nodes, indices, pts, Query[idx], &ret_index[idx], &ret_dist[idx]);        // every query points are independent
+		Search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge);    // every query points are independent
 	}
 	template <typename T, int D = 3>
 	class cuda_kdtree {
 	protected:
-		CUDA_KDNode<T> *m_gpu_nodes;                                                     // store nodes
-		size_t *m_gpu_indices;
-		kdtree::point<T, D>* m_gpu_points;
-		size_t m_num_points;
+		cuda_kdnode<T> *d_nodes;                                                    																		 
+		size_t *d_index;
+		kdtree::point<T, D>* d_reference_points;
+		size_t d_reference_count;
 	public:
 		~cuda_kdtree() {
-			HANDLE_ERROR(cudaFree(m_gpu_nodes));
-			HANDLE_ERROR(cudaFree(m_gpu_indices));
-			HANDLE_ERROR(cudaFree(m_gpu_points));
+			HANDLE_ERROR(cudaFree(d_nodes));
+			HANDLE_ERROR(cudaFree(d_index));
+			HANDLE_ERROR(cudaFree(d_reference_points));
 		}
-		void CreateKDTree(T *ReferencePoints, size_t ReferenceCount, size_t ColCount, size_t max_levels) {
-			std::vector < kdtree::point<T, D> > pts(ReferenceCount);															// create specific struct of reference data
-			for (size_t j = 0; j < ReferenceCount; j++)
-				for (size_t i = 0; i < ColCount; i++)
-					pts[j].coords[i] = ReferencePoints[j * ColCount + i];
-			cpu_kdtree<T, D> tree;																						// initialize a tree
-			tree.Create(pts, max_levels);																		// create KD-Tree on host
-			kdtree::KDNode<T> *root = tree.GetRoot();
+		void CreateKDTree(T *h_reference_points, size_t reference_count, size_t dim_count, size_t max_levels) {
+			if (max_levels > 10) {
+				std::cout<<"The max_tree_levels should be smaller!"<<std::endl;
+				exit(1);
+			}
+			std::vector <kdtree::point<T, D>> reference_points(reference_count);																				// restore the reference points in particular way
+			for (size_t j = 0; j < reference_count; j++)
+				for (size_t i = 0; i < dim_count; i++)
+					reference_points[j].dim[i] = h_reference_points[j * dim_count + i];
+			cpu_kdtree<T, D> tree;																																// creating a tree on cpu
+			tree.Create(reference_points, max_levels);																											// building a tree on cpu
+			kdtree::kdnode<T> *d_root = tree.GetRoot();
 			int num_nodes = tree.GetNumNodes();
-			/// create the same on CPU
-			m_num_points = pts.size();																			// number of points for creating tree = reference_count in the case
+			d_reference_count = reference_points.size();																										// also equals to reference_count
-			HANDLE_ERROR(cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode<T>) * num_nodes));					// private variables for kdtree
-			HANDLE_ERROR(cudaMalloc((void**)&m_gpu_indices, sizeof(size_t) * m_num_points));
-			HANDLE_ERROR(cudaMalloc((void**)&m_gpu_points, sizeof(kdtree::point<T, D>) * m_num_points));
-
-			std::vector <CUDA_KDNode<T>> cpu_nodes(num_nodes);													// from left to right, id of nodes
-			std::vector <size_t> indices(m_num_points);
-			std::vector < kdtree::KDNode<T>* > next_node;
+			HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes));																		// copy data from host to device
+			HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * d_reference_count));
+			HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(kdtree::point<T, D>) * d_reference_count));
+			std::vector <cuda_kdnode<T>> tmp_nodes(num_nodes);																									
+			std::vector <size_t> indices(d_reference_count);
+			std::vector <kdtree::kdnode<T>*> next_nodes;
 			size_t cur_pos = 0;
-
-			next_node.push_back(root);
-
-			while (next_node.size()) {
-				std::vector <typename kdtree::KDNode<T>* > next_search;
-
-				while (next_node.size()) {
-					kdtree::KDNode<T> *cur = next_node.back();
-					next_node.pop_back();
-
-					int id = cur->id;																			// the nodes at same level are independent
-
-					cpu_nodes[id].level = cur->level;
-					cpu_nodes[id].parent = cur->_parent;
-					cpu_nodes[id].left = cur->_left;
-					cpu_nodes[id].right = cur->_right;
-					cpu_nodes[id].split_value = cur->split_value;
-					cpu_nodes[id].num_indices = cur->indices.size();											// number of index
-
+			next_nodes.push_back(d_root);
+			while (next_nodes.size()) {
+				std::vector <typename kdtree::kdnode<T>*> next_search_nodes;
+				while (next_nodes.size()) {
+					kdtree::kdnode<T> *cur = next_nodes.back();
+					next_nodes.pop_back();
+					int id = cur->idx;																															// the nodes at same level are independent
+					tmp_nodes[id].level = cur->level;
+					tmp_nodes[id].parent = cur->parent_idx;
+					tmp_nodes[id].left = cur->left_idx;
+					tmp_nodes[id].right = cur->right_idx;
+					tmp_nodes[id].split_value = cur->split_value;
+					tmp_nodes[id].num_index = cur->indices.size();																								// number of index
 					if (cur->indices.size()) {
 						for (size_t i = 0; i < cur->indices.size(); i++)
 							indices[cur_pos + i] = cur->indices[i];
-						cpu_nodes[id].indices = (int)cur_pos;													// beginning index that every bottom node has
-						cur_pos += cur->indices.size();															// store indices continuously
+						tmp_nodes[id].index = (int)cur_pos;																										// beginning index of reference_points that every bottom node has
+						cur_pos += cur->indices.size();																											// store indices continuously for every query_point
 					}
 					else {
-						cpu_nodes[id].indices = -1;
+						tmp_nodes[id].index = -1;
 					}
 					if (cur->left)
-						next_search.push_back(cur->left);
+						next_search_nodes.push_back(cur->left);
 					if (cur->right)
-						next_search.push_back(cur->right);
+						next_search_nodes.push_back(cur->right);
 				}
-				next_node = next_search;
+				next_nodes = next_search_nodes;
 			}
-
-			HANDLE_ERROR(cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode<T>) * cpu_nodes.size(), cudaMemcpyHostToDevice));
-			HANDLE_ERROR(cudaMemcpy(m_gpu_indices, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
-			HANDLE_ERROR(cudaMemcpy(m_gpu_points, &pts[0], sizeof(kdtree::point<T, D>) * pts.size(), cudaMemcpyHostToDevice));
+			HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice));
+			HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
+			HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_points.size(), cudaMemcpyHostToDevice));
 		}
-		void Search(T *QueryPoints, size_t QueryCount, size_t ColCount, T *dists, size_t *indices) {
-			std::vector < kdtree::point<T, D> > query(QueryCount);
-			for (size_t j = 0; j < QueryCount; j++)
-				for (size_t i = 0; i < ColCount; i++)
-					query[j].coords[i] = QueryPoints[j * ColCount + i];
-
-			unsigned int threads = (unsigned int)(query.size() > 1024 ? 1024 : query.size());
-			//unsigned int blocks = (unsigned int)ceil(query.size() / threads);
-			unsigned int blocks = (unsigned int)(query.size() / threads + (query.size() % threads ? 1 : 0));
-
-			kdtree::point<T, D> *gpu_Query;
-			size_t *gpu_ret_indices;
-			T *gpu_ret_dist;
-
-			HANDLE_ERROR(cudaMalloc((void**)&gpu_Query, sizeof(T) * query.size() * D));
-			HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_indices, sizeof(size_t) * query.size()));
-			HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_dist, sizeof(T) * query.size()));
-			HANDLE_ERROR(cudaMemcpy(gpu_Query, &query[0], sizeof(T) * query.size() * D, cudaMemcpyHostToDevice));
-
-			SearchBatch << <threads, blocks >> > (m_gpu_nodes, m_gpu_indices, m_gpu_points, m_num_points, gpu_Query, query.size(), gpu_ret_indices, gpu_ret_dist);
-
-			HANDLE_ERROR(cudaMemcpy(indices, gpu_ret_indices, sizeof(size_t) * query.size(), cudaMemcpyDeviceToHost));
-			HANDLE_ERROR(cudaMemcpy(dists, gpu_ret_dist, sizeof(T) * query.size(), cudaMemcpyDeviceToHost));
+		void Search(T *h_query_points, size_t query_count, size_t dim_count, T *dists, size_t *indices) {
+			std::vector <kdtree::point<T, D>> query_points(query_count);
+			for (size_t j = 0; j < query_count; j++)
+				for (size_t i = 0; i < dim_count; i++)
+					query_points[j].dim[i] = h_query_points[j * dim_count + i];
+
+			unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size());
+			unsigned int blocks = (unsigned int)(query_points.size() / threads + (query_points.size() % threads ? 1 : 0));
+
+			kdtree::point<T, D> *d_query_points;																												// create a pointer pointing to query points on gpu
+			size_t *d_indices;
+			T *d_distances;
+
+			int *next_nodes;																																	// create two STACK-like array
+			int *next_search_nodes;
+
+			int *Judge = NULL;																																	// judge variable to see whether one thread is overwrite another thread's memory																						
+		
+			HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_points.size() * D));
+			HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size()));
+			HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size()));
+			HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * 1000 * sizeof(int)));																// STACK size right now is 1000, you can change it if you mean to
+			HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * 1000 * sizeof(int)));	
+			HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_points.size() * D, cudaMemcpyHostToDevice));
+
+			SearchBatch<<<threads, blocks>>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge);
+
+			if (Judge == NULL) {																																// do the following work if the thread works safely
+				HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost));
+				HANDLE_ERROR(cudaMemcpy(dists, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost));
+			}
-			HANDLE_ERROR(cudaFree(gpu_Query));
-			HANDLE_ERROR(cudaFree(gpu_ret_indices));
-			HANDLE_ERROR(cudaFree(gpu_ret_dist));
+			HANDLE_ERROR(cudaFree(next_nodes));
+			HANDLE_ERROR(cudaFree(next_search_nodes));
+			HANDLE_ERROR(cudaFree(d_query_points));
+			HANDLE_ERROR(cudaFree(d_indices));
+			HANDLE_ERROR(cudaFree(d_distances));
 		}
 	};
 }				//end namespace stim