diff --git a/stim/structures/kdtree.cuh b/stim/structures/kdtree.cuh
new file mode 100644
index 0000000..63d530f
--- /dev/null
+++ b/stim/structures/kdtree.cuh
@@ -0,0 +1,415 @@
+// change CUDA_STACK together with max_tree_levels by trial and error
+// data should be stored in row-major order:
+// x1,x2,x3,x4,x5......
+// y1,y2,y3,y4,y5......
+// ....................
+// ....................
+
+#ifndef KDTREE_H
+#define KDTREE_H
+
+#include "device_launch_parameters.h"
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include "cuda_runtime.h"
+#include <vector>
+#include <float.h>
+#include <iostream>
+#include <algorithm>
+
+/// error-checking wrapper, used through the HANDLE_ERROR macro
+static void HandleError(cudaError_t err, const char *file, int line) {
+	if (err != cudaSuccess) {
+		std::cout << cudaGetErrorString(err) << " in " << file << " at line " << line << std::endl;
+	}
+}
+#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
+
+#define CUDA_STACK 50								// fixed per-thread stack size for the range search; tune together with max_tree_levels
+
+namespace stim {
+	namespace kdtree {
+		template <typename T, int D>
+		struct point {
+			T coords[D];							// if we use size() on a vector of points, it returns the number of point structures
+		};
+
+		template <typename T>
+		class KDNode {
+		public:
+			KDNode() {								// initialization
+				parent = NULL;
+				left = NULL;
+				right = NULL;
+				split_value = -1;
+				_parent = -1;
+				_left = -1;
+				_right = -1;
+			}
+			int id;									// id of the current node
+			size_t level;
+			KDNode *parent, *left, *right;
+			int _parent, _left, _right;				// ids of the parent and child nodes (-1 if absent)
+			T split_value;							// splitting value of the node
+			std::vector<size_t> indices;			// indices of the data points held by the current node
+		};
+	}
+
+	template <typename T, int D>
+	class cpu_kdtree {
+
+	protected:
+		std::vector<kdtree::point<T, D> > *m_pts;
+		kdtree::KDNode<T> *m_root;					// root node
+		int m_current_axis;
+		size_t m_levels;
+		int m_cmps;									// counts how many comparisons are made in the tree for one query
+		int m_id;									// running node id counter
+		static cpu_kdtree<T, D> *myself;
+	public:
+		cpu_kdtree() {								// initialization
+			myself = this;
+			m_id = 0;								// level is the axis index, id is the node identifier
+			m_root = NULL;							// so the destructor is safe if Create() is never called
+		}
+		~cpu_kdtree() {								// destructor: delete every node created by Create()
+			if (m_root == NULL) return;
+			std::vector<kdtree::KDNode<T>*> next_node;
+			next_node.push_back(m_root);
+			while (next_node.size()) {
+				std::vector<kdtree::KDNode<T>*> next_search;
+				while (next_node.size()) {
+					kdtree::KDNode<T> *cur = next_node.back();
+					next_node.pop_back();
+					if (cur->left)
+						next_search.push_back(cur->left);
+					if (cur->right)
+						next_search.push_back(cur->right);
+					delete cur;
+				}
+				next_node = next_search;
+			}
+			m_root = NULL;
+		}
+		void Create(std::vector<kdtree::point<T, D> > &pts, size_t max_levels) {
+			m_pts = &pts;															// pointer to the input data
+			m_levels = max_levels;													// store the maximum number of tree levels
+			m_root = new kdtree::KDNode<T>();										// allocate the root node
+			m_root->id = m_id++;													// the root gets id 0 and level 0
+			m_root->level = 0;
+			m_root->indices.resize(pts.size());										// the root owns every point index
+			for (size_t i = 0; i < pts.size(); i++) {
+				m_root->indices[i] = i;												// like what we did with the Keys in the GPU-BF part
+			}
+			std::vector<kdtree::KDNode<T>*> next_node;								// nodes of the current level
+			next_node.push_back(m_root);
+			while (next_node.size()) {
+				std::vector<kdtree::KDNode<T>*> next_search;						// two nested WHILE loops because the next level needs its own vector
+				while (next_node.size()) {
+					kdtree::KDNode<T> *current_node = next_node.back();			// pointer to the current node (right child is popped first)
+					next_node.pop_back();											// pop the current node to make room for the next one
+					if (current_node->level < max_levels) {							// max_levels should be small compared with the number of points
+						if (current_node->indices.size() > 1) {
+							kdtree::KDNode<T> *left = new kdtree::KDNode<T>();
+							kdtree::KDNode<T> *right = new kdtree::KDNode<T>();
+							left->id = m_id++;										// ids are assigned in traversal order; fine for large data sets since max_levels is small
+							right->id = m_id++;
+							Split(current_node, left, right);						// split the data into left and right and set the current node's split value
+							std::vector<size_t> temp;								// empty vector of indices
+							//temp.resize(current_node->indices.size());
+							current_node->indices.swap(temp);						// release the current node's indices
+							current_node->left = left;
+							current_node->right = right;
+							current_node->_left = left->id;							// record that it has a left child and store its id
+							current_node->_right = right->id;						// record that it has a right child and store its id
+							if (left->indices.size())
+								next_search.push_back(left);						// the stack pops right first (last in, first out); left and right are independent and could be processed in parallel
+							if (right->indices.size())
+								next_search.push_back(right);
+						}
+					}
+				}
+				next_node = next_search;
+			}
+		}
+		static bool SortPoints(const size_t a, const size_t b) {
+			std::vector<kdtree::point<T, D> > &pts = *myself->m_pts;
+			return pts[a].coords[myself->m_current_axis] < pts[b].coords[myself->m_current_axis];
+		}
+		void Split(kdtree::KDNode<T> *cur, kdtree::KDNode<T> *left, kdtree::KDNode<T> *right) {
+			/// assumes both child nodes have already been created
+			std::vector<kdtree::point<T, D> > &pts = *m_pts;
+			m_current_axis = cur->level % D;										// the splitting axis for this level
+			std::sort(cur->indices.begin(), cur->indices.end(), SortPoints);		// sort the indices along the current axis, using SortPoints as the comparison function
+			size_t mid = cur->indices[cur->indices.size() / 2];						// index of the median point (upper median for an even count)
+			cur->split_value = pts[mid].coords[m_current_axis];						// the median coordinate becomes the split value
+			left->parent = cur;														// link both children back to the current node
+			right->parent = cur;
+			left->level = cur->level + 1;
+			right->level = cur->level + 1;
+			left->_parent = cur->id;												// record the parent's id
+			right->_parent = cur->id;
+			for (size_t i = 0; i < cur->indices.size(); i++) {						// distribute the points into the left and right children one by one
+				size_t idx = cur->indices[i];
+				if (pts[idx].coords[m_current_axis] < cur->split_value)
+					left->indices.push_back(idx);
+				else
+					right->indices.push_back(idx);
+			}
+		}
+		int GetNumNodes() const { return m_id; }
+		kdtree::KDNode<T>* GetRoot() const { return m_root; }
+	};	//end class cpu_kdtree
+
+	template <typename T, int D>
+	cpu_kdtree<T, D>* cpu_kdtree<T, D>::myself = NULL;		// definition of the static self-pointer used by SortPoints
+
+	template <typename T>
+	struct CUDA_KDNode {
+		size_t level;
+		int parent, left, right;							// ids of the parent and child nodes (-1 if absent)
+		T split_value;
+		size_t num_indices;									// number of indices this node holds
+		int indices;										// offset of this node's first index in the flat index array (-1 if none)
+	};
+
+	template <typename T, int D>
+	__device__ T Distance(kdtree::point<T, D> &a, kdtree::point<T, D> &b) {
+		T dist = 0;											// squared Euclidean distance
+
+		for (size_t i = 0; i < D; i++) {
+			T d = a.coords[i] - b.coords[i];
+			dist += d * d;
+		}
+		return dist;
+	}
+	template <typename T, int D>
+	__device__ void SearchAtNode(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, int cur, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist, int *ret_node) {
+		/// find the first candidate by descending to a leaf
+		size_t best_idx = 0;
+		T best_dist = FLT_MAX;
+
+		while (true) {
+			int split_axis = nodes[cur].level % D;
+			if (nodes[cur].left == -1) {					// leaf node: scan its points
+				*ret_node = cur;
+				for (size_t i = 0; i < nodes[cur].num_indices; i++) {
+					size_t idx = indices[nodes[cur].indices + i];
+					T dist = Distance(Query, pts[idx]);
+					if (dist < best_dist) {
+						best_dist = dist;
+						best_idx = idx;
+					}
+				}
+				break;
+			}
+			else if (Query.coords[split_axis] < nodes[cur].split_value) {
+				cur = nodes[cur].left;
+			}
+			else {
+				cur = nodes[cur].right;
+			}
+		}
+		*ret_index = best_idx;
+		*ret_dist = best_dist;
+	}
+	template <typename T, int D>
+	__device__ void SearchAtNodeRange(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, int cur, T range, size_t *ret_index, T *ret_dist) {
+		/// search through all the nodes that lie within the given range
+		size_t best_idx = 0;
+		T best_dist = FLT_MAX;
+		/// fixed-size stack; increase CUDA_STACK when needed
+		int next_node[CUDA_STACK];							// must be larger than 1
+		int next_node_pos = 0;								// stack pointer
+		next_node[next_node_pos++] = cur;					// push the starting node (same as next_node[next_node_pos] = cur; next_node_pos++)
+
+		while (next_node_pos) {
+			int next_search[CUDA_STACK];					// stores the nodes to visit in the next round
+			int next_search_pos = 0;						// push index for the next round
+			while (next_node_pos) {
+				cur = next_node[next_node_pos - 1];			// keep popping the most recently pushed node
+				next_node_pos--;
+				int split_axis = nodes[cur].level % D;
+
+				if (nodes[cur].left == -1) {				// leaf node: scan its points
+					for (size_t i = 0; i < nodes[cur].num_indices; i++) {
+						size_t idx = indices[nodes[cur].indices + i];		// all indices live in one flat array; start from this node's offset
+						T d = Distance(Query, pts[idx]);
+						if (d < best_dist) {
+							best_dist = d;
+							best_idx = idx;
+						}
+					}
+				}
+				else {
+					T d = Query.coords[split_axis] - nodes[cur].split_value;
+
+					if (fabs(d) > range) {					// the split plane is farther than range: only one side can contain a closer point
+						if (d < 0)
+							next_search[next_search_pos++] = nodes[cur].left;
+						else
+							next_search[next_search_pos++] = nodes[cur].right;
+					}
+					else {									// the split plane is within range: both sides must be visited
+						next_search[next_search_pos++] = nodes[cur].left;
+						next_search[next_search_pos++] = nodes[cur].right;
+					}
+				}
+			}
+
+			for (int i = 0; i < next_search_pos; i++)
+				next_node[i] = next_search[i];
+			next_node_pos = next_search_pos;				// swap in the next round, stack-style (last in, first out)
+		}
+		*ret_index = best_idx;
+		*ret_dist = best_dist;
+	}
+	template <typename T, int D>
+	__device__ void Search(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, kdtree::point<T, D> &Query, size_t *ret_index, T *ret_dist) {
+		/// find the first nearest leaf
+		int best_node = 0;
+		size_t best_idx = 0;
+		T best_dist = FLT_MAX;
+		T radius = 0;
+		SearchAtNode(nodes, indices, pts, 0, Query, &best_idx, &best_dist, &best_node);
+		radius = sqrt(best_dist);
+		/// check the other possibilities by walking back up the tree
+		int cur = best_node;
+
+		while (nodes[cur].parent != -1) {
+			/// go up one level
+			int parent = nodes[cur].parent;
+			int split_axis = nodes[parent].level % D;
+			/// search the sibling subtree if the split plane lies within the current radius
+			T tmp_dist = FLT_MAX;
+			size_t tmp_idx = 0;
+			if (fabs(nodes[parent].split_value - Query.coords[split_axis]) <= radius) {
+				/// search the opposite subtree
+				if (nodes[parent].left != cur)
+					SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].left, radius, &tmp_idx, &tmp_dist);
+				else
+					SearchAtNodeRange(nodes, indices, pts, Query, nodes[parent].right, radius, &tmp_idx, &tmp_dist);
+			}
+			if (tmp_dist < best_dist) {
+				best_dist = tmp_dist;
+				best_idx = tmp_idx;
+			}
+			cur = parent;
+		}
+		*ret_index = best_idx;
+		*ret_dist = sqrt(best_dist);
+	}
+	template <typename T, int D>
+	__global__ void SearchBatch(CUDA_KDNode<T> *nodes, size_t *indices, kdtree::point<T, D> *pts, size_t num_pts, kdtree::point<T, D> *Query, size_t num_Query, size_t *ret_index, T *ret_dist) {
+		size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+		if (idx >= num_Query) return;
+
+		Search(nodes, indices, pts, Query[idx], &ret_index[idx], &ret_dist[idx]);	// every query point is independent
+	}
+
+	template <typename T, int D>
+	class cuda_kdtree {
+	protected:
+		CUDA_KDNode<T> *m_gpu_nodes;						// nodes stored on the GPU
+		size_t *m_gpu_indices;
+		kdtree::point<T, D>* m_gpu_points;
+		size_t m_num_points;
+	public:
+		~cuda_kdtree() {
+			HANDLE_ERROR(cudaFree(m_gpu_nodes));
+			HANDLE_ERROR(cudaFree(m_gpu_indices));
+			HANDLE_ERROR(cudaFree(m_gpu_points));
+		}
+		void CreateKDTree(T *ReferencePoints, size_t ReferenceCount, size_t ColCount, size_t max_levels) {
+			std::vector<kdtree::point<T, D> > pts(ReferenceCount);					// convert the raw reference data into point structures
+			for (size_t j = 0; j < ReferenceCount; j++)
+				for (size_t i = 0; i < ColCount; i++)
+					pts[j].coords[i] = ReferencePoints[j * ColCount + i];
+			cpu_kdtree<T, D> tree;													// initialize a tree
+			tree.Create(pts, max_levels);											// create the KD-tree on the host
+			kdtree::KDNode<T> *root = tree.GetRoot();
+			int num_nodes = tree.GetNumNodes();
+			/// flatten the host tree and copy the same structure to the GPU
+			m_num_points = pts.size();												// number of points used to build the tree (= ReferenceCount here)
+
+			HANDLE_ERROR(cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode<T>) * num_nodes));		// persistent buffers owned by this kd-tree object
+			HANDLE_ERROR(cudaMalloc((void**)&m_gpu_indices, sizeof(size_t) * m_num_points));
+			HANDLE_ERROR(cudaMalloc((void**)&m_gpu_points, sizeof(kdtree::point<T, D>) * m_num_points));
+
+			std::vector<CUDA_KDNode<T> > cpu_nodes(num_nodes);						// flattened nodes, addressed by node id from left to right
+			std::vector<size_t> indices(m_num_points);
+			std::vector<kdtree::KDNode<T>*> next_node;
+
+			size_t cur_pos = 0;
+
+			next_node.push_back(root);
+
+			while (next_node.size()) {
+				std::vector<kdtree::KDNode<T>*> next_search;
+
+				while (next_node.size()) {
+					kdtree::KDNode<T> *cur = next_node.back();
+					next_node.pop_back();
+
+					int id = cur->id;												// nodes at the same level are independent
+
+					cpu_nodes[id].level = cur->level;
+					cpu_nodes[id].parent = cur->_parent;
+					cpu_nodes[id].left = cur->_left;
+					cpu_nodes[id].right = cur->_right;
+					cpu_nodes[id].split_value = cur->split_value;
+					cpu_nodes[id].num_indices = cur->indices.size();				// number of indices this node holds
+
+					if (cur->indices.size()) {
+						for (size_t i = 0; i < cur->indices.size(); i++)
+							indices[cur_pos + i] = cur->indices[i];
+
+						cpu_nodes[id].indices = (int)cur_pos;						// offset of this leaf's first index in the flat array
+						cur_pos += cur->indices.size();								// indices are stored contiguously
+					}
+					else {
+						cpu_nodes[id].indices = -1;
+					}
+
+					if (cur->left)
+						next_search.push_back(cur->left);
+
+					if (cur->right)
+						next_search.push_back(cur->right);
+				}
+				next_node = next_search;
+			}
+
+			HANDLE_ERROR(cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode<T>) * cpu_nodes.size(), cudaMemcpyHostToDevice));
+			HANDLE_ERROR(cudaMemcpy(m_gpu_indices, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
+			HANDLE_ERROR(cudaMemcpy(m_gpu_points, &pts[0], sizeof(kdtree::point<T, D>) * pts.size(), cudaMemcpyHostToDevice));
+		}
+		void Search(T *QueryPoints, size_t QueryCount, size_t ColCount, T *dists, size_t *indices) {
+			std::vector<kdtree::point<T, D> > query(QueryCount);
+			for (size_t j = 0; j < QueryCount; j++)
+				for (size_t i = 0; i < ColCount; i++)
+					query[j].coords[i] = QueryPoints[j * ColCount + i];
+
+			unsigned int threads = (unsigned int)(query.size() > 1024 ? 1024 : query.size());
+			//unsigned int blocks = (unsigned int)ceil(query.size() / threads);
+			unsigned int blocks = (unsigned int)(query.size() / threads + (query.size() % threads ? 1 : 0));
1 : 0)); + + kdtree::point *gpu_Query; + size_t *gpu_ret_indices; + T *gpu_ret_dist; + + HANDLE_ERROR(cudaMalloc((void**)&gpu_Query, sizeof(T) * query.size() * D)); + HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_indices, sizeof(size_t) * query.size())); + HANDLE_ERROR(cudaMalloc((void**)&gpu_ret_dist, sizeof(T) * query.size())); + HANDLE_ERROR(cudaMemcpy(gpu_Query, &query[0], sizeof(T) * query.size() * D, cudaMemcpyHostToDevice)); + + SearchBatch << > > (m_gpu_nodes, m_gpu_indices, m_gpu_points, m_num_points, gpu_Query, query.size(), gpu_ret_indices, gpu_ret_dist); + + HANDLE_ERROR(cudaMemcpy(indices, gpu_ret_indices, sizeof(size_t) * query.size(), cudaMemcpyDeviceToHost)); + HANDLE_ERROR(cudaMemcpy(dists, gpu_ret_dist, sizeof(T) * query.size(), cudaMemcpyDeviceToHost)); + + HANDLE_ERROR(cudaFree(gpu_Query)); + HANDLE_ERROR(cudaFree(gpu_ret_indices)); + HANDLE_ERROR(cudaFree(gpu_ret_dist)); + } + }; +} //end namespace stim +#endif \ No newline at end of file -- libgit2 0.21.4