Commit 32ba91f2e44b2f9c23650a36360051388cec1182

Authored by David Mayerich
2 parents 1ee79b84 fdfdeda0

Merge remote-tracking branch 'refs/remotes/origin/JACK'

Conflicts:
	stim/biomodels/network.h
	stim/math/filters/conv2.h
	stim/visualization/cylinder.h
	stim/visualization/gl_network.h
stim/biomodels/centerline.h
@@ -138,7 +138,7 @@ public:
138 //if the index is not an end point 138 //if the index is not an end point
139 else{ 139 else{
140 140
141 - unsigned int N1 = idx + 1; //calculate the size of both fibers 141 + unsigned int N1 = idx; //calculate the size of both fibers
142 unsigned int N2 = N - idx; 142 unsigned int N2 = N - idx;
143 143
144 fl.resize(2); //set the array size to 2 144 fl.resize(2); //set the array size to 2
@@ -147,7 +147,7 @@ public:
147 fl[1] = stim::centerline<T>(N2); 147 fl[1] = stim::centerline<T>(N2);
148 148
149 //copy both halves of the fiber 149 //copy both halves of the fiber
150 - unsigned int i, d; 150 + unsigned int i;
151 151
152 //first half 152 //first half
153 for(i = 0; i < N1; i++) //for each centerline point 153 for(i = 0; i < N1; i++) //for each centerline point
stim/biomodels/network.h
@@ -15,6 +15,39 @@
15 #include <stim/cuda/cudatools/timer.h> 15 #include <stim/cuda/cudatools/timer.h>
16 16
17 17
  18 +#ifdef __CUDACC__
  19 +//device gaussian function
  20 +__device__ float gaussianFunction(float x, float std){ return expf(-x/(2*std*std));}
  21 +
  22 +//compute metric in parallel
  23 +template <typename T>
  24 +__global__ void d_metric(T* M, size_t n, T* D, float sigma){
  25 + size_t x = blockDim.x * blockIdx.x + threadIdx.x;
  26 + if(x >= n) return;
  27 + M[x] = 1.0f - gaussianFunction(D[x], sigma);
  28 +}
  29 +
  30 +//find the corresponding edge index from array index
  31 +__global__ void d_findedge(size_t* I, size_t n, unsigned* R, size_t* E, size_t ne){
  32 + size_t x = blockDim.x * blockIdx.x + threadIdx.x;
  33 + if(x >= n) return;
  34 + unsigned i = 0;
  35 + size_t N = 0;
  36 + for(unsigned e = 0; e < ne; e++){
  37 + N += E[e];
  38 + if(I[x] < N){
  39 + R[x] = i;
  40 + break;
  41 + }
  42 + i++;
  43 + }
  44 +}
  45 +#endif
  46 +
  47 +// hard-coded factors; const gives them internal linkage so including this header in several translation units is safe
  48 +const int threshold_fac = 10; // a split must leave at least size/threshold_fac points on each side of an edge
  49 +const float metric_fac = 0.6f; // metric threshold for "unmappable" edges; might be related to the distance field
  50 +
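The device helpers above implement the comparison metric: for each point the kd-tree supplies the (squared) distance d to the nearest point of the other network, d_metric turns it into M = 1 - exp(-d/(2*sigma^2)), and d_findedge maps a flat point index back to the edge that owns it. The following standalone sketch is not part of the commit; it only mirrors d_metric with local names so the behavior can be checked in isolation.

```cpp
// Standalone sketch mirroring d_metric(); builds with nvcc on its own.
#include <cstdio>
#include <cuda_runtime.h>

__device__ float gaussian(float x, float std) { return expf(-x / (2 * std * std)); }

__global__ void metric_kernel(float* M, size_t n, const float* D, float sigma) {
    size_t x = blockDim.x * blockIdx.x + threadIdx.x;
    if (x >= n) return;
    M[x] = 1.0f - gaussian(D[x], sigma);            // small distance -> metric near 0 (good match)
}

int main() {
    const size_t n = 4;
    float h_D[n] = { 0.0f, 25.0f, 625.0f, 10000.0f }; // squared nearest-neighbor distances
    float h_M[n];
    float *d_D, *d_M;
    cudaMalloc(&d_D, n * sizeof(float));
    cudaMalloc(&d_M, n * sizeof(float));
    cudaMemcpy(d_D, h_D, n * sizeof(float), cudaMemcpyHostToDevice);
    size_t threads = 256;
    size_t blocks = (n + threads - 1) / threads;    // round up so every element gets a thread
    metric_kernel<<<blocks, threads>>>(d_M, n, d_D, 25.0f);
    cudaMemcpy(h_M, d_M, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (size_t i = 0; i < n; i++) printf("d = %g -> metric = %g\n", h_D[i], h_M[i]);
    cudaFree(d_D); cudaFree(d_M);
    return 0;
}
```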
18 namespace stim{ 51 namespace stim{
19 /** This is the a class that interfaces with gl_spider in order to store the currently 52 /** This is the a class that interfaces with gl_spider in order to store the currently
20 * segmented network. The following data is stored and can be extracted: 53 * segmented network. The following data is stored and can be extracted:
@@ -22,7 +55,6 @@ namespace stim{
22 * 2)Network connectivity (a graph of nodes and edges), reconstructed using ANN library. 55 * 2)Network connectivity (a graph of nodes and edges), reconstructed using ANN library.
23 */ 56 */
24 57
25 -  
26 template<typename T> 58 template<typename T>
27 class network{ 59 class network{
28 60
@@ -60,6 +92,19 @@ class network{
60 return ss.str(); 92 return ss.str();
61 } 93 }
62 94
  95 + std::vector<edge> split(unsigned int idx){
  96 +
  97 + std::vector< stim::cylinder<T> > C;
  98 + C.resize(2);
  99 + C = (*this).cylinder<T>::split(idx);
  100 + std::vector<edge> E(C.size());
  101 +
  102 + for(unsigned e = 0; e < E.size(); e++){
  103 + E[e] = C[e];
  104 + }
  105 + return E;
  106 + }
  107 +
63 }; 108 };
64 109
65 ///Node class that stores the physical position of the node as well as the edges it is connected to (edges that connect to it), As well as any additional data necessary. 110 ///Node class that stores the physical position of the node as well as the edges it is connected to (edges that connect to it), As well as any additional data necessary.
@@ -376,7 +421,8 @@ public:
376 return n; //return the resampled network 421 return n; //return the resampled network
377 } 422 }
378 423
379 - 424 + // host gaussian function
  425 + __host__ float gaussianFunction(float x, float std = 25){ return exp(-x/(2*std*std));} // std default value is 25
380 426
381 /// Calculate the total number of points on all edges. 427 /// Calculate the total number of points on all edges.
382 unsigned total_points(){ 428 unsigned total_points(){
@@ -402,9 +448,6 @@ public:
402 } 448 }
403 } 449 }
404 450
405 - // gaussian function  
406 - float gaussianFunction(float x, float std=25){ return exp(-x/(2*std*std));} // by default std = 25  
407 -  
408 // convert vec3 to array 451 // convert vec3 to array
409 void stim2array(float *a, stim::vec3<T> b){ 452 void stim2array(float *a, stim::vec3<T> b){
410 a[0] = b[0]; 453 a[0] = b[0];
@@ -412,6 +455,16 @@ public:
412 a[2] = b[2]; 455 a[2] = b[2];
413 } 456 }
414 457
  458 + // convert vec3 to array in bunch
  459 + void edge2array(T* a, edge b){
  460 + size_t n = b.size();
  461 + for(size_t i = 0; i < n; i++){
  462 + a[i * 3 + 0] = b[i][0];
  463 + a[i * 3 + 1] = b[i][1];
  464 + a[i * 3 + 2] = b[i][2];
  465 + }
  466 + }
  467 +
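The new edge2array() packs every point of an edge into the interleaved x0,y0,z0,x1,y1,z1,... layout that the kd-tree expects for both its reference and query arrays. A minimal, self-contained illustration of that layout (the edge here is a plain n-by-3 array, not the stim class):

```cpp
#include <cstdio>
#include <vector>

// Mirrors edge2array(): flatten an edge's points into x0,y0,z0,x1,y1,z1,...
// Here an "edge" is just a raw n-by-3 array rather than the stim class.
void edge_to_array(float* a, const float (*pts)[3], size_t n) {
    for (size_t i = 0; i < n; i++)
        for (size_t d = 0; d < 3; d++)
            a[i * 3 + d] = pts[i][d];
}

int main() {
    const float pts[3][3] = { {0, 0, 0}, {1, 0, 0}, {2, 1, 0} };  // three centerline points
    std::vector<float> flat(3 * 3);
    edge_to_array(flat.data(), pts, 3);
    for (float v : flat) printf("%g ", v);   // prints: 0 0 0 1 0 0 2 1 0
    printf("\n");
    return 0;
}
```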
415 /// Calculate the average magnitude across the entire network. 468 /// Calculate the average magnitude across the entire network.
416 /// @param m is the magnitude value to use. The default is 0 (usually radius). 469 /// @param m is the magnitude value to use. The default is 0 (usually radius).
417 T average(unsigned m = 0){ 470 T average(unsigned m = 0){
@@ -419,7 +472,7 @@ public:
419 T M, L; //allocate space for the total magnitude and length 472 T M, L; //allocate space for the total magnitude and length
420 M = L = 0; //initialize both the initial magnitude and length to zero 473 M = L = 0; //initialize both the initial magnitude and length to zero
421 for(unsigned e = 0; e < E.size(); e++){ //for each edge in the network 474 for(unsigned e = 0; e < E.size(); e++){ //for each edge in the network
422 - M += E[e].integrate(m); //get the integrated magnitude 475 + M += E[e].integrate(); //get the integrated magnitude
423 L += E[e].length(); //get the edge length 476 L += E[e].length(); //get the edge length
424 } 477 }
425 478
@@ -427,7 +480,6 @@ public:
427 } 480 }
428 481
429 /// This function compares two networks and returns the percentage of the current network that is missing from A. 482 /// This function compares two networks and returns the percentage of the current network that is missing from A.
430 -  
431 /// @param A is the network to compare to - the field is generated for A 483 /// @param A is the network to compare to - the field is generated for A
432 /// @param sigma is the user-defined tolerance value - smaller values provide a stricter comparison 484 /// @param sigma is the user-defined tolerance value - smaller values provide a stricter comparison
433 stim::network<T> compare(stim::network<T> A, float sigma, int device = -1){ 485 stim::network<T> compare(stim::network<T> A, float sigma, int device = -1){
@@ -437,7 +489,7 @@ public:
437 489
438 T *c; // centerline (array of double pointers) - points on kdtree must be double 490 T *c; // centerline (array of double pointers) - points on kdtree must be double
439 size_t n_data = A.total_points(); // set the number of points 491 size_t n_data = A.total_points(); // set the number of points
440 - c = (T*) malloc(sizeof(T) * n_data * 3); //allocate an array to store all points in the data set 492 + c = (T*) malloc(sizeof(T) * n_data * 3); // allocate an array to store all points in the data set
441 493
442 unsigned t = 0; 494 unsigned t = 0;
443 for(unsigned e = 0; e < A.E.size(); e++){ //for each edge in the network 495 for(unsigned e = 0; e < A.E.size(); e++){ //for each edge in the network
@@ -451,62 +503,362 @@ public:
451 } 503 }
452 504
453 //generate a KD-tree for network A 505 //generate a KD-tree for network A
454 - //float metric = 0.0; // initialize metric to be returned after comparing the network  
455 size_t MaxTreeLevels = 3; // max tree level 506 size_t MaxTreeLevels = 3; // max tree level
456 507
457 #ifdef __CUDACC__ 508 #ifdef __CUDACC__
458 cudaSetDevice(device); 509 cudaSetDevice(device);
459 - stim::cuda_kdtree<T, 3> kdt; // initialize a pointer to a kd tree  
460 -  
461 - //compare each point in the current network to the field produced by A 510 + stim::kdtree<T, 3> kdt; // initialize a pointer to a kd tree
  511 +
462 kdt.create(c, n_data, MaxTreeLevels); // build a KD tree 512 kdt.create(c, n_data, MaxTreeLevels); // build a KD tree
463 - T *dists = new T[1]; // near neighbor distances  
464 - size_t *nnIdx = new size_t[1]; // near neighbor indices // allocate near neigh indices  
465 513
466 - stim::vec3<T> p0, p1;  
467 - T m1;  
468 - //float M = 0; //stores the total metric value  
469 - //float L = 0; //stores the total network length  
470 - T* queryPt = new T[3];  
471 for(unsigned e = 0; e < R.E.size(); e++){ //for each edge in A 514 for(unsigned e = 0; e < R.E.size(); e++){ //for each edge in A
472 R.E[e].add_mag(0); //add a new magnitude for the metric 515 R.E[e].add_mag(0); //add a new magnitude for the metric
473 size_t errormag_id = R.E[e].nmags() - 1; //get the id for the new magnitude 516 size_t errormag_id = R.E[e].nmags() - 1; //get the id for the new magnitude
  517 +
  518 + size_t n = R.E[e].size(); // the number of points in current edge
  519 + T* queryPt = new T[3 * n];
  520 + T* m1 = new T[n];
  521 + T* dists = new T[n];
  522 + size_t* nnIdx = new size_t[n];
474 523
475 - for(unsigned p = 0; p < R.E[e].size(); p++){ //for each point in the edge 524 + T* d_dists;
  525 + T* d_m1;
  526 + cudaMalloc((void**)&d_dists, n * sizeof(T));
  527 + cudaMalloc((void**)&d_m1, n * sizeof(T));
476 528
477 - p1 = R.E[e][p]; //get the next point in the edge  
478 - stim2array(queryPt, p1);  
479 - kdt.search(queryPt, 1, nnIdx, dists); //find the distance between A and the current network 529 + edge2array(queryPt, R.E[e]);
  530 + kdt.search(queryPt, n, nnIdx, dists);
  531 +
  532 + cudaMemcpy(d_dists, dists, n * sizeof(T), cudaMemcpyHostToDevice); // copy dists from host to device
  533 +
  534 + // configuration parameters
  535 + size_t threads = (1024>n)?n:1024;
  536 + size_t blocks = n/threads + ((n%threads)?1:0); // round up so every point gets a thread
  537 +
  538 + d_metric<<<blocks, threads>>>(d_m1, n, d_dists, sigma); //calculate the metric value based on the distance
  539 +
  540 + cudaMemcpy(m1, d_m1, n * sizeof(T), cudaMemcpyDeviceToHost);
  541 +
  542 + for(unsigned p = 0; p < n; p++){
  543 + R.E[e].set_mag(errormag_id, p, m1[p]);
  544 + }
  545 +
  546 + //d_set_mag<<<blocks, threads>>>(R.E[e].M, errormag_id, n, m1);
  547 + }
  548 +
  549 +#else
  550 + stim::kdtree<T, 3> kdt;
  551 + kdt.create(c, n_data, MaxTreeLevels);
  552 +
  553 + for(unsigned e = 0; e < R.E.size(); e++){ //for each edge in A
  554 + R.E[e].add_mag(0); //add a new magnitude for the metric
  555 + size_t errormag_id = R.E[e].nmags() - 1;
  556 +
  557 + size_t n = R.E[e].size(); // the number of points in current edge
  558 + T* queryPt = new T[3 * n]; // flattened query points for this edge
  559 + T* m1 = new T[n];
  560 + T* dists = new T[n];
  561 + size_t* nnIdx = new size_t[n];
  562 +
  563 + edge2array(queryPt, R.E[e]);
  564 +
  565 + kdt.cpu_search(queryPt, n, nnIdx, dists); //find the distance between A and the current network
  566 +
  567 + for(unsigned p = 0; p < R.E[e].size(); p++){
  568 + m1[p] = 1.0f - gaussianFunction((T)dists[p], sigma); //calculate the metric value based on the distance
  569 + R.E[e].set_mag(errormag_id, p, m1[p]); //set the error for the second point in the segment
  570 + }
  571 + }
  572 +#endif
  573 + return R; //return the resulting network
  574 + }
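A hedged usage sketch for the reworked compare(): it now flattens each edge with edge2array(), issues one batched kd-tree query per edge, and (under nvcc) evaluates the metric on the GPU with d_metric. The loader calls are hypothetical placeholders; only compare() and the per-edge magnitude it appends are taken from the code above.

```cpp
// Sketch only: load_obj() is a placeholder, not shown in this commit.
#include <stim/biomodels/network.h>

int main() {
    stim::network<float> GT;       // ground-truth network
    stim::network<float> T;        // test (segmented) network
    // GT.load_obj("gt.obj");      // hypothetical loaders
    // T.load_obj("test.obj");

    float sigma = 25.0f;           // tolerance: larger sigma forgives larger distances
    stim::network<float> R = T.compare(GT, sigma, 0);   // device 0 is used when built with nvcc

    // Each edge of R now carries an extra magnitude holding 1 - exp(-d/(2*sigma^2))
    // per point, where d is the distance to the nearest point of GT.
    return 0;
}
```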
  575 +
  576 + /// This function compares two networks and splits the current one according to the nearest neighbor of each point in each edge
  577 + /// @param A is the network to split
  578 + /// @param B is the corresponding mapping network
  579 + /// @param sigma is the user-defined tolerance value - smaller values provide a stricter comparison
  580 + /// @param device is the CUDA device to use
  581 + void split(stim::network<T> A, stim::network<T> B, float sigma, int device){
  582 +
  583 + T *c;
  584 + size_t n_data = B.total_points();
  585 + c = (T*) malloc(sizeof(T) * n_data * 3);
  586 +
  587 + size_t NB = B.E.size(); // the number of edges in B
  588 + unsigned t = 0;
  589 + for(unsigned e = 0; e < NB; e++){ // for every edge in B
  590 + for(unsigned p = 0; p < B.E[e].size(); p++){ // for every point in B.E[e]
  591 + for(unsigned d = 0; d < 3; d++){
  592 +
  593 + c[t * 3 + d] = B.E[e][p][d]; // convert to array
  594 + }
  595 + t++;
  596 + }
  597 + }
  598 + size_t MaxTreeLevels = 3; // max tree level
  599 +
  600 +#ifdef __CUDACC__
  601 + cudaSetDevice(device);
  602 + stim::kdtree<T, 3> kdt; // initialize a pointer to a kd tree
  603 +
  604 + //compare each point in the current network to the field produced by A
  605 + kdt.create(c, n_data, MaxTreeLevels); // build a KD tree
  606 +
  607 + std::vector<std::vector<unsigned>> relation; // the relationship between GT and T corresponding to NN
  608 + relation.resize(A.E.size());
  609 +
  610 + for(unsigned e = 0; e < A.E.size(); e++){ //for each edge in A
  611 + A.E[e].add_mag(0); //add a new magnitude for the metric
  612 + size_t errormag_id = A.E[e].nmags() - 1;
  613 +
  614 + size_t n = A.E[e].size(); // the number of points in the current edge
  615 +
  616 + T* queryPt = new T[3 * n]; // set of all the points in current edge
  617 + T* m1 = new T[n]; // array of metrics for every point in current edge
  618 + T* dists = new T[n]; // store the distances for every point in current edge
  619 + size_t* nnIdx = new size_t[n]; // store the indices for every point in current edge
  620 +
  621 + // define pointers in device
  622 + T* d_dists;
  623 + T* d_m1;
  624 + size_t* d_nnIdx;
  625 +
  626 + // allocate memory for defined pointers
  627 + cudaMalloc((void**)&d_dists, n * sizeof(T));
  628 + cudaMalloc((void**)&d_m1, n * sizeof(T));
  629 + cudaMalloc((void**)&d_nnIdx, n * sizeof(size_t));
  630 +
  631 + edge2array(queryPt, A.E[e]); // convert edge to array
  632 + kdt.search(queryPt, n, nnIdx, dists); // search the tree to find the NN for every point in current edge
  633 +
  634 + cudaMemcpy(d_dists, dists, n * sizeof(T), cudaMemcpyHostToDevice); // copy dists from host to device
  635 + cudaMemcpy(d_nnIdx, nnIdx, n * sizeof(size_t), cudaMemcpyHostToDevice); // copy Idx from host to device
  636 +
  637 + // configuration parameters
  638 + size_t threads = (1024>n)?n:1024; // use at most 1024 threads per block
  639 + size_t blocks = n/threads + ((n%threads)?1:0); // round up so every point gets a thread
  640 +
  641 + d_metric<<<blocks, threads>>>(d_m1, n, d_dists, sigma); // calculate the metrics in parallel
  642 +
  643 + cudaMemcpy(m1, d_m1, n * sizeof(T), cudaMemcpyDeviceToHost);
  644 +
  645 + for(unsigned p = 0; p < n; p++){
  646 + A.E[e].set_mag(errormag_id, p, m1[p]); // set the error(radius) value to every point in current edge
  647 + }
  648 +
  649 + relation[e].resize(n); // resize every edge relation size
  650 +
  651 + unsigned* d_relation;
  652 + cudaMalloc((void**)&d_relation, n * sizeof(unsigned)); // allocate memory
  653 +
  654 + std::vector<size_t> edge_point_num(NB); // %th element is the number of points that %th edge has
  655 + for(unsigned ee = 0; ee < NB; ee++)
  656 + edge_point_num[ee] = B.E[ee].size();
  657 +
  658 + size_t* d_edge_point_num;
  659 + cudaMalloc((void**)&d_edge_point_num, NB * sizeof(size_t));
  660 + cudaMemcpy(d_edge_point_num, &edge_point_num[0], NB * sizeof(size_t), cudaMemcpyHostToDevice);
  661 +
  662 + d_findedge<<<blocks, threads>>>(d_nnIdx, n, d_relation, d_edge_point_num, NB); // find the edge corresponding to the array index in parallel
  663 +
  664 + cudaMemcpy(&relation[e][0], d_relation, n * sizeof(unsigned), cudaMemcpyDeviceToHost); //copy relationship from device to host
  665 + }
  666 +#else
  667 + stim::kdtree<T, 3> kdt;
  668 + kdt.create(c, n_data, MaxTreeLevels);
  669 +
  670 + std::vector<std::vector<unsigned>> relation; // the mapping relationship between two networks
  671 + relation.resize(A.E.size());
  672 + for(unsigned i = 0; i < A.E.size(); i++)
  673 + relation[i].resize(A.E[i].size());
  674 +
  675 + std::vector<size_t> edge_point_num(NB); //%th element is the number of points that %th edge has
  676 + for(unsigned ee = 0; ee < NB; ee++)
  677 + edge_point_num[ee] = B.E[ee].size();
  678 +
  679 + for(unsigned e = 0; e < A.E.size(); e++){ //for each edge in A
  680 + A.E[e].add_mag(0); //add a new magnitude for the metric
  681 + size_t errormag_id = A.E[e].nmags() - 1;
  682 +
  683 + size_t n = A.E[e].size(); // the number of points in the current edge
  684 +
  685 + T* queryPt = new T[3 * n];
  686 + T* m1 = new T[n];
  687 + T* dists = new T[n]; //store the distances
  688 + size_t* nnIdx = new size_t[n]; //store the indices
  689 +
  690 + edge2array(queryPt, A.E[e]);
  691 + kdt.search(queryPt, n, nnIdx, dists);
  692 +
  693 + for(unsigned p = 0; p < A.E[e].size(); p++){
  694 + m1[p] = 1.0f - gaussianFunction((T)dists[p], sigma); //calculate the metric value based on the distance
  695 + A.E[e].set_mag(errormag_id, p, m1[p]); //set the error for the second point in the segment
  696 +
  697 + unsigned id = 0; //mapping edge's idx
  698 + size_t num = 0; //total number of points before #th edge
  699 + for(unsigned i = 0; i < NB; i++){
  700 + num += B.E[i].size();
  701 + if(nnIdx[p] < num){ //find the edge it belongs to
  702 + relation[e][p] = id;
  703 + break;
  704 + }
  705 + id++; //current edge won't be the one, move to next edge
  706 + }
  707 + }
  708 + }
  709 +#endif
  710 + E = A.E;
  711 + V = A.V;
  712 +
  713 + unsigned int id = 0; // split value
  714 + for(unsigned e = 0; e < E.size(); e++){ // for every edge
  715 + for(unsigned p = 0; p < E[e].size() - 1; p++){ // for every point in each edge
  716 + if(relation[e][p] != relation[e][p + 1]){ // find the nearest edge changing point
  717 + id = p + 1; // candidate split index: the first point whose nearest edge differs from its predecessor's
  718 + if(id < E[e].size()/threshold_fac || (E[e].size() - id) < E[e].size()/threshold_fac) // tolerance is tentatively 1/threshold_fac of the edge length--the proper threshold still needs to be derived
  719 + id = E[e].size() - 1; // reject splits too close to either end of the edge
  720 + else
  721 + break;
  722 + }
  723 + if(p == E[e].size() - 2) // if there is no splitting index, set the id to the last point index of current edge
  724 + id = p + 1;
  725 + }
  726 + unsigned errormag_id = E[e].nmags() - 1;
  727 + T G = 0; // accumulate the metric to test whether this edge has a counterpart in B
  728 + for(unsigned i = 0; i < E[e].size(); i++)
  729 + G += E[e].m(i, errormag_id); // won't split special edges
  730 + if(G / E[e].size() > metric_fac) // should be based on the color map
  731 + id = E[e].size() - 1; // set split idx to outgoing direction vertex
  732 +
  733 + std::vector<edge> tmpe;
  734 + tmpe.resize(2);
  735 + tmpe = E[e].split(id);
  736 + vertex tmpv = stim::vec3<T>(-1, -1, 0); // store the split point as vertex
  737 + if(tmpe.size() == 2){
  738 + relation.resize(relation.size() + 1);
  739 + for(unsigned d = id; d < E[e].size(); d++)
  740 + relation[relation.size() - 1].push_back(relation[e][d]);
  741 + tmpe[0].v[0] = E[e].v[0]; // beginning vertex of first half edge -> original beginning vertex
  742 + tmpe[1].v[1] = E[e].v[1]; // ending vertex of second half edge -> original ending vertex
  743 + tmpv = E[e][id];
  744 + V.push_back(tmpv);
  745 + tmpe[0].v[1] = (unsigned)V.size() - 1; // ending vertex of first half edge -> new vertex
  746 + tmpe[1].v[0] = (unsigned)V.size() - 1; // beginning vertex of second half edge -> new vertex
  747 + edge tmp(E[e]);
  748 + E[e] = tmpe[0]; // replace original edge by first half edge
  749 + E.push_back(tmpe[1]); // push second half edge to the last
  750 + V[V.size() - 1].e[1].push_back(e); // push first half edge to the incoming of new vertex
  751 + V[V.size() - 1].e[0].push_back((unsigned)E.size() - 1); // push second half edge to the outgoing of new vertex
  752 + for(unsigned i = 0; i < V[tmp.v[1]].e[1].size(); i++) // find the incoming edge of original ending vertex
  753 + if(V[tmp.v[1]].e[1][i] == e)
  754 + V[tmp.v[1]].e[1][i] = (unsigned)V.size() - 1;
  755 + }
  756 + }
  757 + }
  758 +
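Both d_findedge above and the CPU fallback in this #else branch recover, from a flat kd-tree point index, the edge that owns it by walking the cumulative per-edge point counts. A standalone sketch of that lookup in plain C++ (local names only, not the stim types):

```cpp
#include <cstdio>
#include <vector>

// Given the number of points in each edge, return the edge that owns flat index idx.
// Mirrors the cumulative-count walk used by d_findedge and the CPU branch above.
unsigned edge_of_point(size_t idx, const std::vector<size_t>& edge_point_num) {
    size_t total = 0;
    for (unsigned e = 0; e < edge_point_num.size(); e++) {
        total += edge_point_num[e];
        if (idx < total) return e;   // idx falls inside edge e
    }
    return (unsigned)-1;             // idx is out of range
}

int main() {
    std::vector<size_t> counts = { 4, 2, 5 };   // edge 0 owns points 0-3, edge 1 owns 4-5, edge 2 owns 6-10
    size_t queries[] = { 0, 4, 6, 10 };
    for (size_t idx : queries)
        printf("point %zu -> edge %u\n", idx, edge_of_point(idx, counts));
    return 0;
}
```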
  759 + /// This function compares two split networks and yields a mapping between them based on nearest neighbors
  760 + /// @param B is the network that the current network is going to map to
  761 + /// @param C is the mapping: C[e1] = _e1 means edge e1 in the current network maps to edge _e1 in B
  762 + /// @param device is the CUDA device to use
  763 + void mapping(stim::network<T> B, std::vector<unsigned> &C, int device){
  764 + stim::network<T> A; //generate a network storing the result of the comparison
  765 + A = (*this);
  766 +
  767 + size_t n = A.E.size(); // the number of edges in A
  768 + size_t NB = B.E.size(); // the number of edges in B
  769 +
  770 + C.resize(A.E.size());
  771 +
  772 + T *c; // centerline (array of double pointers) - points on kdtree must be double
  773 + size_t n_data = B.total_points(); // set the number of points
  774 + c = (T*) malloc(sizeof(T) * n_data * 3);
  775 +
  776 + unsigned t = 0;
  777 + for(unsigned e = 0; e < NB; e++){ // for each edge in the network
  778 + for(unsigned p = 0; p < B.E[e].size(); p++){ // for each point in the edge
  779 + for(unsigned d = 0; d < 3; d++){ // for each coordinate
  780 +
  781 + c[t * 3 + d] = B.E[e][p][d];
  782 + }
  783 + t++;
  784 + }
  785 + }
480 786
481 - m1 = 1.0f - gaussianFunction((T)dists[0], sigma); //calculate the metric value based on the distance  
482 - R.E[e].set_mag(errormag_id, p, m1); //set the error for the second point in the segment 787 + //generate a KD-tree for network A
  788 + //float metric = 0.0; // initialize metric to be returned after comparing the network
  789 + size_t MaxTreeLevels = 3; // max tree level
  790 +
  791 +#ifdef __CUDACC__
  792 + cudaSetDevice(device);
  793 + stim::kdtree<T, 3> kdt; // initialize a pointer to a kd tree
  794 +
  795 + kdt.create(c, n_data, MaxTreeLevels); // build a KD tree
483 796
  797 + for(unsigned e = 0; e < n; e++){ //for each edge in A
  798 + size_t errormag_id = A.E[e].nmags() - 1; //get the id for the new magnitude
  799 +
  800 + // pre-screen to discard edges that cannot map to any edge in B
  801 + T M = 0;
  802 + for(unsigned p = 0; p < A.E[e].size(); p++)
  803 + M += A.E[e].m(p, errormag_id);
  804 + M = M / A.E[e].size();
  805 + if(M > metric_fac)
  806 + C[e] = (unsigned)-1; // mark unmappable edges with the maximum unsigned value
  807 + else{
  808 + T* queryPt = new T[3];
  809 + T* dists = new T[1];
  810 + size_t* nnIdx = new size_t[1];
  811 +
  812 + stim2array(queryPt, A.E[e][A.E[e].size()/2]);
  813 + kdt.search(queryPt, 1, nnIdx, dists);
  814 +
  815 + unsigned id = 0; //mapping edge's idx
  816 + size_t num = 0; //total number of points before #th edge
  817 + for(unsigned i = 0; i < NB; i++){
  818 + num += B.E[i].size();
  819 + if(nnIdx[0] < num){
  820 + C[e] = id;
  821 + break;
  822 + }
  823 + id++;
  824 + }
484 } 825 }
485 } 826 }
486 #else 827 #else
487 - stim::cpu_kdtree<T, 3> kdt; 828 + stim::kdtree<T, 3> kdt;
488 kdt.create(c, n_data, MaxTreeLevels); 829 kdt.create(c, n_data, MaxTreeLevels);
489 T *dists = new T[1]; // near neighbor distances 830 T *dists = new T[1]; // near neighbor distances
490 size_t *nnIdx = new size_t[1]; // near neighbor indices // allocate near neigh indices 831 size_t *nnIdx = new size_t[1]; // near neighbor indices // allocate near neigh indices
491 832
492 stim::vec3<T> p0, p1; 833 stim::vec3<T> p0, p1;
493 - T m1;  
494 T* queryPt = new T[3]; 834 T* queryPt = new T[3];
495 - for(unsigned e = 0; e < R.E.size(); e++){ //for each edge in A  
496 - R.E[e].add_mag(0); //add a new magnitude for the metric  
497 835
498 - for(unsigned p = 0; p < R.E[e].size(); p++){ //for each point in the edge  
499 -  
500 - p1 = R.E[e][p]; //get the next point in the edge 836 + for(unsigned e = 0; e < A.E.size(); e++){ // for each edge in A
  837 + T M = 0; // the sum of metrics of the current edge
  838 + unsigned errormag_id = A.E[e].nmags() - 1;
  839 + for(unsigned p = 0; p < A.E[e].size(); p++)
  840 + M += A.E[e].m(p, errormag_id);
  841 + M = M / A.E[e].size();
  842 + if(M > metric_fac) // if the average metric exceeds metric_fac, assume this edge has no corresponding edge in B
  843 + C[e] = (unsigned)-1;
  844 + else{ // if it should have corresponding edge in B, then...
  845 + p1 = A.E[e][A.E[e].size()/2]; // query with the midpoint of the edge
501 stim2array(queryPt, p1); 846 stim2array(queryPt, p1);
502 - kdt.cpu_search(queryPt, 1, nnIdx, dists); //find the distance between A and the current network  
503 -  
504 - m1 = 1.0f - gaussianFunction((T)dists[0], sigma); //calculate the metric value based on the distance  
505 - R.E[e].set_mag(m1, p, 1); //set the error for the second point in the segment 847 + kdt.cpu_search(queryPt, 1, nnIdx, dists); // search the tree
  848 +
  849 + unsigned id = 0; //mapping edge's idx
  850 + size_t num = 0; //total number of points before #th edge
  851 + for(unsigned i = 0; i < NB; i++){
  852 + num += B.E[i].size();
  853 + if(nnIdx[0] < num){
  854 + C[e] = id;
  855 + break;
  856 + }
  857 + id++;
  858 + }
506 } 859 }
507 } 860 }
508 #endif 861 #endif
509 - return R; //return the resulting network  
510 } 862 }
511 863
512 /// Returns the number of magnitude values stored in each edge. This should be uniform across the network. 864 /// Returns the number of magnitude values stored in each edge. This should be uniform across the network.
@@ -616,4 +968,4 @@ public:
616 } 968 }
617 }; //end stim::network class 969 }; //end stim::network class
618 }; //end stim namespace 970 }; //end stim namespace
619 -#endif 971 -#endif
  972 +#endif
620 \ No newline at end of file 973 \ No newline at end of file
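Taken together, the new members support a split-and-map workflow between a test network and a ground truth: split() cuts every edge of one network wherever its nearest edge in the other network changes, and mapping() then records, for each resulting edge, the index of its nearest counterpart (or (unsigned)-1 when its average metric exceeds metric_fac). A hedged sketch of how these calls might be chained; the loaders are hypothetical and the member signatures follow the diff above.

```cpp
// Sketch only: load_obj() is a placeholder, not part of this commit.
#include <cstdio>
#include <vector>
#include <stim/biomodels/network.h>

int main() {
    stim::network<float> GT, T;            // ground-truth and test networks
    // GT.load_obj("gt.obj");              // hypothetical loaders
    // T.load_obj("test.obj");

    float sigma = 25.0f;                   // comparison tolerance

    stim::network<float> Ts, GTs;
    Ts.split(T, GT, sigma, 0);             // Ts = T, cut wherever the nearest GT edge changes
    GTs.split(GT, T, sigma, 0);            // GTs = GT, cut against T the same way

    std::vector<unsigned> C;               // C[e] = index of the GTs edge nearest to Ts edge e,
    Ts.mapping(GTs, C, 0);                 //        or (unsigned)-1 if the edge has no counterpart

    printf("%u edges mapped\n", (unsigned)C.size());
    return 0;
}
```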
stim/math/filters/conv2.h 0 → 100644
  1 +#ifndef STIM_CUDA_CONV2_H
  2 +#define STIM_CUDA_CONV2_H
  3 +//#define __CUDACC__
  4 +
  5 +#ifdef __CUDACC__
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#endif
  9 +
  10 +namespace stim {
  11 +#ifdef __CUDACC__
  12 + //Kernel function that performs the 2D convolution.
  13 + template<typename T, typename K>
  14 + __global__ void kernel_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
  15 + extern __shared__ T s[]; //declare a shared memory array
  16 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //threads correspond to indices into the output image
  17 + size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
  18 + size_t tid = threadIdx.y * blockDim.x + threadIdx.x;
  19 + size_t nt = blockDim.x * blockDim.y;
  20 +
  21 + size_t cx = blockIdx.x * blockDim.x; //find the upper left corner of the input region
  22 + size_t cy = blockIdx.y * blockDim.y;
  23 +
  24 + size_t X = sx - kx + 1; //calculate the size of the output image
  25 + size_t Y = sy - ky + 1;
  26 +
  27 + if (cx >= X || cy >= Y) return; //return if the entire block is outside the image
  28 + size_t smx = min(blockDim.x + kx - 1, sx - cx); //size of the shared copy of the input image
  29 + size_t smy = min(blockDim.y + ky - 1, sy - cy); // min function is used to deal with boundary blocks
  30 + stim::cuda::threadedMemcpy2D<T>(s, smx, smy, in, cx, cy, sx, sy, tid, nt); //copy the input region to shared memory
  31 + __syncthreads();
  32 +
  33 + if (xi >= X || yi >= Y) return; //returns if the thread is outside of the output image
  34 +
  35 + //loop through the kernel
  36 + size_t kxi, kyi;
  37 + K v = 0;
  38 + for (kyi = 0; kyi < ky; kyi++) {
  39 + for (kxi = 0; kxi < kx; kxi++) {
  40 + v += s[(threadIdx.y + kyi) * smx + threadIdx.x + kxi] * kernel[kyi * kx + kxi];
  41 + //v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
  42 + }
  43 + }
  44 + out[yi * X + xi] = (T)v; //write the result to global memory
  45 +
  46 + }
  47 +
  48 + //Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
  49 + //@param out is a pointer to the output image
  50 + //@param in is a pointer to the input image
  51 + //@param sx is the size of the input image along X
  52 + //@param sy is the size of the input image along Y
  53 + //@param kx is the size of the kernel along X
  54 + //@param ky is the size of the kernel along Y
  55 + template<typename T, typename K>
  56 + void gpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
  57 + cudaDeviceProp p;
  58 + HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
  59 + size_t tmax = p.maxThreadsPerBlock;
  60 + dim3 nt(sqrt(tmax), sqrt(tmax)); //calculate the block dimensions
  61 + size_t X = sx - kx + 1; //calculate the size of the output image
  62 + size_t Y = sy - ky + 1;
  63 + dim3 nb(X / nt.x + 1, Y / nt.y + 1); //calculate the grid dimensions
  64 + size_t sm = (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T); //shared memory bytes required to store block data
  65 + if (sm > p.sharedMemPerBlock) {
  66 + std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
  67 + exit(1);
  68 + }
  69 + kernel_conv2 <<<nb, nt, sm>>> (out, in, kernel, sx, sy, kx, ky); //launch the kernel
  70 + }
  71 +#endif
  72 + //Performs a convolution of a 2D image. Only valid pixels based on the kernel are returned.
  73 + // As a result, the output image will be smaller than the input image by (kx-1, ky-1)
  74 + //@param out is a pointer to the output image
  75 + //@param in is a pointer to the input image
  76 + //@param sx is the size of the input image along X
  77 + //@param sy is the size of the input image along Y
  78 + //@param kx is the size of the kernel along X
  79 + //@param ky is the size of the kernel along Y
  80 + template<typename T, typename K>
  81 + void cpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
  82 + size_t X = sx - kx + 1; //x size of the output image
  83 + size_t Y = sy - ky + 1; //y size of the output image
  84 +
  85 +#ifdef __CUDACC__
  86 + //allocate memory and copy everything to the GPU
  87 + T* gpu_in;
  88 + HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
  89 + HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
  90 + K* gpu_kernel;
  91 + HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(K)));
  92 + HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(K), cudaMemcpyHostToDevice));
  93 + T* gpu_out;
  94 + HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
  95 + gpu_conv2(gpu_out, gpu_in, gpu_kernel, sx, sy, kx, ky); //execute the GPU kernel
  96 + HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost)); //copy the result to the host
  97 + HANDLE_ERROR(cudaFree(gpu_in));
  98 + HANDLE_ERROR(cudaFree(gpu_kernel));
  99 + HANDLE_ERROR(cudaFree(gpu_out));
  100 +#else
  101 + K v; //register stores the integral of the current pixel value
  102 + size_t yi, xi, kyi, kxi, yi_kyi_sx;
  103 + for (yi = 0; yi < Y; yi++) { //for each pixel in the output image
  104 + for (xi = 0; xi < X; xi++) {
  105 + v = 0;
  106 + for (kyi = 0; kyi < ky; kyi++) { //for each pixel in the kernel
  107 + yi_kyi_sx = (yi + kyi) * sx;
  108 + for (kxi = 0; kxi < kx; kxi++) {
  109 + v += in[yi_kyi_sx + xi + kxi] * kernel[kyi * kx + kxi];
  110 + }
  111 + }
  112 + out[yi * X + xi] = v; //save the result to the output array
  113 + }
  114 + }
  115 +
  116 +#endif
  117 + }
  118 +
  119 +
  120 +}
  121 +
  122 +
  123 +#endif
0 \ No newline at end of file 124 \ No newline at end of file
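A short usage sketch for the new valid-mode 2D convolution in conv2.h. Compiled without nvcc it exercises the CPU path of cpu_conv2(); with nvcc the same call stages the data on the GPU. The output is (sx-kx+1) by (sy-ky+1), here 3 by 2 for a 5 by 4 image and a 3 by 3 box kernel. The include path follows this diff; everything else is illustrative.

```cpp
#include <cstdio>
#include <stim/math/filters/conv2.h>

int main() {
    const size_t sx = 5, sy = 4;          // input image: 5 columns, 4 rows (row-major, sx is the stride)
    const size_t kx = 3, ky = 3;          // 3x3 kernel
    const size_t X = sx - kx + 1;         // valid output width  = 3
    const size_t Y = sy - ky + 1;         // valid output height = 2

    float in[sx * sy], out[X * Y], k[kx * ky];
    for (size_t i = 0; i < sx * sy; i++) in[i] = (float)i;      // simple ramp image
    for (size_t i = 0; i < kx * ky; i++) k[i] = 1.0f / 9.0f;    // box (mean) filter

    stim::cpu_conv2(out, in, k, sx, sy, kx, ky);

    for (size_t y = 0; y < Y; y++) {
        for (size_t x = 0; x < X; x++) printf("%6.2f ", out[y * X + x]);
        printf("\n");
    }
    return 0;
}
```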
stim/structures/kdtree.cuh
1 -// right now the size of CUDA STACK is set to 1000, increase it if you mean to make deeper tree 1 +// right now the size of CUDA STACK is set to 50, increase it if you mean to make deeper tree
2 // data should be stored in row-major 2 // data should be stored in row-major
3 // x1,x2,x3,x4,x5...... 3 // x1,x2,x3,x4,x5......
4 // y1,y2,y3,y4,y5...... 4 // y1,y2,y3,y4,y5......
@@ -22,16 +22,16 @@
22 #include <stim/visualization/aabbn.h> 22 #include <stim/visualization/aabbn.h>
23 23
24 namespace stim { 24 namespace stim {
25 - namespace kdtree { 25 + namespace cpu_kdtree {
26 template<typename T, int D> // typename refers to float or double while D refers to dimension of points 26 template<typename T, int D> // typename refers to float or double while D refers to dimension of points
27 struct point { 27 struct point {
28 T dim[D]; // create a structure to store every one input point 28 T dim[D]; // create a structure to store every one input point
29 }; 29 };
30 30
31 template<typename T> 31 template<typename T>
32 - class kdnode { 32 + class cpu_kdnode {
33 public: 33 public:
34 - kdnode() { // constructor for initializing a kdnode 34 + cpu_kdnode() { // constructor for initializing a kdnode
35 parent = NULL; // set every node's parent, left and right kdnode pointers to NULL 35 parent = NULL; // set every node's parent, left and right kdnode pointers to NULL
36 left = NULL; 36 left = NULL;
37 right = NULL; 37 right = NULL;
@@ -42,258 +42,12 @@ namespace stim {
42 } 42 }
43 int idx; // index of current node 43 int idx; // index of current node
44 int parent_idx, left_idx, right_idx; // index of parent, left and right nodes 44 int parent_idx, left_idx, right_idx; // index of parent, left and right nodes
45 - kdnode *parent, *left, *right; // parent, left and right kdnodes 45 + cpu_kdnode *parent, *left, *right; // parent, left and right kdnodes
46 T split_value; // splitting value of current node 46 T split_value; // splitting value of current node
47 std::vector <size_t> indices; // it indicates the points' indices that current node has 47 std::vector <size_t> indices; // it indicates the points' indices that current node has
48 size_t level; // tree level of current node 48 size_t level; // tree level of current node
49 }; 49 };
50 - } // end of namespace kdtree  
51 -  
52 - template <typename T, int D = 3> // set dimension of data to default 3  
53 - class cpu_kdtree {  
54 - protected:  
55 - int current_axis; // current judging axis  
56 - int n_id; // store the total number of nodes  
57 - std::vector < typename kdtree::point<T, D> > *tmp_points; // transfer or temperary points  
58 - std::vector < typename kdtree::point<T, D> > cpu_tmp_points; // for cpu searching  
59 - kdtree::kdnode<T> *root; // root node  
60 - static cpu_kdtree<T, D> *cur_tree_ptr;  
61 - public:  
62 - cpu_kdtree() { // constructor for creating a cpu_kdtree  
63 - cur_tree_ptr = this; // create a class pointer points to the current class value  
64 - n_id = 0; // set total number of points to default 0  
65 - }  
66 -  
67 - ~cpu_kdtree() { // destructor of cpu_kdtree  
68 - std::vector <kdtree::kdnode<T>*> next_nodes;  
69 - next_nodes.push_back(root);  
70 - while (next_nodes.size()) {  
71 - std::vector <kdtree::kdnode<T>*> next_search_nodes;  
72 - while (next_nodes.size()) {  
73 - kdtree::kdnode<T> *cur = next_nodes.back();  
74 - next_nodes.pop_back();  
75 - if (cur->left)  
76 - next_search_nodes.push_back(cur->left);  
77 - if (cur->right)  
78 - next_search_nodes.push_back(cur->right);  
79 - delete cur;  
80 - }  
81 - next_nodes = next_search_nodes;  
82 - }  
83 - root = NULL;  
84 - }  
85 -  
86 - void cpu_create(std::vector < typename kdtree::point<T, D> > &reference_points, size_t max_levels) {  
87 - tmp_points = &reference_points;  
88 - root = new kdtree::kdnode<T>(); // initializing the root node  
89 - root->idx = n_id++; // the index of root is 0  
90 - root->level = 0; // tree level begins at 0  
91 - root->indices.resize(reference_points.size()); // get the number of points  
92 - for (size_t i = 0; i < reference_points.size(); i++) {  
93 - root->indices[i] = i; // set indices of input points  
94 - }  
95 - std::vector <kdtree::kdnode<T>*> next_nodes; // next nodes  
96 - next_nodes.push_back(root); // push back the root node  
97 - while (next_nodes.size()) {  
98 - std::vector <kdtree::kdnode<T>*> next_search_nodes; // next search nodes  
99 - while (next_nodes.size()) { // two same WHILE is because we need to make a new vector to store nodes for search  
100 - kdtree::kdnode<T> *current_node = next_nodes.back(); // handle node one by one (right first)  
101 - next_nodes.pop_back(); // pop out current node in order to store next round of nodes  
102 - if (current_node->level < max_levels) {  
103 - if (current_node->indices.size() > 1) { // split if the nonleaf node contains more than one point  
104 - kdtree::kdnode<T> *left = new kdtree::kdnode<T>();  
105 - kdtree::kdnode<T> *right = new kdtree::kdnode<T>();  
106 - left->idx = n_id++; // set the index of current node's left node  
107 - right->idx = n_id++;  
108 - split(current_node, left, right); // split left and right and determine a node  
109 - std::vector <size_t> temp; // empty vecters of int  
110 - //temp.resize(current_node->indices.size());  
111 - current_node->indices.swap(temp); // clean up current node's indices  
112 - current_node->left = left;  
113 - current_node->right = right;  
114 - current_node->left_idx = left->idx;  
115 - current_node->right_idx = right->idx;  
116 - if (right->indices.size())  
117 - next_search_nodes.push_back(right); // left pop out first  
118 - if (left->indices.size())  
119 - next_search_nodes.push_back(left);  
120 - }  
121 - }  
122 - }  
123 - next_nodes = next_search_nodes; // go deeper within the tree  
124 - }  
125 - }  
126 -  
127 - static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort  
128 - std::vector < typename kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer  
129 - return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis];  
130 - }  
131 -  
132 - void split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) {  
133 - std::vector < typename kdtree::point<T, D> > &pts = *tmp_points;  
134 - current_axis = cur->level % D; // indicate the judicative dimension or axis  
135 - std::sort(cur->indices.begin(), cur->indices.end(), sort_points); // using SortPoints as comparison function to sort the data  
136 - size_t mid_value = cur->indices[cur->indices.size() / 2]; // odd in the mid_value, even take the floor  
137 - cur->split_value = pts[mid_value].dim[current_axis]; // get the parent node  
138 - left->parent = cur; // set the parent of the next search nodes to current node  
139 - right->parent = cur;  
140 - left->level = cur->level + 1; // level + 1  
141 - right->level = cur->level + 1;  
142 - left->parent_idx = cur->idx; // set its parent node's index  
143 - right->parent_idx = cur->idx;  
144 - for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one  
145 - size_t idx = cur->indices[i];  
146 - if (pts[idx].dim[current_axis] < cur->split_value)  
147 - left->indices.push_back(idx);  
148 - else  
149 - right->indices.push_back(idx);  
150 - }  
151 - }  
152 -  
153 - void create(T *h_reference_points, size_t reference_count, size_t max_levels) {  
154 - std::vector < typename kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way  
155 - for (size_t j = 0; j < reference_count; j++)  
156 - for (size_t i = 0; i < D; i++)  
157 - reference_points[j].dim[i] = h_reference_points[j * D + i];  
158 - cpu_create(reference_points, max_levels);  
159 - cpu_tmp_points = *tmp_points;  
160 - }  
161 -  
162 - int get_num_nodes() const { // get the total number of nodes  
163 - return n_id;  
164 - }  
165 -  
166 - kdtree::kdnode<T>* get_root() const { // get the root node of tree  
167 - return root;  
168 - }  
169 -  
170 - T cpu_distance(const kdtree::point<T, D> &a, const kdtree::point<T, D> &b) {  
171 - T distance = 0;  
172 -  
173 - for (size_t i = 0; i < D; i++) {  
174 - T d = a.dim[i] - b.dim[i];  
175 - distance += d*d;  
176 - }  
177 - return distance;  
178 - }  
179 -  
180 - void cpu_search_at_node(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, size_t *index, T *distance, kdtree::kdnode<T> **node) {  
181 - T best_distance = FLT_MAX; // initialize the best distance to max of floating point  
182 - size_t best_index = 0;  
183 - std::vector < typename kdtree::point<T, D> > pts = cpu_tmp_points;  
184 - while (true) {  
185 - size_t split_axis = cur->level % D;  
186 - if (cur->left == NULL) { // risky but acceptable, same goes for right because left and right are in same pace  
187 - *node = cur; // pointer points to a pointer  
188 - for (size_t i = 0; i < cur->indices.size(); i++) {  
189 - size_t idx = cur->indices[i];  
190 - T d = cpu_distance(query, pts[idx]); // compute distances  
191 - /// if we want to compute k nearest neighbor, we can input the last resul  
192 - /// (last_best_dist < dist < best_dist) to select the next point until reaching to k  
193 - if (d < best_distance) {  
194 - best_distance = d;  
195 - best_index = idx; // record the nearest neighbor index  
196 - }  
197 - }  
198 - break; // find the target point then break the loop  
199 - }  
200 - else if (query.dim[split_axis] < cur->split_value) { // if it has son node, visit the next node on either left side or right side  
201 - cur = cur->left;  
202 - }  
203 - else {  
204 - cur = cur->right;  
205 - }  
206 - }  
207 - *index = best_index;  
208 - *distance = best_distance;  
209 - }  
210 -  
211 - void cpu_search_at_node_range(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, T range, size_t *index, T *distance) {  
212 - T best_distance = FLT_MAX; // initialize the best distance to max of floating point  
213 - size_t best_index = 0;  
214 - std::vector < typename kdtree::point<T, D> > pts = cpu_tmp_points;  
215 - std::vector < typename kdtree::kdnode<T>*> next_node;  
216 - next_node.push_back(cur);  
217 - while (next_node.size()) {  
218 - std::vector<typename kdtree::kdnode<T>*> next_search;  
219 - while (next_node.size()) {  
220 - cur = next_node.back();  
221 - next_node.pop_back();  
222 - size_t split_axis = cur->level % D;  
223 - if (cur->left == NULL) {  
224 - for (size_t i = 0; i < cur->indices.size(); i++) {  
225 - size_t idx = cur->indices[i];  
226 - T d = cpu_distance(query, pts[idx]);  
227 - if (d < best_distance) {  
228 - best_distance = d;  
229 - best_index = idx;  
230 - }  
231 - }  
232 - }  
233 - else {  
234 - T d = query.dim[split_axis] - cur->split_value; // computer distance along specific axis or dimension  
235 - /// there are three possibilities: on either left or right, and on both left and right  
236 - if (fabs(d) > range) { // absolute value of floating point to see if distance will be larger that best_dist  
237 - if (d < 0)  
238 - next_search.push_back(cur->left); // every left[split_axis] is less and equal to cur->split_value, so it is possible to find the nearest point in this region  
239 - else  
240 - next_search.push_back(cur->right);  
241 - }  
242 - else { // it is possible that nereast neighbor will appear on both left and right  
243 - next_search.push_back(cur->left);  
244 - next_search.push_back(cur->right);  
245 - }  
246 - }  
247 - }  
248 - next_node = next_search; // pop out at least one time  
249 - }  
250 - *index = best_index;  
251 - *distance = best_distance;  
252 - }  
253 -  
254 - void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) {  
255 - /// first convert the input query point into specific type  
256 - kdtree::point<T, D> query;  
257 - for (size_t j = 0; j < query_count; j++) {  
258 - for (size_t i = 0; i < D; i++)  
259 - query.dim[i] = h_query_points[j * D + i];  
260 - /// find the nearest node, this will be the upper bound for the next time searching  
261 - kdtree::kdnode<T> *best_node = NULL;  
262 - T best_distance = FLT_MAX;  
263 - size_t best_index = 0;  
264 - T radius = 0; // radius for range  
265 - cpu_search_at_node(root, query, &best_index, &best_distance, &best_node); // simple search to rougly determine a result for next search step  
266 - radius = sqrt(best_distance); // It is possible that nearest will appear in another region  
267 - /// find other possibilities  
268 - kdtree::kdnode<T> *cur = best_node;  
269 - while (cur->parent != NULL) { // every node that you pass will be possible to be the best node  
270 - /// go up  
271 - kdtree::kdnode<T> *parent = cur->parent; // travel back to every node that we pass through  
272 - size_t split_axis = (parent->level) % D;  
273 - /// search other nodes  
274 - size_t tmp_index;  
275 - T tmp_distance = FLT_MAX;  
276 - if (fabs(parent->split_value - query.dim[split_axis]) <= radius) {  
277 - /// search opposite node  
278 - if (parent->left != cur)  
279 - cpu_search_at_node_range(parent->left, query, radius, &tmp_index, &tmp_distance); // to see whether it is its mother node's left son node  
280 - else  
281 - cpu_search_at_node_range(parent->right, query, radius, &tmp_index, &tmp_distance);  
282 - }  
283 - if (tmp_distance < best_distance) {  
284 - best_distance = tmp_distance;  
285 - best_index = tmp_index;  
286 - }  
287 - cur = parent;  
288 - }  
289 - h_indices[j] = best_index;  
290 - h_distances[j] = best_distance;  
291 - }  
292 - }  
293 - }; //end class kdtree  
294 -  
295 - template <typename T, int D>  
296 - cpu_kdtree<T, D>* cpu_kdtree<T, D>::cur_tree_ptr = NULL; // definition of cur_tree_ptr pointer points to the current class 50 + } // end of namespace cpu_kdtree
297 51
298 template <typename T> 52 template <typename T>
299 struct cuda_kdnode { 53 struct cuda_kdnode {
@@ -305,7 +59,7 @@ namespace stim {
305 }; 59 };
306 60
307 template <typename T, int D> 61 template <typename T, int D>
308 - __device__ T gpu_distance(kdtree::point<T, D> &a, kdtree::point<T, D> &b) { 62 + __device__ T gpu_distance(cpu_kdtree::point<T, D> &a, cpu_kdtree::point<T, D> &b) {
309 T distance = 0; 63 T distance = 0;
310 64
311 for (size_t i = 0; i < D; i++) { 65 for (size_t i = 0; i < D; i++) {
@@ -316,7 +70,7 @@ namespace stim {
316 } 70 }
317 71
318 template <typename T, int D> 72 template <typename T, int D>
319 - __device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) { 73 + __device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, int cur, cpu_kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
320 T best_distance = FLT_MAX; 74 T best_distance = FLT_MAX;
321 size_t best_index = 0; 75 size_t best_index = 0;
322 76
@@ -346,7 +100,7 @@ namespace stim {
346 } 100 }
347 101
348 template <typename T, int D> 102 template <typename T, int D>
349 - __device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { 103 + __device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
350 T best_distance = FLT_MAX; 104 T best_distance = FLT_MAX;
351 size_t best_index = 0; 105 size_t best_index = 0;
352 106
@@ -405,7 +159,7 @@ namespace stim {
405 } 159 }
406 160
407 template <typename T, int D> 161 template <typename T, int D>
408 - __device__ void search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) { 162 + __device__ void search(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
409 int best_node = 0; 163 int best_node = 0;
410 T best_distance = FLT_MAX; 164 T best_distance = FLT_MAX;
411 size_t best_index = 0; 165 size_t best_index = 0;
@@ -438,7 +192,7 @@ namespace stim {
438 } 192 }
439 193
440 template <typename T, int D> 194 template <typename T, int D>
441 - __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { 195 + __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) {
442 size_t idx = blockIdx.x * blockDim.x + threadIdx.x; 196 size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
443 if (idx >= d_query_count) return; // avoid segfault 197 if (idx >= d_query_count) return; // avoid segfault
444 198
@@ -446,11 +200,11 @@ namespace stim {
446 } 200 }
447 201
448 template <typename T, int D> 202 template <typename T, int D>
449 - void search_stream(cuda_kdnode<T> *d_nodes, size_t *d_index, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *query_stream_points, size_t stream_count, size_t *indices, T *distances) { 203 + void search_stream(cuda_kdnode<T> *d_nodes, size_t *d_index, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> *query_stream_points, size_t stream_count, size_t *indices, T *distances) {
450 unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count); 204 unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count);
451 unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 1 : 0)); 205 unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 1 : 0));
452 206
453 - kdtree::point<T, D> *d_query_points; 207 + cpu_kdtree::point<T, D> *d_query_points;
454 size_t *d_indices; 208 size_t *d_indices;
455 T *d_distances; 209 T *d_distances;
456 210
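The remaining hunks fold the old cpu_kdtree class into the renamed stim::kdtree, so the same object builds the tree on the host and, when compiled with nvcc, also holds the device copies used by the GPU search. A hedged usage sketch of the unified interface; create(), search() and cpu_search() follow the calls made from network.h above, everything else is an assumption.

```cpp
// Sketch only: exercises the unified stim::kdtree as it is used from network.h.
#include <cstdio>
#include <stim/structures/kdtree.cuh>

int main() {
    float ref[] = { 0, 0, 0,   1, 0, 0,   0, 2, 0 };   // three reference points, interleaved (x, y, z)
    stim::kdtree<float, 3> kdt;
    kdt.create(ref, 3, 3);                             // build with MaxTreeLevels = 3

    float query[] = { 0.9f, 0.1f, 0.0f };              // one query point
    size_t idx[1];
    float dist[1];
#ifdef __CUDACC__
    kdt.search(query, 1, idx, dist);                   // GPU nearest-neighbor search
#else
    kdt.cpu_search(query, 1, idx, dist);               // CPU fallback
#endif
    printf("nearest index: %zu\n", idx[0]);            // expected: 1, the point (1, 0, 0)
    return 0;
}
```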
@@ -480,26 +234,121 @@ namespace stim {
480 HANDLE_ERROR(cudaFree(d_distances)); 234 HANDLE_ERROR(cudaFree(d_distances));
481 } 235 }
482 236
483 - template <typename T, int D = 3>  
484 - class cuda_kdtree { 237 + template <typename T, int D = 3> // set dimension of data to default 3
  238 + class kdtree {
485 protected: 239 protected:
486 - cuda_kdnode<T> *d_nodes;  
487 - size_t *d_index;  
488 - kdtree::point<T, D>* d_reference_points;  
489 - size_t npts;  
490 - int num_nodes; 240 + int current_axis; // current judging axis
  241 + int n_id; // store the total number of nodes
  242 + std::vector < typename cpu_kdtree::point<T, D> > *tmp_points; // temporary pointer to the input points
  243 + std::vector < typename cpu_kdtree::point<T, D> > cpu_tmp_points; // for cpu searching
  244 + cpu_kdtree::cpu_kdnode<T> *root; // root node
  245 + static kdtree<T, D> *cur_tree_ptr;
  246 + #ifdef __CUDACC__
  247 + cuda_kdnode<T> *d_nodes;
  248 + size_t *d_index;
  249 + cpu_kdtree::point<T, D>* d_reference_points;
  250 + size_t npts;
  251 + int num_nodes;
  252 + #endif
491 public: 253 public:
492 - ~cuda_kdtree() { 254 + kdtree() { // constructor for creating a cpu_kdtree
  255 + cur_tree_ptr = this; // create a class pointer points to the current class value
  256 + n_id = 0; // set total number of points to default 0
  257 + }
  258 +
  259 + ~kdtree() { // destructor of cpu_kdtree
  260 + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes;
  261 + next_nodes.push_back(root);
  262 + while (next_nodes.size()) {
  263 + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_search_nodes;
  264 + while (next_nodes.size()) {
  265 + cpu_kdtree::cpu_kdnode<T> *cur = next_nodes.back();
  266 + next_nodes.pop_back();
  267 + if (cur->left)
  268 + next_search_nodes.push_back(cur->left);
  269 + if (cur->right)
  270 + next_search_nodes.push_back(cur->right);
  271 + delete cur;
  272 + }
  273 + next_nodes = next_search_nodes;
  274 + }
  275 + root = NULL;
  276 + #ifdef __CUDACC__
493 HANDLE_ERROR(cudaFree(d_nodes)); 277 HANDLE_ERROR(cudaFree(d_nodes));
494 HANDLE_ERROR(cudaFree(d_index)); 278 HANDLE_ERROR(cudaFree(d_index));
495 HANDLE_ERROR(cudaFree(d_reference_points)); 279 HANDLE_ERROR(cudaFree(d_reference_points));
  280 + #endif
496 } 281 }
497 -  
498 - /// Create a KD-tree given a pointer to an array of reference points and the number of reference points  
499 - /// @param h_reference_points is a host array containing the reference points in (x0, y0, z0, ...., ) order  
500 - /// @param reference_count is the number of reference point in the array  
501 - /// @param max_levels is the deepest number of tree levels allowed  
502 - void create(T *h_reference_points, size_t reference_count, size_t max_levels = 3) { 282 +
  283 + void cpu_create(std::vector < typename cpu_kdtree::point<T, D> > &reference_points, size_t max_levels) {
  284 + tmp_points = &reference_points;
  285 + root = new cpu_kdtree::cpu_kdnode<T>(); // initializing the root node
  286 + root->idx = n_id++; // the index of root is 0
  287 + root->level = 0; // tree level begins at 0
  288 + root->indices.resize(reference_points.size()); // get the number of points
  289 + for (size_t i = 0; i < reference_points.size(); i++) {
  290 + root->indices[i] = i; // set indices of input points
  291 + }
  292 + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes; // next nodes
  293 + next_nodes.push_back(root); // push back the root node
  294 + while (next_nodes.size()) {
  295 + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_search_nodes; // next search nodes
  296 + while (next_nodes.size()) { // two same WHILE is because we need to make a new vector to store nodes for search
  297 + cpu_kdtree::cpu_kdnode<T> *current_node = next_nodes.back(); // handle node one by one (right first)
  298 + next_nodes.pop_back(); // pop out current node in order to store next round of nodes
  299 + if (current_node->level < max_levels) {
  300 + if (current_node->indices.size() > 1) { // split if the nonleaf node contains more than one point
  301 + cpu_kdtree::cpu_kdnode<T> *left = new cpu_kdtree::cpu_kdnode<T>();
  302 + cpu_kdtree::cpu_kdnode<T> *right = new cpu_kdtree::cpu_kdnode<T>();
  303 + left->idx = n_id++; // set the index of current node's left node
  304 + right->idx = n_id++;
  305 + split(current_node, left, right); // split left and right and determine a node
  307 + std::vector <size_t> temp; // empty vector (swapped in to release the node's indices)
  307 + //temp.resize(current_node->indices.size());
  308 + current_node->indices.swap(temp); // clean up current node's indices
  309 + current_node->left = left;
  310 + current_node->right = right;
  311 + current_node->left_idx = left->idx;
  312 + current_node->right_idx = right->idx;
  313 + if (right->indices.size())
  314 + next_search_nodes.push_back(right); // left pop out first
  315 + if (left->indices.size())
  316 + next_search_nodes.push_back(left);
  317 + }
  318 + }
  319 + }
  320 + next_nodes = next_search_nodes; // go deeper within the tree
  321 + }
  322 + }
  323 +
  324 + static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort
  325 + std::vector < typename cpu_kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer
  326 + return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis];
  327 + }
  328 +
  329 + void split(cpu_kdtree::cpu_kdnode<T> *cur, cpu_kdtree::cpu_kdnode<T> *left, cpu_kdtree::cpu_kdnode<T> *right) {
  330 + std::vector < typename cpu_kdtree::point<T, D> > &pts = *tmp_points;
  331 + current_axis = cur->level % D; // choose the splitting axis from the tree level
  332 + std::sort(cur->indices.begin(), cur->indices.end(), sort_points); // sort the point indices along the current axis using sort_points
  333 + size_t mid_value = cur->indices[cur->indices.size() / 2]; // index of the median point along the current axis
  334 + cur->split_value = pts[mid_value].dim[current_axis]; // the median coordinate becomes the split value
  335 + left->parent = cur; // set the parent of the next search nodes to current node
  336 + right->parent = cur;
  337 + left->level = cur->level + 1; // level + 1
  338 + right->level = cur->level + 1;
  339 + left->parent_idx = cur->idx; // set its parent node's index
  340 + right->parent_idx = cur->idx;
  341 + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one
  342 + size_t idx = cur->indices[i];
  343 + if (pts[idx].dim[current_axis] < cur->split_value)
  344 + left->indices.push_back(idx);
  345 + else
  346 + right->indices.push_back(idx);
  347 + }
  348 + }
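
For readers skimming the hunk, the split above is a plain median split along one axis. Below is a minimal, self-contained sketch of the same rule; pt2 and median_split are illustrative stand-ins under the assumption of 2-D float points, not stim types.

#include <algorithm>
#include <cstddef>
#include <vector>

struct pt2 { float dim[2]; };                              // stand-in for cpu_kdtree::point<T, D> with D = 2

// Partition the point indices around the median coordinate along `axis`,
// mirroring what split() does for a single node.
void median_split(std::vector<size_t> idx, const std::vector<pt2> &pts, int axis,
                  float &split_value, std::vector<size_t> &left, std::vector<size_t> &right) {
    std::sort(idx.begin(), idx.end(),
              [&](size_t a, size_t b) { return pts[a].dim[axis] < pts[b].dim[axis]; });
    split_value = pts[idx[idx.size() / 2]].dim[axis];      // median coordinate becomes the split value
    for (size_t i : idx)                                   // points below the split go left, the rest go right
        (pts[i].dim[axis] < split_value ? left : right).push_back(i);
}
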
  349 +
  350 + void create(T *h_reference_points, size_t reference_count, size_t max_levels) {
  351 + #ifdef __CUDACC__
503 if (max_levels > 10) { 352 if (max_levels > 10) {
504 std::cout<<"The max_tree_levels should be smaller!"<<std::endl; 353 std::cout<<"The max_tree_levels should be smaller!"<<std::endl;
505 exit(1); 354 exit(1);
@@ -507,29 +356,28 @@ namespace stim { @@ -507,29 +356,28 @@ namespace stim {
507 //bb.init(&h_reference_points[0]); 356 //bb.init(&h_reference_points[0]);
508 //aaboundingboxing<T, D>(bb, h_reference_points, reference_count); 357 //aaboundingboxing<T, D>(bb, h_reference_points, reference_count);
509 358
510 - std::vector < typename kdtree::point<T, D>> reference_points(reference_count); // restore the reference points in particular way 359 + std::vector < typename cpu_kdtree::point<T, D>> reference_points(reference_count); // pack the reference points into point<T, D> structures
511 for (size_t j = 0; j < reference_count; j++) 360 for (size_t j = 0; j < reference_count; j++)
512 for (size_t i = 0; i < D; i++) 361 for (size_t i = 0; i < D; i++)
513 - reference_points[j].dim[i] = h_reference_points[j * D + i];  
514 - cpu_kdtree<T, D> tree; // creating a tree on cpu  
515 - tree.cpu_create(reference_points, max_levels); // building a tree on cpu  
516 - kdtree::kdnode<T> *d_root = tree.get_root();  
517 - num_nodes = tree.get_num_nodes(); 362 + reference_points[j].dim[i] = h_reference_points[j * D + i];
  363 + (*this).cpu_create(reference_points, max_levels); // building a tree on cpu
  364 + cpu_kdtree::cpu_kdnode<T> *d_root = (*this).get_root();
  365 + num_nodes = (*this).get_num_nodes();
518 npts = reference_count; // also equals to reference_count 366 npts = reference_count; // also equals to reference_count
519 367
520 HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes)); // copy data from host to device 368 HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes)); // copy data from host to device
521 HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * npts)); 369 HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * npts));
522 - HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(kdtree::point<T, D>) * npts)); 370 + HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(cpu_kdtree::point<T, D>) * npts));
523 371
524 std::vector < cuda_kdnode<T> > tmp_nodes(num_nodes); 372 std::vector < cuda_kdnode<T> > tmp_nodes(num_nodes);
525 std::vector <size_t> indices(npts); 373 std::vector <size_t> indices(npts);
526 - std::vector <kdtree::kdnode<T>*> next_nodes; 374 + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes;
527 size_t cur_pos = 0; 375 size_t cur_pos = 0;
528 next_nodes.push_back(d_root); 376 next_nodes.push_back(d_root);
529 while (next_nodes.size()) { 377 while (next_nodes.size()) {
530 - std::vector <typename kdtree::kdnode<T>*> next_search_nodes; 378 + std::vector <typename cpu_kdtree::cpu_kdnode<T>*> next_search_nodes;
531 while (next_nodes.size()) { 379 while (next_nodes.size()) {
532 - kdtree::kdnode<T> *cur = next_nodes.back(); 380 + cpu_kdtree::cpu_kdnode<T> *cur = next_nodes.back();
533 next_nodes.pop_back(); 381 next_nodes.pop_back();
534 int id = cur->idx; // the nodes at same level are independent 382 int id = cur->idx; // the nodes at same level are independent
535 tmp_nodes[id].level = cur->level; 383 tmp_nodes[id].level = cur->level;
@@ -559,16 +407,154 @@ namespace stim { @@ -559,16 +407,154 @@ namespace stim {
559 } 407 }
560 HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice)); 408 HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice));
561 HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); 409 HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
562 - HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_count, cudaMemcpyHostToDevice)); 410 + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(cpu_kdtree::point<T, D>) * reference_count, cudaMemcpyHostToDevice));
  411 +
  412 + #else
  413 + std::vector < typename cpu_kdtree::point<T, D> > reference_points(reference_count); // pack the reference points into point<T, D> structures
  414 + for (size_t j = 0; j < reference_count; j++)
  415 + for (size_t i = 0; i < D; i++)
  416 + reference_points[j].dim[i] = h_reference_points[j * D + i];
  417 + cpu_create(reference_points, max_levels);
  418 + cpu_tmp_points = *tmp_points; // keep a host-side copy of the points for the CPU search path
  419 +
  420 + #endif
  421 + }
  422 +
  423 + int get_num_nodes() const { // get the total number of nodes
  424 + return n_id;
  425 + }
  426 +
  427 + cpu_kdtree::cpu_kdnode<T>* get_root() const { // get the root node of tree
  428 + return root;
  429 + }
  430 +
  431 + T cpu_distance(const cpu_kdtree::point<T, D> &a, const cpu_kdtree::point<T, D> &b) { // squared Euclidean distance between a and b (no sqrt)
  432 + T distance = 0;
  433 +
  434 + for (size_t i = 0; i < D; i++) {
  435 + T d = a.dim[i] - b.dim[i];
  436 + distance += d*d;
  437 + }
  438 + return distance;
  439 + }
  440 +
  441 + void cpu_search_at_node(cpu_kdtree::cpu_kdnode<T> *cur, const cpu_kdtree::point<T, D> &query, size_t *index, T *distance, cpu_kdtree::cpu_kdnode<T> **node) {
  442 + T best_distance = FLT_MAX; // initialize the best distance to max of floating point
  443 + size_t best_index = 0;
  444 + std::vector < typename cpu_kdtree::point<T, D> > &pts = cpu_tmp_points; // bind a reference so the point set is not copied on every query
  445 + while (true) {
  446 + size_t split_axis = cur->level % D;
  447 + if (cur->left == NULL) { // leaf node (left and right are always NULL together)
  448 + *node = cur; // return the leaf node that was reached
  449 + for (size_t i = 0; i < cur->indices.size(); i++) {
  450 + size_t idx = cur->indices[i];
  451 + T d = cpu_distance(query, pts[idx]); // compute distances
  452 + /// to compute the k nearest neighbors, the previous result could be fed back in and points
  453 + /// with last_best_dist < dist < best_dist accepted until k neighbors have been collected
  454 + if (d < best_distance) {
  455 + best_distance = d;
  456 + best_index = idx; // record the nearest neighbor index
  457 + }
  458 + }
  459 + break; // reached a leaf, so stop descending
  460 + }
  461 + else if (query.dim[split_axis] < cur->split_value) { // otherwise descend into the child on the query's side of the split
  462 + cur = cur->left;
  463 + }
  464 + else {
  465 + cur = cur->right;
  466 + }
  467 + }
  468 + *index = best_index;
  469 + *distance = best_distance;
  470 + }
  471 +
  472 + void cpu_search_at_node_range(cpu_kdtree::cpu_kdnode<T> *cur, const cpu_kdtree::point<T, D> &query, T range, size_t *index, T *distance) {
  473 + T best_distance = FLT_MAX; // initialize the best distance to max of floating point
  474 + size_t best_index = 0;
  475 + std::vector < typename cpu_kdtree::point<T, D> > &pts = cpu_tmp_points; // bind a reference so the point set is not copied on every query
  476 + std::vector < typename cpu_kdtree::cpu_kdnode<T>*> next_node;
  477 + next_node.push_back(cur);
  478 + while (next_node.size()) {
  479 + std::vector<typename cpu_kdtree::cpu_kdnode<T>*> next_search;
  480 + while (next_node.size()) {
  481 + cur = next_node.back();
  482 + next_node.pop_back();
  483 + size_t split_axis = cur->level % D;
  484 + if (cur->left == NULL) {
  485 + for (size_t i = 0; i < cur->indices.size(); i++) {
  486 + size_t idx = cur->indices[i];
  487 + T d = cpu_distance(query, pts[idx]);
  488 + if (d < best_distance) {
  489 + best_distance = d;
  490 + best_index = idx;
  491 + }
  492 + }
  493 + }
  494 + else {
  495 + T d = query.dim[split_axis] - cur->split_value; // signed distance from the query to the splitting plane
  496 + /// three cases: search only the left child, only the right child, or both
  497 + if (fabs(d) > range) { // the query is farther than the search range from the splitting plane, so only one side can contain a match
  498 + if (d < 0)
  499 + next_search.push_back(cur->left); // all left-side coordinates are <= split_value, so the query's side is the left subtree
  500 + else
  501 + next_search.push_back(cur->right);
  502 + }
  503 + else { // the nearest neighbor could lie on either side, so search both
  504 + next_search.push_back(cur->left);
  505 + next_search.push_back(cur->right);
  506 + }
  507 + }
  508 + }
  509 + next_node = next_search; // move on to the next level of candidate nodes
  510 + }
  511 + *index = best_index;
  512 + *distance = best_distance;
  513 + }
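
The pruning test above relies on the fact that every point in the far child differs from the query by at least |d| along the split axis, so its full distance is also at least |d|. A tiny numeric illustration (the values are invented for this sketch):

#include <cassert>
#include <cmath>

int main() {
    float query_x = 2.0f, split_value = 5.0f, range = 1.5f;
    float d = query_x - split_value;   // signed distance from the query to the splitting plane
    assert(std::fabs(d) > range);      // 3.0 > 1.5, so the right subtree (x >= 5.0) cannot hold a point within range
    return 0;
}
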
  514 +
  515 + void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) {
  516 + /// first convert the input query point into specific type
  517 + cpu_kdtree::point<T, D> query;
  518 + for (size_t j = 0; j < query_count; j++) {
  519 + for (size_t i = 0; i < D; i++)
  520 + query.dim[i] = h_query_points[j * D + i];
  521 + /// find the nearest node, this will be the upper bound for the next time searching
  522 + cpu_kdtree::cpu_kdnode<T> *best_node = NULL;
  523 + T best_distance = FLT_MAX;
  524 + size_t best_index = 0;
  525 + T radius = 0; // radius for range
  526 + cpu_search_at_node(root, query, &best_index, &best_distance, &best_node); // greedy descent gives an initial candidate that bounds the later search
  527 + radius = sqrt(best_distance); // best_distance is squared, so take the square root; a closer point may still lie in another region
  528 + /// find other possibilities
  529 + cpu_kdtree::cpu_kdnode<T> *cur = best_node;
  530 + while (cur->parent != NULL) { // walk back up the tree; any subtree we bypassed may still hold a closer point
  531 + /// go up
  532 + cpu_kdtree::cpu_kdnode<T> *parent = cur->parent; // travel back to every node that we pass through
  533 + size_t split_axis = (parent->level) % D;
  534 + /// search other nodes
  535 + size_t tmp_index;
  536 + T tmp_distance = FLT_MAX;
  537 + if (fabs(parent->split_value - query.dim[split_axis]) <= radius) {
  538 + /// search opposite node
  539 + if (parent->left != cur)
  540 + cpu_search_at_node_range(parent->left, query, radius, &tmp_index, &tmp_distance); // search the sibling subtree on the other side of the split
  541 + else
  542 + cpu_search_at_node_range(parent->right, query, radius, &tmp_index, &tmp_distance);
  543 + }
  544 + if (tmp_distance < best_distance) {
  545 + best_distance = tmp_distance;
  546 + best_index = tmp_index;
  547 + }
  548 + cur = parent;
  549 + }
  550 + h_indices[j] = best_index;
  551 + h_distances[j] = best_distance;
  552 + }
563 } 553 }
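
For context, a rough usage sketch of the public create()/search() entry points, assuming a host-only build (no __CUDACC__, so search() falls through to cpu_search()), three-dimensional float points, and that the kd-tree header is already included; kd_demo and the level count of 5 are illustrative only.

#include <cstddef>
#include <vector>

void kd_demo(std::vector<float> &ref_xyz,      // packed reference points (x0, y0, z0, x1, ...)
             std::vector<float> &qry_xyz) {    // packed query points in the same layout
    stim::kdtree<float, 3> tree;
    tree.create(ref_xyz.data(), ref_xyz.size() / 3, 5);    // build the tree with (for example) 5 levels

    size_t nq = qry_xyz.size() / 3;
    std::vector<size_t> nearest(nq);           // index of the nearest reference point per query
    std::vector<float> dist(nq);               // distance reported by the search (squared on the CPU path)
    tree.search(qry_xyz.data(), nq, nearest.data(), dist.data());
}
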
564 554
565 - /// Search the KD tree for nearest neighbors to a set of specified query points  
566 - /// @param h_query_points an array of query points in (x0, y0, z0, ...) order  
567 - /// @param query_count is the number of query points  
568 - /// @param indices are the indices to the nearest reference point for each query points  
569 - /// @param distances is an array containing the distance between each query point and the nearest reference point  
570 void search(T *h_query_points, size_t query_count, size_t *indices, T *distances) { 555 void search(T *h_query_points, size_t query_count, size_t *indices, T *distances) {
571 - std::vector < typename kdtree::point<T, D> > query_points(query_count); 556 + #ifdef __CUDACC__
  557 + std::vector < typename cpu_kdtree::point<T, D> > query_points(query_count);
572 for (size_t j = 0; j < query_count; j++) 558 for (size_t j = 0; j < query_count; j++)
573 for (size_t i = 0; i < D; i++) 559 for (size_t i = 0; i < D; i++)
574 query_points[j].dim[i] = h_query_points[j * D + i]; 560 query_points[j].dim[i] = h_query_points[j * D + i];
@@ -595,7 +581,7 @@ namespace stim { @@ -595,7 +581,7 @@ namespace stim {
595 unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count); 581 unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count);
596 unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0)); 582 unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0));
597 583
598 - kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu 584 + cpu_kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu
599 size_t *d_indices; 585 size_t *d_indices;
600 T *d_distances; 586 T *d_distances;
601 587
@@ -624,64 +610,18 @@ namespace stim { @@ -624,64 +610,18 @@ namespace stim {
624 HANDLE_ERROR(cudaFree(d_indices)); 610 HANDLE_ERROR(cudaFree(d_indices));
625 HANDLE_ERROR(cudaFree(d_distances)); 611 HANDLE_ERROR(cudaFree(d_distances));
626 } 612 }
627 - }  
628 -  
629 - /// Return the number of points in the KD tree  
630 - size_t num_points() {  
631 - return npts;  
632 - }  
633 613
634 - stim::aabbn<T, D> getbox() {  
635 - size_t N = npts;  
636 - //std::vector < typename kdtree::point<T, D> > cpu_ref(npts); //allocate space on the CPU for the reference points  
637 - T* cpu_ref = (T*)malloc(N * D * sizeof(T)); //allocate space on the CPU for the reference points  
638 - HANDLE_ERROR(cudaMemcpy(cpu_ref, d_reference_points, N * D * sizeof(T), cudaMemcpyDeviceToHost)); //copy from GPU to CPU 614 + #else
  615 + cpu_search(h_query_points, query_count, indices, distances);
639 616
640 - stim::aabbn<T, D> bb(cpu_ref); 617 + #endif
641 618
642 - for (size_t i = 1; i < N; i++) { //for each reference point  
643 - //std::cout << "( " << cpu_ref[i * D + 0] << ", " << cpu_ref[i * D + 1] << ", " << cpu_ref[i * D + 2] << ")" << std::endl;  
644 - bb.insert(&cpu_ref[i * D]);  
645 - }  
646 - return bb;  
647 } 619 }
648 620
649 - //generate an implicit distance field for the KD-tree  
650 - void dist_field3(T* dist, size_t* dims, stim::aabbn<T, 3> bb) {  
651 - size_t N = 1; //number of query points that make up the distance field  
652 - for (size_t d = 0; d < 3; d++) N *= dims[d]; //calculate the total number of query points  
653 -  
654 - //calculate the grid spatial parameters  
655 - T dx = 0;  
656 - if (dims[0] > 1) dx = bb.length(0) / dims[0];  
657 - T dy = 0;  
658 - if (dims[1] > 1) dy = bb.length(1) / dims[1];  
659 - T dz = 0;  
660 - if (dims[2] > 1) dz = bb.length(2) / dims[2];  
661 -  
662 - T* Q = (T*)malloc(N * 3 * sizeof(T)); //allocate space for the query points  
663 - size_t i;  
664 - for (size_t z = 0; z < dims[2]; z++) { //for each query point (which is a point in the grid)  
665 - for (size_t y = 0; y < dims[1]; y++) {  
666 - for (size_t x = 0; x < dims[0]; x++) {  
667 - i = z * dims[1] * dims[0] + y * dims[0] + x;  
668 - Q[i * 3 + 0] = bb.low[0] + x * dx + dx / 2;  
669 - Q[i * 3 + 1] = bb.low[1] + y * dy + dy / 2;  
670 - Q[i * 3 + 2] = bb.low[2] + z * dz + dz / 2;  
671 - //std::cout << i<<" "<<Q[i * 3 + 0] << " " << Q[i * 3 + 1] << " " << Q[i * 3 + 2] << std::endl;  
672 - }  
673 - }  
674 - }  
675 - size_t* temp = (size_t*)malloc(N * sizeof(size_t)); //allocate space to store the indices (unused)  
676 - search(Q, N, temp, dist);  
677 - } 621 + }; //end class kdtree
678 622
679 - //generate an implicit distance field for the KD-tree  
680 - void dist_field3(T* dist, size_t* dims) {  
681 - stim::aabbn<T, D> bb = getbox(); //get a bounding box around the tree  
682 - dist_field3(dist, dims, bb);  
683 - } 623 + template <typename T, int D>
  624 + kdtree<T, D>* kdtree<T, D>::cur_tree_ptr = NULL; // out-of-class definition of the static pointer to the tree currently being built or searched
684 625
685 - };  
686 } //end namespace stim 626 } //end namespace stim
687 #endif 627 #endif
688 \ No newline at end of file 628 \ No newline at end of file
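
Side note on the design: sort_points has to be static to be passed to std::sort, which is why the class carries the static cur_tree_ptr defined above. One possible alternative, a sketch only and not what this commit does, is a capturing lambda as a drop-in replacement for the std::sort call inside split(), which removes the shared static state:

std::sort(cur->indices.begin(), cur->indices.end(),
          [this](size_t a, size_t b) {
              // member access through the captured this, so no cur_tree_ptr is needed
              return (*tmp_points)[a].dim[current_axis] < (*tmp_points)[b].dim[current_axis];
          });
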
stim/visualization/cylinder.h
@@ -668,4 +668,4 @@ public: @@ -668,4 +668,4 @@ public:
668 }; 668 };
669 669
670 } 670 }
671 -#endif 671 -#endif
  672 +#endif
672 \ No newline at end of file 673 \ No newline at end of file
stim/visualization/gl_network.h
@@ -44,25 +44,94 @@ public: @@ -44,25 +44,94 @@ public:
44 } 44 }
45 45
46 /// Render the network centerline as a series of line strips. 46 /// Render the network centerline as a series of line strips.
  47 + /// glCenterline0 renders the centerline when only a single network is loaded, using a constant (white) texture coordinate
  48 + void glCenterline0(){
  49 + if (!glIsList(dlist)) { //if dlist isn't a display list, create it
  50 + dlist = glGenLists(1); //generate a display list
  51 + glNewList(dlist, GL_COMPILE); //start a new display list
  52 + for (unsigned e = 0; e < E.size(); e++) { //for each edge in the network
  53 + glBegin(GL_LINE_STRIP);
  54 + for (unsigned p = 0; p < E[e].size(); p++) { //for each point on that edge
  55 + glTexCoord1f(0); //set a constant (white) texture coordinate before emitting the vertex
  56 + glVertex3f(E[e][p][0], E[e][p][1], E[e][p][2]); //set the vertex position based on the current point
  57 + }
  58 + glEnd();
  59 + }
  60 + glEndList(); //end the display list
  61 + }
  62 + glCallList(dlist); // render the display list
  63 + }
47 64
48 /// @param m specifies the magnitude value used as the vertex weight (radius, error, etc.) 65 /// @param m specifies the magnitude value used as the vertex weight (radius, error, etc.)
49 void glCenterline(unsigned m = 0){ 66 void glCenterline(unsigned m = 0){
50 67
51 if(!glIsList(dlist)){ //if dlist isn't a display list, create it 68 if(!glIsList(dlist)){ //if dlist isn't a display list, create it
52 - dlist = glGenLists(1); //generate a display list 69 + dlist = glGenLists(3); //generate display list names
53 glNewList(dlist, GL_COMPILE); //start a new display list 70 glNewList(dlist, GL_COMPILE); //start a new display list
54 for(unsigned e = 0; e < E.size(); e++){ //for each edge in the network 71 for(unsigned e = 0; e < E.size(); e++){ //for each edge in the network
  72 + unsigned errormag_id = E[e].nmags() - 1; //index of the last magnitude channel (used as the error value)
55 glBegin(GL_LINE_STRIP); 73 glBegin(GL_LINE_STRIP);
56 for(unsigned p = 0; p < E[e].size(); p++){ //for each point on that edge 74 for(unsigned p = 0; p < E[e].size(); p++){ //for each point on that edge
57 glVertex3f(E[e][p][0], E[e][p][1], E[e][p][2]); //set the vertex position based on the current point 75 glVertex3f(E[e][p][0], E[e][p][1], E[e][p][2]); //set the vertex position based on the current point
58 - glTexCoord1f(E[e].m(p, m)); //set the texture coordinate based on the specified magnitude index 76 + glTexCoord1f(E[e].m(p, errormag_id)); //set the texture coordinate from the last (error) magnitude
59 } 77 }
60 glEnd(); 78 glEnd();
61 } 79 }
62 glEndList(); //end the display list 80 glEndList(); //end the display list
63 } 81 }
64 glCallList(dlist); // render the display list 82 glCallList(dlist); // render the display list
  83 + }
65 84
  85 + void glRandColorCenterlineGT(GLuint &dlist1, std::vector<unsigned> map, std::vector<T> colormap){ //render the ground-truth network: mapped edges (map[e] != -1) use their own colormap entry, unmapped edges are white
  86 + if(!glIsList(dlist1)){
  87 + glNewList(dlist1, GL_COMPILE);
  88 + for(unsigned e = 0; e < E.size(); e++){
  89 + if(map[e] != unsigned(-1)){
  90 + glColor3f(colormap[e * 3 + 0], colormap[e * 3 + 1], colormap[e * 3 + 2]);
  91 + glBegin(GL_LINE_STRIP);
  92 + for(unsigned p = 0; p < E[e].size(); p++){
  93 + glVertex3f(E[e][p][0], E[e][p][1], E[e][p][2]);
  94 + }
  95 + glEnd();
  96 + }
  97 + else{
  98 + glColor3f(1.0, 1.0, 1.0);
  99 + glBegin(GL_LINE_STRIP);
  100 + for(unsigned p = 0; p < E[e].size(); p++){
  101 + glVertex3f(E[e][p][0], E[e][p][1], E[e][p][2]);
  102 + }
  103 + glEnd();
  104 + }
  105 + }
  106 + glEndList();
  107 + }
  108 + glCallList(dlist1);
  109 + }
  110 +
  111 + void glRandColorCenterlineT(GLuint &dlist2, std::vector<unsigned> map, std::vector<T> colormap){ //render the test network: matched edges take the color of their ground-truth counterpart (map[e]), unmatched edges are white
  112 + if(!glIsList(dlist2)){
  113 + glNewList(dlist2, GL_COMPILE);
  114 + for(unsigned e = 0; e < E.size(); e++){
  115 + if(map[e] != unsigned(-1)){
  116 + glColor3f(colormap[map[e] * 3 + 0], colormap[map[e] * 3 + 1], colormap[map[e] * 3 + 2]);
  117 + glBegin(GL_LINE_STRIP);
  118 + for(unsigned p = 0; p < E[e].size(); p++){
  119 + glVertex3f(E[e][p][0], E[e][p][1], E[e][p][2]);
  120 + }
  121 + glEnd();
  122 + }
  123 + else{
  124 + glColor3f(1.0, 1.0, 1.0);
  125 + glBegin(GL_LINE_STRIP);
  126 + for(unsigned p = 0; p < E[e].size(); p++){
  127 + glVertex3f(E[e][p][0], E[e][p][1], E[e][p][2]);
  128 + }
  129 + glEnd();
  130 + }
  131 + }
  132 + glEndList();
  133 + }
  134 + glCallList(dlist2);
66 } 135 }
67 136
68 }; //end stim::gl_network class 137 }; //end stim::gl_network class
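
A rough sketch of how these render calls might sit in a GLUT display callback, assuming a GLUT/OpenGL context, that gl_network.h is included, and that the edge mapping and colormap come from a network comparison done elsewhere (none of this is part of this hunk):

#include <GL/glut.h>
#include <vector>

// hypothetical globals, assumed to be filled in elsewhere
stim::gl_network<float> net;                   // the network to render
GLuint dlistGT = 0, dlistT = 0;                // display-list names; would need glGenLists() before the commented calls below
std::vector<unsigned> map;                     // test-edge -> ground-truth-edge mapping
std::vector<float> colormap;                   // 3 floats (RGB) per ground-truth edge

void display() {
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    net.glCenterline0();                                    // plain white centerlines when only one network is loaded
    //net.glRandColorCenterlineGT(dlistGT, map, colormap);  // ground-truth edges in their own colors
    //net.glRandColorCenterlineT(dlistT, map, colormap);    // test edges in the colors of their matched GT edges
    glutSwapBuffers();
}
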