From bd574305b41cb1a39409ca8c4101dd365e50089d Mon Sep 17 00:00:00 2001
From: Jiaming Guo <jackguo523@gmail.com>
Date: Fri, 9 Dec 2016 15:48:12 -0600
Subject: [PATCH] add stream to kdtree::search in order to stream the query points

---
 stim/structures/kdtree.cuh | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 99 insertions(+), 29 deletions(-)
diff --git a/stim/structures/kdtree.cuh b/stim/structures/kdtree.cuh
index 9d401ce..b0e34df 100644
--- a/stim/structures/kdtree.cuh
+++ b/stim/structures/kdtree.cuh
@@ -63,6 +63,7 @@ namespace stim {
 			cur_tree_ptr = this;											// create  a class pointer points to the current class value
 			n_id = 0;														// set total number of points to default 0
 		}
+		
 		~cpu_kdtree() {											  			// destructor of cpu_kdtree
 			std::vector <kdtree::kdnode<T>*> next_nodes;
 			next_nodes.push_back(root);
@@ -81,6 +82,7 @@ namespace stim {
 			}
 			root = NULL;
 		}
+		
 		void cpu_create(std::vector < typename kdtree::point<T, D> > &reference_points, size_t max_levels) {									
 			tmp_points = &reference_points;
 			root = new kdtree::kdnode<T>();									// initializing the root node
@@ -121,10 +123,12 @@ namespace stim {
 				next_nodes = next_search_nodes;								// go deeper within the tree
 			}
 		}
+		
 		static bool sort_points(const size_t a, const size_t b) {									// create functor for std::sort
 			std::vector < typename kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points;			// put cur_tree_ptr to current input points' pointer
 			return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis];
 		}
+		
 		void split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) {
 			std::vector < typename kdtree::point<T, D> > &pts = *tmp_points;
 			current_axis = cur->level % D;												// indicate the judicative dimension or axis
@@ -145,6 +149,7 @@ namespace stim {
 					right->indices.push_back(idx);
 			}
 		}
+		
 		void create(T *h_reference_points, size_t reference_count, size_t max_levels) {
 			std::vector < typename kdtree::point<T, D> > reference_points(reference_count);		// restore the reference points in particular way
 			for (size_t j = 0; j < reference_count; j++)
@@ -153,13 +158,16 @@ namespace stim {
 			cpu_create(reference_points, max_levels);
 			cpu_tmp_points = *tmp_points;
 		}
+		
 		int get_num_nodes() const {														// get the total number of nodes
 			return n_id; 
 		}
+		
 		kdtree::kdnode<T>* get_root() const {											// get the root node of tree
 			return root; 
 		}
-        T cpu_distance(const kdtree::point<T, D> &a, const kdtree::point<T, D> &b) {
+        
+		T cpu_distance(const kdtree::point<T, D> &a, const kdtree::point<T, D> &b) {
 			T distance = 0;
 
 			for (size_t i = 0; i < D; i++) {
@@ -168,6 +176,7 @@ namespace stim {
 			}
 			return distance;
 		}
+		
 		void cpu_search_at_node(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, size_t *index, T *distance, kdtree::kdnode<T> **node) {
 			T best_distance = FLT_MAX;                                              // initialize the best distance to max of floating point
 			size_t best_index = 0;
@@ -198,6 +207,7 @@ namespace stim {
 			*index = best_index;
 			*distance = best_distance;
 		} 
+		
 		void cpu_search_at_node_range(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, T range, size_t *index, T *distance) {
 			T best_distance = FLT_MAX;                                              // initialize the best distance to max of floating point
 			size_t best_index = 0;
@@ -240,6 +250,7 @@ namespace stim {
 			*index = best_index;
 			*distance = best_distance;
 		}
+		
 		void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) {
 			/// first convert the input query point into specific type
 			kdtree::point<T, D> query;
@@ -303,6 +314,7 @@ namespace stim {
 		}
 		return distance;
 	}
+	
 	template <typename T, int D>
 	__device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
 		T best_distance = FLT_MAX;
@@ -332,6 +344,7 @@ namespace stim {
 		*d_distance = best_distance;
 		*d_index = best_index;
 	}
+	
 	template <typename T, int D>
 	__device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
 		T best_distance = FLT_MAX;
@@ -390,6 +403,7 @@ namespace stim {
 		*d_distance = best_distance;
 		*d_index = best_index;
 	}
+	
 	template <typename T, int D>
 	__device__ void search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
 		int best_node = 0;
@@ -422,6 +436,7 @@ namespace stim {
 		*d_distance = sqrt(best_distance);
 		*d_index = best_index;
 	}
+	
 	template <typename T, int D>
 	__global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) {
 		size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -429,6 +444,41 @@ namespace stim {
 
 		search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge);    // every query points are independent
 	}
+	
+	template <typename T, int D>
+	void search_stream(cuda_kdnode<T> *d_nodes, size_t *d_index, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *query_stream_points, size_t stream_count, size_t *indices, T *distances) {
+		unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count);
+		unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 1 : 0));
+
+		kdtree::point<T, D> *d_query_points;	
+		size_t *d_indices;
+		T *d_distances;
+
+		int *next_nodes;																																	
+		int *next_search_nodes;
+		
+		HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * stream_count * D));
+		HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * stream_count));
+		HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * stream_count));
+		HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int)));																	
+		HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));	
+		HANDLE_ERROR(cudaMemcpy(d_query_points, query_stream_points, sizeof(T) * stream_count * D, cudaMemcpyHostToDevice));
+
+		int *Judge = NULL;
+
+		search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, stream_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge);
+
+		if(Judge == NULL) {
+			HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * stream_count, cudaMemcpyDeviceToHost));
+			HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * stream_count, cudaMemcpyDeviceToHost));
+		}
+
+		HANDLE_ERROR(cudaFree(next_nodes));
+		HANDLE_ERROR(cudaFree(next_search_nodes));
+		HANDLE_ERROR(cudaFree(d_query_points));
+		HANDLE_ERROR(cudaFree(d_indices));
+		HANDLE_ERROR(cudaFree(d_distances));
+	}
 
 	template <typename T, int D = 3>
 	class cuda_kdtree {
@@ -457,7 +507,7 @@ namespace stim {
 			//bb.init(&h_reference_points[0]);
 			//aaboundingboxing<T, D>(bb, h_reference_points, reference_count);
 
-			std::vector < typename kdtree::point<T, D> > reference_points(reference_count);																				// restore the reference points in particular way
+			std::vector <kdtree::point<T, D>> reference_points(reference_count);																				// restore the reference points in particular way
 			for (size_t j = 0; j < reference_count; j++)
 				for (size_t i = 0; i < D; i++)
 					reference_points[j].dim[i] = h_reference_points[j * D + i];	
@@ -509,7 +559,7 @@ namespace stim {
 			}
 			HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice));
 			HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
-			HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_points.size(), cudaMemcpyHostToDevice));
+			HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_count, cudaMemcpyHostToDevice));
 		}
 
 		/// Search the KD tree for nearest neighbors to a set of specified query points
@@ -523,37 +573,57 @@ namespace stim {
 				for (size_t i = 0; i < D; i++)
 					query_points[j].dim[i] = h_query_points[j * D + i];
 
-			unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size());
-			unsigned int blocks = (unsigned int)(query_points.size() / threads + (query_points.size() % threads ? 1 : 0));
+			cudaDeviceProp prop;
+			cudaGetDeviceProperties(&prop, 0);
+			
+			size_t query_memory = D * sizeof(T) * query_count;
+			size_t N = 3 * query_memory / prop.totalGlobalMem;					//consider index and distance, roughly 3 times
+			if (N > 1) {
+				N++;
+				size_t stream_count = query_count / N;
+				for (size_t n = 0; n < N; n++) {
+					size_t query_stream_start = n * stream_count;
+					search_stream(d_nodes, d_index, d_reference_points, &query_points[query_stream_start], stream_count, &indices[query_stream_start], &distances[query_stream_start]);
+				}
+				size_t stream_remain_count = query_count - N * stream_count;
+				if (stream_remain_count > 0) {
+					size_t query_remain_start = N * stream_count;
+					search_stream(d_nodes, d_index, d_reference_points, &query_points[query_remain_start], stream_remain_count, &indices[query_remain_start], &distances[query_remain_start]);
+				}
+			}
+			else {
+				unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count);
+				unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0));
 
-			kdtree::point<T, D> *d_query_points;																												// create a pointer pointing to query points on gpu
-			size_t *d_indices;
-			T *d_distances;
+				kdtree::point<T, D> *d_query_points;																												// create a pointer pointing to query points on gpu
+				size_t *d_indices;
+				T *d_distances;
 
-			int *next_nodes;																																	// create two STACK-like array
-			int *next_search_nodes;
+				int *next_nodes;																																	// create two STACK-like array
+				int *next_search_nodes;
 
-			int *Judge = NULL;																																	// judge variable to see whether one thread is overwrite another thread's memory																						
+				int *Judge = NULL;																																	// judge variable to see whether one thread is overwrite another thread's memory																						
 		
-			HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_points.size() * D));
-			HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size()));
-			HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size()));
-			HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int)));																	// STACK size right now is 50, you can change it if you mean to
-			HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));	
-			HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_points.size() * D, cudaMemcpyHostToDevice));
-
-			search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge);
-
-			if (Judge == NULL) {																																// do the following work if the thread works safely
-				HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost));
-				HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost));
-			}
+				HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_count * D));
+				HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_count));
+				HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_count));
+				HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int)));																	// STACK size right now is 50, you can change it if you mean to
+				HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));	
+				HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_count * D, cudaMemcpyHostToDevice));
+
+				search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge);
+
+				if (Judge == NULL) {																																// do the following work if the thread works safely
+					HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_count, cudaMemcpyDeviceToHost));
+					HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_count, cudaMemcpyDeviceToHost));
+				}
 
-			HANDLE_ERROR(cudaFree(next_nodes));
-			HANDLE_ERROR(cudaFree(next_search_nodes));
-			HANDLE_ERROR(cudaFree(d_query_points));
-			HANDLE_ERROR(cudaFree(d_indices));
-			HANDLE_ERROR(cudaFree(d_distances));
+				HANDLE_ERROR(cudaFree(next_nodes));
+				HANDLE_ERROR(cudaFree(next_search_nodes));
+				HANDLE_ERROR(cudaFree(d_query_points));
+				HANDLE_ERROR(cudaFree(d_indices));
+				HANDLE_ERROR(cudaFree(d_distances));
+			}
 		}
 
 		/// Return the number of points in the KD tree
--
libgit2 0.21.4