Commit f36421245ddcb0ad90020be734c268eb60b3ab4b

Authored by David Mayerich
2 parents 9b563709 9dc73a42

Merge branch 'JACK' into 'master'

add stream to kdtree::search

I ran the benchmark with my data.
Please run the test with your data to see whether it works.
If you still encounter the same problem, please let me know!

See merge request !16
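
For context, the change adds a `search_stream` helper and teaches `cuda_kdtree::search` to split large query sets into chunks sized to GPU memory. A minimal usage sketch, assuming the host-side interface suggested by the diff below (the exact `create`/`search` signatures, the `max_levels` value, and the toy data are illustrative, not taken from this commit):

```cpp
#include <vector>
#include <stim/structures/kdtree.cuh>   // header changed in this merge request

int main() {
    // Three reference points and one query point in 3D (toy data).
    std::vector<float> ref   = { 0,0,0,  1,0,0,  0,2,0 };
    std::vector<float> query = { 0.9f, 0.1f, 0.0f };
    size_t ref_count   = ref.size() / 3;
    size_t query_count = query.size() / 3;

    stim::cuda_kdtree<float, 3> kdt;
    kdt.create(ref.data(), ref_count, 3);          // build the tree (max depth 3)

    std::vector<size_t> indices(query_count);      // nearest reference index per query
    std::vector<float>  distances(query_count);    // distance to that neighbor

    // With this merge, search() estimates the GPU memory the query set needs and,
    // when it is too large for one launch, processes the queries in chunks via search_stream().
    kdt.search(query.data(), query_count, indices.data(), distances.data());
    return 0;
}
```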
Showing 1 changed file with 99 additions and 29 deletions
stim/structures/kdtree.cuh
... ... @@ -63,6 +63,7 @@ namespace stim {
63 63 cur_tree_ptr = this; // create a class pointer points to the current class value
64 64 n_id = 0; // set total number of points to default 0
65 65 }
  66 +
66 67 ~cpu_kdtree() { // destructor of cpu_kdtree
67 68 std::vector <kdtree::kdnode<T>*> next_nodes;
68 69 next_nodes.push_back(root);
... ... @@ -81,6 +82,7 @@ namespace stim {
81 82 }
82 83 root = NULL;
83 84 }
  85 +
84 86 void cpu_create(std::vector < typename kdtree::point<T, D> > &reference_points, size_t max_levels) {
85 87 tmp_points = &reference_points;
86 88 root = new kdtree::kdnode<T>(); // initializing the root node
... ... @@ -121,10 +123,12 @@ namespace stim {
121 123 next_nodes = next_search_nodes; // go deeper within the tree
122 124 }
123 125 }
  126 +
124 127 static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort
125 128 std::vector < typename kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer
126 129 return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis];
127 130 }
  131 +
128 132 void split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) {
129 133 std::vector < typename kdtree::point<T, D> > &pts = *tmp_points;
130 134 current_axis = cur->level % D; // indicate the judicative dimension or axis
... ... @@ -145,6 +149,7 @@ namespace stim {
145 149 right->indices.push_back(idx);
146 150 }
147 151 }
  152 +
148 153 void create(T *h_reference_points, size_t reference_count, size_t max_levels) {
149 154 std::vector < typename kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way
150 155 for (size_t j = 0; j < reference_count; j++)
... ... @@ -153,13 +158,16 @@ namespace stim {
153 158 cpu_create(reference_points, max_levels);
154 159 cpu_tmp_points = *tmp_points;
155 160 }
  161 +
156 162 int get_num_nodes() const { // get the total number of nodes
157 163 return n_id;
158 164 }
  165 +
159 166 kdtree::kdnode<T>* get_root() const { // get the root node of tree
160 167 return root;
161 168 }
162   - T cpu_distance(const kdtree::point<T, D> &a, const kdtree::point<T, D> &b) {
  169 +
  170 + T cpu_distance(const kdtree::point<T, D> &a, const kdtree::point<T, D> &b) {
163 171 T distance = 0;
164 172  
165 173 for (size_t i = 0; i < D; i++) {
... ... @@ -168,6 +176,7 @@ namespace stim {
168 176 }
169 177 return distance;
170 178 }
  179 +
171 180 void cpu_search_at_node(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, size_t *index, T *distance, kdtree::kdnode<T> **node) {
172 181 T best_distance = FLT_MAX; // initialize the best distance to max of floating point
173 182 size_t best_index = 0;
... ... @@ -198,6 +207,7 @@ namespace stim {
198 207 *index = best_index;
199 208 *distance = best_distance;
200 209 }
  210 +
201 211 void cpu_search_at_node_range(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, T range, size_t *index, T *distance) {
202 212 T best_distance = FLT_MAX; // initialize the best distance to max of floating point
203 213 size_t best_index = 0;
... ... @@ -240,6 +250,7 @@ namespace stim {
240 250 *index = best_index;
241 251 *distance = best_distance;
242 252 }
  253 +
243 254 void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) {
244 255 /// first convert the input query point into specific type
245 256 kdtree::point<T, D> query;
... ... @@ -303,6 +314,7 @@ namespace stim {
303 314 }
304 315 return distance;
305 316 }
  317 +
306 318 template <typename T, int D>
307 319 __device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
308 320 T best_distance = FLT_MAX;
... ... @@ -332,6 +344,7 @@ namespace stim {
332 344 *d_distance = best_distance;
333 345 *d_index = best_index;
334 346 }
  347 +
335 348 template <typename T, int D>
336 349 __device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
337 350 T best_distance = FLT_MAX;
... ... @@ -390,6 +403,7 @@ namespace stim {
390 403 *d_distance = best_distance;
391 404 *d_index = best_index;
392 405 }
  406 +
393 407 template <typename T, int D>
394 408 __device__ void search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
395 409 int best_node = 0;
... ... @@ -422,6 +436,7 @@ namespace stim {
422 436 *d_distance = sqrt(best_distance);
423 437 *d_index = best_index;
424 438 }
  439 +
425 440 template <typename T, int D>
426 441 __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) {
427 442 size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
... ... @@ -429,6 +444,41 @@ namespace stim {
429 444  
430 445 search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent
431 446 }
  447 +
  448 + template <typename T, int D>
  449 + void search_stream(cuda_kdnode<T> *d_nodes, size_t *d_index, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *query_stream_points, size_t stream_count, size_t *indices, T *distances) {
  450 + unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count);
  451 + unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 1 : 0));
  452 +
  453 + kdtree::point<T, D> *d_query_points;
  454 + size_t *d_indices;
  455 + T *d_distances;
  456 +
  457 + int *next_nodes;
  458 + int *next_search_nodes;
  459 +
  460 + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * stream_count * D));
  461 + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * stream_count));
  462 + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * stream_count));
  463 + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int)));
  464 + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));
  465 + HANDLE_ERROR(cudaMemcpy(d_query_points, query_stream_points, sizeof(T) * stream_count * D, cudaMemcpyHostToDevice));
  466 +
  467 + int *Judge = NULL;
  468 +
  469 + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, stream_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge);
  470 +
  471 + if(Judge == NULL) {
  472 + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * stream_count, cudaMemcpyDeviceToHost));
  473 + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * stream_count, cudaMemcpyDeviceToHost));
  474 + }
  475 +
  476 + HANDLE_ERROR(cudaFree(next_nodes));
  477 + HANDLE_ERROR(cudaFree(next_search_nodes));
  478 + HANDLE_ERROR(cudaFree(d_query_points));
  479 + HANDLE_ERROR(cudaFree(d_indices));
  480 + HANDLE_ERROR(cudaFree(d_distances));
  481 + }
432 482  
433 483 template <typename T, int D = 3>
434 484 class cuda_kdtree {
... ... @@ -457,7 +507,7 @@ namespace stim {
457 507 //bb.init(&h_reference_points[0]);
458 508 //aaboundingboxing<T, D>(bb, h_reference_points, reference_count);
459 509  
460   - std::vector < typename kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way
  510 + std::vector < typename kdtree::point<T, D>> reference_points(reference_count); // restore the reference points in particular way
461 511 for (size_t j = 0; j < reference_count; j++)
462 512 for (size_t i = 0; i < D; i++)
463 513 reference_points[j].dim[i] = h_reference_points[j * D + i];
... ... @@ -509,7 +559,7 @@ namespace stim {
509 559 }
510 560 HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice));
511 561 HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
512   - HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_points.size(), cudaMemcpyHostToDevice));
  562 + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_count, cudaMemcpyHostToDevice));
513 563 }
514 564  
515 565 /// Search the KD tree for nearest neighbors to a set of specified query points
... ... @@ -523,37 +573,57 @@ namespace stim {
523 573 for (size_t i = 0; i < D; i++)
524 574 query_points[j].dim[i] = h_query_points[j * D + i];
525 575  
526   - unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size());
527   - unsigned int blocks = (unsigned int)(query_points.size() / threads + (query_points.size() % threads ? 1 : 0));
  576 + cudaDeviceProp prop;
  577 + cudaGetDeviceProperties(&prop, 0);
  578 +
  579 + size_t query_memory = D * sizeof(T) * query_count;
  580 + size_t N = 3 * query_memory / prop.totalGlobalMem; //consider index and distance, roughly 3 times
  581 + if (N > 1) {
  582 + N++;
  583 + size_t stream_count = query_count / N;
  584 + for (size_t n = 0; n < N; n++) {
  585 + size_t query_stream_start = n * stream_count;
  586 + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_stream_start], stream_count, &indices[query_stream_start], &distances[query_stream_start]);
  587 + }
  588 + size_t stream_remain_count = query_count - N * stream_count;
  589 + if (stream_remain_count > 0) {
  590 + size_t query_remain_start = N * stream_count;
  591 + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_remain_start], stream_remain_count, &indices[query_remain_start], &distances[query_remain_start]);
  592 + }
  593 + }
  594 + else {
  595 + unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count);
  596 + unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0));
528 597  
529   - kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu
530   - size_t *d_indices;
531   - T *d_distances;
  598 + kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu
  599 + size_t *d_indices;
  600 + T *d_distances;
532 601  
533   - int *next_nodes; // create two STACK-like array
534   - int *next_search_nodes;
  602 + int *next_nodes; // create two STACK-like array
  603 + int *next_search_nodes;
535 604  
536   - int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory
  605 + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory
537 606  
538   - HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_points.size() * D));
539   - HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size()));
540   - HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size()));
541   - HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to
542   - HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));
543   - HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_points.size() * D, cudaMemcpyHostToDevice));
544   -
545   - search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge);
546   -
547   - if (Judge == NULL) { // do the following work if the thread works safely
548   - HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost));
549   - HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost));
550   - }
  607 + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_count * D));
  608 + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_count));
  609 + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_count));
  610 + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to
  611 + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));
  612 + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_count * D, cudaMemcpyHostToDevice));
  613 +
  614 + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge);
  615 +
  616 + if (Judge == NULL) { // do the following work if the thread works safely
  617 + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_count, cudaMemcpyDeviceToHost));
  618 + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_count, cudaMemcpyDeviceToHost));
  619 + }
551 620  
552   - HANDLE_ERROR(cudaFree(next_nodes));
553   - HANDLE_ERROR(cudaFree(next_search_nodes));
554   - HANDLE_ERROR(cudaFree(d_query_points));
555   - HANDLE_ERROR(cudaFree(d_indices));
556   - HANDLE_ERROR(cudaFree(d_distances));
  621 + HANDLE_ERROR(cudaFree(next_nodes));
  622 + HANDLE_ERROR(cudaFree(next_search_nodes));
  623 + HANDLE_ERROR(cudaFree(d_query_points));
  624 + HANDLE_ERROR(cudaFree(d_indices));
  625 + HANDLE_ERROR(cudaFree(d_distances));
  626 + }
557 627 }
558 628  
559 629 /// Return the number of points in the KD tree
... ...
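
As a side note for reviewers, the chunk count in the new path comes from `size_t N = 3 * query_memory / prop.totalGlobalMem` (integer division), so chunking only kicks in once the estimated working set, roughly 3x the raw query storage to account for the index and distance buffers, approaches the card's total memory. A host-only sketch of that arithmetic with made-up sizes (not part of the commit):

```cpp
#include <cstdio>
#include <cstddef>

// Mirrors the chunk-count logic from the diff above: working set estimated at
// roughly 3x the raw query memory (query points + index + distance buffers).
int main() {
    const size_t D = 3;
    size_t query_count      = 400ull * 1000 * 1000;           // hypothetical: 400M query points
    size_t total_global_mem = 4ull * 1024 * 1024 * 1024;      // hypothetical: 4 GB GPU

    size_t query_memory = D * sizeof(float) * query_count;    // raw query storage
    size_t N = 3 * query_memory / total_global_mem;           // integer division, as in the diff
    if (N > 1) {
        N++;                                                   // one extra chunk of headroom
        size_t stream_count = query_count / N;                 // points per full chunk
        size_t remain       = query_count - N * stream_count;  // leftover points
        printf("chunks: %zu x %zu points + %zu remaining\n", N, stream_count, remain);
    } else {
        printf("query set fits in a single launch\n");
    }
    return 0;
}
```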