Commit 6a54a547256f32bceb7afb81e456d9c5fc699bee
1 parent
42bb075c
new version of kdtree
Showing
1 changed file
with
627 additions
and
0 deletions
Show diff stats
1 | +// right now the size of CUDA STACK is set to 50, increase it if you mean to make deeper tree | ||
2 | +// data should be stored in row-major | ||
3 | +// x1,x2,x3,x4,x5...... | ||
4 | +// y1,y2,y3,y4,y5...... | ||
5 | +// .................... | ||
6 | +// .................... | ||
7 | + | ||
8 | +#ifndef KDTREE_H | ||
9 | +#define KDTREE_H | ||
10 | +#define stack_size 50 | ||
11 | + | ||
12 | +#include "device_launch_parameters.h" | ||
13 | +#include <cuda.h> | ||
14 | +#include <cuda_runtime_api.h> | ||
15 | +#include "cuda_runtime.h" | ||
16 | +#include <vector> | ||
17 | +#include <cstring> | ||
18 | +#include <float.h> | ||
19 | +#include <iostream> | ||
20 | +#include <algorithm> | ||
21 | +#include <stim/cuda/cudatools/error.h> | ||
22 | +#include <stim/visualization/aabbn.h> | ||
23 | + | ||
24 | +namespace stim { | ||
25 | + namespace cpu_kdtree { | ||
26 | + template<typename T, int D> // typename refers to float or double while D refers to dimension of points | ||
27 | + struct point { | ||
28 | + T dim[D]; // create a structure to store every one input point | ||
29 | + }; | ||
30 | + | ||
31 | + template<typename T> | ||
32 | + class cpu_kdnode { | ||
33 | + public: | ||
34 | + cpu_kdnode() { // constructor for initializing a kdnode | ||
35 | + parent = NULL; // set every node's parent, left and right kdnode pointers to NULL | ||
36 | + left = NULL; | ||
37 | + right = NULL; | ||
38 | + parent_idx = -1; // set parent node index to default -1 | ||
39 | + left_idx = -1; | ||
40 | + right_idx = -1; | ||
41 | + split_value = -1; // set split_value to default -1 | ||
42 | + } | ||
43 | + int idx; // index of current node | ||
44 | + int parent_idx, left_idx, right_idx; // index of parent, left and right nodes | ||
45 | + cpu_kdnode *parent, *left, *right; // parent, left and right kdnodes | ||
46 | + T split_value; // splitting value of current node | ||
47 | + std::vector <size_t> indices; // it indicates the points' indices that current node has | ||
48 | + size_t level; // tree level of current node | ||
49 | + }; | ||
50 | + } // end of namespace cpu_kdtree | ||
51 | + | ||
	// Device-side (flattened) tree node: host pointers are replaced by integer
	// node ids so the whole tree can be shipped to the GPU with one cudaMemcpy.
	// NOTE: do not reorder members -- an array of these structs is built on the
	// host and memcpy'd to the device, so the layout must match on both sides.
	template <typename T>
	struct cuda_kdnode {
		int parent, left, right;	// node ids of parent / left / right (-1 when absent; left == -1 marks a leaf)
		T split_value;				// splitting coordinate along this node's axis (level % D)
		size_t num_index; // number of point indices this (leaf) node holds
		int index; // start offset of this node's indices in the global index array (-1 if it holds none)
		size_t level;				// depth in the tree; determines the splitting axis
	};
60 | + | ||
61 | + template <typename T, int D> | ||
62 | + __device__ T gpu_distance(cpu_kdtree::point<T, D> &a, cpu_kdtree::point<T, D> &b) { | ||
63 | + T distance = 0; | ||
64 | + | ||
65 | + for (size_t i = 0; i < D; i++) { | ||
66 | + T d = a.dim[i] - b.dim[i]; | ||
67 | + distance += d*d; | ||
68 | + } | ||
69 | + return distance; | ||
70 | + } | ||
71 | + | ||
	// Device-side greedy descent: starting at node 'cur', walk down to the leaf
	// whose half-space contains d_query_point, then brute-force that leaf's
	// points. Outputs: *d_index / *d_distance get the best match found in the
	// leaf (distance is SQUARED -- see gpu_distance), and *d_node gets the leaf
	// node id so the caller can backtrack from it.
	template <typename T, int D>
	__device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, int cur, cpu_kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
		T best_distance = FLT_MAX;
		size_t best_index = 0;

		while (true) { // loop ends when a leaf is reached
			int split_axis = nodes[cur].level % D;	// splitting axis cycles through the D dimensions with depth
			if (nodes[cur].left == -1) { // leaf test: the builder creates either both children or none
				*d_node = cur;
				for (int i = 0; i < nodes[cur].num_index; i++) {	// scan every point stored in this leaf
					size_t idx = indices[nodes[cur].index + i];		// leaf indices are stored contiguously starting at .index
					T dist = gpu_distance<T, D>(d_query_point, d_reference_points[idx]);
					if (dist < best_distance) {
						best_distance = dist;
						best_index = idx;
					}
				}
				break;
			}
			else if (d_query_point.dim[split_axis] < nodes[cur].split_value) { // descend into the half-space containing the query
				cur = nodes[cur].left;
			}
			else {
				cur = nodes[cur].right;
			}
		}
		*d_distance = best_distance;
		*d_index = best_index;
	}
101 | + | ||
	// Device-side pruned subtree search: visits every descendant of 'cur' whose
	// region could contain a point within 'range' (a non-squared radius) of the
	// query, and returns the best SQUARED distance / point index found.
	// Because device code here avoids recursion, traversal uses two per-thread
	// scratch arrays in global memory -- next_nodes (current frontier) and
	// next_search_nodes (next frontier) -- each owning stack_size slots at
	// offset id * stack_size. *Judge is incremented when a frontier overflows
	// its stack_size slots so the host can detect corrupted results.
	template <typename T, int D>
	__device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
		T best_distance = FLT_MAX;
		size_t best_index = 0;

		int next_nodes_pos = 0; // number of entries currently in this thread's frontier
		next_nodes[id * stack_size + next_nodes_pos] = cur; // seed the frontier with the subtree root
		next_nodes_pos++;

		while (next_nodes_pos) {
			int next_search_nodes_pos = 0; // entries pushed for the next round
			while (next_nodes_pos) {
				cur = next_nodes[id * stack_size + next_nodes_pos - 1]; // pop the most recently pushed node
				next_nodes_pos--;
				int split_axis = nodes[cur].level % D;

				if (nodes[cur].left == -1) { // leaf: brute-force its points
					for (int i = 0; i < nodes[cur].num_index; i++) {
						int idx = indices[nodes[cur].index + i]; // leaf indices are contiguous starting at .index
						T d = gpu_distance<T>(d_query_point, d_reference_points[idx]);
						if (d < best_distance) {
							best_distance = d;
							best_index = idx;
						}
					}
				}
				else {
					// Signed distance from the query to this node's splitting plane.
					T d = d_query_point.dim[split_axis] - nodes[cur].split_value;

					if (fabs(d) > range) { // plane farther than the radius: only one side can qualify
						if (d < 0) {
							next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left;
							next_search_nodes_pos++;
						}
						else {
							next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right;
							next_search_nodes_pos++;
						}
					}
					else { // plane within the radius: both sides may contain the nearest point
						next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].right;
						next_search_nodes_pos++;
						next_search_nodes[id * stack_size + next_search_nodes_pos] = nodes[cur].left;
						next_search_nodes_pos++;
						// NOTE(review): this overflow check runs only in the two-child
						// branch and only AFTER the pushes, so an overflow has already
						// written one slot past this thread's stack region; the
						// single-child pushes above are not checked at all. Consider
						// checking before each push -- TODO confirm intended behavior.
						if (next_search_nodes_pos > stack_size) {
							printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", id);
							(*Judge)++;
						}
					}
				}
			}
			// Promote the next frontier to the current one and loop again.
			for (int i = 0; i < next_search_nodes_pos; i++)
				next_nodes[id * stack_size + i] = next_search_nodes[id * stack_size + i];
			next_nodes_pos = next_search_nodes_pos;
		}
		*d_distance = best_distance;
		*d_index = best_index;
	}
160 | + | ||
	// Device-side nearest-neighbor search for one query point:
	//   1) greedy descent to the query's leaf (search_at_node) gives an initial
	//      best match and a starting radius;
	//   2) backtrack from that leaf to the root; whenever a parent's splitting
	//      plane lies within the radius, the opposite subtree may hold a closer
	//      point, so search it with search_at_node_range.
	// Intermediate distances are squared; the final *d_distance is sqrt'ed.
	template <typename T, int D>
	__device__ void search(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
		int best_node = 0;
		T best_distance = FLT_MAX;
		size_t best_index = 0;
		T radius = 0;

		// Initial guess from the leaf containing the query (node 0 is the root).
		search_at_node<T, D>(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node);
		radius = sqrt(best_distance); // convert squared distance to a radius for plane tests
		int cur = best_node;

		while (nodes[cur].parent != -1) { // walk back up to the root
			int parent = nodes[cur].parent;
			int split_axis = nodes[parent].level % D;

			T tmp_dist = FLT_MAX; // stays FLT_MAX when the opposite subtree is pruned, so the compare below is safe
			size_t tmp_idx;
			// NOTE(review): radius is not tightened when best_distance improves
			// below; this only costs extra pruning opportunities, not correctness.
			if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) {
				// The splitting plane is within reach: search the sibling subtree.
				if (nodes[parent].left != cur)
					search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
				else
					search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
			}
			if (tmp_dist < best_distance) {
				best_distance = tmp_dist;
				best_index = tmp_idx;
			}
			cur = parent;
		}
		*d_distance = sqrt(best_distance); // report the true (non-squared) distance
		*d_index = best_index;
	}
193 | + | ||
194 | + template <typename T, int D> | ||
195 | + __global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) { | ||
196 | + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; | ||
197 | + if (idx >= d_query_count) return; // avoid segfault | ||
198 | + | ||
199 | + search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge); // every query points are independent | ||
200 | + } | ||
201 | + | ||
202 | + template <typename T, int D> | ||
203 | + void search_stream(cuda_kdnode<T> *d_nodes, size_t *d_index, cpu_kdtree::point<T, D> *d_reference_points, cpu_kdtree::point<T, D> *query_stream_points, size_t stream_count, size_t *indices, T *distances) { | ||
204 | + unsigned int threads = (unsigned int)(stream_count > 1024 ? 1024 : stream_count); | ||
205 | + unsigned int blocks = (unsigned int)(stream_count / threads + (stream_count % threads ? 1 : 0)); | ||
206 | + | ||
207 | + cpu_kdtree::point<T, D> *d_query_points; | ||
208 | + size_t *d_indices; | ||
209 | + T *d_distances; | ||
210 | + | ||
211 | + int *next_nodes; | ||
212 | + int *next_search_nodes; | ||
213 | + | ||
214 | + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * stream_count * D)); | ||
215 | + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * stream_count)); | ||
216 | + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * stream_count)); | ||
217 | + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); | ||
218 | + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); | ||
219 | + HANDLE_ERROR(cudaMemcpy(d_query_points, query_stream_points, sizeof(T) * stream_count * D, cudaMemcpyHostToDevice)); | ||
220 | + | ||
221 | + int *Judge = NULL; | ||
222 | + | ||
223 | + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, stream_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); | ||
224 | + | ||
225 | + if(Judge == NULL) { | ||
226 | + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * stream_count, cudaMemcpyDeviceToHost)); | ||
227 | + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * stream_count, cudaMemcpyDeviceToHost)); | ||
228 | + } | ||
229 | + | ||
230 | + HANDLE_ERROR(cudaFree(next_nodes)); | ||
231 | + HANDLE_ERROR(cudaFree(next_search_nodes)); | ||
232 | + HANDLE_ERROR(cudaFree(d_query_points)); | ||
233 | + HANDLE_ERROR(cudaFree(d_indices)); | ||
234 | + HANDLE_ERROR(cudaFree(d_distances)); | ||
235 | + } | ||
236 | + | ||
237 | + template <typename T, int D = 3> // set dimension of data to default 3 | ||
238 | + class kdtree { | ||
239 | + protected: | ||
240 | + int current_axis; // current judging axis | ||
241 | + int n_id; // store the total number of nodes | ||
242 | + std::vector < typename cpu_kdtree::point<T, D> > *tmp_points; // transfer or temperary points | ||
243 | + std::vector < typename cpu_kdtree::point<T, D> > cpu_tmp_points; // for cpu searching | ||
244 | + cpu_kdtree::cpu_kdnode<T> *root; // root node | ||
245 | + static kdtree<T, D> *cur_tree_ptr; | ||
246 | + #ifdef __CUDACC__ | ||
247 | + cuda_kdnode<T> *d_nodes; | ||
248 | + size_t *d_index; | ||
249 | + cpu_kdtree::point<T, D>* d_reference_points; | ||
250 | + size_t npts; | ||
251 | + int num_nodes; | ||
252 | + #endif | ||
253 | + public: | ||
254 | + kdtree() { // constructor for creating a cpu_kdtree | ||
255 | + cur_tree_ptr = this; // create a class pointer points to the current class value | ||
256 | + n_id = 0; // set total number of points to default 0 | ||
257 | + } | ||
258 | + | ||
259 | + ~kdtree() { // destructor of cpu_kdtree | ||
260 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes; | ||
261 | + next_nodes.push_back(root); | ||
262 | + while (next_nodes.size()) { | ||
263 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_search_nodes; | ||
264 | + while (next_nodes.size()) { | ||
265 | + cpu_kdtree::cpu_kdnode<T> *cur = next_nodes.back(); | ||
266 | + next_nodes.pop_back(); | ||
267 | + if (cur->left) | ||
268 | + next_search_nodes.push_back(cur->left); | ||
269 | + if (cur->right) | ||
270 | + next_search_nodes.push_back(cur->right); | ||
271 | + delete cur; | ||
272 | + } | ||
273 | + next_nodes = next_search_nodes; | ||
274 | + } | ||
275 | + root = NULL; | ||
276 | + #ifdef __CUDACC__ | ||
277 | + HANDLE_ERROR(cudaFree(d_nodes)); | ||
278 | + HANDLE_ERROR(cudaFree(d_index)); | ||
279 | + HANDLE_ERROR(cudaFree(d_reference_points)); | ||
280 | + #endif | ||
281 | + } | ||
282 | + | ||
283 | + void cpu_create(std::vector < typename cpu_kdtree::point<T, D> > &reference_points, size_t max_levels) { | ||
284 | + tmp_points = &reference_points; | ||
285 | + root = new cpu_kdtree::cpu_kdnode<T>(); // initializing the root node | ||
286 | + root->idx = n_id++; // the index of root is 0 | ||
287 | + root->level = 0; // tree level begins at 0 | ||
288 | + root->indices.resize(reference_points.size()); // get the number of points | ||
289 | + for (size_t i = 0; i < reference_points.size(); i++) { | ||
290 | + root->indices[i] = i; // set indices of input points | ||
291 | + } | ||
292 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes; // next nodes | ||
293 | + next_nodes.push_back(root); // push back the root node | ||
294 | + while (next_nodes.size()) { | ||
295 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_search_nodes; // next search nodes | ||
296 | + while (next_nodes.size()) { // two same WHILE is because we need to make a new vector to store nodes for search | ||
297 | + cpu_kdtree::cpu_kdnode<T> *current_node = next_nodes.back(); // handle node one by one (right first) | ||
298 | + next_nodes.pop_back(); // pop out current node in order to store next round of nodes | ||
299 | + if (current_node->level < max_levels) { | ||
300 | + if (current_node->indices.size() > 1) { // split if the nonleaf node contains more than one point | ||
301 | + cpu_kdtree::cpu_kdnode<T> *left = new cpu_kdtree::cpu_kdnode<T>(); | ||
302 | + cpu_kdtree::cpu_kdnode<T> *right = new cpu_kdtree::cpu_kdnode<T>(); | ||
303 | + left->idx = n_id++; // set the index of current node's left node | ||
304 | + right->idx = n_id++; | ||
305 | + split(current_node, left, right); // split left and right and determine a node | ||
306 | + std::vector <size_t> temp; // empty vecters of int | ||
307 | + //temp.resize(current_node->indices.size()); | ||
308 | + current_node->indices.swap(temp); // clean up current node's indices | ||
309 | + current_node->left = left; | ||
310 | + current_node->right = right; | ||
311 | + current_node->left_idx = left->idx; | ||
312 | + current_node->right_idx = right->idx; | ||
313 | + if (right->indices.size()) | ||
314 | + next_search_nodes.push_back(right); // left pop out first | ||
315 | + if (left->indices.size()) | ||
316 | + next_search_nodes.push_back(left); | ||
317 | + } | ||
318 | + } | ||
319 | + } | ||
320 | + next_nodes = next_search_nodes; // go deeper within the tree | ||
321 | + } | ||
322 | + } | ||
323 | + | ||
324 | + static bool sort_points(const size_t a, const size_t b) { // create functor for std::sort | ||
325 | + std::vector < typename cpu_kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points; // put cur_tree_ptr to current input points' pointer | ||
326 | + return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis]; | ||
327 | + } | ||
328 | + | ||
329 | + void split(cpu_kdtree::cpu_kdnode<T> *cur, cpu_kdtree::cpu_kdnode<T> *left, cpu_kdtree::cpu_kdnode<T> *right) { | ||
330 | + std::vector < typename cpu_kdtree::point<T, D> > &pts = *tmp_points; | ||
331 | + current_axis = cur->level % D; // indicate the judicative dimension or axis | ||
332 | + std::sort(cur->indices.begin(), cur->indices.end(), sort_points); // using SortPoints as comparison function to sort the data | ||
333 | + size_t mid_value = cur->indices[cur->indices.size() / 2]; // odd in the mid_value, even take the floor | ||
334 | + cur->split_value = pts[mid_value].dim[current_axis]; // get the parent node | ||
335 | + left->parent = cur; // set the parent of the next search nodes to current node | ||
336 | + right->parent = cur; | ||
337 | + left->level = cur->level + 1; // level + 1 | ||
338 | + right->level = cur->level + 1; | ||
339 | + left->parent_idx = cur->idx; // set its parent node's index | ||
340 | + right->parent_idx = cur->idx; | ||
341 | + for (size_t i = 0; i < cur->indices.size(); i++) { // split into left and right half-space one by one | ||
342 | + size_t idx = cur->indices[i]; | ||
343 | + if (pts[idx].dim[current_axis] < cur->split_value) | ||
344 | + left->indices.push_back(idx); | ||
345 | + else | ||
346 | + right->indices.push_back(idx); | ||
347 | + } | ||
348 | + } | ||
349 | + | ||
350 | + void create(T *h_reference_points, size_t reference_count, size_t max_levels) { | ||
351 | + #ifdef __CUDACC__ | ||
352 | + if (max_levels > 10) { | ||
353 | + std::cout<<"The max_tree_levels should be smaller!"<<std::endl; | ||
354 | + exit(1); | ||
355 | + } | ||
356 | + //bb.init(&h_reference_points[0]); | ||
357 | + //aaboundingboxing<T, D>(bb, h_reference_points, reference_count); | ||
358 | + | ||
359 | + std::vector < typename cpu_kdtree::point<T, D>> reference_points(reference_count); // restore the reference points in particular way | ||
360 | + for (size_t j = 0; j < reference_count; j++) | ||
361 | + for (size_t i = 0; i < D; i++) | ||
362 | + reference_points[j].dim[i] = h_reference_points[j * D + i]; // creating a tree on cpu | ||
363 | + (*this).cpu_create(reference_points, max_levels); // building a tree on cpu | ||
364 | + cpu_kdtree::cpu_kdnode<T> *d_root = (*this).get_root(); | ||
365 | + num_nodes = (*this).get_num_nodes(); | ||
366 | + npts = reference_count; // also equals to reference_count | ||
367 | + | ||
368 | + HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes)); // copy data from host to device | ||
369 | + HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * npts)); | ||
370 | + HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(cpu_kdtree::point<T, D>) * npts)); | ||
371 | + | ||
372 | + std::vector < cuda_kdnode<T> > tmp_nodes(num_nodes); | ||
373 | + std::vector <size_t> indices(npts); | ||
374 | + std::vector <cpu_kdtree::cpu_kdnode<T>*> next_nodes; | ||
375 | + size_t cur_pos = 0; | ||
376 | + next_nodes.push_back(d_root); | ||
377 | + while (next_nodes.size()) { | ||
378 | + std::vector <typename cpu_kdtree::cpu_kdnode<T>*> next_search_nodes; | ||
379 | + while (next_nodes.size()) { | ||
380 | + cpu_kdtree::cpu_kdnode<T> *cur = next_nodes.back(); | ||
381 | + next_nodes.pop_back(); | ||
382 | + int id = cur->idx; // the nodes at same level are independent | ||
383 | + tmp_nodes[id].level = cur->level; | ||
384 | + tmp_nodes[id].parent = cur->parent_idx; | ||
385 | + tmp_nodes[id].left = cur->left_idx; | ||
386 | + tmp_nodes[id].right = cur->right_idx; | ||
387 | + tmp_nodes[id].split_value = cur->split_value; | ||
388 | + tmp_nodes[id].num_index = cur->indices.size(); // number of index | ||
389 | + if (cur->indices.size()) { | ||
390 | + for (size_t i = 0; i < cur->indices.size(); i++) | ||
391 | + indices[cur_pos + i] = cur->indices[i]; | ||
392 | + | ||
393 | + tmp_nodes[id].index = (int)cur_pos; // beginning index of reference_points that every bottom node has | ||
394 | + cur_pos += cur->indices.size(); // store indices continuously for every query_point | ||
395 | + } | ||
396 | + else { | ||
397 | + tmp_nodes[id].index = -1; | ||
398 | + } | ||
399 | + | ||
400 | + if (cur->left) | ||
401 | + next_search_nodes.push_back(cur->left); | ||
402 | + | ||
403 | + if (cur->right) | ||
404 | + next_search_nodes.push_back(cur->right); | ||
405 | + } | ||
406 | + next_nodes = next_search_nodes; | ||
407 | + } | ||
408 | + HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice)); | ||
409 | + HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice)); | ||
410 | + HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(cpu_kdtree::point<T, D>) * reference_count, cudaMemcpyHostToDevice)); | ||
411 | + | ||
412 | + #else | ||
413 | + std::vector < typename cpu_kdtree::point<T, D> > reference_points(reference_count); // restore the reference points in particular way | ||
414 | + for (size_t j = 0; j < reference_count; j++) | ||
415 | + for (size_t i = 0; i < D; i++) | ||
416 | + reference_points[j].dim[i] = h_reference_points[j * D + i]; | ||
417 | + cpu_create(reference_points, max_levels); | ||
418 | + cpu_tmp_points = *tmp_points; | ||
419 | + | ||
420 | + #endif | ||
421 | + } | ||
422 | + | ||
423 | + int get_num_nodes() const { // get the total number of nodes | ||
424 | + return n_id; | ||
425 | + } | ||
426 | + | ||
427 | + cpu_kdtree::cpu_kdnode<T>* get_root() const { // get the root node of tree | ||
428 | + return root; | ||
429 | + } | ||
430 | + | ||
431 | + T cpu_distance(const cpu_kdtree::point<T, D> &a, const cpu_kdtree::point<T, D> &b) { | ||
432 | + T distance = 0; | ||
433 | + | ||
434 | + for (size_t i = 0; i < D; i++) { | ||
435 | + T d = a.dim[i] - b.dim[i]; | ||
436 | + distance += d*d; | ||
437 | + } | ||
438 | + return distance; | ||
439 | + } | ||
440 | + | ||
441 | + void cpu_search_at_node(cpu_kdtree::cpu_kdnode<T> *cur, const cpu_kdtree::point<T, D> &query, size_t *index, T *distance, cpu_kdtree::cpu_kdnode<T> **node) { | ||
442 | + T best_distance = FLT_MAX; // initialize the best distance to max of floating point | ||
443 | + size_t best_index = 0; | ||
444 | + std::vector < typename cpu_kdtree::point<T, D> > pts = cpu_tmp_points; | ||
445 | + while (true) { | ||
446 | + size_t split_axis = cur->level % D; | ||
447 | + if (cur->left == NULL) { // risky but acceptable, same goes for right because left and right are in same pace | ||
448 | + *node = cur; // pointer points to a pointer | ||
449 | + for (size_t i = 0; i < cur->indices.size(); i++) { | ||
450 | + size_t idx = cur->indices[i]; | ||
451 | + T d = cpu_distance(query, pts[idx]); // compute distances | ||
452 | + /// if we want to compute k nearest neighbor, we can input the last resul | ||
453 | + /// (last_best_dist < dist < best_dist) to select the next point until reaching to k | ||
454 | + if (d < best_distance) { | ||
455 | + best_distance = d; | ||
456 | + best_index = idx; // record the nearest neighbor index | ||
457 | + } | ||
458 | + } | ||
459 | + break; // find the target point then break the loop | ||
460 | + } | ||
461 | + else if (query.dim[split_axis] < cur->split_value) { // if it has son node, visit the next node on either left side or right side | ||
462 | + cur = cur->left; | ||
463 | + } | ||
464 | + else { | ||
465 | + cur = cur->right; | ||
466 | + } | ||
467 | + } | ||
468 | + *index = best_index; | ||
469 | + *distance = best_distance; | ||
470 | + } | ||
471 | + | ||
472 | + void cpu_search_at_node_range(cpu_kdtree::cpu_kdnode<T> *cur, const cpu_kdtree::point<T, D> &query, T range, size_t *index, T *distance) { | ||
473 | + T best_distance = FLT_MAX; // initialize the best distance to max of floating point | ||
474 | + size_t best_index = 0; | ||
475 | + std::vector < typename cpu_kdtree::point<T, D> > pts = cpu_tmp_points; | ||
476 | + std::vector < typename cpu_kdtree::cpu_kdnode<T>*> next_node; | ||
477 | + next_node.push_back(cur); | ||
478 | + while (next_node.size()) { | ||
479 | + std::vector<typename cpu_kdtree::cpu_kdnode<T>*> next_search; | ||
480 | + while (next_node.size()) { | ||
481 | + cur = next_node.back(); | ||
482 | + next_node.pop_back(); | ||
483 | + size_t split_axis = cur->level % D; | ||
484 | + if (cur->left == NULL) { | ||
485 | + for (size_t i = 0; i < cur->indices.size(); i++) { | ||
486 | + size_t idx = cur->indices[i]; | ||
487 | + T d = cpu_distance(query, pts[idx]); | ||
488 | + if (d < best_distance) { | ||
489 | + best_distance = d; | ||
490 | + best_index = idx; | ||
491 | + } | ||
492 | + } | ||
493 | + } | ||
494 | + else { | ||
495 | + T d = query.dim[split_axis] - cur->split_value; // computer distance along specific axis or dimension | ||
496 | + /// there are three possibilities: on either left or right, and on both left and right | ||
497 | + if (fabs(d) > range) { // absolute value of floating point to see if distance will be larger that best_dist | ||
498 | + if (d < 0) | ||
499 | + next_search.push_back(cur->left); // every left[split_axis] is less and equal to cur->split_value, so it is possible to find the nearest point in this region | ||
500 | + else | ||
501 | + next_search.push_back(cur->right); | ||
502 | + } | ||
503 | + else { // it is possible that nereast neighbor will appear on both left and right | ||
504 | + next_search.push_back(cur->left); | ||
505 | + next_search.push_back(cur->right); | ||
506 | + } | ||
507 | + } | ||
508 | + } | ||
509 | + next_node = next_search; // pop out at least one time | ||
510 | + } | ||
511 | + *index = best_index; | ||
512 | + *distance = best_distance; | ||
513 | + } | ||
514 | + | ||
515 | + void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) { | ||
516 | + /// first convert the input query point into specific type | ||
517 | + cpu_kdtree::point<T, D> query; | ||
518 | + for (size_t j = 0; j < query_count; j++) { | ||
519 | + for (size_t i = 0; i < D; i++) | ||
520 | + query.dim[i] = h_query_points[j * D + i]; | ||
521 | + /// find the nearest node, this will be the upper bound for the next time searching | ||
522 | + cpu_kdtree::cpu_kdnode<T> *best_node = NULL; | ||
523 | + T best_distance = FLT_MAX; | ||
524 | + size_t best_index = 0; | ||
525 | + T radius = 0; // radius for range | ||
526 | + cpu_search_at_node(root, query, &best_index, &best_distance, &best_node); // simple search to rougly determine a result for next search step | ||
527 | + radius = sqrt(best_distance); // It is possible that nearest will appear in another region | ||
528 | + /// find other possibilities | ||
529 | + cpu_kdtree::cpu_kdnode<T> *cur = best_node; | ||
530 | + while (cur->parent != NULL) { // every node that you pass will be possible to be the best node | ||
531 | + /// go up | ||
532 | + cpu_kdtree::cpu_kdnode<T> *parent = cur->parent; // travel back to every node that we pass through | ||
533 | + size_t split_axis = (parent->level) % D; | ||
534 | + /// search other nodes | ||
535 | + size_t tmp_index; | ||
536 | + T tmp_distance = FLT_MAX; | ||
537 | + if (fabs(parent->split_value - query.dim[split_axis]) <= radius) { | ||
538 | + /// search opposite node | ||
539 | + if (parent->left != cur) | ||
540 | + cpu_search_at_node_range(parent->left, query, radius, &tmp_index, &tmp_distance); // to see whether it is its mother node's left son node | ||
541 | + else | ||
542 | + cpu_search_at_node_range(parent->right, query, radius, &tmp_index, &tmp_distance); | ||
543 | + } | ||
544 | + if (tmp_distance < best_distance) { | ||
545 | + best_distance = tmp_distance; | ||
546 | + best_index = tmp_index; | ||
547 | + } | ||
548 | + cur = parent; | ||
549 | + } | ||
550 | + h_indices[j] = best_index; | ||
551 | + h_distances[j] = best_distance; | ||
552 | + } | ||
553 | + } | ||
554 | + | ||
555 | + void search(T *h_query_points, size_t query_count, size_t *indices, T *distances) { | ||
556 | + #ifdef __CUDACC__ | ||
557 | + std::vector < typename cpu_kdtree::point<T, D> > query_points(query_count); | ||
558 | + for (size_t j = 0; j < query_count; j++) | ||
559 | + for (size_t i = 0; i < D; i++) | ||
560 | + query_points[j].dim[i] = h_query_points[j * D + i]; | ||
561 | + | ||
562 | + cudaDeviceProp prop; | ||
563 | + cudaGetDeviceProperties(&prop, 0); | ||
564 | + | ||
565 | + size_t query_memory = D * sizeof(T) * query_count; | ||
566 | + size_t N = 3 * query_memory / prop.totalGlobalMem; //consider index and distance, roughly 3 times | ||
567 | + if (N > 1) { | ||
568 | + N++; | ||
569 | + size_t stream_count = query_count / N; | ||
570 | + for (size_t n = 0; n < N; n++) { | ||
571 | + size_t query_stream_start = n * stream_count; | ||
572 | + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_stream_start], stream_count, &indices[query_stream_start], &distances[query_stream_start]); | ||
573 | + } | ||
574 | + size_t stream_remain_count = query_count - N * stream_count; | ||
575 | + if (stream_remain_count > 0) { | ||
576 | + size_t query_remain_start = N * stream_count; | ||
577 | + search_stream(d_nodes, d_index, d_reference_points, &query_points[query_remain_start], stream_remain_count, &indices[query_remain_start], &distances[query_remain_start]); | ||
578 | + } | ||
579 | + } | ||
580 | + else { | ||
581 | + unsigned int threads = (unsigned int)(query_count > 1024 ? 1024 : query_count); | ||
582 | + unsigned int blocks = (unsigned int)(query_count / threads + (query_count % threads ? 1 : 0)); | ||
583 | + | ||
584 | + cpu_kdtree::point<T, D> *d_query_points; // create a pointer pointing to query points on gpu | ||
585 | + size_t *d_indices; | ||
586 | + T *d_distances; | ||
587 | + | ||
588 | + int *next_nodes; // create two STACK-like array | ||
589 | + int *next_search_nodes; | ||
590 | + | ||
591 | + int *Judge = NULL; // judge variable to see whether one thread is overwrite another thread's memory | ||
592 | + | ||
593 | + HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(T) * query_count * D)); | ||
594 | + HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_count)); | ||
595 | + HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_count)); | ||
596 | + HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int))); // STACK size right now is 50, you can change it if you mean to | ||
597 | + HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int))); | ||
598 | + HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(T) * query_count * D, cudaMemcpyHostToDevice)); | ||
599 | + | ||
600 | + search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_count, d_indices, d_distances, next_nodes, next_search_nodes, Judge); | ||
601 | + | ||
602 | + if (Judge == NULL) { // do the following work if the thread works safely | ||
603 | + HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_count, cudaMemcpyDeviceToHost)); | ||
604 | + HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_count, cudaMemcpyDeviceToHost)); | ||
605 | + } | ||
606 | + | ||
607 | + HANDLE_ERROR(cudaFree(next_nodes)); | ||
608 | + HANDLE_ERROR(cudaFree(next_search_nodes)); | ||
609 | + HANDLE_ERROR(cudaFree(d_query_points)); | ||
610 | + HANDLE_ERROR(cudaFree(d_indices)); | ||
611 | + HANDLE_ERROR(cudaFree(d_distances)); | ||
612 | + } | ||
613 | + | ||
614 | + #else | ||
615 | + cpu_search(h_query_points, query_count, indices, distances); | ||
616 | + | ||
617 | + #endif | ||
618 | + | ||
619 | + } | ||
620 | + | ||
621 | + }; //end class kdtree | ||
622 | + | ||
	// Out-of-class definition of the static member: points at the kdtree
	// instance currently being built so the static sort_points comparator can
	// reach its state (set in the kdtree constructor).
	template <typename T, int D>
	kdtree<T, D>* kdtree<T, D>::cur_tree_ptr = NULL;
625 | + | ||
626 | +} //end namespace stim | ||
627 | +#endif | ||
0 | \ No newline at end of file | 628 | \ No newline at end of file |