// cpp/update_dir3_aabb.cuh

#ifndef STIM_CUDA_UPDATE_DIR3_AABB_H
#define STIM_CUDA_UPDATE_DIR3_AABB_H
# include <iostream>
# include <cuda.h>
#include <stim/cuda/cudatools.h>
#include "cpyToshare.cuh"
#define M_PI	3.14159
#include <stim/math/circle.h>
#include <stim/math/vec3.h>
#include <stim/math/plane.h>
#include <stim/math/vector.h>
#include <stim/visualization/aabb3.h>
		// Kernel: computes the voting direction for the next iteration. One thread handles one
		// voxel (the "voter"): it scans the voxels inside its voting region, finds the one with
		// the largest vote, and writes a vector pointing from the voter to that voxel, scaled by
		// g_sph[0] (presumably the gradient magnitude returned by cart2sph -- confirm in vec3.h).
		//
		// Launch layout: 2D blocks; grid.x tiles the x axis, grid.y packs the y tiles of every
		// z-slice one after another, so it needs at least ceil(y / blockDim.y) * z blocks.
		// Surplus blocks/threads exit through the bounds check.
		//
		//   gpu_dir   output, 3 values per voxel (x * y * z * 3 elements)
		//   gpu_grad  input gradient, 3 values per voxel
		//   gpu_vote  input vote image, 1 value per voxel
		//   cos_phi   cosine of the half-angle of the voting cone
		//   rx,ry,rz  semi-axes of the ellipsoid bounding the voting region, in voxels
		//   x,y,z     volume dimensions in voxels
		template<typename T>
		__global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz,  int x,  int y, int z){

			// an empty volume has nothing to process; this also keeps grid_y below from being zero
			if(x <= 0 || y <= 0 || z <= 0) return;

			int xi = blockIdx.x * blockDim.x + threadIdx.x;		//calculate x,y,z coordinates for this thread

			// Number of blocks covering one z-slice along y. This must round UP: the host launches
			// a partial block for the tail rows, and rounding down would both drop those rows and
			// divide by zero whenever y < blockDim.y.
			int block_h = (int)blockDim.y;
			int grid_y = (y + block_h - 1) / block_h;
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * block_h + threadIdx.y;
			int zi = blockIdx.y / grid_y;
			if(xi >= x || yi >= y || zi >= z) return;

			// global 1D index of this voxel; computed in size_t because 3*i overflows int on large volumes
			size_t i = ((size_t)zi * y + yi) * x + xi;

			float rx_sq = rx * rx;				// squared semi-axes of the bounding ellipsoid
			float ry_sq = ry * ry;
			float rz_sq = rz * rz;

			stim::vec3<float> g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]);	// gradient vector of the voter
			stim::vec3<float> g_sph = g.cart2sph();		// gradient in spherical coordinates
			int n = 4;									// number of points sampled on the rim of the voting cone

			// point where the gradient direction meets the ellipsoid, relative to the voter
			float xc = rx * cos(g_sph[1]) * sin(g_sph[2]);
			float yc = ry * sin(g_sph[1]) * sin(g_sph[2]);
			float zc = rz * cos(g_sph[2]);
			float r = sqrt(xc*xc + yc*yc + zc*zc);		// distance from the voter to that point
			xc += xi;									// move the point into volume coordinates
			yc += yi;
			zc += zi;
			stim::vec3<float> center(xc,yc,zc);
			float d = 2 * r * tan(acos(cos_phi));		// diameter of the cone's cross-section at distance r
			stim::vec3<float> norm = g.norm();			// normalized gradient (axis of the cone)
			float step = 360.0f / (float) n;
			stim::circle<float> cir(center, d, norm);

			// bounding box of the voting region: the voter, the cone's end point and n rim samples,
			// clipped to the volume
			stim::aabb3<int> bb(xi,yi,zi);
			bb.insert(xc,yc,zc);
			for(float j = 0; j < 360.0f; j += step){
				stim::vec3<float> out = cir.p(j);
				bb.insert(out[0], out[1], out[2]);
			}
			bb.trim_low(0,0,0);
			bb.trim_high(x-1, y-1, z-1);

			int bx, by, bz;
			int dx, dy, dz;
			float dx_sq, dy_sq, dz_sq;
			float dist, cos_diff;
			float max_vote = 0;				// largest vote seen so far inside the voting region
			float l_vote = 0;

			// offset toward the strongest vote; falls back to the current gradient when nothing is found
			float id_x = g[0];
			float id_y = g[1];
			float id_z = g[2];

			for (bz = bb.low[2]; bz <= bb.high[2]; bz++){
				dz = bz - zi;						// offset from the voter along z
				dz_sq = dz * dz;
				for (by = bb.low[1]; by <= bb.high[1]; by++){
					dy = by - yi;					// offset from the voter along y
					dy_sq = dy * dy;
					for (bx = bb.low[0]; bx <= bb.high[0]; bx++){
						dx = bx - xi;				// offset from the voter along x
						dx_sq = dx * dx;
						dist = sqrt(dx_sq + dy_sq + dz_sq);
						// cosine of the angle between the cone axis and the offset; for the voter
						// itself dist is 0, the quotient is NaN and the comparison below rejects it
						cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz) / dist;
						// accept only voxels inside both the ellipsoid and the cone
						if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <= 1 ) && (cos_diff >= cos_phi) ){
							size_t idx_c = ((size_t)bz * y + by) * x + bx;	// 1D index of the candidate voxel
							l_vote = gpu_vote[idx_c];
							if (l_vote > max_vote) {
								max_vote = l_vote;
								id_x = dx;
								id_y = dy;
								id_z = dz;
							}
						}
					}
				}
			}

			float m_id = sqrt(id_x*id_x + id_y*id_y + id_z*id_z);
			if (m_id == 0){
				// Only reachable with a zero gradient and no vote found. Dividing by m_id would
				// write NaN, which update_grad3 would then copy into the gradient image.
				gpu_dir[i * 3 + 0] = 0;
				gpu_dir[i * 3 + 1] = 0;
				gpu_dir[i * 3 + 2] = 0;
				return;
			}
			// unit vector toward the strongest vote, scaled by g_sph[0]
			gpu_dir[i * 3 + 0] = g_sph[0] * (id_x/m_id);
			gpu_dir[i * 3 + 1] = g_sph[0] * (id_y/m_id);
			gpu_dir[i * 3 + 2] = g_sph[0] * (id_z/m_id);
		}
		// Kernel: overwrites the gradient image with the voting directions held in gpu_dir.
		// One thread copies the three components of one voxel.
		//
		// Launch layout: 2D blocks; grid.x tiles the x axis, grid.y packs the y tiles of every
		// z-slice one after another, so it needs at least ceil(y / blockDim.y) * z blocks.
		//
		//   gpu_grad  output gradient image, 3 values per voxel
		//   gpu_dir   input direction image, 3 values per voxel
		//   x,y,z     volume dimensions in voxels
		template<typename T>
		__global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){
			// an empty volume has nothing to copy; this also keeps grid_y below from being zero
			if(x <= 0 || y <= 0 || z <= 0) return;

			//calculate x,y,z coordinates for this thread
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			// Number of blocks covering one z-slice along y. Rounded UP so the partial block for
			// the tail rows is mapped correctly and the divisor is never zero when y < blockDim.y.
			int block_h = (int)blockDim.y;
			int grid_y = (y + block_h - 1) / block_h;
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * block_h + threadIdx.y;
			int zi = blockIdx.y / grid_y;
			if(xi >= x || yi >= y || zi >= z) return;

			// global 1D voxel index; size_t because i * 3 overflows int on large volumes
			size_t i = ((size_t)zi * y + yi) * x + xi;

			//update the gradient image with the new direction
			gpu_grad[i * 3 + 0] = gpu_dir[i * 3 + 0];
			gpu_grad[i * 3 + 1] = gpu_dir[i * 3 + 1];
			gpu_grad[i * 3 + 2] = gpu_dir[i * 3 + 2];
		}
		
		// Updates the gradient image on the device: computes the new voting direction of every
		// voxel into a temporary buffer (update_dir3), then copies it over gpu_grad (update_grad3).
		// Failures are reported on std::cerr; if the direction kernel cannot be launched the
		// gradient image is left untouched.
		//
		//   gpu_grad  device pointer, gradient image, 3 values per voxel (updated in place)
		//   gpu_vote  device pointer, vote image, 1 value per voxel
		//   cos_phi   cosine of the half-angle of the voting cone
		//   r         three radii {rx, ry, rz} of the voting region, in voxels
		//   x,y,z     volume dimensions in voxels
		template<typename T>
		void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
			// nothing to do for an empty volume (a zero-sized grid is an invalid launch configuration)
			if (x == 0 || y == 0 || z == 0) return;

			unsigned int max_threads = stim::maxThreadsPerBlock();
			// square block whose side is the integer part of sqrt(max_threads)
			unsigned int side = (unsigned int) sqrt((double) max_threads);
			dim3 threads(side, side);
			// grid.x tiles x; grid.y holds the y tiles of all z slices back to back (rounded up)
			dim3 blocks((x + threads.x - 1) / threads.x, ((y + threads.y - 1) / threads.y) * z);

			// allocate space on the GPU for the updated vote direction; the size is computed in
			// size_t so the product cannot wrap around in 32-bit arithmetic
			size_t dir_bytes = (size_t) x * y * z * sizeof(T) * 3;
			T* gpu_dir = 0;
			cudaError_t err = cudaMalloc(&gpu_dir, dir_bytes);
			if (err != cudaSuccess){
				std::cerr << "gpu_update_dir3: cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
				return;
			}

			//call the kernel to calculate the new voting direction
			update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z);
			err = cudaGetLastError();
			if (err != cudaSuccess){
				// gpu_dir holds no valid data, so it must not be copied into the gradient image
				std::cerr << "gpu_update_dir3: update_dir3 launch failed: " << cudaGetErrorString(err) << std::endl;
				cudaFree(gpu_dir);
				return;
			}

			//call the kernel to update the gradient direction
			update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z);
			err = cudaGetLastError();
			if (err != cudaSuccess)
				std::cerr << "gpu_update_dir3: update_grad3 launch failed: " << cudaGetErrorString(err) << std::endl;

			//free allocated memory; an error raised while the kernels were running may surface here
			err = cudaFree(gpu_dir);
			if (err != cudaSuccess)
				std::cerr << "gpu_update_dir3: " << cudaGetErrorString(err) << std::endl;
		}
		
		// Host-memory entry point: uploads the vote and gradient images, runs gpu_update_dir3 and
		// downloads the updated gradient back into cpu_grad. The first CUDA failure stops the
		// sequence and is reported on std::cerr; device buffers are released on every path.
		//
		//   cpu_grad  host pointer, gradient image, 3 values per voxel (updated in place)
		//   cpu_vote  host pointer, vote image, 1 value per voxel
		//   cos_phi   cosine of the half-angle of the voting cone
		//   r         three radii {rx, ry, rz} of the voting region, in voxels
		//   x,y,z     volume dimensions in voxels
		template<typename T>
		void cpu_update_dir3(T* cpu_grad, T* cpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
			// byte sizes are computed in size_t: a 32-bit product wraps around for large volumes
			size_t vote_bytes = (size_t) x * y * z * sizeof(T);
			size_t grad_bytes = vote_bytes * 3;

			T* gpu_vote = 0;
			T* gpu_grad = 0;

			//allocate space on the GPU for the Vote data and copy it over
			cudaError_t err = cudaMalloc(&gpu_vote, vote_bytes);
			if (err == cudaSuccess)
				err = cudaMemcpy(gpu_vote, cpu_vote, vote_bytes, cudaMemcpyHostToDevice);

			//allocate space on the GPU for the Gradient data and copy it over
			if (err == cudaSuccess)
				err = cudaMalloc(&gpu_grad, grad_bytes);
			if (err == cudaSuccess)
				err = cudaMemcpy(gpu_grad, cpu_grad, grad_bytes, cudaMemcpyHostToDevice);

			if (err == cudaSuccess){
				//call the GPU version of the update direction function
				gpu_update_dir3<T>(gpu_grad, gpu_vote, cos_phi, r, x , y, z);

				//copy the new gradient image back to the CPU
				err = cudaMemcpy(cpu_grad, gpu_grad, grad_bytes, cudaMemcpyDeviceToHost);
			}

			if (err != cudaSuccess)
				std::cerr << "cpu_update_dir3: " << cudaGetErrorString(err) << std::endl;

			//free allocated memory (cudaFree on a null pointer performs no operation)
			cudaFree(gpu_vote);
			cudaFree(gpu_grad);
		}
		
#endif