cpp/vote3_atomic_aabb.cuh

#ifndef STIM_CUDA_VOTE3_ATOMIC_AABB_H
#define STIM_CUDA_VOTE3_ATOMIC_AABB_H
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/cudatools/error.h>
#include "cpyToshare.cuh"
#define M_PI	3.14159
#include <stim/math/circle.h>
#include <stim/math/vec3.h>
#include <stim/math/plane.h>
#include <stim/math/vector.h>
#include <stim/visualization/aabb3.h>
		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
		template<typename T>
		__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
			int xi = blockIdx.x * blockDim.x + threadIdx.x;						//calculate x,y,z coordinates for this thread
			
			int grid_y = y / blockDim.y;										//find the grid size along y
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;
						
			if(xi>=x || yi>=y || zi>=z) return;
			int i = zi * x * y + yi * x + xi;	  // calculate the 1D index of the voter
			
			float rx_sq = rx * rx;        // compute the square for rmax 
			float ry_sq = ry * ry;
			float rz_sq = rz * rz;
			float dist, cos_diff;
			int idx_c;
			//float rmax = sqrt(rx_sq + ry_sq + rz_sq);
			stim::vec3<float> g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]);   // form a vec3 variable for the gradient vector
			stim::vec3<float> g_sph = g.cart2sph();			//convert cartesian coordinate to spherical for the gradient vector
			int n =4;										//set the number of points to find the boundaries of the conical voting area
			float xc = rx * cos(g_sph[1]) * sin(g_sph[2]);			//calculate the center point of the surface of the voting area for the voter
			float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ;
			float zc = rz * cos(g_sph[2]) ;
			float r = sqrt(xc*xc + yc*yc + zc*zc);
			xc+=xi;
			yc+=yi;
			zc+=zi;
			stim::vec3<float> center(xc,yc,zc);
			
			float d = 2 * r * tan(acos(cos_phi) );		//find the diameter of the conical voting area
			stim::vec3<float> norm = g.norm();			//compute the normalize gradient vector
			float step = 360.0/(float) n;
			stim::circle<float>  cir(center, d, norm);
			stim::aabb3<int> bb(xi,yi,zi);
			bb.insert(xc,yc,zc);
			for(float j = 0; j <360.0; j += step){
				stim::vec3<float> out = cir.p(j);
				bb.insert(out[0], out[1], out[2]);
			}
			bb.trim_low(0,0,0);
			bb.trim_high(x-1, y-1, z-1);
			int bx,by,bz;
			int dx, dy, dz;
			float dx_sq, dy_sq, dz_sq;
			for (bz=bb.low[2]; bz<=bb.high[2]; bz++){
				dz = bz - zi;							//compute the distance bw the voter and the current counter along z axis
				dz_sq = dz * dz;
				for (by=bb.low[1]; by<=bb.high[1]; by++){
					dy = by - yi;								//compute the distance bw the voter and the current counter along y axis
					dy_sq = dy * dy;
					for (bx=bb.low[0]; bx<=bb.high[0]; bx++){
						dx = bx - xi;								//compute the distance bw the voter and the current counter along x axis
						dx_sq = dx * dx;
						dist = sqrt(dx_sq + dy_sq + dz_sq);			//calculate the distance between the voter and the current counter
						cos_diff = (norm[0] * dx + norm[1] * dy +  norm[2] * dz)/dist;			 // calculate the cosine of angle between the voter and the current counter
						if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){			//check if the current counter located in the voting area of the voter    
							idx_c = (bz* y + by) * x + bx;			//calculate the 1D index for the current counter
							atomicAdd (&gpu_vote[idx_c] , g_sph[0]);
						}								
					}						
				}				
			}	
		}
		template<typename T>
		void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
			
			unsigned int max_threads = stim::maxThreadsPerBlock();
			dim3 threads(sqrt (max_threads),sqrt (max_threads));
			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);			//call the kernel to do the voting
		}
		template<typename T>
		void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
			//calculate the number of bytes in the array
			unsigned int bytes = x * y * z * sizeof(T);
			
			//allocate space on the GPU for the Vote Image
			T* gpu_vote;
			cudaMalloc(&gpu_vote, bytes);		
			//allocate space on the GPU for the input Gradient image
			T* gpu_grad;
			cudaMalloc(&gpu_grad, bytes*3);
			
			
			//copy the Gradient data to the GPU
			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
			
					
			//call the GPU version of the vote calculation function
			gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
							
			//copy the Vote Data back to the CPU
			cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
			//free allocated memory
			cudaFree(gpu_vote);
			cudaFree(gpu_grad);
			
		}
#endif