vote3_atomic_aabb.cuh 4.84 KB
#ifndef STIM_CUDA_VOTE3_ATOMIC_AABB_H
#define STIM_CUDA_VOTE3_ATOMIC_AABB_H

#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/cudatools/error.h>
#include "cpyToshare.cuh"
#define M_PI	3.14159
#include <stim/math/circle.h>
#include <stim/math/vec3.h>
#include <stim/math/plane.h>
#include <stim/math/vector.h>
#include <stim/visualization/aabb3.h>

// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
template<typename T>
__global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z) {

	int xi = blockIdx.x * blockDim.x + threadIdx.x;						//calculate x,y,z coordinates for this thread

	int grid_y = y / blockDim.y;										//find the grid size along y
	int blockidx_y = blockIdx.y % grid_y;
	int yi = blockidx_y * blockDim.y + threadIdx.y;
	int zi = blockIdx.y / grid_y;

	if (xi >= x || yi >= y || zi >= z) return;

	int i = zi * x * y + yi * x + xi;	  // calculate the 1D index of the voter


	float rx_sq = rx * rx;        // compute the square for rmax 
	float ry_sq = ry * ry;
	float rz_sq = rz * rz;

	stim::vec3<float> g(gpu_grad[3 * i], gpu_grad[3 * i + 1], gpu_grad[3 * i + 2]);   // form a vec3 variable for the gradient vector
	stim::vec3<float> g_sph = g.cart2sph();			//convert cartesian coordinate to spherical for the gradient vector
	float n = 8;										//set the number of points to find the boundaries of the conical voting area
	float xc = rx * cos(g_sph[1]) * sin(g_sph[2]);			//calculate the center point of the surface of the voting area for the voter
	float yc = ry * sin(g_sph[1]) * sin(g_sph[2]);
	float zc = rz * cos(g_sph[2]);
	float r = sqrt(xc*xc + yc*yc + zc*zc);
	xc += xi;
	yc += yi;
	zc += zi;
	stim::vec3<float> center(xc, yc, zc);

	float d = 2 * r * tan(phi);		//find the diameter of the conical voting area
	stim::vec3<float> norm = g.norm();			//compute the normalize gradient vector
	float step = 360.0 / n;
	stim::circle<float>  cir(center, d, norm);
	stim::aabb3<int> bb(xi, yi, zi);
	bb.insert((int)xc, (int)yc, (int)zc);
	for (float j = 0; j <360.0; j += step) {
		stim::vec3<float> out = cir.p(j);
		bb.insert(out[0], out[1], out[2]);
	}
	bb.trim_low(xi - rx, yi - ry, zi - rz);
	bb.trim_low(0, 0, 0);
	bb.trim_high(xi + rx, yi + ry, zi + rz);
	bb.trim_high(x - 1, y - 1, z - 1);
	int bx, by, bz;
	int dx, dy, dz;
	float dx_sq, dy_sq, dz_sq;
	float dist, cos_diff;
	int idx_c;
	for (bz = bb.low[2]; bz <= bb.high[2]; bz++) {
		dz = bz - zi;							//compute the distance bw the voter and the current counter along z axis
		dz_sq = dz * dz;
		for (by = bb.low[1]; by <= bb.high[1]; by++) {
			dy = by - yi;								//compute the distance bw the voter and the current counter along y axis
			dy_sq = dy * dy;
			for (bx = bb.low[0]; bx <= bb.high[0]; bx++) {
				dx = bx - xi;								//compute the distance bw the voter and the current counter along x axis
				dx_sq = dx * dx;

				dist = sqrt(dx_sq + dy_sq + dz_sq);			//calculate the distance between the voter and the current counter
				cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz) / dist;			 // calculate the cosine of angle between the voter and the current counter
				if (((dx_sq / rx_sq + dy_sq / ry_sq + dz_sq / rz_sq) <= 1) && (cos_diff >= cos_phi)) {			//check if the current counter located in the voting area of the voter    
					idx_c = (bz* y + by) * x + bx;			//calculate the 1D index for the current counter
					atomicAdd(&gpu_vote[idx_c], g_sph[0]);
				}
			}
		}
	}
}

template<typename T>
void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z) {


	unsigned int max_threads = stim::maxThreadsPerBlock();
	dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads));
	dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
	vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x, (int)y, (int)z);			//call the kernel to do the voting

}


template<typename T>
void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z) {

	//calculate the number of bytes in the array
	unsigned int bytes = x * y * z * sizeof(T);


	//allocate space on the GPU for the Vote Image
	T* gpu_vote;
	cudaMalloc(&gpu_vote, bytes);

	//allocate space on the GPU for the input Gradient image
	T* gpu_grad;
	cudaMalloc(&gpu_grad, bytes * 3);


	//copy the Gradient data to the GPU
	cudaMemcpy(gpu_grad, cpu_grad, bytes * 3, cudaMemcpyHostToDevice);


	//call the GPU version of the vote calculation function
	gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x, y, z);

	//copy the Vote Data back to the CPU
	cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost);

	//free allocated memory
	cudaFree(gpu_vote);
	cudaFree(gpu_grad);

}


#endif