#ifndef STIM_CUDA_VOTE3_ATOMIC_AABB_H #define STIM_CUDA_VOTE3_ATOMIC_AABB_H #include #include #include #include #include "cpyToshare.cuh" #define M_PI 3.14159 #include #include #include #include #include // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area template __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread int grid_y = y / blockDim.y; //find the grid size along y int blockidx_y = blockIdx.y % grid_y; int yi = blockidx_y * blockDim.y + threadIdx.y; int zi = blockIdx.y / grid_y; if(xi>=x || yi>=y || zi>=z) return; int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter float rx_sq = rx * rx; // compute the square for rmax float ry_sq = ry * ry; float rz_sq = rz * rz; float dist, cos_diff; int idx_c; //float rmax = sqrt(rx_sq + ry_sq + rz_sq); stim::vec3 g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]); // form a vec3 variable for the gradient vector stim::vec3 g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector int n =4; //set the number of points to find the boundaries of the conical voting area float xc = rx * cos(g_sph[1]) * sin(g_sph[2]); //calculate the center point of the surface of the voting area for the voter float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ; float zc = rz * cos(g_sph[2]) ; float r = sqrt(xc*xc + yc*yc + zc*zc); xc+=xi; yc+=yi; zc+=zi; stim::vec3 center(xc,yc,zc); float d = 2 * r * tan(acos(cos_phi) ); //find the diameter of the conical voting area stim::vec3 norm = g.norm(); //compute the normalize gradient vector float step = 360.0/(float) n; stim::circle cir(center, d, norm); stim::aabb3 bb(xi,yi,zi); bb.insert(xc,yc,zc); for(float j = 0; j <360.0; j += step){ stim::vec3 out = cir.p(j); bb.insert(out[0], out[1], out[2]); } bb.trim_low(0,0,0); bb.trim_high(x-1, y-1, z-1); int bx,by,bz; int dx, dy, dz; float dx_sq, dy_sq, dz_sq; for (bz=bb.low[2]; bz<=bb.high[2]; bz++){ dz = bz - zi; //compute the distance bw the voter and the current counter along z axis dz_sq = dz * dz; for (by=bb.low[1]; by<=bb.high[1]; by++){ dy = by - yi; //compute the distance bw the voter and the current counter along y axis dy_sq = dy * dy; for (bx=bb.low[0]; bx<=bb.high[0]; bx++){ dx = bx - xi; //compute the distance bw the voter and the current counter along x axis dx_sq = dx * dx; dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz)/dist; // calculate the cosine of angle between the voter and the current counter if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){ //check if the current counter located in the voting area of the voter idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter atomicAdd (&gpu_vote[idx_c] , g_sph[0]); } } } } } template void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(sqrt (max_threads),sqrt (max_threads)); dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); vote3 <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); //call the kernel to do the voting } template void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ //calculate the number of bytes in the array unsigned int bytes = x * y * z * sizeof(T); //allocate space on the GPU for the Vote Image T* gpu_vote; cudaMalloc(&gpu_vote, bytes); //allocate space on the GPU for the input Gradient image T* gpu_grad; cudaMalloc(&gpu_grad, bytes*3); //copy the Gradient data to the GPU cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); //call the GPU version of the vote calculation function gpu_vote3(gpu_vote, gpu_grad, cos_phi, r, x , y, z); //copy the Vote Data back to the CPU cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ; //free allocated memory cudaFree(gpu_vote); cudaFree(gpu_grad); } #endif