#ifndef STIM_CUDA_VOTE3_ATOMIC_AABB_H #define STIM_CUDA_VOTE3_ATOMIC_AABB_H #include #include #include #include #include "cpyToshare.cuh" #define M_PI 3.14159 #include #include #include #include #include // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area template __global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z) { int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread int grid_y = y / blockDim.y; //find the grid size along y int blockidx_y = blockIdx.y % grid_y; int yi = blockidx_y * blockDim.y + threadIdx.y; int zi = blockIdx.y / grid_y; if (xi >= x || yi >= y || zi >= z) return; int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter float rx_sq = rx * rx; // compute the square for rmax float ry_sq = ry * ry; float rz_sq = rz * rz; stim::vec3 g(gpu_grad[3 * i], gpu_grad[3 * i + 1], gpu_grad[3 * i + 2]); // form a vec3 variable for the gradient vector stim::vec3 g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector float n = 8; //set the number of points to find the boundaries of the conical voting area float xc = rx * cos(g_sph[1]) * sin(g_sph[2]); //calculate the center point of the surface of the voting area for the voter float yc = ry * sin(g_sph[1]) * sin(g_sph[2]); float zc = rz * cos(g_sph[2]); float r = sqrt(xc*xc + yc*yc + zc*zc); xc += xi; yc += yi; zc += zi; stim::vec3 center(xc, yc, zc); float d = 2 * r * tan(phi); //find the diameter of the conical voting area stim::vec3 norm = g.norm(); //compute the normalize gradient vector float step = 360.0 / n; stim::circle cir(center, d, norm); stim::aabb3 bb(xi, yi, zi); bb.insert((int)xc, (int)yc, (int)zc); for (float j = 0; j <360.0; j += step) { stim::vec3 out = cir.p(j); bb.insert(out[0], out[1], out[2]); } bb.trim_low(xi - rx, yi - ry, zi - rz); bb.trim_low(0, 0, 0); bb.trim_high(xi + rx, yi + ry, zi + rz); bb.trim_high(x - 1, y - 1, z - 1); int bx, by, bz; int dx, dy, dz; float dx_sq, dy_sq, dz_sq; float dist, cos_diff; int idx_c; for (bz = bb.low[2]; bz <= bb.high[2]; bz++) { dz = bz - zi; //compute the distance bw the voter and the current counter along z axis dz_sq = dz * dz; for (by = bb.low[1]; by <= bb.high[1]; by++) { dy = by - yi; //compute the distance bw the voter and the current counter along y axis dy_sq = dy * dy; for (bx = bb.low[0]; bx <= bb.high[0]; bx++) { dx = bx - xi; //compute the distance bw the voter and the current counter along x axis dx_sq = dx * dx; dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz) / dist; // calculate the cosine of angle between the voter and the current counter if (((dx_sq / rx_sq + dy_sq / ry_sq + dz_sq / rz_sq) <= 1) && (cos_diff >= cos_phi)) { //check if the current counter located in the voting area of the voter idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter atomicAdd(&gpu_vote[idx_c], g_sph[0]); } } } } } template void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z) { unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads)); dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); vote3 << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x, (int)y, (int)z); //call the kernel to do the voting } template void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z) { //calculate the number of bytes in the array unsigned int bytes = x * y * z * sizeof(T); //allocate space on the GPU for the Vote Image T* gpu_vote; cudaMalloc(&gpu_vote, bytes); //allocate space on the GPU for the input Gradient image T* gpu_grad; cudaMalloc(&gpu_grad, bytes * 3); //copy the Gradient data to the GPU cudaMemcpy(gpu_grad, cpu_grad, bytes * 3, cudaMemcpyHostToDevice); //call the GPU version of the vote calculation function gpu_vote3(gpu_vote, gpu_grad, cos_phi, r, x, y, z); //copy the Vote Data back to the CPU cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost); //free allocated memory cudaFree(gpu_vote); cudaFree(gpu_grad); } #endif