#ifndef STIM_CUDA_VOTE3_ATOMIC_H #define STIM_CUDA_VOTE3_ATOMIC_H #include #include #include #include #include "cpyToshare.cuh" // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area template __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread int grid_y = y / blockDim.y; //find the grid size along y int blockidx_y = blockIdx.y % grid_y; int yi = blockidx_y * blockDim.y + threadIdx.y; int zi = blockIdx.y / grid_y; if(xi>=x || yi>=y || zi>=z) return; int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter float gx_v = gpu_grad[3*i]; // find the gradient information in cartesian coordinate for the voter - global memory fetch float gy_v = gpu_grad[3*i+1]; float gz_v = gpu_grad[3*i+2]; float mag_v = sqrt(gx_v*gx_v + gy_v*gy_v + gz_v*gz_v); // compute the gradient magnitude for the voter //float gx_v_n = gx_v/mag_v; // normalize the gradient vector for the voter //float gy_v_n = gy_v/mag_v; //float gz_v_n = gz_v/mag_v; float rx_sq = rx * rx; // compute the square for rmax float ry_sq = ry * ry; float rz_sq = rz * rz; float x_sq, y_sq, z_sq, dist, cos_diff; int xi_c, yi_c, zi_c, idx_c; for (int z_c=-rz; z_c<=rz; z_c++){ zi_c = zi + z_c; // calculate the z position for the current counter if (zi_c =0){ // make sure the current counter is inside the image z_sq = z_c * z_c; for (int y_c=-ry; y_c<=ry; y_c++){ yi_c = yi + y_c; if (yi_c < y && yi_c>=0){ y_sq = y_c * y_c; for (int x_c=-rx; x_c<=rx; x_c++){ xi_c = xi + x_c; if (xi_c < x && xi_c>=0){ x_sq = x_c * x_c; dist = sqrt(x_sq + y_sq + z_sq); //calculate the distance between the voter and the current counter cos_diff = (gx_v * x_c + gy_v * y_c + gz_v * z_c)/(dist * mag_v); // calculate the cosine of angle between the voter and the current counter if ( ( (x_sq/rx_sq + y_sq/ry_sq + z_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){ idx_c = (zi_c * y + yi_c) * x + xi_c; //calculate the 1D index for the current counter atomicAdd (&gpu_vote[idx_c] , mag_v); } } } } } } } } template void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(sqrt (max_threads),sqrt (max_threads)); dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); //unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T); //call the kernel to do the voting vote3 <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); } template void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ //calculate the number of bytes in the array unsigned int bytes = x * y * z * sizeof(T); //allocate space on the GPU for the Vote Image T* gpu_vote; cudaMalloc(&gpu_vote, bytes); //allocate space on the GPU for the input Gradient image T* gpu_grad; cudaMalloc(&gpu_grad, bytes*3); //copy the Gradient data to the GPU cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); //call the GPU version of the vote calculation function gpu_vote3(gpu_vote, gpu_grad, cos_phi, r, x , y, z); //copy the Vote Data back to the CPU cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ; //free allocated memory cudaFree(gpu_vote); cudaFree(gpu_grad); } #endif