#ifndef STIM_CUDA_VOTE3_H #define STIM_CUDA_VOTE3_H #include #include #include #include #include // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area template __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; //find the grid size along y int grid_y = y / blockDim.y; int blockidx_y = blockIdx.y % grid_y; int yi = blockidx_y * blockDim.y + threadIdx.y; int zi = blockIdx.y / grid_y; int i = zi * x * y + yi * x + xi; if(xi>=x || yi>=y || zi>=z) return; // define a local variable to sum the votes from the voters float sum = 0; int rx_sq = rx * rx; int ry_sq = ry * ry; int rz_sq = rz * rz; for (int z_v = -rz; z_v<=rz; z_v++){ for(int y_v = -ry; y_v <= ry; y_v++){ for(int x_v = -rx; x_v <= rx; x_v++){ //calculate the x, y ,z indices for the current voter int xi_v = (xi + x_v) ; int yi_v = (yi + y_v) ; int zi_v = (zi + z_v) ; if (zi_v >=0 && zi_v < z && yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){ //calculate the 1D index for the current voter unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v); //find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter float g_v_x = gpu_grad[id_v * 3 + 0]; float g_v_y = gpu_grad[id_v * 3 + 1]; float g_v_z = gpu_grad[id_v * 3 + 2]; float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); //calculate the distance between the pixel and the current voter. float x_sq = x_v * x_v; float y_sq = y_v * y_v; float z_sq = z_v * z_v; float d_pv = sqrt(x_sq + y_sq + z_sq); // calculate the angle between the pixel and the current voter. float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v); // check if the current voter is located in the voting area of this pixel. if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ sum += mag_v; } } } } } gpu_vote[i] = sum; } template void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(sqrt (max_threads),sqrt (max_threads)); dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); //call the kernel to do the voting vote3 <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); } template void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ //calculate the number of bytes in the array unsigned int bytes = x * y * z * sizeof(T); //allocate space on the GPU for the Vote Image T* gpu_vote; cudaMalloc(&gpu_vote, bytes); //allocate space on the GPU for the input Gradient image T* gpu_grad; cudaMalloc(&gpu_grad, bytes*3); //copy the Gradient data to the GPU cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); //call the GPU version of the vote calculation function gpu_vote3(gpu_vote, gpu_grad, cos_phi, r, x , y, z); //copy the Vote Data back to the CPU cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ; //free allocated memory cudaFree(gpu_vote); cudaFree(gpu_grad); } #endif