#ifndef STIM_CUDA_UPDATE_DIR3_H #define STIM_CUDA_UPDATE_DIR3_H # include # include #include #include #include // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. template __global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; //find the grid size along y int grid_y = y / blockDim.y; int blockidx_y = blockIdx.y % grid_y; int yi = blockidx_y * blockDim.y + threadIdx.y; int zi = blockIdx.y / grid_y; int i = zi * x * y + yi * x + xi; if(xi >= x|| yi >= y || zi>= z) return; //find the gradient values along the x, y ,z axis, and the gradient magnitude for the voter float g_v_x = gpu_grad[i * 3 + 0]; float g_v_y = gpu_grad[i * 3 + 1]; float g_v_z = gpu_grad[i * 3 + 2]; float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); // define a local variable to maximum value of the vote image in the voting area for this voter float max = 0; float l_vote = 0; // define local variables for the x, y, and z coordinations point to the vote direction float id_x = g_v_x; float id_y = g_v_y; float id_z = g_v_z; int rx_sq = rx * rx; int ry_sq = ry * ry; int rz_sq = rz * rz; for (int z_p = -rz; z_p<=rz; z_p++){ for(int y_p = -ry; y_p <= ry; y_p++){ for(int x_p = -rx; x_p <= rx; x_p++){ //calculate the x, y ,z indices for the current pixel int xi_p = (xi + x_p) ; int yi_p = (yi + y_p) ; int zi_p = (zi + z_p) ; if (zi_p >=0 && zi_p < z && yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){ //calculate the distance between the pixel and the current voter. float x_sq = x_p * x_p; float y_sq = y_p * y_p; float z_sq = z_p * z_p; float d_pv = sqrt(x_sq + y_sq + z_sq); // calculate the angle between the pixel and the current voter. float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v); if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ //calculate the 1D index for the current pixel unsigned int id_p = (zi_p) * x * y + (yi_p) * x + (xi_p); l_vote = gpu_vote[id_p]; // compare the vote value of this pixel with the max value to find the maxima and its index. if (l_vote>max) { max = l_vote; id_x = x_p; id_y = y_p; id_z = z_p; } } } } } } float m_id = sqrt (id_x*id_x + id_y*id_y + id_z*id_z); gpu_dir[i * 3 + 0] = mag_v * (id_x/m_id); gpu_dir[i * 3 + 1] = mag_v * (id_y/m_id); gpu_dir[i * 3 + 2] = mag_v * (id_z/m_id); } // this kernel updates the gradient direction by the calculated voting direction. template __global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){ //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; //find the grid size along y int grid_y = y / blockDim.y; int blockidx_y = blockIdx.y % grid_y; int yi = blockidx_y * blockDim.y + threadIdx.y; int zi = blockIdx.y / grid_y; int i = zi * x * y + yi * x + xi; if(xi >= x || yi >= y || zi >= z) return; //update the gradient image with the new direction direction gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0]; gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1]; gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2]; } template void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(sqrt (max_threads),sqrt (max_threads)); dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); // allocate space on the GPU for the updated vote direction T* gpu_dir; cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3); //call the kernel to calculate the new voting direction update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z); //call the kernel to update the gradient direction update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z); //free allocated memory cudaFree(gpu_dir); } template void cpu_update_dir3(T* cpu_grad, T* cpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ //calculate the number of bytes in the array unsigned int bytes = x * y * z * sizeof(T); //allocate space on the GPU for the Vote data T* gpu_vote; cudaMalloc(&gpu_vote, bytes); //copy the input vote data to the GPU cudaMemcpy(gpu_vote, cpu_vote, bytes, cudaMemcpyHostToDevice); //allocate space on the GPU for the Gradient data T* gpu_grad; cudaMalloc(&gpu_grad, bytes*3); //copy the Gradient data to the GPU cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); //call the GPU version of the update direction function gpu_update_dir3(gpu_grad, gpu_vote, cos_phi, r, x , y, z); //copy the new gradient image back to the CPU cudaMemcpy(cpu_grad, gpu_grad, bytes*3, cudaMemcpyDeviceToHost) ; //free allocated memory cudaFree(gpu_vote); cudaFree(gpu_grad); } #endif