#ifndef STIM_CUDA_VOTE3_H #define STIM_CUDA_VOTE3_H #include #include #include #include #include #include "cpyToshare.cuh" // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area template __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ extern __shared__ float s[]; //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; //find the grid size along y int grid_y = y / blockDim.y; int blockidx_y = blockIdx.y % grid_y; int yi = blockidx_y * blockDim.y + threadIdx.y; int zi = blockIdx.y / grid_y; int i = zi * x * y + yi * x + xi; if(xi>=x || yi>=y || zi>=z) return; //find the starting points and the size of the window, which will be copied to the 2D-shared memory int bxs = blockIdx.x * blockDim.x - rx; int bys = blockidx_y * blockDim.y - ry; int xwidth = 2 * rx + blockDim.x; int ywidth = 2 * ry + blockDim.y; //calculate the starting point of shared memory for storing the magnitude. unsigned int b_s = 3 * xwidth * ywidth; //compute the coordinations of this pixel in the 2D-shared memory. int sx_rx = threadIdx.x + rx; int sy_ry = threadIdx.y + ry; // define a local variable to sum the votes from the voters float sum = 0; int rx_sq = rx * rx; int ry_sq = ry * ry; int rz_sq = rz * rz; for (int z_v = -rz; z_v<=rz; z_v++){ int zi_v = zi + z_v; if ((zi_v) >=0 && (zi_v) (s, gpu_grad, bxs, bys, zi + z_v, 3*xwidth, ywidth, threadIdx, blockDim, x, y); __syncthreads(); mag_share2D(s, b_s, xwidth, ywidth, threadIdx, blockDim); __syncthreads(); float z_sq = z_v * z_v; float d_z_sq = z_sq/rz_sq; for(int y_v = -ry; y_v <= ry; y_v++){ int yi_v = (yi + y_v) ; //compute the position of the current voter in the shared memory along the y axis. unsigned int sIdx_y1d = (sy_ry + y_v)* xwidth; float y_sq = y_v * y_v; float yz_sq = z_sq + y_sq; float d_yz_sq = y_sq/ry_sq + d_z_sq; for(int x_v = -rx; x_v <= rx; x_v++){ //check if the current voter is inside of the data-set int xi_v = (xi + x_v) ; if (yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){ //compute the position of the current voter in the 2D-shared memory along the x axis. unsigned int sIdx_x = (sx_rx + x_v); //find the 1D index of this voter in the 2D-shared memory. unsigned int s_Idx = (sIdx_y1d + sIdx_x); unsigned int s_Idx3 = s_Idx * 3; //save the gradient values for the current voter to the local variables and compute the gradient magnitude. float g_v_x = s[s_Idx3]; float g_v_y = s[s_Idx3 + 1]; float g_v_z = s[s_Idx3 + 2]; float mag_v = s[b_s + s_Idx]; //sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); //calculate the distance between the pixel and the current voter. float x_sq = x_v * x_v; float d_pv = sqrt(x_sq + yz_sq); // calculate the angle between the pixel and the current voter. float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v); // check if the current voter is located in the voting area of this pixel. if ((((x_sq)/rx_sq + d_yz_sq)<= 1) && (cos_diff >= cos_phi)){ sum += mag_v; } } } } } } gpu_vote[i] = sum; } template void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(sqrt (max_threads),sqrt (max_threads)); dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T); //call the kernel to do the voting vote3 <<< blocks, threads, shared_bytes >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); } template void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ //calculate the number of bytes in the array unsigned int bytes = x * y * z * sizeof(T); //allocate space on the GPU for the Vote Image T* gpu_vote; cudaMalloc(&gpu_vote, bytes); //allocate space on the GPU for the input Gradient image T* gpu_grad; cudaMalloc(&gpu_grad, bytes*3); //copy the Gradient data to the GPU cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); //call the GPU version of the vote calculation function gpu_vote3(gpu_vote, gpu_grad, cos_phi, r, x , y, z); //copy the Vote Data back to the CPU cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ; //free allocated memory cudaFree(gpu_vote); cudaFree(gpu_grad); } #endif