#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H #define STIM_CUDA_VOTE_ATOMIC_BB_H # include # include #include #include #include #include #include namespace stim{ namespace cuda{ // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area template __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ extern __shared__ T S[]; T* shared_atan = S; size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1); stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); // calculate the 2D coordinates for this current thread. int xi = blockIdx.x * blockDim.x + threadIdx.x; int yi = blockIdx.y * blockDim.y + threadIdx.y; if(xi >= x || yi >= y) return; // convert 2D coordinates to 1D int i = yi * x + xi; // calculate the voting direction based on the grtadient direction float theta = gpuGrad[2*i]; //calculate the amount of vote for the voter float mag = gpuGrad[2*i + 1]; stim::aabb2 bb(xi, yi); //initialize a bounding box at the current point bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta))); bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box // compute the size of window which will be checked for finding the proper voters for this pixel int x_table = 2*rmax +1; int rmax_sq = rmax * rmax; int lut_i; T dx_sq, dy_sq; bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image bb.trim_high(x-1, y-1); int by, bx; int dx, dy; unsigned int ind_g; //initialize the maximum vote value to zero T alpha; for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box dy = by - yi; //calculate the y coordinate of the current point relative to yi dy_sq = dy * dy; for(bx = bb.low[0]; bx <= bb.high[0]; bx++){ dx = bx - xi; dx_sq = dx * dx; lut_i = (rmax - dy) * x_table + rmax - dx; alpha = shared_atan[lut_i]; if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ ind_g = (by)*x + (bx); atomicAdd(&gpuVote[ind_g], mag); } } } } template void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads( sqrt(max_threads), sqrt(max_threads) ); dim3 blocks(x/threads.x + 1, y/threads.y + 1); size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); size_t shared_mem_req = table_bytes;// + template_bytes; std::cout<<"Shared Memory required: "< shared_mem){ std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); } template void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ //calculate the number of bytes in the array unsigned int bytes = x * y * sizeof(T); //calculate the number of bytes in the atan2 table unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); //allocate space on the GPU for the Vote Image T* gpuVote; cudaMalloc(&gpuVote, bytes); //allocate space on the GPU for the input Gradient image T* gpuGrad; HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); //copy the Gradient Magnitude data to the GPU HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); //allocate space on the GPU for the atan2 table T* gpuTable; HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); //copy the atan2 values to the GPU HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); //call the GPU version of the vote calculation function gpu_vote(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); //copy the Vote Data back to the CPU cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; //free allocated memory cudaFree(gpuTable); cudaFree(gpuVote); cudaFree(gpuGrad); } } } #endif