#ifndef STIM_CUDA_UPDATE_DIR_BB_H #define STIM_CUDA_UPDATE_DIR_BB_H # include # include #include #include #include #include #include //#define RMAX_TEST 8 namespace stim{ namespace cuda{ template __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ extern __shared__ T S[]; T* shared_atan = S; size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1); stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); //T* shared_vote = &S[n_table]; //size_t template_size_x = (blockDim.x + 2 * rmax); //size_t template_size_y = (blockDim.y + 2 * rmax); //stim::cuda::threadedMemcpy2D((char*)shared_vote, (char*)gpuVote, template_size_x, template_size_y, x, threadIdx.y * blockDim.x + threadIdx.x, blockDim.x * blockDim.y); int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate the 2D coordinates for this current thread. int yi = blockIdx.y * blockDim.y + threadIdx.y; if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel int i = yi * x + xi; //convert 2D coordinates to 1D float theta = gpuGrad[2*i]; //calculate the voting direction based on the grtadient direction - global memory fetch stim::aabb2 bb(xi, yi); //initialize a bounding box at the current point bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta))); bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box int x_table = 2*rmax +1; int lut_i; T rmax_sq = rmax * rmax; T dx_sq, dy_sq; bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image bb.trim_high(x-1, y-1); int by, bx; int dx, dy; //coordinate relative to (xi, yi) T v; T max_v = 0; //initialize the maximum vote value to zero T alpha; int max_dx = bb.low[0]; int max_dy = bb.low[1]; for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box dy = by - yi; //calculate the y coordinate of the current point relative to yi dy_sq = dy * dy; for(bx = bb.low[0]; bx <= bb.high[0]; bx++){ dx = bx - xi; dx_sq = dx * dx; lut_i = (rmax - dy) * x_table + rmax - dx; alpha = shared_atan[lut_i]; if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ v = gpuVote[by * x + bx]; // find the vote value for the current counter if(v > max_v){ max_v = v; max_dx = dx; max_dy = dy; } } } } gpuDir[i] = atan2((T)max_dy, (T)max_dx); } // this kernel updates the gradient direction by the calculated voting direction. template __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ // calculate the 2D coordinates for this current thread. int xi = blockIdx.x * blockDim.x + threadIdx.x; int yi = blockIdx.y * blockDim.y + threadIdx.y; if(xi >= x || yi >= y) return; // convert 2D coordinates to 1D int i = yi * x + xi; //update the gradient image with the vote direction gpuGrad[2*i] = gpuDir[i]; } template void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ //calculate the number of bytes in the array unsigned int bytes = x * y * sizeof(T); // allocate space on the GPU for the updated vote direction T* gpuDir; HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) ); unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads( sqrt(max_threads), sqrt(max_threads) ); dim3 blocks(x/threads.x + 1, y/threads.y + 1); size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); //size_t curtain = 2 * rmax; //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain); size_t shared_mem_req = table_bytes;// + template_bytes; std::cout<<"Shared Memory required: "< shared_mem){ std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); //stim::gpu2image(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer); //exit(0); //threads = dim3( sqrt(max_threads), sqrt(max_threads) ); //blocks = dim3(x/threads.x + 1, y/threads.y + 1); //call the kernel to update the gradient direction cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); //free allocated memory HANDLE_ERROR( cudaFree(gpuDir) ); } template void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ //calculate the number of bytes in the array unsigned int bytes = x * y * sizeof(T); //calculate the number of bytes in the atan2 table unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); //allocate space on the GPU for the Vote Image T* gpuVote; cudaMalloc(&gpuVote, bytes); //copy the input vote image to the GPU HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); //allocate space on the GPU for the input Gradient image T* gpuGrad; HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); //copy the Gradient data to the GPU HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); //allocate space on the GPU for the atan2 table T* gpuTable; HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); //copy the atan2 values to the GPU HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); //call the GPU version of the update direction function gpu_update_dir(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); //copy the new gradient image back to the CPU cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; //free allocated memory cudaFree(gpuTable); cudaFree(gpuVote); cudaFree(gpuGrad); } } } #endif