From 11cd127f82ec3bb2f8d7fd8ebabac1a8ca1c4995 Mon Sep 17 00:00:00 2001 From: laila Saadatifard Date: Fri, 17 Jun 2016 15:02:42 -0500 Subject: [PATCH] Leila's ivote profiling push --- stim/cuda/arraymath.cuh | 3 ++- stim/cuda/ivote/local_max.cuh | 22 +++++++++++++++++++--- stim/cuda/ivote/update_dir_global.cuh | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ stim/cuda/ivote/update_dir_shared.cuh | 184 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ stim/cuda/ivote/vote_atomic.cuh | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ stim/cuda/ivote/vote_atomic_shared.cuh | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ stim/cuda/ivote/vote_shared_32-32.cuh | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ stim/cuda/ivote_atomic.cuh | 18 ++++++++++++++++++ 8 files changed, 814 insertions(+), 4 deletions(-) create mode 100644 stim/cuda/ivote/update_dir_global.cuh create mode 100644 stim/cuda/ivote/update_dir_shared.cuh create mode 100644 stim/cuda/ivote/vote_atomic.cuh create mode 100644 stim/cuda/ivote/vote_atomic_shared.cuh create mode 100644 stim/cuda/ivote/vote_shared_32-32.cuh create mode 100644 stim/cuda/ivote_atomic.cuh diff --git a/stim/cuda/arraymath.cuh b/stim/cuda/arraymath.cuh index adbf66a..548d442 100644 --- a/stim/cuda/arraymath.cuh +++ b/stim/cuda/arraymath.cuh @@ -10,7 +10,8 @@ #include #include #include - +#include +#include namespace stim{ namespace cuda{ diff --git a/stim/cuda/ivote/local_max.cuh b/stim/cuda/ivote/local_max.cuh index a2bd2fe..3d3abdf 100644 --- a/stim/cuda/ivote/local_max.cuh +++ b/stim/cuda/ivote/local_max.cuh @@ -29,17 +29,33 @@ namespace stim{ //compare to the threshold if(val < final_t) return; + //define an array to store indices with same vote value + /*int * IdxEq; + IdxEq = new int [2*conn]; + int n = 0;*/ + for(int xl = xi - conn; xl < xi + conn; xl++){ for(int yl = yi - conn; yl < yi + conn; yl++){ if(xl >= 0 && xl < x && yl >= 0 && yl < y){ int il = yl * x + xl; if(gpuVote[il] > val){ return; - } - } + } + if (gpuVote[il] == val){ + /*IdxEq[n] = il; + n = n+1;*/ + if( il > i){ + return; + } + } + } } } - + /*if (n!=0){ + if(IdxEq[n/2] !=i){ + return; + } + } */ gpuCenters[i] = 1; } diff --git a/stim/cuda/ivote/update_dir_global.cuh b/stim/cuda/ivote/update_dir_global.cuh new file mode 100644 index 0000000..71eeebf --- /dev/null +++ b/stim/cuda/ivote/update_dir_global.cuh @@ -0,0 +1,159 @@ +#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H +#define STIM_CUDA_UPDATE_DIR_GLOBAL_H + +# include +# include +#include +#include +#include "cpyToshare.cuh" + +#define RMAX_TEST 8 + +namespace stim{ + namespace cuda{ + + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. + template + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ + + + //calculate the start point for this block + int bxi = blockIdx.x * blockDim.x; + + // calculate the 2D coordinates for this current thread. + int xi = bxi + threadIdx.x; + if(xi >= x) return; //if the index is outside of the image, terminate the kernel + int yi = blockIdx.y * blockDim.y + threadIdx.y; + int i = yi * x + xi; // convert 2D coordinates to 1D + + float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch + gpuDir[i] = 0; //initialize the vote direction to zero + float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter + int id_x = 0; // define two local variables for the x and y position of the maximum + int id_y = 0; + + int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter + int rmax_sq = rmax * rmax; + int tx_rmax = threadIdx.x + rmax; + float atan_angle; + float vote_c; + for(int yr = -RMAX_TEST; yr <= RMAX_TEST; yr++){ + if (yi+yr >= 0 && yi + yr < y){ + for(int xr = -RMAX_TEST; xr <= RMAX_TEST; xr++){ + + unsigned int ind_t = (RMAX_TEST - yr) * x_table + RMAX_TEST - xr; + + // calculate the angle between the voter and the current pixel in x and y directions + atan_angle = gpuTable[ind_t]; + + // find the vote value for the current counter + vote_c = gpuVote[(yi+yr)*x + (xi+xr)]; + + // check if the current pixel is located in the voting area of this voter. + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) max) { + + max = vote_c; + id_x = xr; + id_y = yr; + } + } + } + } + } + + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); + float new_angle = gpuTable[ind_m]; + + if(xi < x && yi < y) + gpuDir[i] = new_angle; + } //end kernel + + // this kernel updates the gradient direction by the calculated voting direction. + template + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ + + // calculate the 2D coordinates for this current thread. + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + + // convert 2D coordinates to 1D + int i = yi * x + xi; + + //update the gradient image with the vote direction + gpuGrad[2*i] = gpuDir[i]; + } + + template + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(max_threads, 1); + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); + + // allocate space on the GPU for the updated vote direction + T* gpuDir; + cudaMalloc(&gpuDir, bytes); + + //call the kernel to calculate the new voting direction + cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + //call the kernel to update the gradient direction + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); + + //free allocated memory + cudaFree(gpuDir); + + } + + template + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + //calculate the number of bytes in the atan2 table + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); + + //allocate space on the GPU for the Vote Image + T* gpuVote; + cudaMalloc(&gpuVote, bytes); + + //copy the input vote image to the GPU + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the input Gradient image + T* gpuGrad; + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); + + //copy the Gradient data to the GPU + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the atan2 table + T* gpuTable; + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); + + //copy the atan2 values to the GPU + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); + + //call the GPU version of the update direction function + gpu_update_dir(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + //copy the new gradient image back to the CPU + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; + + //free allocated memory + cudaFree(gpuTable); + cudaFree(gpuVote); + cudaFree(gpuGrad); + } + + } +} + +#endif \ No newline at end of file diff --git a/stim/cuda/ivote/update_dir_shared.cuh b/stim/cuda/ivote/update_dir_shared.cuh new file mode 100644 index 0000000..91aa717 --- /dev/null +++ b/stim/cuda/ivote/update_dir_shared.cuh @@ -0,0 +1,184 @@ +#ifndef STIM_CUDA_UPDATE_DIR_SHARED_H +#define STIM_CUDA_UPDATE_DIR_SHARED_H + +# include +# include +#include +#include +#include "cpyToshare.cuh" + +namespace stim{ + namespace cuda{ + + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. + template + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ + + //generate a pointer to shared memory (size will be specified as a kernel parameter) + extern __shared__ float s_vote[]; + + //calculate the start point for this block + int bxi = blockIdx.x * blockDim.x; + + // calculate the 2D coordinates for this current thread. + int xi = bxi + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + // convert 2D coordinates to 1D + int i = yi * x + xi; + + // calculate the voting direction based on the grtadient direction + float theta = gpuGrad[2*i]; + + //initialize the vote direction to zero + gpuDir[i] = 0; + + // define a local variable to maximum value of the vote image in the voting area for this voter + float max = 0; + + // define two local variables for the x and y coordinations where the maximum happened + int id_x = 0; + int id_y = 0; + + //calculate the width of the shared memory block + int swidth = 2 * rmax + blockDim.x; + + // compute the size of window which will be checked for finding the voting area for this voter + int x_table = 2*rmax +1; + int rmax_sq = rmax * rmax; + int tx_rmax = threadIdx.x + rmax; + int bxs = bxi - rmax; + + for(int yr = -rmax; yr <= rmax; yr++){ + //if (yi+yr >= 0 && yi + yr < y){ + //copy the portion of the image necessary for this block to shared memory + __syncthreads(); + cpyG2S1D(s_vote, gpuVote, bxs, yi + yr , swidth, 1, threadIdx, blockDim, x, y); + __syncthreads(); + + //if the current thread is outside of the image, it doesn't have to be computed + if(xi < x && yi < y){ + + for(int xr = -rmax; xr <= rmax; xr++){ + + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; + + // calculate the angle between the voter and the current pixel in x and y directions + float atan_angle = gpuTable[ind_t]; + + // calculate the voting direction based on the grtadient direction + int idx_share_update = xr + tx_rmax ; + float share_vote = s_vote[idx_share_update]; + + // check if the current pixel is located in the voting area of this voter. + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) max) { + + max = share_vote; + id_x = xr; + id_y = yr; + } + } + } + } + //} + } + + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); + float new_angle = gpuTable[ind_m]; + + if(xi < x && yi < y) + gpuDir[i] = new_angle; + + } + + // this kernel updates the gradient direction by the calculated voting direction. + template + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ + + // calculate the 2D coordinates for this current thread. + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + + // convert 2D coordinates to 1D + int i = yi * x + xi; + + //update the gradient image with the vote direction + gpuGrad[2*i] = gpuDir[i]; + } + + template + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(max_threads, 1); + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); + + // specify share memory + unsigned int share_bytes = (2*rmax + threads.x)*(1)*4; + + // allocate space on the GPU for the updated vote direction + T* gpuDir; + cudaMalloc(&gpuDir, bytes); + + //call the kernel to calculate the new voting direction + cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + //call the kernel to update the gradient direction + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); + + //free allocated memory + cudaFree(gpuDir); + + } + + template + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + //calculate the number of bytes in the atan2 table + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); + + //allocate space on the GPU for the Vote Image + T* gpuVote; + cudaMalloc(&gpuVote, bytes); + + //copy the input vote image to the GPU + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the input Gradient image + T* gpuGrad; + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); + + //copy the Gradient data to the GPU + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the atan2 table + T* gpuTable; + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); + + //copy the atan2 values to the GPU + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); + + //call the GPU version of the update direction function + gpu_update_dir(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + //copy the new gradient image back to the CPU + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; + + //free allocated memory + cudaFree(gpuTable); + cudaFree(gpuVote); + cudaFree(gpuGrad); + } + + } +} + +#endif \ No newline at end of file diff --git a/stim/cuda/ivote/vote_atomic.cuh b/stim/cuda/ivote/vote_atomic.cuh new file mode 100644 index 0000000..fc0ce47 --- /dev/null +++ b/stim/cuda/ivote/vote_atomic.cuh @@ -0,0 +1,116 @@ +#ifndef STIM_CUDA_VOTE_ATOMIC_H +#define STIM_CUDA_VOTE_ATOMIC_H + +# include +# include +#include +#include +#include "cpyToshare.cuh" + +namespace stim{ + namespace cuda{ + + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area + template + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ + + + // calculate the 2D coordinates for this current thread. + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + // convert 2D coordinates to 1D + int i = yi * x + xi; + + // calculate the voting direction based on the grtadient direction + float theta = gpuGrad[2*i]; + //calculate the amount of vote for the voter + float mag = gpuGrad[2*i + 1]; + + // compute the size of window which will be checked for finding the proper voters for this pixel + int x_table = 2*rmax +1; + int rmax_sq = rmax * rmax; + if(xi < x && yi < y){ + //for every line (along y) + for(int yr = -rmax; yr <= rmax; yr++){ + for(int xr = -rmax; xr <= rmax; xr++){ + if ((yi+yr)>=0 && (yi+yr)=0 && (xi+xr) + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(max_threads, 1); + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); + + // specify share memory + //unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4; + + //call the kernel to do the voting + cuda_vote <<< blocks, threads>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + } + + + template + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + //calculate the number of bytes in the atan2 table + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); + + //allocate space on the GPU for the Vote Image + T* gpuVote; + cudaMalloc(&gpuVote, bytes); + + //allocate space on the GPU for the input Gradient image + T* gpuGrad; + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); + + //copy the Gradient Magnitude data to the GPU + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the atan2 table + T* gpuTable; + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); + + //copy the atan2 values to the GPU + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); + + //call the GPU version of the vote calculation function + gpu_vote(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + //copy the Vote Data back to the CPU + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; + + //free allocated memory + cudaFree(gpuTable); + cudaFree(gpuVote); + cudaFree(gpuGrad); + } + + } +} + +#endif \ No newline at end of file diff --git a/stim/cuda/ivote/vote_atomic_shared.cuh b/stim/cuda/ivote/vote_atomic_shared.cuh new file mode 100644 index 0000000..563adb4 --- /dev/null +++ b/stim/cuda/ivote/vote_atomic_shared.cuh @@ -0,0 +1,166 @@ +#ifndef STIM_CUDA_VOTE_ATOMIC_SHARED_H +#define STIM_CUDA_VOTE_ATOMIC_SHARED_H + +# include +# include +#include +#include +#include "cpyToshare.cuh" +//#include "writebackshared.cuh" +namespace stim{ + namespace cuda{ + + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area + template + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ + + //generate a pointer to the shared memory + extern __shared__ float s_vote[]; + // calculate the 2D coordinates for this current thread. + int bxi = blockIdx.x * blockDim.x; + int byi = blockIdx.y * blockDim.y; + int xi = bxi + threadIdx.x; + int yi = byi + threadIdx.y; + // convert 2D coordinates to 1D + int i = yi * x + xi; + + // calculate the voting direction based on the gradient direction + float theta = gpuGrad[2*i]; + //calculate the amount of vote for the voter + float mag = gpuGrad[2*i + 1]; + + //find the starting points and size of window, wich will be copied to the shared memory + int bxs = bxi - rmax; + int bys = byi - rmax; + int xwidth = 2*rmax + blockDim.x; + int ywidth = 2*rmax + blockDim.y; + //compute the coordinations of this pixel in the 2D-shared memory. + int sx_rx = threadIdx.x + rmax; + int sy_ry = threadIdx.y + rmax; + // compute the size of window which will be checked for finding the counters for this voter + int x_table = 2*rmax +1; + int rmax_sq = rmax * rmax; + //calculate some parameters for indexing shared memory + //calculate the total number of threads available + unsigned int tThreads = blockDim.x * blockDim.y; + //calculate the current 1D thread ID + unsigned int ti = threadIdx.y * (blockDim.x) + threadIdx.x; + //calculate the number of iteration required + unsigned int In = xwidth*ywidth/tThreads + 1; + if(xi < x && yi < y){ + __syncthreads(); + //initialize the shared memory to zero + for (unsigned int i = 0; i < In; i++){ + unsigned int sIdx0 = i * tThreads + ti; + if (sIdx0< xwidth*ywidth) { + s_vote[sIdx0] = 0; + } + } + __syncthreads(); + //for every line (along y) + for(int yr = -rmax; yr <= rmax; yr++){ + //compute the position of the current voter in the shared memory along the y axis. + unsigned int sIdx_y1d = (sy_ry + yr)* xwidth; + for(int xr = -rmax; xr <= rmax; xr++){ + + //find the location of the current pixel in the atan2 table + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; + + // calculate the angle between the voter and the current pixel in x and y directions + float atan_angle = gpuTable[ind_t]; + + // check if the current pixel is located in the voting area of this voter. + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) = xwidth*ywidth) return; + + unsigned int sy = sIdx/xwidth; + unsigned int sx = sIdx - (sy * xwidth); + + unsigned int gx = bxs + sx; + unsigned int gy = bys + sy; + if (gx + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(sqrt(max_threads), sqrt(max_threads)); + dim3 blocks(x/threads.x + 1 , y/threads.y+1); + + // specify share memory + unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*sizeof(T); + + //call the kernel to do the voting + cuda_vote <<< blocks, threads, share_bytes>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + } + + + template + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + //calculate the number of bytes in the atan2 table + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); + + //allocate space on the GPU for the Vote Image + T* gpuVote; + cudaMalloc(&gpuVote, bytes); + + //allocate space on the GPU for the input Gradient image + T* gpuGrad; + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); + + //copy the Gradient Magnitude data to the GPU + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the atan2 table + T* gpuTable; + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); + + //copy the atan2 values to the GPU + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); + + //call the GPU version of the vote calculation function + gpu_vote(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + //copy the Vote Data back to the CPU + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; + + //free allocated memory + cudaFree(gpuTable); + cudaFree(gpuVote); + cudaFree(gpuGrad); + } + + } +} + +#endif \ No newline at end of file diff --git a/stim/cuda/ivote/vote_shared_32-32.cuh b/stim/cuda/ivote/vote_shared_32-32.cuh new file mode 100644 index 0000000..23c9481 --- /dev/null +++ b/stim/cuda/ivote/vote_shared_32-32.cuh @@ -0,0 +1,150 @@ +#ifndef STIM_CUDA_VOTE_SHARED_H +#define STIM_CUDA_VOTE_SHARED +# include +# include +#include +#include +#include "cpyToshare.cuh" + +namespace stim{ + namespace cuda{ + + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area + template + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ + + //generate a pointer to shared memory (size will be specified as a kernel parameter) + extern __shared__ float s_grad[]; + + //calculate the start point for this block + int bxi = blockIdx.x * blockDim.x; + int byi = blockIdx.y * blockDim.y; + // calculate the 2D coordinates for this current thread. + int xi = bxi + threadIdx.x; + int yi = byi + threadIdx.y; + // convert 2D coordinates to 1D + int i = yi * x + xi; + + // define a local variable to sum the votes from the voters + float sum = 0; + + //calculate the width of the shared memory block + int xwidth = 2 * rmax + blockDim.x; + int ywidth = 2 * rmax + blockDim.y; + // compute the size of window which will be checked for finding the proper voters for this pixel + int x_table = 2*rmax +1; + int rmax_sq = rmax * rmax; + int tx_rmax = threadIdx.x + rmax; + int bxs = bxi - rmax; + int bys = byi - rmax; + //compute the coordinations of this pixel in the 2D-shared memory. + int sx_rx = threadIdx.x + rmax; + int sy_ry = threadIdx.y + rmax; + //copy the portion of the image necessary for this block to shared memory + __syncthreads(); + cpyG2S2D2ch(s_grad, gpuGrad, bxs, bys, 2*xwidth, ywidth, threadIdx, blockDim, x, y); + __syncthreads(); + + for(int yr = -rmax; yr <= rmax; yr++){ + int yi_v = (yi + yr) ; + //compute the position of the current voter in the shared memory along the y axis. + unsigned int sIdx_y1d = (sy_ry + yr)* xwidth; + //if (yi+yr=0){ + if(xi < x && yi < y){ + + for(int xr = -rmax; xr <= rmax; xr++){ + + //compute the position of the current voter in the 2D-shared memory along the x axis. + unsigned int sIdx_x = (sx_rx + xr); + //find the 1D index of this voter in the 2D-shared memory. + unsigned int s_Idx = (sIdx_y1d + sIdx_x); + unsigned int s_Idx2 = s_Idx * 2; + + //find the location of this voter in the atan2 table + int id_t = (yr + rmax) * x_table + xr + rmax; + + // calculate the angle between the pixel and the current voter in x and y directions + float atan_angle = gpuTable[id_t]; + + // calculate the voting direction based on the grtadient direction + //int idx_share = xr + tx_rmax ; + float theta = s_grad[s_Idx2]; + float mag = s_grad[s_Idx2 + 1]; + + + // check if the current voter is located in the voting area of this pixel. + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(sqrt(max_threads), sqrt(max_threads)); + dim3 blocks(x/threads.x + 1 , y/threads.y+1); + + + // specify share memory + unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*2*sizeof(T); + + //call the kernel to do the voting + cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + } + + + template + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + //calculate the number of bytes in the atan2 table + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); + + //allocate space on the GPU for the Vote Image + T* gpuVote; + cudaMalloc(&gpuVote, bytes); + + //allocate space on the GPU for the input Gradient image + T* gpuGrad; + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); + + //copy the Gradient Magnitude data to the GPU + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the atan2 table + T* gpuTable; + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); + + //copy the atan2 values to the GPU + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); + + //call the GPU version of the vote calculation function + gpu_vote(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + //copy the Vote Data back to the CPU + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; + + //free allocated memory + cudaFree(gpuTable); + cudaFree(gpuVote); + cudaFree(gpuGrad); + } + + } +} + +#endif \ No newline at end of file diff --git a/stim/cuda/ivote_atomic.cuh b/stim/cuda/ivote_atomic.cuh new file mode 100644 index 0000000..c7a827f --- /dev/null +++ b/stim/cuda/ivote_atomic.cuh @@ -0,0 +1,18 @@ +#ifndef STIM_CUDA_IVOTE_ATOMIC_H +#define STIM_CUDA_IVOTE_ATOMIC_H + +#include +#include +#include +//#include +#include +#include +namespace stim{ + namespace cuda{ + + } +} + + + +#endif \ No newline at end of file -- libgit2 0.21.4