diff --git a/stim/cuda/ivote/david_update_dir_global.cuh b/stim/cuda/ivote/david_update_dir_global.cuh deleted file mode 100644 index 3ae624c..0000000 --- a/stim/cuda/ivote/david_update_dir_global.cuh +++ /dev/null @@ -1,171 +0,0 @@ -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H - -# include -# include -#include -#include -#include -#include "cpyToshare.cuh" - -#define RMAX_TEST 8 - -namespace stim{ - namespace cuda{ - - // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. - template - __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ - extern __shared__ T atan2_table[]; - - //calculate the start point for this block - //int bxi = blockIdx.x * blockDim.x; - - stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x); - - __syncthreads(); - - // calculate the 2D coordinates for this current thread. - //int xi = bxi + threadIdx.x; - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel - - int i = yi * x + xi; // convert 2D coordinates to 1D - - float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch - gpuDir[i] = 0; //initialize the vote direction to zero - float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter - int id_x = 0; // define two local variables for the x and y position of the maximum - int id_y = 0; - - int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter - int rmax_sq = rmax * rmax; - int tx_rmax = threadIdx.x + rmax; - float atan_angle; - float vote_c; - unsigned int ind_t; - for(int yr = -rmax; yr <= rmax; yr++){ //for each counter in the y direction - if (yi+yr >= 0 && yi + yr < y){ //if the counter exists (we aren't looking outside of the image) - for(int xr = -rmax; xr <= rmax; xr++){ //for each counter in the x direction - if((xr * xr + yr *yr)< rmax_sq){ //if the counter is within range of the voter - - ind_t = (rmax - yr) * x_table + rmax - xr; //calculate the index to the atan2 table - atan_angle = atan2_table[ind_t]; //retrieve the direction vector from the table - - //atan_angle = atan2((float)yr, (float)xr); - - if (abs(atan_angle - theta) max) { // compare the vote value of this pixel with the max value to find the maxima and its index. - max = vote_c; - id_x = xr; - id_y = yr; - } - } - } - } - } - } - - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); - float new_angle = gpuTable[ind_m]; - - if(xi < x && yi < y) - gpuDir[i] = new_angle; - } //end kernel - - // this kernel updates the gradient direction by the calculated voting direction. - template - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ - - // calculate the 2D coordinates for this current thread. - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; - - // convert 2D coordinates to 1D - int i = yi * x + xi; - - //update the gradient image with the vote direction - gpuGrad[2*i] = gpuDir[i]; - } - - template - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ - - - - //calculate the number of bytes in the array - unsigned int bytes = x * y * sizeof(T); - - unsigned int max_threads = stim::maxThreadsPerBlock(); - - dim3 threads(sqrt(max_threads), sqrt(max_threads)); - dim3 blocks(x/threads.x + 1, y/threads.y + 1); - - - - // allocate space on the GPU for the updated vote direction - T* gpuDir; - cudaMalloc(&gpuDir, bytes); - - size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2); - std::cout<<"Shared memory for atan2 table: "<>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); - - //call the kernel to update the gradient direction - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); - - //free allocated memory - cudaFree(gpuDir); - - } - - template - void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ - - //calculate the number of bytes in the array - unsigned int bytes = x * y * sizeof(T); - - //calculate the number of bytes in the atan2 table - unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); - - //allocate space on the GPU for the Vote Image - T* gpuVote; - cudaMalloc(&gpuVote, bytes); - - //copy the input vote image to the GPU - HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); - - //allocate space on the GPU for the input Gradient image - T* gpuGrad; - HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); - - //copy the Gradient data to the GPU - HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); - - //allocate space on the GPU for the atan2 table - T* gpuTable; - HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); - - //copy the atan2 values to the GPU - HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); - - //call the GPU version of the update direction function - gpu_update_dir(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); - - //copy the new gradient image back to the CPU - cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; - - //free allocated memory - cudaFree(gpuTable); - cudaFree(gpuVote); - cudaFree(gpuGrad); - } - - } -} - -#endif \ No newline at end of file diff --git a/stim/cuda/ivote/re_sample.cuh b/stim/cuda/ivote/re_sample.cuh new file mode 100644 index 0000000..06bfeb7 --- /dev/null +++ b/stim/cuda/ivote/re_sample.cuh @@ -0,0 +1,106 @@ +#ifndef STIM_CUDA_RE_SAMPLE_H +#define STIM_CUDA_RE_SAMPLE_H + +#include +#include +#include +#include + +namespace stim{ + namespace cuda{ + + template + __global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ + + unsigned int sigma_ds = 1/resize; + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); + + + // calculate the 2D coordinates for this current thread. + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y; + // convert 2D coordinates to 1D + int i = yi * x + xi; + + if(xi< x && yi< y){ + if(xi%sigma_ds==0){ + if(yi%sigma_ds==0){ + gpuI[i] = gpuI0[(yi/sigma_ds)*x_ds + xi/sigma_ds]; + } + } + else gpuI[i] = 0; + + //int x_org = xi * sigma_ds ; + //int y_org = yi * sigma_ds ; + //int i_org = y_org * x + x_org; + //gpuI[i] = gpuI0[i_org]; + } + + } + + + /// Applies a Gaussian blur to a 2D image stored on the GPU + template + void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ + + + //unsigned int sigma_ds = 1/resize; + //unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); + //unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); + + //get the number of pixels in the image + //unsigned int pixels_ds = x_ds * y_ds; + + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(max_threads, 1); + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); + + //stim::cuda::gpu_gaussian_blur2(gpuI0, sigma_ds,x ,y); + + //resample the image + cuda_re_sample <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y); + + } + + /// Applies a Gaussian blur to a 2D image stored on the CPU + template + void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){ + + //get the number of pixels in the image + unsigned int pixels = x*y; + unsigned int bytes = sizeof(T) * pixels; + + unsigned int sigma_ds = 1/resize; + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); + unsigned int bytes_ds = sizeof(T) * x_ds * y_ds; + + + + //allocate space on the GPU for the original image + T* gpuI0; + cudaMalloc(&gpuI0, bytes_ds); + + + //copy the image data to the GPU + cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice); + + //allocate space on the GPU for the down sampled image + T* gpuI; + cudaMalloc(&gpuI, bytes); + + //run the GPU-based version of the algorithm + gpu_re_sample(gpuI, gpuI0, resize, x, y); + + //copy the image data to the GPU + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice); + + cudaFree(gpuI0); + cudeFree(gpuI); + } + + } +} + +#endif \ No newline at end of file diff --git a/stim/cuda/ivote/update_dir_bb.cuh b/stim/cuda/ivote/update_dir_bb.cuh new file mode 100644 index 0000000..bb04f23 --- /dev/null +++ b/stim/cuda/ivote/update_dir_bb.cuh @@ -0,0 +1,185 @@ +#ifndef STIM_CUDA_UPDATE_DIR_BB_H +#define STIM_CUDA_UPDATE_DIR_BB_H + +# include +# include +#include +#include +#include +#include +#include + +//#define RMAX_TEST 8 + +namespace stim{ + namespace cuda{ + + template + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ + extern __shared__ T S[]; + T* shared_atan = S; + size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1); + stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); + + //T* shared_vote = &S[n_table]; + //size_t template_size_x = (blockDim.x + 2 * rmax); + //size_t template_size_y = (blockDim.y + 2 * rmax); + //stim::cuda::threadedMemcpy2D((char*)shared_vote, (char*)gpuVote, template_size_x, template_size_y, x, threadIdx.y * blockDim.x + threadIdx.x, blockDim.x * blockDim.y); + + int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate the 2D coordinates for this current thread. + int yi = blockIdx.y * blockDim.y + threadIdx.y; + + if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel + + int i = yi * x + xi; //convert 2D coordinates to 1D + float theta = gpuGrad[2*i]; //calculate the voting direction based on the grtadient direction - global memory fetch + + stim::aabb2 bb(xi, yi); //initialize a bounding box at the current point + bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta))); + bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box + bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box + + int x_table = 2*rmax +1; + int lut_i; + T rmax_sq = rmax * rmax; + T dx_sq, dy_sq; + + bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image + bb.trim_high(x-1, y-1); + + int by, bx; + int dx, dy; //coordinate relative to (xi, yi) + T v; + T max_v = 0; //initialize the maximum vote value to zero + T alpha; + int max_dx = bb.low[0]; + int max_dy = bb.low[1]; + for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box + dy = by - yi; //calculate the y coordinate of the current point relative to yi + dy_sq = dy * dy; + for(bx = bb.low[0]; bx <= bb.high[0]; bx++){ + dx = bx - xi; + dx_sq = dx * dx; + lut_i = (rmax - dy) * x_table + rmax - dx; + alpha = shared_atan[lut_i]; + if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ + v = gpuVote[by * x + bx]; // find the vote value for the current counter + if(v > max_v){ + max_v = v; + max_dx = dx; + max_dy = dy; + } + } + } + } + gpuDir[i] = atan2((T)max_dy, (T)max_dx); + } + + + + // this kernel updates the gradient direction by the calculated voting direction. + template + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ + + // calculate the 2D coordinates for this current thread. + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + + if(xi >= x || yi >= y) return; + + // convert 2D coordinates to 1D + int i = yi * x + xi; + + //update the gradient image with the vote direction + gpuGrad[2*i] = gpuDir[i]; + } + + template + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + // allocate space on the GPU for the updated vote direction + T* gpuDir; + HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) ); + + unsigned int max_threads = stim::maxThreadsPerBlock(); + + dim3 threads( sqrt(max_threads), sqrt(max_threads) ); + dim3 blocks(x/threads.x + 1, y/threads.y + 1); + + size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); + //size_t curtain = 2 * rmax; + //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain); + size_t shared_mem_req = table_bytes;// + template_bytes; + std::cout<<"Shared Memory required: "< shared_mem){ + std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + //stim::gpu2image(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer); + + //exit(0); + + //threads = dim3( sqrt(max_threads), sqrt(max_threads) ); + //blocks = dim3(x/threads.x + 1, y/threads.y + 1); + + //call the kernel to update the gradient direction + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); + //free allocated memory + HANDLE_ERROR( cudaFree(gpuDir) ); + + } + + template + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + //calculate the number of bytes in the atan2 table + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); + + //allocate space on the GPU for the Vote Image + T* gpuVote; + cudaMalloc(&gpuVote, bytes); + + //copy the input vote image to the GPU + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the input Gradient image + T* gpuGrad; + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); + + //copy the Gradient data to the GPU + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the atan2 table + T* gpuTable; + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); + + //copy the atan2 values to the GPU + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); + + //call the GPU version of the update direction function + gpu_update_dir(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + //copy the new gradient image back to the CPU + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; + + //free allocated memory + cudaFree(gpuTable); + cudaFree(gpuVote); + cudaFree(gpuGrad); + } + + } +} + +#endif \ No newline at end of file diff --git a/stim/cuda/ivote/update_dir_global.cuh b/stim/cuda/ivote/update_dir_global.cuh deleted file mode 100644 index b575619..0000000 --- a/stim/cuda/ivote/update_dir_global.cuh +++ /dev/null @@ -1,185 +0,0 @@ -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H - -# include -# include -#include -#include -#include -#include -#include - -//#define RMAX_TEST 8 - -namespace stim{ - namespace cuda{ - - template - __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ - extern __shared__ T S[]; - T* shared_atan = S; - size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1); - stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); - - //T* shared_vote = &S[n_table]; - //size_t template_size_x = (blockDim.x + 2 * rmax); - //size_t template_size_y = (blockDim.y + 2 * rmax); - //stim::cuda::threadedMemcpy2D((char*)shared_vote, (char*)gpuVote, template_size_x, template_size_y, x, threadIdx.y * blockDim.x + threadIdx.x, blockDim.x * blockDim.y); - - int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate the 2D coordinates for this current thread. - int yi = blockIdx.y * blockDim.y + threadIdx.y; - - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel - - int i = yi * x + xi; //convert 2D coordinates to 1D - float theta = gpuGrad[2*i]; //calculate the voting direction based on the grtadient direction - global memory fetch - - stim::aabb2 bb(xi, yi); //initialize a bounding box at the current point - bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta))); - bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box - bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box - - int x_table = 2*rmax +1; - int lut_i; - T rmax_sq = rmax * rmax; - T dx_sq, dy_sq; - - bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image - bb.trim_high(x-1, y-1); - - int by, bx; - int dx, dy; //coordinate relative to (xi, yi) - T v; - T max_v = 0; //initialize the maximum vote value to zero - T alpha; - int max_dx = bb.low[0]; - int max_dy = bb.low[1]; - for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box - dy = by - yi; //calculate the y coordinate of the current point relative to yi - dy_sq = dy * dy; - for(bx = bb.low[0]; bx <= bb.high[0]; bx++){ - dx = bx - xi; - dx_sq = dx * dx; - lut_i = (rmax - dy) * x_table + rmax - dx; - alpha = shared_atan[lut_i]; - if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ - v = gpuVote[by * x + bx]; // find the vote value for the current counter - if(v > max_v){ - max_v = v; - max_dx = dx; - max_dy = dy; - } - } - } - } - gpuDir[i] = atan2((T)max_dy, (T)max_dx); - } - - - - // this kernel updates the gradient direction by the calculated voting direction. - template - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ - - // calculate the 2D coordinates for this current thread. - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; - - if(xi >= x || yi >= y) return; - - // convert 2D coordinates to 1D - int i = yi * x + xi; - - //update the gradient image with the vote direction - gpuGrad[2*i] = gpuDir[i]; - } - - template - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ - - //calculate the number of bytes in the array - unsigned int bytes = x * y * sizeof(T); - - // allocate space on the GPU for the updated vote direction - T* gpuDir; - HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) ); - - unsigned int max_threads = stim::maxThreadsPerBlock(); - - dim3 threads( sqrt(max_threads), sqrt(max_threads) ); - dim3 blocks(x/threads.x + 1, y/threads.y + 1); - - size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); - //size_t curtain = 2 * rmax; - //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain); - size_t shared_mem_req = table_bytes;// + template_bytes; - std::cout<<"Shared Memory required: "< shared_mem){ - std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); - //stim::gpu2image(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer); - - //exit(0); - - //threads = dim3( sqrt(max_threads), sqrt(max_threads) ); - //blocks = dim3(x/threads.x + 1, y/threads.y + 1); - - //call the kernel to update the gradient direction - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); - //free allocated memory - HANDLE_ERROR( cudaFree(gpuDir) ); - - } - - template - void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ - - //calculate the number of bytes in the array - unsigned int bytes = x * y * sizeof(T); - - //calculate the number of bytes in the atan2 table - unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); - - //allocate space on the GPU for the Vote Image - T* gpuVote; - cudaMalloc(&gpuVote, bytes); - - //copy the input vote image to the GPU - HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); - - //allocate space on the GPU for the input Gradient image - T* gpuGrad; - HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); - - //copy the Gradient data to the GPU - HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); - - //allocate space on the GPU for the atan2 table - T* gpuTable; - HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); - - //copy the atan2 values to the GPU - HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); - - //call the GPU version of the update direction function - gpu_update_dir(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); - - //copy the new gradient image back to the CPU - cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; - - //free allocated memory - cudaFree(gpuTable); - cudaFree(gpuVote); - cudaFree(gpuGrad); - } - - } -} - -#endif \ No newline at end of file diff --git a/stim/cuda/ivote/vote_atomic_bb.cuh b/stim/cuda/ivote/vote_atomic_bb.cuh new file mode 100644 index 0000000..32de21f --- /dev/null +++ b/stim/cuda/ivote/vote_atomic_bb.cuh @@ -0,0 +1,142 @@ +#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H +#define STIM_CUDA_VOTE_ATOMIC_BB_H + +# include +# include +#include +#include +#include +#include +#include + +namespace stim{ + namespace cuda{ + + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area + template + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ + + extern __shared__ T S[]; + T* shared_atan = S; + size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1); + stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); + + // calculate the 2D coordinates for this current thread. + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + + if(xi >= x || yi >= y) return; + // convert 2D coordinates to 1D + int i = yi * x + xi; + + // calculate the voting direction based on the grtadient direction + float theta = gpuGrad[2*i]; + //calculate the amount of vote for the voter + float mag = gpuGrad[2*i + 1]; + + + stim::aabb2 bb(xi, yi); //initialize a bounding box at the current point + bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta))); + bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box + bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box + + // compute the size of window which will be checked for finding the proper voters for this pixel + int x_table = 2*rmax +1; + int rmax_sq = rmax * rmax; + + int lut_i; + T dx_sq, dy_sq; + + bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image + bb.trim_high(x-1, y-1); + + int by, bx; + int dx, dy; + + unsigned int ind_g; //initialize the maximum vote value to zero + T alpha; + + for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box + dy = by - yi; //calculate the y coordinate of the current point relative to yi + dy_sq = dy * dy; + for(bx = bb.low[0]; bx <= bb.high[0]; bx++){ + dx = bx - xi; + dx_sq = dx * dx; + lut_i = (rmax - dy) * x_table + rmax - dx; + alpha = shared_atan[lut_i]; + if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ + ind_g = (by)*x + (bx); + atomicAdd(&gpuVote[ind_g], mag); + + } + } + } + + } + + + template + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads( sqrt(max_threads), sqrt(max_threads) ); + dim3 blocks(x/threads.x + 1, y/threads.y + 1); + size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); + size_t shared_mem_req = table_bytes;// + template_bytes; + std::cout<<"Shared Memory required: "< shared_mem){ + std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + } + + + template + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + //calculate the number of bytes in the atan2 table + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); + + //allocate space on the GPU for the Vote Image + T* gpuVote; + cudaMalloc(&gpuVote, bytes); + + //allocate space on the GPU for the input Gradient image + T* gpuGrad; + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); + + //copy the Gradient Magnitude data to the GPU + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the atan2 table + T* gpuTable; + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); + + //copy the atan2 values to the GPU + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); + + //call the GPU version of the vote calculation function + gpu_vote(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + //copy the Vote Data back to the CPU + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; + + //free allocated memory + cudaFree(gpuTable); + cudaFree(gpuVote); + cudaFree(gpuGrad); + } + + } +} + +#endif \ No newline at end of file diff --git a/stim/cuda/ivote/vote_atomic_global.cuh b/stim/cuda/ivote/vote_atomic_global.cuh deleted file mode 100644 index 33e1da8..0000000 --- a/stim/cuda/ivote/vote_atomic_global.cuh +++ /dev/null @@ -1,142 +0,0 @@ -#ifndef STIM_CUDA_VOTE_ATOMIC_GLOBAL_H -#define STIM_CUDA_VOTE_ATOMIC_GLOBAL_H - -# include -# include -#include -#include -#include -#include -#include - -namespace stim{ - namespace cuda{ - - // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area - template - __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ - - extern __shared__ T S[]; - T* shared_atan = S; - size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1); - stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); - - // calculate the 2D coordinates for this current thread. - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; - - if(xi >= x || yi >= y) return; - // convert 2D coordinates to 1D - int i = yi * x + xi; - - // calculate the voting direction based on the grtadient direction - float theta = gpuGrad[2*i]; - //calculate the amount of vote for the voter - float mag = gpuGrad[2*i + 1]; - - - stim::aabb2 bb(xi, yi); //initialize a bounding box at the current point - bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta))); - bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box - bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box - - // compute the size of window which will be checked for finding the proper voters for this pixel - int x_table = 2*rmax +1; - int rmax_sq = rmax * rmax; - - int lut_i; - T dx_sq, dy_sq; - - bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image - bb.trim_high(x-1, y-1); - - int by, bx; - int dx, dy; - - unsigned int ind_g; //initialize the maximum vote value to zero - T alpha; - - for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box - dy = by - yi; //calculate the y coordinate of the current point relative to yi - dy_sq = dy * dy; - for(bx = bb.low[0]; bx <= bb.high[0]; bx++){ - dx = bx - xi; - dx_sq = dx * dx; - lut_i = (rmax - dy) * x_table + rmax - dx; - alpha = shared_atan[lut_i]; - if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ - ind_g = (by)*x + (bx); - atomicAdd(&gpuVote[ind_g], mag); - - } - } - } - - } - - - template - void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ - - - unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads( sqrt(max_threads), sqrt(max_threads) ); - dim3 blocks(x/threads.x + 1, y/threads.y + 1); - size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); - size_t shared_mem_req = table_bytes;// + template_bytes; - std::cout<<"Shared Memory required: "< shared_mem){ - std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); - - } - - - template - void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ - - //calculate the number of bytes in the array - unsigned int bytes = x * y * sizeof(T); - - //calculate the number of bytes in the atan2 table - unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); - - //allocate space on the GPU for the Vote Image - T* gpuVote; - cudaMalloc(&gpuVote, bytes); - - //allocate space on the GPU for the input Gradient image - T* gpuGrad; - HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); - - //copy the Gradient Magnitude data to the GPU - HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); - - //allocate space on the GPU for the atan2 table - T* gpuTable; - HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); - - //copy the atan2 values to the GPU - HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); - - //call the GPU version of the vote calculation function - gpu_vote(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); - - //copy the Vote Data back to the CPU - cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; - - //free allocated memory - cudaFree(gpuTable); - cudaFree(gpuVote); - cudaFree(gpuGrad); - } - - } -} - -#endif \ No newline at end of file diff --git a/stim/cuda/ivote_atomic.cuh b/stim/cuda/ivote_atomic.cuh deleted file mode 100644 index f265c60..0000000 --- a/stim/cuda/ivote_atomic.cuh +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef STIM_CUDA_IVOTE_ATOMIC_H -#define STIM_CUDA_IVOTE_ATOMIC_H - -#include -#include -#include -//#include -#include -//#include -namespace stim{ - namespace cuda{ - - } -} - - - -#endif \ No newline at end of file diff --git a/stim/cuda/ivote_atomic_bb.cuh b/stim/cuda/ivote_atomic_bb.cuh new file mode 100644 index 0000000..14e002d --- /dev/null +++ b/stim/cuda/ivote_atomic_bb.cuh @@ -0,0 +1,17 @@ +#ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H +#define STIM_CUDA_IVOTE_ATOMIC_BB_H + +#include +#include +#include +#include + +namespace stim{ + namespace cuda{ + + } +} + + + +#endif \ No newline at end of file -- libgit2 0.21.4