From 96f9b10f641f32443600d9789e6d244a285b564f Mon Sep 17 00:00:00 2001 From: Laila Saadatifard Date: Mon, 14 Sep 2015 13:43:49 -0500 Subject: [PATCH] change the header files to be compatible with the new organized stim/cuda directory , and add the stim/cude/ivote subdirectory that includes the ivote related functions --- stim/cuda/arraymath/array_add.cuh | 3 +-- stim/cuda/arraymath/array_multiply.cuh | 3 +-- stim/cuda/down_sample.cuh | 101 ----------------------------------------------------------------------------------------------------- stim/cuda/gaussian_blur.cuh | 89 ----------------------------------------------------------------------------------------- stim/cuda/ivote.cuh | 17 +++++++++++++++++ stim/cuda/ivote/down_sample.cuh | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ stim/cuda/ivote/local_max.cuh | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ stim/cuda/ivote/update_dir.cuh | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ stim/cuda/ivote/vote.cuh | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ stim/cuda/local_max.cuh | 151 ------------------------------------------------------------------------------------------------------------------------------------------------------- stim/cuda/templates/conv2.cuh | 3 +-- stim/cuda/templates/gaussian_blur.cuh | 1 + stim/cuda/templates/gradient.cuh | 3 +-- stim/cuda/update_dir.cuh | 232 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- stim/cuda/vote.cuh | 188 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- stim/image/image.h | 4 ++-- stim/math/vector.h | 2 +- 17 files changed, 693 insertions(+), 772 deletions(-) delete mode 100644 stim/cuda/down_sample.cuh delete mode 100644 stim/cuda/gaussian_blur.cuh create mode 100644 stim/cuda/ivote.cuh create mode 100644 stim/cuda/ivote/down_sample.cuh create mode 100644 stim/cuda/ivote/local_max.cuh create mode 100644 stim/cuda/ivote/update_dir.cuh create mode 100644 stim/cuda/ivote/vote.cuh delete mode 100644 stim/cuda/local_max.cuh delete mode 100644 stim/cuda/update_dir.cuh delete mode 100644 stim/cuda/vote.cuh diff --git a/stim/cuda/arraymath/array_add.cuh b/stim/cuda/arraymath/array_add.cuh index a05353c..cfc35b4 100644 --- a/stim/cuda/arraymath/array_add.cuh +++ b/stim/cuda/arraymath/array_add.cuh @@ -3,8 +3,7 @@ #include #include -#include -#include +#include namespace stim{ namespace cuda{ diff --git a/stim/cuda/arraymath/array_multiply.cuh b/stim/cuda/arraymath/array_multiply.cuh index add19ba..4385e77 100644 --- a/stim/cuda/arraymath/array_multiply.cuh +++ b/stim/cuda/arraymath/array_multiply.cuh @@ -3,8 +3,7 @@ #include #include -#include -#include +#include namespace stim{ namespace cuda{ diff --git a/stim/cuda/down_sample.cuh b/stim/cuda/down_sample.cuh deleted file mode 100644 index ecd19c9..0000000 --- a/stim/cuda/down_sample.cuh +++ /dev/null @@ -1,101 +0,0 @@ -#ifndef STIM_CUDA_DOWN_SAMPLE_H -#define STIM_CUDA_DOWN_SAMPLE_H - -#include -#include -#include -#include -#include - -namespace stim{ - namespace cuda{ - - template - __global__ void down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ - - unsigned int sigma_ds = 1/resize; - unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); - unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); - - - // calculate the 2D coordinates for this current thread. - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y; - // convert 2D coordinates to 1D - int i = yi * x_ds + xi; - - if(xi< x_ds && yi< y_ds){ - - int x_org = xi * sigma_ds; - int y_org = yi * sigma_ds; - int i_org = y_org * x + x_org; - gpuI[i] = gpuI0[i_org]; - } - - } - - - /// Applies a Gaussian blur to a 2D image stored on the GPU - template - void gpu_down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ - - - unsigned int sigma_ds = 1/resize; - unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); - unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); - - //get the number of pixels in the image - unsigned int pixels_ds = x_ds * y_ds; - - unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(max_threads, 1); - dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds); - - stim::cuda::gpu_gaussian_blur_2d(gpuI0, sigma_ds,x ,y); - - //resample the image - down_sample <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y); - - } - - /// Applies a Gaussian blur to a 2D image stored on the CPU - template - void cpu_down_sample(T* re_img, T* image, T resize, unsigned int x, unsigned int y){ - - //get the number of pixels in the image - unsigned int pixels = x * y; - unsigned int bytes = sizeof(T) * pixels; - - unsigned int sigma_ds = 1/resize; - unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); - unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); - unsigned int bytes_ds = sizeof(T) * x_ds * y_ds; - - - - //allocate space on the GPU for the original image - T* gpuI0; - cudaMalloc(&gpuI0, bytes); - - - //copy the image data to the GPU - cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); - - //allocate space on the GPU for the down sampled image - T* gpuI; - cudaMalloc(&gpuI, bytes_ds); - - //run the GPU-based version of the algorithm - gpu_down_sample(gpuI, gpuI0, resize, x, y); - - //copy the image data to the GPU - cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice); - - cudaFree(gpuI0); - cudeFree(gpuI); - } - - } -} - -#endif \ No newline at end of file diff --git a/stim/cuda/gaussian_blur.cuh b/stim/cuda/gaussian_blur.cuh deleted file mode 100644 index 0b060f0..0000000 --- a/stim/cuda/gaussian_blur.cuh +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef STIM_CUDA_GAUSSIAN_BLUR_H -#define STIM_CUDA_GAUSSIAN_BLUR_H - -#include -#include -#include -#include -#include //GPU-based separable convolution algorithm - -#define pi 3.14159 - -namespace stim{ - namespace cuda{ - - template - void gen_gaussian(T* out, T sigma, unsigned int width){ - - //fill the kernel with a gaussian - for(unsigned int xi = 0; xi < width; xi++){ - - float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian - float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) ); - out[xi] = g; - } - - } - - template - void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){ - - //allocate space for the kernel - unsigned int kwidth = sigma * 8 + 1; - float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); - - //fill the kernel with a gaussian - gen_gaussian(kernel0, sigma, kwidth); - - //copy the kernel to the GPU - T* gpuKernel0; - HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); - - //perform the gaussian blur as a separable convolution - stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth); - - HANDLE_ERROR(cudaFree(gpuKernel0)); - - } - - template - void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ - - //allocate space for the kernel - unsigned int kwidth = sigma * 8 + 1; - float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); - - //fill the kernel with a gaussian - gen_gaussian(kernel0, sigma, kwidth); - - //copy the kernel to the GPU - T* gpuKernel0; - HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); - - //perform the gaussian blur as a separable convolution - stim::cuda::gpu_conv2sep(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth); - - HANDLE_ERROR(cudaFree(gpuKernel0)); - - } - - /// Applies a Gaussian blur to a 2D image stored on the CPU - template - void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ - - //allocate space for the kernel - unsigned int kwidth = sigma * 8 + 1; - float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); - - //fill the kernel with a gaussian - gen_gaussian(kernel0, sigma, kwidth); - - //perform the gaussian blur as a separable convolution - stim::cuda::cpu_conv2sep(image, x, y, kernel0, kwidth, kernel0, kwidth); - - } - - }; -}; - -#endif \ No newline at end of file diff --git a/stim/cuda/ivote.cuh b/stim/cuda/ivote.cuh new file mode 100644 index 0000000..cc07d1d --- /dev/null +++ b/stim/cuda/ivote.cuh @@ -0,0 +1,17 @@ +#ifndef STIM_CUDA_IVOTE_H +#define STIM_CUDA_IVOTE_H + +#include +#include +#include +#include + +namespace stim{ + namespace cuda{ + + } +} + + + +#endif \ No newline at end of file diff --git a/stim/cuda/ivote/down_sample.cuh b/stim/cuda/ivote/down_sample.cuh new file mode 100644 index 0000000..598d095 --- /dev/null +++ b/stim/cuda/ivote/down_sample.cuh @@ -0,0 +1,100 @@ +#ifndef STIM_CUDA_DOWN_SAMPLE_H +#define STIM_CUDA_DOWN_SAMPLE_H + +#include +#include +#include +#include + +namespace stim{ + namespace cuda{ + + template + __global__ void down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ + + unsigned int sigma_ds = 1/resize; + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); + + + // calculate the 2D coordinates for this current thread. + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y; + // convert 2D coordinates to 1D + int i = yi * x_ds + xi; + + if(xi< x_ds && yi< y_ds){ + + int x_org = xi * sigma_ds; + int y_org = yi * sigma_ds; + int i_org = y_org * x + x_org; + gpuI[i] = gpuI0[i_org]; + } + + } + + + /// Applies a Gaussian blur to a 2D image stored on the GPU + template + void gpu_down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ + + + unsigned int sigma_ds = 1/resize; + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); + + //get the number of pixels in the image + unsigned int pixels_ds = x_ds * y_ds; + + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(max_threads, 1); + dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds); + + stim::cuda::gpu_gaussian_blur2(gpuI0, sigma_ds,x ,y); + + //resample the image + down_sample <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y); + + } + + /// Applies a Gaussian blur to a 2D image stored on the CPU + template + void cpu_down_sample(T* re_img, T* image, T resize, unsigned int x, unsigned int y){ + + //get the number of pixels in the image + unsigned int pixels = x * y; + unsigned int bytes = sizeof(T) * pixels; + + unsigned int sigma_ds = 1/resize; + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); + unsigned int bytes_ds = sizeof(T) * x_ds * y_ds; + + + + //allocate space on the GPU for the original image + T* gpuI0; + cudaMalloc(&gpuI0, bytes); + + + //copy the image data to the GPU + cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); + + //allocate space on the GPU for the down sampled image + T* gpuI; + cudaMalloc(&gpuI, bytes_ds); + + //run the GPU-based version of the algorithm + gpu_down_sample(gpuI, gpuI0, resize, x, y); + + //copy the image data to the GPU + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice); + + cudaFree(gpuI0); + cudeFree(gpuI); + } + + } +} + +#endif \ No newline at end of file diff --git a/stim/cuda/ivote/local_max.cuh b/stim/cuda/ivote/local_max.cuh new file mode 100644 index 0000000..9519735 --- /dev/null +++ b/stim/cuda/ivote/local_max.cuh @@ -0,0 +1,150 @@ +#ifndef STIM_CUDA_LOCAL_MAX_H +#define STIM_CUDA_LOCAL_MAX_H + + +# include +# include +#include + +namespace stim{ + namespace cuda{ + + + // this kernel calculates the local maximum for finding the cell centers + template + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ + + // calculate the 2D coordinates for this current thread. + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y; + // convert 2D coordinates to 1D + int i = yi * x + xi; + + + + //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. + int xl = xi - conn; + int yl = yi - conn; + + // use zero for the lowest limits if the xi or yi is less than conn. + if (xi <= conn) + xl = 0; + if (yi <= conn) + yl = 0; + + //calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. + int xh = xi + conn; + int yh = yi + conn; + + // use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn. + if (xi >= x - conn) + xh = x; + if (yi>= y - conn) + yh = y; + + // calculate the limits for finding the local maximum location in the connected neighbors for the current pixel + int n_l = yl * x + xl; + int n_h = yh * x + xh; + + //initial the centers image to zero + gpuCenters[i] = 0; + + + int n = n_l; + + float l_value = 0; + + if (i < x * y) + + // check if the vote value for this pixel is greater than threshold, so this pixel may be a local max. + if (gpuVote[i]>final_t){ + + // compare the vote value for this pixel with the vote values with its neighbors. + while (n<=n_h){ + + // check if this vote value is a local max in its neighborhood. + if (gpuVote[i] < gpuVote[n]){ + l_value = 0; + n =n_h+1; + } + else if (n == n_h){ + l_value = 1; + n = n+1; + } + // check if the current neighbor is the last one at the current row + else if ((n - n_l - 2*conn)% x ==0){ + n = n + x - 2*conn -1; + n ++; + } + else + n ++; + } + // set the center value for this pixel to high if it's a local max ,and to low if not. + gpuCenters[i] = l_value ; + } + + } + + + + template + void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ + + + + + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(max_threads, 1); + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); + + + + //call the kernel to find the local maximum. + cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); + + + } + + + + template + void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ + + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + // allocate space on the GPU for the detected cell centes + T* gpuCenters; + cudaMalloc(&gpuCenters, bytes); + + + //allocate space on the GPU for the input Vote Image + T* gpuVote; + cudaMalloc(&gpuVote, bytes); + + + //copy the Vote image data to the GPU + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); + + + //call the GPU version of the local max function + gpu_local_max(gpuCenters, gpuVote, final_t, conn, x, y); + + + //copy the cell centers data to the CPU + cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ; + + + //free allocated memory + cudaFree(gpuCenters); + cudaFree(gpuVote); + cudaFree(gpuGrad); + } + + } +} + + + +#endif \ No newline at end of file diff --git a/stim/cuda/ivote/update_dir.cuh b/stim/cuda/ivote/update_dir.cuh new file mode 100644 index 0000000..5ec9f0a --- /dev/null +++ b/stim/cuda/ivote/update_dir.cuh @@ -0,0 +1,231 @@ +#ifndef STIM_CUDA_UPDATE_DIR_H +#define STIM_CUDA_UPDATE_DIR_H + + +# include +# include +#include +#include + +namespace stim{ + namespace cuda{ + + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. + template + __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //generate a pointer to shared memory (size will be specified as a kernel parameter) + extern __shared__ float s_vote[]; + + //calculate the start point for this block + int bxi = blockIdx.x * blockDim.x; + + //calculate the width of the shared memory block + int swidth = 2 * rmax + blockDim.x; + + // calculate the 2D coordinates for this current thread. + int xi = bxi + threadIdx.x; + int yi = blockIdx.y; + + // convert 2D coordinates to 1D + int i = yi * x + xi; + + // calculate the voting direction based on the grtadient direction + float theta = gpuGrad[2*i]; + + //initialize the vote direction to zero + gpuDir[i] = 0; + + // define a local variable to maximum value of the vote image in the voting area for this voter + float max = 0; + + // define two local variables for the x and y coordinations where the maximum happened + int id_x = 0; + int id_y = 0; + + // compute the size of window which will be checked for finding the voting area for this voter + unsigned int x_table = 2*rmax +1; + unsigned int rmax_sq = rmax * rmax; + int r = (int)rmax; + int tx_rmax = threadIdx.x + rmax; + int bxs = bxi - rmax; + + for(int yr = -r; yr <= r; yr++){ + + //copy the portion of the image necessary for this block to shared memory + __syncthreads(); + stim::cuda::sharedMemcpy_tex2D(s_vote, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim); + __syncthreads(); + + //if the current thread is outside of the image, it doesn't have to be computed + if(xi < x && yi < y){ + + for(int xr = -r; xr <= r; xr++){ + + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; + + // calculate the angle between the voter and the current pixel in x and y directions + float atan_angle = gpuTable[ind_t]; + + + // calculate the voting direction based on the grtadient direction + int idx_share_update = xr + tx_rmax ; + float share_vote = s_vote[idx_share_update]; + + // check if the current pixel is located in the voting area of this voter. + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) max) { + + max = share_vote; + id_x = xr; + id_y = yr; + } + } + } + } + } + + + //float new_angle = atan2(dy, dx); + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); + + float new_angle = gpuTable[ind_m]; + + gpuDir[i] = new_angle; + + } + + // this kernel updates the gradient direction by the calculated voting direction. + template + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){ + + //************ when the number of threads are (1024,1) ************* + + // calculate the 2D coordinates for this current thread. + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y; + // convert 2D coordinates to 1D + int i = yi * x + xi; + + + //update the gradient image with the vote direction + gpuGrad[2*i] = gpuDir[i]; + } + + + template + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //get the number of pixels in the image + unsigned int pixels = x * y; + unsigned int bytes = sizeof(T) * pixels; + + + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(max_threads, 1); + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); + + // Allocate CUDA array in device memory + + //define a channel descriptor for a single 32-bit channel + cudaChannelFormatDesc channelDesc = + cudaCreateChannelDesc(32, 0, 0, 0, + cudaChannelFormatKindFloat); + cudaArray* cuArray; //declare the cuda array + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array + + // Copy the image data from global memory to the array + cudaMemcpyToArray(cuArray, 0, 0, gpuVote, bytes, + cudaMemcpyDeviceToDevice); + + // Specify texture + struct cudaResourceDesc resDesc; //create a resource descriptor + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type + resDesc.res.array.array = cuArray; //add a pointer to the cuda array + + // Specify texture object parameters + struct cudaTextureDesc texDesc; //create a texture descriptor + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) + texDesc.addressMode[1] = cudaAddressModeWrap; + texDesc.filterMode = cudaFilterModePoint; //use linear filtering + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) + texDesc.normalizedCoords = 0; //not using normalized coordinates + + // Create texture object + cudaTextureObject_t texObj = 0; + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); + + // specify share memory + unsigned int share_bytes = (2*rmax + threads.x)*(1)*4; + + // allocate space on the GPU for the updated vote direction + T* gpuDir; + cudaMalloc(&gpuDir, bytes); + + //call the kernel to calculate the new voting direction + cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, texObj, gpuGrad, gpuTable, phi, rmax, x , y); + + //call the kernel to update the gradient direction + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); + + + //free allocated memory + cudaFree(gpuDir); + + } + + + template + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + //calculate the number of bytes in the atan2 table + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); + + //allocate space on the GPU for the Vote Image + T* gpuVote; + cudaMalloc(&gpuVote, bytes); + + //copy the input vote image to the GPU + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the input Gradient image + T* gpuGrad; + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); + + //copy the Gradient data to the GPU + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the atan2 table + T* gpuTable; + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); + + //copy the atan2 values to the GPU + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); + + + //call the GPU version of the update direction function + gpu_update_dir(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + + //copy the new gradient image back to the CPU + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; + + //free allocated memory + cudaFree(gpuTable); + cudaFree(gpuVote); + cudaFree(gpuGrad); + } + + } +} + + + +#endif \ No newline at end of file diff --git a/stim/cuda/ivote/vote.cuh b/stim/cuda/ivote/vote.cuh new file mode 100644 index 0000000..7e98894 --- /dev/null +++ b/stim/cuda/ivote/vote.cuh @@ -0,0 +1,187 @@ +#ifndef STIM_CUDA_VOTE_H +#define STIM_CUDA_VOTE_H + + +# include +# include +#include +#include + + +namespace stim{ + namespace cuda{ + + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area + template + __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //generate a pointer to shared memory (size will be specified as a kernel parameter) + extern __shared__ float2 s_grad[]; + + //calculate the start point for this block + int bxi = blockIdx.x * blockDim.x; + + //calculate the width of the shared memory block + int swidth = 2 * rmax + blockDim.x; + + // calculate the 2D coordinates for this current thread. + int xi = bxi + threadIdx.x; + int yi = blockIdx.y; + // convert 2D coordinates to 1D + int i = yi * x + xi; + + + // define a local variable to sum the votes from the voters + float sum = 0; + + // compute the size of window which will be checked for finding the proper voters for this pixel + unsigned int x_table = 2*rmax +1; + + unsigned int rmax_sq = rmax * rmax; + int r = (int)rmax; + int tx_rmax = threadIdx.x + rmax; + int bxs = bxi - rmax; + + + for(int yr = -r; yr <= r; yr++){ + + //copy the portion of the image necessary for this block to shared memory + __syncthreads(); + stim::cuda::sharedMemcpy_tex2D(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim); + __syncthreads(); + + //if the current thread is outside of the image, it doesn't have to be computed + if(xi < x && yi < y){ + + for(int xr = -r; xr <= r; xr++){ + + //find the location of this voter in the atan2 table + unsigned int id_t = (yr + rmax) * x_table + xr + rmax; + + // calculate the angle between the pixel and the current voter in x and y directions + float atan_angle = gpuTable[id_t]; + + + // calculate the voting direction based on the grtadient direction + int idx_share = xr + tx_rmax ; + float2 g = s_grad[idx_share]; + float theta = g.x; + + // check if the current voter is located in the voting area of this pixel. + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //get the number of pixels in the image + unsigned int pixels = x * y; + unsigned int bytes = sizeof(T) * pixels; + + + unsigned int max_threads = stim::maxThreadsPerBlock(); + //unsigned int thread_dim = sqrt(max_threads); + dim3 threads(max_threads, 1); + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); + + // Allocate CUDA array in device memory + + //define a channel descriptor for a single 32-bit channel + cudaChannelFormatDesc channelDesc = + cudaCreateChannelDesc(32, 32, 0, 0, + cudaChannelFormatKindFloat); + cudaArray* cuArray; //declare the cuda array + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array + + // Copy the image data from global memory to the array + cudaMemcpyToArray(cuArray, 0, 0, gpuGrad, bytes*2, + cudaMemcpyDeviceToDevice); + + // Specify texture + struct cudaResourceDesc resDesc; //create a resource descriptor + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type + resDesc.res.array.array = cuArray; //add a pointer to the cuda array + + // Specify texture object parameters + struct cudaTextureDesc texDesc; //create a texture descriptor + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) + texDesc.addressMode[1] = cudaAddressModeWrap; + texDesc.filterMode = cudaFilterModePoint; //use linear filtering + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) + texDesc.normalizedCoords = 0; //not using normalized coordinates + + // Create texture object + cudaTextureObject_t texObj = 0; + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); + + // specify share memory + unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4; + + + //call the kernel to do the voting + + cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y); + + } + + + template + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + //calculate the number of bytes in the atan2 table + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); + + //allocate space on the GPU for the Vote Image + T* gpuVote; + cudaMalloc(&gpuVote, bytes); + + //allocate space on the GPU for the input Gradient image + T* gpuGrad; + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); + + //copy the Gradient Magnitude data to the GPU + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the atan2 table + T* gpuTable; + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); + + //copy the atan2 values to the GPU + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); + + //cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table ); + + + //call the GPU version of the vote calculation function + gpu_vote(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + + //copy the Vote Data back to the CPU + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; + + //free allocated memory + cudaFree(gpuTable); + cudaFree(gpuVote); + cudaFree(gpuGrad); + } + + } +} + + + +#endif \ No newline at end of file diff --git a/stim/cuda/local_max.cuh b/stim/cuda/local_max.cuh deleted file mode 100644 index 5083952..0000000 --- a/stim/cuda/local_max.cuh +++ /dev/null @@ -1,151 +0,0 @@ -#ifndef STIM_CUDA_LOCAL_MAX_H -#define STIM_CUDA_LOCAL_MAX_H - - -# include -# include -# include -# include - -namespace stim{ - namespace cuda{ - - - // this kernel calculates the local maximum for finding the cell centers - template - __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ - - // calculate the 2D coordinates for this current thread. - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y; - // convert 2D coordinates to 1D - int i = yi * x + xi; - - - - //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. - int xl = xi - conn; - int yl = yi - conn; - - // use zero for the lowest limits if the xi or yi is less than conn. - if (xi <= conn) - xl = 0; - if (yi <= conn) - yl = 0; - - //calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. - int xh = xi + conn; - int yh = yi + conn; - - // use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn. - if (xi >= x - conn) - xh = x; - if (yi>= y - conn) - yh = y; - - // calculate the limits for finding the local maximum location in the connected neighbors for the current pixel - int n_l = yl * x + xl; - int n_h = yh * x + xh; - - //initial the centers image to zero - gpuCenters[i] = 0; - - - int n = n_l; - - float l_value = 0; - - if (i < x * y) - - // check if the vote value for this pixel is greater than threshold, so this pixel may be a local max. - if (gpuVote[i]>final_t){ - - // compare the vote value for this pixel with the vote values with its neighbors. - while (n<=n_h){ - - // check if this vote value is a local max in its neighborhood. - if (gpuVote[i] < gpuVote[n]){ - l_value = 0; - n =n_h+1; - } - else if (n == n_h){ - l_value = 1; - n = n+1; - } - // check if the current neighbor is the last one at the current row - else if ((n - n_l - 2*conn)% x ==0){ - n = n + x - 2*conn -1; - n ++; - } - else - n ++; - } - // set the center value for this pixel to high if it's a local max ,and to low if not. - gpuCenters[i] = l_value ; - } - - } - - - - template - void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ - - - - - unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(max_threads, 1); - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); - - - - //call the kernel to find the local maximum. - cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); - - - } - - - - template - void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ - - - //calculate the number of bytes in the array - unsigned int bytes = x * y * sizeof(T); - - // allocate space on the GPU for the detected cell centes - T* gpuCenters; - cudaMalloc(&gpuCenters, bytes); - - - //allocate space on the GPU for the input Vote Image - T* gpuVote; - cudaMalloc(&gpuVote, bytes); - - - //copy the Vote image data to the GPU - HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); - - - //call the GPU version of the local max function - gpu_local_max(gpuCenters, gpuVote, final_t, conn, x, y); - - - //copy the cell centers data to the CPU - cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ; - - - //free allocated memory - cudaFree(gpuCenters); - cudaFree(gpuVote); - cudaFree(gpuGrad); - } - - } -} - - - -#endif \ No newline at end of file diff --git a/stim/cuda/templates/conv2.cuh b/stim/cuda/templates/conv2.cuh index a40fdd0..045e06d 100644 --- a/stim/cuda/templates/conv2.cuh +++ b/stim/cuda/templates/conv2.cuh @@ -3,8 +3,7 @@ #include #include -#include -#include +#include #include #include diff --git a/stim/cuda/templates/gaussian_blur.cuh b/stim/cuda/templates/gaussian_blur.cuh index 0b060f0..f4485ae 100644 --- a/stim/cuda/templates/gaussian_blur.cuh +++ b/stim/cuda/templates/gaussian_blur.cuh @@ -58,6 +58,7 @@ namespace stim{ //copy the kernel to the GPU T* gpuKernel0; + HANDLE_ERROR(cudaMalloc(&gpuKernel0, kwidth * sizeof(T))); HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); //perform the gaussian blur as a separable convolution diff --git a/stim/cuda/templates/gradient.cuh b/stim/cuda/templates/gradient.cuh index 95931dc..c55ec91 100644 --- a/stim/cuda/templates/gradient.cuh +++ b/stim/cuda/templates/gradient.cuh @@ -3,8 +3,7 @@ #include #include -#include -#include +#include namespace stim{ namespace cuda{ diff --git a/stim/cuda/update_dir.cuh b/stim/cuda/update_dir.cuh deleted file mode 100644 index 8522f2f..0000000 --- a/stim/cuda/update_dir.cuh +++ /dev/null @@ -1,232 +0,0 @@ -#ifndef STIM_CUDA_UPDATE_DIR_H -#define STIM_CUDA_UPDATE_DIR_H - - -# include -# include -# include -# include -#include - -namespace stim{ - namespace cuda{ - - // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. - template - __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ - - //generate a pointer to shared memory (size will be specified as a kernel parameter) - extern __shared__ float s_vote[]; - - //calculate the start point for this block - int bxi = blockIdx.x * blockDim.x; - - //calculate the width of the shared memory block - int swidth = 2 * rmax + blockDim.x; - - // calculate the 2D coordinates for this current thread. - int xi = bxi + threadIdx.x; - int yi = blockIdx.y; - - // convert 2D coordinates to 1D - int i = yi * x + xi; - - // calculate the voting direction based on the grtadient direction - float theta = gpuGrad[2*i]; - - //initialize the vote direction to zero - gpuDir[i] = 0; - - // define a local variable to maximum value of the vote image in the voting area for this voter - float max = 0; - - // define two local variables for the x and y coordinations where the maximum happened - int id_x = 0; - int id_y = 0; - - // compute the size of window which will be checked for finding the voting area for this voter - unsigned int x_table = 2*rmax +1; - unsigned int rmax_sq = rmax * rmax; - int r = (int)rmax; - int tx_rmax = threadIdx.x + rmax; - int bxs = bxi - rmax; - - for(int yr = -r; yr <= r; yr++){ - - //copy the portion of the image necessary for this block to shared memory - __syncthreads(); - stim::cuda::sharedMemcpy_tex2D(s_vote, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim); - __syncthreads(); - - //if the current thread is outside of the image, it doesn't have to be computed - if(xi < x && yi < y){ - - for(int xr = -r; xr <= r; xr++){ - - unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; - - // calculate the angle between the voter and the current pixel in x and y directions - float atan_angle = gpuTable[ind_t]; - - - // calculate the voting direction based on the grtadient direction - int idx_share_update = xr + tx_rmax ; - float share_vote = s_vote[idx_share_update]; - - // check if the current pixel is located in the voting area of this voter. - if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) max) { - - max = share_vote; - id_x = xr; - id_y = yr; - } - } - } - } - } - - - //float new_angle = atan2(dy, dx); - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); - - float new_angle = gpuTable[ind_m]; - - gpuDir[i] = new_angle; - - } - - // this kernel updates the gradient direction by the calculated voting direction. - template - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){ - - //************ when the number of threads are (1024,1) ************* - - // calculate the 2D coordinates for this current thread. - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y; - // convert 2D coordinates to 1D - int i = yi * x + xi; - - - //update the gradient image with the vote direction - gpuGrad[2*i] = gpuDir[i]; - } - - - template - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ - - //get the number of pixels in the image - unsigned int pixels = x * y; - unsigned int bytes = sizeof(T) * pixels; - - - unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(max_threads, 1); - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); - - // Allocate CUDA array in device memory - - //define a channel descriptor for a single 32-bit channel - cudaChannelFormatDesc channelDesc = - cudaCreateChannelDesc(32, 0, 0, 0, - cudaChannelFormatKindFloat); - cudaArray* cuArray; //declare the cuda array - cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array - - // Copy the image data from global memory to the array - cudaMemcpyToArray(cuArray, 0, 0, gpuVote, bytes, - cudaMemcpyDeviceToDevice); - - // Specify texture - struct cudaResourceDesc resDesc; //create a resource descriptor - memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero - resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type - resDesc.res.array.array = cuArray; //add a pointer to the cuda array - - // Specify texture object parameters - struct cudaTextureDesc texDesc; //create a texture descriptor - memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero - texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) - texDesc.addressMode[1] = cudaAddressModeWrap; - texDesc.filterMode = cudaFilterModePoint; //use linear filtering - texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) - texDesc.normalizedCoords = 0; //not using normalized coordinates - - // Create texture object - cudaTextureObject_t texObj = 0; - cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); - - // specify share memory - unsigned int share_bytes = (2*rmax + threads.x)*(1)*4; - - // allocate space on the GPU for the updated vote direction - T* gpuDir; - cudaMalloc(&gpuDir, bytes); - - //call the kernel to calculate the new voting direction - cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, texObj, gpuGrad, gpuTable, phi, rmax, x , y); - - //call the kernel to update the gradient direction - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); - - - //free allocated memory - cudaFree(gpuDir); - - } - - - template - void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ - - //calculate the number of bytes in the array - unsigned int bytes = x * y * sizeof(T); - - //calculate the number of bytes in the atan2 table - unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); - - //allocate space on the GPU for the Vote Image - T* gpuVote; - cudaMalloc(&gpuVote, bytes); - - //copy the input vote image to the GPU - HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); - - //allocate space on the GPU for the input Gradient image - T* gpuGrad; - HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); - - //copy the Gradient data to the GPU - HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); - - //allocate space on the GPU for the atan2 table - T* gpuTable; - HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); - - //copy the atan2 values to the GPU - HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); - - - //call the GPU version of the update direction function - gpu_update_dir(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); - - - //copy the new gradient image back to the CPU - cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; - - //free allocated memory - cudaFree(gpuTable); - cudaFree(gpuVote); - cudaFree(gpuGrad); - } - - } -} - - - -#endif \ No newline at end of file diff --git a/stim/cuda/vote.cuh b/stim/cuda/vote.cuh deleted file mode 100644 index 9abcb04..0000000 --- a/stim/cuda/vote.cuh +++ /dev/null @@ -1,188 +0,0 @@ -#ifndef STIM_CUDA_VOTE_H -#define STIM_CUDA_VOTE_H - - -# include -# include -# include -# include -#include - - -namespace stim{ - namespace cuda{ - - // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area - template - __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ - - //generate a pointer to shared memory (size will be specified as a kernel parameter) - extern __shared__ float2 s_grad[]; - - //calculate the start point for this block - int bxi = blockIdx.x * blockDim.x; - - //calculate the width of the shared memory block - int swidth = 2 * rmax + blockDim.x; - - // calculate the 2D coordinates for this current thread. - int xi = bxi + threadIdx.x; - int yi = blockIdx.y; - // convert 2D coordinates to 1D - int i = yi * x + xi; - - - // define a local variable to sum the votes from the voters - float sum = 0; - - // compute the size of window which will be checked for finding the proper voters for this pixel - unsigned int x_table = 2*rmax +1; - - unsigned int rmax_sq = rmax * rmax; - int r = (int)rmax; - int tx_rmax = threadIdx.x + rmax; - int bxs = bxi - rmax; - - - for(int yr = -r; yr <= r; yr++){ - - //copy the portion of the image necessary for this block to shared memory - __syncthreads(); - stim::cuda::sharedMemcpy_tex2D(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim); - __syncthreads(); - - //if the current thread is outside of the image, it doesn't have to be computed - if(xi < x && yi < y){ - - for(int xr = -r; xr <= r; xr++){ - - //find the location of this voter in the atan2 table - unsigned int id_t = (yr + rmax) * x_table + xr + rmax; - - // calculate the angle between the pixel and the current voter in x and y directions - float atan_angle = gpuTable[id_t]; - - - // calculate the voting direction based on the grtadient direction - int idx_share = xr + tx_rmax ; - float2 g = s_grad[idx_share]; - float theta = g.x; - - // check if the current voter is located in the voting area of this pixel. - if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) - void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ - - //get the number of pixels in the image - unsigned int pixels = x * y; - unsigned int bytes = sizeof(T) * pixels; - - - unsigned int max_threads = stim::maxThreadsPerBlock(); - //unsigned int thread_dim = sqrt(max_threads); - dim3 threads(max_threads, 1); - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); - - // Allocate CUDA array in device memory - - //define a channel descriptor for a single 32-bit channel - cudaChannelFormatDesc channelDesc = - cudaCreateChannelDesc(32, 32, 0, 0, - cudaChannelFormatKindFloat); - cudaArray* cuArray; //declare the cuda array - cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array - - // Copy the image data from global memory to the array - cudaMemcpyToArray(cuArray, 0, 0, gpuGrad, bytes*2, - cudaMemcpyDeviceToDevice); - - // Specify texture - struct cudaResourceDesc resDesc; //create a resource descriptor - memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero - resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type - resDesc.res.array.array = cuArray; //add a pointer to the cuda array - - // Specify texture object parameters - struct cudaTextureDesc texDesc; //create a texture descriptor - memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero - texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) - texDesc.addressMode[1] = cudaAddressModeWrap; - texDesc.filterMode = cudaFilterModePoint; //use linear filtering - texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) - texDesc.normalizedCoords = 0; //not using normalized coordinates - - // Create texture object - cudaTextureObject_t texObj = 0; - cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); - - // specify share memory - unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4; - - - //call the kernel to do the voting - - cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y); - - } - - - template - void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ - - //calculate the number of bytes in the array - unsigned int bytes = x * y * sizeof(T); - - //calculate the number of bytes in the atan2 table - unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); - - //allocate space on the GPU for the Vote Image - T* gpuVote; - cudaMalloc(&gpuVote, bytes); - - //allocate space on the GPU for the input Gradient image - T* gpuGrad; - HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); - - //copy the Gradient Magnitude data to the GPU - HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); - - //allocate space on the GPU for the atan2 table - T* gpuTable; - HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); - - //copy the atan2 values to the GPU - HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); - - //cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table ); - - - //call the GPU version of the vote calculation function - gpu_vote(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); - - - //copy the Vote Data back to the CPU - cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; - - //free allocated memory - cudaFree(gpuTable); - cudaFree(gpuVote); - cudaFree(gpuGrad); - } - - } -} - - - -#endif \ No newline at end of file diff --git a/stim/image/image.h b/stim/image/image.h index 0db4d91..8bbb507 100644 --- a/stim/image/image.h +++ b/stim/image/image.h @@ -174,7 +174,7 @@ public: /// Returns the maximum pixel value in the image - T max(){ + T maxv(){ float max = 0; unsigned long N = width() * height(); //get the number of pixels @@ -190,7 +190,7 @@ public: } /// Returns the minimum pixel value in the image - T min(){ + T minv(){ float min = 0; unsigned long N = width() * height(); //get the number of pixels diff --git a/stim/math/vector.h b/stim/math/vector.h index b2a7db7..7833f82 100644 --- a/stim/math/vector.h +++ b/stim/math/vector.h @@ -5,7 +5,7 @@ #include #include #include -#include "../cuda/callable.h" +#include "../cuda/cudatools/callable.h" namespace stim { -- libgit2 0.21.4