From 13fe3c84bbffad233bbd8728590c870364472ebc Mon Sep 17 00:00:00 2001
From: laila
Date: Thu, 27 Aug 2015 13:06:29 -0500
Subject: [PATCH] update the stimlib by adding some gpu-functions (abs, cart2polar, multiply, down_sample, gaussian_blur, gradient, local_max, sharedmem, update_dir, vote)

---
 stim/cuda/array_abs.cuh        |  59 ++++++
 stim/cuda/array_cart2polar.cuh |  66 +++++++
 stim/cuda/array_multiply.cuh   |  64 ++++++
 stim/cuda/arraymath.cuh        |  16 ++
 stim/cuda/down_sample.cuh      | 101 ++++++++++
 stim/cuda/gaussian_blur.cuh    | 243 ++++++++++++++++++++
 stim/cuda/gradient.cuh         | 115 +++++++++++
 stim/cuda/local_max.cuh        | 151 ++++++++++++++
 stim/cuda/sharedmem.cuh        |  42 ++++
 stim/cuda/update_dir.cuh       | 232 +++++++++++++++++++
 stim/cuda/vote.cuh             | 188 ++++++++++++++++
 11 files changed, 1277 insertions(+), 0 deletions(-)
 create mode 100644 stim/cuda/array_abs.cuh
 create mode 100644 stim/cuda/array_cart2polar.cuh
 create mode 100644 stim/cuda/array_multiply.cuh
 create mode 100644 stim/cuda/arraymath.cuh
 create mode 100644 stim/cuda/down_sample.cuh
 create mode 100644 stim/cuda/gaussian_blur.cuh
 create mode 100644 stim/cuda/gradient.cuh
 create mode 100644 stim/cuda/local_max.cuh
 create mode 100644 stim/cuda/sharedmem.cuh
 create mode 100644 stim/cuda/update_dir.cuh
 create mode 100644 stim/cuda/vote.cuh

diff --git a/stim/cuda/array_abs.cuh b/stim/cuda/array_abs.cuh
new file mode 100644
index 0000000..fc4d176
--- /dev/null
+++ b/stim/cuda/array_abs.cuh
@@ -0,0 +1,59 @@
+#ifndef STIM_CUDA_ARRAY_ABS_H
+#define STIM_CUDA_ARRAY_ABS_H
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void cuda_abs(T* a, unsigned int N){
+
+            //calculate the 1D index for this thread
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if(i < N)
+                a[i] = abs(a[i]);
+        }
+
+        template<typename T>
+        void gpu_abs(T* a, unsigned int N){
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks (round up so all N elements are covered)
+            int blocks = N / threads + (N % threads == 0 ? 0 : 1);
+
+            //call the kernel to compute the absolute value
+            cuda_abs <<< blocks, threads >>>(a, N);
+
+        }
+
+        template<typename T>
+        void cpu_abs(T* a, unsigned int N){
+
+            //calculate the number of bytes in the array
+            unsigned int bytes = N * sizeof(T);
+
+            //allocate memory on the GPU for the array
+            T* gpuA;
+            HANDLE_ERROR( cudaMalloc(&gpuA, bytes) );
+
+            //copy the array to the GPU
+            HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_abs(gpuA, N);
+
+            //copy the array back to the CPU
+            HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpuA);
+
+        }
+
+    }   //end namespace cuda
+}   //end namespace stim
+
+#endif
\ No newline at end of file
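A note on the pattern above, since every file in this patch repeats it: the cpu_* wrapper owns all host/device transfers, while the gpu_* entry point assumes device pointers. A minimal host-side driver is sketched below (hypothetical; assumes array_abs.cuh and the stim device/error headers are on the include path):

    #include <vector>
    #include <stim/cuda/array_abs.cuh>

    int main(){
        std::vector<float> a(1024, -0.5f);                      //host data, all negative
        stim::cuda::cpu_abs(a.data(), (unsigned int)a.size());  //compute |a[i]| in place on the GPU
        return 0;                                               //a now holds 0.5f everywhere
    }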
diff --git a/stim/cuda/array_cart2polar.cuh b/stim/cuda/array_cart2polar.cuh
new file mode 100644
index 0000000..8889b03
--- /dev/null
+++ b/stim/cuda/array_cart2polar.cuh
@@ -0,0 +1,66 @@
+#ifndef STIM_CUDA_ARRAY_CART2POLAR_H
+#define STIM_CUDA_ARRAY_CART2POLAR_H
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void cuda_cart2polar(T* a, unsigned int N){
+
+            //calculate the 1D index for this thread
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if(i < N){
+                float x = a[i * 2 + 0];
+                float y = a[i * 2 + 1];
+                float theta = atan2(y, x);
+                float r = sqrt(x * x + y * y);
+                a[i * 2 + 0] = theta;
+                a[i * 2 + 1] = r;
+            }
+        }
+
+        template<typename T>
+        void gpu_cart2polar(T* gpuGrad, unsigned int N){
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks
+            int blocks = N / threads + (N % threads == 0 ? 0 : 1);
+
+            //call the kernel to convert each (x, y) pair to (theta, r)
+            cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N);
+
+        }
+
+        template<typename T>
+        void cpu_cart2polar(T* a, unsigned int N){
+
+            //calculate the number of bytes in the array (two channels per element)
+            unsigned int bytes = N * sizeof(T) * 2;
+
+            //allocate memory on the GPU for the array
+            T* gpuA;
+            HANDLE_ERROR( cudaMalloc(&gpuA, bytes) );
+
+            //copy the array to the GPU
+            HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_cart2polar(gpuA, N);
+
+            //copy the array back to the CPU
+            HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpuA);
+
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/array_multiply.cuh b/stim/cuda/array_multiply.cuh
new file mode 100644
index 0000000..add19ba
--- /dev/null
+++ b/stim/cuda/array_multiply.cuh
@@ -0,0 +1,64 @@
+#ifndef STIM_CUDA_ARRAY_MULTIPLY_H
+#define STIM_CUDA_ARRAY_MULTIPLY_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void cuda_multiply(T* lhs, T rhs, unsigned int N){
+
+            //calculate the 1D index for this thread
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if(i < N)
+                lhs[i] *= rhs;
+        }
+
+        template<typename T>
+        void gpu_multiply(T* lhs, T rhs, unsigned int N){
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks
+            int blocks = N / threads + (N % threads == 0 ? 0 : 1);
+
+            //call the kernel to do the multiplication
+            cuda_multiply <<< blocks, threads >>>(lhs, rhs, N);
+
+        }
+
+        template<typename T>
+        void cpu_multiply(T* lhs, T rhs, unsigned int N){
+
+            //calculate the number of bytes in the array
+            unsigned int bytes = N * sizeof(T);
+
+            //allocate memory on the GPU for the array
+            T* gpuLHS;
+            HANDLE_ERROR( cudaMalloc(&gpuLHS, bytes) );
+
+            //copy the array to the GPU
+            HANDLE_ERROR( cudaMemcpy(gpuLHS, lhs, bytes, cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_multiply(gpuLHS, rhs, N);
+
+            //copy the array back to the CPU
+            HANDLE_ERROR( cudaMemcpy(lhs, gpuLHS, bytes, cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpuLHS);
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
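cuda_cart2polar treats the input as N interleaved (x, y) pairs and overwrites each pair with (theta, r), which is why cpu_cart2polar transfers N * sizeof(T) * 2 bytes. A CPU reference for one pair, useful for validating the kernel (values are illustrative):

    #include <cmath>
    #include <cstdio>

    int main(){
        float a[2] = {3.0f, 4.0f};                  //interleaved: a[0] = x, a[1] = y
        float theta = atan2f(a[1], a[0]);           //angle, matching the kernel
        float r = sqrtf(a[0]*a[0] + a[1]*a[1]);     //magnitude, matching the kernel
        printf("theta = %f, r = %f\n", theta, r);   //theta ~= 0.9273, r = 5
        return 0;
    }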
diff --git a/stim/cuda/arraymath.cuh b/stim/cuda/arraymath.cuh
new file mode 100644
index 0000000..b5dcf20
--- /dev/null
+++ b/stim/cuda/arraymath.cuh
@@ -0,0 +1,16 @@
+#ifndef STIM_CUDA_ARRAYMATH_H
+#define STIM_CUDA_ARRAYMATH_H
+
+#include <stim/cuda/array_abs.cuh>
+#include <stim/cuda/array_multiply.cuh>
+#include <stim/cuda/array_cart2polar.cuh>
+
+namespace stim{
+    namespace cuda{
+
+    }
+}
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/down_sample.cuh b/stim/cuda/down_sample.cuh
new file mode 100644
index 0000000..73081e3
--- /dev/null
+++ b/stim/cuda/down_sample.cuh
@@ -0,0 +1,101 @@
+#ifndef STIM_CUDA_DOWN_SAMPLE_H
+#define STIM_CUDA_DOWN_SAMPLE_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+#include <stim/cuda/gaussian_blur.cuh>
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
+
+            //the sampling interval is the reciprocal of the resize parameter
+            unsigned int sigma_ds = 1/resize;
+            unsigned int x_ds = (x/sigma_ds + (x % sigma_ds == 0 ? 0 : 1));
+            unsigned int y_ds = (y/sigma_ds + (y % sigma_ds == 0 ? 0 : 1));
+
+            // calculate the 2D coordinates for this current thread.
+            int xi = blockIdx.x * blockDim.x + threadIdx.x;
+            int yi = blockIdx.y;
+            // convert 2D coordinates to 1D
+            int i = yi * x_ds + xi;
+
+            if(xi < x_ds && yi < y_ds){
+
+                //sample the original image at intervals of sigma_ds pixels
+                int x_org = xi * sigma_ds;
+                int y_org = yi * sigma_ds;
+                int i_org = y_org * x + x_org;
+                gpuI[i] = gpuI0[i_org];
+            }
+
+        }
+
+        /// Blurs and downsamples a 2D image stored on the GPU
+        template<typename T>
+        void gpu_down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
+
+            unsigned int sigma_ds = 1/resize;
+            unsigned int x_ds = (x/sigma_ds + (x % sigma_ds == 0 ? 0 : 1));
+            unsigned int y_ds = (y/sigma_ds + (y % sigma_ds == 0 ? 0 : 1));
+
+            //get the number of pixels in the downsampled image
+            unsigned int pixels_ds = x_ds * y_ds;
+
+            unsigned int max_threads = stim::maxThreadsPerBlock();
+            dim3 threads(max_threads, 1);
+            dim3 blocks(x_ds/threads.x + (x_ds % threads.x == 0 ? 0 : 1), y_ds);
+
+            //blur the original image to prevent aliasing before resampling
+            stim::cuda::gpu_gaussian_blur_2d<T>(gpuI0, (T)sigma_ds, x, y);
+
+            //resample the image
+            down_sample <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
+
+        }
+
+        /// Blurs and downsamples a 2D image stored on the CPU
+        template<typename T>
+        void cpu_down_sample(T* re_img, T* image, T resize, unsigned int x, unsigned int y){
+
+            //get the number of pixels in the image
+            unsigned int pixels = x * y;
+            unsigned int bytes = sizeof(T) * pixels;
+
+            unsigned int sigma_ds = 1/resize;
+            unsigned int x_ds = (x/sigma_ds + (x % sigma_ds == 0 ? 0 : 1));
+            unsigned int y_ds = (y/sigma_ds + (y % sigma_ds == 0 ? 0 : 1));
+            unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
+
+            //allocate space on the GPU for the original image
+            T* gpuI0;
+            HANDLE_ERROR( cudaMalloc(&gpuI0, bytes) );
+
+            //copy the image data to the GPU
+            HANDLE_ERROR( cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice) );
+
+            //allocate space on the GPU for the downsampled image
+            T* gpuI;
+            HANDLE_ERROR( cudaMalloc(&gpuI, bytes_ds) );
+
+            //run the GPU-based version of the algorithm
+            gpu_down_sample(gpuI, gpuI0, resize, x, y);
+
+            //copy the downsampled image back to the CPU
+            HANDLE_ERROR( cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpuI0);
+            cudaFree(gpuI);
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
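The downsampled dimensions come from a ceiling division by sigma_ds = 1/resize, so edge pixels are kept when the image size is not a multiple of the sampling interval. The host-side arithmetic, spelled out (sizes are illustrative):

    #include <cstdio>

    int main(){
        float resize = 0.25f;                                   //keep 1 of every 4 pixels
        unsigned int x = 1001, y = 600;
        unsigned int sigma_ds = (unsigned int)(1.0f / resize);  //sampling interval = 4
        unsigned int x_ds = x / sigma_ds + (x % sigma_ds == 0 ? 0 : 1);  //= 251
        unsigned int y_ds = y / sigma_ds + (y % sigma_ds == 0 ? 0 : 1);  //= 150
        printf("%u x %u -> %u x %u\n", x, y, x_ds, y_ds);
        return 0;
    }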
diff --git a/stim/cuda/gaussian_blur.cuh b/stim/cuda/gaussian_blur.cuh
new file mode 100644
index 0000000..d510bf9
--- /dev/null
+++ b/stim/cuda/gaussian_blur.cuh
@@ -0,0 +1,243 @@
+#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
+#define STIM_CUDA_GAUSSIAN_BLUR_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+#include <stim/cuda/sharedmem.cuh>
+
+#define pi 3.14159
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
+
+            //generate a pointer to shared memory (size will be specified as a kernel parameter)
+            extern __shared__ T s[];
+
+            int kr = sigma * 4;     //calculate the kernel radius
+
+            //get a pointer to the gaussian in shared memory
+            T* g = (T*)&s[blockDim.x + 2 * kr];
+
+            //calculate the start point for this block
+            int bxi = blockIdx.x * blockDim.x;
+            int byi = blockIdx.y;
+
+            //copy the portion of the image necessary for this block to shared memory
+            stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
+
+            //calculate the thread index
+            int ti = threadIdx.x;
+
+            //calculate the spatial coordinate for this thread
+            int xi = bxi + ti;
+
+            //pre-compute the gaussian values for each kernel point
+            T a = 1.0 / (sigma * sqrt(2 * pi));
+            T c = - 1.0 / (2*sigma*sigma);
+            int ki;
+
+            //use the first 2kr+1 threads to evaluate a gaussian and store the result
+            if(ti < 2 * kr + 1){
+                ki = ti - kr;
+                g[ti] = a * exp((ki*ki) * c);
+            }
+
+            //make sure that all writing to shared memory is done before continuing
+            __syncthreads();
+
+            //return if the current pixel is outside of the image
+            if(xi >= x || byi >= y)
+                return;
+
+            //calculate the coordinates of the current thread in shared memory
+            int si = ti + kr;
+
+            T sum = 0;      //running weighted sum across the kernel
+
+            //for each element of the kernel
+            for(int ki = -kr; ki <= kr; ki++){
+                sum += g[ki + kr] * s[si + ki];
+            }
+
+            //calculate the 1D image index for this thread
+            unsigned int i = byi * x + xi;
+
+            //output the result to global memory
+            out[i] = sum;
+        }
+
+        template<typename T>
+        __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
+
+            //generate a pointer to shared memory (size will be specified as a kernel parameter)
+            extern __shared__ T s[];
+
+            int kr = sigma * 4;     //calculate the kernel radius
+
+            //get a pointer to the gaussian in shared memory
+            T* g = (T*)&s[blockDim.y + 2 * kr];
+
+            //calculate the start point for this block
+            int bxi = blockIdx.x;
+            int byi = blockIdx.y * blockDim.y;
+
+            //copy the portion of the image necessary for this block to shared memory
+            stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
+
+            //calculate the thread index
+            int ti = threadIdx.y;
+
+            //calculate the spatial coordinate for this thread
+            int yi = byi + ti;
+
+            //pre-compute the gaussian values for each kernel point
+            T a = 1.0 / (sigma * sqrt(2 * pi));
+            T c = - 1.0 / (2*sigma*sigma);
+            int ki;
+
+            //use the first 2kr+1 threads to evaluate a gaussian and store the result
+            if(ti < 2 * kr + 1){
+                ki = ti - kr;
+                g[ti] = a * exp((ki*ki) * c);
+            }
+
+            //make sure that all writing to shared memory is done before continuing
+            __syncthreads();
+
+            //return if the current pixel is outside of the image
+            if(bxi >= x || yi >= y)
+                return;
+
+            //calculate the coordinates of the current thread in shared memory
+            int si = ti + kr;
+
+            T sum = 0;      //running weighted sum across the kernel
+
+            //for each element of the kernel
+            for(int ki = -kr; ki <= kr; ki++){
+                sum += g[ki + kr] * s[si + ki];
+            }
+
+            //calculate the 1D image index for this thread
+            unsigned int i = yi * x + bxi;
+
+            //output the result to global memory
+            out[i] = sum;
+        }
+
+        /// Applies a Gaussian blur to a 2D image stored on the GPU
+        template<typename T>
+        void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
+
+            //get the number of pixels in the image
+            unsigned int pixels = x * y;
+            unsigned int bytes = sizeof(T) * pixels;
+
+            // Allocate CUDA array in device memory
+
+            //define a channel descriptor for a single 32-bit channel
+            cudaChannelFormatDesc channelDesc =
+                    cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+            cudaArray* cuArray;                             //declare the cuda array
+            cudaMallocArray(&cuArray, &channelDesc, x, y);  //allocate the cuda array
+
+            // Copy the image data from global memory to the array
+            cudaMemcpyToArray(cuArray, 0, 0, image, bytes, cudaMemcpyDeviceToDevice);
+
+            // Specify texture
+            struct cudaResourceDesc resDesc;                //create a resource descriptor
+            memset(&resDesc, 0, sizeof(resDesc));           //set all values to zero
+            resDesc.resType = cudaResourceTypeArray;        //specify the resource descriptor type
+            resDesc.res.array.array = cuArray;              //add a pointer to the cuda array
+
+            // Specify texture object parameters
+            struct cudaTextureDesc texDesc;                 //create a texture descriptor
+            memset(&texDesc, 0, sizeof(texDesc));           //set all values in the texture descriptor to zero
+            texDesc.addressMode[0] = cudaAddressModeWrap;   //use wrapping (around the edges)
+            texDesc.addressMode[1] = cudaAddressModeWrap;
+            texDesc.filterMode = cudaFilterModePoint;       //sample the nearest texel (no interpolation)
+            texDesc.readMode = cudaReadModeElementType;     //reads data based on the element type (32-bit floats)
+            texDesc.normalizedCoords = 0;                   //not using normalized coordinates
+
+            // Create texture object
+            cudaTextureObject_t texObj = 0;
+            cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+            //get the maximum number of threads per block for the CUDA device
+            int max_threads = stim::maxThreadsPerBlock();
+            dim3 threads(max_threads, 1);
+
+            //calculate the number of blocks
+            dim3 blocks(x / threads.x + 1, y);
+
+            //calculate the shared memory used in the kernel
+            unsigned int pixel_bytes = max_threads * 4;             //bytes devoted to pixel data being processed
+            unsigned int apron_bytes = sigma * 8 * 4;               //bytes devoted to pixels outside the window
+            unsigned int gaussian_bytes = (sigma * 8 + 1) * 4;      //bytes devoted to the pre-computed Gaussian window
+            unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes;     //total number of bytes of shared memory used
+
+            //blur the image along the x-axis
+            gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
+
+            // Copy the x-blurred data from global memory to the texture
+            cudaMemcpyToArray(cuArray, 0, 0, image, bytes, cudaMemcpyDeviceToDevice);
+
+            //transpose the block and thread dimensions
+            threads.x = 1;
+            threads.y = max_threads;
+            blocks.x = x;
+            blocks.y = y / threads.y + 1;
+
+            //blur the image along the y-axis
+            gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
+
+            //free allocated resources
+            cudaDestroyTextureObject(texObj);
+            cudaFreeArray(cuArray);
+
+        }
+
+        /// Applies a Gaussian blur to a 2D image stored on the CPU
+        template<typename T>
+        void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
+
+            //get the number of pixels in the image
+            unsigned int pixels = x * y;
+            unsigned int bytes = sizeof(T) * pixels;
+
+            //allocate space on the GPU
+            T* gpuI0;
+            cudaMalloc(&gpuI0, bytes);
+
+            //copy the image data to the GPU
+            cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
+
+            //run the GPU-based version of the algorithm
+            gpu_gaussian_blur_2d(gpuI0, sigma, x, y);
+
+            //copy the image data from the device
+            cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
+
+            //free allocated memory
+            cudaFree(gpuI0);
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
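Each blur block stages blockDim.x pixels plus a 2*kr apron and a precomputed (2*kr + 1)-entry kernel in shared memory, which is where the shared_bytes formula comes from. The budget for one plausible configuration (sigma = 2, 1024 threads; all sizes assume 32-bit pixels):

    #include <cstdio>

    int main(){
        unsigned int max_threads = 1024;                //threads per block (device-dependent)
        float sigma = 2.0f;
        unsigned int kr = (unsigned int)(sigma * 4);    //kernel radius = 8
        unsigned int pixel_bytes = max_threads * 4;     //pixels owned by the block = 4096 bytes
        unsigned int apron_bytes = kr * 2 * 4;          //2*kr halo pixels = 64 bytes
        unsigned int gaussian_bytes = (kr * 2 + 1) * 4; //precomputed kernel = 68 bytes
        printf("shared bytes per block: %u\n",
               pixel_bytes + apron_bytes + gaussian_bytes);  //4228
        return 0;
    }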
diff --git a/stim/cuda/gradient.cuh b/stim/cuda/gradient.cuh
new file mode 100644
index 0000000..95931dc
--- /dev/null
+++ b/stim/cuda/gradient.cuh
@@ -0,0 +1,115 @@
+#ifndef STIM_CUDA_GRADIENT_H
+#define STIM_CUDA_GRADIENT_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){
+
+            //calculate the 1D image index for this thread
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+            //convert this to 2D pixel coordinates
+            int yi = i / x;
+            int xi = i - (yi * x);
+
+            //return if the pixel is outside of the image
+            if(xi >= x || yi >= y) return;
+
+            //calculate indices for the forward difference
+            int i_xp = yi * x + (xi + 1);
+            int i_yp = (yi + 1) * x + xi;
+
+            //use forward differences if a coordinate is zero
+            if(xi == 0)
+                out[i * 2 + 0] = in[i_xp] - in[i];
+            if(yi == 0)
+                out[i * 2 + 1] = in[i_yp] - in[i];
+
+            //calculate indices for the backward difference
+            int i_xn = yi * x + (xi - 1);
+            int i_yn = (yi - 1) * x + xi;
+
+            //use backward differences if the coordinate is at the maximum edge
+            if(xi == x-1)
+                out[i * 2 + 0] = in[i] - in[i_xn];
+            if(yi == y-1)
+                out[i * 2 + 1] = in[i] - in[i_yn];
+
+            //otherwise use central differences
+            if(xi > 0 && xi < x-1)
+                out[i * 2 + 0] = (in[i_xp] - in[i_xn]) / 2;
+
+            if(yi > 0 && yi < y-1)
+                out[i * 2 + 1] = (in[i_yp] - in[i_yn]) / 2;
+
+        }
+
+        /// Computes the 2D gradient of an image on the GPU; gpuGrad must hold two channels (dx, dy) per pixel
+        template<typename T>
+        void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){
+
+            //get the number of pixels in the image
+            unsigned int pixels = x * y;
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks
+            int blocks = pixels / threads + (pixels % threads == 0 ? 0 : 1);
+
+            //call the GPU kernel to determine the gradient
+            gradient_2d <<< blocks, threads >>>(gpuGrad, gpuI, x, y);
+
+        }
+
+        template<typename T>
+        void cpu_gradient_2d(T* out, T* in, unsigned int x, unsigned int y){
+
+            //get the number of pixels in the image
+            unsigned int pixels = x * y;
+            unsigned int bytes = pixels * sizeof(T);
+
+            //allocate space on the GPU for the input image
+            T* gpuIn;
+            HANDLE_ERROR(cudaMalloc(&gpuIn, bytes));
+
+            //copy the image data to the GPU
+            HANDLE_ERROR(cudaMemcpy(gpuIn, in, bytes, cudaMemcpyHostToDevice));
+
+            //allocate space on the GPU for the output gradient image
+            T* gpuOut;
+            HANDLE_ERROR(cudaMalloc(&gpuOut, bytes * 2));   //the output image will have two channels (x, y)
+
+            //call the GPU version of this function
+            gpu_gradient_2d(gpuOut, gpuIn, x, y);
+
+            //copy the results to the CPU
+            HANDLE_ERROR(cudaMemcpy(out, gpuOut, bytes * 2, cudaMemcpyDeviceToHost));
+
+            //free allocated memory
+            cudaFree(gpuIn);
+            cudaFree(gpuOut);
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
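gradient_2d selects a forward difference on the left/top edge, a backward difference on the right/bottom edge, and a central difference everywhere else. The same logic on a single CPU row, as a quick sanity check:

    #include <cstdio>

    int main(){
        const int x = 5;
        float in[x] = {0, 1, 4, 9, 16};
        float dx[x];
        for(int xi = 0; xi < x; xi++){
            if(xi == 0)          dx[xi] = in[xi + 1] - in[xi];           //forward difference
            else if(xi == x - 1) dx[xi] = in[xi] - in[xi - 1];           //backward difference
            else                 dx[xi] = (in[xi + 1] - in[xi - 1]) / 2; //central difference
            printf("%g ", dx[xi]);                                      //prints: 1 2 4 6 7
        }
        return 0;
    }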
diff --git a/stim/cuda/local_max.cuh b/stim/cuda/local_max.cuh
new file mode 100644
index 0000000..5083952
--- /dev/null
+++ b/stim/cuda/local_max.cuh
@@ -0,0 +1,151 @@
+#ifndef STIM_CUDA_LOCAL_MAX_H
+#define STIM_CUDA_LOCAL_MAX_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+
+namespace stim{
+    namespace cuda{
+
+        // this kernel calculates the local maxima for finding the cell centers
+        template<typename T>
+        __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
+
+            // calculate the 2D coordinates for this current thread.
+            int xi = blockIdx.x * blockDim.x + threadIdx.x;
+            int yi = blockIdx.y;
+
+            // return if this thread is outside of the image (prevents an out-of-bounds write)
+            if(xi >= x || yi >= y)
+                return;
+
+            // convert 2D coordinates to 1D
+            int i = yi * x + xi;
+
+            // calculate the lowest limit of the neighborhood for this pixel; the neighborhood size is defined by 'conn'.
+            int xl = xi - conn;
+            int yl = yi - conn;
+
+            // use zero for the lowest limits if xi or yi is less than conn.
+            if (xi <= conn)
+                xl = 0;
+            if (yi <= conn)
+                yl = 0;
+
+            // calculate the highest limit of the neighborhood for this pixel.
+            int xh = xi + conn;
+            int yh = yi + conn;
+
+            // clamp to the image width or height if the distance of xi or yi to the edge of the image is less than conn.
+            if (xi >= x - conn)
+                xh = x;
+            if (yi >= y - conn)
+                yh = y;
+
+            // calculate the limits for finding the local maximum location in the connected neighbors for the current pixel
+            int n_l = yl * x + xl;
+            int n_h = yh * x + xh;
+
+            // initialize the centers image to zero
+            gpuCenters[i] = 0;
+
+            int n = n_l;
+
+            float l_value = 0;
+
+            // check if the vote value for this pixel is greater than the threshold, so this pixel may be a local max.
+            if (gpuVote[i] > final_t){
+
+                // compare the vote value for this pixel with the vote values of its neighbors.
+                while (n <= n_h){
+
+                    // check if this vote value is a local max in its neighborhood.
+                    if (gpuVote[i] < gpuVote[n]){
+                        l_value = 0;
+                        n = n_h + 1;
+                    }
+                    else if (n == n_h){
+                        l_value = 1;
+                        n = n + 1;
+                    }
+                    // check if the current neighbor is the last one in the current row
+                    else if ((n - n_l - 2*conn) % x == 0){
+                        n = n + x - 2*conn - 1;
+                        n++;
+                    }
+                    else
+                        n++;
+                }
+
+                // set the center value for this pixel to high if it's a local max, and to low if not.
+                gpuCenters[i] = l_value;
+            }
+
+        }
+
+        template<typename T>
+        void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
+
+            unsigned int max_threads = stim::maxThreadsPerBlock();
+            dim3 threads(max_threads, 1);
+            dim3 blocks(x/threads.x + (x % threads.x == 0 ? 0 : 1), y);
+
+            //call the kernel to find the local maxima
+            cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
+
+        }
+
+        template<typename T>
+        void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
+
+            //calculate the number of bytes in the array
+            unsigned int bytes = x * y * sizeof(T);
+
+            //allocate space on the GPU for the detected cell centers
+            T* gpuCenters;
+            HANDLE_ERROR(cudaMalloc(&gpuCenters, bytes));
+
+            //allocate space on the GPU for the input vote image
+            T* gpuVote;
+            HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
+
+            //copy the vote image data to the GPU
+            HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
+
+            //call the GPU version of the local max function
+            gpu_local_max(gpuCenters, gpuVote, final_t, conn, x, y);
+
+            //copy the cell centers data back to the CPU
+            HANDLE_ERROR(cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost));
+
+            //free allocated memory
+            cudaFree(gpuCenters);
+            cudaFree(gpuVote);
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
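cpu_local_max marks a pixel as a center only if its vote value beats final_t and no neighbor within conn pixels beats it. A host-side call might look like this (sizes, threshold, and neighborhood are illustrative; the vote image would normally come from cpu_vote below):

    #include <vector>
    #include <stim/cuda/local_max.cuh>

    int main(){
        unsigned int x = 512, y = 512;
        std::vector<float> vote(x * y);      //vote image, filled elsewhere
        std::vector<float> centers(x * y);   //output: 1 at detected centers, 0 elsewhere
        float final_t = 10.0f;               //minimum vote value for a candidate center
        unsigned int conn = 4;               //half-width of the suppression neighborhood
        stim::cuda::cpu_local_max(centers.data(), vote.data(), final_t, conn, x, y);
        return 0;
    }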
diff --git a/stim/cuda/sharedmem.cuh b/stim/cuda/sharedmem.cuh
new file mode 100644
index 0000000..77de31f
--- /dev/null
+++ b/stim/cuda/sharedmem.cuh
@@ -0,0 +1,42 @@
+#ifndef STIM_CUDA_SHAREDMEM_H
+#define STIM_CUDA_SHAREDMEM_H
+
+namespace stim{
+    namespace cuda{
+
+        // Copies values from a 2D texture to shared memory, distributing the copy across all threads in the block
+        template<typename T>
+        __device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
+                unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
+                dim3 threadIdx, dim3 blockDim){
+
+            //calculate the number of iterations required for the copy
+            unsigned int xI, yI;
+            xI = X/blockDim.x + 1;  //number of iterations along X
+            yI = Y/blockDim.y + 1;  //number of iterations along Y
+
+            //for each iteration
+            for(unsigned int xi = 0; xi < xI; xi++){
+                for(unsigned int yi = 0; yi < yI; yi++){
+
+                    //calculate the index into shared memory
+                    unsigned int sx = xi * blockDim.x + threadIdx.x;
+                    unsigned int sy = yi * blockDim.y + threadIdx.y;
+
+                    //calculate the index into the texture
+                    unsigned int tx = x + sx;
+                    unsigned int ty = y + sy;
+
+                    //perform the copy
+                    if(sx < X && sy < Y)
+                        dest[sy * X + sx] = tex2D<T>(src, tx, ty);
+                }
+            }
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
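sharedMemcpy_tex2D is a cooperative copy: every thread in the block strides through the X-by-Y region, so the block size does not have to match the region size. A sketch of how a kernel would stage a one-row strip with a halo before filtering (kr is an assumed halo radius supplied by the caller):

    template<typename T>
    __global__ void example_stage(cudaTextureObject_t tex, T* out, int kr, unsigned int x, unsigned int y){
        extern __shared__ T s[];
        int bxi = blockIdx.x * blockDim.x;      //first pixel owned by this block
        //stage blockDim.x pixels plus kr pixels of halo on each side
        stim::cuda::sharedMemcpy_tex2D(s, tex, bxi - kr, blockIdx.y, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
        __syncthreads();                        //staging must finish before any thread reads s
        //... filter using s[threadIdx.x + kr] as the center sample ...
    }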
diff --git a/stim/cuda/update_dir.cuh b/stim/cuda/update_dir.cuh
new file mode 100644
index 0000000..8522f2f
--- /dev/null
+++ b/stim/cuda/update_dir.cuh
@@ -0,0 +1,232 @@
+#ifndef STIM_CUDA_UPDATE_DIR_H
+#define STIM_CUDA_UPDATE_DIR_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+#include <stim/cuda/sharedmem.cuh>
+
+namespace stim{
+    namespace cuda{
+
+        // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
+        template<typename T>
+        __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+            //generate a pointer to shared memory (size will be specified as a kernel parameter)
+            extern __shared__ float s_vote[];
+
+            //calculate the start point for this block
+            int bxi = blockIdx.x * blockDim.x;
+
+            //calculate the width of the shared memory block
+            int swidth = 2 * rmax + blockDim.x;
+
+            // calculate the 2D coordinates for this current thread.
+            int xi = bxi + threadIdx.x;
+            int yi = blockIdx.y;
+
+            // convert 2D coordinates to 1D
+            int i = yi * x + xi;
+
+            // calculate the voting direction based on the gradient direction
+            // (guarded so that threads outside the image can still reach __syncthreads below)
+            float theta = (xi < x && yi < y) ? gpuGrad[2*i] : 0;
+
+            //initialize the vote direction to zero
+            if(xi < x && yi < y)
+                gpuDir[i] = 0;
+
+            // define a local variable for the maximum vote value in the voting area of this voter
+            float max = 0;
+
+            // define two local variables for the x and y coordinates where the maximum occurred
+            int id_x = 0;
+            int id_y = 0;
+
+            // compute the size of window which will be checked for finding the voting area for this voter
+            unsigned int x_table = 2*rmax + 1;
+            unsigned int rmax_sq = rmax * rmax;
+            int r = (int)rmax;
+            int tx_rmax = threadIdx.x + rmax;
+            int bxs = bxi - rmax;
+
+            for(int yr = -r; yr <= r; yr++){
+
+                //copy the portion of the vote image necessary for this block to shared memory
+                __syncthreads();
+                stim::cuda::sharedMemcpy_tex2D(s_vote, in, bxs, yi + yr, swidth, 1, threadIdx, blockDim);
+                __syncthreads();
+
+                //if the current thread is outside of the image, it doesn't have to be computed
+                if(xi < x && yi < y){
+
+                    for(int xr = -r; xr <= r; xr++){
+
+                        unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+                        // look up the angle between the voter and the current pixel
+                        float atan_angle = gpuTable[ind_t];
+
+                        // read the vote value of the current pixel from shared memory
+                        int idx_share_update = xr + tx_rmax;
+                        float share_vote = s_vote[idx_share_update];
+
+                        // check if the current pixel is located in the voting area of this voter.
+                        if (((xr * xr + yr * yr) < rmax_sq) && (abs(atan_angle - theta) < phi)){
+
+                            // keep track of the maximum vote value and the position where it occurred
+                            if (share_vote > max) {
+                                max = share_vote;
+                                id_x = xr;
+                                id_y = yr;
+                            }
+                        }
+                    }
+                }
+            }
+
+            //look up the new direction angle toward the maximum from the atan2 table
+            unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+            float new_angle = gpuTable[ind_m];
+
+            //store the new vote direction (only for threads inside the image)
+            if(xi < x && yi < y)
+                gpuDir[i] = new_angle;
+
+        }
+
+        // this kernel updates the gradient direction with the calculated voting direction.
+        // assumes a (max_threads, 1) thread configuration
+        template<typename T>
+        __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){
+
+            // calculate the 2D coordinates for this current thread.
+            int xi = blockIdx.x * blockDim.x + threadIdx.x;
+            int yi = blockIdx.y;
+
+            //return if the pixel is outside of the image
+            if(xi >= x || yi >= y)
+                return;
+
+            // convert 2D coordinates to 1D
+            int i = yi * x + xi;
+
+            //update the gradient image with the vote direction
+            gpuGrad[2*i] = gpuDir[i];
+        }
+
+        template<typename T>
+        void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+            //get the number of pixels in the image
+            unsigned int pixels = x * y;
+            unsigned int bytes = sizeof(T) * pixels;
+
+            unsigned int max_threads = stim::maxThreadsPerBlock();
+            dim3 threads(max_threads, 1);
+            dim3 blocks(x/threads.x + (x % threads.x == 0 ? 0 : 1), y);
+
+            // Allocate CUDA array in device memory
+
+            //define a channel descriptor for a single 32-bit channel
+            cudaChannelFormatDesc channelDesc =
+                    cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+            cudaArray* cuArray;                             //declare the cuda array
+            cudaMallocArray(&cuArray, &channelDesc, x, y);  //allocate the cuda array
+
+            // Copy the vote image data from global memory to the array
+            cudaMemcpyToArray(cuArray, 0, 0, gpuVote, bytes, cudaMemcpyDeviceToDevice);
+
+            // Specify texture
+            struct cudaResourceDesc resDesc;                //create a resource descriptor
+            memset(&resDesc, 0, sizeof(resDesc));           //set all values to zero
+            resDesc.resType = cudaResourceTypeArray;        //specify the resource descriptor type
+            resDesc.res.array.array = cuArray;              //add a pointer to the cuda array
+
+            // Specify texture object parameters
+            struct cudaTextureDesc texDesc;                 //create a texture descriptor
+            memset(&texDesc, 0, sizeof(texDesc));           //set all values in the texture descriptor to zero
+            texDesc.addressMode[0] = cudaAddressModeWrap;   //use wrapping (around the edges)
+            texDesc.addressMode[1] = cudaAddressModeWrap;
+            texDesc.filterMode = cudaFilterModePoint;       //sample the nearest texel (no interpolation)
+            texDesc.readMode = cudaReadModeElementType;     //reads data based on the element type (32-bit floats)
+            texDesc.normalizedCoords = 0;                   //not using normalized coordinates
+
+            // Create texture object
+            cudaTextureObject_t texObj = 0;
+            cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+            // specify the shared memory used by the kernel (one padded row of the vote image)
+            unsigned int share_bytes = (2*rmax + threads.x)*(1)*4;
+
+            // allocate space on the GPU for the updated vote direction
+            T* gpuDir;
+            cudaMalloc(&gpuDir, bytes);
+
+            //call the kernel to calculate the new voting direction
+            cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, texObj, gpuGrad, gpuTable, phi, rmax, x, y);
+
+            //call the kernel to update the gradient direction
+            cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x, y);
+
+            //free allocated resources
+            cudaDestroyTextureObject(texObj);
+            cudaFreeArray(cuArray);
+            cudaFree(gpuDir);
+
+        }
+
+        template<typename T>
+        void cpu_update_dir(T* cpuVote, T* cpuGrad, T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+            //calculate the number of bytes in the array
+            unsigned int bytes = x * y * sizeof(T);
+
+            //calculate the number of bytes in the atan2 table
+            unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+            //allocate space on the GPU for the vote image
+            T* gpuVote;
+            cudaMalloc(&gpuVote, bytes);
+
+            //copy the input vote image to the GPU
+            HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
+
+            //allocate space on the GPU for the input gradient image
+            T* gpuGrad;
+            HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+            //copy the gradient data to the GPU
+            HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+            //allocate space on the GPU for the atan2 table
+            T* gpuTable;
+            HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+            //copy the atan2 values to the GPU
+            HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+            //call the GPU version of the update direction function
+            gpu_update_dir(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);
+
+            //copy the new gradient image back to the CPU
+            cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost);
+
+            //free allocated memory
+            cudaFree(gpuTable);
+            cudaFree(gpuVote);
+            cudaFree(gpuGrad);
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
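update_dir is one half of an iterative voting scheme: each pass accumulates votes, then re-aims every voter at the maximum it found, typically while narrowing the cone angle phi. A hypothetical driver over the cpu_* entry points defined in this patch (cpu_vote is in vote.cuh below; the phi schedule and iteration count are assumptions):

    #include <stim/cuda/vote.cuh>
    #include <stim/cuda/update_dir.cuh>

    void iterative_vote(float* cpuVote, float* cpuGrad, float* cpuTable,
                        float phi, float d_phi, unsigned int iters,
                        unsigned int rmax, unsigned int x, unsigned int y){
        for(unsigned int it = 0; it < iters; it++){
            stim::cuda::cpu_vote(cpuVote, cpuGrad, cpuTable, phi, rmax, x, y);        //accumulate votes
            stim::cuda::cpu_update_dir(cpuVote, cpuGrad, cpuTable, phi, rmax, x, y);  //re-aim the voters
            phi -= d_phi;                       //shrink the voting cone so the votes converge
        }
    }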
diff --git a/stim/cuda/vote.cuh b/stim/cuda/vote.cuh
new file mode 100644
index 0000000..9abcb04
--- /dev/null
+++ b/stim/cuda/vote.cuh
@@ -0,0 +1,188 @@
+#ifndef STIM_CUDA_VOTE_H
+#define STIM_CUDA_VOTE_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+#include <stim/cuda/sharedmem.cuh>
+
+namespace stim{
+    namespace cuda{
+
+        // this kernel calculates the vote value by adding up the gradient magnitudes of every voter whose voting area contains this pixel
+        template<typename T>
+        __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+            //generate a pointer to shared memory (size will be specified as a kernel parameter)
+            extern __shared__ float2 s_grad[];
+
+            //calculate the start point for this block
+            int bxi = blockIdx.x * blockDim.x;
+
+            //calculate the width of the shared memory block
+            int swidth = 2 * rmax + blockDim.x;
+
+            // calculate the 2D coordinates for this current thread.
+            int xi = bxi + threadIdx.x;
+            int yi = blockIdx.y;
+            // convert 2D coordinates to 1D
+            int i = yi * x + xi;
+
+            // define a local variable to sum the votes from the voters
+            float sum = 0;
+
+            // compute the size of window which will be checked for finding the proper voters for this pixel
+            unsigned int x_table = 2*rmax + 1;
+            unsigned int rmax_sq = rmax * rmax;
+            int r = (int)rmax;
+            int tx_rmax = threadIdx.x + rmax;
+            int bxs = bxi - rmax;
+
+            for(int yr = -r; yr <= r; yr++){
+
+                //copy the portion of the gradient image necessary for this block to shared memory
+                __syncthreads();
+                stim::cuda::sharedMemcpy_tex2D(s_grad, in, bxs, yi + yr, swidth, 1, threadIdx, blockDim);
+                __syncthreads();
+
+                //if the current thread is outside of the image, it doesn't have to be computed
+                if(xi < x && yi < y){
+
+                    for(int xr = -r; xr <= r; xr++){
+
+                        //find the location of this voter in the atan2 table
+                        unsigned int id_t = (yr + rmax) * x_table + xr + rmax;
+
+                        // look up the angle between the pixel and the current voter
+                        float atan_angle = gpuTable[id_t];
+
+                        // read the gradient direction and magnitude of the voter from shared memory
+                        int idx_share = xr + tx_rmax;
+                        float2 g = s_grad[idx_share];
+                        float theta = g.x;
+
+                        // check if the current voter is located in the voting area of this pixel.
+                        if (((xr * xr + yr * yr) < rmax_sq) && (abs(atan_angle - theta) < phi)){
+
+                            // add the gradient magnitude of the voter to the vote sum
+                            float mag = g.y;
+                            sum += mag;
+                        }
+                    }
+                }
+            }
+
+            // save the vote value to global memory (only for threads inside the image)
+            if(xi < x && yi < y)
+                gpuVote[i] = sum;
+        }
+
+        template<typename T>
+        void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+            //get the number of pixels in the image
+            unsigned int pixels = x * y;
+            unsigned int bytes = sizeof(T) * pixels;
+
+            unsigned int max_threads = stim::maxThreadsPerBlock();
+            dim3 threads(max_threads, 1);
+            dim3 blocks(x/threads.x + (x % threads.x == 0 ? 0 : 1), y);
+
+            // Allocate CUDA array in device memory
+
+            //define a channel descriptor for two 32-bit channels (gradient direction and magnitude)
+            cudaChannelFormatDesc channelDesc =
+                    cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
+            cudaArray* cuArray;                             //declare the cuda array
+            cudaMallocArray(&cuArray, &channelDesc, x, y);  //allocate the cuda array
+
+            // Copy the gradient data from global memory to the array
+            cudaMemcpyToArray(cuArray, 0, 0, gpuGrad, bytes*2, cudaMemcpyDeviceToDevice);
+
+            // Specify texture
+            struct cudaResourceDesc resDesc;                //create a resource descriptor
+            memset(&resDesc, 0, sizeof(resDesc));           //set all values to zero
+            resDesc.resType = cudaResourceTypeArray;        //specify the resource descriptor type
+            resDesc.res.array.array = cuArray;              //add a pointer to the cuda array
+
+            // Specify texture object parameters
+            struct cudaTextureDesc texDesc;                 //create a texture descriptor
+            memset(&texDesc, 0, sizeof(texDesc));           //set all values in the texture descriptor to zero
+            texDesc.addressMode[0] = cudaAddressModeWrap;   //use wrapping (around the edges)
+            texDesc.addressMode[1] = cudaAddressModeWrap;
+            texDesc.filterMode = cudaFilterModePoint;       //sample the nearest texel (no interpolation)
+            texDesc.readMode = cudaReadModeElementType;     //reads data based on the element type (32-bit floats)
+            texDesc.normalizedCoords = 0;                   //not using normalized coordinates
+
+            // Create texture object
+            cudaTextureObject_t texObj = 0;
+            cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+            // specify the shared memory used by the kernel (one padded row of float2 gradient values)
+            unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
+
+            //call the kernel to do the voting
+            cuda_vote <<< blocks, threads, share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x, y);
+
+            //free allocated resources
+            cudaDestroyTextureObject(texObj);
+            cudaFreeArray(cuArray);
+
+        }
+
+        template<typename T>
+        void cpu_vote(T* cpuVote, T* cpuGrad, T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+            //calculate the number of bytes in the array
+            unsigned int bytes = x * y * sizeof(T);
+
+            //calculate the number of bytes in the atan2 table
+            unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+            //allocate space on the GPU for the vote image
+            T* gpuVote;
+            cudaMalloc(&gpuVote, bytes);
+
+            //allocate space on the GPU for the input gradient image
+            T* gpuGrad;
+            HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+            //copy the gradient magnitude data to the GPU
+            HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+            //allocate space on the GPU for the atan2 table
+            T* gpuTable;
+            HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+            //copy the atan2 values to the GPU
+            HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+            //call the GPU version of the vote calculation function
+            gpu_vote(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);
+
+            //copy the vote data back to the CPU
+            cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost);
+
+            //free allocated memory
+            cudaFree(gpuTable);
+            cudaFree(gpuVote);
+            cudaFree(gpuGrad);
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
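Both cuda_vote and cuda_update_dir index gpuTable as a (2*rmax+1) x (2*rmax+1) grid of precomputed atan2 values, which avoids evaluating atan2 per voter per pixel. One way the host could build that table (the sign/orientation convention is an assumption inferred from how id_t and ind_t are computed):

    #include <cmath>
    #include <vector>

    std::vector<float> make_atan2_table(unsigned int rmax){
        unsigned int w = 2 * rmax + 1;                  //table width
        std::vector<float> table(w * w);
        for(int yr = -(int)rmax; yr <= (int)rmax; yr++)
            for(int xr = -(int)rmax; xr <= (int)rmax; xr++)
                table[(yr + rmax) * w + (xr + rmax)] = atan2f((float)yr, (float)xr);  //angle of the offset vector
        return table;
    }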
-- 
libgit2 0.21.4