Commit 13fe3c84bbffad233bbd8728590c870364472ebc

Authored by Laila Saadatifard
1 parent 788862df

update the stimlib by adding some gpu-functions (abs, cart2polar, multiply, down…

…_sample, gaussian_blur, gradient, local_max, sharedmem, update_dir, vote)
stim/cuda/array_abs.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_ARRAY_ABS_H
  2 +#define STIM_CUDA_ARRAY_ABS_H
  3 +
  4 +namespace stim{
  5 + namespace cuda{
  6 + template<typename T>
  7 + __global__ void cuda_abs(T* a, unsigned int N){
  8 +
  9 + //calculate the 1D index for this thread
  10 + int i = blockIdx.x * blockDim.x + threadIdx.x;
  11 +
  12 + if(i < N)
  13 + a[i] = abs(a[i]);
  14 + }
  15 +
  16 +
  17 + template<typename T>
  18 + void gpu_abs(T* a, unsigned int N){
  19 +
  20 + //get the maximum number of threads per block for the CUDA device
  21 + int threads = stim::maxThreadsPerBlock();
  22 +
  23 + //calculate the number of blocks
  24 + int blocks = N / threads + (N%threads == 0 ? 0:1);
  25 +
  26 + //call the kernel to do the multiplication
  27 + cuda_abs <<< blocks, threads >>>(a, N);
  28 +
  29 + }
  30 +
  31 +
  32 + template<typename T>
  33 + void cpu_abs(T* a, unsigned int N){
  34 +
  35 + //calculate the number of bytes in the array
  36 + unsigned int bytes = N * sizeof(T);
  37 +
  38 + //allocate memory on the GPU for the array
  39 + T* gpuA;
  40 + HANDLE_ERROR( cudaMalloc(&gpuA, bytes) );
  41 +
  42 + //copy the array to the GPU
  43 + HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) );
  44 +
  45 + //call the GPU version of this function
  46 + gpu_abs<T>(gpuA, N);
  47 +
  48 + //copy the array back to the CPU
  49 + HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) );
  50 +
  51 + //free allocated memory
  52 + cudaFree(gpuA);
  53 +
  54 + }
  55 +
  56 + } //end namespace cuda
  57 +} //end namespace stim
  58 +
  59 +#endif
0 60 \ No newline at end of file
... ...
stim/cuda/array_cart2polar.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_ARRAY_CART2POLAR_H
  2 +#define STIM_CUDA_ARRAY_CART2POLAR_H
  3 +
  4 +namespace stim{
  5 + namespace cuda{
  6 + template<typename T>
  7 + __global__ void cuda_cart2polar(T* a, unsigned int N){
  8 +
  9 +
  10 + //calculate the 1D index for this thread
  11 + int i = blockIdx.x * blockDim.x + threadIdx.x;
  12 +
  13 + if(i < N){
  14 + float x = a[i * 2 + 0];
  15 + float y = a[i * 2 + 1];
  16 + float theta = atan2( y, x ) ;
  17 + float r = sqrt(x * x + y * y);
  18 + a[i * 2 + 0] = theta;
  19 + a[i * 2 + 1] = r;
  20 + }
  21 + }
  22 +
  23 +
  24 + template<typename T>
  25 + void gpu_cart2polar(T* gpuGrad, unsigned int N){
  26 +
  27 + //get the maximum number of threads per block for the CUDA device
  28 + int threads = stim::maxThreadsPerBlock();
  29 +
  30 + //calculate the number of blocks
  31 + int blocks = N / threads + (N % threads == 0 ? 0:1);
  32 +
  33 + //call the kernel to do the multiplication
  34 + cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N);
  35 +
  36 + }
  37 +
  38 +
  39 + template<typename T>
  40 + void cpu_cart2polar(T* a, unsigned int N){
  41 +
  42 + //calculate the number of bytes in the array
  43 + unsigned int bytes = N * sizeof(T) * 2;
  44 +
  45 + //allocate memory on the GPU for the array
  46 + T* gpuA;
  47 + HANDLE_ERROR( cudaMalloc(&gpuA, bytes) );
  48 +
  49 + //copy the array to the GPU
  50 + HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) );
  51 +
  52 + //call the GPU version of this function
  53 + gpu_cart2polar<T>(gpuA, N);
  54 +
  55 + //copy the array back to the CPU
  56 + HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) );
  57 +
  58 + //free allocated memory
  59 + cudaFree(gpuA);
  60 +
  61 + }
  62 +
  63 + }
  64 +}
  65 +
  66 +#endif
0 67 \ No newline at end of file
... ...
stim/cuda/array_multiply.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_ARRAY_MULTIPLY_H
  2 +#define STIM_CUDA_ARRAY_MULTIPLY_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/devices.h>
  7 +#include <stim/cuda/error.h>
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + template<typename T>
  13 + __global__ void cuda_multiply(T* lhs, T rhs, unsigned int N){
  14 +
  15 + //calculate the 1D index for this thread
  16 + int i = blockIdx.x * blockDim.x + threadIdx.x;
  17 +
  18 + if(i < N)
  19 + lhs[i] *= rhs;
  20 + }
  21 +
  22 + template<typename T>
  23 + void gpu_multiply(T* lhs, T rhs, unsigned int N){
  24 +
  25 + //get the maximum number of threads per block for the CUDA device
  26 + int threads = stim::maxThreadsPerBlock();
  27 +
  28 + //calculate the number of blocks
  29 + int blocks = N / threads + (N%threads == 0 ? 0:1);
  30 +
  31 + //call the kernel to do the multiplication
  32 + cuda_multiply <<< blocks, threads >>>(lhs, rhs, N);
  33 +
  34 + }
  35 +
  36 + template<typename T>
  37 + void cpu_multiply(T* lhs, T rhs, unsigned int N){
  38 +
  39 + //calculate the number of bytes in the array
  40 + unsigned int bytes = N * sizeof(T);
  41 +
  42 + //allocate memory on the GPU for the array
  43 + T* gpuLHS;
  44 + HANDLE_ERROR( cudaMalloc(&gpuLHS, bytes) );
  45 +
  46 + //copy the array to the GPU
  47 + HANDLE_ERROR( cudaMemcpy(gpuLHS, lhs, bytes, cudaMemcpyHostToDevice) );
  48 +
  49 + //call the GPU version of this function
  50 + gpu_multiply<T>(gpuLHS, rhs, N);
  51 +
  52 + //copy the array back to the CPU
  53 + HANDLE_ERROR( cudaMemcpy(lhs, gpuLHS, bytes, cudaMemcpyDeviceToHost) );
  54 +
  55 + //free allocated memory
  56 + cudaFree(gpuLHS);
  57 + }
  58 +
  59 + }
  60 +}
  61 +
  62 +
  63 +
  64 +#endif
0 65 \ No newline at end of file
... ...
stim/cuda/arraymath.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_ARRAYMATH_H
  2 +#define STIM_CUDA_ARRAYMATH_H
  3 +
  4 +#include <stim/cuda/array_multiply.cuh>
  5 +#include <stim/cuda/array_abs.cuh>
  6 +#include <stim/cuda/array_cart2polar.cuh>
  7 +
  8 +namespace stim{
  9 + namespace cuda{
  10 +
  11 + }
  12 +}
  13 +
  14 +
  15 +
  16 +#endif
0 17 \ No newline at end of file
... ...
stim/cuda/down_sample.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_DOWN_SAMPLE_H
  2 +#define STIM_CUDA_DOWN_SAMPLE_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/devices.h>
  7 +#include <stim/cuda/timer.h>
  8 +#include <stim/cuda/gaussian_blur.cuh>
  9 +
  10 +namespace stim{
  11 + namespace cuda{
  12 +
  13 + template<typename T>
  14 + __global__ void down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
  15 +
  16 + unsigned int sigma_ds = 1/resize;
  17 + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  18 + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  19 +
  20 +
  21 + // calculate the 2D coordinates for this current thread.
  22 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  23 + int yi = blockIdx.y;
  24 + // convert 2D coordinates to 1D
  25 + int i = yi * x_ds + xi;
  26 +
  27 + if(xi< x_ds && yi< y_ds){
  28 +
  29 + int x_org = xi * sigma_ds;
  30 + int y_org = yi * sigma_ds;
  31 + int i_org = y_org * x + x_org;
  32 + gpuI[i] = gpuI0[i_org];
  33 + }
  34 +
  35 + }
  36 +
  37 +
  38 + /// Applies a Gaussian blur to a 2D image stored on the GPU
  39 + template<typename T>
  40 + void gpu_down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
  41 +
  42 +
  43 + unsigned int sigma_ds = 1/resize;
  44 + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  45 + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  46 +
  47 + //get the number of pixels in the image
  48 + unsigned int pixels_ds = x_ds * y_ds;
  49 +
  50 + unsigned int max_threads = stim::maxThreadsPerBlock();
  51 + dim3 threads(max_threads, 1);
  52 + dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds);
  53 +
  54 + stim::cuda::gpu_gaussian_blur_2d<float>(gpuI0, sigma_ds,x ,y);
  55 +
  56 + //resample the image
  57 + down_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
  58 +
  59 + }
  60 +
  61 + /// Applies a Gaussian blur to a 2D image stored on the CPU
  62 + template<typename T>
  63 + void cpu_down_sample(T* re_img, T* image, T resize, unsigned int x, unsigned int y){
  64 +
  65 + //get the number of pixels in the image
  66 + unsigned int pixels = x * y;
  67 + unsigned int bytes = sizeof(T) * pixels;
  68 +
  69 + unsigned int sigma_ds = 1/resize;
  70 + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  71 + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  72 + unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
  73 +
  74 +
  75 +
  76 + //allocate space on the GPU for the original image
  77 + T* gpuI0;
  78 + cudaMalloc(&gpuI0, bytes);
  79 +
  80 +
  81 + //copy the image data to the GPU
  82 + cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
  83 +
  84 + //allocate space on the GPU for the down sampled image
  85 + T* gpuI;
  86 + cudaMalloc(&gpuI, bytes_ds);
  87 +
  88 + //run the GPU-based version of the algorithm
  89 + gpu_down_sample<T>(gpuI, gpuI0, resize, x, y);
  90 +
  91 + //copy the image data to the GPU
  92 + cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice);
  93 +
  94 + cudaFree(gpuI0);
  95 + cudeFree(gpuI);
  96 + }
  97 +
  98 + }
  99 +}
  100 +
  101 +#endif
0 102 \ No newline at end of file
... ...
stim/cuda/gaussian_blur.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
  2 +#define STIM_CUDA_GAUSSIAN_BLUR_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/devices.h>
  7 +#include <stim/cuda/timer.h>
  8 +#include <stim/cuda/sharedmem.cuh>
  9 +
  10 +#define pi 3.14159
  11 +
  12 +namespace stim{
  13 + namespace cuda{
  14 +
  15 + template<typename T>
  16 + __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
  17 +
  18 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  19 + extern __shared__ T s[];
  20 +
  21 + int kr = sigma * 4; //calculate the kernel radius
  22 +
  23 + //get a pointer to the gaussian in memory
  24 + T* g = (T*)&s[blockDim.x + 2 * kr];
  25 +
  26 + //calculate the start point for this block
  27 + int bxi = blockIdx.x * blockDim.x;
  28 + int byi = blockIdx.y;
  29 +
  30 + //copy the portion of the image necessary for this block to shared memory
  31 + stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
  32 +
  33 + //calculate the thread index and block index
  34 + int ti = threadIdx.x;
  35 +
  36 + //calculate the spatial coordinate for this thread
  37 + int xi = bxi + ti;
  38 +
  39 + //pre-compute the gaussian values for each kernel point
  40 + T a = 1.0 / (sigma * sqrt(2 * pi));
  41 + T c = - 1.0 / (2*sigma*sigma);
  42 + int ki;
  43 +
  44 + //use the first 2kr+1 threads to evaluate a gaussian and store the result
  45 + if(ti <= 2* kr+1){
  46 + ki = ti - kr;
  47 + g[ti] = a * exp((ki*ki) * c);
  48 + }
  49 +
  50 + //make sure that all writing to shared memory is done before continuing
  51 + __syncthreads();
  52 +
  53 + //if the current pixel is outside of the image
  54 + if(bxi + ti > x || byi > y)
  55 + return;
  56 +
  57 +
  58 +
  59 + //calculate the coordinates of the current thread in shared memory
  60 + int si = ti + kr;
  61 +
  62 + T sum = 0; //running weighted sum across the kernel
  63 +
  64 +
  65 + //for each element of the kernel
  66 + for(int ki = -kr; ki <= kr; ki++){
  67 + sum += g[ki + kr] * s[si + ki];
  68 + }
  69 +
  70 + //calculate the 1D image index for this thread
  71 + unsigned int i = byi * x + xi;
  72 +
  73 + //output the result to global memory
  74 + out[i] = sum;
  75 + }
  76 +
  77 + template<typename T>
  78 + __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
  79 +
  80 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  81 + extern __shared__ T s[];
  82 +
  83 + int kr = sigma * 4; //calculate the kernel radius
  84 +
  85 + //get a pointer to the gaussian in memory
  86 + T* g = (T*)&s[blockDim.y + 2 * kr];
  87 +
  88 + //calculate the start point for this block
  89 + int bxi = blockIdx.x;
  90 + int byi = blockIdx.y * blockDim.y;
  91 +
  92 + //copy the portion of the image necessary for this block to shared memory
  93 + stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
  94 +
  95 + //calculate the thread index and block index
  96 + int ti = threadIdx.y;
  97 +
  98 + //calculate the spatial coordinate for this thread
  99 + int yi = byi + ti;
  100 +
  101 + //pre-compute the gaussian values for each kernel point
  102 + T a = 1.0 / (sigma * sqrt(2 * pi));
  103 + T c = - 1.0 / (2*sigma*sigma);
  104 + int ki;
  105 +
  106 + //use the first 2kr+1 threads to evaluate a gaussian and store the result
  107 + if(ti <= 2* kr+1){
  108 + ki = ti - kr;
  109 + g[ti] = a * exp((ki*ki) * c);
  110 + }
  111 +
  112 + //make sure that all writing to shared memory is done before continuing
  113 + __syncthreads();
  114 +
  115 + //if the current pixel is outside of the image
  116 + if(bxi >= x || yi >= y)
  117 + return;
  118 +
  119 +
  120 +
  121 + //calculate the coordinates of the current thread in shared memory
  122 + int si = ti + kr;
  123 +
  124 + T sum = 0; //running weighted sum across the kernel
  125 +
  126 +
  127 + //for each element of the kernel
  128 + for(int ki = -kr; ki <= kr; ki++){
  129 + sum += g[ki + kr] * s[si + ki];
  130 + }
  131 +
  132 + //calculate the 1D image index for this thread
  133 + unsigned int i = yi * x + bxi;
  134 +
  135 + //output the result to global memory
  136 + out[i] = sum;
  137 + }
  138 +
  139 + /// Applies a Gaussian blur to a 2D image stored on the GPU
  140 + template<typename T>
  141 + void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
  142 +
  143 + //get the number of pixels in the image
  144 + unsigned int pixels = x * y;
  145 + unsigned int bytes = sizeof(T) * pixels;
  146 +
  147 + // Allocate CUDA array in device memory
  148 +
  149 + //define a channel descriptor for a single 32-bit channel
  150 + cudaChannelFormatDesc channelDesc =
  151 + cudaCreateChannelDesc(32, 0, 0, 0,
  152 + cudaChannelFormatKindFloat);
  153 + cudaArray* cuArray; //declare the cuda array
  154 + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array
  155 +
  156 + // Copy the image data from global memory to the array
  157 + cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
  158 + cudaMemcpyDeviceToDevice);
  159 +
  160 + // Specify texture
  161 + struct cudaResourceDesc resDesc; //create a resource descriptor
  162 + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero
  163 + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type
  164 + resDesc.res.array.array = cuArray; //add a pointer to the cuda array
  165 +
  166 + // Specify texture object parameters
  167 + struct cudaTextureDesc texDesc; //create a texture descriptor
  168 + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero
  169 + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges)
  170 + texDesc.addressMode[1] = cudaAddressModeWrap;
  171 + texDesc.filterMode = cudaFilterModePoint; //use linear filtering
  172 + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats)
  173 + texDesc.normalizedCoords = 0; //not using normalized coordinates
  174 +
  175 + // Create texture object
  176 + cudaTextureObject_t texObj = 0;
  177 + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
  178 +
  179 +
  180 + //get the maximum number of threads per block for the CUDA device
  181 + int max_threads = stim::maxThreadsPerBlock();
  182 + dim3 threads(max_threads, 1);
  183 +
  184 + //calculate the number of blocks
  185 + dim3 blocks(x / threads.x + 1, y);
  186 +
  187 + //calculate the shared memory used in the kernel
  188 + unsigned int pixel_bytes = max_threads * 4; //bytes devoted to pixel data being processed
  189 + unsigned int apron_bytes = sigma * 8 * 4; //bytes devoted to pixels outside the window
  190 + unsigned int gaussian_bytes = (sigma * 8 + 1) * 4; //bytes devoted to memory used to store the pre-computed Gaussian window
  191 + unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes shared memory used
  192 +
  193 + //blur the image along the x-axis
  194 + gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
  195 +
  196 + // Copy the x-blurred data from global memory to the texture
  197 + cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
  198 + cudaMemcpyDeviceToDevice);
  199 +
  200 + //transpose the block and thread dimensions
  201 + threads.x = 1;
  202 + threads.y = max_threads;
  203 + blocks.x = x;
  204 + blocks.y = y / threads.y + 1;
  205 +
  206 + //blur the image along the y-axis
  207 + gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
  208 +
  209 + //free allocated memory
  210 + cudaFree(cuArray);
  211 +
  212 + }
  213 +
  214 + /// Applies a Gaussian blur to a 2D image stored on the CPU
  215 + template<typename T>
  216 + void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
  217 +
  218 + //get the number of pixels in the image
  219 + unsigned int pixels = x * y;
  220 + unsigned int bytes = sizeof(T) * pixels;
  221 +
  222 + //allocate space on the GPU
  223 + T* gpuI0;
  224 + cudaMalloc(&gpuI0, bytes);
  225 +
  226 +
  227 + //copy the image data to the GPU
  228 + cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
  229 +
  230 + //run the GPU-based version of the algorithm
  231 + gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y);
  232 +
  233 + //copy the image data from the device
  234 + cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
  235 +
  236 + //free allocated memory
  237 + cudaFree(gpuI0);
  238 + }
  239 +
  240 + }
  241 +}
  242 +
  243 +#endif
0 244 \ No newline at end of file
... ...
stim/cuda/gradient.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_GRADIENT_H
  2 +#define STIM_CUDA_GRADIENT_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/devices.h>
  7 +#include <stim/cuda/error.h>
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + template<typename T>
  13 + __global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){
  14 +
  15 + //calculate the 1D image index for this thread
  16 + int i = blockIdx.x * blockDim.x + threadIdx.x;
  17 +
  18 + //convert this to 2D pixel coordinates
  19 + int yi = i / x;
  20 + int xi = i - (yi * x);
  21 +
  22 + //return if the pixel is outside of the image
  23 + if(xi >= x || yi >= y) return;
  24 +
  25 + //calculate indices for the forward difference
  26 + int i_xp = yi * x + (xi + 1);
  27 + int i_yp = (yi + 1) * x + xi;
  28 +
  29 + //use forward differences if a coordinate is zero
  30 + if(xi == 0)
  31 + out[i * 2 + 0] = in[i_xp] - in[i];
  32 + if(yi == 0)
  33 + out[i * 2 + 1] = in[i_yp] - in[i];
  34 +
  35 + //calculate indices for the backward difference
  36 + int i_xn = yi * x + (xi - 1);
  37 + int i_yn = (yi - 1) * x + xi;
  38 +
  39 + //use backward differences if the coordinate is at the maximum edge
  40 + if(xi == x-1)
  41 + out[i * 2 + 0] = in[i] - in[i_xn];
  42 + if(yi == y-1)
  43 + out[i * 2 + 1] = in[i] - in[i_yn];
  44 +
  45 + //otherwise use central differences
  46 + if(xi > 0 && xi < x-1)
  47 + out[i * 2 + 0] = (in[i_xp] - in[i_xn]) / 2;
  48 +
  49 + if(yi > 0 && yi < y-1)
  50 + out[i * 2 + 1] = (in[i_yp] - in[i_yn]) / 2;
  51 +
  52 + }
  53 +
  54 + template<typename T>
  55 + //void gpu_gradient_2d(T* gpuOut, T* gpuIn, unsigned int x, unsigned int y){
  56 + void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){
  57 +
  58 + //get the number of pixels in the image
  59 + unsigned int pixels = x * y;
  60 +
  61 + //allocate space on the GPU for the input image
  62 + //T* gpuI;
  63 + //HANDLE_ERROR(cudaMalloc(&gpuI, bytes));
  64 +
  65 + //cudaMemcpy(gpuI, gpuI0, bytes, cudaMemcpyDeviceToDevice);
  66 +
  67 +
  68 + //allocate space on the GPU for the output gradient image
  69 + //T* gpuGrad;
  70 + //cudaMalloc(&gpuGrad, bytes * 2); //the output image will have two channels (x, y)
  71 +
  72 + //get the maximum number of threads per block for the CUDA device
  73 + int threads = stim::maxThreadsPerBlock();
  74 +
  75 + //calculate the number of blocks
  76 + int blocks = pixels / threads + (pixels%threads == 0 ? 0:1);
  77 +
  78 + //call the GPU kernel to determine the gradient
  79 + gradient_2d<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y);
  80 +
  81 + }
  82 +
  83 + template<typename T>
  84 + void cpu_gradient_2d(T* out, T* in, unsigned int x, unsigned int y){
  85 +
  86 + //get the number of pixels in the image
  87 + unsigned int pixels = x * y;
  88 + unsigned int bytes = pixels * sizeof(T);
  89 +
  90 + //allocate space on the GPU for the input image
  91 + T* gpuIn;
  92 + HANDLE_ERROR(cudaMalloc(&gpuIn, bytes));
  93 +
  94 + //copy the image data to the GPU
  95 + HANDLE_ERROR(cudaMemcpy(gpuIn, in, bytes, cudaMemcpyHostToDevice));
  96 +
  97 + //allocate space on the GPU for the output gradient image
  98 + T* gpuOut;
  99 + cudaMalloc(&gpuOut, bytes * 2); //the output image will have two channels (x, y)
  100 +
  101 + //call the GPU version of this function
  102 + gpu_gradient_2d(gpuOut, gpuIn, x, y);
  103 +
  104 + //copy the results to the CPU
  105 + cudaMemcpy(out, gpuOut, bytes * 2, cudaMemcpyDeviceToHost);
  106 +
  107 + //free allocated memory
  108 + cudaFree(gpuOut);
  109 + }
  110 +
  111 + }
  112 +}
  113 +
  114 +
  115 +#endif
0 116 \ No newline at end of file
... ...
stim/cuda/local_max.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_LOCAL_MAX_H
  2 +#define STIM_CUDA_LOCAL_MAX_H
  3 +
  4 +
  5 +# include <iostream>
  6 +# include <cuda.h>
  7 +# include <stim/cuda/devices.h>
  8 +# include <stim/cuda/error.h>
  9 +
  10 +namespace stim{
  11 + namespace cuda{
  12 +
  13 +
  14 + // this kernel calculates the local maximum for finding the cell centers
  15 + template<typename T>
  16 + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
  17 +
  18 + // calculate the 2D coordinates for this current thread.
  19 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  20 + int yi = blockIdx.y;
  21 + // convert 2D coordinates to 1D
  22 + int i = yi * x + xi;
  23 +
  24 +
  25 +
  26 + //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
  27 + int xl = xi - conn;
  28 + int yl = yi - conn;
  29 +
  30 + // use zero for the lowest limits if the xi or yi is less than conn.
  31 + if (xi <= conn)
  32 + xl = 0;
  33 + if (yi <= conn)
  34 + yl = 0;
  35 +
  36 + //calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
  37 + int xh = xi + conn;
  38 + int yh = yi + conn;
  39 +
  40 + // use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn.
  41 + if (xi >= x - conn)
  42 + xh = x;
  43 + if (yi>= y - conn)
  44 + yh = y;
  45 +
  46 + // calculate the limits for finding the local maximum location in the connected neighbors for the current pixel
  47 + int n_l = yl * x + xl;
  48 + int n_h = yh * x + xh;
  49 +
  50 + //initial the centers image to zero
  51 + gpuCenters[i] = 0;
  52 +
  53 +
  54 + int n = n_l;
  55 +
  56 + float l_value = 0;
  57 +
  58 + if (i < x * y)
  59 +
  60 + // check if the vote value for this pixel is greater than threshold, so this pixel may be a local max.
  61 + if (gpuVote[i]>final_t){
  62 +
  63 + // compare the vote value for this pixel with the vote values with its neighbors.
  64 + while (n<=n_h){
  65 +
  66 + // check if this vote value is a local max in its neighborhood.
  67 + if (gpuVote[i] < gpuVote[n]){
  68 + l_value = 0;
  69 + n =n_h+1;
  70 + }
  71 + else if (n == n_h){
  72 + l_value = 1;
  73 + n = n+1;
  74 + }
  75 + // check if the current neighbor is the last one at the current row
  76 + else if ((n - n_l - 2*conn)% x ==0){
  77 + n = n + x - 2*conn -1;
  78 + n ++;
  79 + }
  80 + else
  81 + n ++;
  82 + }
  83 + // set the center value for this pixel to high if it's a local max ,and to low if not.
  84 + gpuCenters[i] = l_value ;
  85 + }
  86 +
  87 + }
  88 +
  89 +
  90 +
  91 + template<typename T>
  92 + void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
  93 +
  94 +
  95 +
  96 +
  97 + unsigned int max_threads = stim::maxThreadsPerBlock();
  98 + dim3 threads(max_threads, 1);
  99 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  100 +
  101 +
  102 +
  103 + //call the kernel to find the local maximum.
  104 + cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
  105 +
  106 +
  107 + }
  108 +
  109 +
  110 +
  111 + template<typename T>
  112 + void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
  113 +
  114 +
  115 + //calculate the number of bytes in the array
  116 + unsigned int bytes = x * y * sizeof(T);
  117 +
  118 + // allocate space on the GPU for the detected cell centes
  119 + T* gpuCenters;
  120 + cudaMalloc(&gpuCenters, bytes);
  121 +
  122 +
  123 + //allocate space on the GPU for the input Vote Image
  124 + T* gpuVote;
  125 + cudaMalloc(&gpuVote, bytes);
  126 +
  127 +
  128 + //copy the Vote image data to the GPU
  129 + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
  130 +
  131 +
  132 + //call the GPU version of the local max function
  133 + gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y);
  134 +
  135 +
  136 + //copy the cell centers data to the CPU
  137 + cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ;
  138 +
  139 +
  140 + //free allocated memory
  141 + cudaFree(gpuCenters);
  142 + cudaFree(gpuVote);
  143 + cudaFree(gpuGrad);
  144 + }
  145 +
  146 + }
  147 +}
  148 +
  149 +
  150 +
  151 +#endif
0 152 \ No newline at end of file
... ...
stim/cuda/sharedmem.cuh 0 → 100644
  1 +
  2 +#ifndef STIM_CUDA_SHAREDMEM_H
  3 +#define STIM_CUDA_SHAREDMEM_H
  4 +
  5 +namespace stim{
  6 + namespace cuda{
  7 +
  8 + // Copies values from global memory to shared memory, optimizing threads
  9 + template<typename T>
  10 + __device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
  11 + unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
  12 + dim3 threadIdx, dim3 blockDim){
  13 +
  14 + //calculate the number of iterations required for the copy
  15 + unsigned int xI, yI;
  16 + xI = X/blockDim.x + 1; //number of iterations along X
  17 + yI = Y/blockDim.y + 1; //number of iterations along Y
  18 +
  19 + //for each iteration
  20 + for(unsigned int xi = 0; xi < xI; xi++){
  21 + for(unsigned int yi = 0; yi < yI; yi++){
  22 +
  23 + //calculate the index into shared memory
  24 + unsigned int sx = xi * blockDim.x + threadIdx.x;
  25 + unsigned int sy = yi * blockDim.y + threadIdx.y;
  26 +
  27 + //calculate the index into the texture
  28 + unsigned int tx = x + sx;
  29 + unsigned int ty = y + sy;
  30 +
  31 + //perform the copy
  32 + if(sx < X && sy < Y)
  33 + dest[sy * X + sx] = tex2D<T>(src, tx, ty);
  34 + }
  35 + }
  36 + }
  37 +
  38 + }
  39 +}
  40 +
  41 +
  42 +#endif
0 43 \ No newline at end of file
... ...
stim/cuda/update_dir.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_UPDATE_DIR_H
  2 +#define STIM_CUDA_UPDATE_DIR_H
  3 +
  4 +
  5 +# include <iostream>
  6 +# include <cuda.h>
  7 +# include <stim/cuda/devices.h>
  8 +# include <stim/cuda/error.h>
  9 +#include <stim/cuda/sharedmem.cuh>
  10 +
  11 +namespace stim{
  12 + namespace cuda{
  13 +
  14 + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
  15 + template<typename T>
  16 + __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  17 +
  18 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  19 + extern __shared__ float s_vote[];
  20 +
  21 + //calculate the start point for this block
  22 + int bxi = blockIdx.x * blockDim.x;
  23 +
  24 + //calculate the width of the shared memory block
  25 + int swidth = 2 * rmax + blockDim.x;
  26 +
  27 + // calculate the 2D coordinates for this current thread.
  28 + int xi = bxi + threadIdx.x;
  29 + int yi = blockIdx.y;
  30 +
  31 + // convert 2D coordinates to 1D
  32 + int i = yi * x + xi;
  33 +
  34 + // calculate the voting direction based on the grtadient direction
  35 + float theta = gpuGrad[2*i];
  36 +
  37 + //initialize the vote direction to zero
  38 + gpuDir[i] = 0;
  39 +
  40 + // define a local variable to maximum value of the vote image in the voting area for this voter
  41 + float max = 0;
  42 +
  43 + // define two local variables for the x and y coordinations where the maximum happened
  44 + int id_x = 0;
  45 + int id_y = 0;
  46 +
  47 + // compute the size of window which will be checked for finding the voting area for this voter
  48 + unsigned int x_table = 2*rmax +1;
  49 + unsigned int rmax_sq = rmax * rmax;
  50 + int r = (int)rmax;
  51 + int tx_rmax = threadIdx.x + rmax;
  52 + int bxs = bxi - rmax;
  53 +
  54 + for(int yr = -r; yr <= r; yr++){
  55 +
  56 + //copy the portion of the image necessary for this block to shared memory
  57 + __syncthreads();
  58 + stim::cuda::sharedMemcpy_tex2D<float>(s_vote, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
  59 + __syncthreads();
  60 +
  61 + //if the current thread is outside of the image, it doesn't have to be computed
  62 + if(xi < x && yi < y){
  63 +
  64 + for(int xr = -r; xr <= r; xr++){
  65 +
  66 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  67 +
  68 + // calculate the angle between the voter and the current pixel in x and y directions
  69 + float atan_angle = gpuTable[ind_t];
  70 +
  71 +
  72 + // calculate the voting direction based on the grtadient direction
  73 + int idx_share_update = xr + tx_rmax ;
  74 + float share_vote = s_vote[idx_share_update];
  75 +
  76 + // check if the current pixel is located in the voting area of this voter.
  77 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  78 +
  79 + // compare the vote value of this pixel with the max value to find the maxima and its index.
  80 + if (share_vote>max) {
  81 +
  82 + max = share_vote;
  83 + id_x = xr;
  84 + id_y = yr;
  85 + }
  86 + }
  87 + }
  88 + }
  89 + }
  90 +
  91 +
  92 + //float new_angle = atan2(dy, dx);
  93 + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
  94 +
  95 + float new_angle = gpuTable[ind_m];
  96 +
  97 + gpuDir[i] = new_angle;
  98 +
  99 + }
  100 +
  101 + // this kernel updates the gradient direction by the calculated voting direction.
  102 + template<typename T>
  103 + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){
  104 +
  105 + //************ when the number of threads are (1024,1) *************
  106 +
  107 + // calculate the 2D coordinates for this current thread.
  108 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  109 + int yi = blockIdx.y;
  110 + // convert 2D coordinates to 1D
  111 + int i = yi * x + xi;
  112 +
  113 +
  114 + //update the gradient image with the vote direction
  115 + gpuGrad[2*i] = gpuDir[i];
  116 + }
  117 +
  118 +
  119 + template<typename T>
  120 + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  121 +
  122 + //get the number of pixels in the image
  123 + unsigned int pixels = x * y;
  124 + unsigned int bytes = sizeof(T) * pixels;
  125 +
  126 +
  127 + unsigned int max_threads = stim::maxThreadsPerBlock();
  128 + dim3 threads(max_threads, 1);
  129 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  130 +
  131 + // Allocate CUDA array in device memory
  132 +
  133 + //define a channel descriptor for a single 32-bit channel
  134 + cudaChannelFormatDesc channelDesc =
  135 + cudaCreateChannelDesc(32, 0, 0, 0,
  136 + cudaChannelFormatKindFloat);
  137 + cudaArray* cuArray; //declare the cuda array
  138 + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array
  139 +
  140 + // Copy the image data from global memory to the array
  141 + cudaMemcpyToArray(cuArray, 0, 0, gpuVote, bytes,
  142 + cudaMemcpyDeviceToDevice);
  143 +
  144 + // Specify texture
  145 + struct cudaResourceDesc resDesc; //create a resource descriptor
  146 + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero
  147 + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type
  148 + resDesc.res.array.array = cuArray; //add a pointer to the cuda array
  149 +
  150 + // Specify texture object parameters
  151 + struct cudaTextureDesc texDesc; //create a texture descriptor
  152 + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero
  153 + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges)
  154 + texDesc.addressMode[1] = cudaAddressModeWrap;
  155 + texDesc.filterMode = cudaFilterModePoint; //use linear filtering
  156 + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats)
  157 + texDesc.normalizedCoords = 0; //not using normalized coordinates
  158 +
  159 + // Create texture object
  160 + cudaTextureObject_t texObj = 0;
  161 + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
  162 +
  163 + // specify share memory
  164 + unsigned int share_bytes = (2*rmax + threads.x)*(1)*4;
  165 +
  166 + // allocate space on the GPU for the updated vote direction
  167 + T* gpuDir;
  168 + cudaMalloc(&gpuDir, bytes);
  169 +
  170 + //call the kernel to calculate the new voting direction
  171 + cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, texObj, gpuGrad, gpuTable, phi, rmax, x , y);
  172 +
  173 + //call the kernel to update the gradient direction
  174 + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
  175 +
  176 +
  177 + //free allocated memory
  178 + cudaFree(gpuDir);
  179 +
  180 + }
  181 +
  182 +
  183 + template<typename T>
  184 + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  185 +
  186 + //calculate the number of bytes in the array
  187 + unsigned int bytes = x * y * sizeof(T);
  188 +
  189 + //calculate the number of bytes in the atan2 table
  190 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  191 +
  192 + //allocate space on the GPU for the Vote Image
  193 + T* gpuVote;
  194 + cudaMalloc(&gpuVote, bytes);
  195 +
  196 + //copy the input vote image to the GPU
  197 + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
  198 +
  199 + //allocate space on the GPU for the input Gradient image
  200 + T* gpuGrad;
  201 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  202 +
  203 + //copy the Gradient data to the GPU
  204 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  205 +
  206 + //allocate space on the GPU for the atan2 table
  207 + T* gpuTable;
  208 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  209 +
  210 + //copy the atan2 values to the GPU
  211 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  212 +
  213 +
  214 + //call the GPU version of the update direction function
  215 + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  216 +
  217 +
  218 + //copy the new gradient image back to the CPU
  219 + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
  220 +
  221 + //free allocated memory
  222 + cudaFree(gpuTable);
  223 + cudaFree(gpuVote);
  224 + cudaFree(gpuGrad);
  225 + }
  226 +
  227 + }
  228 +}
  229 +
  230 +
  231 +
  232 +#endif
0 233 \ No newline at end of file
... ...
stim/cuda/vote.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_H
  2 +#define STIM_CUDA_VOTE_H
  3 +
  4 +
  5 +# include <iostream>
  6 +# include <cuda.h>
  7 +# include <stim/cuda/devices.h>
  8 +# include <stim/cuda/error.h>
  9 +#include <stim/cuda/sharedmem.cuh>
  10 +
  11 +
  12 +namespace stim{
  13 + namespace cuda{
  14 +
  15 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  16 + template<typename T>
  17 + __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  18 +
  19 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  20 + extern __shared__ float2 s_grad[];
  21 +
  22 + //calculate the start point for this block
  23 + int bxi = blockIdx.x * blockDim.x;
  24 +
  25 + //calculate the width of the shared memory block
  26 + int swidth = 2 * rmax + blockDim.x;
  27 +
  28 + // calculate the 2D coordinates for this current thread.
  29 + int xi = bxi + threadIdx.x;
  30 + int yi = blockIdx.y;
  31 + // convert 2D coordinates to 1D
  32 + int i = yi * x + xi;
  33 +
  34 +
  35 + // define a local variable to sum the votes from the voters
  36 + float sum = 0;
  37 +
  38 + // compute the size of window which will be checked for finding the proper voters for this pixel
  39 + unsigned int x_table = 2*rmax +1;
  40 +
  41 + unsigned int rmax_sq = rmax * rmax;
  42 + int r = (int)rmax;
  43 + int tx_rmax = threadIdx.x + rmax;
  44 + int bxs = bxi - rmax;
  45 +
  46 +
  47 + for(int yr = -r; yr <= r; yr++){
  48 +
  49 + //copy the portion of the image necessary for this block to shared memory
  50 + __syncthreads();
  51 + stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
  52 + __syncthreads();
  53 +
  54 + //if the current thread is outside of the image, it doesn't have to be computed
  55 + if(xi < x && yi < y){
  56 +
  57 + for(int xr = -r; xr <= r; xr++){
  58 +
  59 + //find the location of this voter in the atan2 table
  60 + unsigned int id_t = (yr + rmax) * x_table + xr + rmax;
  61 +
  62 + // calculate the angle between the pixel and the current voter in x and y directions
  63 + float atan_angle = gpuTable[id_t];
  64 +
  65 +
  66 + // calculate the voting direction based on the grtadient direction
  67 + int idx_share = xr + tx_rmax ;
  68 + float2 g = s_grad[idx_share];
  69 + float theta = g.x;
  70 +
  71 + // check if the current voter is located in the voting area of this pixel.
  72 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  73 + sum += g.y;
  74 +
  75 + }
  76 +
  77 + }
  78 + }
  79 + }
  80 +
  81 + gpuVote[i] = sum;
  82 + }
  83 +
  84 + template<typename T>
  85 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  86 +
  87 + //get the number of pixels in the image
  88 + unsigned int pixels = x * y;
  89 + unsigned int bytes = sizeof(T) * pixels;
  90 +
  91 +
  92 + unsigned int max_threads = stim::maxThreadsPerBlock();
  93 + //unsigned int thread_dim = sqrt(max_threads);
  94 + dim3 threads(max_threads, 1);
  95 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  96 +
  97 + // Allocate CUDA array in device memory
  98 +
  99 + //define a channel descriptor for a single 32-bit channel
  100 + cudaChannelFormatDesc channelDesc =
  101 + cudaCreateChannelDesc(32, 32, 0, 0,
  102 + cudaChannelFormatKindFloat);
  103 + cudaArray* cuArray; //declare the cuda array
  104 + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array
  105 +
  106 + // Copy the image data from global memory to the array
  107 + cudaMemcpyToArray(cuArray, 0, 0, gpuGrad, bytes*2,
  108 + cudaMemcpyDeviceToDevice);
  109 +
  110 + // Specify texture
  111 + struct cudaResourceDesc resDesc; //create a resource descriptor
  112 + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero
  113 + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type
  114 + resDesc.res.array.array = cuArray; //add a pointer to the cuda array
  115 +
  116 + // Specify texture object parameters
  117 + struct cudaTextureDesc texDesc; //create a texture descriptor
  118 + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero
  119 + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges)
  120 + texDesc.addressMode[1] = cudaAddressModeWrap;
  121 + texDesc.filterMode = cudaFilterModePoint; //use linear filtering
  122 + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats)
  123 + texDesc.normalizedCoords = 0; //not using normalized coordinates
  124 +
  125 + // Create texture object
  126 + cudaTextureObject_t texObj = 0;
  127 + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
  128 +
  129 + // specify share memory
  130 + unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
  131 +
  132 +
  133 + //call the kernel to do the voting
  134 +
  135 + cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y);
  136 +
  137 + }
  138 +
  139 +
  140 + template<typename T>
  141 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  142 +
  143 + //calculate the number of bytes in the array
  144 + unsigned int bytes = x * y * sizeof(T);
  145 +
  146 + //calculate the number of bytes in the atan2 table
  147 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  148 +
  149 + //allocate space on the GPU for the Vote Image
  150 + T* gpuVote;
  151 + cudaMalloc(&gpuVote, bytes);
  152 +
  153 + //allocate space on the GPU for the input Gradient image
  154 + T* gpuGrad;
  155 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  156 +
  157 + //copy the Gradient Magnitude data to the GPU
  158 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  159 +
  160 + //allocate space on the GPU for the atan2 table
  161 + T* gpuTable;
  162 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  163 +
  164 + //copy the atan2 values to the GPU
  165 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  166 +
  167 + //cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table );
  168 +
  169 +
  170 + //call the GPU version of the vote calculation function
  171 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  172 +
  173 +
  174 + //copy the Vote Data back to the CPU
  175 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  176 +
  177 + //free allocated memory
  178 + cudaFree(gpuTable);
  179 + cudaFree(gpuVote);
  180 + cudaFree(gpuGrad);
  181 + }
  182 +
  183 + }
  184 +}
  185 +
  186 +
  187 +
  188 +#endif
0 189 \ No newline at end of file
... ...