From dbeb83f2a83d7c1624fee2f0e2e5c860abd3c5b6 Mon Sep 17 00:00:00 2001
From: David
Date: Thu, 6 Oct 2016 00:36:51 -0500
Subject: [PATCH] added separable convolution CPU and GPU

---
 stim/cuda/sharedmem.cuh      |  4 ++++
 stim/image/image.h           | 49 +++++++++++++++++++++++--------------------------
 stim/math/filters/conv2.h    | 57 +++++++++++++++++++++++++++++++++++++--------------------
 stim/math/filters/sepconv2.h | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 153 insertions(+), 46 deletions(-)
 create mode 100644 stim/math/filters/sepconv2.h

diff --git a/stim/cuda/sharedmem.cuh b/stim/cuda/sharedmem.cuh
index a5f05cc..60db5e0 100644
--- a/stim/cuda/sharedmem.cuh
+++ b/stim/cuda/sharedmem.cuh
@@ -48,7 +48,11 @@ namespace stim{
 
 		/// Threaded copying of 2D data on a CUDA device
 		/// @param dest is a linear destination array of size nx * ny
+		/// @param nx is the size of the region to be copied along the X dimension
+		/// @param ny is the size of the region to be copied along the Y dimension
 		/// @param src is a 2D image stored as a linear array with a pitch of X
+		/// @param x is the x position in the source image where the copy is started
+		/// @param y is the y position in the source image where the copy is started
 		/// @param X is the number of bytes in a row of src
 		/// @param tid is a 1D id for the current thread
 		/// @param nt is the number of threads in the block
diff --git a/stim/image/image.h b/stim/image/image.h
index 7a461b9..fa7b5eb 100644
--- a/stim/image/image.h
+++ b/stim/image/image.h
@@ -23,9 +23,9 @@ class image{
 	T* img;										//pointer to the image data (interleaved RGB for color)
 	size_t R[3];
 
-	size_t X() const { return R[1]; }
-	size_t Y() const { return R[2]; }
-	size_t C() const { return R[0]; }
+	inline size_t X() const { return R[1]; }
+	inline size_t Y() const { return R[2]; }
+	inline size_t C() const { return R[0]; }
 
 	void init(){								//initializes all variables, assumes no memory is allocated
 		memset(R, 0, sizeof(size_t) * 3);		//set the resolution and number of channels to zero
@@ -54,8 +54,8 @@ class image{
 
 	size_t bytes(){ return size() * sizeof(T); }
 
-	size_t idx(size_t x, size_t y, size_t c = 0){
-		return y * C() * X() + x * C() + c;
+	inline size_t idx(size_t x, size_t y, size_t c = 0){
+		return y * R[0] * R[1] + x * R[0] + c;
 	}
 
 #ifdef USING_OPENCV
@@ -338,20 +338,27 @@ public:
 		}
 	}
 
-
-	image channel(size_t c){
-
-		//create a new image
-		image r(X(), Y(), 1);
-
+	/// Return an image representing a specified channel
+	/// @param c is the channel to be returned
+	image channel(size_t c){
+		image r(X(), Y(), 1);					//create a new image
 		for(size_t x = 0; x < X(); x++){
 			for(size_t y = 0; y < Y(); y++){
 				r.img[r.idx(x, y, 0)] = img[idx(x, y, c)];
 			}
 		}
-
 		return r;
+	}
+
+	/// Returns an std::vector containing each channel as a separate image
+	std::vector<image<T>> split() {
+		std::vector<image<T>> r;				//create an image array
+		r.resize(C());							//create images for each channel
+		for (size_t c = 0; c < C(); c++) {		//for each channel
+			r[c] = channel(c);					//copy the channel image to the array
+		}
+		return r;
 	}
 
 
 	T& operator()(size_t x, size_t y, size_t c = 0){
@@ -505,22 +512,12 @@ public:
 		exit(1);
 	}
 
+	/// Casting operator, casts every value in an image to a different data type V
 	template<typename V>
 	operator image<V>() {
-		//create a new image
-		//image<V> r(X(), Y(), C());
-		V* dst = (V*)malloc(size() * sizeof(V));
-
-		for (size_t x = 0; x < X(); x++) {
-			for (size_t y = 0; y < Y(); y++) {
-				for (size_t c = 0; c < C(); c++) {
-					dst[idx(x, y, c)] = (V)img[idx(x, y, c)];
-				}
-			}
-		}
-
-		image<V> r(dst, X(), Y(), C());
-		return r;
+		image<V> r(X(), Y(), C());					//create a new image
+		std::copy(img, img + size(), r.data());		//copy and cast the data
+		return r;									//return the new image
 	}
 };
 
diff --git a/stim/math/filters/conv2.h b/stim/math/filters/conv2.h
index 760dea4..5c149ea 100644
--- a/stim/math/filters/conv2.h
+++ b/stim/math/filters/conv2.h
@@ -4,30 +4,44 @@
 #ifdef __CUDACC__
 #include 
+#include 
 #endif
 
 namespace stim {
-
+#ifdef __CUDACC__
 	//Kernel function that performs the 2D convolution.
-	template<typename T>
-	__global__ void kernel_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+	template<typename T, typename K>
+	__global__ void kernel_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		extern __shared__ T s[];								//declare a shared memory array
 		size_t xi = blockIdx.x * blockDim.x + threadIdx.x;		//threads correspond to indices into the output image
 		size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
+		size_t tid = threadIdx.y * blockDim.x + threadIdx.x;
+		size_t nt = blockDim.x * blockDim.y;
+
+		size_t cx = blockIdx.x * blockDim.x;					//find the upper left corner of the input region
+		size_t cy = blockIdx.y * blockDim.y;
 
 		size_t X = sx - kx + 1;									//calculate the size of the output image
 		size_t Y = sy - ky + 1;
 
+		if (cx >= X || cy >= Y) return;							//return if the entire block is outside the image
+		size_t smx = min(blockDim.x + kx - 1, sx - cx);			//size of the shared copy of the input image
+		size_t smy = min(blockDim.y + ky - 1, sy - cy);			//	min function is used to deal with boundary blocks
+		stim::cuda::threadedMemcpy2D(s, smx, smy, in, cx, cy, sx, sy, tid, nt);	//copy the input region to shared memory
+		__syncthreads();
+
 		if (xi >= X || yi >= Y) return;							//returns if the thread is outside of the output image
 
 		//loop through the kernel
 		size_t kxi, kyi;
-		T v = 0;
+		K v = 0;
 		for (kyi = 0; kyi < ky; kyi++) {
 			for (kxi = 0; kxi < kx; kxi++) {
-				v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
+				v += s[(threadIdx.y + kyi) * smx + threadIdx.x + kxi] * kernel[kyi * kx + kxi];
+				//v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
 			}
 		}
 
-		out[yi * X + xi] = v;									//write the result to global memory
+		out[yi * X + xi] = (T)v;								//write the result to global memory
 	}
 
 
@@ -38,18 +52,23 @@ namespace stim {
 	//@param sy is the size of the input image along Y
 	//@param kx is the size of the kernel along X
 	//@param ky is the size of the kernel along Y
-	template<typename T>
-	void gpu_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+	template<typename T, typename K>
+	void gpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
 		cudaDeviceProp p;
 		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
 		size_t tmax = p.maxThreadsPerBlock;
-		dim3 tn(sqrt(tmax), sqrt(tmax));						//calculate the block dimensions
+		dim3 nt(sqrt(tmax), sqrt(tmax));						//calculate the block dimensions
 		size_t X = sx - kx + 1;									//calculate the size of the output image
 		size_t Y = sy - ky + 1;
-		dim3 bn(X / tn.x + 1, Y / tn.y + 1);					//calculate the grid dimensions
-		kernel_conv2 <<<bn, tn>>> (out, in, kernel, sx, sy, kx, ky);	//launch the kernel
+		dim3 nb(X / nt.x + 1, Y / nt.y + 1);					//calculate the grid dimensions
+		size_t sm = (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T);		//shared memory bytes required to store block data
+		if (sm > p.sharedMemPerBlock) {
+			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<<nb, nt, sm>>> (out, in, kernel, sx, sy, kx, ky);	//launch the kernel
 	}
-
+#endif
 	//Performs a convolution of a 2D image. Only valid pixels based on the kernel are returned.
 	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
 	//@param out is a pointer to the output image
@@ -58,8 +77,8 @@ namespace stim {
 	//@param sy is the size of the input image along Y
 	//@param kx is the size of the kernel along X
 	//@param ky is the size of the kernel along Y
-	template<typename T>
-	void cpu_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+	template<typename T, typename K>
+	void cpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
 		size_t X = sx - kx + 1;						//x size of the output image
 		size_t Y = sy - ky + 1;						//y size of the output image
 
@@ -68,9 +87,9 @@ namespace stim {
 		T* gpu_in;
 		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
 		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
-		T* gpu_kernel;
-		HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(T)));
-		HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(T), cudaMemcpyHostToDevice));
+		K* gpu_kernel;
+		HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(K), cudaMemcpyHostToDevice));
 		T* gpu_out;
 		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
 		gpu_conv2(gpu_out, gpu_in, gpu_kernel, sx, sy, kx, ky);				//execute the GPU kernel
@@ -79,9 +98,7 @@ namespace stim {
 		HANDLE_ERROR(cudaFree(gpu_kernel));
 		HANDLE_ERROR(cudaFree(gpu_out));
 #else
-
-
-		T v;										//register stores the integral of the current pixel value
+		K v;										//register stores the integral of the current pixel value
 		size_t yi, xi, kyi, kxi, yi_kyi_sx;
 		for (yi = 0; yi < Y; yi++) {				//for each pixel in the output image
 			for (xi = 0; xi < X; xi++) {
diff --git a/stim/math/filters/sepconv2.h b/stim/math/filters/sepconv2.h
new file mode 100644
index 0000000..90503aa
--- /dev/null
+++ b/stim/math/filters/sepconv2.h
@@ -0,0 +1,89 @@
+#ifndef STIM_CUDA_SEPCONV2_H
+#define STIM_CUDA_SEPCONV2_H
+#include 
+#ifdef __CUDACC__
+#include 
+#include 
+#endif
+
+namespace stim {
+#ifdef __CUDACC__
+	//Performs a separable convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void gpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
+		cudaDeviceProp p;
+		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
+		size_t tmax = p.maxThreadsPerBlock;
+		dim3 nt(sqrt(tmax), sqrt(tmax));						//calculate the block dimensions
+		size_t X = sx - kx + 1;									//calculate the x size of the output image
+		T* temp;												//declare a temporary variable to store the intermediate image
+		HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T)));	//allocate memory for the intermediate image
+
+		dim3 nb(X / nt.x + 1, sy / nt.y + 1);					//calculate the grid dimensions
+		size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T);			//shared memory bytes required to store block data
+		if (sm > p.sharedMemPerBlock) {
+			std::cout << "Error in stim::gpu_sepconv2() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1);	//launch the kernel to compute the intermediate image
+
+		size_t Y = sy - ky + 1;									//calculate the y size of the output image
+		nb.y = Y / nt.y + 1;									//update the grid dimensions to reflect the Y-axis size of the output image
+		sm = nt.x * (nt.y + ky - 1) * sizeof(T);				//calculate the amount of shared memory needed for the second pass
+		if (sm > p.sharedMemPerBlock) {
+			std::cout << "Error in stim::gpu_sepconv2() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<<nb, nt, sm>>> (out, temp, k1, X, sy, 1, ky);	//launch the kernel to compute the final image
+		HANDLE_ERROR(cudaFree(temp));							//free memory allocated for the intermediate image
+	}
+#endif
+	//Performs a separable convolution of a 2D image. Only valid pixels based on the kernel are returned.
+	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param k0 is the x-axis convolution filter
+	//@param k1 is the y-axis convolution filter
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void cpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
+		size_t X = sx - kx + 1;						//x size of the output image
+		size_t Y = sy - ky + 1;
+#ifdef __CUDACC__
+		//allocate memory and copy everything to the GPU
+		T* gpu_in;
+		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
+		K* gpu_k0;
+		HANDLE_ERROR(cudaMalloc(&gpu_k0, kx * sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_k0, k0, kx * sizeof(K), cudaMemcpyHostToDevice));
+		K* gpu_k1;
+		HANDLE_ERROR(cudaMalloc(&gpu_k1, ky * sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_k1, k1, ky * sizeof(K), cudaMemcpyHostToDevice));
+		T* gpu_out;
+		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
+		gpu_sepconv2(gpu_out, gpu_in, gpu_k0, gpu_k1, sx, sy, kx, ky);		//execute the GPU kernel
+		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
+		HANDLE_ERROR(cudaFree(gpu_in));
+		HANDLE_ERROR(cudaFree(gpu_k0));
+		HANDLE_ERROR(cudaFree(gpu_k1));
+		HANDLE_ERROR(cudaFree(gpu_out));
+#else
+		T* temp = (T*)malloc(X * sy * sizeof(T));		//allocate space for the intermediate image
+		cpu_conv2(temp, in, k0, sx, sy, kx, 1);			//evaluate the intermediate image
+		cpu_conv2(out, temp, k1, X, sy, 1, ky);			//evaluate the final image
+		free(temp);										//free the memory for the intermediate image
+#endif
+	}
+}
+
+#endif
\ No newline at end of file
--
libgit2 0.21.4
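
Usage note (not part of the patch): the sketch below shows one way the new separable convolution could be called from host code. It is a minimal example under stated assumptions: the image size, filter values, and main() scaffolding are hypothetical, the image is a single-channel float buffer in row-major order, and the include paths assume the repository root is on the compiler's include path. The call dispatches exactly as the patch describes: compiled with nvcc, cpu_sepconv2() copies the buffers to the device and runs gpu_sepconv2(); compiled without CUDA, it falls back to two cpu_conv2() passes.

#include <cstddef>
#include <cstdlib>
#include <stim/math/filters/conv2.h>		// cpu_conv2 / gpu_conv2 (shared-memory kernel added by this patch)
#include <stim/math/filters/sepconv2.h>		// cpu_sepconv2 / gpu_sepconv2 (new in this patch)

int main() {
	const size_t sx = 512, sy = 512;					// input image size (assumed for this example)
	const size_t kx = 5, ky = 5;						// sizes of the two 1D filters
	float k0[kx] = { 1, 4, 6, 4, 1 };					// x-axis filter (binomial approximation of a Gaussian)
	float k1[ky] = { 1, 4, 6, 4, 1 };					// y-axis filter
	for (size_t i = 0; i < kx; i++) k0[i] /= 16.0f;		// normalize each 1D filter to sum to 1
	for (size_t i = 0; i < ky; i++) k1[i] /= 16.0f;

	float* in = (float*)malloc(sx * sy * sizeof(float));	// input image
	// ... fill "in" with pixel data here ...
	size_t X = sx - kx + 1;								// only valid pixels are returned, so the output
	size_t Y = sy - ky + 1;								//   is smaller than the input by (kx-1, ky-1)
	float* out = (float*)malloc(X * Y * sizeof(float));

	stim::cpu_sepconv2(out, in, k0, k1, sx, sy, kx, ky);	// x-axis pass with k0, then y-axis pass with k1

	free(in);
	free(out);
	return 0;
}

The separable form is what motivates the new header: the two 1D passes cost on the order of kx + ky multiply-adds per output pixel, compared with kx * ky for a single 2D kernel passed to cpu_conv2()/gpu_conv2().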