Commit dbeb83f2a83d7c1624fee2f0e2e5c860abd3c5b6

Authored by David Mayerich
1 parent 2e5e3a26

added separable convolution CPU and GPU

stim/cuda/sharedmem.cuh
... ... @@ -48,7 +48,11 @@ namespace stim{
48 48  
49 49 /// Threaded copying of 2D data on a CUDA device
50 50 /// @param dest is a linear destination array of size nx * ny
  51 + /// @param nx is the size of the region to be copied along the X dimension
  52 + /// @param ny is the size of the region to be copied along the Y dimension
51 53 /// @param src is a 2D image stored as a linear array with a pitch of X
  54 + /// @param x is the x position in the source image where the copy is started
  55 + /// @param y is the y position in the source image where the copy is started
52 56 /// @param X is the number of bytes in a row of src
53 57 /// @param tid is a 1D id for the current thread
54 58 /// @param nt is the number of threads in the block
... ...
stim/image/image.h
... ... @@ -23,9 +23,9 @@ class image{
23 23 T* img; //pointer to the image data (interleaved RGB for color)
24 24 size_t R[3];
25 25  
26   - size_t X() const { return R[1]; }
27   - size_t Y() const { return R[2]; }
28   - size_t C() const { return R[0]; }
  26 + inline size_t X() const { return R[1]; }
  27 + inline size_t Y() const { return R[2]; }
  28 + inline size_t C() const { return R[0]; }
29 29  
30 30 void init(){ //initializes all variables, assumes no memory is allocated
31 31 memset(R, 0, sizeof(size_t) * 3); //set the resolution and number of channels to zero
... ... @@ -54,8 +54,8 @@ class image{
54 54  
55 55 size_t bytes(){ return size() * sizeof(T); }
56 56  
57   - size_t idx(size_t x, size_t y, size_t c = 0){
58   - return y * C() * X() + x * C() + c;
  57 + inline size_t idx(size_t x, size_t y, size_t c = 0){
  58 + return y * R[0] * R[1] + x * R[0] + c;
59 59 }
60 60  
61 61 #ifdef USING_OPENCV
... ... @@ -338,20 +338,27 @@ public:
338 338 }
339 339 }
340 340  
341   -
342   - image<T> channel(size_t c){
343   -
344   - //create a new image
345   - image<T> r(X(), Y(), 1);
346   -
  341 + /// Return an image representing a specified channel
  342 + /// @param c is the channel to be returned
  343 + image<T> channel(size_t c){
  344 + image<T> r(X(), Y(), 1); //create a new image
347 345 for(size_t x = 0; x < X(); x++){
348 346 for(size_t y = 0; y < Y(); y++){
349 347 r.img[r.idx(x, y, 0)] = img[idx(x, y, c)];
350 348 }
351 349 }
352   -
353 350 return r;
  351 + }
  352 +
  353 + /// Returns an std::vector containing each channel as a separate image
  354 + std::vector<image<T>> split() {
  355 + std::vector<image<T>> r; //create an image array
  356 + r.resize(C()); //create images for each channel
354 357  
  358 + for (size_t c = 0; c < C(); c++) { //for each channel
  359 + r[c] = channel(c); //copy the channel image to the array
  360 + }
  361 + return r;
355 362 }
356 363  
357 364 T& operator()(size_t x, size_t y, size_t c = 0){
... ... @@ -505,22 +512,12 @@ public:
505 512 exit(1);
506 513 }
507 514  
  515 + /// Casting operator, casts every value in an image to a different data type V
508 516 template<typename V>
509 517 operator image<V>() {
510   - //create a new image
511   - //image<V> r(X(), Y(), C());
512   - V* dst = (V*)malloc(size() * sizeof(V));
513   -
514   - for (size_t x = 0; x < X(); x++) {
515   - for (size_t y = 0; y < Y(); y++) {
516   - for (size_t c = 0; c < C(); c++) {
517   - dst[idx(x, y, c)] = (V)img[idx(x, y, c)];
518   - }
519   - }
520   - }
521   -
522   - image<V> r(dst, X(), Y(), C());
523   - return r;
  518 + image<V> r(X(), Y(), C()); //create a new image
  519 + std::copy(img, img + size(), r.data()); //copy and cast the data
  520 + return r; //return the new image
524 521 }
525 522  
526 523 };
... ...
stim/math/filters/conv2.h
... ... @@ -4,30 +4,44 @@
4 4  
5 5 #ifdef __CUDACC__
6 6 #include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
7 8 #endif
8 9  
9 10 namespace stim {
10   -
  11 +#ifdef __CUDACC__
11 12 //Kernel function that performs the 2D convolution.
12   - template<typename T>
13   - __global__ void kernel_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
  13 + template<typename T, typename K = T>
  14 + __global__ void kernel_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
  15 + extern __shared__ T s[]; //declare a shared memory array
14 16 size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //threads correspond to indices into the output image
15 17 size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
  18 + size_t tid = threadIdx.y * blockDim.x + threadIdx.x;
  19 + size_t nt = blockDim.x * blockDim.y;
  20 +
  21 + size_t cx = blockIdx.x * blockDim.x; //find the upper left corner of the input region
  22 + size_t cy = blockIdx.y * blockDim.y;
16 23  
17 24 size_t X = sx - kx + 1; //calculate the size of the output image
18 25 size_t Y = sy - ky + 1;
19 26  
  27 + if (cx >= X || cy >= Y) return; //return if the entire block is outside the image
  28 + size_t smx = min(blockDim.x + kx - 1, sx - cx); //size of the shared copy of the input image
  29 + size_t smy = min(blockDim.y + ky - 1, sy - cy); // min function is used to deal with boundary blocks
  30 + stim::cuda::threadedMemcpy2D<T>(s, smx, smy, in, cx, cy, sx, sy, tid, nt); //copy the input region to shared memory
  31 + __syncthreads();
  32 +
20 33 if (xi >= X || yi >= Y) return; //returns if the thread is outside of the output image
21 34  
22 35 //loop through the kernel
23 36 size_t kxi, kyi;
24   - T v = 0;
  37 + K v = 0;
25 38 for (kyi = 0; kyi < ky; kyi++) {
26 39 for (kxi = 0; kxi < kx; kxi++) {
27   - v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
  40 + v += s[(threadIdx.y + kyi) * smx + threadIdx.x + kxi] * kernel[kyi * kx + kxi];
  41 + //v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
28 42 }
29 43 }
30   - out[yi * X + xi] = v; //write the result to global memory
  44 + out[yi * X + xi] = (T)v; //write the result to global memory
31 45  
32 46 }
33 47  
... ... @@ -38,18 +52,23 @@ namespace stim {
38 52 //@param sy is the size of the input image along Y
39 53 //@param kx is the size of the kernel along X
40 54 //@param ky is the size of the kernel along Y
41   - template<typename T>
42   - void gpu_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
  55 + template<typename T, typename K = T>
  56 + void gpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
43 57 cudaDeviceProp p;
44 58 HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
45 59 size_t tmax = p.maxThreadsPerBlock;
46   - dim3 tn(sqrt(tmax), sqrt(tmax)); //calculate the block dimensions
  60 + dim3 nt(sqrt(tmax), sqrt(tmax)); //calculate the block dimensions
47 61 size_t X = sx - kx + 1; //calculate the size of the output image
48 62 size_t Y = sy - ky + 1;
49   - dim3 bn(X / tn.x + 1, Y / tn.y + 1); //calculate the grid dimensions
50   - kernel_conv2 <<<bn, tn >>> (out, in, kernel, sx, sy, kx, ky); //launch the kernel
  63 + dim3 nb(X / nt.x + 1, Y / nt.y + 1); //calculate the grid dimensions
  64 + size_t sm = (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T); //shared memory bytes required to store block data
  65 + if (sm > p.sharedMemPerBlock) {
  66 + std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
  67 + exit(1);
  68 + }
  69 + kernel_conv2 <<<nb, nt, sm>>> (out, in, kernel, sx, sy, kx, ky); //launch the kernel
51 70 }
52   -
  71 +#endif
53 72 //Performs a convolution of a 2D image. Only valid pixels based on the kernel are returned.
54 73 // As a result, the output image will be smaller than the input image by (kx-1, ky-1)
55 74 //@param out is a pointer to the output image
... ... @@ -58,8 +77,8 @@ namespace stim {
58 77 //@param sy is the size of the input image along Y
59 78 //@param kx is the size of the kernel along X
60 79 //@param ky is the size of the kernel along Y
61   - template<typename T>
62   - void cpu_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
  80 + template<typename T, typename K = T>
  81 + void cpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
63 82 size_t X = sx - kx + 1; //x size of the output image
64 83 size_t Y = sy - ky + 1; //y size of the output image
65 84  
... ... @@ -68,9 +87,9 @@ namespace stim {
68 87 T* gpu_in;
69 88 HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
70 89 HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
71   - T* gpu_kernel;
72   - HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(T)));
73   - HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(T), cudaMemcpyHostToDevice));
  90 + K* gpu_kernel;
  91 + HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(K)));
  92 + HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(K), cudaMemcpyHostToDevice));
74 93 T* gpu_out;
75 94 HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
76 95 gpu_conv2(gpu_out, gpu_in, gpu_kernel, sx, sy, kx, ky); //execute the GPU kernel
... ... @@ -79,9 +98,7 @@ namespace stim {
79 98 HANDLE_ERROR(cudaFree(gpu_kernel));
80 99 HANDLE_ERROR(cudaFree(gpu_out));
81 100 #else
82   -
83   -
84   - T v; //register stores the integral of the current pixel value
  101 + K v; //register stores the integral of the current pixel value
85 102 size_t yi, xi, kyi, kxi, yi_kyi_sx;
86 103 for (yi = 0; yi < Y; yi++) { //for each pixel in the output image
87 104 for (xi = 0; xi < X; xi++) {
... ...
stim/math/filters/sepconv2.h 0 → 100644
  1 +#ifndef STIM_CUDA_SEPCONV2_H
  2 +#define STIM_CUDA_SEPCONV2_H
  3 +#include <stim/math/filters/conv2.h>
  4 +#ifdef __CUDACC__
  5 +#include <stim/cuda/cudatools.h>
  6 +#include <stim/cuda/sharedmem.cuh>
  7 +#endif
  8 +
  9 +namespace stim {
  10 +#ifdef __CUDACC__
  11 + //Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
  12 + //@param out is a pointer to the output image
  13 + //@param in is a pointer to the input image
  14 + //@param sx is the size of the input image along X
  15 + //@param sy is the size of the input image along Y
  16 + //@param kx is the size of the kernel along X
  17 + //@param ky is the size of the kernel along Y
  18 + template<typename T, typename K = T>
  19 + void gpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
  20 + cudaDeviceProp p;
  21 + HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
  22 + size_t tmax = p.maxThreadsPerBlock;
  23 + dim3 nt(sqrt(tmax), sqrt(tmax)); //calculate the block dimensions
  24 + size_t X = sx - kx + 1; //calculate the x size of the output image
  25 + T* temp; //declare a temporary variable to store the intermediate image
  26 + HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T))); //allocate memory for the intermediate image
  27 +
  28 + dim3 nb(X / nt.x + 1, sy / nt.y + 1); //calculate the grid dimensions
  29 + size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T); //shared memory bytes required to store block data
  30 + if (sm > p.sharedMemPerBlock) {
  31 + std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
  32 + exit(1);
  33 + }
  34 + kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1); //launch the kernel to compute the intermediate image
  35 +
  36 + size_t Y = sy - ky + 1; //calculate the y size of the output image
  37 + nb.y = Y / nt.y + 1; //update the grid dimensions to reflect the Y-axis size of the output image
  38 + sm = nt.x * (nt.y + ky - 1) * sizeof(T); //calculate the amount of shared memory needed for the second pass
  39 + if (sm > p.sharedMemPerBlock) {
  40 + std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
  41 + exit(1);
  42 + }
  43 + kernel_conv2 <<<nb, nt, sm>>> (out, temp, k1, X, sy, 1, ky); //launch the kernel to compute the final image
  44 + HANDLE_ERROR(cudaFree(temp)); //free memory allocated for the intermediate image
  45 + }
  46 +#endif
  47 + //Performs a separable convolution of a 2D image. Only valid pixels based on the kernel are returned.
  48 + // As a result, the output image will be smaller than the input image by (kx-1, ky-1)
  49 + //@param out is a pointer to the output image
  50 + //@param in is a pointer to the input image
  51 + //@param k0 is the x-axis convolution filter
  52 + //@param k1 is the y-axis convolution filter
  53 + //@param sx is the size of the input image along X
  54 + //@param sy is the size of the input image along Y
  55 + //@param kx is the size of the kernel along X
  56 + //@param ky is the size of the kernel along Y
  57 + template<typename T, typename K = T>
  58 + void cpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
  59 + size_t X = sx - kx + 1; //x size of the output image
  60 + size_t Y = sy - ky + 1;
  61 +#ifdef __CUDACC__
  62 + //allocate memory and copy everything to the GPU
  63 + T* gpu_in;
  64 + HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
  65 + HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
  66 + K* gpu_k0;
  67 + HANDLE_ERROR(cudaMalloc(&gpu_k0, kx * sizeof(K)));
  68 + HANDLE_ERROR(cudaMemcpy(gpu_k0, k0, kx * sizeof(K), cudaMemcpyHostToDevice));
  69 + K* gpu_k1;
  70 + HANDLE_ERROR(cudaMalloc(&gpu_k1, ky * sizeof(K)));
  71 + HANDLE_ERROR(cudaMemcpy(gpu_k1, k1, ky * sizeof(K), cudaMemcpyHostToDevice));
  72 + T* gpu_out;
  73 + HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
  74 + gpu_sepconv2(gpu_out, gpu_in, gpu_k0, gpu_k1, sx, sy, kx, ky); //execute the GPU kernel
  75 + HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost)); //copy the result to the host
  76 + HANDLE_ERROR(cudaFree(gpu_in));
  77 + HANDLE_ERROR(cudaFree(gpu_k0));
  78 + HANDLE_ERROR(cudaFree(gpu_k1));
  79 + HANDLE_ERROR(cudaFree(gpu_out));
  80 +#else
  81 + T* temp = (T*)malloc(X * sy * sizeof(T)); //allocate space for the intermediate image
  82 + cpu_conv2(temp, in, k0, sx, sy, kx, 1); //evaluate the intermediate image
  83 + cpu_conv2(out, temp, k1, X, sy, 1, ky); //evaluate the final image
  84 + free(temp); //free the memory for the intermediate image
  85 +#endif
  86 + }
  87 +}
  88 +
  89 +#endif
0 90 \ No newline at end of file
... ...