Commit dbeb83f2a83d7c1624fee2f0e2e5c860abd3c5b6
1 parent
2e5e3a26
added separable convolution CPU and GPU
Showing
4 changed files
with
153 additions
and
46 deletions
Show diff stats
stim/cuda/sharedmem.cuh
@@ -48,7 +48,11 @@ namespace stim{ | @@ -48,7 +48,11 @@ namespace stim{ | ||
48 | 48 | ||
49 | /// Threaded copying of 2D data on a CUDA device | 49 | /// Threaded copying of 2D data on a CUDA device |
50 | /// @param dest is a linear destination array of size nx * ny | 50 | /// @param dest is a linear destination array of size nx * ny |
51 | + /// @param nx is the size of the region to be copied along the X dimension | ||
52 | + /// @param ny is the size of the region to be copied along the Y dimension | ||
51 | /// @param src is a 2D image stored as a linear array with a pitch of X | 53 | /// @param src is a 2D image stored as a linear array with a pitch of X |
54 | + /// @param x is the x position in the source image where the copy is started | ||
55 | + /// @param y is the y position in the source image where the copy is started | ||
52 | /// @param X is the number of bytes in a row of src | 56 | /// @param X is the number of bytes in a row of src |
53 | /// @param tid is a 1D id for the current thread | 57 | /// @param tid is a 1D id for the current thread |
54 | /// @param nt is the number of threads in the block | 58 | /// @param nt is the number of threads in the block |
stim/image/image.h
@@ -23,9 +23,9 @@ class image{ | @@ -23,9 +23,9 @@ class image{ | ||
23 | T* img; //pointer to the image data (interleaved RGB for color) | 23 | T* img; //pointer to the image data (interleaved RGB for color) |
24 | size_t R[3]; | 24 | size_t R[3]; |
25 | 25 | ||
26 | - size_t X() const { return R[1]; } | ||
27 | - size_t Y() const { return R[2]; } | ||
28 | - size_t C() const { return R[0]; } | 26 | + inline size_t X() const { return R[1]; } |
27 | + inline size_t Y() const { return R[2]; } | ||
28 | + inline size_t C() const { return R[0]; } | ||
29 | 29 | ||
30 | void init(){ //initializes all variables, assumes no memory is allocated | 30 | void init(){ //initializes all variables, assumes no memory is allocated |
31 | memset(R, 0, sizeof(size_t) * 3); //set the resolution and number of channels to zero | 31 | memset(R, 0, sizeof(size_t) * 3); //set the resolution and number of channels to zero |
@@ -54,8 +54,8 @@ class image{ | @@ -54,8 +54,8 @@ class image{ | ||
54 | 54 | ||
55 | size_t bytes(){ return size() * sizeof(T); } | 55 | size_t bytes(){ return size() * sizeof(T); } |
56 | 56 | ||
57 | - size_t idx(size_t x, size_t y, size_t c = 0){ | ||
58 | - return y * C() * X() + x * C() + c; | 57 | + inline size_t idx(size_t x, size_t y, size_t c = 0){ |
58 | + return y * R[0] * R[1] + x * R[0] + c; | ||
59 | } | 59 | } |
60 | 60 | ||
61 | #ifdef USING_OPENCV | 61 | #ifdef USING_OPENCV |
@@ -338,20 +338,27 @@ public: | @@ -338,20 +338,27 @@ public: | ||
338 | } | 338 | } |
339 | } | 339 | } |
340 | 340 | ||
341 | - | ||
342 | - image<T> channel(size_t c){ | ||
343 | - | ||
344 | - //create a new image | ||
345 | - image<T> r(X(), Y(), 1); | ||
346 | - | 341 | + /// Return an image representing a specified channel |
342 | + /// @param c is the channel to be returned | ||
343 | + image<T> channel(size_t c){ | ||
344 | + image<T> r(X(), Y(), 1); //create a new image | ||
347 | for(size_t x = 0; x < X(); x++){ | 345 | for(size_t x = 0; x < X(); x++){ |
348 | for(size_t y = 0; y < Y(); y++){ | 346 | for(size_t y = 0; y < Y(); y++){ |
349 | r.img[r.idx(x, y, 0)] = img[idx(x, y, c)]; | 347 | r.img[r.idx(x, y, 0)] = img[idx(x, y, c)]; |
350 | } | 348 | } |
351 | } | 349 | } |
352 | - | ||
353 | return r; | 350 | return r; |
351 | + } | ||
352 | + | ||
353 | + /// Returns an std::vector containing each channel as a separate image | ||
354 | + std::vector<image<T>> split() { | ||
355 | + std::vector<image<T>> r; //create an image array | ||
356 | + r.resize(C()); //create images for each channel | ||
354 | 357 | ||
358 | + for (size_t c = 0; c < C(); c++) { //for each channel | ||
359 | + r[c] = channel(c); //copy the channel image to the array | ||
360 | + } | ||
361 | + return r; | ||
355 | } | 362 | } |
356 | 363 | ||
357 | T& operator()(size_t x, size_t y, size_t c = 0){ | 364 | T& operator()(size_t x, size_t y, size_t c = 0){ |
@@ -505,22 +512,12 @@ public: | @@ -505,22 +512,12 @@ public: | ||
505 | exit(1); | 512 | exit(1); |
506 | } | 513 | } |
507 | 514 | ||
515 | + /// Casting operator, casts every value in an image to a different data type V | ||
508 | template<typename V> | 516 | template<typename V> |
509 | operator image<V>() { | 517 | operator image<V>() { |
510 | - //create a new image | ||
511 | - //image<V> r(X(), Y(), C()); | ||
512 | - V* dst = (V*)malloc(size() * sizeof(V)); | ||
513 | - | ||
514 | - for (size_t x = 0; x < X(); x++) { | ||
515 | - for (size_t y = 0; y < Y(); y++) { | ||
516 | - for (size_t c = 0; c < C(); c++) { | ||
517 | - dst[idx(x, y, c)] = (V)img[idx(x, y, c)]; | ||
518 | - } | ||
519 | - } | ||
520 | - } | ||
521 | - | ||
522 | - image<V> r(dst, X(), Y(), C()); | ||
523 | - return r; | 518 | + image<V> r(X(), Y(), C()); //create a new image |
519 | + std::copy(img, img + size(), r.data()); //copy and cast the data | ||
520 | + return r; //return the new image | ||
524 | } | 521 | } |
525 | 522 | ||
526 | }; | 523 | }; |
stim/math/filters/conv2.h
@@ -4,30 +4,44 @@ | @@ -4,30 +4,44 @@ | ||
4 | 4 | ||
5 | #ifdef __CUDACC__ | 5 | #ifdef __CUDACC__ |
6 | #include <stim/cuda/cudatools.h> | 6 | #include <stim/cuda/cudatools.h> |
7 | +#include <stim/cuda/sharedmem.cuh> | ||
7 | #endif | 8 | #endif |
8 | 9 | ||
9 | namespace stim { | 10 | namespace stim { |
10 | - | 11 | +#ifdef __CUDACC__ |
11 | //Kernel function that performs the 2D convolution. | 12 | //Kernel function that performs the 2D convolution. |
12 | - template<typename T> | ||
13 | - __global__ void kernel_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) { | 13 | + template<typename T, typename K = T> |
14 | + __global__ void kernel_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) { | ||
15 | + extern __shared__ T s[]; //declare a shared memory array | ||
14 | size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //threads correspond to indices into the output image | 16 | size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //threads correspond to indices into the output image |
15 | size_t yi = blockIdx.y * blockDim.y + threadIdx.y; | 17 | size_t yi = blockIdx.y * blockDim.y + threadIdx.y; |
18 | + size_t tid = threadIdx.y * blockDim.x + threadIdx.x; | ||
19 | + size_t nt = blockDim.x * blockDim.y; | ||
20 | + | ||
21 | + size_t cx = blockIdx.x * blockDim.x; //find the upper left corner of the input region | ||
22 | + size_t cy = blockIdx.y * blockDim.y; | ||
16 | 23 | ||
17 | size_t X = sx - kx + 1; //calculate the size of the output image | 24 | size_t X = sx - kx + 1; //calculate the size of the output image |
18 | size_t Y = sy - ky + 1; | 25 | size_t Y = sy - ky + 1; |
19 | 26 | ||
27 | + if (cx >= X || cy >= Y) return; //return if the entire block is outside the image | ||
28 | + size_t smx = min(blockDim.x + kx - 1, sx - cx); //size of the shared copy of the input image | ||
29 | + size_t smy = min(blockDim.y + ky - 1, sy - cy); // min function is used to deal with boundary blocks | ||
30 | + stim::cuda::threadedMemcpy2D<T>(s, smx, smy, in, cx, cy, sx, sy, tid, nt); //copy the input region to shared memory | ||
31 | + __syncthreads(); | ||
32 | + | ||
20 | if (xi >= X || yi >= Y) return; //returns if the thread is outside of the output image | 33 | if (xi >= X || yi >= Y) return; //returns if the thread is outside of the output image |
21 | 34 | ||
22 | //loop through the kernel | 35 | //loop through the kernel |
23 | size_t kxi, kyi; | 36 | size_t kxi, kyi; |
24 | - T v = 0; | 37 | + K v = 0; |
25 | for (kyi = 0; kyi < ky; kyi++) { | 38 | for (kyi = 0; kyi < ky; kyi++) { |
26 | for (kxi = 0; kxi < kx; kxi++) { | 39 | for (kxi = 0; kxi < kx; kxi++) { |
27 | - v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi]; | 40 | + v += s[(threadIdx.y + kyi) * smx + threadIdx.x + kxi] * kernel[kyi * kx + kxi]; |
41 | + //v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi]; | ||
28 | } | 42 | } |
29 | } | 43 | } |
30 | - out[yi * X + xi] = v; //write the result to global memory | 44 | + out[yi * X + xi] = (T)v; //write the result to global memory |
31 | 45 | ||
32 | } | 46 | } |
33 | 47 | ||
@@ -38,18 +52,23 @@ namespace stim { | @@ -38,18 +52,23 @@ namespace stim { | ||
38 | //@param sy is the size of the input image along Y | 52 | //@param sy is the size of the input image along Y |
39 | //@param kx is the size of the kernel along X | 53 | //@param kx is the size of the kernel along X |
40 | //@param ky is the size of the kernel along Y | 54 | //@param ky is the size of the kernel along Y |
41 | - template<typename T> | ||
42 | - void gpu_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) { | 55 | + template<typename T, typename K = T> |
56 | + void gpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) { | ||
43 | cudaDeviceProp p; | 57 | cudaDeviceProp p; |
44 | HANDLE_ERROR(cudaGetDeviceProperties(&p, 0)); | 58 | HANDLE_ERROR(cudaGetDeviceProperties(&p, 0)); |
45 | size_t tmax = p.maxThreadsPerBlock; | 59 | size_t tmax = p.maxThreadsPerBlock; |
46 | - dim3 tn(sqrt(tmax), sqrt(tmax)); //calculate the block dimensions | 60 | + dim3 nt(sqrt(tmax), sqrt(tmax)); //calculate the block dimensions |
47 | size_t X = sx - kx + 1; //calculate the size of the output image | 61 | size_t X = sx - kx + 1; //calculate the size of the output image |
48 | size_t Y = sy - ky + 1; | 62 | size_t Y = sy - ky + 1; |
49 | - dim3 bn(X / tn.x + 1, Y / tn.y + 1); //calculate the grid dimensions | ||
50 | - kernel_conv2 <<<bn, tn >>> (out, in, kernel, sx, sy, kx, ky); //launch the kernel | 63 | + dim3 nb(X / nt.x + 1, Y / nt.y + 1); //calculate the grid dimensions |
64 | + size_t sm = (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T); //shared memory bytes required to store block data | ||
65 | + if (sm > p.sharedMemPerBlock) { | ||
66 | + std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; | ||
67 | + exit(1); | ||
68 | + } | ||
69 | + kernel_conv2 <<<nb, nt, sm>>> (out, in, kernel, sx, sy, kx, ky); //launch the kernel | ||
51 | } | 70 | } |
52 | - | 71 | +#endif |
53 | //Performs a convolution of a 2D image. Only valid pixels based on the kernel are returned. | 72 | //Performs a convolution of a 2D image. Only valid pixels based on the kernel are returned. |
54 | // As a result, the output image will be smaller than the input image by (kx-1, ky-1) | 73 | // As a result, the output image will be smaller than the input image by (kx-1, ky-1) |
55 | //@param out is a pointer to the output image | 74 | //@param out is a pointer to the output image |
@@ -58,8 +77,8 @@ namespace stim { | @@ -58,8 +77,8 @@ namespace stim { | ||
58 | //@param sy is the size of the input image along Y | 77 | //@param sy is the size of the input image along Y |
59 | //@param kx is the size of the kernel along X | 78 | //@param kx is the size of the kernel along X |
60 | //@param ky is the size of the kernel along Y | 79 | //@param ky is the size of the kernel along Y |
61 | - template<typename T> | ||
62 | - void cpu_conv2(T* out, T* in, T* kernel, size_t sx, size_t sy, size_t kx, size_t ky) { | 80 | + template<typename T, typename K = T> |
81 | + void cpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) { | ||
63 | size_t X = sx - kx + 1; //x size of the output image | 82 | size_t X = sx - kx + 1; //x size of the output image |
64 | size_t Y = sy - ky + 1; //y size of the output image | 83 | size_t Y = sy - ky + 1; //y size of the output image |
65 | 84 | ||
@@ -68,9 +87,9 @@ namespace stim { | @@ -68,9 +87,9 @@ namespace stim { | ||
68 | T* gpu_in; | 87 | T* gpu_in; |
69 | HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T))); | 88 | HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T))); |
70 | HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice)); | 89 | HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice)); |
71 | - T* gpu_kernel; | ||
72 | - HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(T))); | ||
73 | - HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(T), cudaMemcpyHostToDevice)); | 90 | + K* gpu_kernel; |
91 | + HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(K))); | ||
92 | + HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(K), cudaMemcpyHostToDevice)); | ||
74 | T* gpu_out; | 93 | T* gpu_out; |
75 | HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T))); | 94 | HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T))); |
76 | gpu_conv2(gpu_out, gpu_in, gpu_kernel, sx, sy, kx, ky); //execute the GPU kernel | 95 | gpu_conv2(gpu_out, gpu_in, gpu_kernel, sx, sy, kx, ky); //execute the GPU kernel |
@@ -79,9 +98,7 @@ namespace stim { | @@ -79,9 +98,7 @@ namespace stim { | ||
79 | HANDLE_ERROR(cudaFree(gpu_kernel)); | 98 | HANDLE_ERROR(cudaFree(gpu_kernel)); |
80 | HANDLE_ERROR(cudaFree(gpu_out)); | 99 | HANDLE_ERROR(cudaFree(gpu_out)); |
81 | #else | 100 | #else |
82 | - | ||
83 | - | ||
84 | - T v; //register stores the integral of the current pixel value | 101 | + K v; //register stores the integral of the current pixel value |
85 | size_t yi, xi, kyi, kxi, yi_kyi_sx; | 102 | size_t yi, xi, kyi, kxi, yi_kyi_sx; |
86 | for (yi = 0; yi < Y; yi++) { //for each pixel in the output image | 103 | for (yi = 0; yi < Y; yi++) { //for each pixel in the output image |
87 | for (xi = 0; xi < X; xi++) { | 104 | for (xi = 0; xi < X; xi++) { |
1 | +#ifndef STIM_CUDA_SEPCONV2_H | ||
2 | +#define STIM_CUDA_SEPCONV2_H | ||
3 | +#include <stim/math/filters/conv2.h> | ||
4 | +#ifdef __CUDACC__ | ||
5 | +#include <stim/cuda/cudatools.h> | ||
6 | +#include <stim/cuda/sharedmem.cuh> | ||
7 | +#endif | ||
8 | + | ||
9 | +namespace stim { | ||
10 | +#ifdef __CUDACC__ | ||
11 | + //Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device. | ||
12 | + //@param out is a pointer to the output image | ||
13 | + //@param in is a pointer to the input image | ||
14 | + //@param sx is the size of the input image along X | ||
15 | + //@param sy is the size of the input image along Y | ||
16 | + //@param kx is the size of the kernel along X | ||
17 | + //@param ky is the size of the kernel along Y | ||
18 | + template<typename T, typename K = T> | ||
19 | + void gpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) { | ||
20 | + cudaDeviceProp p; | ||
21 | + HANDLE_ERROR(cudaGetDeviceProperties(&p, 0)); | ||
22 | + size_t tmax = p.maxThreadsPerBlock; | ||
23 | + dim3 nt(sqrt(tmax), sqrt(tmax)); //calculate the block dimensions | ||
24 | + size_t X = sx - kx + 1; //calculate the x size of the output image | ||
25 | + T* temp; //declare a temporary variable to store the intermediate image | ||
26 | + HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T))); //allocate memory for the intermediate image | ||
27 | + | ||
28 | + dim3 nb(X / nt.x + 1, sy / nt.y + 1); //calculate the grid dimensions | ||
29 | + size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T); //shared memory bytes required to store block data | ||
30 | + if (sm > p.sharedMemPerBlock) { | ||
31 | + std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; | ||
32 | + exit(1); | ||
33 | + } | ||
34 | + kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1); //launch the kernel to compute the intermediate image | ||
35 | + | ||
36 | + size_t Y = sy - ky + 1; //calculate the y size of the output image | ||
37 | + nb.y = Y / nt.y + 1; //update the grid dimensions to reflect the Y-axis size of the output image | ||
38 | + sm = nt.x * (nt.y + ky - 1) * sizeof(T); //calculate the amount of shared memory needed for the second pass | ||
39 | + if (sm > p.sharedMemPerBlock) { | ||
40 | + std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; | ||
41 | + exit(1); | ||
42 | + } | ||
43 | + kernel_conv2 <<<nb, nt, sm>>> (out, temp, k1, X, sy, 1, ky); //launch the kernel to compute the final image | ||
44 | + HANDLE_ERROR(cudaFree(temp)); //free memory allocated for the intermediate image | ||
45 | + } | ||
46 | +#endif | ||
47 | + //Performs a separable convolution of a 2D image. Only valid pixels based on the kernel are returned. | ||
48 | + // As a result, the output image will be smaller than the input image by (kx-1, ky-1) | ||
49 | + //@param out is a pointer to the output image | ||
50 | + //@param in is a pointer to the input image | ||
51 | + //@param k0 is the x-axis convolution filter | ||
52 | + //@param k1 is the y-axis convolution filter | ||
53 | + //@param sx is the size of the input image along X | ||
54 | + //@param sy is the size of the input image along Y | ||
55 | + //@param kx is the size of the kernel along X | ||
56 | + //@param ky is the size of the kernel along Y | ||
57 | + template<typename T, typename K = T> | ||
58 | + void cpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) { | ||
59 | + size_t X = sx - kx + 1; //x size of the output image | ||
60 | + size_t Y = sy - ky + 1; | ||
61 | +#ifdef __CUDACC__ | ||
62 | + //allocate memory and copy everything to the GPU | ||
63 | + T* gpu_in; | ||
64 | + HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T))); | ||
65 | + HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice)); | ||
66 | + K* gpu_k0; | ||
67 | + HANDLE_ERROR(cudaMalloc(&gpu_k0, kx * sizeof(K))); | ||
68 | + HANDLE_ERROR(cudaMemcpy(gpu_k0, k0, kx * sizeof(K), cudaMemcpyHostToDevice)); | ||
69 | + K* gpu_k1; | ||
70 | + HANDLE_ERROR(cudaMalloc(&gpu_k1, ky * sizeof(K))); | ||
71 | + HANDLE_ERROR(cudaMemcpy(gpu_k1, k1, ky * sizeof(K), cudaMemcpyHostToDevice)); | ||
72 | + T* gpu_out; | ||
73 | + HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T))); | ||
74 | + gpu_sepconv2(gpu_out, gpu_in, gpu_k0, gpu_k1, sx, sy, kx, ky); //execute the GPU kernel | ||
75 | + HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost)); //copy the result to the host | ||
76 | + HANDLE_ERROR(cudaFree(gpu_in)); | ||
77 | + HANDLE_ERROR(cudaFree(gpu_k0)); | ||
78 | + HANDLE_ERROR(cudaFree(gpu_k1)); | ||
79 | + HANDLE_ERROR(cudaFree(gpu_out)); | ||
80 | +#else | ||
81 | + T* temp = (T*)malloc(X * sy * sizeof(T)); //allocate space for the intermediate image | ||
82 | + cpu_conv2(temp, in, k0, sx, sy, kx, 1); //evaluate the intermediate image | ||
83 | + cpu_conv2(out, temp, k1, X, sy, 1, ky); //evaluate the final image | ||
84 | + free(temp); //free the memory for the intermediate image | ||
85 | +#endif | ||
86 | + } | ||
87 | +} | ||
88 | + | ||
89 | +#endif | ||
0 | \ No newline at end of file | 90 | \ No newline at end of file |