// sepconv2.cuh
#ifndef STIM_CUDA_SEPCONV2_H
#define STIM_CUDA_SEPCONV2_H
#include <stim/math/filters/conv2.cuh>
#ifdef __CUDACC__
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#endif
namespace stim {
#ifdef __CUDACC__
    //Performs a separable convolution of a 2D image using the GPU. All pointers are assumed to point to memory on the current device.
    //  A separable kernel k(x, y) = k0(x) * k1(y) is applied in two 1D passes: a convolution along X with k0,
    //  followed by a convolution along Y with k1 on the intermediate image. Only valid pixels are computed,
    //  so the output image is smaller than the input by (kx-1, ky-1).
    //@param out is a pointer to the output image
    //@param in is a pointer to the input image
    //@param k0 is the x-axis convolution filter (kx elements)
    //@param k1 is the y-axis convolution filter (ky elements)
    //@param sx is the size of the input image along X
    //@param sy is the size of the input image along Y
    //@param kx is the size of the kernel along X
    //@param ky is the size of the kernel along Y
    template<typename T, typename K>
    void gpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
        cudaDeviceProp p;
        HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
        size_t tmax = p.maxThreadsPerBlock;
        dim3 nt((unsigned int)sqrt(tmax), (unsigned int)sqrt(tmax));            //calculate the block dimensions
        size_t X = sx - kx + 1;                                                 //calculate the x size of the output image
        T* temp;                                                                //declare a temporary variable to store the intermediate image
        HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T)));                    //allocate memory for the intermediate image

        dim3 nb((unsigned int)(X / nt.x) + 1, (unsigned int)(sy / nt.y) + 1);   //calculate the grid dimensions
        size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T);                         //shared memory bytes required to store block data
        if (sm > p.sharedMemPerBlock) {
            std::cout << "Error in stim::gpu_sepconv2() - insufficient shared memory for this kernel." << std::endl;
            exit(1);
        }
        kernel_conv2<<<nb, nt, sm>>>(temp, in, k0, sx, sy, kx, 1);              //launch the kernel to compute the intermediate image

        size_t Y = sy - ky + 1;                                                 //calculate the y size of the output image
        nb.y = (unsigned int)(Y / nt.y) + 1;                                    //update the grid dimensions to reflect the Y-axis size of the output image
        sm = nt.x * (nt.y + ky - 1) * sizeof(T);                                //calculate the amount of shared memory needed for the second pass
        if (sm > p.sharedMemPerBlock) {
            std::cout << "Error in stim::gpu_sepconv2() - insufficient shared memory for this kernel." << std::endl;
            exit(1);
        }
        kernel_conv2<<<nb, nt, sm>>>(out, temp, k1, X, sy, 1, ky);              //launch the kernel to compute the final image

        HANDLE_ERROR(cudaFree(temp));                                           //free memory allocated for the intermediate image
    }
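
    //Example (illustrative sketch, not part of the original header): applying a separable filter
    //  entirely on the device. The device pointers d_in, d_kx, d_ky, d_out and the sizes W, H, KW, KH
    //  are hypothetical; the caller is assumed to have allocated and filled the input image and kernels.
    //
    //      float* d_out;   //device output image, (W - KW + 1) * (H - KH + 1) elements
    //      HANDLE_ERROR(cudaMalloc(&d_out, (W - KW + 1) * (H - KH + 1) * sizeof(float)));
    //      stim::gpu_sepconv2(d_out, d_in, d_kx, d_ky, W, H, KW, KH);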
#endif

    //Performs a separable convolution of a 2D image on the CPU (or on the GPU when compiled with CUDA).
    //  Only valid pixels (those where the kernel fits entirely within the image) are returned,
    //  so the output image will be smaller than the input image by (kx-1, ky-1).
    //@param out is a pointer to the output image
    //@param in is a pointer to the input image
    //@param k0 is the x-axis convolution filter
    //@param k1 is the y-axis convolution filter
    //@param sx is the size of the input image along X
    //@param sy is the size of the input image along Y
    //@param kx is the size of the kernel along X
    //@param ky is the size of the kernel along Y
    template<typename T, typename K>
    void cpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
        size_t X = sx - kx + 1;                                     //x size of the output image
        size_t Y = sy - ky + 1;                                     //y size of the output image
#ifdef __CUDACC__
        //allocate memory and copy everything to the GPU
        T* gpu_in;
        HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
        HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
        K* gpu_k0;
        HANDLE_ERROR(cudaMalloc(&gpu_k0, kx * sizeof(K)));
        HANDLE_ERROR(cudaMemcpy(gpu_k0, k0, kx * sizeof(K), cudaMemcpyHostToDevice));
        K* gpu_k1;
        HANDLE_ERROR(cudaMalloc(&gpu_k1, ky * sizeof(K)));
        HANDLE_ERROR(cudaMemcpy(gpu_k1, k1, ky * sizeof(K), cudaMemcpyHostToDevice));
        T* gpu_out;
        HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));

        gpu_sepconv2(gpu_out, gpu_in, gpu_k0, gpu_k1, sx, sy, kx, ky);                          //execute the GPU kernel
        HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));      //copy the result to the host

        //free allocated device memory
        HANDLE_ERROR(cudaFree(gpu_in));
        HANDLE_ERROR(cudaFree(gpu_k0));
        HANDLE_ERROR(cudaFree(gpu_k1));
        HANDLE_ERROR(cudaFree(gpu_out));
#else
        T* temp = (T*)malloc(X * sy * sizeof(T));                   //allocate space for the intermediate image
        cpu_conv2(temp, in, k0, sx, sy, kx, 1);                     //evaluate the intermediate image (x-axis pass)
        cpu_conv2(out, temp, k1, X, sy, 1, ky);                     //evaluate the final image (y-axis pass)
        free(temp);                                                 //free the memory for the intermediate image
#endif
    }
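
    //Example (illustrative sketch, not part of the original header): a 3x3 box blur built from two
    //  1D averaging kernels. The input buffer img and the sizes W and H are hypothetical and are
    //  assumed to be allocated and filled by the caller.
    //
    //      float bx[3] = { 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f };        //x-axis averaging kernel
    //      float by[3] = { 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f };        //y-axis averaging kernel
    //      std::vector<float> blurred((W - 2) * (H - 2));                  //valid output region is (W-2) x (H-2)
    //      stim::cpu_sepconv2(blurred.data(), img, bx, by, W, H, 3, 3);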
}
#endif