Commit bb7275b6b9f13a14f0bfd4da4bd4770806e40dd0
1 parent
2eefb035
added a gauss3.h and sepconv3.h kernels for 3D separable convolution
Showing
3 changed files
with
179 additions
and
1 deletion
Show diff stats
1 | +#ifndef STIM_CUDA_GAUSS3_H | |
2 | +#define STIM_CUDA_GAUSS3_H | |
3 | +#include <stim/math/filters/sepconv3.h> | |
4 | +#include <stim/math/filters/gauss2.h> | |
5 | +#include <stim/math/constants.h> | |
6 | + | |
7 | +namespace stim | |
8 | +{ | |
9 | + ///Perform a 3D gaussian convolution on an input image. | |
10 | + ///@param in is a pointer to the input data. | |
11 | + ///@param dimx is the size of in* in the x direction. | |
12 | + ///@param dimx is the size of in* in the y direction. | |
13 | + ///@param dimx is the size of in* in the z direction. | |
14 | + ///@param stdx is the standard deviation (in pixels) along the x axis. | |
15 | + ///@param stdy is the standard deviation (in pixels) along the y axis. | |
16 | + ///@param nstds specifies the number of standard deviations of the Gaussian that will be k ept in the kernel. | |
17 | + template<typename T, typename K> | |
18 | + void cpu_gauss3(T* in, K dimx, K dimy, K dimz, K stdx, K stdy, K stdz, size_t nstds = 3) | |
19 | + { | |
20 | + //Set up the sizes of the gaussian Kernels. | |
21 | + size_t kx = stdx * nstds * 2; | |
22 | + size_t ky = stdy * nstds * 2; | |
23 | + size_t kz = stdz * nstds * 2; | |
24 | + | |
25 | + //Set up the sizes of the new output, which will be kx, ky, kz, smaller than the input. | |
26 | + size_t X = dimx - kx +1; | |
27 | + size_t Y = dimy - ky +1; | |
28 | + size_t Z = dimz - kz +1; | |
29 | + T* out = (T*) malloc(X*Y*Z* sizeof(T)); | |
30 | + | |
31 | + ///Set up the memory that will store the gaussians | |
32 | + K* gaussx = (K*)malloc(kx *sizeof(K)); | |
33 | + K* gaussy = (K*)malloc(ky *sizeof(K)); | |
34 | + K* gaussz = (K*)malloc(kz *sizeof(K)); | |
35 | + | |
36 | + ///Set up the midpoints of the gaussians. | |
37 | + K midgaussx = (K) kx/ (K)2; | |
38 | + K midgaussy = (K) ky/ (K)2; | |
39 | + K midgaussz = (K) kz/ (K)2; | |
40 | + | |
41 | + ///Evaluate the kernels in each cardinal direction. | |
42 | + for(size_t i = 0; i < kx; i++) | |
43 | + gaussx[i] = gauss1d((K) i, midgaussx, stdx); | |
44 | + | |
45 | + for(size_t i = 0; i < kx; i++) | |
46 | + gaussy[i] = gauss1d((K) i, midgaussy, stdy); | |
47 | + | |
48 | + for(size_t i = 0; i < kx; i++) | |
49 | + gaussz[i] = gauss1d((K) i, midgaussz, stdz); | |
50 | + | |
51 | + cpu_sepconv3(out, in, gaussx, gaussy, gaussz, dimx, dimy, dimz, kx, ky, kz); | |
52 | + | |
53 | + } | |
54 | +} | |
55 | +#endif | ... | ... |
stim/math/filters/sepconv2.h
1 | +#ifndef STIM_CUDA_SEPCONV3_H | |
2 | +#define STIM_CUDA_SEPCONV3_H | |
3 | + | |
4 | +#include <stim/math/filters/conv2.h> | |
5 | +#include <stim/math/filters/sepconv2.h> | |
6 | +#ifdef __CUDACC__ | |
7 | + #include <stim/cuda/cudatools.h> | |
8 | + #include <stim/cuda/sharedmem.cuh> | |
9 | +#endif | |
10 | + | |
11 | +namespace stim | |
12 | +{ | |
13 | +#ifdef __CUDACC__ | |
	//Performs a separable 3D convolution entirely on the device.
	//NOTE(review): all pointers passed here (out, in, k0, k1, k2) must already be
	//device pointers — the host wrapper cpu_sepconv3 copies data in before calling this.
	//out must hold X*Y*Z elements, where X = dimx-kx+1, Y = dimy-ky+1, Z = dimz-kz+1.
	template<typename T, typename K>
	void gpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
{

	//size of the valid-region output along each axis
	size_t X = dimx - kx + 1;
	size_t Y = dimy - ky + 1;
	size_t Z = dimz - kz + 1;

	T* temp_out;	//intermediate volume: every z-slice convolved along x and y, z untouched
	int idx_IN;	//NOTE(review): int can overflow for volumes over ~2^31 elements; size_t would be safer
	int idx_OUT;
	HANDLE_ERROR(cudaMalloc(&temp_out, X*Y*dimz*sizeof(T)));

	//Run a separable 2D (x,y) convolution on each z-slice of the input.
	for(int i = 0; i < dimz; i++)
	{
		//NOTE(review): the "-i" term looks wrong — slice i should start at
		//(dimx*dimy)*i and (X*Y)*i. Confirm against gpu_sepconv2's addressing
		//before changing; left as-is here.
		idx_IN = (dimx*dimy)*i-i;
		idx_OUT = (X*Y)*i-i;
		gpu_sepconv2(&temp_out[idx_OUT], &in[idx_IN], k0, k1, dimx, dimy, kx, ky);
	}

	//Launch configuration for the z-pass: the intermediate volume is treated as a
	//2D image of width X*Y and height dimz, convolved with a 1-by-kz kernel.
	cudaDeviceProp p;
	HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
	size_t tmax = p.maxThreadsPerBlock;

	dim3 numThreads(sqrt(tmax), sqrt(tmax));
	dim3 numBlocks(X*Y/numThreads.x +1, dimz/numThreads.y + 1);
	//shared memory needed by kernel_conv2 for one tile plus the kernel apron
	size_t sharedMem = (numThreads.x + kz - 1) * numThreads.y * sizeof(T);
	if(sharedMem > p.sharedMemPerBlock)
	{
		std::cout << "Error in stim::gpu_sepconv3() - insufficient shared memory for this kernel." << std::endl;
		exit(1);
	}
	kernel_conv2 <<< numBlocks, numThreads, sharedMem >>> (out, temp_out, k2, X*Y, dimz, 1, kz);
	HANDLE_ERROR(cudaFree(temp_out));


}
51 | +#endif | |
52 | + | |
53 | + //Performs a separable convolution of a 3D image. Only valid pixels based on the kernel ar e returned. | |
54 | + // As a result, the output image will be smaller than the input image by (kx-1, ky-1 , kz-1) | |
55 | + //@param out is a pointer to the output image | |
56 | + //@param in is a pointer to the input image | |
57 | + //@param kx is the x-axis convolution filter | |
58 | + //@param ky is the y-axis convolution filter | |
59 | + //@param kz is the z-axis convolution filter | |
60 | + //@param dimx is the size of the input image along X | |
61 | + //@param dimy is the size of the input image along Y | |
62 | + //@param dimz is the size of the input image along Z | |
63 | + //@param kx is the size of the kernel along X | |
64 | + //@param ky is the size of the kernel along Y | |
65 | + //@param kz is the size of the kernel along Z | |
66 | + | |
67 | + template <typename T, typename K> | |
68 | + void cpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz) | |
69 | + { | |
70 | + //Set up the sizes of the new output, which will be kx, ky, kz, smaller than the i nput. | |
71 | + size_t X = dimx - kx + 1; | |
72 | + size_t Y = dimy - ky + 1; | |
73 | + size_t Z = dimz - kz + 1; | |
74 | + | |
75 | +#ifdef __CUDACC__ | |
76 | + ///Set up all of the memory on the GPU | |
77 | + T* gpu_in; | |
78 | + HANDLE_ERROR(cudaMalloc(&gpu_in, dimx*dimy*dimz*sizeof(T))); | |
79 | + HANDLE_ERROR(cudaMemcpy(gpu_in, in, dimx*dimy*dimz*sizeof(T),cudaMemcpyHostToDevice)); | |
80 | + K* gpu_kx; | |
81 | + HANDLE_ERROR(cudaMalloc(&gpu_kx, kx*sizeof(K))); | |
82 | + HANDLE_ERROR(cudaMemcpy(gpu_kx, k0, kx*sizeof(K),cudaMemcpyHostToDevice)); | |
83 | + K* gpu_ky; | |
84 | + HANDLE_ERROR(cudaMalloc(&gpu_ky, ky*sizeof(K))); | |
85 | + HANDLE_ERROR(cudaMemcpy(gpu_ky, k1, ky*sizeof(K),cudaMemcpyHostToDevice)); | |
86 | + K* gpu_kz; | |
87 | + HANDLE_ERROR(cudaMalloc(&gpu_kz, kz*sizeof(K))); | |
88 | + HANDLE_ERROR(cudaMemcpy(gpu_kz, k2, kz*sizeof(K),cudaMemcpyHostToDevice)); | |
89 | + T* gpu_out; | |
90 | + HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * Z*sizeof(T))); | |
91 | + | |
92 | + ///run the kernel | |
93 | + gpu_sepconv3(gpu_out, gpu_in, gpu_kx, gpu_ky, gpu_kz, dimx, dimy, dimz, kx, ky, kz); | |
94 | + | |
95 | + ///Copy the output | |
96 | + HANDLE_ERROR(cudaMemcpy(out, gpu_out, X*Y*Z*sizeof(T), cudaMemcpyDeviceToHost)); | |
97 | + | |
98 | + ///Free all the memory used. | |
99 | + HANDLE_ERROR(cudaFree(gpu_in)); | |
100 | + HANDLE_ERROR(cudaFree(gpu_kx)); | |
101 | + HANDLE_ERROR(cudaFree(gpu_ky)); | |
102 | + HANDLE_ERROR(cudaFree(gpu_kz)); | |
103 | + HANDLE_ERROR(cudaFree(gpu_out)); | |
104 | +#else | |
105 | + T* temp = (T*) malloc(X * dimy * sizeof(T)); | |
106 | + T* temp3 = (T*) malloc(X * Y * dimz * sizeof(T)); | |
107 | + for(int i = 0; i < dimz; i++) | |
108 | + { | |
109 | + idx_IN = (dimx*dimy)*i-i; | |
110 | + idx_OUT = (X*Y)*i-i; | |
111 | + cpu_conv2(temp, &in[idx_IN], k0, dimx, dimy, kx, 1) | |
112 | + cpu_conv2(&temp3[idx_OUT], temp, k1, X, dimy, 1, ky); | |
113 | + } | |
114 | + cpu_conv2(out, temp, k2, X*Y, dimz, 1, kz); | |
115 | + free(temp); | |
116 | + free(temp3); | |
117 | + | |
118 | +#endif | |
119 | + } | |
120 | +} | |
121 | + | |
122 | + | |
123 | +#endif | ... | ... |