Commit bb7275b6b9f13a14f0bfd4da4bd4770806e40dd0
1 parent
2eefb035
added a gauss3.h and sepconv3.h kernels for 3D seperable convolution
Showing
3 changed files
with
179 additions
and
1 deletions
Show diff stats
1 | +#ifndef STIM_CUDA_GAUSS3_H | ||
2 | +#define STIM_CUDA_GAUSS3_H | ||
3 | +#include <stim/math/filters/sepconv3.h> | ||
4 | +#include <stim/math/filters/gauss2.h> | ||
5 | +#include <stim/math/constants.h> | ||
6 | + | ||
7 | +namespace stim | ||
8 | +{ | ||
9 | + ///Perform a 3D gaussian convolution on an input image. | ||
10 | + ///@param in is a pointer to the input data. | ||
11 | + ///@param dimx is the size of in* in the x direction. | ||
12 | + ///@param dimx is the size of in* in the y direction. | ||
13 | + ///@param dimx is the size of in* in the z direction. | ||
14 | + ///@param stdx is the standard deviation (in pixels) along the x axis. | ||
15 | + ///@param stdy is the standard deviation (in pixels) along the y axis. | ||
16 | + ///@param nstds specifies the number of standard deviations of the Gaussian that will be k ept in the kernel. | ||
17 | + template<typename T, typename K> | ||
18 | + void cpu_gauss3(T* in, K dimx, K dimy, K dimz, K stdx, K stdy, K stdz, size_t nstds = 3) | ||
19 | + { | ||
20 | + //Set up the sizes of the gaussian Kernels. | ||
21 | + size_t kx = stdx * nstds * 2; | ||
22 | + size_t ky = stdy * nstds * 2; | ||
23 | + size_t kz = stdz * nstds * 2; | ||
24 | + | ||
25 | + //Set up the sizes of the new output, which will be kx, ky, kz, smaller than the input. | ||
26 | + size_t X = dimx - kx +1; | ||
27 | + size_t Y = dimy - ky +1; | ||
28 | + size_t Z = dimz - kz +1; | ||
29 | + T* out = (T*) malloc(X*Y*Z* sizeof(T)); | ||
30 | + | ||
31 | + ///Set up the memory that will store the gaussians | ||
32 | + K* gaussx = (K*)malloc(kx *sizeof(K)); | ||
33 | + K* gaussy = (K*)malloc(ky *sizeof(K)); | ||
34 | + K* gaussz = (K*)malloc(kz *sizeof(K)); | ||
35 | + | ||
36 | + ///Set up the midpoints of the gaussians. | ||
37 | + K midgaussx = (K) kx/ (K)2; | ||
38 | + K midgaussy = (K) ky/ (K)2; | ||
39 | + K midgaussz = (K) kz/ (K)2; | ||
40 | + | ||
41 | + ///Evaluate the kernels in each cardinal direction. | ||
42 | + for(size_t i = 0; i < kx; i++) | ||
43 | + gaussx[i] = gauss1d((K) i, midgaussx, stdx); | ||
44 | + | ||
45 | + for(size_t i = 0; i < kx; i++) | ||
46 | + gaussy[i] = gauss1d((K) i, midgaussy, stdy); | ||
47 | + | ||
48 | + for(size_t i = 0; i < kx; i++) | ||
49 | + gaussz[i] = gauss1d((K) i, midgaussz, stdz); | ||
50 | + | ||
51 | + cpu_sepconv3(out, in, gaussx, gaussy, gaussz, dimx, dimy, dimz, kx, ky, kz); | ||
52 | + | ||
53 | + } | ||
54 | +} | ||
55 | +#endif |
stim/math/filters/sepconv2.h
1 | +#ifndef STIM_CUDA_SEPCONV3_H | ||
2 | +#define STIM_CUDA_SEPCONV3_H | ||
3 | + | ||
4 | +#include <stim/math/filters/conv2.h> | ||
5 | +#include <stim/math/filters/sepconv2.h> | ||
6 | +#ifdef __CUDACC__ | ||
7 | + #include <stim/cuda/cudatools.h> | ||
8 | + #include <stim/cuda/sharedmem.cuh> | ||
9 | +#endif | ||
10 | + | ||
11 | +namespace stim | ||
12 | +{ | ||
13 | +#ifdef __CUDACC__ | ||
14 | + template<typename T, typename K> | ||
15 | + void gpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz) | ||
16 | +{ | ||
17 | + | ||
18 | + size_t X = dimx - kx + 1; | ||
19 | + size_t Y = dimy - ky + 1; | ||
20 | + size_t Z = dimz - kz + 1; | ||
21 | + | ||
22 | + T* temp_out; | ||
23 | + int idx_IN; | ||
24 | + int idx_OUT; | ||
25 | + HANDLE_ERROR(cudaMalloc(&temp_out, X*Y*dimz*sizeof(T))); | ||
26 | + | ||
27 | + for(int i = 0; i < dimz; i++) | ||
28 | + { | ||
29 | + idx_IN = (dimx*dimy)*i-i; | ||
30 | + idx_OUT = (X*Y)*i-i; | ||
31 | + gpu_sepconv2(&temp_out[idx_OUT], &in[idx_IN], k0, k1, dimx, dimy, kx, ky); | ||
32 | + } | ||
33 | + | ||
34 | + cudaDeviceProp p; | ||
35 | + HANDLE_ERROR(cudaGetDeviceProperties(&p, 0)); | ||
36 | + size_t tmax = p.maxThreadsPerBlock; | ||
37 | + | ||
38 | + dim3 numThreads(sqrt(tmax), sqrt(tmax)); | ||
39 | + dim3 numBlocks(X*Y/numThreads.x +1, dimz/numThreads.y + 1); | ||
40 | + size_t sharedMem = (numThreads.x + kz - 1) * numThreads.y * sizeof(T); | ||
41 | + if(sharedMem > p.sharedMemPerBlock) | ||
42 | + { | ||
43 | + std::cout << "Error in stim::gpu_sepconv3() - insufficient shared memory for this kernel." << std::endl; | ||
44 | + exit(1); | ||
45 | + } | ||
46 | + kernel_conv2 <<< numBlocks, numThreads, sharedMem >>> (out, temp_out, k2, X*Y, dimz, 1, kz); | ||
47 | + HANDLE_ERROR(cudaFree(temp_out)); | ||
48 | + | ||
49 | + | ||
50 | +} | ||
51 | +#endif | ||
52 | + | ||
53 | + //Performs a separable convolution of a 3D image. Only valid pixels based on the kernel ar e returned. | ||
54 | + // As a result, the output image will be smaller than the input image by (kx-1, ky-1 , kz-1) | ||
55 | + //@param out is a pointer to the output image | ||
56 | + //@param in is a pointer to the input image | ||
57 | + //@param kx is the x-axis convolution filter | ||
58 | + //@param ky is the y-axis convolution filter | ||
59 | + //@param kz is the z-axis convolution filter | ||
60 | + //@param dimx is the size of the input image along X | ||
61 | + //@param dimy is the size of the input image along Y | ||
62 | + //@param dimz is the size of the input image along Z | ||
63 | + //@param kx is the size of the kernel along X | ||
64 | + //@param ky is the size of the kernel along Y | ||
65 | + //@param kz is the size of the kernel along Z | ||
66 | + | ||
67 | + template <typename T, typename K> | ||
68 | + void cpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz) | ||
69 | + { | ||
70 | + //Set up the sizes of the new output, which will be kx, ky, kz, smaller than the i nput. | ||
71 | + size_t X = dimx - kx + 1; | ||
72 | + size_t Y = dimy - ky + 1; | ||
73 | + size_t Z = dimz - kz + 1; | ||
74 | + | ||
75 | +#ifdef __CUDACC__ | ||
76 | + ///Set up all of the memory on the GPU | ||
77 | + T* gpu_in; | ||
78 | + HANDLE_ERROR(cudaMalloc(&gpu_in, dimx*dimy*dimz*sizeof(T))); | ||
79 | + HANDLE_ERROR(cudaMemcpy(gpu_in, in, dimx*dimy*dimz*sizeof(T),cudaMemcpyHostToDevice)); | ||
80 | + K* gpu_kx; | ||
81 | + HANDLE_ERROR(cudaMalloc(&gpu_kx, kx*sizeof(K))); | ||
82 | + HANDLE_ERROR(cudaMemcpy(gpu_kx, k0, kx*sizeof(K),cudaMemcpyHostToDevice)); | ||
83 | + K* gpu_ky; | ||
84 | + HANDLE_ERROR(cudaMalloc(&gpu_ky, ky*sizeof(K))); | ||
85 | + HANDLE_ERROR(cudaMemcpy(gpu_ky, k1, ky*sizeof(K),cudaMemcpyHostToDevice)); | ||
86 | + K* gpu_kz; | ||
87 | + HANDLE_ERROR(cudaMalloc(&gpu_kz, kz*sizeof(K))); | ||
88 | + HANDLE_ERROR(cudaMemcpy(gpu_kz, k2, kz*sizeof(K),cudaMemcpyHostToDevice)); | ||
89 | + T* gpu_out; | ||
90 | + HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * Z*sizeof(T))); | ||
91 | + | ||
92 | + ///run the kernel | ||
93 | + gpu_sepconv3(gpu_out, gpu_in, gpu_kx, gpu_ky, gpu_kz, dimx, dimy, dimz, kx, ky, kz); | ||
94 | + | ||
95 | + ///Copy the output | ||
96 | + HANDLE_ERROR(cudaMemcpy(out, gpu_out, X*Y*Z*sizeof(T), cudaMemcpyDeviceToHost)); | ||
97 | + | ||
98 | + ///Free all the memory used. | ||
99 | + HANDLE_ERROR(cudaFree(gpu_in)); | ||
100 | + HANDLE_ERROR(cudaFree(gpu_kx)); | ||
101 | + HANDLE_ERROR(cudaFree(gpu_ky)); | ||
102 | + HANDLE_ERROR(cudaFree(gpu_kz)); | ||
103 | + HANDLE_ERROR(cudaFree(gpu_out)); | ||
104 | +#else | ||
105 | + T* temp = (T*) malloc(X * dimy * sizeof(T)); | ||
106 | + T* temp3 = (T*) malloc(X * Y * dimz * sizeof(T)); | ||
107 | + for(int i = 0; i < dimz; i++) | ||
108 | + { | ||
109 | + idx_IN = (dimx*dimy)*i-i; | ||
110 | + idx_OUT = (X*Y)*i-i; | ||
111 | + cpu_conv2(temp, &in[idx_IN], k0, dimx, dimy, kx, 1) | ||
112 | + cpu_conv2(&temp3[idx_OUT], temp, k1, X, dimy, 1, ky); | ||
113 | + } | ||
114 | + cpu_conv2(out, temp, k2, X*Y, dimz, 1, kz); | ||
115 | + free(temp); | ||
116 | + free(temp3); | ||
117 | + | ||
118 | +#endif | ||
119 | + } | ||
120 | +} | ||
121 | + | ||
122 | + | ||
123 | +#endif |