Commit bb7275b6b9f13a14f0bfd4da4bd4770806e40dd0

Authored by Pavel Govyadinov
1 parent 2eefb035

added a gauss3.h and sepconv3.h kernels for 3D seperable convolution

stim/math/filters/gauss3.h 0 → 100644
  1 +#ifndef STIM_CUDA_GAUSS3_H
  2 +#define STIM_CUDA_GAUSS3_H
  3 +#include <stim/math/filters/sepconv3.h>
  4 +#include <stim/math/filters/gauss2.h>
  5 +#include <stim/math/constants.h>
  6 +
  7 +namespace stim
  8 +{
  9 + ///Perform a 3D gaussian convolution on an input image.
  10 + ///@param in is a pointer to the input data.
  11 + ///@param dimx is the size of in* in the x direction.
  12 + ///@param dimx is the size of in* in the y direction.
  13 + ///@param dimx is the size of in* in the z direction.
  14 + ///@param stdx is the standard deviation (in pixels) along the x axis.
  15 + ///@param stdy is the standard deviation (in pixels) along the y axis.
  16 + ///@param nstds specifies the number of standard deviations of the Gaussian that will be k ept in the kernel.
  17 + template<typename T, typename K>
  18 + void cpu_gauss3(T* in, K dimx, K dimy, K dimz, K stdx, K stdy, K stdz, size_t nstds = 3)
  19 + {
  20 + //Set up the sizes of the gaussian Kernels.
  21 + size_t kx = stdx * nstds * 2;
  22 + size_t ky = stdy * nstds * 2;
  23 + size_t kz = stdz * nstds * 2;
  24 +
  25 + //Set up the sizes of the new output, which will be kx, ky, kz, smaller than the input.
  26 + size_t X = dimx - kx +1;
  27 + size_t Y = dimy - ky +1;
  28 + size_t Z = dimz - kz +1;
  29 + T* out = (T*) malloc(X*Y*Z* sizeof(T));
  30 +
  31 + ///Set up the memory that will store the gaussians
  32 + K* gaussx = (K*)malloc(kx *sizeof(K));
  33 + K* gaussy = (K*)malloc(ky *sizeof(K));
  34 + K* gaussz = (K*)malloc(kz *sizeof(K));
  35 +
  36 + ///Set up the midpoints of the gaussians.
  37 + K midgaussx = (K) kx/ (K)2;
  38 + K midgaussy = (K) ky/ (K)2;
  39 + K midgaussz = (K) kz/ (K)2;
  40 +
  41 + ///Evaluate the kernels in each cardinal direction.
  42 + for(size_t i = 0; i < kx; i++)
  43 + gaussx[i] = gauss1d((K) i, midgaussx, stdx);
  44 +
  45 + for(size_t i = 0; i < kx; i++)
  46 + gaussy[i] = gauss1d((K) i, midgaussy, stdy);
  47 +
  48 + for(size_t i = 0; i < kx; i++)
  49 + gaussz[i] = gauss1d((K) i, midgaussz, stdz);
  50 +
  51 + cpu_sepconv3(out, in, gaussx, gaussy, gaussz, dimx, dimy, dimz, kx, ky, kz);
  52 +
  53 + }
  54 +}
  55 +#endif
... ...
stim/math/filters/sepconv2.h
... ... @@ -86,4 +86,4 @@ namespace stim {
86 86 }
87 87 }
88 88  
89   -#endif
90 89 \ No newline at end of file
  90 +#endif
... ...
stim/math/filters/sepconv3.h 0 → 100644
  1 +#ifndef STIM_CUDA_SEPCONV3_H
  2 +#define STIM_CUDA_SEPCONV3_H
  3 +
  4 +#include <stim/math/filters/conv2.h>
  5 +#include <stim/math/filters/sepconv2.h>
  6 +#ifdef __CUDACC__
  7 + #include <stim/cuda/cudatools.h>
  8 + #include <stim/cuda/sharedmem.cuh>
  9 +#endif
  10 +
  11 +namespace stim
  12 +{
  13 +#ifdef __CUDACC__
  14 + template<typename T, typename K>
  15 + void gpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
  16 +{
  17 +
  18 + size_t X = dimx - kx + 1;
  19 + size_t Y = dimy - ky + 1;
  20 + size_t Z = dimz - kz + 1;
  21 +
  22 + T* temp_out;
  23 + int idx_IN;
  24 + int idx_OUT;
  25 + HANDLE_ERROR(cudaMalloc(&temp_out, X*Y*dimz*sizeof(T)));
  26 +
  27 + for(int i = 0; i < dimz; i++)
  28 + {
  29 + idx_IN = (dimx*dimy)*i-i;
  30 + idx_OUT = (X*Y)*i-i;
  31 + gpu_sepconv2(&temp_out[idx_OUT], &in[idx_IN], k0, k1, dimx, dimy, kx, ky);
  32 + }
  33 +
  34 + cudaDeviceProp p;
  35 + HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
  36 + size_t tmax = p.maxThreadsPerBlock;
  37 +
  38 + dim3 numThreads(sqrt(tmax), sqrt(tmax));
  39 + dim3 numBlocks(X*Y/numThreads.x +1, dimz/numThreads.y + 1);
  40 + size_t sharedMem = (numThreads.x + kz - 1) * numThreads.y * sizeof(T);
  41 + if(sharedMem > p.sharedMemPerBlock)
  42 + {
  43 + std::cout << "Error in stim::gpu_sepconv3() - insufficient shared memory for this kernel." << std::endl;
  44 + exit(1);
  45 + }
  46 + kernel_conv2 <<< numBlocks, numThreads, sharedMem >>> (out, temp_out, k2, X*Y, dimz, 1, kz);
  47 + HANDLE_ERROR(cudaFree(temp_out));
  48 +
  49 +
  50 +}
  51 +#endif
  52 +
  53 + //Performs a separable convolution of a 3D image. Only valid pixels based on the kernel ar e returned.
  54 + // As a result, the output image will be smaller than the input image by (kx-1, ky-1 , kz-1)
  55 + //@param out is a pointer to the output image
  56 + //@param in is a pointer to the input image
  57 + //@param kx is the x-axis convolution filter
  58 + //@param ky is the y-axis convolution filter
  59 + //@param kz is the z-axis convolution filter
  60 + //@param dimx is the size of the input image along X
  61 + //@param dimy is the size of the input image along Y
  62 + //@param dimz is the size of the input image along Z
  63 + //@param kx is the size of the kernel along X
  64 + //@param ky is the size of the kernel along Y
  65 + //@param kz is the size of the kernel along Z
  66 +
  67 + template <typename T, typename K>
  68 + void cpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
  69 + {
  70 + //Set up the sizes of the new output, which will be kx, ky, kz, smaller than the i nput.
  71 + size_t X = dimx - kx + 1;
  72 + size_t Y = dimy - ky + 1;
  73 + size_t Z = dimz - kz + 1;
  74 +
  75 +#ifdef __CUDACC__
  76 + ///Set up all of the memory on the GPU
  77 + T* gpu_in;
  78 + HANDLE_ERROR(cudaMalloc(&gpu_in, dimx*dimy*dimz*sizeof(T)));
  79 + HANDLE_ERROR(cudaMemcpy(gpu_in, in, dimx*dimy*dimz*sizeof(T),cudaMemcpyHostToDevice));
  80 + K* gpu_kx;
  81 + HANDLE_ERROR(cudaMalloc(&gpu_kx, kx*sizeof(K)));
  82 + HANDLE_ERROR(cudaMemcpy(gpu_kx, k0, kx*sizeof(K),cudaMemcpyHostToDevice));
  83 + K* gpu_ky;
  84 + HANDLE_ERROR(cudaMalloc(&gpu_ky, ky*sizeof(K)));
  85 + HANDLE_ERROR(cudaMemcpy(gpu_ky, k1, ky*sizeof(K),cudaMemcpyHostToDevice));
  86 + K* gpu_kz;
  87 + HANDLE_ERROR(cudaMalloc(&gpu_kz, kz*sizeof(K)));
  88 + HANDLE_ERROR(cudaMemcpy(gpu_kz, k2, kz*sizeof(K),cudaMemcpyHostToDevice));
  89 + T* gpu_out;
  90 + HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * Z*sizeof(T)));
  91 +
  92 + ///run the kernel
  93 + gpu_sepconv3(gpu_out, gpu_in, gpu_kx, gpu_ky, gpu_kz, dimx, dimy, dimz, kx, ky, kz);
  94 +
  95 + ///Copy the output
  96 + HANDLE_ERROR(cudaMemcpy(out, gpu_out, X*Y*Z*sizeof(T), cudaMemcpyDeviceToHost));
  97 +
  98 + ///Free all the memory used.
  99 + HANDLE_ERROR(cudaFree(gpu_in));
  100 + HANDLE_ERROR(cudaFree(gpu_kx));
  101 + HANDLE_ERROR(cudaFree(gpu_ky));
  102 + HANDLE_ERROR(cudaFree(gpu_kz));
  103 + HANDLE_ERROR(cudaFree(gpu_out));
  104 +#else
  105 + T* temp = (T*) malloc(X * dimy * sizeof(T));
  106 + T* temp3 = (T*) malloc(X * Y * dimz * sizeof(T));
  107 + for(int i = 0; i < dimz; i++)
  108 + {
  109 + idx_IN = (dimx*dimy)*i-i;
  110 + idx_OUT = (X*Y)*i-i;
  111 + cpu_conv2(temp, &in[idx_IN], k0, dimx, dimy, kx, 1)
  112 + cpu_conv2(&temp3[idx_OUT], temp, k1, X, dimy, 1, ky);
  113 + }
  114 + cpu_conv2(out, temp, k2, X*Y, dimz, 1, kz);
  115 + free(temp);
  116 + free(temp3);
  117 +
  118 +#endif
  119 + }
  120 +}
  121 +
  122 +
  123 +#endif
... ...