diff --git a/stim/math/filters/gauss3.h b/stim/math/filters/gauss3.h new file mode 100644 index 0000000..15e0ae2 --- /dev/null +++ b/stim/math/filters/gauss3.h @@ -0,0 +1,55 @@ +#ifndef STIM_CUDA_GAUSS3_H +#define STIM_CUDA_GAUSS3_H +#include +#include +#include + +namespace stim +{ + ///Perform a 3D gaussian convolution on an input image. + ///@param in is a pointer to the input data. + ///@param dimx is the size of in* in the x direction. + ///@param dimx is the size of in* in the y direction. + ///@param dimx is the size of in* in the z direction. + ///@param stdx is the standard deviation (in pixels) along the x axis. + ///@param stdy is the standard deviation (in pixels) along the y axis. + ///@param nstds specifies the number of standard deviations of the Gaussian that will be k ept in the kernel. + template + void cpu_gauss3(T* in, K dimx, K dimy, K dimz, K stdx, K stdy, K stdz, size_t nstds = 3) + { + //Set up the sizes of the gaussian Kernels. + size_t kx = stdx * nstds * 2; + size_t ky = stdy * nstds * 2; + size_t kz = stdz * nstds * 2; + + //Set up the sizes of the new output, which will be kx, ky, kz, smaller than the input. + size_t X = dimx - kx +1; + size_t Y = dimy - ky +1; + size_t Z = dimz - kz +1; + T* out = (T*) malloc(X*Y*Z* sizeof(T)); + + ///Set up the memory that will store the gaussians + K* gaussx = (K*)malloc(kx *sizeof(K)); + K* gaussy = (K*)malloc(ky *sizeof(K)); + K* gaussz = (K*)malloc(kz *sizeof(K)); + + ///Set up the midpoints of the gaussians. + K midgaussx = (K) kx/ (K)2; + K midgaussy = (K) ky/ (K)2; + K midgaussz = (K) kz/ (K)2; + + ///Evaluate the kernels in each cardinal direction. + for(size_t i = 0; i < kx; i++) + gaussx[i] = gauss1d((K) i, midgaussx, stdx); + + for(size_t i = 0; i < kx; i++) + gaussy[i] = gauss1d((K) i, midgaussy, stdy); + + for(size_t i = 0; i < kx; i++) + gaussz[i] = gauss1d((K) i, midgaussz, stdz); + + cpu_sepconv3(out, in, gaussx, gaussy, gaussz, dimx, dimy, dimz, kx, ky, kz); + + } +} +#endif diff --git a/stim/math/filters/sepconv2.h b/stim/math/filters/sepconv2.h index 5809595..d05b697 100644 --- a/stim/math/filters/sepconv2.h +++ b/stim/math/filters/sepconv2.h @@ -86,4 +86,4 @@ namespace stim { } } -#endif \ No newline at end of file +#endif diff --git a/stim/math/filters/sepconv3.h b/stim/math/filters/sepconv3.h new file mode 100644 index 0000000..11f1b0d --- /dev/null +++ b/stim/math/filters/sepconv3.h @@ -0,0 +1,123 @@ +#ifndef STIM_CUDA_SEPCONV3_H +#define STIM_CUDA_SEPCONV3_H + +#include +#include +#ifdef __CUDACC__ + #include + #include +#endif + +namespace stim +{ +#ifdef __CUDACC__ + template + void gpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz) +{ + + size_t X = dimx - kx + 1; + size_t Y = dimy - ky + 1; + size_t Z = dimz - kz + 1; + + T* temp_out; + int idx_IN; + int idx_OUT; + HANDLE_ERROR(cudaMalloc(&temp_out, X*Y*dimz*sizeof(T))); + + for(int i = 0; i < dimz; i++) + { + idx_IN = (dimx*dimy)*i-i; + idx_OUT = (X*Y)*i-i; + gpu_sepconv2(&temp_out[idx_OUT], &in[idx_IN], k0, k1, dimx, dimy, kx, ky); + } + + cudaDeviceProp p; + HANDLE_ERROR(cudaGetDeviceProperties(&p, 0)); + size_t tmax = p.maxThreadsPerBlock; + + dim3 numThreads(sqrt(tmax), sqrt(tmax)); + dim3 numBlocks(X*Y/numThreads.x +1, dimz/numThreads.y + 1); + size_t sharedMem = (numThreads.x + kz - 1) * numThreads.y * sizeof(T); + if(sharedMem > p.sharedMemPerBlock) + { + std::cout << "Error in stim::gpu_sepconv3() - insufficient shared memory for this kernel." << std::endl; + exit(1); + } + kernel_conv2 <<< numBlocks, numThreads, sharedMem >>> (out, temp_out, k2, X*Y, dimz, 1, kz); + HANDLE_ERROR(cudaFree(temp_out)); + + +} +#endif + + //Performs a separable convolution of a 3D image. Only valid pixels based on the kernel ar e returned. + // As a result, the output image will be smaller than the input image by (kx-1, ky-1 , kz-1) + //@param out is a pointer to the output image + //@param in is a pointer to the input image + //@param kx is the x-axis convolution filter + //@param ky is the y-axis convolution filter + //@param kz is the z-axis convolution filter + //@param dimx is the size of the input image along X + //@param dimy is the size of the input image along Y + //@param dimz is the size of the input image along Z + //@param kx is the size of the kernel along X + //@param ky is the size of the kernel along Y + //@param kz is the size of the kernel along Z + + template + void cpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz) + { + //Set up the sizes of the new output, which will be kx, ky, kz, smaller than the i nput. + size_t X = dimx - kx + 1; + size_t Y = dimy - ky + 1; + size_t Z = dimz - kz + 1; + +#ifdef __CUDACC__ + ///Set up all of the memory on the GPU + T* gpu_in; + HANDLE_ERROR(cudaMalloc(&gpu_in, dimx*dimy*dimz*sizeof(T))); + HANDLE_ERROR(cudaMemcpy(gpu_in, in, dimx*dimy*dimz*sizeof(T),cudaMemcpyHostToDevice)); + K* gpu_kx; + HANDLE_ERROR(cudaMalloc(&gpu_kx, kx*sizeof(K))); + HANDLE_ERROR(cudaMemcpy(gpu_kx, k0, kx*sizeof(K),cudaMemcpyHostToDevice)); + K* gpu_ky; + HANDLE_ERROR(cudaMalloc(&gpu_ky, ky*sizeof(K))); + HANDLE_ERROR(cudaMemcpy(gpu_ky, k1, ky*sizeof(K),cudaMemcpyHostToDevice)); + K* gpu_kz; + HANDLE_ERROR(cudaMalloc(&gpu_kz, kz*sizeof(K))); + HANDLE_ERROR(cudaMemcpy(gpu_kz, k2, kz*sizeof(K),cudaMemcpyHostToDevice)); + T* gpu_out; + HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * Z*sizeof(T))); + + ///run the kernel + gpu_sepconv3(gpu_out, gpu_in, gpu_kx, gpu_ky, gpu_kz, dimx, dimy, dimz, kx, ky, kz); + + ///Copy the output + HANDLE_ERROR(cudaMemcpy(out, gpu_out, X*Y*Z*sizeof(T), cudaMemcpyDeviceToHost)); + + ///Free all the memory used. + HANDLE_ERROR(cudaFree(gpu_in)); + HANDLE_ERROR(cudaFree(gpu_kx)); + HANDLE_ERROR(cudaFree(gpu_ky)); + HANDLE_ERROR(cudaFree(gpu_kz)); + HANDLE_ERROR(cudaFree(gpu_out)); +#else + T* temp = (T*) malloc(X * dimy * sizeof(T)); + T* temp3 = (T*) malloc(X * Y * dimz * sizeof(T)); + for(int i = 0; i < dimz; i++) + { + idx_IN = (dimx*dimy)*i-i; + idx_OUT = (X*Y)*i-i; + cpu_conv2(temp, &in[idx_IN], k0, dimx, dimy, kx, 1) + cpu_conv2(&temp3[idx_OUT], temp, k1, X, dimy, 1, ky); + } + cpu_conv2(out, temp, k2, X*Y, dimz, 1, kz); + free(temp); + free(temp3); + +#endif + } +} + + +#endif -- libgit2 0.21.4