Blame view

stim/math/filters/sepconv3.cuh 3.87 KB
bb7275b6   Pavel Govyadinov   added a gauss3.h ...
1
2
3
  #ifndef STIM_CUDA_SEPCONV3_H
  #define STIM_CUDA_SEPCONV3_H
  
817b3aba   David Mayerich   changed filter fi...
4
5
  #include <stim/math/filters/conv2.cuh>
  #include <stim/math/filters/sepconv2.cuh>
bb7275b6   Pavel Govyadinov   added a gauss3.h ...
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
  #ifdef __CUDACC__
  	#include <stim/cuda/cudatools.h>
  	#include <stim/cuda/sharedmem.cuh>
  #endif
  
  namespace stim
  {
  #ifdef __CUDACC__
  	template<typename T, typename K>
  	void gpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
  {
  
  	size_t X = dimx - kx + 1; 
  	size_t Y = dimy - ky + 1; 
  	size_t Z = dimz - kz + 1;
  	
  	T* temp_out;
  	int idx_IN;
  	int idx_OUT;
  	HANDLE_ERROR(cudaMalloc(&temp_out, X*Y*dimz*sizeof(T)));
  
  	for(int i = 0; i < dimz; i++)
  	{
  		idx_IN 	= (dimx*dimy)*i-i;
  		idx_OUT = (X*Y)*i-i;
  		gpu_sepconv2(&temp_out[idx_OUT], &in[idx_IN], k0, k1, dimx, dimy, kx, ky);
  	}
  
  	cudaDeviceProp p;
  	HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
  	size_t tmax = p.maxThreadsPerBlock;
  
  	dim3 numThreads(sqrt(tmax), sqrt(tmax));
  	dim3 numBlocks(X*Y/numThreads.x +1, dimz/numThreads.y + 1);
  	size_t sharedMem = (numThreads.x + kz - 1) * numThreads.y * sizeof(T);
  	if(sharedMem > p.sharedMemPerBlock)
  	{
  		std::cout << "Error in stim::gpu_sepconv3() - insufficient shared memory for this kernel." << std::endl;
  		exit(1);
  	}
  	kernel_conv2 <<< numBlocks, numThreads, sharedMem >>> (out, temp_out, k2, X*Y, dimz, 1, kz);
  	HANDLE_ERROR(cudaFree(temp_out));
  
  
  }
  #endif
  
  	//Performs a separable convolution of a 3D image. Only valid pixels based on the kernel ar    e returned.
  	//      As a result, the output image will be smaller than the input image by (kx-1, ky-1 , kz-1)
  	//@param out is a pointer to the output image
  	//@param in is a pointer to the input image
  	//@param kx is the x-axis convolution filter
  	//@param ky is the y-axis convolution filter
  	//@param kz is the z-axis convolution filter
  	//@param dimx is the size of the input image along X
  	//@param dimy is the size of the input image along Y
  	//@param dimz is the size of the input image along Z
  	//@param kx is the size of the kernel along X
  	//@param ky is the size of the kernel along Y
  	//@param kz is the size of the kernel along Z
  
  	template <typename T, typename K>
  	void cpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
  	{
  		//Set up the sizes of the new output, which will be kx, ky, kz, smaller than the i    nput.
  		size_t X = dimx - kx + 1; 
  		size_t Y = dimy - ky + 1; 
  		size_t Z = dimz - kz + 1;
  
  #ifdef __CUDACC__
  	///Set up all of the memory on the GPU
  	T* gpu_in;
  	HANDLE_ERROR(cudaMalloc(&gpu_in, dimx*dimy*dimz*sizeof(T)));
  	HANDLE_ERROR(cudaMemcpy(gpu_in, in, dimx*dimy*dimz*sizeof(T),cudaMemcpyHostToDevice));
  	K* gpu_kx;
  	HANDLE_ERROR(cudaMalloc(&gpu_kx, kx*sizeof(K)));
  	HANDLE_ERROR(cudaMemcpy(gpu_kx, k0, kx*sizeof(K),cudaMemcpyHostToDevice));
  	K* gpu_ky;
  	HANDLE_ERROR(cudaMalloc(&gpu_ky, ky*sizeof(K)));
  	HANDLE_ERROR(cudaMemcpy(gpu_ky, k1, ky*sizeof(K),cudaMemcpyHostToDevice));
  	K* gpu_kz;
  	HANDLE_ERROR(cudaMalloc(&gpu_kz, kz*sizeof(K)));
  	HANDLE_ERROR(cudaMemcpy(gpu_kz, k2, kz*sizeof(K),cudaMemcpyHostToDevice));
  	T* gpu_out;
  	HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * Z*sizeof(T)));
  
  	///run the kernel
  	gpu_sepconv3(gpu_out, gpu_in, gpu_kx, gpu_ky, gpu_kz, dimx, dimy, dimz, kx, ky, kz);
  
  	///Copy the output
  	HANDLE_ERROR(cudaMemcpy(out, gpu_out, X*Y*Z*sizeof(T), cudaMemcpyDeviceToHost));
  
  	///Free all the memory used.
  	HANDLE_ERROR(cudaFree(gpu_in));
  	HANDLE_ERROR(cudaFree(gpu_kx));
  	HANDLE_ERROR(cudaFree(gpu_ky));
  	HANDLE_ERROR(cudaFree(gpu_kz));
  	HANDLE_ERROR(cudaFree(gpu_out));
  #else
  	T* temp = (T*) malloc(X * dimy * sizeof(T));
  	T* temp3 = (T*) malloc(X * Y * dimz * sizeof(T));
  	for(int i = 0; i < dimz; i++)
  	{
  		idx_IN 	= (dimx*dimy)*i-i;
  		idx_OUT = (X*Y)*i-i;
  		cpu_conv2(temp, &in[idx_IN], k0, dimx, dimy, kx, 1)
  		cpu_conv2(&temp3[idx_OUT], temp, k1, X, dimy, 1, ky);
  	}
  	cpu_conv2(out, temp, k2, X*Y, dimz, 1, kz);
  	free(temp);
  	free(temp3);
  
  #endif
  	}
  }
  
  
  #endif