Blame view

stim/math/filters/sepconv2.h 4.28 KB
dbeb83f2   David Mayerich   added separable c...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
  #ifndef STIM_CUDA_SEPCONV2_H
  #define STIM_CUDA_SEPCONV2_H
  #include <stim/math/filters/conv2.h>
  #ifdef __CUDACC__
  #include <stim/cuda/cudatools.h>
  #include <stim/cuda/sharedmem.cuh>
  #endif
  
  namespace stim {
  #ifdef __CUDACC__
  	//Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
  	//@param out is a pointer to the output image
  	//@param in is a pointer to the input image
  	//@param sx is the size of the input image along X
  	//@param sy is the size of the input image along Y
  	//@param kx is the size of the kernel along X
  	//@param ky is the size of the kernel along Y
  	template<typename T, typename K = T>
  	void gpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
  		cudaDeviceProp p;
  		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
  		size_t tmax = p.maxThreadsPerBlock;
  		dim3 nt(sqrt(tmax), sqrt(tmax));						//calculate the block dimensions
  		size_t X = sx - kx + 1;									//calculate the x size of the output image
  		T* temp;												//declare a temporary variable to store the intermediate image
  		HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T)));	//allocate memory for the intermediate image
  
  		dim3 nb(X / nt.x + 1, sy / nt.y + 1);							//calculate the grid dimensions
  		size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T);					//shared memory bytes required to store block data
  		if (sm > p.sharedMemPerBlock) {
  			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
  			exit(1);
  		}
  		kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1);	//launch the kernel to compute the intermediate image
  
  		size_t Y = sy - ky + 1;									//calculate the y size of the output image
  		nb.y = Y / nt.y + 1;									//update the grid dimensions to reflect the Y-axis size of the output image
  		sm = nt.x * (nt.y + ky - 1) * sizeof(T);				//calculate the amount of shared memory needed for the second pass
  		if (sm > p.sharedMemPerBlock) {
  			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
  			exit(1);
  		}
  		kernel_conv2 <<<nb, nt, sm>>> (out, temp, k1, X, sy, 1, ky);	//launch the kernel to compute the final image
  		HANDLE_ERROR(cudaFree(temp));							//free memory allocated for the intermediate image
  	}
  #endif
  	//Performs a separable convolution of a 2D image. Only valid pixels based on the kernel are returned.
  	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
  	//@param out is a pointer to the output image
  	//@param in is a pointer to the input image
  	//@param k0 is the x-axis convolution filter
  	//@param k1 is the y-axis convolution filter
  	//@param sx is the size of the input image along X
  	//@param sy is the size of the input image along Y
  	//@param kx is the size of the kernel along X
  	//@param ky is the size of the kernel along Y
  	template<typename T, typename K = T>
  	void cpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
  		size_t X = sx - kx + 1;					//x size of the output image
  		size_t Y = sy - ky + 1;
  #ifdef __CUDACC__
  		//allocate memory and copy everything to the GPU
  		T* gpu_in;
  		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
  		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
  		K* gpu_k0;
  		HANDLE_ERROR(cudaMalloc(&gpu_k0, kx * sizeof(K)));
  		HANDLE_ERROR(cudaMemcpy(gpu_k0, k0, kx * sizeof(K), cudaMemcpyHostToDevice));
  		K* gpu_k1;
  		HANDLE_ERROR(cudaMalloc(&gpu_k1, ky * sizeof(K)));
  		HANDLE_ERROR(cudaMemcpy(gpu_k1, k1, ky * sizeof(K), cudaMemcpyHostToDevice));
  		T* gpu_out;
  		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
  		gpu_sepconv2(gpu_out, gpu_in, gpu_k0, gpu_k1, sx, sy, kx, ky);								//execute the GPU kernel
  		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
  		HANDLE_ERROR(cudaFree(gpu_in));
  		HANDLE_ERROR(cudaFree(gpu_k0));
  		HANDLE_ERROR(cudaFree(gpu_k1));
  		HANDLE_ERROR(cudaFree(gpu_out));
  #else
  		T* temp = (T*)malloc(X * sy * sizeof(T));		//allocate space for the intermediate image
  		cpu_conv2(temp, in, k0, sx, sy, kx, 1);			//evaluate the intermediate image
  		cpu_conv2(out, temp, k1, X, sy, 1, ky);			//evaluate the final image
  		free(temp);										//free the memory for the intermediate image
  #endif
  	}
  }
  
  #endif