Blame view

stim/math/filters/sepconv2.cuh 4.35 KB
dbeb83f2   David Mayerich   added separable c...
1
2
  #ifndef STIM_CUDA_SEPCONV2_H
  #define STIM_CUDA_SEPCONV2_H
817b3aba   David Mayerich   changed filter fi...
3
  #include <stim/math/filters/conv2.cuh>
dbeb83f2   David Mayerich   added separable c...
4
5
6
7
8
9
10
11
12
13
14
15
16
17
  #ifdef __CUDACC__
  #include <stim/cuda/cudatools.h>
  #include <stim/cuda/sharedmem.cuh>
  #endif
  
  namespace stim {
  #ifdef __CUDACC__
  	//Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
  	//@param out is a pointer to the output image
  	//@param in is a pointer to the input image
  	//@param sx is the size of the input image along X
  	//@param sy is the size of the input image along Y
  	//@param kx is the size of the kernel along X
  	//@param ky is the size of the kernel along Y
683f216a   David Mayerich   fixed? Linux comp...
18
  	template<typename T, typename K>
dbeb83f2   David Mayerich   added separable c...
19
20
21
22
  	void gpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
  		cudaDeviceProp p;
  		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
  		size_t tmax = p.maxThreadsPerBlock;
276f9b23   David Mayerich   forced matching t...
23
  		dim3 nt((unsigned int)sqrt(tmax), (unsigned int)sqrt(tmax));						//calculate the block dimensions
dbeb83f2   David Mayerich   added separable c...
24
25
26
27
  		size_t X = sx - kx + 1;									//calculate the x size of the output image
  		T* temp;												//declare a temporary variable to store the intermediate image
  		HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T)));	//allocate memory for the intermediate image
  
276f9b23   David Mayerich   forced matching t...
28
  		dim3 nb((unsigned int)(X / nt.x) + 1, (unsigned int)(sy / nt.y) + 1);							//calculate the grid dimensions
dbeb83f2   David Mayerich   added separable c...
29
30
31
32
33
34
35
36
  		size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T);					//shared memory bytes required to store block data
  		if (sm > p.sharedMemPerBlock) {
  			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
  			exit(1);
  		}
  		kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1);	//launch the kernel to compute the intermediate image
  
  		size_t Y = sy - ky + 1;									//calculate the y size of the output image
276f9b23   David Mayerich   forced matching t...
37
  		nb.y = (unsigned int)(Y / nt.y) + 1;									//update the grid dimensions to reflect the Y-axis size of the output image
dbeb83f2   David Mayerich   added separable c...
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
  		sm = nt.x * (nt.y + ky - 1) * sizeof(T);				//calculate the amount of shared memory needed for the second pass
  		if (sm > p.sharedMemPerBlock) {
  			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
  			exit(1);
  		}
  		kernel_conv2 <<<nb, nt, sm>>> (out, temp, k1, X, sy, 1, ky);	//launch the kernel to compute the final image
  		HANDLE_ERROR(cudaFree(temp));							//free memory allocated for the intermediate image
  	}
  #endif
  	//Performs a separable convolution of a 2D image. Only valid pixels based on the kernel are returned.
  	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
  	//@param out is a pointer to the output image
  	//@param in is a pointer to the input image
  	//@param k0 is the x-axis convolution filter
  	//@param k1 is the y-axis convolution filter
  	//@param sx is the size of the input image along X
  	//@param sy is the size of the input image along Y
  	//@param kx is the size of the kernel along X
  	//@param ky is the size of the kernel along Y
683f216a   David Mayerich   fixed? Linux comp...
57
  	template<typename T, typename K>
dbeb83f2   David Mayerich   added separable c...
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
  	void cpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
  		size_t X = sx - kx + 1;					//x size of the output image
  		size_t Y = sy - ky + 1;
  #ifdef __CUDACC__
  		//allocate memory and copy everything to the GPU
  		T* gpu_in;
  		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
  		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
  		K* gpu_k0;
  		HANDLE_ERROR(cudaMalloc(&gpu_k0, kx * sizeof(K)));
  		HANDLE_ERROR(cudaMemcpy(gpu_k0, k0, kx * sizeof(K), cudaMemcpyHostToDevice));
  		K* gpu_k1;
  		HANDLE_ERROR(cudaMalloc(&gpu_k1, ky * sizeof(K)));
  		HANDLE_ERROR(cudaMemcpy(gpu_k1, k1, ky * sizeof(K), cudaMemcpyHostToDevice));
  		T* gpu_out;
  		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
  		gpu_sepconv2(gpu_out, gpu_in, gpu_k0, gpu_k1, sx, sy, kx, ky);								//execute the GPU kernel
  		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
  		HANDLE_ERROR(cudaFree(gpu_in));
  		HANDLE_ERROR(cudaFree(gpu_k0));
  		HANDLE_ERROR(cudaFree(gpu_k1));
  		HANDLE_ERROR(cudaFree(gpu_out));
  #else
  		T* temp = (T*)malloc(X * sy * sizeof(T));		//allocate space for the intermediate image
  		cpu_conv2(temp, in, k0, sx, sy, kx, 1);			//evaluate the intermediate image
  		cpu_conv2(out, temp, k1, X, sy, 1, ky);			//evaluate the final image
  		free(temp);										//free the memory for the intermediate image
  #endif
  	}
  }
  
bb7275b6   Pavel Govyadinov   added a gauss3.h ...
89
  #endif