Blame view

stim/math/filters/conv2.h 5.02 KB
2e5e3a26   David Mayerich   added 2D convolut...
1
2
3
4
5
6
  #ifndef STIM_CUDA_CONV2_H
  #define STIM_CUDA_CONV2_H
  //#define __CUDACC__
  
  #ifdef __CUDACC__
  #include <stim/cuda/cudatools.h>
dbeb83f2   David Mayerich   added separable c...
7
  #include <stim/cuda/sharedmem.cuh>
2e5e3a26   David Mayerich   added 2D convolut...
8
9
10
  #endif
  
  namespace stim {
dbeb83f2   David Mayerich   added separable c...
11
  #ifdef __CUDACC__
2e5e3a26   David Mayerich   added 2D convolut...
12
  	//Kernel function that performs the 2D convolution.
dbeb83f2   David Mayerich   added separable c...
13
14
15
  	template<typename T, typename K = T>
  	__global__ void kernel_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
  		extern __shared__ T s[];								//declare a shared memory array
2e5e3a26   David Mayerich   added 2D convolut...
16
17
  		size_t xi = blockIdx.x * blockDim.x + threadIdx.x;	//threads correspond to indices into the output image
  		size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
dbeb83f2   David Mayerich   added separable c...
18
19
20
21
22
  		size_t tid = threadIdx.y * blockDim.x + threadIdx.x;
  		size_t nt = blockDim.x * blockDim.y;
  
  		size_t cx = blockIdx.x * blockDim.x;					//find the upper left corner of the input region
  		size_t cy = blockIdx.y * blockDim.y;
2e5e3a26   David Mayerich   added 2D convolut...
23
24
25
26
  
  		size_t X = sx - kx + 1;								//calculate the size of the output image
  		size_t Y = sy - ky + 1;
  
dbeb83f2   David Mayerich   added separable c...
27
28
29
30
31
32
  		if (cx >= X || cy >= Y) return;						//return if the entire block is outside the image
  		size_t smx = min(blockDim.x + kx - 1, sx - cx);			//size of the shared copy of the input image
  		size_t smy = min(blockDim.y + ky - 1, sy - cy);			//	min function is used to deal with boundary blocks
  		stim::cuda::threadedMemcpy2D<T>(s, smx, smy, in, cx, cy, sx, sy, tid, nt);	//copy the input region to shared memory
  		__syncthreads();
  
2e5e3a26   David Mayerich   added 2D convolut...
33
34
35
36
  		if (xi >= X || yi >= Y) return;						//returns if the thread is outside of the output image
  		
  		//loop through the kernel
  		size_t kxi, kyi;
dbeb83f2   David Mayerich   added separable c...
37
  		K v = 0;
2e5e3a26   David Mayerich   added 2D convolut...
38
39
  		for (kyi = 0; kyi < ky; kyi++) {
  			for (kxi = 0; kxi < kx; kxi++) {
dbeb83f2   David Mayerich   added separable c...
40
41
  				v += s[(threadIdx.y + kyi) * smx + threadIdx.x + kxi] * kernel[kyi * kx + kxi];
  				//v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
2e5e3a26   David Mayerich   added 2D convolut...
42
43
  			}
  		}
dbeb83f2   David Mayerich   added separable c...
44
  		out[yi * X + xi] = (T)v;								//write the result to global memory
2e5e3a26   David Mayerich   added 2D convolut...
45
46
47
48
49
50
51
52
53
54
  
  	}
  
  	//Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
  	//@param out is a pointer to the output image
  	//@param in is a pointer to the input image
  	//@param sx is the size of the input image along X
  	//@param sy is the size of the input image along Y
  	//@param kx is the size of the kernel along X
  	//@param ky is the size of the kernel along Y
dbeb83f2   David Mayerich   added separable c...
55
56
  	template<typename T, typename K = T>
  	void gpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
2e5e3a26   David Mayerich   added 2D convolut...
57
58
59
  		cudaDeviceProp p;
  		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
  		size_t tmax = p.maxThreadsPerBlock;
dbeb83f2   David Mayerich   added separable c...
60
  		dim3 nt(sqrt(tmax), sqrt(tmax));					//calculate the block dimensions
2e5e3a26   David Mayerich   added 2D convolut...
61
62
  		size_t X = sx - kx + 1;								//calculate the size of the output image
  		size_t Y = sy - ky + 1;
dbeb83f2   David Mayerich   added separable c...
63
64
65
66
67
68
69
  		dim3 nb(X / nt.x + 1, Y / nt.y + 1);							//calculate the grid dimensions
  		size_t sm = (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T);		//shared memory bytes required to store block data
  		if (sm > p.sharedMemPerBlock) {
  			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
  			exit(1);
  		}
  		kernel_conv2 <<<nb, nt, sm>>> (out, in, kernel, sx, sy, kx, ky);	//launch the kernel
2e5e3a26   David Mayerich   added 2D convolut...
70
  	}
dbeb83f2   David Mayerich   added separable c...
71
  #endif
2e5e3a26   David Mayerich   added 2D convolut...
72
73
74
75
76
77
78
79
  	//Performs a convolution of a 2D image. Only valid pixels based on the kernel are returned.
  	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
  	//@param out is a pointer to the output image
  	//@param in is a pointer to the input image
  	//@param sx is the size of the input image along X
  	//@param sy is the size of the input image along Y
  	//@param kx is the size of the kernel along X
  	//@param ky is the size of the kernel along Y
dbeb83f2   David Mayerich   added separable c...
80
81
  	template<typename T, typename K = T>
  	void cpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
2e5e3a26   David Mayerich   added 2D convolut...
82
83
84
85
86
87
88
89
  		size_t X = sx - kx + 1;					//x size of the output image
  		size_t Y = sy - ky + 1;					//y size of the output image
  
  #ifdef __CUDACC__
  		//allocate memory and copy everything to the GPU
  		T* gpu_in;
  		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
  		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
dbeb83f2   David Mayerich   added separable c...
90
91
92
  		K* gpu_kernel;
  		HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(K)));
  		HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(K), cudaMemcpyHostToDevice));
2e5e3a26   David Mayerich   added 2D convolut...
93
94
95
96
97
98
99
100
  		T* gpu_out;
  		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
  		gpu_conv2(gpu_out, gpu_in, gpu_kernel, sx, sy, kx, ky);								//execute the GPU kernel
  		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
  		HANDLE_ERROR(cudaFree(gpu_in));
  		HANDLE_ERROR(cudaFree(gpu_kernel));
  		HANDLE_ERROR(cudaFree(gpu_out));
  #else
dbeb83f2   David Mayerich   added separable c...
101
  		K v;												//register stores the integral of the current pixel value
2e5e3a26   David Mayerich   added 2D convolut...
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
  		size_t yi, xi, kyi, kxi, yi_kyi_sx;
  		for (yi = 0; yi < Y; yi++) {					//for each pixel in the output image
  			for (xi = 0; xi < X; xi++) {
  				v = 0;
  				for (kyi = 0; kyi < ky; kyi++) {		//for each pixel in the kernel
  					yi_kyi_sx = (yi + kyi) * sx;
  					for (kxi = 0; kxi < kx; kxi++) {
  						v += in[yi_kyi_sx + xi + kxi] * kernel[kyi * kx + kxi];
  					}
  				}
  				out[yi * X + xi] = v;						//save the result to the output array
  			}
  		}
  		
  #endif
  	}
  
  
  }
  
  
  #endif