Blame view

stim/math/filters/conv2.cuh 5.01 KB
2e5e3a26   David Mayerich   added 2D convolut...
1
2
3
4
  #ifndef STIM_CUDA_CONV2_H
  #define STIM_CUDA_CONV2_H
  //#define __CUDACC__
  
2e5e3a26   David Mayerich   added 2D convolut...
5
  #include <stim/cuda/cudatools.h>
dbeb83f2   David Mayerich   added separable c...
6
  #include <stim/cuda/sharedmem.cuh>
2e5e3a26   David Mayerich   added 2D convolut...
7
8
  
  namespace stim {
2e5e3a26   David Mayerich   added 2D convolut...
9
  	//Kernel function that performs the 2D convolution.
683f216a   David Mayerich   fixed? Linux comp...
10
  	template<typename T, typename K>
dbeb83f2   David Mayerich   added separable c...
11
12
  	__global__ void kernel_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
  		extern __shared__ T s[];								//declare a shared memory array
2e5e3a26   David Mayerich   added 2D convolut...
13
14
  		size_t xi = blockIdx.x * blockDim.x + threadIdx.x;	//threads correspond to indices into the output image
  		size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
dbeb83f2   David Mayerich   added separable c...
15
16
17
18
19
  		size_t tid = threadIdx.y * blockDim.x + threadIdx.x;
  		size_t nt = blockDim.x * blockDim.y;
  
  		size_t cx = blockIdx.x * blockDim.x;					//find the upper left corner of the input region
  		size_t cy = blockIdx.y * blockDim.y;
2e5e3a26   David Mayerich   added 2D convolut...
20
21
22
23
  
  		size_t X = sx - kx + 1;								//calculate the size of the output image
  		size_t Y = sy - ky + 1;
  
dbeb83f2   David Mayerich   added separable c...
24
25
26
27
28
29
  		if (cx >= X || cy >= Y) return;						//return if the entire block is outside the image
  		size_t smx = min(blockDim.x + kx - 1, sx - cx);			//size of the shared copy of the input image
  		size_t smy = min(blockDim.y + ky - 1, sy - cy);			//	min function is used to deal with boundary blocks
  		stim::cuda::threadedMemcpy2D<T>(s, smx, smy, in, cx, cy, sx, sy, tid, nt);	//copy the input region to shared memory
  		__syncthreads();
  
2e5e3a26   David Mayerich   added 2D convolut...
30
31
32
33
  		if (xi >= X || yi >= Y) return;						//returns if the thread is outside of the output image
  		
  		//loop through the kernel
  		size_t kxi, kyi;
dbeb83f2   David Mayerich   added separable c...
34
  		K v = 0;
2e5e3a26   David Mayerich   added 2D convolut...
35
36
  		for (kyi = 0; kyi < ky; kyi++) {
  			for (kxi = 0; kxi < kx; kxi++) {
dbeb83f2   David Mayerich   added separable c...
37
38
  				v += s[(threadIdx.y + kyi) * smx + threadIdx.x + kxi] * kernel[kyi * kx + kxi];
  				//v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
2e5e3a26   David Mayerich   added 2D convolut...
39
40
  			}
  		}
dbeb83f2   David Mayerich   added separable c...
41
  		out[yi * X + xi] = (T)v;								//write the result to global memory
2e5e3a26   David Mayerich   added 2D convolut...
42
43
44
45
  
  	}
  
  	//Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
83cdf0dc   David Mayerich   added the ability...
46
  	//@param out is a pointer to the output image, which is of size (sx - kx + 1) x (sy - ky + 1)
2e5e3a26   David Mayerich   added 2D convolut...
47
48
49
50
51
  	//@param in is a pointer to the input image
  	//@param sx is the size of the input image along X
  	//@param sy is the size of the input image along Y
  	//@param kx is the size of the kernel along X
  	//@param ky is the size of the kernel along Y
683f216a   David Mayerich   fixed? Linux comp...
52
  	template<typename T, typename K>
dbeb83f2   David Mayerich   added separable c...
53
  	void gpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
2e5e3a26   David Mayerich   added 2D convolut...
54
55
56
  		cudaDeviceProp p;
  		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
  		size_t tmax = p.maxThreadsPerBlock;
dbeb83f2   David Mayerich   added separable c...
57
  		dim3 nt(sqrt(tmax), sqrt(tmax));					//calculate the block dimensions
2e5e3a26   David Mayerich   added 2D convolut...
58
59
  		size_t X = sx - kx + 1;								//calculate the size of the output image
  		size_t Y = sy - ky + 1;
dbeb83f2   David Mayerich   added separable c...
60
61
62
63
64
65
66
  		dim3 nb(X / nt.x + 1, Y / nt.y + 1);							//calculate the grid dimensions
  		size_t sm = (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T);		//shared memory bytes required to store block data
  		if (sm > p.sharedMemPerBlock) {
  			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
  			exit(1);
  		}
  		kernel_conv2 <<<nb, nt, sm>>> (out, in, kernel, sx, sy, kx, ky);	//launch the kernel
2e5e3a26   David Mayerich   added 2D convolut...
67
  	}
817b3aba   David Mayerich   changed filter fi...
68
  //#endif
2e5e3a26   David Mayerich   added 2D convolut...
69
70
71
72
73
74
75
76
  	//Performs a convolution of a 2D image. Only valid pixels based on the kernel are returned.
  	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
  	//@param out is a pointer to the output image
  	//@param in is a pointer to the input image
  	//@param sx is the size of the input image along X
  	//@param sy is the size of the input image along Y
  	//@param kx is the size of the kernel along X
  	//@param ky is the size of the kernel along Y
683f216a   David Mayerich   fixed? Linux comp...
77
  	template<typename T, typename K>
dbeb83f2   David Mayerich   added separable c...
78
  	void cpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
2e5e3a26   David Mayerich   added 2D convolut...
79
80
81
  		size_t X = sx - kx + 1;					//x size of the output image
  		size_t Y = sy - ky + 1;					//y size of the output image
  
2e5e3a26   David Mayerich   added 2D convolut...
82
83
84
85
  		//allocate memory and copy everything to the GPU
  		T* gpu_in;
  		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
  		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
dbeb83f2   David Mayerich   added separable c...
86
87
88
  		K* gpu_kernel;
  		HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(K)));
  		HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(K), cudaMemcpyHostToDevice));
2e5e3a26   David Mayerich   added 2D convolut...
89
90
91
92
93
94
95
  		T* gpu_out;
  		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
  		gpu_conv2(gpu_out, gpu_in, gpu_kernel, sx, sy, kx, ky);								//execute the GPU kernel
  		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
  		HANDLE_ERROR(cudaFree(gpu_in));
  		HANDLE_ERROR(cudaFree(gpu_kernel));
  		HANDLE_ERROR(cudaFree(gpu_out));
817b3aba   David Mayerich   changed filter fi...
96
  /* CPU CODE
dbeb83f2   David Mayerich   added separable c...
97
  		K v;												//register stores the integral of the current pixel value
2e5e3a26   David Mayerich   added 2D convolut...
98
99
100
101
102
103
104
105
106
107
108
109
110
111
  		size_t yi, xi, kyi, kxi, yi_kyi_sx;
  		for (yi = 0; yi < Y; yi++) {					//for each pixel in the output image
  			for (xi = 0; xi < X; xi++) {
  				v = 0;
  				for (kyi = 0; kyi < ky; kyi++) {		//for each pixel in the kernel
  					yi_kyi_sx = (yi + kyi) * sx;
  					for (kxi = 0; kxi < kx; kxi++) {
  						v += in[yi_kyi_sx + xi + kxi] * kernel[kyi * kx + kxi];
  					}
  				}
  				out[yi * X + xi] = v;						//save the result to the output array
  			}
  		}
  		
817b3aba   David Mayerich   changed filter fi...
112
  		*/
2e5e3a26   David Mayerich   added 2D convolut...
113
  	}
817b3aba   David Mayerich   changed filter fi...
114
  	
2e5e3a26   David Mayerich   added 2D convolut...
115
116
117
118
119
  
  }
  
  
  #endif