Blame view

cpp/gaussian_blur3.cuh 6.04 KB
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
1
2
3
4
5
6
  #ifndef STIM_CUDA_GAUSSIAN_BLUR3_H
  #define STIM_CUDA_GAUSSIAN_BLUR3_H
  
  #include <iostream>
  #include <cuda.h>
  #include <stim/cuda/cudatools.h>
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
7
  #include <stim/cuda/cudatools/error.h>
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
8
9
10
  
  #define pi	3.14159
  
310a1698   Laila Saadatifard   update the ivote3...
11
  
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
12
13
  				
  		template<typename T>
6ef1dab9   Laila Saadatifard   fix one bug in th...
14
  		__global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
15
  
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
16
17
  			//calculate x,y,z coordinates for this thread
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;
f12505fb   Laila Saadatifard   upload the ivote ...
18
19
20
21
22
  			//find the grid size along y
  			int grid_y = y / blockDim.y;
  			int blockidx_y = blockIdx.y % grid_y;
  			int yi = blockidx_y * blockDim.y + threadIdx.y;
  			int zi = blockIdx.y / grid_y;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
23
24
  			int i = zi * x * y + yi * x + xi;
  			
f12505fb   Laila Saadatifard   upload the ivote ...
25
  			// calculate the kernel size			
310a1698   Laila Saadatifard   update the ivote3...
26
  			T k_size = sigma * 4;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
27
28
29
30
31
32
  			
  			//if the current pixel is outside of the image
  			if(xi >= x || yi >= y || zi>=z)
  				return;
  			
  
f12505fb   Laila Saadatifard   upload the ivote ...
33
  			int gx, gi;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
34
35
36
  			T G;
  			T sum = 0;		//running weighted sum across the kernel
  			out[i] = 0;
310a1698   Laila Saadatifard   update the ivote3...
37
38
39
40
41
42
43
44
45
  			T sigma_sq = 2.0 * sigma * sigma;	
  			T a = 1.0 / (sigma * sqrt(2.0 * pi));
  
  			//handle the boundary in x direction
  			int kl = -(int)k_size;
  			//if (xi < k_size) kl = -xi;
  			int kh = (int)k_size;
  			//if (xi >= x - (int)k_size) kh = x - xi - 1;
  
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
46
47
  
  			//for each element of the kernel
310a1698   Laila Saadatifard   update the ivote3...
48
  			for(int ki = kl; ki <= kh; ki++){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
49
50
  
  				//calculate the gaussian value
310a1698   Laila Saadatifard   update the ivote3...
51
  				G = a * exp(-(T)(ki*ki) / (sigma_sq));
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
52
  				//calculate the global coordinates for this point in the kernel
f12505fb   Laila Saadatifard   upload the ivote ...
53
54
55
  				gx = (xi + ki) % x;					
  				gi = zi * x * y + yi * x + gx;
  				sum += G * in[gi];				
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
56
57
58
59
  			}
  			
  			//output the result to global memory
  			out[i] = sum;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
60
61
62
63
  		}
  
  	
  		template<typename T>
6ef1dab9   Laila Saadatifard   fix one bug in th...
64
  		__global__ void blur_y(T* out, T* in, T sigma, int x, int y, int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
65
66
67
  			
  			//calculate x,y,z coordinates for this thread
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;
f12505fb   Laila Saadatifard   upload the ivote ...
68
69
70
71
72
  			//find the grid size along y
  			int grid_y = y / blockDim.y;
  			int blockidx_y = blockIdx.y % grid_y;
  			int yi = blockidx_y * blockDim.y + threadIdx.y;
  			int zi = blockIdx.y / grid_y;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
73
  			int i = zi * x * y + yi * x + xi;
f12505fb   Laila Saadatifard   upload the ivote ...
74
75
  
  			// calculate the kernel size			
310a1698   Laila Saadatifard   update the ivote3...
76
  			T k_size = sigma * 4;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
77
78
79
80
81
82
  			
  			//if the current pixel is outside of the image
  			if(xi >= x || yi >= y || zi>=z)
  				return;
  			
  
f12505fb   Laila Saadatifard   upload the ivote ...
83
  			int gy, gi;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
84
85
86
  			T G;
  			T sum = 0;		//running weighted sum across the kernel
  			out[i] = 0;
310a1698   Laila Saadatifard   update the ivote3...
87
88
89
90
91
92
93
94
95
96
  			T sigma_sq = 2.0 * sigma * sigma;	
  			T a = 1.0 / (sigma * sqrt(2.0 * pi));	
  
  			//handle the boundary in y direction
  			int kl = -(int)k_size;
  			//if (yi < k_size) kl = -yi;
  			int kh = (int)k_size;
  			//if (yi >= y - (int)k_size) kh = y - yi - 1;
  
  
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
97
98
  
  			//for each element of the kernel
310a1698   Laila Saadatifard   update the ivote3...
99
  			for(int ki = kl; ki <= kh; ki++){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
100
101
  
  				//calculate the gaussian value
310a1698   Laila Saadatifard   update the ivote3...
102
  				G = a * exp(-(T)(ki*ki) / sigma_sq);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
103
  				//calculate the global coordinates for this point in the kernel
f12505fb   Laila Saadatifard   upload the ivote ...
104
105
106
  				gy = (yi + ki ) % y;				
  				gi = zi * x * y + gy * x + xi;
  				sum += G * in[gi];						
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
107
108
  			}
  			
f12505fb   Laila Saadatifard   upload the ivote ...
109
  			
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
110
111
112
113
114
  			//output the result to global memory
  			out[i] = sum;
  		}
  		
  		template<typename T>
6ef1dab9   Laila Saadatifard   fix one bug in th...
115
  		__global__ void blur_z(T* out, T* in, T sigma, int x, int y, int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
116
117
118
  			
  			//calculate x,y,z coordinates for this thread
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;
f12505fb   Laila Saadatifard   upload the ivote ...
119
120
121
122
123
  			//find the grid size along y
  			int grid_y = y / blockDim.y;
  			int blockidx_y = blockIdx.y % grid_y;
  			int yi = blockidx_y * blockDim.y + threadIdx.y;
  			int zi = blockIdx.y / grid_y;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
124
  			int i = zi * x * y + yi * x + xi;
f12505fb   Laila Saadatifard   upload the ivote ...
125
126
  
  			// calculate the kernel size			
310a1698   Laila Saadatifard   update the ivote3...
127
  			T k_size = sigma * 4;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
128
129
130
131
132
133
  			
  			//if the current pixel is outside of the image
  			if(xi >= x || yi >= y || zi>=z)
  				return;
  			
  
f12505fb   Laila Saadatifard   upload the ivote ...
134
  			int gz, gi;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
135
136
137
  			T G;
  			T sum = 0;		//running weighted sum across the kernel
  			out[i] = 0;
310a1698   Laila Saadatifard   update the ivote3...
138
139
140
141
142
143
144
145
  			T sigma_sq = 2.0 * sigma * sigma;	
  			T a = 1.0 / (sigma * sqrt(2.0 * pi));	
  
  			//handle the boundary in z direction
  			int kl = -(int)k_size;
  			//if (zi < k_size) kl = -zi;
  			int kh = (int)k_size;
  			//if (zi >= z - (int)k_size) kh = z - zi - 1;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
146
147
  
  			//for each element of the kernel
310a1698   Laila Saadatifard   update the ivote3...
148
  			for(int ki = kl; ki <= kh; ki++){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
149
150
  
  				//calculate the gaussian value
310a1698   Laila Saadatifard   update the ivote3...
151
  				G = a * exp(-(T)(ki*ki) / sigma_sq);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
152
  				//calculate the global coordinates for this point in the kernel
f12505fb   Laila Saadatifard   upload the ivote ...
153
154
155
  				gz = (zi + ki) % z;				
  				gi = gz * x * y + yi * x + xi;					
  				sum += G * in[gi];
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
156
157
158
159
160
161
162
  			}
  			
  			//output the result to global memory
  			out[i] = sum;
  		}
  		
  		template<typename T>
310a1698   Laila Saadatifard   update the ivote3...
163
  		void gpu_gaussian_blur3(T* image, T sigma[], size_t x, size_t y, size_t z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
164
165
  
  			//get the number of pixels in the image
310a1698   Laila Saadatifard   update the ivote3...
166
167
  			size_t pixels = x * y * z;
  			size_t bytes = sizeof(T) * pixels;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
168
  
310a1698   Laila Saadatifard   update the ivote3...
169
  			unsigned int max_threads = stim::maxThreadsPerBlock();
07e31b34   Laila Saadatifard   fix the bugs in i...
170
171
  			dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  			dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
172
173
174
175
176
177
178
179
180
  
  			//allocate temporary space on the GPU
  			T* gpuIb_x;
  			cudaMalloc(&gpuIb_x, bytes);
  
  			//allocate temporary space on the GPU
  			T* gpuIb_y;
  			cudaMalloc(&gpuIb_y, bytes);
  
f12505fb   Laila Saadatifard   upload the ivote ...
181
  			// blur the original image along the x direction
07e31b34   Laila Saadatifard   fix the bugs in i...
182
  			blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], (int)x, (int)y, (int)z);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
183
  			
f12505fb   Laila Saadatifard   upload the ivote ...
184
  			// blur the x-blurred image along the y direction
07e31b34   Laila Saadatifard   fix the bugs in i...
185
  			blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], (int)x, (int)y, (int)z);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
186
  			
f12505fb   Laila Saadatifard   upload the ivote ...
187
  			// blur the xy-blurred image along the z direction
07e31b34   Laila Saadatifard   fix the bugs in i...
188
  			blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], (int)x, (int)y, (int)z);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
189
190
191
192
193
  
  			//cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice);
  
  			cudaFree(gpuIb_x);
  			cudaFree(gpuIb_y);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
194
195
196
197
198
  		}
  
  
  		/// Applies a Gaussian blur to a 2D image stored on the CPU
  		template<typename T>
f12505fb   Laila Saadatifard   upload the ivote ...
199
  		void cpu_gaussian_blur3(T* blur, T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
  
  			
  			//get the number of pixels in the image
  			unsigned int pixels = x * y *z;
  			unsigned int bytes = sizeof(T) * pixels;
  
  			//---------Allocate Image---------
  			//allocate space on the GPU for the image
  			T* gpuI0;
  			cudaMalloc(&gpuI0, bytes);			
  			
  			//copy the image data to the GPU
  			cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
  
  			//run the GPU-based version of the algorithm
  			gpu_gaussian_blur3<T>(gpuI0, sigma, x, y, z);
  
  			//copy the image data from the device
  			cudaMemcpy(blur, gpuI0, bytes, cudaMemcpyDeviceToHost);
  
  			//free allocated memory
  			cudaFree(gpuI0);
  
  			
  		}
  
  
  #endif