Blame view

cpp/gaussian_blur3.cuh 5.44 KB
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
1
2
3
4
5
6
  #ifndef STIM_CUDA_GAUSSIAN_BLUR3_H
  #define STIM_CUDA_GAUSSIAN_BLUR3_H
  
  #include <iostream>
  #include <cuda.h>
  #include <stim/cuda/cudatools.h>
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
7
  #include <stim/cuda/cudatools/error.h>
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
8
9
10
11
12
  
  #define pi	3.14159
  
  				
  		template<typename T>
6ef1dab9   Laila Saadatifard   fix one bug in th...
13
  		__global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
14
  
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
15
16
  			//calculate x,y,z coordinates for this thread
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;
f12505fb   Laila Saadatifard   upload the ivote ...
17
18
19
20
21
  			//find the grid size along y
  			int grid_y = y / blockDim.y;
  			int blockidx_y = blockIdx.y % grid_y;
  			int yi = blockidx_y * blockDim.y + threadIdx.y;
  			int zi = blockIdx.y / grid_y;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
22
23
  			int i = zi * x * y + yi * x + xi;
  			
f12505fb   Laila Saadatifard   upload the ivote ...
24
  			// calculate the kernel size			
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
25
26
27
28
29
30
31
  			int k_size = sigma * 4;
  			
  			//if the current pixel is outside of the image
  			if(xi >= x || yi >= y || zi>=z)
  				return;
  			
  
f12505fb   Laila Saadatifard   upload the ivote ...
32
  			int gx, gi;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
33
34
35
  			T G;
  			T sum = 0;		//running weighted sum across the kernel
  			out[i] = 0;
f12505fb   Laila Saadatifard   upload the ivote ...
36
37
  			T sigma_sq = 2 * sigma * sigma;	
  			T a = 1.0 / (sigma * sqrt(2 * pi));
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
38
39
40
41
42
  
  			//for each element of the kernel
  			for(int ki = -k_size; ki <= k_size; ki++){
  
  				//calculate the gaussian value
f12505fb   Laila Saadatifard   upload the ivote ...
43
  				G = a * exp(-(ki*ki) / (sigma_sq));
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
44
  				//calculate the global coordinates for this point in the kernel
f12505fb   Laila Saadatifard   upload the ivote ...
45
46
47
  				gx = (xi + ki) % x;					
  				gi = zi * x * y + yi * x + gx;
  				sum += G * in[gi];				
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
48
49
50
51
  			}
  			
  			//output the result to global memory
  			out[i] = sum;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
52
53
54
55
  		}
  
  	
  		template<typename T>
6ef1dab9   Laila Saadatifard   fix one bug in th...
56
  		__global__ void blur_y(T* out, T* in, T sigma, int x, int y, int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
57
58
59
  			
  			//calculate x,y,z coordinates for this thread
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;
f12505fb   Laila Saadatifard   upload the ivote ...
60
61
62
63
64
  			//find the grid size along y
  			int grid_y = y / blockDim.y;
  			int blockidx_y = blockIdx.y % grid_y;
  			int yi = blockidx_y * blockDim.y + threadIdx.y;
  			int zi = blockIdx.y / grid_y;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
65
  			int i = zi * x * y + yi * x + xi;
f12505fb   Laila Saadatifard   upload the ivote ...
66
67
  
  			// calculate the kernel size			
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
68
69
70
71
72
73
74
  			int k_size = sigma * 4;
  			
  			//if the current pixel is outside of the image
  			if(xi >= x || yi >= y || zi>=z)
  				return;
  			
  
f12505fb   Laila Saadatifard   upload the ivote ...
75
  			int gy, gi;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
76
77
78
  			T G;
  			T sum = 0;		//running weighted sum across the kernel
  			out[i] = 0;
f12505fb   Laila Saadatifard   upload the ivote ...
79
80
  			T sigma_sq = 2 * sigma * sigma;	
  			T a = 1.0 / (sigma * sqrt(2 * pi));	
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
81
82
83
84
85
  
  			//for each element of the kernel
  			for(int ki = -k_size; ki <= k_size; ki++){
  
  				//calculate the gaussian value
f12505fb   Laila Saadatifard   upload the ivote ...
86
  				G = a * exp(-(ki*ki) / sigma_sq);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
87
  				//calculate the global coordinates for this point in the kernel
f12505fb   Laila Saadatifard   upload the ivote ...
88
89
90
  				gy = (yi + ki ) % y;				
  				gi = zi * x * y + gy * x + xi;
  				sum += G * in[gi];						
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
91
92
  			}
  			
f12505fb   Laila Saadatifard   upload the ivote ...
93
  			
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
94
95
96
97
98
  			//output the result to global memory
  			out[i] = sum;
  		}
  		
  		template<typename T>
6ef1dab9   Laila Saadatifard   fix one bug in th...
99
  		__global__ void blur_z(T* out, T* in, T sigma, int x, int y, int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
100
101
102
  			
  			//calculate x,y,z coordinates for this thread
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;
f12505fb   Laila Saadatifard   upload the ivote ...
103
104
105
106
107
  			//find the grid size along y
  			int grid_y = y / blockDim.y;
  			int blockidx_y = blockIdx.y % grid_y;
  			int yi = blockidx_y * blockDim.y + threadIdx.y;
  			int zi = blockIdx.y / grid_y;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
108
  			int i = zi * x * y + yi * x + xi;
f12505fb   Laila Saadatifard   upload the ivote ...
109
110
  
  			// calculate the kernel size			
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
111
112
113
114
115
116
117
  			int k_size = sigma * 4;
  			
  			//if the current pixel is outside of the image
  			if(xi >= x || yi >= y || zi>=z)
  				return;
  			
  
f12505fb   Laila Saadatifard   upload the ivote ...
118
  			int gz, gi;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
119
120
121
  			T G;
  			T sum = 0;		//running weighted sum across the kernel
  			out[i] = 0;
f12505fb   Laila Saadatifard   upload the ivote ...
122
123
  			T sigma_sq = 2 * sigma * sigma;	
  			T a = 1.0 / (sigma * sqrt(2 * pi));	
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
124
125
126
127
128
  
  			//for each element of the kernel
  			for(int ki = -k_size; ki <= k_size; ki++){
  
  				//calculate the gaussian value
f12505fb   Laila Saadatifard   upload the ivote ...
129
  				G = a * exp(-(ki*ki) / sigma_sq);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
130
  				//calculate the global coordinates for this point in the kernel
f12505fb   Laila Saadatifard   upload the ivote ...
131
132
133
  				gz = (zi + ki) % z;				
  				gi = gz * x * y + yi * x + xi;					
  				sum += G * in[gi];
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
134
135
136
137
138
139
140
  			}
  			
  			//output the result to global memory
  			out[i] = sum;
  		}
  		
  		template<typename T>
f12505fb   Laila Saadatifard   upload the ivote ...
141
  		void gpu_gaussian_blur3(T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
142
143
144
145
146
147
  
  			//get the number of pixels in the image
  			unsigned int pixels = x * y * z;
  			unsigned int bytes = sizeof(T) * pixels;
  
  			int max_threads = stim::maxThreadsPerBlock();
f12505fb   Laila Saadatifard   upload the ivote ...
148
149
  			dim3 threads(sqrt (max_threads),sqrt (max_threads));
  			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
150
151
152
153
154
155
156
157
158
  
  			//allocate temporary space on the GPU
  			T* gpuIb_x;
  			cudaMalloc(&gpuIb_x, bytes);
  
  			//allocate temporary space on the GPU
  			T* gpuIb_y;
  			cudaMalloc(&gpuIb_y, bytes);
  
f12505fb   Laila Saadatifard   upload the ivote ...
159
160
  			// blur the original image along the x direction
  			blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
161
  			
f12505fb   Laila Saadatifard   upload the ivote ...
162
163
  			// blur the x-blurred image along the y direction
  			blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
164
  			
f12505fb   Laila Saadatifard   upload the ivote ...
165
166
  			// blur the xy-blurred image along the z direction
  			blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
167
168
169
170
171
  
  			//cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice);
  
  			cudaFree(gpuIb_x);
  			cudaFree(gpuIb_y);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
172
173
174
175
176
  		}
  
  
  		/// Applies a Gaussian blur to a 2D image stored on the CPU
  		template<typename T>
f12505fb   Laila Saadatifard   upload the ivote ...
177
  		void cpu_gaussian_blur3(T* blur, T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
  
  			
  			//get the number of pixels in the image
  			unsigned int pixels = x * y *z;
  			unsigned int bytes = sizeof(T) * pixels;
  
  			//---------Allocate Image---------
  			//allocate space on the GPU for the image
  			T* gpuI0;
  			cudaMalloc(&gpuI0, bytes);			
  			
  			//copy the image data to the GPU
  			cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
  
  			//run the GPU-based version of the algorithm
  			gpu_gaussian_blur3<T>(gpuI0, sigma, x, y, z);
  
  			//copy the image data from the device
  			cudaMemcpy(blur, gpuI0, bytes, cudaMemcpyDeviceToHost);
  
  			//free allocated memory
  			cudaFree(gpuI0);
  
  			
  		}
  
  
  #endif