c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
1
2
3
4
5
6
|
#ifndef STIM_CUDA_GAUSSIAN_BLUR3_H
#define STIM_CUDA_GAUSSIAN_BLUR3_H
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
7
|
#include <stim/cuda/cudatools/error.h>
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
8
9
10
|
#define pi 3.14159
|
310a1698
Laila Saadatifard
update the ivote3...
|
11
|
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
12
13
|
template<typename T>
|
6ef1dab9
Laila Saadatifard
fix one bug in th...
|
14
|
__global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
15
|
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
16
17
|
//calculate x,y,z coordinates for this thread
int xi = blockIdx.x * blockDim.x + threadIdx.x;
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
18
19
20
21
22
|
//find the grid size along y
int grid_y = y / blockDim.y;
int blockidx_y = blockIdx.y % grid_y;
int yi = blockidx_y * blockDim.y + threadIdx.y;
int zi = blockIdx.y / grid_y;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
23
24
|
int i = zi * x * y + yi * x + xi;
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
25
|
// calculate the kernel size
|
310a1698
Laila Saadatifard
update the ivote3...
|
26
|
T k_size = sigma * 4;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
27
28
29
30
31
32
|
//if the current pixel is outside of the image
if(xi >= x || yi >= y || zi>=z)
return;
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
33
|
int gx, gi;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
34
35
36
|
T G;
T sum = 0; //running weighted sum across the kernel
out[i] = 0;
|
310a1698
Laila Saadatifard
update the ivote3...
|
37
38
39
40
41
42
43
44
45
|
T sigma_sq = 2.0 * sigma * sigma;
T a = 1.0 / (sigma * sqrt(2.0 * pi));
//handle the boundary in x direction
int kl = -(int)k_size;
//if (xi < k_size) kl = -xi;
int kh = (int)k_size;
//if (xi >= x - (int)k_size) kh = x - xi - 1;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
46
47
|
//for each element of the kernel
|
310a1698
Laila Saadatifard
update the ivote3...
|
48
|
for(int ki = kl; ki <= kh; ki++){
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
49
50
|
//calculate the gaussian value
|
310a1698
Laila Saadatifard
update the ivote3...
|
51
|
G = a * exp(-(T)(ki*ki) / (sigma_sq));
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
52
|
//calculate the global coordinates for this point in the kernel
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
53
54
55
|
gx = (xi + ki) % x;
gi = zi * x * y + yi * x + gx;
sum += G * in[gi];
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
56
57
58
59
|
}
//output the result to global memory
out[i] = sum;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
60
61
62
63
|
}
template<typename T>
|
6ef1dab9
Laila Saadatifard
fix one bug in th...
|
64
|
__global__ void blur_y(T* out, T* in, T sigma, int x, int y, int z){
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
65
66
67
|
//calculate x,y,z coordinates for this thread
int xi = blockIdx.x * blockDim.x + threadIdx.x;
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
68
69
70
71
72
|
//find the grid size along y
int grid_y = y / blockDim.y;
int blockidx_y = blockIdx.y % grid_y;
int yi = blockidx_y * blockDim.y + threadIdx.y;
int zi = blockIdx.y / grid_y;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
73
|
int i = zi * x * y + yi * x + xi;
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
74
75
|
// calculate the kernel size
|
310a1698
Laila Saadatifard
update the ivote3...
|
76
|
T k_size = sigma * 4;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
77
78
79
80
81
82
|
//if the current pixel is outside of the image
if(xi >= x || yi >= y || zi>=z)
return;
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
83
|
int gy, gi;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
84
85
86
|
T G;
T sum = 0; //running weighted sum across the kernel
out[i] = 0;
|
310a1698
Laila Saadatifard
update the ivote3...
|
87
88
89
90
91
92
93
94
95
96
|
T sigma_sq = 2.0 * sigma * sigma;
T a = 1.0 / (sigma * sqrt(2.0 * pi));
//handle the boundary in y direction
int kl = -(int)k_size;
//if (yi < k_size) kl = -yi;
int kh = (int)k_size;
//if (yi >= y - (int)k_size) kh = y - yi - 1;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
97
98
|
//for each element of the kernel
|
310a1698
Laila Saadatifard
update the ivote3...
|
99
|
for(int ki = kl; ki <= kh; ki++){
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
100
101
|
//calculate the gaussian value
|
310a1698
Laila Saadatifard
update the ivote3...
|
102
|
G = a * exp(-(T)(ki*ki) / sigma_sq);
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
103
|
//calculate the global coordinates for this point in the kernel
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
104
105
106
|
gy = (yi + ki ) % y;
gi = zi * x * y + gy * x + xi;
sum += G * in[gi];
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
107
108
|
}
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
109
|
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
110
111
112
113
114
|
//output the result to global memory
out[i] = sum;
}
template<typename T>
|
6ef1dab9
Laila Saadatifard
fix one bug in th...
|
115
|
__global__ void blur_z(T* out, T* in, T sigma, int x, int y, int z){
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
116
117
118
|
//calculate x,y,z coordinates for this thread
int xi = blockIdx.x * blockDim.x + threadIdx.x;
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
119
120
121
122
123
|
//find the grid size along y
int grid_y = y / blockDim.y;
int blockidx_y = blockIdx.y % grid_y;
int yi = blockidx_y * blockDim.y + threadIdx.y;
int zi = blockIdx.y / grid_y;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
124
|
int i = zi * x * y + yi * x + xi;
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
125
126
|
// calculate the kernel size
|
310a1698
Laila Saadatifard
update the ivote3...
|
127
|
T k_size = sigma * 4;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
128
129
130
131
132
133
|
//if the current pixel is outside of the image
if(xi >= x || yi >= y || zi>=z)
return;
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
134
|
int gz, gi;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
135
136
137
|
T G;
T sum = 0; //running weighted sum across the kernel
out[i] = 0;
|
310a1698
Laila Saadatifard
update the ivote3...
|
138
139
140
141
142
143
144
145
|
T sigma_sq = 2.0 * sigma * sigma;
T a = 1.0 / (sigma * sqrt(2.0 * pi));
//handle the boundary in z direction
int kl = -(int)k_size;
//if (zi < k_size) kl = -zi;
int kh = (int)k_size;
//if (zi >= z - (int)k_size) kh = z - zi - 1;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
146
147
|
//for each element of the kernel
|
310a1698
Laila Saadatifard
update the ivote3...
|
148
|
for(int ki = kl; ki <= kh; ki++){
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
149
150
|
//calculate the gaussian value
|
310a1698
Laila Saadatifard
update the ivote3...
|
151
|
G = a * exp(-(T)(ki*ki) / sigma_sq);
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
152
|
//calculate the global coordinates for this point in the kernel
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
153
154
155
|
gz = (zi + ki) % z;
gi = gz * x * y + yi * x + xi;
sum += G * in[gi];
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
156
157
158
159
160
161
162
|
}
//output the result to global memory
out[i] = sum;
}
template<typename T>
|
310a1698
Laila Saadatifard
update the ivote3...
|
163
|
void gpu_gaussian_blur3(T* image, T sigma[], size_t x, size_t y, size_t z){
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
164
165
|
//get the number of pixels in the image
|
310a1698
Laila Saadatifard
update the ivote3...
|
166
167
|
size_t pixels = x * y * z;
size_t bytes = sizeof(T) * pixels;
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
168
|
|
310a1698
Laila Saadatifard
update the ivote3...
|
169
|
unsigned int max_threads = stim::maxThreadsPerBlock();
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
170
171
|
dim3 threads(sqrt (max_threads),sqrt (max_threads));
dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
172
173
174
175
176
177
178
179
180
|
//allocate temporary space on the GPU
T* gpuIb_x;
cudaMalloc(&gpuIb_x, bytes);
//allocate temporary space on the GPU
T* gpuIb_y;
cudaMalloc(&gpuIb_y, bytes);
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
181
182
|
// blur the original image along the x direction
blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z);
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
183
|
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
184
185
|
// blur the x-blurred image along the y direction
blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z);
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
186
|
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
187
188
|
// blur the xy-blurred image along the z direction
blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z);
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
189
190
191
192
193
|
//cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice);
cudaFree(gpuIb_x);
cudaFree(gpuIb_y);
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
194
195
196
197
198
|
}
/// Applies a Gaussian blur to a 2D image stored on the CPU
template<typename T>
|
f12505fb
Laila Saadatifard
upload the ivote ...
|
199
|
void cpu_gaussian_blur3(T* blur, T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){
|
c9aba18a
Laila Saadatifard
ivote3 on the GPU...
|
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
|
//get the number of pixels in the image
unsigned int pixels = x * y *z;
unsigned int bytes = sizeof(T) * pixels;
//---------Allocate Image---------
//allocate space on the GPU for the image
T* gpuI0;
cudaMalloc(&gpuI0, bytes);
//copy the image data to the GPU
cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
//run the GPU-based version of the algorithm
gpu_gaussian_blur3<T>(gpuI0, sigma, x, y, z);
//copy the image data from the device
cudaMemcpy(blur, gpuI0, bytes, cudaMemcpyDeviceToHost);
//free allocated memory
cudaFree(gpuI0);
}
#endif
|