gaussian_blur.cuh
3.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
#define STIM_CUDA_GAUSSIAN_BLUR_H
#include <iostream>
#include <cuda.h>
#include "gaussian_blur.cuh"
#include <stim/cuda/devices.h>
#define pi 3.14159
// This CUDA kernel applies a Gaussian blur along the x axis
template<typename T>
__global__ void gaussian_blur_x(T* out, T* in, T sigma, unsigned int x, unsigned int y){
//calculate the 1D image index for this thread
int i = blockIdx.x * blockDim.x + threadIdx.x;
//convert this to 2D pixel coordinates
int yi = i / x;
int xi = i - (yi * x);
int k_size = sigma * 4; //calculate the kernel size
int gx, gy, gi; //variables to store global coordinates
T sum = 0; //variable stores the integral of the kernel times the image
T G; //stores the value of the gaussian kernel
out[i] = 0;
//for each element of the kernel
for(int ki = -k_size; ki <= k_size; ki++){
//calculate the gaussian value
G = 1.0 / (sigma * sqrt(2 * pi)) * exp(-(ki*ki) / (2*sigma*sigma));
//calculate the global coordinates for this point in the kernel
gx = xi + ki;
gy = yi;
//make sure we are inside the bounds of the image
if(gx >= 0 && gy >= 0 && gx < x && gy < y){
gi = gy * x + gx;
//perform the integration
sum += G * in[gi];
}
}
//set the output value to the integral of the function times the kernel
out[i] = sum;
}
// This CUDA kernel applies a Gaussian blur along the y axis.
template<typename T>
__global__ void gaussian_blur_y(T* out, T* in, T sigma, unsigned int x, unsigned int y){
//calculate the 1D image index for this thread
int i = blockIdx.x * blockDim.x + threadIdx.x;
//convert this to 2D pixel coordinates
int yi = i / x;
int xi = i - (yi * x);
int k_size = sigma * 4; //calculate the kernel size
int gx, gy, gi; //variables to store global coordinates
T sum = 0; //variable stores the integral of the kernel times the image
T G; //stores the value of the gaussian kernel
out[i] = 0;
//for each element of the kernel
for(int ki = -k_size; ki <= k_size; ki++){
//calculate the gaussian value
G = 1.0 / (sigma * sqrt(2 * pi)) * exp(-(ki*ki) / (2*sigma*sigma));
//calculate the global coordinates for this point in the kernel
gx = xi;
gy = yi + ki;
//make sure we are inside the bounds of the image
if(gx >= 0 && gy >= 0 && gx < x && gy < y){
gi = gy * x + gx;
//perform the integration
sum += G * in[gi];
}
}
//set the output value to the integral of the function times the kernel
out[i] = sum;
}
/// Applies a Gaussian blur to a 2D image stored on the GPU
template<typename T>
void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
//get the number of pixels in the image
unsigned int pixels = x * y;
unsigned int bytes = sizeof(T) * pixels;
//allocate temporary space on the GPU
T* gpuI1;
cudaMalloc(&gpuI1, bytes);
//get the maximum number of threads per block for the CUDA device
int threads = stim::maxThreadsPerBlock();
//calculate the number of blocks
int blocks = pixels / threads + (pixels%threads == 0 ? 0:1);
//run the kernel for the x dimension
gaussian_blur_x <<< blocks, threads >>>(gpuI1, image, sigma, x, y);
//run the kernel for the y dimension
gaussian_blur_y <<< blocks, threads >>>(image, gpuI1, sigma, x, y);
}
/// Applies a Gaussian blur to a 2D image stored on the CPU
template<typename T>
void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
//get the number of pixels in the image
unsigned int pixels = x * y;
unsigned int bytes = sizeof(T) * pixels;
//allocate space on the GPU
T* gpuI0;
cudaMalloc(&gpuI0, bytes);
//copy the image data to the GPU
cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
//run the GPU-based version of the algorithm
gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y);
//copy the image data from the device
cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
}
#endif