Blame view

cpp/gaussian_blur3.cuh 5.5 KB
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
1
2
3
4
5
6
7
8
  #ifndef STIM_CUDA_GAUSSIAN_BLUR3_H
  #define STIM_CUDA_GAUSSIAN_BLUR3_H
  
  #include <iostream>
  #include <cuda.h>
  #include <stim/cuda/cudatools.h>
  #include <stim/cuda/sharedmem.cuh>
  #include <stim/cuda/cudatools/error.h>
f12505fb   Laila Saadatifard   upload the ivote ...
9
  #include "cuda_fp16.h"
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
10
11
12
13
14
  
  #define pi	3.14159
  
  				
  		template<typename T>
6ef1dab9   Laila Saadatifard   fix one bug in th...
15
  		__global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
16
  
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
17
18
  			//calculate x,y,z coordinates for this thread
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;
f12505fb   Laila Saadatifard   upload the ivote ...
19
20
21
22
23
  			//find the grid size along y
  			int grid_y = y / blockDim.y;
  			int blockidx_y = blockIdx.y % grid_y;
  			int yi = blockidx_y * blockDim.y + threadIdx.y;
  			int zi = blockIdx.y / grid_y;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
24
25
  			int i = zi * x * y + yi * x + xi;
  			
f12505fb   Laila Saadatifard   upload the ivote ...
26
  			// calculate the kernel size			
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
27
28
29
30
31
32
33
  			int k_size = sigma * 4;
  			
  			//if the current pixel is outside of the image
  			if(xi >= x || yi >= y || zi>=z)
  				return;
  			
  
f12505fb   Laila Saadatifard   upload the ivote ...
34
  			int gx, gi;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
35
36
37
  			T G;
  			T sum = 0;		//running weighted sum across the kernel
  			out[i] = 0;
f12505fb   Laila Saadatifard   upload the ivote ...
38
39
  			T sigma_sq = 2 * sigma * sigma;	
  			T a = 1.0 / (sigma * sqrt(2 * pi));
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
40
41
42
43
44
  
  			//for each element of the kernel
  			for(int ki = -k_size; ki <= k_size; ki++){
  
  				//calculate the gaussian value
f12505fb   Laila Saadatifard   upload the ivote ...
45
  				G = a * exp(-(ki*ki) / (sigma_sq));
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
46
  				//calculate the global coordinates for this point in the kernel
f12505fb   Laila Saadatifard   upload the ivote ...
47
48
49
  				gx = (xi + ki) % x;					
  				gi = zi * x * y + yi * x + gx;
  				sum += G * in[gi];				
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
50
51
52
53
  			}
  			
  			//output the result to global memory
  			out[i] = sum;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
54
55
56
57
  		}
  
  	
  		template<typename T>
6ef1dab9   Laila Saadatifard   fix one bug in th...
58
  		__global__ void blur_y(T* out, T* in, T sigma, int x, int y, int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
59
60
61
  			
  			//calculate x,y,z coordinates for this thread
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;
f12505fb   Laila Saadatifard   upload the ivote ...
62
63
64
65
66
  			//find the grid size along y
  			int grid_y = y / blockDim.y;
  			int blockidx_y = blockIdx.y % grid_y;
  			int yi = blockidx_y * blockDim.y + threadIdx.y;
  			int zi = blockIdx.y / grid_y;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
67
  			int i = zi * x * y + yi * x + xi;
f12505fb   Laila Saadatifard   upload the ivote ...
68
69
  
  			// calculate the kernel size			
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
70
71
72
73
74
75
76
  			int k_size = sigma * 4;
  			
  			//if the current pixel is outside of the image
  			if(xi >= x || yi >= y || zi>=z)
  				return;
  			
  
f12505fb   Laila Saadatifard   upload the ivote ...
77
  			int gy, gi;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
78
79
80
  			T G;
  			T sum = 0;		//running weighted sum across the kernel
  			out[i] = 0;
f12505fb   Laila Saadatifard   upload the ivote ...
81
82
  			T sigma_sq = 2 * sigma * sigma;	
  			T a = 1.0 / (sigma * sqrt(2 * pi));	
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
83
84
85
86
87
  
  			//for each element of the kernel
  			for(int ki = -k_size; ki <= k_size; ki++){
  
  				//calculate the gaussian value
f12505fb   Laila Saadatifard   upload the ivote ...
88
  				G = a * exp(-(ki*ki) / sigma_sq);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
89
  				//calculate the global coordinates for this point in the kernel
f12505fb   Laila Saadatifard   upload the ivote ...
90
91
92
  				gy = (yi + ki ) % y;				
  				gi = zi * x * y + gy * x + xi;
  				sum += G * in[gi];						
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
93
94
  			}
  			
f12505fb   Laila Saadatifard   upload the ivote ...
95
  			
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
96
97
98
99
100
  			//output the result to global memory
  			out[i] = sum;
  		}
  		
  		template<typename T>
6ef1dab9   Laila Saadatifard   fix one bug in th...
101
  		__global__ void blur_z(T* out, T* in, T sigma, int x, int y, int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
102
103
104
  			
  			//calculate x,y,z coordinates for this thread
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;
f12505fb   Laila Saadatifard   upload the ivote ...
105
106
107
108
109
  			//find the grid size along y
  			int grid_y = y / blockDim.y;
  			int blockidx_y = blockIdx.y % grid_y;
  			int yi = blockidx_y * blockDim.y + threadIdx.y;
  			int zi = blockIdx.y / grid_y;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
110
  			int i = zi * x * y + yi * x + xi;
f12505fb   Laila Saadatifard   upload the ivote ...
111
112
  
  			// calculate the kernel size			
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
113
114
115
116
117
118
119
  			int k_size = sigma * 4;
  			
  			//if the current pixel is outside of the image
  			if(xi >= x || yi >= y || zi>=z)
  				return;
  			
  
f12505fb   Laila Saadatifard   upload the ivote ...
120
  			int gz, gi;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
121
122
123
  			T G;
  			T sum = 0;		//running weighted sum across the kernel
  			out[i] = 0;
f12505fb   Laila Saadatifard   upload the ivote ...
124
125
  			T sigma_sq = 2 * sigma * sigma;	
  			T a = 1.0 / (sigma * sqrt(2 * pi));	
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
126
127
128
129
130
  
  			//for each element of the kernel
  			for(int ki = -k_size; ki <= k_size; ki++){
  
  				//calculate the gaussian value
f12505fb   Laila Saadatifard   upload the ivote ...
131
  				G = a * exp(-(ki*ki) / sigma_sq);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
132
  				//calculate the global coordinates for this point in the kernel
f12505fb   Laila Saadatifard   upload the ivote ...
133
134
135
  				gz = (zi + ki) % z;				
  				gi = gz * x * y + yi * x + xi;					
  				sum += G * in[gi];
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
136
137
138
139
140
141
142
  			}
  			
  			//output the result to global memory
  			out[i] = sum;
  		}
  		
  		template<typename T>
f12505fb   Laila Saadatifard   upload the ivote ...
143
  		void gpu_gaussian_blur3(T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
144
145
146
147
148
149
  
  			//get the number of pixels in the image
  			unsigned int pixels = x * y * z;
  			unsigned int bytes = sizeof(T) * pixels;
  
  			int max_threads = stim::maxThreadsPerBlock();
f12505fb   Laila Saadatifard   upload the ivote ...
150
151
  			dim3 threads(sqrt (max_threads),sqrt (max_threads));
  			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
152
153
154
155
156
157
158
159
160
  
  			//allocate temporary space on the GPU
  			T* gpuIb_x;
  			cudaMalloc(&gpuIb_x, bytes);
  
  			//allocate temporary space on the GPU
  			T* gpuIb_y;
  			cudaMalloc(&gpuIb_y, bytes);
  
f12505fb   Laila Saadatifard   upload the ivote ...
161
162
  			// blur the original image along the x direction
  			blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
163
  			
f12505fb   Laila Saadatifard   upload the ivote ...
164
165
  			// blur the x-blurred image along the y direction
  			blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
166
  			
f12505fb   Laila Saadatifard   upload the ivote ...
167
168
  			// blur the xy-blurred image along the z direction
  			blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
169
170
171
172
173
  
  			//cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice);
  
  			cudaFree(gpuIb_x);
  			cudaFree(gpuIb_y);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
174
175
176
177
178
  		}
  
  
  		/// Applies a Gaussian blur to a 2D image stored on the CPU
  		template<typename T>
f12505fb   Laila Saadatifard   upload the ivote ...
179
  		void cpu_gaussian_blur3(T* blur, T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
  
  			
  			//get the number of pixels in the image
  			unsigned int pixels = x * y *z;
  			unsigned int bytes = sizeof(T) * pixels;
  
  			//---------Allocate Image---------
  			//allocate space on the GPU for the image
  			T* gpuI0;
  			cudaMalloc(&gpuI0, bytes);			
  			
  			//copy the image data to the GPU
  			cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
  
  			//run the GPU-based version of the algorithm
  			gpu_gaussian_blur3<T>(gpuI0, sigma, x, y, z);
  
  			//copy the image data from the device
  			cudaMemcpy(blur, gpuI0, bytes, cudaMemcpyDeviceToHost);
  
  			//free allocated memory
  			cudaFree(gpuI0);
  
  			
  		}
  
  
  #endif