ivote2.cuh
6.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#ifndef STIM_IVOTE2_CUH
#define STIM_IVOTE2_CUH
#include <iostream>
#include <fstream>
#include <stim/cuda/cudatools/error.h>
#include <stim/cuda/templates/gradient.cuh>
#include <stim/cuda/arraymath.cuh>
#include <stim/iVote/ivote2/iter_vote2.cuh>
#include <stim/iVote/ivote2/local_max.cuh>
#include <stim/math/constants.h>
#include <stim/math/vector.h>
#include <stim/visualization/colormap.h>
namespace stim {
// this function precomputes the atan2 values
template<typename T>
void atan_2(T* cpuTable, unsigned int rmax) {
int xsize = 2 * rmax + 1; //initialize the width and height of the window which atan2 are computed in.
int ysize = 2 * rmax + 1;
int yi = rmax; // assign the center coordinates of the atan2 window to yi and xi
int xi = rmax;
for (int xt = 0; xt < xsize; xt++) { //for each element in the atan2 table
for (int yt = 0; yt < ysize; yt++) {
int id = yt * xsize + xt; //convert the current 2D coordinates to 1D
int xd = xi - xt; // calculate the distance between the pixel and the center of the atan2 window
int yd = yi - yt;
T atan_2d = atan2((T)yd, (T)xd); // calculate the angle between the pixel and the center of the atan2 window and store the result.
cpuTable[id] = atan_2d;
}
}
}
//this kernel invert the 2D image
template<typename T>
__global__ void cuda_invert(T* gpuI, size_t x, size_t y) {
// calculate the 2D coordinates for this current thread.
size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
if (xi >= x || yi >= y) return;
size_t i = yi * x + xi; // convert 2D coordinates to 1D
gpuI[i] = 255 - gpuI[i]; //invert the pixel intensity
}
//this function calculate the threshold using OTSU method
template<typename T>
T th_otsu(T* pts, size_t pixels, unsigned int th_num = 20) {
T Imax = pts[0]; //initialize the maximum value to the first one
T Imin = pts[0]; //initialize the maximum value to the first on
for (size_t n = 0; n < pixels; n++) { //for every value
if (pts[n] > Imax) { //if the value is higher than the current max
Imax = pts[n];
}
}
for (size_t n = 0; n< pixels; n++) { //for every value
if (pts[n] < Imin) { //if the value is higher than the current max
Imin = pts[n];
}
}
T th_step = ((Imax - Imin) / th_num);
std::vector<T> var_b;
for (unsigned int t0 = 0; t0 < th_num; t0++) {
T th = t0 * th_step + Imin;
unsigned int n_b(0), n_o(0); //these variables save the number of elements that are below and over the threshold
T m_b(0), m_o(0); //these variables save the mean value for each cluster
for (unsigned int idx = 0; idx < pixels; idx++) {
if (pts[idx] <= th) {
m_b += pts[idx];
n_b += 1;
}
else {
m_o += pts[idx];
n_o += 1;
}
}
m_b = m_b / n_b; //calculate the mean value for the below threshold cluster
m_o = m_o / n_o; //calculate the mean value for the over threshold cluster
var_b.push_back(n_b * n_o * pow((m_b - m_o), 2));
}
std::vector<float>::iterator max_var = std::max_element(var_b.begin(), var_b.end()); //finding maximum elements in the vector
size_t th_idx = std::distance(var_b.begin(), max_var);
T threshold = Imin + (T)(th_idx * th_step);
return threshold;
}
//this function performs the 2D iterative voting algorithm on the image stored in the gpu
template<typename T>
void gpu_ivote2(T* gpuI, unsigned int rmax, size_t x, size_t y, bool invert = false, T t = 0, int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8, bool debug = false) {
size_t pixels = x * y; //compute the size of input image
//
if (invert) { //if inversion is required call the kernel to invert the image
unsigned int max_threads = stim::maxThreadsPerBlock();
dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads));
dim3 blocks((unsigned int)x / threads.x + 1, (unsigned int)y / threads.y + 1);
cuda_invert << <blocks, threads >> > (gpuI, x, y);
}
//
size_t table_bytes = (size_t)(pow(2 * rmax + 1, 2) * sizeof(T)); // create the atan2 table
T* cpuTable = (T*)malloc(table_bytes); //assign memory on the cpu for atan2 table
atan_2<T>(cpuTable, rmax); //call the function to precompute the atan2 table
T* gpuTable; HANDLE_ERROR(cudaMalloc(&gpuTable, table_bytes));
HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, table_bytes, cudaMemcpyHostToDevice)); //copy atan2 table to the gpu
size_t bytes = pixels* sizeof(T); //calculate the bytes of the input
float dphi = phi / iter; //change in phi for each iteration
float* gpuGrad; HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes * 2)); //allocate space to store the 2D gradient
float* gpuVote; HANDLE_ERROR(cudaMalloc(&gpuVote, bytes)); //allocate space to store the vote image
stim::cuda::gpu_gradient_2d<float>(gpuGrad, gpuI, x, y); //calculate the 2D gradient
stim::cuda::gpu_cart2polar<float>(gpuGrad, x, y); //convert cartesian coordinate of gradient to the polar
for (int i = 0; i < iter; i++) { //for each iteration
cudaMemset(gpuVote, 0, bytes); //reset the vote image to 0
stim::cuda::gpu_vote<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y, debug); //perform voting
stim::cuda::gpu_update_dir<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y, debug); //update the voter directions
phi = phi - dphi; //decrement phi
}
stim::cuda::gpu_local_max<float>(gpuI, gpuVote, conn, x, y); //calculate the local maxima
if (t > 0) {
T* pts = (T*)malloc(bytes); //allocate memory on the cpu to store the output of iterative voting
HANDLE_ERROR(cudaMemcpy(pts, gpuI, bytes, cudaMemcpyDeviceToHost)); //copy the output from gpu to the cpu memory
T threshold;
threshold = t;
size_t ind;
for (size_t ix = 0; ix < x; ix++) {
for (size_t iy = 0; iy < y; iy++) {
ind = iy * x + ix;
if (pts[ind] > threshold) {
pts[ind] = 1;
}
else pts[ind] = 0;
}
}
HANDLE_ERROR(cudaMemcpy(gpuI, pts, bytes, cudaMemcpyHostToDevice)); //copy the points to the gpu
}
}
template<typename T>
void cpu_ivote2(T* cpuI, unsigned int rmax, size_t x, size_t y, float &gpu_time, bool invert = false, T t = 0, int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8, bool debug = false) {
size_t bytes = x*y * sizeof(T);
T* gpuI; //allocate space on the gpu to save the input image
gpuTimer_start();
HANDLE_ERROR(cudaMalloc(&gpuI, bytes));
HANDLE_ERROR(cudaMemcpy(gpuI, cpuI, bytes, cudaMemcpyHostToDevice)); //copy the image to the gpu
stim::gpu_ivote2<T>(gpuI, rmax, x, y, invert, t, iter, phi, conn, debug); //call the gpu version of the ivote
HANDLE_ERROR(cudaMemcpy(cpuI, gpuI, bytes, cudaMemcpyDeviceToHost)); //copy the output to the cpu
gpu_time = gpuTimer_end();
}
}
#endif