vote_atomic_bb.cuh
5.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
#define STIM_CUDA_VOTE_ATOMIC_BB_H
# include <iostream>
# include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#include <stim/visualization/aabb2.h>
#include <stim/visualization/colormap.h>
#include <math.h>
namespace stim{
namespace cuda{
// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
template<typename T>
__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, size_t x, size_t y, bool gradmag = true){
extern __shared__ T S[];
T* shared_atan = S;
size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
// calculate the 2D coordinates for this current thread.
size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
if(xi >= x || yi >= y) return;
// convert 2D coordinates to 1D
size_t i = yi * x + xi;
// calculate the voting direction based on the grtadient direction
float theta = gpuGrad[2*i];
//calculate the amount of vote for the voter
float mag = gpuGrad[2*i + 1];
stim::aabb2<int> bb(xi, yi); //initialize a bounding box at the current point
bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta)));
bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box
bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box
// compute the size of window which will be checked for finding the proper voters for this pixel
int x_table = 2*rmax +1;
int rmax_sq = rmax * rmax;
int lut_i;
T dx_sq, dy_sq;
bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image
bb.trim_high(x-1, y-1);
size_t by, bx;
int dx, dy;
unsigned int ind_g; //initialize the maximum vote value to zero
T alpha;
for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box
dy = by - yi; //calculate the y coordinate of the current point relative to yi
dy_sq = dy * dy;
for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
dx = bx - xi;
dx_sq = dx * dx;
lut_i = (rmax - dy) * x_table + rmax - dx;
alpha = shared_atan[lut_i];
if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
ind_g = (by)*x + (bx);
if(gradmag) atomicAdd(&gpuVote[ind_g], mag); //add the gradient magnitude (if the gradmag flag is enabled)
else atomicAdd(&gpuVote[ind_g], 1.0f); //otherwise just add 1
}
}
}
}
/// Iterative voting for an image
/// @param gpuVote is the resulting vote image
/// @param gpuGrad is the gradient of the input image
/// @param gpuTable is the pre-computed atan2() table
/// @param phi is the angle of the vote region
/// @param rmax is the estimated radius of the blob (defines the "width" of the vote region)
/// @param x and y are the spatial dimensions of the gradient image
/// @param gradmag defines whether or not the gradient magnitude is taken into account during the vote
template<typename T>
void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y, bool gradmag = true){
unsigned int max_threads = stim::maxThreadsPerBlock();
dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
size_t shared_mem_req = table_bytes;// + template_bytes;
if (DEBUG) std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
size_t shared_mem = stim::sharedMemPerBlock();
if(shared_mem_req > shared_mem){
std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
exit(1);
}
//call the kernel to do the voting
cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y, gradmag);
}
template<typename T>
void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
//calculate the number of bytes in the array
unsigned int bytes = x * y * sizeof(T);
//calculate the number of bytes in the atan2 table
unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
//allocate space on the GPU for the Vote Image
T* gpuVote;
cudaMalloc(&gpuVote, bytes);
//allocate space on the GPU for the input Gradient image
T* gpuGrad;
HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
//copy the Gradient Magnitude data to the GPU
HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
//allocate space on the GPU for the atan2 table
T* gpuTable;
HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
//copy the atan2 values to the GPU
HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
//call the GPU version of the vote calculation function
gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
//copy the Vote Data back to the CPU
cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
//free allocated memory
cudaFree(gpuTable);
cudaFree(gpuVote);
cudaFree(gpuGrad);
}
}
}
#endif