vote_atomic_bb.cuh
4.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
#define STIM_CUDA_VOTE_ATOMIC_BB_H
# include <iostream>
# include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#include <stim/visualization/aabb2.h>
#include <stim/visualization/colormap.h>
#include <math.h>
namespace stim{
namespace cuda{
// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
template<typename T>
__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
extern __shared__ T S[];
T* shared_atan = S;
size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
// calculate the 2D coordinates for this current thread.
int xi = blockIdx.x * blockDim.x + threadIdx.x;
int yi = blockIdx.y * blockDim.y + threadIdx.y;
if(xi >= x || yi >= y) return;
// convert 2D coordinates to 1D
int i = yi * x + xi;
// calculate the voting direction based on the grtadient direction
float theta = gpuGrad[2*i];
//calculate the amount of vote for the voter
float mag = gpuGrad[2*i + 1];
stim::aabb2<int> bb(xi, yi); //initialize a bounding box at the current point
bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta)));
bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box
bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box
// compute the size of window which will be checked for finding the proper voters for this pixel
int x_table = 2*rmax +1;
int rmax_sq = rmax * rmax;
int lut_i;
T dx_sq, dy_sq;
bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image
bb.trim_high(x-1, y-1);
int by, bx;
int dx, dy;
unsigned int ind_g; //initialize the maximum vote value to zero
T alpha;
for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box
dy = by - yi; //calculate the y coordinate of the current point relative to yi
dy_sq = dy * dy;
for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
dx = bx - xi;
dx_sq = dx * dx;
lut_i = (rmax - dy) * x_table + rmax - dx;
alpha = shared_atan[lut_i];
if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
ind_g = (by)*x + (bx);
atomicAdd(&gpuVote[ind_g], mag);
}
}
}
}
template<typename T>
void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
unsigned int max_threads = stim::maxThreadsPerBlock();
dim3 threads( sqrt(max_threads), sqrt(max_threads) );
dim3 blocks(x/threads.x + 1, y/threads.y + 1);
size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
size_t shared_mem_req = table_bytes;// + template_bytes;
std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
size_t shared_mem = stim::sharedMemPerBlock();
if(shared_mem_req > shared_mem){
std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
exit(1);
}
//call the kernel to do the voting
cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
}
template<typename T>
void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
//calculate the number of bytes in the array
unsigned int bytes = x * y * sizeof(T);
//calculate the number of bytes in the atan2 table
unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
//allocate space on the GPU for the Vote Image
T* gpuVote;
cudaMalloc(&gpuVote, bytes);
//allocate space on the GPU for the input Gradient image
T* gpuGrad;
HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
//copy the Gradient Magnitude data to the GPU
HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
//allocate space on the GPU for the atan2 table
T* gpuTable;
HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
//copy the atan2 values to the GPU
HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
//call the GPU version of the vote calculation function
gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
//copy the Vote Data back to the CPU
cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
//free allocated memory
cudaFree(gpuTable);
cudaFree(gpuVote);
cudaFree(gpuGrad);
}
}
}
#endif