vote_atomic_shared.cuh
5.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#ifndef STIM_CUDA_VOTE_ATOMIC_SHARED_H
#define STIM_CUDA_VOTE_ATOMIC_SHARED_H
# include <iostream>
# include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#include "cpyToshare.cuh"
//#include "writebackshared.cuh"
namespace stim{
namespace cuda{
// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
template<typename T>
__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
//generate a pointer to the shared memory
extern __shared__ float s_vote[];
// calculate the 2D coordinates for this current thread.
int bxi = blockIdx.x * blockDim.x;
int byi = blockIdx.y * blockDim.y;
int xi = bxi + threadIdx.x;
int yi = byi + threadIdx.y;
// convert 2D coordinates to 1D
int i = yi * x + xi;
// calculate the voting direction based on the gradient direction
float theta = gpuGrad[2*i];
//calculate the amount of vote for the voter
float mag = gpuGrad[2*i + 1];
//find the starting points and size of window, wich will be copied to the shared memory
int bxs = bxi - rmax;
int bys = byi - rmax;
int xwidth = 2*rmax + blockDim.x;
int ywidth = 2*rmax + blockDim.y;
//compute the coordinations of this pixel in the 2D-shared memory.
int sx_rx = threadIdx.x + rmax;
int sy_ry = threadIdx.y + rmax;
// compute the size of window which will be checked for finding the counters for this voter
int x_table = 2*rmax +1;
int rmax_sq = rmax * rmax;
//calculate some parameters for indexing shared memory
//calculate the total number of threads available
unsigned int tThreads = blockDim.x * blockDim.y;
//calculate the current 1D thread ID
unsigned int ti = threadIdx.y * (blockDim.x) + threadIdx.x;
//calculate the number of iteration required
unsigned int In = xwidth*ywidth/tThreads + 1;
if(xi < x && yi < y){
__syncthreads();
//initialize the shared memory to zero
for (unsigned int i = 0; i < In; i++){
unsigned int sIdx0 = i * tThreads + ti;
if (sIdx0< xwidth*ywidth) {
s_vote[sIdx0] = 0;
}
}
__syncthreads();
//for every line (along y)
for(int yr = -rmax; yr <= rmax; yr++){
//compute the position of the current voter in the shared memory along the y axis.
unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
for(int xr = -rmax; xr <= rmax; xr++){
//find the location of the current pixel in the atan2 table
unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
// calculate the angle between the voter and the current pixel in x and y directions
float atan_angle = gpuTable[ind_t];
// check if the current pixel is located in the voting area of this voter.
if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
//compute the position of the current voter in the 2D-shared memory along the x axis.
unsigned int sIdx_x = (sx_rx + xr);
//find the 1D index of this voter in the 2D-shared memory.
unsigned int s_Idx = (sIdx_y1d + sIdx_x);
atomicAdd(&s_vote[s_Idx], mag);
}
}
}
//write shared memory back to global memory
__syncthreads();
for (unsigned int i = 0; i < In; i++){
unsigned int sIdx = i * tThreads + ti;
if (sIdx>= xwidth*ywidth) return;
unsigned int sy = sIdx/xwidth;
unsigned int sx = sIdx - (sy * xwidth);
unsigned int gx = bxs + sx;
unsigned int gy = bys + sy;
if (gx<x&& gy<y){
unsigned int gIdx = gy * x + gx;
//write shared to global memory
atomicAdd(&gpuVote[gIdx], s_vote[sIdx]);
}
}
}
}
template<typename T>
void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
unsigned int max_threads = stim::maxThreadsPerBlock();
dim3 threads(sqrt(max_threads), sqrt(max_threads));
dim3 blocks(x/threads.x + 1 , y/threads.y+1);
// specify share memory
unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*sizeof(T);
//call the kernel to do the voting
cuda_vote <<< blocks, threads, share_bytes>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
}
template<typename T>
void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
//calculate the number of bytes in the array
unsigned int bytes = x * y * sizeof(T);
//calculate the number of bytes in the atan2 table
unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
//allocate space on the GPU for the Vote Image
T* gpuVote;
cudaMalloc(&gpuVote, bytes);
//allocate space on the GPU for the input Gradient image
T* gpuGrad;
HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
//copy the Gradient Magnitude data to the GPU
HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
//allocate space on the GPU for the atan2 table
T* gpuTable;
HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
//copy the atan2 values to the GPU
HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
//call the GPU version of the vote calculation function
gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
//copy the Vote Data back to the CPU
cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
//free allocated memory
cudaFree(gpuTable);
cudaFree(gpuVote);
cudaFree(gpuGrad);
}
}
}
#endif