vote3.cuh
5.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#ifndef STIM_CUDA_VOTE3_H
#define STIM_CUDA_VOTE3_H
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/cudatools/error.h>
#include "cpyToshare.cuh"
// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
template<typename T>
__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
extern __shared__ float s[];
//calculate x,y,z coordinates for this thread
int xi = blockIdx.x * blockDim.x + threadIdx.x;
//find the grid size along y
int grid_y = y / blockDim.y;
int blockidx_y = blockIdx.y % grid_y;
int yi = blockidx_y * blockDim.y + threadIdx.y;
int zi = blockIdx.y / grid_y;
int i = zi * x * y + yi * x + xi;
if(xi>=x || yi>=y || zi>=z) return;
//find the starting points and the size of the window, which will be copied to the 2D-shared memory
int bxs = blockIdx.x * blockDim.x - rx;
int bys = blockidx_y * blockDim.y - ry;
int xwidth = 2 * rx + blockDim.x;
int ywidth = 2 * ry + blockDim.y;
//calculate the starting point of shared memory for storing the magnitude.
unsigned int b_s = 3 * xwidth * ywidth;
//compute the coordinations of this pixel in the 2D-shared memory.
int sx_rx = threadIdx.x + rx;
int sy_ry = threadIdx.y + ry;
// define a local variable to sum the votes from the voters
float sum = 0;
int rx_sq = rx * rx;
int ry_sq = ry * ry;
int rz_sq = rz * rz;
for (int z_v = -rz; z_v<=rz; z_v++){
int zi_v = zi + z_v;
if ((zi_v) >=0 && (zi_v) <z){
//call the function to copy one slide of the gradient from global to the 2D-shared memory.
__syncthreads();
cpyG2S2D3ch<float>(s, gpu_grad, bxs, bys, zi + z_v, 3*xwidth, ywidth, threadIdx, blockDim, x, y);
__syncthreads();
mag_share2D<float>(s, b_s, xwidth, ywidth, threadIdx, blockDim);
__syncthreads();
float z_sq = z_v * z_v;
float d_z_sq = z_sq/rz_sq;
for(int y_v = -ry; y_v <= ry; y_v++){
int yi_v = (yi + y_v) ;
//compute the position of the current voter in the shared memory along the y axis.
unsigned int sIdx_y1d = (sy_ry + y_v)* xwidth;
float y_sq = y_v * y_v;
float yz_sq = z_sq + y_sq;
float d_yz_sq = y_sq/ry_sq + d_z_sq;
for(int x_v = -rx; x_v <= rx; x_v++){
//check if the current voter is inside of the data-set
int xi_v = (xi + x_v) ;
if (yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){
//compute the position of the current voter in the 2D-shared memory along the x axis.
unsigned int sIdx_x = (sx_rx + x_v);
//find the 1D index of this voter in the 2D-shared memory.
unsigned int s_Idx = (sIdx_y1d + sIdx_x);
unsigned int s_Idx3 = s_Idx * 3;
//save the gradient values for the current voter to the local variables and compute the gradient magnitude.
float g_v_x = s[s_Idx3];
float g_v_y = s[s_Idx3 + 1];
float g_v_z = s[s_Idx3 + 2];
float mag_v = s[b_s + s_Idx]; //sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
//calculate the distance between the pixel and the current voter.
float x_sq = x_v * x_v;
float d_pv = sqrt(x_sq + yz_sq);
// calculate the angle between the pixel and the current voter.
float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v);
// check if the current voter is located in the voting area of this pixel.
if ((((x_sq)/rx_sq + d_yz_sq)<= 1) && (cos_diff >= cos_phi)){
sum += mag_v;
}
}
}
}
}
}
gpu_vote[i] = sum;
}
template<typename T>
void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
unsigned int max_threads = stim::maxThreadsPerBlock();
dim3 threads(sqrt (max_threads),sqrt (max_threads));
dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T);
//call the kernel to do the voting
vote3 <T> <<< blocks, threads, shared_bytes >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
}
template<typename T>
void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
//calculate the number of bytes in the array
unsigned int bytes = x * y * z * sizeof(T);
//allocate space on the GPU for the Vote Image
T* gpu_vote;
cudaMalloc(&gpu_vote, bytes);
//allocate space on the GPU for the input Gradient image
T* gpu_grad;
cudaMalloc(&gpu_grad, bytes*3);
//copy the Gradient data to the GPU
cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
//call the GPU version of the vote calculation function
gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
//copy the Vote Data back to the CPU
cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
//free allocated memory
cudaFree(gpu_vote);
cudaFree(gpu_grad);
}
#endif