vote3.cuh
4.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#ifndef STIM_CUDA_VOTE3_H
#define STIM_CUDA_VOTE3_H
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#include <stim/cuda/cudatools/error.h>
// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
template<typename T>
__global__ void vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
//calculate x,y,z coordinates for this thread
int xi = blockIdx.x * blockDim.x + threadIdx.x;
//find the grid size along y
int grid_y = y / blockDim.y;
int blockidx_y = blockIdx.y % grid_y;
int yi = blockidx_y * blockDim.y + threadIdx.y;
int zi = blockIdx.y / grid_y;
int i = zi * x * y + yi * x + xi;
if(xi>=x || yi>=y || zi>=z) return;
// define a local variable to sum the votes from the voters
float sum = 0;
int rx_sq = rx * rx;
int ry_sq = ry * ry;
int rz_sq = rz * rz;
for (int z_v = -rz; z_v<=rz; z_v++){
for(int y_v = -ry; y_v <= ry; y_v++){
for(int x_v = -rx; x_v <= rx; x_v++){
//calculate the x, y ,z indices for the current voter
int xi_v = (xi + x_v) ;
int yi_v = (yi + y_v) ;
int zi_v = (zi + z_v) ;
if (zi_v >=0 && zi_v < z && yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){
//calculate the 1D index for the current voter
unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v);
//find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter
float mag_v = gpu_mag[id_v];
float g_v_x = gpu_grad[id_v * 3 + 0];
float g_v_y = gpu_grad[id_v * 3 + 1];
float g_v_z = gpu_grad[id_v * 3 + 2];
float g_v_m = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
//calculate the distance between the pixel and the current voter.
float x_sq = x_v * x_v;
float y_sq = y_v * y_v;
float z_sq = z_v * z_v;
float d_pv = sqrt(x_sq + y_sq + z_sq);
// calculate the angle between the pixel and the current voter.
float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * g_v_m);
// check if the current voter is located in the voting area of this pixel.
if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){
sum += mag_v;
}
}
}
}
}
gpu_vote[i] = sum;
}
template<typename T>
void gpu_vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
int rx = r[0];
int ry = r[1];
int rz = r[2];
unsigned int max_threads = stim::maxThreadsPerBlock();
dim3 threads(sqrt (max_threads),sqrt (max_threads));
dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
//call the kernel to do the voting
vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, gpu_mag, cos_phi, rx, ry, rz, x , y, z);
}
template<typename T>
void cpu_vote3(T* cpu_vote, T* cpu_grad, T* cpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
//calculate the number of bytes in the array
unsigned int bytes = x * y * z * sizeof(T);
//allocate space on the GPU for the Vote Image
T* gpu_vote;
cudaMalloc(&gpu_vote, bytes);
//allocate space on the GPU for the input Gradient image
T* gpu_grad;
cudaMalloc(&gpu_grad, bytes*3);
//allocate space on the GPU for the Gradient magnitude
T* gpu_mag;
cudaMalloc(&gpu_mag, bytes);
//copy the Gradient data to the GPU
cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
//copy the gradient magnitude to the GPU
cudaMemcpy(gpu_mag, cpu_mag, bytes, cudaMemcpyHostToDevice);
//call the GPU version of the vote calculation function
gpu_vote3<T>(gpu_vote, gpu_grad, gpu_mag, cos_phi, r, x , y, z);
//copy the Vote Data back to the CPU
cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
//free allocated memory
cudaFree(gpu_vote);
cudaFree(gpu_grad);
cudaFree(gpu_mag);
}
#endif