vote3_atomic.cuh
3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#ifndef STIM_CUDA_VOTE3_ATOMIC_H
#define STIM_CUDA_VOTE3_ATOMIC_H
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/cudatools/error.h>
#include "cpyToshare.cuh"
// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
template<typename T>
__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread
int grid_y = y / blockDim.y; //find the grid size along y
int blockidx_y = blockIdx.y % grid_y;
int yi = blockidx_y * blockDim.y + threadIdx.y;
int zi = blockIdx.y / grid_y;
if(xi>=x || yi>=y || zi>=z) return;
int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter
float gx_v = gpu_grad[3*i]; // find the gradient information in cartesian coordinate for the voter - global memory fetch
float gy_v = gpu_grad[3*i+1];
float gz_v = gpu_grad[3*i+2];
float mag_v = sqrt(gx_v*gx_v + gy_v*gy_v + gz_v*gz_v); // compute the gradient magnitude for the voter
float gx_v_n = gx_v/mag_v; // normalize the gradient vector for the voter
float gy_v_n = gy_v/mag_v;
float gz_v_n = gz_v/mag_v;
float rx_sq = rx * rx; // compute the square for rmax
float ry_sq = ry * ry;
float rz_sq = rz * rz;
float x_sq, y_sq, z_sq, d_c, cos_diff;
int xi_c, yi_c, zi_c, idx_c;
for (int z_c=-rz; z_c<=rz; z_c++){
zi_c = zi + z_c; // calculate the z position for the current counter
if (zi_c <z && zi_c>=0){ // make sure the current counter is inside the image
z_sq = z_c * z_c;
for (int y_c=-ry; y_c<=ry; y_c++){
yi_c = yi + y_c;
if (yi_c < y && yi_c>=0){
y_sq = y_c * y_c;
for (int x_c=-rx; x_c<=rx; x_c++){
xi_c = xi + x_c;
if (xi_c < x && xi_c>=0){
x_sq = x_c * x_c;
d_c = sqrt(x_sq + y_sq + z_sq); //calculate the distance between the voter and the current counter
cos_diff = (gx_v_n * x_c + gy_v_n * y_c + gz_v_n * z_c)/(d_c); // calculate the cosine of angle between the voter and the current counter
if ( ( (x_sq/rx_sq + y_sq/ry_sq + z_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){
idx_c = (zi_c * y + yi_c) * x + xi_c; //calculate the 1D index for the current counter
atomicAdd (&gpu_vote[idx_c] , mag_v);
}
}
}
}
}
}
}
}
template<typename T>
void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
unsigned int max_threads = stim::maxThreadsPerBlock();
dim3 threads(sqrt (max_threads),sqrt (max_threads));
dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
//unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T);
//call the kernel to do the voting
vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
}
template<typename T>
void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
//calculate the number of bytes in the array
unsigned int bytes = x * y * z * sizeof(T);
//allocate space on the GPU for the Vote Image
T* gpu_vote;
cudaMalloc(&gpu_vote, bytes);
//allocate space on the GPU for the input Gradient image
T* gpu_grad;
cudaMalloc(&gpu_grad, bytes*3);
//copy the Gradient data to the GPU
cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
//call the GPU version of the vote calculation function
gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
//copy the Vote Data back to the CPU
cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
//free allocated memory
cudaFree(gpu_vote);
cudaFree(gpu_grad);
}
#endif