5c079506
Laila Saadatifard
upload the ivote ...
|
1
2
3
4
5
6
|
#ifndef STIM_CUDA_VOTE3_H
#define STIM_CUDA_VOTE3_H
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
|
5c079506
Laila Saadatifard
upload the ivote ...
|
7
|
#include <stim/cuda/cudatools/error.h>
|
89604e92
Laila Saadatifard
ivote3 run on the...
|
8
|
#include "cpyToshare.cuh"
|
5c079506
Laila Saadatifard
upload the ivote ...
|
9
10
11
|
// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
template<typename T>
|
02fb26b3
Laila Saadatifard
change the vote a...
|
12
|
__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
|
5c079506
Laila Saadatifard
upload the ivote ...
|
13
|
|
89604e92
Laila Saadatifard
ivote3 run on the...
|
14
|
extern __shared__ float s[];
|
5c079506
Laila Saadatifard
upload the ivote ...
|
15
16
17
18
19
20
21
22
23
24
25
|
//calculate x,y,z coordinates for this thread
int xi = blockIdx.x * blockDim.x + threadIdx.x;
//find the grid size along y
int grid_y = y / blockDim.y;
int blockidx_y = blockIdx.y % grid_y;
int yi = blockidx_y * blockDim.y + threadIdx.y;
int zi = blockIdx.y / grid_y;
int i = zi * x * y + yi * x + xi;
if(xi>=x || yi>=y || zi>=z) return;
|
89604e92
Laila Saadatifard
ivote3 run on the...
|
26
27
28
29
30
31
32
33
34
35
36
|
//find the starting points and the size of the window, which will be copied to the 2D-shared memory
int bxs = blockIdx.x * blockDim.x - rx;
int bys = blockidx_y * blockDim.y - ry;
int xwidth = 2 * rx + blockDim.x;
int ywidth = 2 * ry + blockDim.y;
//calculate the starting point of shared memory for storing the magnitude.
unsigned int b_s = 3 * xwidth * ywidth;
//compute the coordinations of this pixel in the 2D-shared memory.
int sx_rx = threadIdx.x + rx;
int sy_ry = threadIdx.y + ry;
|
5c079506
Laila Saadatifard
upload the ivote ...
|
37
38
39
40
41
42
43
44
|
// define a local variable to sum the votes from the voters
float sum = 0;
int rx_sq = rx * rx;
int ry_sq = ry * ry;
int rz_sq = rz * rz;
for (int z_v = -rz; z_v<=rz; z_v++){
|
89604e92
Laila Saadatifard
ivote3 run on the...
|
45
46
47
48
49
50
51
52
53
54
55
56
|
int zi_v = zi + z_v;
if ((zi_v) >=0 && (zi_v) <z){
//call the function to copy one slide of the gradient from global to the 2D-shared memory.
__syncthreads();
cpyG2S2D3ch<float>(s, gpu_grad, bxs, bys, zi + z_v, 3*xwidth, ywidth, threadIdx, blockDim, x, y);
__syncthreads();
mag_share2D<float>(s, b_s, xwidth, ywidth, threadIdx, blockDim);
__syncthreads();
float z_sq = z_v * z_v;
float d_z_sq = z_sq/rz_sq;
for(int y_v = -ry; y_v <= ry; y_v++){
|
5c079506
Laila Saadatifard
upload the ivote ...
|
57
|
int yi_v = (yi + y_v) ;
|
89604e92
Laila Saadatifard
ivote3 run on the...
|
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
//compute the position of the current voter in the shared memory along the y axis.
unsigned int sIdx_y1d = (sy_ry + y_v)* xwidth;
float y_sq = y_v * y_v;
float yz_sq = z_sq + y_sq;
float d_yz_sq = y_sq/ry_sq + d_z_sq;
for(int x_v = -rx; x_v <= rx; x_v++){
//check if the current voter is inside of the data-set
int xi_v = (xi + x_v) ;
if (yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){
//compute the position of the current voter in the 2D-shared memory along the x axis.
unsigned int sIdx_x = (sx_rx + x_v);
//find the 1D index of this voter in the 2D-shared memory.
unsigned int s_Idx = (sIdx_y1d + sIdx_x);
unsigned int s_Idx3 = s_Idx * 3;
//save the gradient values for the current voter to the local variables and compute the gradient magnitude.
float g_v_x = s[s_Idx3];
float g_v_y = s[s_Idx3 + 1];
float g_v_z = s[s_Idx3 + 2];
float mag_v = s[b_s + s_Idx]; //sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
//calculate the distance between the pixel and the current voter.
float x_sq = x_v * x_v;
float d_pv = sqrt(x_sq + yz_sq);
|
5c079506
Laila Saadatifard
upload the ivote ...
|
85
|
|
89604e92
Laila Saadatifard
ivote3 run on the...
|
86
87
|
// calculate the angle between the pixel and the current voter.
float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v);
|
5c079506
Laila Saadatifard
upload the ivote ...
|
88
|
|
89604e92
Laila Saadatifard
ivote3 run on the...
|
89
90
|
// check if the current voter is located in the voting area of this pixel.
if ((((x_sq)/rx_sq + d_yz_sq)<= 1) && (cos_diff >= cos_phi)){
|
5c079506
Laila Saadatifard
upload the ivote ...
|
91
|
|
89604e92
Laila Saadatifard
ivote3 run on the...
|
92
93
|
sum += mag_v;
}
|
5c079506
Laila Saadatifard
upload the ivote ...
|
94
|
}
|
89604e92
Laila Saadatifard
ivote3 run on the...
|
95
96
|
}
}
|
5c079506
Laila Saadatifard
upload the ivote ...
|
97
98
99
100
101
102
103
|
}
}
gpu_vote[i] = sum;
}
template<typename T>
|
1f55a874
Laila Saadatifard
upload the fixed ...
|
104
|
void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
|
5c079506
Laila Saadatifard
upload the ivote ...
|
105
|
|
02fb26b3
Laila Saadatifard
change the vote a...
|
106
|
|
5c079506
Laila Saadatifard
upload the ivote ...
|
107
108
109
|
unsigned int max_threads = stim::maxThreadsPerBlock();
dim3 threads(sqrt (max_threads),sqrt (max_threads));
dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
|
89604e92
Laila Saadatifard
ivote3 run on the...
|
110
|
unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T);
|
5c079506
Laila Saadatifard
upload the ivote ...
|
111
|
//call the kernel to do the voting
|
89604e92
Laila Saadatifard
ivote3 run on the...
|
112
|
vote3 <T> <<< blocks, threads, shared_bytes >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
|
5c079506
Laila Saadatifard
upload the ivote ...
|
113
114
115
116
117
|
}
template<typename T>
|
02fb26b3
Laila Saadatifard
change the vote a...
|
118
|
void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
|
5c079506
Laila Saadatifard
upload the ivote ...
|
119
120
121
122
123
124
125
126
127
128
129
130
131
|
//calculate the number of bytes in the array
unsigned int bytes = x * y * z * sizeof(T);
//allocate space on the GPU for the Vote Image
T* gpu_vote;
cudaMalloc(&gpu_vote, bytes);
//allocate space on the GPU for the input Gradient image
T* gpu_grad;
cudaMalloc(&gpu_grad, bytes*3);
|
5c079506
Laila Saadatifard
upload the ivote ...
|
132
|
|
5c079506
Laila Saadatifard
upload the ivote ...
|
133
134
135
|
//copy the Gradient data to the GPU
cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
|
02fb26b3
Laila Saadatifard
change the vote a...
|
136
|
|
5c079506
Laila Saadatifard
upload the ivote ...
|
137
|
//call the GPU version of the vote calculation function
|
02fb26b3
Laila Saadatifard
change the vote a...
|
138
|
gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
|
5c079506
Laila Saadatifard
upload the ivote ...
|
139
140
141
142
143
144
145
|
//copy the Vote Data back to the CPU
cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
//free allocated memory
cudaFree(gpu_vote);
cudaFree(gpu_grad);
|
02fb26b3
Laila Saadatifard
change the vote a...
|
146
|
|
5c079506
Laila Saadatifard
upload the ivote ...
|
147
148
149
150
|
}
#endif
|