Blame view

cpp/vote3.cuh 4.13 KB
5c079506   Laila Saadatifard   upload the ivote ...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
  #ifndef STIM_CUDA_VOTE3_H
  #define STIM_CUDA_VOTE3_H
  
  #include <iostream>
  #include <cuda.h>
  #include <stim/cuda/cudatools.h>
  #include <stim/cuda/sharedmem.cuh>
  #include <stim/cuda/cudatools/error.h>
  
  
  		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  		template<typename T>
  		__global__ void vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
  
  			//calculate x,y,z coordinates for this thread
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;
  			//find the grid size along y
  			int grid_y = y / blockDim.y;
  			int blockidx_y = blockIdx.y % grid_y;
  			int yi = blockidx_y * blockDim.y + threadIdx.y;
  			int zi = blockIdx.y / grid_y;
  			int i = zi * x * y + yi * x + xi;			
  			
  			if(xi>=x || yi>=y || zi>=z) return;
  			
  			// define a local variable to sum the votes from the voters
  			float sum = 0;
  			
  			int rx_sq = rx * rx;
  			int ry_sq = ry * ry;
  			int rz_sq = rz * rz;
  
  			for (int z_v = -rz; z_v<=rz; z_v++){
  							
  				for(int y_v = -ry; y_v <= ry; y_v++){
  
  					for(int x_v = -rx; x_v <= rx; x_v++){
  
  						//calculate the x, y ,z indices for the current voter
  						int xi_v = (xi + x_v) ;
  						int yi_v = (yi + y_v) ;
  						int zi_v = (zi + z_v) ; 
  						if (zi_v >=0 && zi_v < z && yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){
  														
  							//calculate the 1D index for the current voter
  							unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v);
  
  							//find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter
  							float mag_v = gpu_mag[id_v];
  							float g_v_x = gpu_grad[id_v * 3 + 0];
  							float g_v_y = gpu_grad[id_v * 3 + 1];
  							float g_v_z = gpu_grad[id_v * 3 + 2];
  							float g_v_m = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
  
  							//calculate the distance between the pixel and the current voter.
  							float x_sq = x_v * x_v;
  							float y_sq = y_v * y_v;
  							float z_sq = z_v * z_v;
  							float d_pv = sqrt(x_sq + y_sq + z_sq);
  
  							// calculate the angle between the pixel and the current voter.
  							float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * g_v_m);
  						
  							// check if the current voter is located in the voting area of this pixel.
  							if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){
  												
  								sum += mag_v;	
  							}
  						}
  					}	
  				}
  			}
  						
  			gpu_vote[i] = sum;			
  		}
  
  		template<typename T>
  		void gpu_vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
  
  			int rx = r[0];
  			int ry = r[1];
  			int rz = r[2];
  			unsigned int max_threads = stim::maxThreadsPerBlock();
  			dim3 threads(sqrt (max_threads),sqrt (max_threads));
  			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  						
  			//call the kernel to do the voting
  			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, gpu_mag, cos_phi, rx, ry, rz, x , y, z);
  
  		}
  
  
  		template<typename T>
  		void cpu_vote3(T* cpu_vote, T* cpu_grad, T* cpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
  
  			//calculate the number of bytes in the array
  			unsigned int bytes = x * y * z * sizeof(T);
  
  			
  			//allocate space on the GPU for the Vote Image
  			T* gpu_vote;
  			cudaMalloc(&gpu_vote, bytes);		
  
  			//allocate space on the GPU for the input Gradient image
  			T* gpu_grad;
  			cudaMalloc(&gpu_grad, bytes*3);
  			
  			//allocate space on the GPU for the Gradient magnitude
  			T* gpu_mag;
  			cudaMalloc(&gpu_mag, bytes);
  			
  
  			//copy the Gradient data to the GPU
  			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
  			
  			//copy the gradient magnitude to the GPU
  			cudaMemcpy(gpu_mag, cpu_mag, bytes, cudaMemcpyHostToDevice);
  						
  			//call the GPU version of the vote calculation function
  			gpu_vote3<T>(gpu_vote, gpu_grad, gpu_mag, cos_phi, r, x , y, z);
  							
  			//copy the Vote Data back to the CPU
  			cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
  
  			//free allocated memory
  			cudaFree(gpu_vote);
  			cudaFree(gpu_grad);
  			cudaFree(gpu_mag);
  		}
  
  
  #endif