Blame view

cpp/vote3_atomic.cuh 3.96 KB
a744d027   Laila Saadatifard   upload the ivote3...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
  #ifndef STIM_CUDA_VOTE3_ATOMIC_H
  #define STIM_CUDA_VOTE3_ATOMIC_H
  
  #include <iostream>
  #include <cuda.h>
  #include <stim/cuda/cudatools.h>
  #include <stim/cuda/cudatools/error.h>
  #include "cpyToshare.cuh"
  
  		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  		template<typename T>
  		__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
  
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;						//calculate x,y,z coordinates for this thread
  			
  			int grid_y = y / blockDim.y;										//find the grid size along y
  			int blockidx_y = blockIdx.y % grid_y;
  			int yi = blockidx_y * blockDim.y + threadIdx.y;
  			int zi = blockIdx.y / grid_y;
  						
  			if(xi>=x || yi>=y || zi>=z) return;
  
  			int i = zi * x * y + yi * x + xi;	  // calculate the 1D index of the voter
  			float gx_v = gpu_grad[3*i];           // find the gradient information in cartesian coordinate for the voter - global memory fetch
  			float gy_v = gpu_grad[3*i+1];
  			float gz_v = gpu_grad[3*i+2];
  
  			float mag_v = sqrt(gx_v*gx_v + gy_v*gy_v + gz_v*gz_v);  // compute the gradient magnitude for the voter
  
  			float gx_v_n = gx_v/mag_v;      // normalize the gradient vector for the voter
  			float gy_v_n = gy_v/mag_v;
  			float gz_v_n = gz_v/mag_v;
  
  			float rx_sq = rx * rx;        // compute the square for rmax 
  			float ry_sq = ry * ry;
  			float rz_sq = rz * rz;
  			float x_sq, y_sq, z_sq, d_c, cos_diff;
  			int xi_c, yi_c, zi_c, idx_c;
  
  			for (int z_c=-rz; z_c<=rz; z_c++){
  				zi_c = zi + z_c;              // calculate the z position for the current counter
  				if (zi_c <z && zi_c>=0){      // make sure the current counter is inside the image
  					z_sq = z_c * z_c;
  					for (int y_c=-ry; y_c<=ry; y_c++){
  						yi_c = yi + y_c;
  						if (yi_c < y && yi_c>=0){
  							y_sq = y_c * y_c;
  							for (int x_c=-rx; x_c<=rx; x_c++){
  								xi_c = xi + x_c;
  								if (xi_c < x && xi_c>=0){
  									x_sq = x_c * x_c;
  									d_c = sqrt(x_sq + y_sq + z_sq);			//calculate the distance between the voter and the current counter
  									cos_diff = (gx_v_n * x_c + gy_v_n * y_c +  gz_v_n * z_c)/(d_c);			 // calculate the cosine of angle between the voter and the current counter
  									if ( ( (x_sq/rx_sq + y_sq/ry_sq + z_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){      
  										idx_c = (zi_c * y + yi_c) * x + xi_c;			//calculate the 1D index for the current counter
  										atomicAdd (&gpu_vote[idx_c] , mag_v);
  									}
  								}
  							}
  						}
  					}
  				}
  			}	
  		}
  
  		template<typename T>
  		void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
  
  			
  			unsigned int max_threads = stim::maxThreadsPerBlock();
  			dim3 threads(sqrt (max_threads),sqrt (max_threads));
  			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  			//unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T);			
  			//call the kernel to do the voting
  			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
  
  		}
  
  
  		template<typename T>
  		void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
  
  			//calculate the number of bytes in the array
  			unsigned int bytes = x * y * z * sizeof(T);
  
  			
  			//allocate space on the GPU for the Vote Image
  			T* gpu_vote;
  			cudaMalloc(&gpu_vote, bytes);		
  
  			//allocate space on the GPU for the input Gradient image
  			T* gpu_grad;
  			cudaMalloc(&gpu_grad, bytes*3);
  			
  			
  			//copy the Gradient data to the GPU
  			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
  			
  					
  			//call the GPU version of the vote calculation function
  			gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
  							
  			//copy the Vote Data back to the CPU
  			cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
  
  			//free allocated memory
  			cudaFree(gpu_vote);
  			cudaFree(gpu_grad);
  			
  		}
  
  
  #endif