Blame view

stim/cuda/ivote/vote_atomic_shared.cuh 5.55 KB
11cd127f   Laila Saadatifard   Leila's ivote pro...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
  #ifndef STIM_CUDA_VOTE_ATOMIC_SHARED_H
  #define STIM_CUDA_VOTE_ATOMIC_SHARED_H
  
  # include <iostream>
  # include <cuda.h>
  #include <stim/cuda/cudatools.h>
  #include <stim/cuda/sharedmem.cuh>
  #include "cpyToshare.cuh"
  //#include "writebackshared.cuh"
  namespace stim{
  	namespace cuda{
  
  		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  		template<typename T>
  		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  
  			//generate a pointer to the shared memory
  			extern __shared__ float s_vote[];
  			// calculate the 2D coordinates for this current thread.
  			int bxi = blockIdx.x * blockDim.x;
  			int byi = blockIdx.y * blockDim.y;
  			int xi = bxi + threadIdx.x;
  			int yi = byi + threadIdx.y;
  			// convert 2D coordinates to 1D
  			int i = yi * x + xi;
  
  			// calculate the voting direction based on the gradient direction
  			float theta = gpuGrad[2*i];
  			//calculate the amount of vote for the voter
  			float mag = gpuGrad[2*i + 1];
  			
  			//find the starting points and size of window, wich will be copied to the shared memory
  			int bxs = bxi - rmax;
  			int bys = byi - rmax;
  			int xwidth = 2*rmax + blockDim.x;
  			int ywidth = 2*rmax + blockDim.y;
  			//compute the coordinations of this pixel in the 2D-shared memory.
  			int sx_rx = threadIdx.x + rmax;
  			int sy_ry = threadIdx.y + rmax;
  			// compute the size of window which will be checked for finding the counters for this voter
  			int x_table = 2*rmax +1;
  			int rmax_sq = rmax * rmax;
  			//calculate some parameters for indexing shared memory
  				//calculate the total number of threads available
  				unsigned int tThreads = blockDim.x * blockDim.y;
  				//calculate the current 1D thread ID
  				unsigned int ti =  threadIdx.y * (blockDim.x) + threadIdx.x;
  				//calculate the number of iteration required
  				unsigned int In = xwidth*ywidth/tThreads + 1;
  			if(xi < x && yi < y){
  				__syncthreads();
  				//initialize the shared memory to zero				
  				for (unsigned int i = 0; i < In; i++){								
  					unsigned int sIdx0 = i * tThreads + ti;
  					if (sIdx0< xwidth*ywidth) {
  						s_vote[sIdx0] = 0;
  					}
  				}
  				__syncthreads();
  				//for every line (along y)
  				for(int yr = -rmax; yr <= rmax; yr++){	
  					//compute the position of the current voter in the shared memory along the y axis.
  					unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
  					for(int xr = -rmax; xr <= rmax; xr++){												
  						
  						//find the location of the current pixel in the atan2 table
  						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  
  						// calculate the angle between the voter and the current pixel in x and y directions
  						float atan_angle = gpuTable[ind_t];
  							
  						// check if the current pixel is located in the voting area of this voter.
  						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  							//compute the position of the current voter in the 2D-shared memory along the x axis.
  							unsigned int sIdx_x = (sx_rx + xr);
  							//find the 1D index of this voter in the 2D-shared memory.
  							unsigned int s_Idx = (sIdx_y1d  + sIdx_x);
  								
  							atomicAdd(&s_vote[s_Idx], mag);								
  							}
  					}
  				}	
  				//write shared memory back to global memory
  				
  				__syncthreads();						
  				for (unsigned int i = 0; i < In; i++){
  				
  					unsigned int sIdx = i * tThreads + ti;
  					if (sIdx>= xwidth*ywidth) return;
  
  					unsigned int sy = sIdx/xwidth;
  					unsigned int sx = sIdx - (sy * xwidth);
  					
  					unsigned int gx = bxs + sx;
  					unsigned int gy = bys + sy;
  					if (gx<x&& gy<y){
  						unsigned int gIdx = gy * x + gx;
  						//write shared to global memory
  						atomicAdd(&gpuVote[gIdx], s_vote[sIdx]);
  						
  					}						
  				}
  				
  			}
  		}
  
  		template<typename T>
  		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  
  							
  			unsigned int max_threads = stim::maxThreadsPerBlock();
  			dim3 threads(sqrt(max_threads), sqrt(max_threads));
  			dim3 blocks(x/threads.x + 1 , y/threads.y+1);
  					
  			// specify  share memory
  			unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*sizeof(T);
  			
  			//call the kernel to do the voting
  			cuda_vote <<< blocks, threads, share_bytes>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  
  		}
  
  
  		template<typename T>
  		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  
  			//calculate the number of bytes in the array
  			unsigned int bytes = x * y * sizeof(T);
  
  			//calculate the number of bytes in the atan2 table
  			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  
  			//allocate space on the GPU for the Vote Image
  			T* gpuVote;
  			cudaMalloc(&gpuVote, bytes);		
  
  			//allocate space on the GPU for the input Gradient image
  			T* gpuGrad;
  			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  
  			//copy the Gradient Magnitude data to the GPU
  			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  
  			//allocate space on the GPU for the atan2 table
  			T* gpuTable;
  			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  
  			//copy the atan2 values to the GPU
  			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  						
  			//call the GPU version of the vote calculation function
  			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  							
  			//copy the Vote Data back to the CPU
  			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  
  			//free allocated memory
  			cudaFree(gpuTable);
  			cudaFree(gpuVote);
  			cudaFree(gpuGrad);
  		}
  		
  	}
  }
  
  #endif