Blame view

stim/cuda/ivote/vote_atomic_bb.cuh 4.75 KB
3f0de7dd   Laila Saadatifard   upload the vote a...
1
2
  #ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
  #define STIM_CUDA_VOTE_ATOMIC_BB_H
03428452   Laila Saadatifard   update the ivote ...
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
  
  # include <iostream>
  # include <cuda.h>
  #include <stim/cuda/cudatools.h>
  #include <stim/cuda/sharedmem.cuh>
  #include <stim/visualization/aabb2.h>
  #include <stim/visualization/colormap.h>
  #include <math.h>
  
  namespace stim{
  	namespace cuda{
  
  		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  		template<typename T>
  		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  
  			extern __shared__ T S[];
  			T* shared_atan = S;
  			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
  			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
  			
  			// calculate the 2D coordinates for this current thread.
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;
  			int yi = blockIdx.y * blockDim.y + threadIdx.y;
  			
  			if(xi >= x || yi >= y) return;			
  			// convert 2D coordinates to 1D
  			int i = yi * x + xi;
  
  			// calculate the voting direction based on the grtadient direction
  			float theta = gpuGrad[2*i];
  			//calculate the amount of vote for the voter
  			float mag = gpuGrad[2*i + 1];
  			
  
  			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
  			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));
  			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
  			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
  			
  			// compute the size of window which will be checked for finding the proper voters for this pixel
  			int x_table = 2*rmax +1;
  			int rmax_sq = rmax * rmax;
  			
  			int lut_i;
  			T dx_sq, dy_sq;
  
  			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
  			bb.trim_high(x-1, y-1);
  
  			int by, bx;
  			int dx, dy;					
  			
  			unsigned int ind_g;											//initialize the maximum vote value to zero
  			T alpha;
  			
  			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
  				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
  				dy_sq = dy * dy;
  				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
  					dx = bx - xi;
  					dx_sq = dx * dx;
  					lut_i = (rmax - dy) * x_table + rmax - dx;
  					alpha = shared_atan[lut_i];
  					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
  						ind_g = (by)*x + (bx);
  						atomicAdd(&gpuVote[ind_g], mag);
  					
  					}
  				}
  			}			
  			
  		}
  	
  
  		template<typename T>
  		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  
  							
  			unsigned int max_threads = stim::maxThreadsPerBlock();
  			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
  			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
  			size_t shared_mem_req = table_bytes;// + template_bytes;
  			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;		
  			size_t shared_mem = stim::sharedMemPerBlock();
  			if(shared_mem_req > shared_mem){
  				std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
  				exit(1);
  			}
  
  			//call the kernel to do the voting
  			cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  
  		}
  
  
  		template<typename T>
  		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  
  			//calculate the number of bytes in the array
  			unsigned int bytes = x * y * sizeof(T);
  
  			//calculate the number of bytes in the atan2 table
  			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  
  			//allocate space on the GPU for the Vote Image
  			T* gpuVote;
  			cudaMalloc(&gpuVote, bytes);		
  
  			//allocate space on the GPU for the input Gradient image
  			T* gpuGrad;
  			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  
  			//copy the Gradient Magnitude data to the GPU
  			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  
  			//allocate space on the GPU for the atan2 table
  			T* gpuTable;
  			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  
  			//copy the atan2 values to the GPU
  			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  						
  			//call the GPU version of the vote calculation function
  			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  							
  			//copy the Vote Data back to the CPU
  			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  
  			//free allocated memory
  			cudaFree(gpuTable);
  			cudaFree(gpuVote);
  			cudaFree(gpuGrad);
  		}
  		
  	}
  }
  
  #endif