Blame view

stim/iVote/ivote2/vote_atomic_bb.cuh 5.52 KB
3f0de7dd   Laila Saadatifard   upload the vote a...
1
2
  #ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
  #define STIM_CUDA_VOTE_ATOMIC_BB_H
03428452   Laila Saadatifard   update the ivote ...
3
4
5
6
7
8
9
10
11
  
  # include <iostream>
  # include <cuda.h>
  #include <stim/cuda/cudatools.h>
  #include <stim/cuda/sharedmem.cuh>
  #include <stim/visualization/aabb2.h>
  #include <stim/visualization/colormap.h>
  #include <math.h>
  
276f9b23   David Mayerich   forced matching t...
12
13
  
  
03428452   Laila Saadatifard   update the ivote ...
14
15
16
17
18
  namespace stim{
  	namespace cuda{
  
  		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  		template<typename T>
276f9b23   David Mayerich   forced matching t...
19
  		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, size_t x, size_t y, bool gradmag = true){
03428452   Laila Saadatifard   update the ivote ...
20
21
22
23
24
25
26
  
  			extern __shared__ T S[];
  			T* shared_atan = S;
  			size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
  			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
  			
  			// calculate the 2D coordinates for this current thread.
276f9b23   David Mayerich   forced matching t...
27
28
  			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
  			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
03428452   Laila Saadatifard   update the ivote ...
29
30
31
  			
  			if(xi >= x || yi >= y) return;			
  			// convert 2D coordinates to 1D
276f9b23   David Mayerich   forced matching t...
32
  			size_t i = yi * x + xi;
03428452   Laila Saadatifard   update the ivote ...
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
  
  			// calculate the voting direction based on the grtadient direction
  			float theta = gpuGrad[2*i];
  			//calculate the amount of vote for the voter
  			float mag = gpuGrad[2*i + 1];
  			
  
  			stim::aabb2<int> bb(xi, yi);								//initialize a bounding box at the current point
  			bb.insert(xi + ceil(rmax * cos(theta)),       ceil(yi + rmax * sin(theta)));
  			bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi)));		//insert one corner of the triangle into the bounding box
  			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
  			
  			// compute the size of window which will be checked for finding the proper voters for this pixel
  			int x_table = 2*rmax +1;
  			int rmax_sq = rmax * rmax;
  			
  			int lut_i;
  			T dx_sq, dy_sq;
  
  			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
  			bb.trim_high(x-1, y-1);
  
276f9b23   David Mayerich   forced matching t...
55
  			size_t by, bx;
03428452   Laila Saadatifard   update the ivote ...
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
  			int dx, dy;					
  			
  			unsigned int ind_g;											//initialize the maximum vote value to zero
  			T alpha;
  			
  			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
  				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
  				dy_sq = dy * dy;
  				for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
  					dx = bx - xi;
  					dx_sq = dx * dx;
  					lut_i = (rmax - dy) * x_table + rmax - dx;
  					alpha = shared_atan[lut_i];
  					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
  						ind_g = (by)*x + (bx);
276f9b23   David Mayerich   forced matching t...
71
72
  						if(gradmag) atomicAdd(&gpuVote[ind_g], mag);			//add the gradient magnitude (if the gradmag flag is enabled)
  						else		atomicAdd(&gpuVote[ind_g], 1.0f);			//otherwise just add 1
03428452   Laila Saadatifard   update the ivote ...
73
74
75
76
77
78
79
80
  					
  					}
  				}
  			}			
  			
  		}
  	
  
276f9b23   David Mayerich   forced matching t...
81
82
83
84
85
86
87
88
  		/// Iterative voting for an image
  		/// @param gpuVote is the resulting vote image
  		/// @param gpuGrad is the gradient of the input image
  		/// @param gpuTable is the pre-computed atan2() table
  		/// @param phi is the angle of the vote region
  		/// @param rmax is the estimated radius of the blob (defines the "width" of the vote region)
  		/// @param x and y are the spatial dimensions of the gradient image
  		/// @param gradmag defines whether or not the gradient magnitude is taken into account during the vote
03428452   Laila Saadatifard   update the ivote ...
89
  		template<typename T>
6316fca0   Laila Saadatifard   fix the 'unresolv...
90
  		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y, bool DEBUG = false, bool gradmag = true){
03428452   Laila Saadatifard   update the ivote ...
91
  			unsigned int max_threads = stim::maxThreadsPerBlock();
276f9b23   David Mayerich   forced matching t...
92
93
  			dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
  			dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
03428452   Laila Saadatifard   update the ivote ...
94
95
  			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
  			size_t shared_mem_req = table_bytes;// + template_bytes;
276f9b23   David Mayerich   forced matching t...
96
  			if (DEBUG) std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
03428452   Laila Saadatifard   update the ivote ...
97
98
  			size_t shared_mem = stim::sharedMemPerBlock();
  			if(shared_mem_req > shared_mem){
6316fca0   Laila Saadatifard   fix the 'unresolv...
99
  				std::cout<<"Error: insufficient shared memory for this implementation of cuda_vote()."<<std::endl;
03428452   Laila Saadatifard   update the ivote ...
100
101
102
103
  				exit(1);
  			}
  
  			//call the kernel to do the voting
276f9b23   David Mayerich   forced matching t...
104
  			cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y, gradmag);
03428452   Laila Saadatifard   update the ivote ...
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
  
  		}
  
  
  		template<typename T>
  		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  
  			//calculate the number of bytes in the array
  			unsigned int bytes = x * y * sizeof(T);
  
  			//calculate the number of bytes in the atan2 table
  			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  
  			//allocate space on the GPU for the Vote Image
  			T* gpuVote;
  			cudaMalloc(&gpuVote, bytes);		
  
  			//allocate space on the GPU for the input Gradient image
  			T* gpuGrad;
  			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  
  			//copy the Gradient Magnitude data to the GPU
  			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  
  			//allocate space on the GPU for the atan2 table
  			T* gpuTable;
  			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  
  			//copy the atan2 values to the GPU
  			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  						
  			//call the GPU version of the vote calculation function
  			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  							
  			//copy the Vote Data back to the CPU
  			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  
  			//free allocated memory
  			cudaFree(gpuTable);
  			cudaFree(gpuVote);
  			cudaFree(gpuGrad);
  		}
  		
  	}
  }
  
  #endif