Blame view

stim/cuda/ivote/local_max.cuh 3.67 KB
13fe3c84   Laila Saadatifard   update the stimli...
1
2
3
4
5
6
  #ifndef STIM_CUDA_LOCAL_MAX_H
  #define STIM_CUDA_LOCAL_MAX_H
  
  
  # include <iostream>
  # include <cuda.h>
96f9b10f   Laila Saadatifard   change the header...
7
  #include <stim/cuda/cudatools.h>
13fe3c84   Laila Saadatifard   update the stimli...
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
  
  namespace stim{
  	namespace cuda{
  
  
  		// this kernel calculates the local maximum for finding the cell centers
  		template<typename T>
  		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
  
  			// calculate the 2D coordinates for this current thread.
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;
  			int yi = blockIdx.y;
  			// convert 2D coordinates to 1D
  			int i = yi * x + xi;
  
  
  
  			//calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
  			int xl = xi - conn;
  			int yl = yi - conn;
  			
  			// use zero for the lowest limits if the xi or yi is less than conn.
  			if (xi <= conn)
  				xl = 0;
  			if (yi <= conn)
  				yl = 0;
  
  			//calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
  			int xh = xi + conn;
  			int yh = yi + conn;
  
  			// use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn.
  			if (xi >= x - conn)
  				xh = x;
  			if (yi>= y - conn)
  				yh = y;
  
  			// calculate the limits for finding the local maximum location in the connected neighbors for the current pixel
  			int n_l = yl * x + xl;
  			int n_h = yh * x + xh;
  			
  			//initial the centers image to zero
  			gpuCenters[i] = 0;
  
  
  			int n = n_l;
  
  			float l_value = 0;
  
  			if (i < x * y)
  
  				// check if the vote value for this pixel is greater than threshold, so this pixel may be a local max.
  				if (gpuVote[i]>final_t){
  
  					// compare the vote value for this pixel with the vote values with its neighbors.
  					while (n<=n_h){
  
  						// check if this vote value is a local max in its neighborhood.
  							if (gpuVote[i] < gpuVote[n]){
  								l_value  = 0;
  								n =n_h+1;
  							}
  							else if (n == n_h){
  								l_value = 1;
  								n = n+1;
  							}
  							// check if the current neighbor is the last one at the current row
  							else if ((n - n_l - 2*conn)% x ==0){
  								n = n + x - 2*conn -1;
  								n ++;						
  							}
  							else
  								n ++;
  				}
  					// set the center value for this pixel to high if it's a local max ,and to low if not.
  					gpuCenters[i] = l_value ;
  				}
  
  		}
  
  
  
  		template<typename T>
  		void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
  
  			
  			
  			
  			unsigned int max_threads = stim::maxThreadsPerBlock();
  			dim3 threads(max_threads, 1);
  			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  			
  			
  			
  			//call the kernel to find the local maximum.
  			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
  
  
  		}
  
  
  
  		template<typename T>
  		void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
  		
  
  			//calculate the number of bytes in the array
  			unsigned int bytes = x * y * sizeof(T);
  
  			// allocate space on the GPU for the detected cell centes
  			T* gpuCenters;
  			cudaMalloc(&gpuCenters, bytes);		
  
  
  			//allocate space on the GPU for the input Vote Image
  			T* gpuVote;
  			cudaMalloc(&gpuVote, bytes);		
  
  			
  			//copy the Vote image data to the GPU
  			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
  
  			
  			//call the GPU version of the local max function
  			gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y);
  
  				
  			//copy the cell centers data to the CPU
  			cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ;
  
  			
  			//free allocated memory
  			cudaFree(gpuCenters);
  			cudaFree(gpuVote);
  			cudaFree(gpuGrad);
  		}
  		
  	}
  }
  
  
  
  #endif