Commit 213d109cb4b746856b5b3e7760ee6d8ef7ed90e7
Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib
Showing
14 changed files
with
1196 additions
and
18 deletions
Show diff stats
stim/cuda/arraymath.cuh
@@ -10,7 +10,8 @@ | @@ -10,7 +10,8 @@ | ||
10 | #include <stim/cuda/arraymath/array_atan.cuh> | 10 | #include <stim/cuda/arraymath/array_atan.cuh> |
11 | #include <stim/cuda/arraymath/array_abs.cuh> | 11 | #include <stim/cuda/arraymath/array_abs.cuh> |
12 | #include <stim/cuda/arraymath/array_cart2polar.cuh> | 12 | #include <stim/cuda/arraymath/array_cart2polar.cuh> |
13 | - | 13 | +//#include <stim/cuda/arraymath/array_threshold.cuh> |
14 | +//#include <stim/cuda/arraymath/array_add3.cuh> | ||
14 | namespace stim{ | 15 | namespace stim{ |
15 | namespace cuda{ | 16 | namespace cuda{ |
16 | 17 |
1 | +#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H | ||
2 | +#define STIM_CUDA_UPDATE_DIR_GLOBAL_H | ||
3 | + | ||
4 | +# include <iostream> | ||
5 | +# include <cuda.h> | ||
6 | +#include <stim/cuda/cudatools.h> | ||
7 | +#include <stim/cuda/sharedmem.cuh> | ||
8 | +#include <math.h> | ||
9 | +#include "cpyToshare.cuh" | ||
10 | + | ||
11 | +#define RMAX_TEST 8 | ||
12 | + | ||
13 | +namespace stim{ | ||
14 | + namespace cuda{ | ||
15 | + | ||
16 | + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. | ||
17 | + template<typename T> | ||
18 | + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | ||
19 | + extern __shared__ T atan2_table[]; | ||
20 | + | ||
21 | + //calculate the start point for this block | ||
22 | + //int bxi = blockIdx.x * blockDim.x; | ||
23 | + | ||
24 | + stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x); | ||
25 | + | ||
26 | + __syncthreads(); | ||
27 | + | ||
28 | + // calculate the 2D coordinates for this current thread. | ||
29 | + //int xi = bxi + threadIdx.x; | ||
30 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
31 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
32 | + if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel | ||
33 | + | ||
34 | + int i = yi * x + xi; // convert 2D coordinates to 1D | ||
35 | + | ||
36 | + float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch | ||
37 | + gpuDir[i] = 0; //initialize the vote direction to zero | ||
38 | + float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter | ||
39 | + int id_x = 0; // define two local variables for the x and y position of the maximum | ||
40 | + int id_y = 0; | ||
41 | + | ||
42 | + int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter | ||
43 | + int rmax_sq = rmax * rmax; | ||
44 | + int tx_rmax = threadIdx.x + rmax; | ||
45 | + float atan_angle; | ||
46 | + float vote_c; | ||
47 | + unsigned int ind_t; | ||
48 | + for(int yr = -rmax; yr <= rmax; yr++){ //for each counter in the y direction | ||
49 | + if (yi+yr >= 0 && yi + yr < y){ //if the counter exists (we aren't looking outside of the image) | ||
50 | + for(int xr = -rmax; xr <= rmax; xr++){ //for each counter in the x direction | ||
51 | + if((xr * xr + yr *yr)< rmax_sq){ //if the counter is within range of the voter | ||
52 | + | ||
53 | + ind_t = (rmax - yr) * x_table + rmax - xr; //calculate the index to the atan2 table | ||
54 | + atan_angle = atan2_table[ind_t]; //retrieve the direction vector from the table | ||
55 | + | ||
56 | + //atan_angle = atan2((float)yr, (float)xr); | ||
57 | + | ||
58 | + if (abs(atan_angle - theta) <phi){ // check if the current pixel is located in the voting angle of this voter. | ||
59 | + vote_c = gpuVote[(yi+yr)*x + (xi+xr)]; // find the vote value for the current counter | ||
60 | + if(vote_c>max) { // compare the vote value of this pixel with the max value to find the maxima and its index. | ||
61 | + max = vote_c; | ||
62 | + id_x = xr; | ||
63 | + id_y = yr; | ||
64 | + } | ||
65 | + } | ||
66 | + } | ||
67 | + } | ||
68 | + } | ||
69 | + } | ||
70 | + | ||
71 | + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | ||
72 | + float new_angle = gpuTable[ind_m]; | ||
73 | + | ||
74 | + if(xi < x && yi < y) | ||
75 | + gpuDir[i] = new_angle; | ||
76 | + } //end kernel | ||
77 | + | ||
78 | + // this kernel updates the gradient direction by the calculated voting direction. | ||
79 | + template<typename T> | ||
80 | + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ | ||
81 | + | ||
82 | + // calculate the 2D coordinates for this current thread. | ||
83 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
84 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
85 | + | ||
86 | + // convert 2D coordinates to 1D | ||
87 | + int i = yi * x + xi; | ||
88 | + | ||
89 | + //update the gradient image with the vote direction | ||
90 | + gpuGrad[2*i] = gpuDir[i]; | ||
91 | + } | ||
92 | + | ||
93 | + template<typename T> | ||
94 | + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
95 | + | ||
96 | + | ||
97 | + | ||
98 | + //calculate the number of bytes in the array | ||
99 | + unsigned int bytes = x * y * sizeof(T); | ||
100 | + | ||
101 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
102 | + | ||
103 | + dim3 threads(sqrt(max_threads), sqrt(max_threads)); | ||
104 | + dim3 blocks(x/threads.x + 1, y/threads.y + 1); | ||
105 | + | ||
106 | + | ||
107 | + | ||
108 | + // allocate space on the GPU for the updated vote direction | ||
109 | + T* gpuDir; | ||
110 | + cudaMalloc(&gpuDir, bytes); | ||
111 | + | ||
112 | + size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2); | ||
113 | + std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl; | ||
114 | + | ||
115 | + //call the kernel to calculate the new voting direction | ||
116 | + cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
117 | + | ||
118 | + //call the kernel to update the gradient direction | ||
119 | + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | ||
120 | + | ||
121 | + //free allocated memory | ||
122 | + cudaFree(gpuDir); | ||
123 | + | ||
124 | + } | ||
125 | + | ||
126 | + template<typename T> | ||
127 | + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
128 | + | ||
129 | + //calculate the number of bytes in the array | ||
130 | + unsigned int bytes = x * y * sizeof(T); | ||
131 | + | ||
132 | + //calculate the number of bytes in the atan2 table | ||
133 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | ||
134 | + | ||
135 | + //allocate space on the GPU for the Vote Image | ||
136 | + T* gpuVote; | ||
137 | + cudaMalloc(&gpuVote, bytes); | ||
138 | + | ||
139 | + //copy the input vote image to the GPU | ||
140 | + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); | ||
141 | + | ||
142 | + //allocate space on the GPU for the input Gradient image | ||
143 | + T* gpuGrad; | ||
144 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | ||
145 | + | ||
146 | + //copy the Gradient data to the GPU | ||
147 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | ||
148 | + | ||
149 | + //allocate space on the GPU for the atan2 table | ||
150 | + T* gpuTable; | ||
151 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | ||
152 | + | ||
153 | + //copy the atan2 values to the GPU | ||
154 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | ||
155 | + | ||
156 | + //call the GPU version of the update direction function | ||
157 | + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
158 | + | ||
159 | + //copy the new gradient image back to the CPU | ||
160 | + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; | ||
161 | + | ||
162 | + //free allocated memory | ||
163 | + cudaFree(gpuTable); | ||
164 | + cudaFree(gpuVote); | ||
165 | + cudaFree(gpuGrad); | ||
166 | + } | ||
167 | + | ||
168 | + } | ||
169 | +} | ||
170 | + | ||
171 | +#endif | ||
0 | \ No newline at end of file | 172 | \ No newline at end of file |
stim/cuda/ivote/local_max.cuh
@@ -29,17 +29,33 @@ namespace stim{ | @@ -29,17 +29,33 @@ namespace stim{ | ||
29 | //compare to the threshold | 29 | //compare to the threshold |
30 | if(val < final_t) return; | 30 | if(val < final_t) return; |
31 | 31 | ||
32 | + //define an array to store indices with same vote value | ||
33 | + /*int * IdxEq; | ||
34 | + IdxEq = new int [2*conn]; | ||
35 | + int n = 0;*/ | ||
36 | + | ||
32 | for(int xl = xi - conn; xl < xi + conn; xl++){ | 37 | for(int xl = xi - conn; xl < xi + conn; xl++){ |
33 | for(int yl = yi - conn; yl < yi + conn; yl++){ | 38 | for(int yl = yi - conn; yl < yi + conn; yl++){ |
34 | if(xl >= 0 && xl < x && yl >= 0 && yl < y){ | 39 | if(xl >= 0 && xl < x && yl >= 0 && yl < y){ |
35 | int il = yl * x + xl; | 40 | int il = yl * x + xl; |
36 | if(gpuVote[il] > val){ | 41 | if(gpuVote[il] > val){ |
37 | return; | 42 | return; |
38 | - } | ||
39 | - } | 43 | + } |
44 | + if (gpuVote[il] == val){ | ||
45 | + /*IdxEq[n] = il; | ||
46 | + n = n+1;*/ | ||
47 | + if( il > i){ | ||
48 | + return; | ||
49 | + } | ||
50 | + } | ||
51 | + } | ||
40 | } | 52 | } |
41 | } | 53 | } |
42 | - | 54 | + /*if (n!=0){ |
55 | + if(IdxEq[n/2] !=i){ | ||
56 | + return; | ||
57 | + } | ||
58 | + } */ | ||
43 | gpuCenters[i] = 1; | 59 | gpuCenters[i] = 1; |
44 | } | 60 | } |
45 | 61 |
1 | +#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H | ||
2 | +#define STIM_CUDA_UPDATE_DIR_GLOBAL_H | ||
3 | + | ||
4 | +# include <iostream> | ||
5 | +# include <cuda.h> | ||
6 | +#include <stim/cuda/cudatools.h> | ||
7 | +#include <stim/cuda/sharedmem.cuh> | ||
8 | +#include <stim/visualization/aabb2.h> | ||
9 | +#include <stim/visualization/colormap.h> | ||
10 | +#include <math.h> | ||
11 | +#include "cpyToshare.cuh" | ||
12 | + | ||
13 | +//#define RMAX_TEST 8 | ||
14 | + | ||
15 | +namespace stim{ | ||
16 | + namespace cuda{ | ||
17 | + | ||
18 | + template<typename T> | ||
19 | + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | ||
20 | + extern __shared__ T S[]; | ||
21 | + T* shared_atan = S; | ||
22 | + size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1); | ||
23 | + stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); | ||
24 | + | ||
25 | + //T* shared_vote = &S[n_table]; | ||
26 | + //size_t template_size_x = (blockDim.x + 2 * rmax); | ||
27 | + //size_t template_size_y = (blockDim.y + 2 * rmax); | ||
28 | + //stim::cuda::threadedMemcpy2D((char*)shared_vote, (char*)gpuVote, template_size_x, template_size_y, x, threadIdx.y * blockDim.x + threadIdx.x, blockDim.x * blockDim.y); | ||
29 | + | ||
30 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate the 2D coordinates for this current thread. | ||
31 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
32 | + | ||
33 | + if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel | ||
34 | + | ||
35 | + int i = yi * x + xi; //convert 2D coordinates to 1D | ||
36 | + float theta = gpuGrad[2*i]; //calculate the voting direction based on the grtadient direction - global memory fetch | ||
37 | + | ||
38 | + stim::aabb2<int> bb(xi, yi); //initialize a bounding box at the current point | ||
39 | + bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta))); | ||
40 | + bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box | ||
41 | + bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box | ||
42 | + | ||
43 | + int x_table = 2*rmax +1; | ||
44 | + int lut_i; | ||
45 | + T rmax_sq = rmax * rmax; | ||
46 | + T dx_sq, dy_sq; | ||
47 | + | ||
48 | + bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image | ||
49 | + bb.trim_high(x-1, y-1); | ||
50 | + | ||
51 | + int by, bx; | ||
52 | + int dx, dy; //coordinate relative to (xi, yi) | ||
53 | + T v; | ||
54 | + T max_v = 0; //initialize the maximum vote value to zero | ||
55 | + T alpha; | ||
56 | + int max_dx = bb.low[0]; | ||
57 | + int max_dy = bb.low[1]; | ||
58 | + for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box | ||
59 | + dy = by - yi; //calculate the y coordinate of the current point relative to yi | ||
60 | + dy_sq = dy * dy; | ||
61 | + for(bx = bb.low[0]; bx <= bb.high[0]; bx++){ | ||
62 | + dx = bx - xi; | ||
63 | + dx_sq = dx * dx; | ||
64 | + lut_i = (rmax - dy) * x_table + rmax - dx; | ||
65 | + alpha = shared_atan[lut_i]; | ||
66 | + if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ | ||
67 | + v = gpuVote[by * x + bx]; // find the vote value for the current counter | ||
68 | + if(v > max_v){ | ||
69 | + max_v = v; | ||
70 | + max_dx = dx; | ||
71 | + max_dy = dy; | ||
72 | + } | ||
73 | + } | ||
74 | + } | ||
75 | + } | ||
76 | + gpuDir[i] = atan2((T)max_dy, (T)max_dx); | ||
77 | + } | ||
78 | + | ||
79 | + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. | ||
80 | + template<typename T> | ||
81 | + __global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | ||
82 | + | ||
83 | + | ||
84 | + // calculate the 2D coordinates for this current thread. | ||
85 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
86 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
87 | + | ||
88 | + if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel | ||
89 | + | ||
90 | + int i = yi * x + xi; // convert 2D coordinates to 1D | ||
91 | + | ||
92 | + float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch | ||
93 | + gpuDir[i] = 0; //initialize the vote direction to zero | ||
94 | + float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter | ||
95 | + int id_x = 0; // define two local variables for the x and y position of the maximum | ||
96 | + int id_y = 0; | ||
97 | + | ||
98 | + int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter | ||
99 | + int rmax_sq = rmax * rmax; | ||
100 | + int tx_rmax = threadIdx.x + rmax; | ||
101 | + float atan_angle; | ||
102 | + float vote_c; | ||
103 | + int xidx, yidx, yr_sq, xr_sq; | ||
104 | + for(int yr = -rmax; yr <= rmax; yr++){ | ||
105 | + yidx = yi + yr; //compute the index into the image | ||
106 | + if (yidx >= 0 && yidx < y){ //if the current y-index is inside the image | ||
107 | + yr_sq = yr * yr; //compute the square of yr, to save time later | ||
108 | + for(int xr = -rmax; xr <= rmax; xr++){ | ||
109 | + xidx = xi + xr; | ||
110 | + if(xidx >= 0 && xidx < x){ | ||
111 | + xr_sq = xr * xr; | ||
112 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | ||
113 | + | ||
114 | + // calculate the angle between the voter and the current pixel in x and y directions | ||
115 | + atan_angle = gpuTable[ind_t]; | ||
116 | + //atan_angle = atan2((T)yr, (T)xr); | ||
117 | + | ||
118 | + // check if the current pixel is located in the voting area of this voter. | ||
119 | + if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | ||
120 | + | ||
121 | + vote_c = gpuVote[yidx * x + xidx]; // find the vote value for the current counter | ||
122 | + // compare the vote value of this pixel with the max value to find the maxima and its index. | ||
123 | + if (vote_c>max) { | ||
124 | + | ||
125 | + max = vote_c; | ||
126 | + id_x = xr; | ||
127 | + id_y = yr; | ||
128 | + } | ||
129 | + } | ||
130 | + } | ||
131 | + } | ||
132 | + } | ||
133 | + } | ||
134 | + | ||
135 | + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | ||
136 | + float new_angle = gpuTable[ind_m]; | ||
137 | + | ||
138 | + if(xi < x && yi < y) | ||
139 | + gpuDir[i] = new_angle; | ||
140 | + } //end kernel | ||
141 | + | ||
142 | + | ||
143 | + // this kernel updates the gradient direction by the calculated voting direction. | ||
144 | + template<typename T> | ||
145 | + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ | ||
146 | + | ||
147 | + // calculate the 2D coordinates for this current thread. | ||
148 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
149 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
150 | + | ||
151 | + if(xi >= x || yi >= y) return; | ||
152 | + | ||
153 | + // convert 2D coordinates to 1D | ||
154 | + int i = yi * x + xi; | ||
155 | + | ||
156 | + //update the gradient image with the vote direction | ||
157 | + gpuGrad[2*i] = gpuDir[i]; | ||
158 | + } | ||
159 | + | ||
160 | + template<typename T> | ||
161 | + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
162 | + | ||
163 | + //calculate the number of bytes in the array | ||
164 | + unsigned int bytes = x * y * sizeof(T); | ||
165 | + | ||
166 | + // allocate space on the GPU for the updated vote direction | ||
167 | + T* gpuDir; | ||
168 | + HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) ); | ||
169 | + | ||
170 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
171 | + //dim3 threads(min(x, max_threads), 1); | ||
172 | + //dim3 blocks(x/threads.x, y); | ||
173 | + | ||
174 | + dim3 threads( sqrt(max_threads), sqrt(max_threads) ); | ||
175 | + dim3 blocks(x/threads.x + 1, y/threads.y + 1); | ||
176 | + | ||
177 | + size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); | ||
178 | + //size_t curtain = 2 * rmax; | ||
179 | + //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain); | ||
180 | + size_t shared_mem_req = table_bytes;// + template_bytes; | ||
181 | + std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl; | ||
182 | + | ||
183 | + size_t shared_mem = stim::sharedMemPerBlock(); | ||
184 | + if(shared_mem_req > shared_mem){ | ||
185 | + std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl; | ||
186 | + exit(1); | ||
187 | + } | ||
188 | + | ||
189 | + //call the kernel to calculate the new voting direction | ||
190 | + cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
191 | + stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer); | ||
192 | + | ||
193 | + //exit(0); | ||
194 | + | ||
195 | + threads = dim3( sqrt(max_threads), sqrt(max_threads) ); | ||
196 | + blocks = dim3(x/threads.x + 1, y/threads.y + 1); | ||
197 | + | ||
198 | + //call the kernel to update the gradient direction | ||
199 | + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | ||
200 | + //free allocated memory | ||
201 | + HANDLE_ERROR( cudaFree(gpuDir) ); | ||
202 | + | ||
203 | + } | ||
204 | + | ||
205 | + template<typename T> | ||
206 | + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
207 | + | ||
208 | + //calculate the number of bytes in the array | ||
209 | + unsigned int bytes = x * y * sizeof(T); | ||
210 | + | ||
211 | + //calculate the number of bytes in the atan2 table | ||
212 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | ||
213 | + | ||
214 | + //allocate space on the GPU for the Vote Image | ||
215 | + T* gpuVote; | ||
216 | + cudaMalloc(&gpuVote, bytes); | ||
217 | + | ||
218 | + //copy the input vote image to the GPU | ||
219 | + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); | ||
220 | + | ||
221 | + //allocate space on the GPU for the input Gradient image | ||
222 | + T* gpuGrad; | ||
223 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | ||
224 | + | ||
225 | + //copy the Gradient data to the GPU | ||
226 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | ||
227 | + | ||
228 | + //allocate space on the GPU for the atan2 table | ||
229 | + T* gpuTable; | ||
230 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | ||
231 | + | ||
232 | + //copy the atan2 values to the GPU | ||
233 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | ||
234 | + | ||
235 | + //call the GPU version of the update direction function | ||
236 | + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
237 | + | ||
238 | + //copy the new gradient image back to the CPU | ||
239 | + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; | ||
240 | + | ||
241 | + //free allocated memory | ||
242 | + cudaFree(gpuTable); | ||
243 | + cudaFree(gpuVote); | ||
244 | + cudaFree(gpuGrad); | ||
245 | + } | ||
246 | + | ||
247 | + } | ||
248 | +} | ||
249 | + | ||
250 | +#endif | ||
0 | \ No newline at end of file | 251 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_UPDATE_DIR_SHARED_H | ||
2 | +#define STIM_CUDA_UPDATE_DIR_SHARED_H | ||
3 | + | ||
4 | +# include <iostream> | ||
5 | +# include <cuda.h> | ||
6 | +#include <stim/cuda/cudatools.h> | ||
7 | +#include <stim/cuda/sharedmem.cuh> | ||
8 | +#include "cpyToshare.cuh" | ||
9 | + | ||
10 | +namespace stim{ | ||
11 | + namespace cuda{ | ||
12 | + | ||
13 | + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. | ||
14 | + template<typename T> | ||
15 | + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | ||
16 | + | ||
17 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | ||
18 | + extern __shared__ float s_vote[]; | ||
19 | + | ||
20 | + //calculate the start point for this block | ||
21 | + int bxi = blockIdx.x * blockDim.x; | ||
22 | + | ||
23 | + // calculate the 2D coordinates for this current thread. | ||
24 | + int xi = bxi + threadIdx.x; | ||
25 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
26 | + // convert 2D coordinates to 1D | ||
27 | + int i = yi * x + xi; | ||
28 | + | ||
29 | + // calculate the voting direction based on the grtadient direction | ||
30 | + float theta = gpuGrad[2*i]; | ||
31 | + | ||
32 | + //initialize the vote direction to zero | ||
33 | + gpuDir[i] = 0; | ||
34 | + | ||
35 | + // define a local variable to maximum value of the vote image in the voting area for this voter | ||
36 | + float max = 0; | ||
37 | + | ||
38 | + // define two local variables for the x and y coordinations where the maximum happened | ||
39 | + int id_x = 0; | ||
40 | + int id_y = 0; | ||
41 | + | ||
42 | + //calculate the width of the shared memory block | ||
43 | + int swidth = 2 * rmax + blockDim.x; | ||
44 | + | ||
45 | + // compute the size of window which will be checked for finding the voting area for this voter | ||
46 | + int x_table = 2*rmax +1; | ||
47 | + int rmax_sq = rmax * rmax; | ||
48 | + int tx_rmax = threadIdx.x + rmax; | ||
49 | + int bxs = bxi - rmax; | ||
50 | + | ||
51 | + for(int yr = -rmax; yr <= rmax; yr++){ | ||
52 | + //if (yi+yr >= 0 && yi + yr < y){ | ||
53 | + //copy the portion of the image necessary for this block to shared memory | ||
54 | + __syncthreads(); | ||
55 | + cpyG2S1D<float>(s_vote, gpuVote, bxs, yi + yr , swidth, 1, threadIdx, blockDim, x, y); | ||
56 | + __syncthreads(); | ||
57 | + | ||
58 | + //if the current thread is outside of the image, it doesn't have to be computed | ||
59 | + if(xi < x && yi < y){ | ||
60 | + | ||
61 | + for(int xr = -rmax; xr <= rmax; xr++){ | ||
62 | + | ||
63 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | ||
64 | + | ||
65 | + // calculate the angle between the voter and the current pixel in x and y directions | ||
66 | + float atan_angle = gpuTable[ind_t]; | ||
67 | + | ||
68 | + // calculate the voting direction based on the grtadient direction | ||
69 | + int idx_share_update = xr + tx_rmax ; | ||
70 | + float share_vote = s_vote[idx_share_update]; | ||
71 | + | ||
72 | + // check if the current pixel is located in the voting area of this voter. | ||
73 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | ||
74 | + | ||
75 | + // compare the vote value of this pixel with the max value to find the maxima and its index. | ||
76 | + if (share_vote>max) { | ||
77 | + | ||
78 | + max = share_vote; | ||
79 | + id_x = xr; | ||
80 | + id_y = yr; | ||
81 | + } | ||
82 | + } | ||
83 | + } | ||
84 | + } | ||
85 | + //} | ||
86 | + } | ||
87 | + | ||
88 | + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | ||
89 | + float new_angle = gpuTable[ind_m]; | ||
90 | + | ||
91 | + if(xi < x && yi < y) | ||
92 | + gpuDir[i] = new_angle; | ||
93 | + | ||
94 | + } | ||
95 | + | ||
96 | + // this kernel updates the gradient direction by the calculated voting direction. | ||
97 | + template<typename T> | ||
98 | + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ | ||
99 | + | ||
100 | + // calculate the 2D coordinates for this current thread. | ||
101 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
102 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
103 | + | ||
104 | + // convert 2D coordinates to 1D | ||
105 | + int i = yi * x + xi; | ||
106 | + | ||
107 | + //update the gradient image with the vote direction | ||
108 | + gpuGrad[2*i] = gpuDir[i]; | ||
109 | + } | ||
110 | + | ||
111 | + template<typename T> | ||
112 | + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
113 | + | ||
114 | + //calculate the number of bytes in the array | ||
115 | + unsigned int bytes = x * y * sizeof(T); | ||
116 | + | ||
117 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
118 | + dim3 threads(max_threads, 1); | ||
119 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | ||
120 | + | ||
121 | + // specify share memory | ||
122 | + unsigned int share_bytes = (2*rmax + threads.x)*(1)*4; | ||
123 | + | ||
124 | + // allocate space on the GPU for the updated vote direction | ||
125 | + T* gpuDir; | ||
126 | + cudaMalloc(&gpuDir, bytes); | ||
127 | + | ||
128 | + //call the kernel to calculate the new voting direction | ||
129 | + cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
130 | + | ||
131 | + //call the kernel to update the gradient direction | ||
132 | + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | ||
133 | + | ||
134 | + //free allocated memory | ||
135 | + cudaFree(gpuDir); | ||
136 | + | ||
137 | + } | ||
138 | + | ||
139 | + template<typename T> | ||
140 | + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
141 | + | ||
142 | + //calculate the number of bytes in the array | ||
143 | + unsigned int bytes = x * y * sizeof(T); | ||
144 | + | ||
145 | + //calculate the number of bytes in the atan2 table | ||
146 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | ||
147 | + | ||
148 | + //allocate space on the GPU for the Vote Image | ||
149 | + T* gpuVote; | ||
150 | + cudaMalloc(&gpuVote, bytes); | ||
151 | + | ||
152 | + //copy the input vote image to the GPU | ||
153 | + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); | ||
154 | + | ||
155 | + //allocate space on the GPU for the input Gradient image | ||
156 | + T* gpuGrad; | ||
157 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | ||
158 | + | ||
159 | + //copy the Gradient data to the GPU | ||
160 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | ||
161 | + | ||
162 | + //allocate space on the GPU for the atan2 table | ||
163 | + T* gpuTable; | ||
164 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | ||
165 | + | ||
166 | + //copy the atan2 values to the GPU | ||
167 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | ||
168 | + | ||
169 | + //call the GPU version of the update direction function | ||
170 | + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
171 | + | ||
172 | + //copy the new gradient image back to the CPU | ||
173 | + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; | ||
174 | + | ||
175 | + //free allocated memory | ||
176 | + cudaFree(gpuTable); | ||
177 | + cudaFree(gpuVote); | ||
178 | + cudaFree(gpuGrad); | ||
179 | + } | ||
180 | + | ||
181 | + } | ||
182 | +} | ||
183 | + | ||
184 | +#endif | ||
0 | \ No newline at end of file | 185 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_VOTE_ATOMIC_H | ||
2 | +#define STIM_CUDA_VOTE_ATOMIC_H | ||
3 | + | ||
4 | +# include <iostream> | ||
5 | +# include <cuda.h> | ||
6 | +#include <stim/cuda/cudatools.h> | ||
7 | +#include <stim/cuda/sharedmem.cuh> | ||
8 | +#include "cpyToshare.cuh" | ||
9 | + | ||
10 | +namespace stim{ | ||
11 | + namespace cuda{ | ||
12 | + | ||
13 | + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | ||
14 | + template<typename T> | ||
15 | + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | ||
16 | + | ||
17 | + | ||
18 | + // calculate the 2D coordinates for this current thread. | ||
19 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
20 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
21 | + // convert 2D coordinates to 1D | ||
22 | + int i = yi * x + xi; | ||
23 | + | ||
24 | + // calculate the voting direction based on the grtadient direction | ||
25 | + float theta = gpuGrad[2*i]; | ||
26 | + //calculate the amount of vote for the voter | ||
27 | + float mag = gpuGrad[2*i + 1]; | ||
28 | + | ||
29 | + // compute the size of window which will be checked for finding the proper voters for this pixel | ||
30 | + int x_table = 2*rmax +1; | ||
31 | + int rmax_sq = rmax * rmax; | ||
32 | + if(xi < x && yi < y){ | ||
33 | + //for every line (along y) | ||
34 | + for(int yr = -rmax; yr <= rmax; yr++){ | ||
35 | + for(int xr = -rmax; xr <= rmax; xr++){ | ||
36 | + if ((yi+yr)>=0 && (yi+yr)<y && (xi+xr)>=0 && (xi+xr)<x){ | ||
37 | + | ||
38 | + //find the location of the current pixel in the atan2 table | ||
39 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | ||
40 | + | ||
41 | + // calculate the angle between the voter and the current pixel in x and y directions | ||
42 | + float atan_angle = gpuTable[ind_t]; | ||
43 | + | ||
44 | + // check if the current pixel is located in the voting area of this voter. | ||
45 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | ||
46 | + // calculate the 1D index for the current pixel in global memory | ||
47 | + unsigned int ind_g = (yi+yr)*x + (xi+xr); | ||
48 | + atomicAdd(&gpuVote[ind_g], mag); | ||
49 | + | ||
50 | + } | ||
51 | + } | ||
52 | + } | ||
53 | + } | ||
54 | + } | ||
55 | + } | ||
56 | + | ||
57 | + template<typename T> | ||
58 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
59 | + | ||
60 | + | ||
61 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
62 | + dim3 threads(max_threads, 1); | ||
63 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | ||
64 | + | ||
65 | + // specify share memory | ||
66 | + //unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4; | ||
67 | + | ||
68 | + //call the kernel to do the voting | ||
69 | + cuda_vote <<< blocks, threads>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
70 | + | ||
71 | + } | ||
72 | + | ||
73 | + | ||
74 | + template<typename T> | ||
75 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
76 | + | ||
77 | + //calculate the number of bytes in the array | ||
78 | + unsigned int bytes = x * y * sizeof(T); | ||
79 | + | ||
80 | + //calculate the number of bytes in the atan2 table | ||
81 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | ||
82 | + | ||
83 | + //allocate space on the GPU for the Vote Image | ||
84 | + T* gpuVote; | ||
85 | + cudaMalloc(&gpuVote, bytes); | ||
86 | + | ||
87 | + //allocate space on the GPU for the input Gradient image | ||
88 | + T* gpuGrad; | ||
89 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | ||
90 | + | ||
91 | + //copy the Gradient Magnitude data to the GPU | ||
92 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | ||
93 | + | ||
94 | + //allocate space on the GPU for the atan2 table | ||
95 | + T* gpuTable; | ||
96 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | ||
97 | + | ||
98 | + //copy the atan2 values to the GPU | ||
99 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | ||
100 | + | ||
101 | + //call the GPU version of the vote calculation function | ||
102 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
103 | + | ||
104 | + //copy the Vote Data back to the CPU | ||
105 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | ||
106 | + | ||
107 | + //free allocated memory | ||
108 | + cudaFree(gpuTable); | ||
109 | + cudaFree(gpuVote); | ||
110 | + cudaFree(gpuGrad); | ||
111 | + } | ||
112 | + | ||
113 | + } | ||
114 | +} | ||
115 | + | ||
116 | +#endif | ||
0 | \ No newline at end of file | 117 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_VOTE_ATOMIC_SHARED_H | ||
2 | +#define STIM_CUDA_VOTE_ATOMIC_SHARED_H | ||
3 | + | ||
4 | +# include <iostream> | ||
5 | +# include <cuda.h> | ||
6 | +#include <stim/cuda/cudatools.h> | ||
7 | +#include <stim/cuda/sharedmem.cuh> | ||
8 | +#include "cpyToshare.cuh" | ||
9 | +//#include "writebackshared.cuh" | ||
10 | +namespace stim{ | ||
11 | + namespace cuda{ | ||
12 | + | ||
13 | + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | ||
14 | + template<typename T> | ||
15 | + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | ||
16 | + | ||
17 | + //generate a pointer to the shared memory | ||
18 | + extern __shared__ float s_vote[]; | ||
19 | + // calculate the 2D coordinates for this current thread. | ||
20 | + int bxi = blockIdx.x * blockDim.x; | ||
21 | + int byi = blockIdx.y * blockDim.y; | ||
22 | + int xi = bxi + threadIdx.x; | ||
23 | + int yi = byi + threadIdx.y; | ||
24 | + // convert 2D coordinates to 1D | ||
25 | + int i = yi * x + xi; | ||
26 | + | ||
27 | + // calculate the voting direction based on the gradient direction | ||
28 | + float theta = gpuGrad[2*i]; | ||
29 | + //calculate the amount of vote for the voter | ||
30 | + float mag = gpuGrad[2*i + 1]; | ||
31 | + | ||
32 | + //find the starting points and size of window, wich will be copied to the shared memory | ||
33 | + int bxs = bxi - rmax; | ||
34 | + int bys = byi - rmax; | ||
35 | + int xwidth = 2*rmax + blockDim.x; | ||
36 | + int ywidth = 2*rmax + blockDim.y; | ||
37 | + //compute the coordinations of this pixel in the 2D-shared memory. | ||
38 | + int sx_rx = threadIdx.x + rmax; | ||
39 | + int sy_ry = threadIdx.y + rmax; | ||
40 | + // compute the size of window which will be checked for finding the counters for this voter | ||
41 | + int x_table = 2*rmax +1; | ||
42 | + int rmax_sq = rmax * rmax; | ||
43 | + //calculate some parameters for indexing shared memory | ||
44 | + //calculate the total number of threads available | ||
45 | + unsigned int tThreads = blockDim.x * blockDim.y; | ||
46 | + //calculate the current 1D thread ID | ||
47 | + unsigned int ti = threadIdx.y * (blockDim.x) + threadIdx.x; | ||
48 | + //calculate the number of iteration required | ||
49 | + unsigned int In = xwidth*ywidth/tThreads + 1; | ||
50 | + if(xi < x && yi < y){ | ||
51 | + __syncthreads(); | ||
52 | + //initialize the shared memory to zero | ||
53 | + for (unsigned int i = 0; i < In; i++){ | ||
54 | + unsigned int sIdx0 = i * tThreads + ti; | ||
55 | + if (sIdx0< xwidth*ywidth) { | ||
56 | + s_vote[sIdx0] = 0; | ||
57 | + } | ||
58 | + } | ||
59 | + __syncthreads(); | ||
60 | + //for every line (along y) | ||
61 | + for(int yr = -rmax; yr <= rmax; yr++){ | ||
62 | + //compute the position of the current voter in the shared memory along the y axis. | ||
63 | + unsigned int sIdx_y1d = (sy_ry + yr)* xwidth; | ||
64 | + for(int xr = -rmax; xr <= rmax; xr++){ | ||
65 | + | ||
66 | + //find the location of the current pixel in the atan2 table | ||
67 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | ||
68 | + | ||
69 | + // calculate the angle between the voter and the current pixel in x and y directions | ||
70 | + float atan_angle = gpuTable[ind_t]; | ||
71 | + | ||
72 | + // check if the current pixel is located in the voting area of this voter. | ||
73 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | ||
74 | + //compute the position of the current voter in the 2D-shared memory along the x axis. | ||
75 | + unsigned int sIdx_x = (sx_rx + xr); | ||
76 | + //find the 1D index of this voter in the 2D-shared memory. | ||
77 | + unsigned int s_Idx = (sIdx_y1d + sIdx_x); | ||
78 | + | ||
79 | + atomicAdd(&s_vote[s_Idx], mag); | ||
80 | + } | ||
81 | + } | ||
82 | + } | ||
83 | + //write shared memory back to global memory | ||
84 | + | ||
85 | + __syncthreads(); | ||
86 | + for (unsigned int i = 0; i < In; i++){ | ||
87 | + | ||
88 | + unsigned int sIdx = i * tThreads + ti; | ||
89 | + if (sIdx>= xwidth*ywidth) return; | ||
90 | + | ||
91 | + unsigned int sy = sIdx/xwidth; | ||
92 | + unsigned int sx = sIdx - (sy * xwidth); | ||
93 | + | ||
94 | + unsigned int gx = bxs + sx; | ||
95 | + unsigned int gy = bys + sy; | ||
96 | + if (gx<x&& gy<y){ | ||
97 | + unsigned int gIdx = gy * x + gx; | ||
98 | + //write shared to global memory | ||
99 | + atomicAdd(&gpuVote[gIdx], s_vote[sIdx]); | ||
100 | + | ||
101 | + } | ||
102 | + } | ||
103 | + | ||
104 | + } | ||
105 | + } | ||
106 | + | ||
107 | + template<typename T> | ||
108 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
109 | + | ||
110 | + | ||
111 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
112 | + dim3 threads(sqrt(max_threads), sqrt(max_threads)); | ||
113 | + dim3 blocks(x/threads.x + 1 , y/threads.y+1); | ||
114 | + | ||
115 | + // specify share memory | ||
116 | + unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*sizeof(T); | ||
117 | + | ||
118 | + //call the kernel to do the voting | ||
119 | + cuda_vote <<< blocks, threads, share_bytes>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
120 | + | ||
121 | + } | ||
122 | + | ||
123 | + | ||
124 | + template<typename T> | ||
125 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
126 | + | ||
127 | + //calculate the number of bytes in the array | ||
128 | + unsigned int bytes = x * y * sizeof(T); | ||
129 | + | ||
130 | + //calculate the number of bytes in the atan2 table | ||
131 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | ||
132 | + | ||
133 | + //allocate space on the GPU for the Vote Image | ||
134 | + T* gpuVote; | ||
135 | + cudaMalloc(&gpuVote, bytes); | ||
136 | + | ||
137 | + //allocate space on the GPU for the input Gradient image | ||
138 | + T* gpuGrad; | ||
139 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | ||
140 | + | ||
141 | + //copy the Gradient Magnitude data to the GPU | ||
142 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | ||
143 | + | ||
144 | + //allocate space on the GPU for the atan2 table | ||
145 | + T* gpuTable; | ||
146 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | ||
147 | + | ||
148 | + //copy the atan2 values to the GPU | ||
149 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | ||
150 | + | ||
151 | + //call the GPU version of the vote calculation function | ||
152 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
153 | + | ||
154 | + //copy the Vote Data back to the CPU | ||
155 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | ||
156 | + | ||
157 | + //free allocated memory | ||
158 | + cudaFree(gpuTable); | ||
159 | + cudaFree(gpuVote); | ||
160 | + cudaFree(gpuGrad); | ||
161 | + } | ||
162 | + | ||
163 | + } | ||
164 | +} | ||
165 | + | ||
166 | +#endif | ||
0 | \ No newline at end of file | 167 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_VOTE_SHARED_H | ||
2 | +#define STIM_CUDA_VOTE_SHARED | ||
3 | +# include <iostream> | ||
4 | +# include <cuda.h> | ||
5 | +#include <stim/cuda/cudatools.h> | ||
6 | +#include <stim/cuda/sharedmem.cuh> | ||
7 | +#include "cpyToshare.cuh" | ||
8 | + | ||
9 | +namespace stim{ | ||
10 | + namespace cuda{ | ||
11 | + | ||
12 | + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | ||
13 | + template<typename T> | ||
14 | + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | ||
15 | + | ||
16 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | ||
17 | + extern __shared__ float s_grad[]; | ||
18 | + | ||
19 | + //calculate the start point for this block | ||
20 | + int bxi = blockIdx.x * blockDim.x; | ||
21 | + int byi = blockIdx.y * blockDim.y; | ||
22 | + // calculate the 2D coordinates for this current thread. | ||
23 | + int xi = bxi + threadIdx.x; | ||
24 | + int yi = byi + threadIdx.y; | ||
25 | + // convert 2D coordinates to 1D | ||
26 | + int i = yi * x + xi; | ||
27 | + | ||
28 | + // define a local variable to sum the votes from the voters | ||
29 | + float sum = 0; | ||
30 | + | ||
31 | + //calculate the width of the shared memory block | ||
32 | + int xwidth = 2 * rmax + blockDim.x; | ||
33 | + int ywidth = 2 * rmax + blockDim.y; | ||
34 | + // compute the size of window which will be checked for finding the proper voters for this pixel | ||
35 | + int x_table = 2*rmax +1; | ||
36 | + int rmax_sq = rmax * rmax; | ||
37 | + int tx_rmax = threadIdx.x + rmax; | ||
38 | + int bxs = bxi - rmax; | ||
39 | + int bys = byi - rmax; | ||
40 | + //compute the coordinations of this pixel in the 2D-shared memory. | ||
41 | + int sx_rx = threadIdx.x + rmax; | ||
42 | + int sy_ry = threadIdx.y + rmax; | ||
43 | + //copy the portion of the image necessary for this block to shared memory | ||
44 | + __syncthreads(); | ||
45 | + cpyG2S2D2ch<float>(s_grad, gpuGrad, bxs, bys, 2*xwidth, ywidth, threadIdx, blockDim, x, y); | ||
46 | + __syncthreads(); | ||
47 | + | ||
48 | + for(int yr = -rmax; yr <= rmax; yr++){ | ||
49 | + int yi_v = (yi + yr) ; | ||
50 | + //compute the position of the current voter in the shared memory along the y axis. | ||
51 | + unsigned int sIdx_y1d = (sy_ry + yr)* xwidth; | ||
52 | + //if (yi+yr<y && yi+yr>=0){ | ||
53 | + if(xi < x && yi < y){ | ||
54 | + | ||
55 | + for(int xr = -rmax; xr <= rmax; xr++){ | ||
56 | + | ||
57 | + //compute the position of the current voter in the 2D-shared memory along the x axis. | ||
58 | + unsigned int sIdx_x = (sx_rx + xr); | ||
59 | + //find the 1D index of this voter in the 2D-shared memory. | ||
60 | + unsigned int s_Idx = (sIdx_y1d + sIdx_x); | ||
61 | + unsigned int s_Idx2 = s_Idx * 2; | ||
62 | + | ||
63 | + //find the location of this voter in the atan2 table | ||
64 | + int id_t = (yr + rmax) * x_table + xr + rmax; | ||
65 | + | ||
66 | + // calculate the angle between the pixel and the current voter in x and y directions | ||
67 | + float atan_angle = gpuTable[id_t]; | ||
68 | + | ||
69 | + // calculate the voting direction based on the grtadient direction | ||
70 | + //int idx_share = xr + tx_rmax ; | ||
71 | + float theta = s_grad[s_Idx2]; | ||
72 | + float mag = s_grad[s_Idx2 + 1]; | ||
73 | + | ||
74 | + | ||
75 | + // check if the current voter is located in the voting area of this pixel. | ||
76 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | ||
77 | + sum += mag; | ||
78 | + | ||
79 | + } | ||
80 | + } | ||
81 | + | ||
82 | + } | ||
83 | + //} | ||
84 | + } | ||
85 | + if(xi < x && yi < y) | ||
86 | + gpuVote[i] = sum; | ||
87 | + | ||
88 | + } | ||
89 | + | ||
90 | + template<typename T> | ||
91 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
92 | + | ||
93 | + | ||
94 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
95 | + dim3 threads(sqrt(max_threads), sqrt(max_threads)); | ||
96 | + dim3 blocks(x/threads.x + 1 , y/threads.y+1); | ||
97 | + | ||
98 | + | ||
99 | + // specify share memory | ||
100 | + unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*2*sizeof(T); | ||
101 | + | ||
102 | + //call the kernel to do the voting | ||
103 | + cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
104 | + | ||
105 | + } | ||
106 | + | ||
107 | + | ||
108 | + template<typename T> | ||
109 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
110 | + | ||
111 | + //calculate the number of bytes in the array | ||
112 | + unsigned int bytes = x * y * sizeof(T); | ||
113 | + | ||
114 | + //calculate the number of bytes in the atan2 table | ||
115 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | ||
116 | + | ||
117 | + //allocate space on the GPU for the Vote Image | ||
118 | + T* gpuVote; | ||
119 | + cudaMalloc(&gpuVote, bytes); | ||
120 | + | ||
121 | + //allocate space on the GPU for the input Gradient image | ||
122 | + T* gpuGrad; | ||
123 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | ||
124 | + | ||
125 | + //copy the Gradient Magnitude data to the GPU | ||
126 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | ||
127 | + | ||
128 | + //allocate space on the GPU for the atan2 table | ||
129 | + T* gpuTable; | ||
130 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | ||
131 | + | ||
132 | + //copy the atan2 values to the GPU | ||
133 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | ||
134 | + | ||
135 | + //call the GPU version of the vote calculation function | ||
136 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
137 | + | ||
138 | + //copy the Vote Data back to the CPU | ||
139 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | ||
140 | + | ||
141 | + //free allocated memory | ||
142 | + cudaFree(gpuTable); | ||
143 | + cudaFree(gpuVote); | ||
144 | + cudaFree(gpuGrad); | ||
145 | + } | ||
146 | + | ||
147 | + } | ||
148 | +} | ||
149 | + | ||
150 | +#endif | ||
0 | \ No newline at end of file | 151 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_IVOTE_ATOMIC_H | ||
2 | +#define STIM_CUDA_IVOTE_ATOMIC_H | ||
3 | + | ||
4 | +#include <stim/cuda/ivote/down_sample.cuh> | ||
5 | +#include <stim/cuda/ivote/local_max.cuh> | ||
6 | +#include <stim/cuda/ivote/update_dir_global.cuh> | ||
7 | +//#include <stim/cuda/ivote/vote_shared_32-32.cuh> | ||
8 | +#include <stim/cuda/ivote/vote_atomic_shared.cuh> | ||
9 | +//#include <stim/cuda/ivote/re_sample.cuh> | ||
10 | +namespace stim{ | ||
11 | + namespace cuda{ | ||
12 | + | ||
13 | + } | ||
14 | +} | ||
15 | + | ||
16 | + | ||
17 | + | ||
18 | +#endif | ||
0 | \ No newline at end of file | 19 | \ No newline at end of file |
stim/cuda/sharedmem.cuh
@@ -35,10 +35,8 @@ namespace stim{ | @@ -35,10 +35,8 @@ namespace stim{ | ||
35 | } | 35 | } |
36 | } | 36 | } |
37 | 37 | ||
38 | - // Copies values from global memory to shared memory, optimizing threads | ||
39 | - template<typename T> | ||
40 | - __device__ void sharedMemcpy(T* dest, T* src, size_t N, size_t tid, size_t nt){ | ||
41 | - | 38 | + // Threaded copying of data on a CUDA device. |
39 | + __device__ void threadedMemcpy(char* dest, char* src, size_t N, size_t tid, size_t nt){ | ||
42 | size_t I = N / nt + 1; //calculate the number of iterations required to make the copy | 40 | size_t I = N / nt + 1; //calculate the number of iterations required to make the copy |
43 | size_t xi = tid; //initialize the source and destination index to the thread ID | 41 | size_t xi = tid; //initialize the source and destination index to the thread ID |
44 | for(size_t i = 0; i < I; i++){ //for each iteration | 42 | for(size_t i = 0; i < I; i++){ //for each iteration |
@@ -48,7 +46,37 @@ namespace stim{ | @@ -48,7 +46,37 @@ namespace stim{ | ||
48 | } | 46 | } |
49 | } | 47 | } |
50 | 48 | ||
51 | - | 49 | + /// Threaded copying of 2D data on a CUDA device |
50 | + /// @param dest is a linear destination array of size nx * ny | ||
51 | + /// @param src is a 2D image stored as a linear array with a pitch of X | ||
52 | + /// @param X is the number of bytes in a row of src | ||
53 | + /// @param tid is a 1D id for the current thread | ||
54 | + /// @param nt is the number of threads in the block | ||
55 | + template<typename T> | ||
56 | + __device__ void threadedMemcpy2D(T* dest, size_t nx, size_t ny, | ||
57 | + T* src, size_t x, size_t y, size_t sX, size_t sY, | ||
58 | + size_t tid, size_t nt){ | ||
59 | + | ||
60 | + size_t vals = nx * ny; //calculate the total number of bytes to be copied | ||
61 | + size_t I = vals / nt + 1; //calculate the number of iterations required to perform the copy | ||
62 | + | ||
63 | + size_t src_i, dest_i; | ||
64 | + size_t dest_x, dest_y, src_x, src_y; | ||
65 | + for(size_t i = 0; i < I; i++){ //for each iteration | ||
66 | + dest_i = i * nt + tid; //calculate the index into the destination array | ||
67 | + dest_y = dest_i / nx; | ||
68 | + dest_x = dest_i - dest_y * nx; | ||
69 | + | ||
70 | + if(dest_y < ny && dest_x < nx){ | ||
71 | + | ||
72 | + src_x = x + dest_x; | ||
73 | + src_y = y + dest_y; | ||
74 | + | ||
75 | + src_i = src_y * sX + src_x; | ||
76 | + dest[dest_i] = src[src_i]; | ||
77 | + } | ||
78 | + } | ||
79 | + } | ||
52 | } | 80 | } |
53 | } | 81 | } |
54 | 82 |
stim/cuda/templates/conv2sep.cuh
@@ -30,7 +30,8 @@ namespace stim{ | @@ -30,7 +30,8 @@ namespace stim{ | ||
30 | int byi = blockIdx.y; | 30 | int byi = blockIdx.y; |
31 | 31 | ||
32 | //copy the portion of the image necessary for this block to shared memory | 32 | //copy the portion of the image necessary for this block to shared memory |
33 | - stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); | 33 | + //stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); |
34 | + stim::cuda::sharedMemcpy_tex2D<float>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); | ||
34 | 35 | ||
35 | //calculate the thread index | 36 | //calculate the thread index |
36 | int ti = threadIdx.x; | 37 | int ti = threadIdx.x; |
@@ -88,7 +89,8 @@ namespace stim{ | @@ -88,7 +89,8 @@ namespace stim{ | ||
88 | int byi = blockIdx.y * blockDim.y; | 89 | int byi = blockIdx.y * blockDim.y; |
89 | 90 | ||
90 | //copy the portion of the image necessary for this block to shared memory | 91 | //copy the portion of the image necessary for this block to shared memory |
91 | - stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); | 92 | + //stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); |
93 | + stim::cuda::sharedMemcpy_tex2D<float>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); | ||
92 | 94 | ||
93 | //calculate the thread index | 95 | //calculate the thread index |
94 | int ti = threadIdx.y; | 96 | int ti = threadIdx.y; |
stim/image/image.h
@@ -160,11 +160,26 @@ public: | @@ -160,11 +160,26 @@ public: | ||
160 | exit(1); | 160 | exit(1); |
161 | } | 161 | } |
162 | allocate(cvImage.cols, cvImage.rows, cvImage.channels()); //allocate space for the image | 162 | allocate(cvImage.cols, cvImage.rows, cvImage.channels()); //allocate space for the image |
163 | - T* cv_ptr = (T*)cvImage.data; | 163 | + unsigned char* cv_ptr = (unsigned char*)cvImage.data; |
164 | if(C() == 1) //if this is a single-color image, just copy the data | 164 | if(C() == 1) //if this is a single-color image, just copy the data |
165 | memcpy(img, cv_ptr, bytes()); | 165 | memcpy(img, cv_ptr, bytes()); |
166 | if(C() == 3) //if this is a 3-color image, OpenCV uses BGR interleaving | 166 | if(C() == 3) //if this is a 3-color image, OpenCV uses BGR interleaving |
167 | - set_interleaved_bgr(cv_ptr, X(), Y()); | 167 | + from_opencv(cv_ptr, X(), Y()); |
168 | + } | ||
169 | + | ||
170 | + void from_opencv(unsigned char* buffer, size_t width, size_t height){ | ||
171 | + allocate(width, height, 3); | ||
172 | + T value; | ||
173 | + size_t i; | ||
174 | + for(size_t c = 0; c < C(); c++){ //copy directly | ||
175 | + for(size_t y = 0; y < Y(); y++){ | ||
176 | + for(size_t x = 0; x < X(); x++){ | ||
177 | + i = y * X() * C() + x * C() + (2-c); | ||
178 | + value = buffer[i]; | ||
179 | + img[idx(x, y, c)] = value; | ||
180 | + } | ||
181 | + } | ||
182 | + } | ||
168 | } | 183 | } |
169 | 184 | ||
170 | //save a file | 185 | //save a file |
@@ -180,23 +195,35 @@ public: | @@ -180,23 +195,35 @@ public: | ||
180 | cv::imwrite(filename, cvImage); | 195 | cv::imwrite(filename, cvImage); |
181 | } | 196 | } |
182 | 197 | ||
198 | + void set_interleaved(T* buffer, size_t width, size_t height, size_t channels){ | ||
199 | + allocate(width, height, channels); | ||
200 | + memcpy(img, buffer, bytes()); | ||
201 | + } | ||
202 | + | ||
183 | //create an image from an interleaved buffer | 203 | //create an image from an interleaved buffer |
184 | void set_interleaved_rgb(T* buffer, size_t width, size_t height){ | 204 | void set_interleaved_rgb(T* buffer, size_t width, size_t height){ |
185 | - allocate(width, height, 3); | ||
186 | - memcpy(img, buffer, bytes()); | 205 | + set_interleaved(buffer, width, height, 3); |
187 | } | 206 | } |
188 | 207 | ||
189 | void set_interleaved_bgr(T* buffer, size_t width, size_t height){ | 208 | void set_interleaved_bgr(T* buffer, size_t width, size_t height){ |
190 | allocate(width, height, 3); | 209 | allocate(width, height, 3); |
210 | + T value; | ||
211 | + size_t i; | ||
191 | for(size_t c = 0; c < C(); c++){ //copy directly | 212 | for(size_t c = 0; c < C(); c++){ //copy directly |
192 | for(size_t y = 0; y < Y(); y++){ | 213 | for(size_t y = 0; y < Y(); y++){ |
193 | for(size_t x = 0; x < X(); x++){ | 214 | for(size_t x = 0; x < X(); x++){ |
194 | - img[idx(x, y, c)] = buffer[y * X() * C() + x * C() + (2-c)]; | 215 | + i = y * X() * C() + x * C() + (2-c); |
216 | + value = buffer[i]; | ||
217 | + img[idx(x, y, c)] = value; | ||
195 | } | 218 | } |
196 | } | 219 | } |
197 | } | 220 | } |
198 | } | 221 | } |
199 | 222 | ||
223 | + void set_interleaved(T* buffer, size_t width, size_t height){ | ||
224 | + set_interleaved_rgb(buffer, width, height); | ||
225 | + } | ||
226 | + | ||
200 | void get_interleaved_bgr(T* data){ | 227 | void get_interleaved_bgr(T* data){ |
201 | 228 | ||
202 | //for each channel | 229 | //for each channel |
stim/optics/scalarfield.h
@@ -71,8 +71,8 @@ public: | @@ -71,8 +71,8 @@ public: | ||
71 | void to_cpu(){ | 71 | void to_cpu(){ |
72 | if(loc == CPUmem) return; | 72 | if(loc == CPUmem) return; |
73 | else{ | 73 | else{ |
74 | - stim::complex<T>* host_E = (stim::complex<T>*) malloc(e_bytes()); //allocate space in main memory | ||
75 | - HANDLE_ERROR( cudaMemcpy(host_E, E, e_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU | 74 | + stim::complex<T>* host_E = (stim::complex<T>*) malloc(grid_bytes()); //allocate space in main memory |
75 | + HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU | ||
76 | HANDLE_ERROR( cudaFree(E) ); //free device memory | 76 | HANDLE_ERROR( cudaFree(E) ); //free device memory |
77 | E = host_E; //swap pointers | 77 | E = host_E; //swap pointers |
78 | } | 78 | } |
1 | +#ifndef STIM_AABB2_H | ||
2 | +#define STIM_AABB2_H | ||
3 | + | ||
4 | +#include <stim/cuda/cudatools/callable.h> | ||
5 | + | ||
6 | +namespace stim{ | ||
7 | + | ||
8 | +/// Structure for a 2D axis aligned bounding box | ||
9 | +template<typename T> | ||
10 | +struct aabb2{ | ||
11 | + | ||
12 | +//protected: | ||
13 | + | ||
14 | + T low[2]; //top left corner position | ||
15 | + T high[2]; //dimensions along x and y | ||
16 | + | ||
17 | +//public: | ||
18 | + | ||
19 | + CUDA_CALLABLE aabb2(T x, T y){ //initialize an axis aligned bounding box of size 0 at the given position | ||
20 | + low[0] = high[0] = x; //set the position to the user specified coordinates | ||
21 | + low[1] = high[1] = y; | ||
22 | + } | ||
23 | + | ||
24 | + //insert a point into the bounding box, growing the box appropriately | ||
25 | + CUDA_CALLABLE void insert(T x, T y){ | ||
26 | + if(x < low[0]) low[0] = x; | ||
27 | + if(y < low[1]) low[1] = y; | ||
28 | + | ||
29 | + if(x > high[0]) high[0] = x; | ||
30 | + if(y > high[1]) high[1] = y; | ||
31 | + } | ||
32 | + | ||
33 | + //trim the bounding box so that the lower bounds are (x, y) | ||
34 | + CUDA_CALLABLE void trim_low(T x, T y){ | ||
35 | + if(low[0] < x) low[0] = x; | ||
36 | + if(low[1] < y) low[1] = y; | ||
37 | + } | ||
38 | + | ||
39 | + CUDA_CALLABLE void trim_high(T x, T y){ | ||
40 | + if(high[0] > x) high[0] = x; | ||
41 | + if(high[1] > y) high[1] = y; | ||
42 | + } | ||
43 | + | ||
44 | +}; | ||
45 | + | ||
46 | +} | ||
47 | + | ||
48 | + | ||
49 | +#endif | ||
0 | \ No newline at end of file | 50 | \ No newline at end of file |