Commit 84ca9bba937e85387fce73113f32e9409d3c2698
1 parent
93de94e6
fix some bugs in the vote, update_dir, and local_max codes
Showing
6 changed files
with
78 additions
and
43 deletions
Show diff stats
stim/cuda/ivote/down_sample.cuh
stim/cuda/ivote/local_max.cuh
... | ... | @@ -12,20 +12,48 @@ namespace stim{ |
12 | 12 | |
13 | 13 | // this kernel calculates the local maximum for finding the cell centers |
14 | 14 | template<typename T> |
15 | - __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ | |
15 | + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){ | |
16 | 16 | |
17 | 17 | // calculate the 2D coordinates for this current thread. |
18 | 18 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
19 | 19 | int yi = blockIdx.y; |
20 | + | |
21 | + if(xi >= x || yi >= y) | |
22 | + return; | |
23 | + | |
24 | + | |
20 | 25 | // convert 2D coordinates to 1D |
21 | - int i = yi * x + xi; | |
22 | - | |
23 | - | |
24 | - | |
26 | + int i = yi * x + xi; | |
27 | + | |
28 | + // START DAVID | |
29 | + | |
30 | + gpuCenters[i] = 0; //initialize the value at this location to zero | |
31 | + | |
32 | + T val = gpuVote[i]; | |
33 | + | |
34 | + //compare to the threshold | |
35 | + if(val < final_t) return; | |
36 | + | |
37 | + for(int xl = xi - conn; xl < xi + conn; xl++){ | |
38 | + for(int yl = yi - conn; yl < yi + conn; yl++){ | |
39 | + if(xl >= 0 && xl < x && yl >= 0 && yl < y){ | |
40 | + int il = yl * x + xl; | |
41 | + if(gpuVote[il] > val){ | |
42 | + return; | |
43 | + } | |
44 | + } | |
45 | + } | |
46 | + } | |
47 | + | |
48 | + gpuCenters[i] = 1; | |
49 | + | |
50 | + // END DAVID | |
51 | + /* | |
25 | 52 | //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. |
26 | 53 | int xl = xi - conn; |
27 | 54 | int yl = yi - conn; |
28 | 55 | |
56 | + | |
29 | 57 | // use zero for the lowest limits if the xi or yi is less than conn. |
30 | 58 | if (xi <= conn) |
31 | 59 | xl = 0; |
... | ... | @@ -82,6 +110,7 @@ namespace stim{ |
82 | 110 | // set the center value for this pixel to high if it's a local max ,and to low if not. |
83 | 111 | gpuCenters[i] = l_value ; |
84 | 112 | } |
113 | + */ | |
85 | 114 | |
86 | 115 | } |
87 | 116 | ... | ... |
stim/cuda/ivote/update_dir.cuh
... | ... | @@ -12,7 +12,7 @@ namespace stim{ |
12 | 12 | |
13 | 13 | // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. |
14 | 14 | template<typename T> |
15 | - __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
15 | + __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | |
16 | 16 | |
17 | 17 | //generate a pointer to shared memory (size will be specified as a kernel parameter) |
18 | 18 | extern __shared__ float s_vote[]; |
... | ... | @@ -20,12 +20,9 @@ namespace stim{ |
20 | 20 | //calculate the start point for this block |
21 | 21 | int bxi = blockIdx.x * blockDim.x; |
22 | 22 | |
23 | - //calculate the width of the shared memory block | |
24 | - int swidth = 2 * rmax + blockDim.x; | |
25 | - | |
26 | 23 | // calculate the 2D coordinates for this current thread. |
27 | 24 | int xi = bxi + threadIdx.x; |
28 | - int yi = blockIdx.y; | |
25 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
29 | 26 | |
30 | 27 | // convert 2D coordinates to 1D |
31 | 28 | int i = yi * x + xi; |
... | ... | @@ -43,14 +40,17 @@ namespace stim{ |
43 | 40 | int id_x = 0; |
44 | 41 | int id_y = 0; |
45 | 42 | |
43 | + //calculate the width of the shared memory block | |
44 | + int swidth = 2 * rmax + blockDim.x; | |
45 | + | |
46 | 46 | // compute the size of window which will be checked for finding the voting area for this voter |
47 | - unsigned int x_table = 2*rmax +1; | |
48 | - unsigned int rmax_sq = rmax * rmax; | |
49 | - int r = (int)rmax; | |
47 | + int x_table = 2*rmax +1; | |
48 | + int rmax_sq = rmax * rmax; | |
50 | 49 | int tx_rmax = threadIdx.x + rmax; |
51 | 50 | int bxs = bxi - rmax; |
52 | 51 | |
53 | - for(int yr = -r; yr <= r; yr++){ | |
52 | + | |
53 | + for(int yr = -rmax; yr <= rmax; yr++){ | |
54 | 54 | |
55 | 55 | //copy the portion of the image necessary for this block to shared memory |
56 | 56 | __syncthreads(); |
... | ... | @@ -60,7 +60,7 @@ namespace stim{ |
60 | 60 | //if the current thread is outside of the image, it doesn't have to be computed |
61 | 61 | if(xi < x && yi < y){ |
62 | 62 | |
63 | - for(int xr = -r; xr <= r; xr++){ | |
63 | + for(int xr = -rmax; xr <= rmax; xr++){ | |
64 | 64 | |
65 | 65 | unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; |
66 | 66 | |
... | ... | @@ -93,19 +93,21 @@ namespace stim{ |
93 | 93 | |
94 | 94 | float new_angle = gpuTable[ind_m]; |
95 | 95 | |
96 | - gpuDir[i] = new_angle; | |
96 | + if(xi < x && yi < y) | |
97 | + gpuDir[i] = new_angle; | |
97 | 98 | |
98 | 99 | } |
99 | 100 | |
100 | 101 | // this kernel updates the gradient direction by the calculated voting direction. |
101 | 102 | template<typename T> |
102 | - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){ | |
103 | + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ | |
103 | 104 | |
104 | 105 | //************ when the number of threads are (1024,1) ************* |
105 | 106 | |
106 | 107 | // calculate the 2D coordinates for this current thread. |
107 | 108 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
108 | - int yi = blockIdx.y; | |
109 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
110 | + | |
109 | 111 | // convert 2D coordinates to 1D |
110 | 112 | int i = yi * x + xi; |
111 | 113 | |
... | ... | @@ -126,7 +128,8 @@ namespace stim{ |
126 | 128 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
127 | 129 | dim3 threads(max_threads, 1); |
128 | 130 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); |
129 | - | |
131 | + //dim3 threads(1, 1); | |
132 | + //dim3 blocks(x, y); | |
130 | 133 | // Allocate CUDA array in device memory |
131 | 134 | |
132 | 135 | //define a channel descriptor for a single 32-bit channel | ... | ... |
stim/cuda/ivote/vote.cuh
... | ... | @@ -13,7 +13,7 @@ namespace stim{ |
13 | 13 | |
14 | 14 | // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area |
15 | 15 | template<typename T> |
16 | - __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
16 | + __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, int rmax, int x, int y){ | |
17 | 17 | |
18 | 18 | //generate a pointer to shared memory (size will be specified as a kernel parameter) |
19 | 19 | extern __shared__ float2 s_grad[]; |
... | ... | @@ -21,42 +21,41 @@ namespace stim{ |
21 | 21 | //calculate the start point for this block |
22 | 22 | int bxi = blockIdx.x * blockDim.x; |
23 | 23 | |
24 | - //calculate the width of the shared memory block | |
25 | - int swidth = 2 * rmax + blockDim.x; | |
26 | - | |
27 | 24 | // calculate the 2D coordinates for this current thread. |
28 | 25 | int xi = bxi + threadIdx.x; |
29 | - int yi = blockIdx.y; | |
26 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
30 | 27 | // convert 2D coordinates to 1D |
31 | 28 | int i = yi * x + xi; |
32 | - | |
29 | + | |
30 | + | |
33 | 31 | |
34 | 32 | // define a local variable to sum the votes from the voters |
35 | 33 | float sum = 0; |
34 | + | |
36 | 35 | |
36 | + //calculate the width of the shared memory block | |
37 | + int swidth = 2 * rmax + blockDim.x; | |
38 | + | |
37 | 39 | // compute the size of window which will be checked for finding the proper voters for this pixel |
38 | - unsigned int x_table = 2*rmax +1; | |
39 | - | |
40 | - unsigned int rmax_sq = rmax * rmax; | |
41 | - int r = (int)rmax; | |
40 | + int x_table = 2*rmax +1; | |
41 | + int rmax_sq = rmax * rmax; | |
42 | 42 | int tx_rmax = threadIdx.x + rmax; |
43 | 43 | int bxs = bxi - rmax; |
44 | 44 | |
45 | - | |
46 | - for(int yr = -r; yr <= r; yr++){ | |
45 | + //for every line (along y) | |
46 | + for(int yr = -rmax; yr <= rmax; yr++){ | |
47 | 47 | |
48 | 48 | //copy the portion of the image necessary for this block to shared memory |
49 | 49 | __syncthreads(); |
50 | 50 | stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim); |
51 | 51 | __syncthreads(); |
52 | 52 | |
53 | - //if the current thread is outside of the image, it doesn't have to be computed | |
54 | 53 | if(xi < x && yi < y){ |
55 | - | |
56 | - for(int xr = -r; xr <= r; xr++){ | |
54 | + | |
55 | + for(int xr = -rmax; xr <= rmax; xr++){ | |
57 | 56 | |
58 | 57 | //find the location of this voter in the atan2 table |
59 | - unsigned int id_t = (yr + rmax) * x_table + xr + rmax; | |
58 | + int id_t = (yr + rmax) * x_table + xr + rmax; | |
60 | 59 | |
61 | 60 | // calculate the angle between the pixel and the current voter in x and y directions |
62 | 61 | float atan_angle = gpuTable[id_t]; |
... | ... | @@ -74,10 +73,12 @@ namespace stim{ |
74 | 73 | } |
75 | 74 | |
76 | 75 | } |
76 | + | |
77 | 77 | } |
78 | 78 | } |
79 | - | |
80 | - gpuVote[i] = sum; | |
79 | + if(xi < x && yi < y) | |
80 | + gpuVote[i] = sum; | |
81 | + | |
81 | 82 | } |
82 | 83 | |
83 | 84 | template<typename T> |
... | ... | @@ -92,7 +93,9 @@ namespace stim{ |
92 | 93 | //unsigned int thread_dim = sqrt(max_threads); |
93 | 94 | dim3 threads(max_threads, 1); |
94 | 95 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); |
95 | - | |
96 | + //dim3 threads(1,1); | |
97 | + //dim3 blocks(x, y); | |
98 | + | |
96 | 99 | // Allocate CUDA array in device memory |
97 | 100 | |
98 | 101 | //define a channel descriptor for a single 32-bit channel | ... | ... |
stim/cuda/templates/gradient.cuh
... | ... | @@ -9,7 +9,7 @@ namespace stim{ |
9 | 9 | namespace cuda{ |
10 | 10 | |
11 | 11 | template<typename T> |
12 | - __global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){ | |
12 | + __global__ void gradient_2d(T* out, T* in, int x, int y){ | |
13 | 13 | |
14 | 14 | //calculate the 1D image index for this thread |
15 | 15 | int i = blockIdx.x * blockDim.x + threadIdx.x; | ... | ... |
stim/image/image.h
... | ... | @@ -48,7 +48,7 @@ public: |
48 | 48 | //create an image from an interleaved buffer |
49 | 49 | void set_interleaved(T* buffer, unsigned int width, unsigned int height, unsigned int channels = 1){ |
50 | 50 | |
51 | - unsigned char* non_interleaved = (unsigned char*)malloc(width * height * 3); | |
51 | + T* non_interleaved = (T*)malloc(width * height * 3 * sizeof(T)); | |
52 | 52 | unsigned int S = width * height; |
53 | 53 | |
54 | 54 | for(unsigned int i = 0; i < S; i++){ |
... | ... | @@ -57,7 +57,7 @@ public: |
57 | 57 | } |
58 | 58 | } |
59 | 59 | |
60 | - img = cimg_library::CImg<unsigned char>(non_interleaved, width, height, 1, channels); | |
60 | + img = cimg_library::CImg<T>(non_interleaved, width, height, 1, channels); | |
61 | 61 | } |
62 | 62 | |
63 | 63 | //fills an allocated region of memory with non-interleaved data | ... | ... |