Commit 84ca9bba937e85387fce73113f32e9409d3c2698

Authored by Laila Saadatifard
1 parent 93de94e6

Fix some bugs in the vote, update_dir, and local_max kernels

stim/cuda/ivote/down_sample.cuh
... ... @@ -25,8 +25,8 @@ namespace stim{
25 25  
26 26 if(xi< x_ds && yi< y_ds){
27 27  
28   - int x_org = xi * sigma_ds;
29   - int y_org = yi * sigma_ds;
  28 + int x_org = xi * sigma_ds ;
  29 + int y_org = yi * sigma_ds ;
30 30 int i_org = y_org * x + x_org;
31 31 gpuI[i] = gpuI0[i_org];
32 32 }
... ...
stim/cuda/ivote/local_max.cuh
... ... @@ -12,20 +12,48 @@ namespace stim{
12 12  
13 13 // this kernel calculates the local maximum for finding the cell centers
14 14 template<typename T>
15   - __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
  15 + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){
16 16  
17 17 // calculate the 2D coordinates for this current thread.
18 18 int xi = blockIdx.x * blockDim.x + threadIdx.x;
19 19 int yi = blockIdx.y;
  20 +
  21 + if(xi >= x || yi >= y)
  22 + return;
  23 +
  24 +
20 25 // convert 2D coordinates to 1D
21   - int i = yi * x + xi;
22   -
23   -
24   -
  26 + int i = yi * x + xi;
  27 +
  28 + // START DAVID
  29 +
  30 + gpuCenters[i] = 0; //initialize the value at this location to zero
  31 +
  32 + T val = gpuVote[i];
  33 +
  34 + //compare to the threshold
  35 + if(val < final_t) return;
  36 +
  37 + for(int xl = xi - conn; xl < xi + conn; xl++){
  38 + for(int yl = yi - conn; yl < yi + conn; yl++){
  39 + if(xl >= 0 && xl < x && yl >= 0 && yl < y){
  40 + int il = yl * x + xl;
  41 + if(gpuVote[il] > val){
  42 + return;
  43 + }
  44 + }
  45 + }
  46 + }
  47 +
  48 + gpuCenters[i] = 1;
  49 +
  50 + // END DAVID
  51 + /*
25 52 //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
26 53 int xl = xi - conn;
27 54 int yl = yi - conn;
28 55  
  56 +
29 57 // use zero for the lowest limits if the xi or yi is less than conn.
30 58 if (xi <= conn)
31 59 xl = 0;
... ... @@ -82,6 +110,7 @@ namespace stim{
82 110 // set the center value for this pixel to high if it's a local max ,and to low if not.
83 111 gpuCenters[i] = l_value ;
84 112 }
  113 + */
85 114  
86 115 }
87 116  
... ...
stim/cuda/ivote/update_dir.cuh
... ... @@ -12,7 +12,7 @@ namespace stim{
12 12  
13 13 // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
14 14 template<typename T>
15   - __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  15 + __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
16 16  
17 17 //generate a pointer to shared memory (size will be specified as a kernel parameter)
18 18 extern __shared__ float s_vote[];
... ... @@ -20,12 +20,9 @@ namespace stim{
20 20 //calculate the start point for this block
21 21 int bxi = blockIdx.x * blockDim.x;
22 22  
23   - //calculate the width of the shared memory block
24   - int swidth = 2 * rmax + blockDim.x;
25   -
26 23 // calculate the 2D coordinates for this current thread.
27 24 int xi = bxi + threadIdx.x;
28   - int yi = blockIdx.y;
  25 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
29 26  
30 27 // convert 2D coordinates to 1D
31 28 int i = yi * x + xi;
... ... @@ -43,14 +40,17 @@ namespace stim{
43 40 int id_x = 0;
44 41 int id_y = 0;
45 42  
  43 + //calculate the width of the shared memory block
  44 + int swidth = 2 * rmax + blockDim.x;
  45 +
46 46 // compute the size of window which will be checked for finding the voting area for this voter
47   - unsigned int x_table = 2*rmax +1;
48   - unsigned int rmax_sq = rmax * rmax;
49   - int r = (int)rmax;
  47 + int x_table = 2*rmax +1;
  48 + int rmax_sq = rmax * rmax;
50 49 int tx_rmax = threadIdx.x + rmax;
51 50 int bxs = bxi - rmax;
52 51  
53   - for(int yr = -r; yr <= r; yr++){
  52 +
  53 + for(int yr = -rmax; yr <= rmax; yr++){
54 54  
55 55 //copy the portion of the image necessary for this block to shared memory
56 56 __syncthreads();
... ... @@ -60,7 +60,7 @@ namespace stim{
60 60 //if the current thread is outside of the image, it doesn't have to be computed
61 61 if(xi < x && yi < y){
62 62  
63   - for(int xr = -r; xr <= r; xr++){
  63 + for(int xr = -rmax; xr <= rmax; xr++){
64 64  
65 65 unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
66 66  
... ... @@ -93,19 +93,21 @@ namespace stim{
93 93  
94 94 float new_angle = gpuTable[ind_m];
95 95  
96   - gpuDir[i] = new_angle;
  96 + if(xi < x && yi < y)
  97 + gpuDir[i] = new_angle;
97 98  
98 99 }
99 100  
100 101 // this kernel updates the gradient direction by the calculated voting direction.
101 102 template<typename T>
102   - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){
  103 + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
103 104  
104 105 //************ when the number of threads are (1024,1) *************
105 106  
106 107 // calculate the 2D coordinates for this current thread.
107 108 int xi = blockIdx.x * blockDim.x + threadIdx.x;
108   - int yi = blockIdx.y;
  109 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  110 +
109 111 // convert 2D coordinates to 1D
110 112 int i = yi * x + xi;
111 113  
... ... @@ -126,7 +128,8 @@ namespace stim{
126 128 unsigned int max_threads = stim::maxThreadsPerBlock();
127 129 dim3 threads(max_threads, 1);
128 130 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
129   -
  131 + //dim3 threads(1, 1);
  132 + //dim3 blocks(x, y);
130 133 // Allocate CUDA array in device memory
131 134  
132 135 //define a channel descriptor for a single 32-bit channel
... ...
stim/cuda/ivote/vote.cuh
... ... @@ -13,7 +13,7 @@ namespace stim{
13 13  
14 14 // this kernel calculates the vote value for a pixel by adding up the gradient magnitudes of every voter whose voting area contains this pixel
15 15 template<typename T>
16   - __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  16 + __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, int rmax, int x, int y){
17 17  
18 18 //generate a pointer to shared memory (size will be specified as a kernel parameter)
19 19 extern __shared__ float2 s_grad[];
... ... @@ -21,42 +21,41 @@ namespace stim{
21 21 //calculate the start point for this block
22 22 int bxi = blockIdx.x * blockDim.x;
23 23  
24   - //calculate the width of the shared memory block
25   - int swidth = 2 * rmax + blockDim.x;
26   -
27 24 // calculate the 2D coordinates for this current thread.
28 25 int xi = bxi + threadIdx.x;
29   - int yi = blockIdx.y;
  26 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
30 27 // convert 2D coordinates to 1D
31 28 int i = yi * x + xi;
32   -
  29 +
  30 +
33 31  
34 32 // define a local variable to sum the votes from the voters
35 33 float sum = 0;
  34 +
36 35  
  36 + //calculate the width of the shared memory block
  37 + int swidth = 2 * rmax + blockDim.x;
  38 +
37 39 // compute the size of window which will be checked for finding the proper voters for this pixel
38   - unsigned int x_table = 2*rmax +1;
39   -
40   - unsigned int rmax_sq = rmax * rmax;
41   - int r = (int)rmax;
  40 + int x_table = 2*rmax +1;
  41 + int rmax_sq = rmax * rmax;
42 42 int tx_rmax = threadIdx.x + rmax;
43 43 int bxs = bxi - rmax;
44 44  
45   -
46   - for(int yr = -r; yr <= r; yr++){
  45 + //for every line (along y)
  46 + for(int yr = -rmax; yr <= rmax; yr++){
47 47  
48 48 //copy the portion of the image necessary for this block to shared memory
49 49 __syncthreads();
50 50 stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
51 51 __syncthreads();
52 52  
53   - //if the current thread is outside of the image, it doesn't have to be computed
54 53 if(xi < x && yi < y){
55   -
56   - for(int xr = -r; xr <= r; xr++){
  54 +
  55 + for(int xr = -rmax; xr <= rmax; xr++){
57 56  
58 57 //find the location of this voter in the atan2 table
59   - unsigned int id_t = (yr + rmax) * x_table + xr + rmax;
  58 + int id_t = (yr + rmax) * x_table + xr + rmax;
60 59  
61 60 // calculate the angle between the pixel and the current voter in x and y directions
62 61 float atan_angle = gpuTable[id_t];
... ... @@ -74,10 +73,12 @@ namespace stim{
74 73 }
75 74  
76 75 }
  76 +
77 77 }
78 78 }
79   -
80   - gpuVote[i] = sum;
  79 + if(xi < x && yi < y)
  80 + gpuVote[i] = sum;
  81 +
81 82 }
82 83  
83 84 template<typename T>
... ... @@ -92,7 +93,9 @@ namespace stim{
92 93 //unsigned int thread_dim = sqrt(max_threads);
93 94 dim3 threads(max_threads, 1);
94 95 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
95   -
  96 + //dim3 threads(1,1);
  97 + //dim3 blocks(x, y);
  98 +
96 99 // Allocate CUDA array in device memory
97 100  
98 101 //define a channel descriptor for a single 32-bit channel
... ...
stim/cuda/templates/gradient.cuh
... ... @@ -9,7 +9,7 @@ namespace stim{
9 9 namespace cuda{
10 10  
11 11 template<typename T>
12   - __global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){
  12 + __global__ void gradient_2d(T* out, T* in, int x, int y){
13 13  
14 14 //calculate the 1D image index for this thread
15 15 int i = blockIdx.x * blockDim.x + threadIdx.x;
... ...
stim/image/image.h
... ... @@ -48,7 +48,7 @@ public:
48 48 //create an image from an interleaved buffer
49 49 void set_interleaved(T* buffer, unsigned int width, unsigned int height, unsigned int channels = 1){
50 50  
51   - unsigned char* non_interleaved = (unsigned char*)malloc(width * height * 3);
  51 + T* non_interleaved = (T*)malloc(width * height * 3 * sizeof(T));
52 52 unsigned int S = width * height;
53 53  
54 54 for(unsigned int i = 0; i < S; i++){
... ... @@ -57,7 +57,7 @@ public:
57 57 }
58 58 }
59 59  
60   - img = cimg_library::CImg<unsigned char>(non_interleaved, width, height, 1, channels);
  60 + img = cimg_library::CImg<T>(non_interleaved, width, height, 1, channels);
61 61 }
62 62  
63 63 //fills an allocated region of memory with non-interleaved data
... ...