Commit 84ca9bba937e85387fce73113f32e9409d3c2698

Authored by Laila Saadatifard
1 parent 93de94e6

fix some bugs in the vote, update_dir, and local_max codes

stim/cuda/ivote/down_sample.cuh
@@ -25,8 +25,8 @@ namespace stim{ @@ -25,8 +25,8 @@ namespace stim{
25 25
26 if(xi< x_ds && yi< y_ds){ 26 if(xi< x_ds && yi< y_ds){
27 27
28 - int x_org = xi * sigma_ds;  
29 - int y_org = yi * sigma_ds; 28 + int x_org = xi * sigma_ds ;
  29 + int y_org = yi * sigma_ds ;
30 int i_org = y_org * x + x_org; 30 int i_org = y_org * x + x_org;
31 gpuI[i] = gpuI0[i_org]; 31 gpuI[i] = gpuI0[i_org];
32 } 32 }
stim/cuda/ivote/local_max.cuh
@@ -12,20 +12,48 @@ namespace stim{ @@ -12,20 +12,48 @@ namespace stim{
12 12
13 // this kernel calculates the local maximum for finding the cell centers 13 // this kernel calculates the local maximum for finding the cell centers
14 template<typename T> 14 template<typename T>
15 - __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ 15 + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){
16 16
17 // calculate the 2D coordinates for this current thread. 17 // calculate the 2D coordinates for this current thread.
18 int xi = blockIdx.x * blockDim.x + threadIdx.x; 18 int xi = blockIdx.x * blockDim.x + threadIdx.x;
19 int yi = blockIdx.y; 19 int yi = blockIdx.y;
  20 +
  21 + if(xi >= x || yi >= y)
  22 + return;
  23 +
  24 +
20 // convert 2D coordinates to 1D 25 // convert 2D coordinates to 1D
21 - int i = yi * x + xi;  
22 -  
23 -  
24 - 26 + int i = yi * x + xi;
  27 +
  28 + // START DAVID
  29 +
  30 + gpuCenters[i] = 0; //initialize the value at this location to zero
  31 +
  32 + T val = gpuVote[i];
  33 +
  34 + //compare to the threshold
  35 + if(val < final_t) return;
  36 +
  37 + for(int xl = xi - conn; xl < xi + conn; xl++){
  38 + for(int yl = yi - conn; yl < yi + conn; yl++){
  39 + if(xl >= 0 && xl < x && yl >= 0 && yl < y){
  40 + int il = yl * x + xl;
  41 + if(gpuVote[il] > val){
  42 + return;
  43 + }
  44 + }
  45 + }
  46 + }
  47 +
  48 + gpuCenters[i] = 1;
  49 +
  50 + // END DAVID
  51 + /*
25 //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. 52 //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
26 int xl = xi - conn; 53 int xl = xi - conn;
27 int yl = yi - conn; 54 int yl = yi - conn;
28 55
  56 +
29 // use zero for the lowest limits if the xi or yi is less than conn. 57 // use zero for the lowest limits if the xi or yi is less than conn.
30 if (xi <= conn) 58 if (xi <= conn)
31 xl = 0; 59 xl = 0;
@@ -82,6 +110,7 @@ namespace stim{ @@ -82,6 +110,7 @@ namespace stim{
82 // set the center value for this pixel to high if it's a local max ,and to low if not. 110 // set the center value for this pixel to high if it's a local max ,and to low if not.
83 gpuCenters[i] = l_value ; 111 gpuCenters[i] = l_value ;
84 } 112 }
  113 + */
85 114
86 } 115 }
87 116
stim/cuda/ivote/update_dir.cuh
@@ -12,7 +12,7 @@ namespace stim{ @@ -12,7 +12,7 @@ namespace stim{
12 12
13 // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. 13 // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
14 template<typename T> 14 template<typename T>
15 - __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ 15 + __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
16 16
17 //generate a pointer to shared memory (size will be specified as a kernel parameter) 17 //generate a pointer to shared memory (size will be specified as a kernel parameter)
18 extern __shared__ float s_vote[]; 18 extern __shared__ float s_vote[];
@@ -20,12 +20,9 @@ namespace stim{ @@ -20,12 +20,9 @@ namespace stim{
20 //calculate the start point for this block 20 //calculate the start point for this block
21 int bxi = blockIdx.x * blockDim.x; 21 int bxi = blockIdx.x * blockDim.x;
22 22
23 - //calculate the width of the shared memory block  
24 - int swidth = 2 * rmax + blockDim.x;  
25 -  
26 // calculate the 2D coordinates for this current thread. 23 // calculate the 2D coordinates for this current thread.
27 int xi = bxi + threadIdx.x; 24 int xi = bxi + threadIdx.x;
28 - int yi = blockIdx.y; 25 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
29 26
30 // convert 2D coordinates to 1D 27 // convert 2D coordinates to 1D
31 int i = yi * x + xi; 28 int i = yi * x + xi;
@@ -43,14 +40,17 @@ namespace stim{ @@ -43,14 +40,17 @@ namespace stim{
43 int id_x = 0; 40 int id_x = 0;
44 int id_y = 0; 41 int id_y = 0;
45 42
  43 + //calculate the width of the shared memory block
  44 + int swidth = 2 * rmax + blockDim.x;
  45 +
46 // compute the size of window which will be checked for finding the voting area for this voter 46 // compute the size of window which will be checked for finding the voting area for this voter
47 - unsigned int x_table = 2*rmax +1;  
48 - unsigned int rmax_sq = rmax * rmax;  
49 - int r = (int)rmax; 47 + int x_table = 2*rmax +1;
  48 + int rmax_sq = rmax * rmax;
50 int tx_rmax = threadIdx.x + rmax; 49 int tx_rmax = threadIdx.x + rmax;
51 int bxs = bxi - rmax; 50 int bxs = bxi - rmax;
52 51
53 - for(int yr = -r; yr <= r; yr++){ 52 +
  53 + for(int yr = -rmax; yr <= rmax; yr++){
54 54
55 //copy the portion of the image necessary for this block to shared memory 55 //copy the portion of the image necessary for this block to shared memory
56 __syncthreads(); 56 __syncthreads();
@@ -60,7 +60,7 @@ namespace stim{ @@ -60,7 +60,7 @@ namespace stim{
60 //if the current thread is outside of the image, it doesn't have to be computed 60 //if the current thread is outside of the image, it doesn't have to be computed
61 if(xi < x && yi < y){ 61 if(xi < x && yi < y){
62 62
63 - for(int xr = -r; xr <= r; xr++){ 63 + for(int xr = -rmax; xr <= rmax; xr++){
64 64
65 unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; 65 unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
66 66
@@ -93,19 +93,21 @@ namespace stim{ @@ -93,19 +93,21 @@ namespace stim{
93 93
94 float new_angle = gpuTable[ind_m]; 94 float new_angle = gpuTable[ind_m];
95 95
96 - gpuDir[i] = new_angle; 96 + if(xi < x && yi < y)
  97 + gpuDir[i] = new_angle;
97 98
98 } 99 }
99 100
100 // this kernel updates the gradient direction by the calculated voting direction. 101 // this kernel updates the gradient direction by the calculated voting direction.
101 template<typename T> 102 template<typename T>
102 - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){ 103 + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
103 104
104 //************ when the number of threads are (1024,1) ************* 105 //************ when the number of threads are (1024,1) *************
105 106
106 // calculate the 2D coordinates for this current thread. 107 // calculate the 2D coordinates for this current thread.
107 int xi = blockIdx.x * blockDim.x + threadIdx.x; 108 int xi = blockIdx.x * blockDim.x + threadIdx.x;
108 - int yi = blockIdx.y; 109 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  110 +
109 // convert 2D coordinates to 1D 111 // convert 2D coordinates to 1D
110 int i = yi * x + xi; 112 int i = yi * x + xi;
111 113
@@ -126,7 +128,8 @@ namespace stim{ @@ -126,7 +128,8 @@ namespace stim{
126 unsigned int max_threads = stim::maxThreadsPerBlock(); 128 unsigned int max_threads = stim::maxThreadsPerBlock();
127 dim3 threads(max_threads, 1); 129 dim3 threads(max_threads, 1);
128 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); 130 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
129 - 131 + //dim3 threads(1, 1);
  132 + //dim3 blocks(x, y);
130 // Allocate CUDA array in device memory 133 // Allocate CUDA array in device memory
131 134
132 //define a channel descriptor for a single 32-bit channel 135 //define a channel descriptor for a single 32-bit channel
stim/cuda/ivote/vote.cuh
@@ -13,7 +13,7 @@ namespace stim{ @@ -13,7 +13,7 @@ namespace stim{
13 13
14 // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area 14 // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
15 template<typename T> 15 template<typename T>
16 - __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ 16 + __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, int rmax, int x, int y){
17 17
18 //generate a pointer to shared memory (size will be specified as a kernel parameter) 18 //generate a pointer to shared memory (size will be specified as a kernel parameter)
19 extern __shared__ float2 s_grad[]; 19 extern __shared__ float2 s_grad[];
@@ -21,42 +21,41 @@ namespace stim{ @@ -21,42 +21,41 @@ namespace stim{
21 //calculate the start point for this block 21 //calculate the start point for this block
22 int bxi = blockIdx.x * blockDim.x; 22 int bxi = blockIdx.x * blockDim.x;
23 23
24 - //calculate the width of the shared memory block  
25 - int swidth = 2 * rmax + blockDim.x;  
26 -  
27 // calculate the 2D coordinates for this current thread. 24 // calculate the 2D coordinates for this current thread.
28 int xi = bxi + threadIdx.x; 25 int xi = bxi + threadIdx.x;
29 - int yi = blockIdx.y; 26 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
30 // convert 2D coordinates to 1D 27 // convert 2D coordinates to 1D
31 int i = yi * x + xi; 28 int i = yi * x + xi;
32 - 29 +
  30 +
33 31
34 // define a local variable to sum the votes from the voters 32 // define a local variable to sum the votes from the voters
35 float sum = 0; 33 float sum = 0;
  34 +
36 35
  36 + //calculate the width of the shared memory block
  37 + int swidth = 2 * rmax + blockDim.x;
  38 +
37 // compute the size of window which will be checked for finding the proper voters for this pixel 39 // compute the size of window which will be checked for finding the proper voters for this pixel
38 - unsigned int x_table = 2*rmax +1;  
39 -  
40 - unsigned int rmax_sq = rmax * rmax;  
41 - int r = (int)rmax; 40 + int x_table = 2*rmax +1;
  41 + int rmax_sq = rmax * rmax;
42 int tx_rmax = threadIdx.x + rmax; 42 int tx_rmax = threadIdx.x + rmax;
43 int bxs = bxi - rmax; 43 int bxs = bxi - rmax;
44 44
45 -  
46 - for(int yr = -r; yr <= r; yr++){ 45 + //for every line (along y)
  46 + for(int yr = -rmax; yr <= rmax; yr++){
47 47
48 //copy the portion of the image necessary for this block to shared memory 48 //copy the portion of the image necessary for this block to shared memory
49 __syncthreads(); 49 __syncthreads();
50 stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim); 50 stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
51 __syncthreads(); 51 __syncthreads();
52 52
53 - //if the current thread is outside of the image, it doesn't have to be computed  
54 if(xi < x && yi < y){ 53 if(xi < x && yi < y){
55 -  
56 - for(int xr = -r; xr <= r; xr++){ 54 +
  55 + for(int xr = -rmax; xr <= rmax; xr++){
57 56
58 //find the location of this voter in the atan2 table 57 //find the location of this voter in the atan2 table
59 - unsigned int id_t = (yr + rmax) * x_table + xr + rmax; 58 + int id_t = (yr + rmax) * x_table + xr + rmax;
60 59
61 // calculate the angle between the pixel and the current voter in x and y directions 60 // calculate the angle between the pixel and the current voter in x and y directions
62 float atan_angle = gpuTable[id_t]; 61 float atan_angle = gpuTable[id_t];
@@ -74,10 +73,12 @@ namespace stim{ @@ -74,10 +73,12 @@ namespace stim{
74 } 73 }
75 74
76 } 75 }
  76 +
77 } 77 }
78 } 78 }
79 -  
80 - gpuVote[i] = sum; 79 + if(xi < x && yi < y)
  80 + gpuVote[i] = sum;
  81 +
81 } 82 }
82 83
83 template<typename T> 84 template<typename T>
@@ -92,7 +93,9 @@ namespace stim{ @@ -92,7 +93,9 @@ namespace stim{
92 //unsigned int thread_dim = sqrt(max_threads); 93 //unsigned int thread_dim = sqrt(max_threads);
93 dim3 threads(max_threads, 1); 94 dim3 threads(max_threads, 1);
94 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); 95 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
95 - 96 + //dim3 threads(1,1);
  97 + //dim3 blocks(x, y);
  98 +
96 // Allocate CUDA array in device memory 99 // Allocate CUDA array in device memory
97 100
98 //define a channel descriptor for a single 32-bit channel 101 //define a channel descriptor for a single 32-bit channel
stim/cuda/templates/gradient.cuh
@@ -9,7 +9,7 @@ namespace stim{ @@ -9,7 +9,7 @@ namespace stim{
9 namespace cuda{ 9 namespace cuda{
10 10
11 template<typename T> 11 template<typename T>
12 - __global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){ 12 + __global__ void gradient_2d(T* out, T* in, int x, int y){
13 13
14 //calculate the 1D image index for this thread 14 //calculate the 1D image index for this thread
15 int i = blockIdx.x * blockDim.x + threadIdx.x; 15 int i = blockIdx.x * blockDim.x + threadIdx.x;
stim/image/image.h
@@ -48,7 +48,7 @@ public: @@ -48,7 +48,7 @@ public:
48 //create an image from an interleaved buffer 48 //create an image from an interleaved buffer
49 void set_interleaved(T* buffer, unsigned int width, unsigned int height, unsigned int channels = 1){ 49 void set_interleaved(T* buffer, unsigned int width, unsigned int height, unsigned int channels = 1){
50 50
51 - unsigned char* non_interleaved = (unsigned char*)malloc(width * height * 3); 51 + T* non_interleaved = (T*)malloc(width * height * 3 * sizeof(T));
52 unsigned int S = width * height; 52 unsigned int S = width * height;
53 53
54 for(unsigned int i = 0; i < S; i++){ 54 for(unsigned int i = 0; i < S; i++){
@@ -57,7 +57,7 @@ public: @@ -57,7 +57,7 @@ public:
57 } 57 }
58 } 58 }
59 59
60 - img = cimg_library::CImg<unsigned char>(non_interleaved, width, height, 1, channels); 60 + img = cimg_library::CImg<T>(non_interleaved, width, height, 1, channels);
61 } 61 }
62 62
63 //fills an allocated region of memory with non-interleaved data 63 //fills an allocated region of memory with non-interleaved data