Merged
Merge Request #4 · created by Laila Saadatifard


Ivote_bounding_box

I found out the ivote directory has useless functions, so I removed them, and also I had not push the last changes for vote kernel that used bounding box.


From ivote_bb into master

Merged by David Mayerich

2 participants

stim/cuda/ivote/local_max.cuh
... ... @@ -14,7 +14,7 @@ namespace stim{
14 14  
15 15 // calculate the 2D coordinates for this current thread.
16 16 int xi = blockIdx.x * blockDim.x + threadIdx.x;
17   - int yi = blockIdx.y;
  17 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
18 18  
19 19 if(xi >= x || yi >= y)
20 20 return;
... ... @@ -63,8 +63,10 @@ namespace stim{
63 63 void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
64 64  
65 65 unsigned int max_threads = stim::maxThreadsPerBlock();
66   - dim3 threads(max_threads, 1);
67   - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  66 + /*dim3 threads(max_threads, 1);
  67 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/
  68 + dim3 threads( sqrt(max_threads), sqrt(max_threads) );
  69 + dim3 blocks(x/threads.x + 1, y/threads.y + 1);
68 70  
69 71 //call the kernel to find the local maximum.
70 72 cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
... ...
stim/cuda/ivote/re_sample.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_RE_SAMPLE_H
  2 +#define STIM_CUDA_RE_SAMPLE_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/templates/gaussian_blur.cuh>
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + template<typename T>
  13 + __global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
  14 +
  15 + unsigned int sigma_ds = 1/resize;
  16 + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  17 + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  18 +
  19 +
  20 + // calculate the 2D coordinates for this current thread.
  21 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  22 + int yi = blockIdx.y;
  23 + // convert 2D coordinates to 1D
  24 + int i = yi * x + xi;
  25 +
  26 + if(xi< x && yi< y){
  27 + if(xi%sigma_ds==0){
  28 + if(yi%sigma_ds==0){
  29 + gpuI[i] = gpuI0[(yi/sigma_ds)*x_ds + xi/sigma_ds];
  30 + }
  31 + }
  32 + else gpuI[i] = 0;
  33 +
  34 + //int x_org = xi * sigma_ds ;
  35 + //int y_org = yi * sigma_ds ;
  36 + //int i_org = y_org * x + x_org;
  37 + //gpuI[i] = gpuI0[i_org];
  38 + }
  39 +
  40 + }
  41 +
  42 +
  43 + /// Applies a Gaussian blur to a 2D image stored on the GPU
  44 + template<typename T>
  45 + void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
  46 +
  47 +
  48 + //unsigned int sigma_ds = 1/resize;
  49 + //unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  50 + //unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  51 +
  52 + //get the number of pixels in the image
  53 + //unsigned int pixels_ds = x_ds * y_ds;
  54 +
  55 + unsigned int max_threads = stim::maxThreadsPerBlock();
  56 + dim3 threads(max_threads, 1);
  57 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  58 +
  59 + //stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y);
  60 +
  61 + //resample the image
  62 + cuda_re_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
  63 +
  64 + }
  65 +
  66 + /// Applies a Gaussian blur to a 2D image stored on the CPU
  67 + template<typename T>
  68 + void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){
  69 +
  70 + //get the number of pixels in the image
  71 + unsigned int pixels = x*y;
  72 + unsigned int bytes = sizeof(T) * pixels;
  73 +
  74 + unsigned int sigma_ds = 1/resize;
  75 + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  76 + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  77 + unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
  78 +
  79 +
  80 +
  81 + //allocate space on the GPU for the original image
  82 + T* gpuI0;
  83 + cudaMalloc(&gpuI0, bytes_ds);
  84 +
  85 +
  86 + //copy the image data to the GPU
  87 + cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice);
  88 +
  89 + //allocate space on the GPU for the down sampled image
  90 + T* gpuI;
  91 + cudaMalloc(&gpuI, bytes);
  92 +
  93 + //run the GPU-based version of the algorithm
  94 + gpu_re_sample<T>(gpuI, gpuI0, resize, x, y);
  95 +
  96 + //copy the image data to the GPU
  97 + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
  98 +
  99 + cudaFree(gpuI0);
  100 + cudeFree(gpuI);
  101 + }
  102 +
  103 + }
  104 +}
  105 +
  106 +#endif
0 107 \ No newline at end of file
... ...
stim/cuda/ivote/update_dir_global.cuh renamed to stim/cuda/ivote/update_dir_bb.cuh
1   -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
2   -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
  1 +#ifndef STIM_CUDA_UPDATE_DIR_BB_H
  2 +#define STIM_CUDA_UPDATE_DIR_BB_H
3 3  
4 4 # include <iostream>
5 5 # include <cuda.h>
... ... @@ -7,8 +7,7 @@
7 7 #include <stim/cuda/sharedmem.cuh>
8 8 #include <stim/visualization/aabb2.h>
9 9 #include <stim/visualization/colormap.h>
10   -#include <math.h>
11   -#include "cpyToshare.cuh"
  10 +#include <math.h>
12 11  
13 12 //#define RMAX_TEST 8
14 13  
... ... @@ -76,68 +75,6 @@ namespace stim{
76 75 gpuDir[i] = atan2((T)max_dy, (T)max_dx);
77 76 }
78 77  
79   - // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
80   - template<typename T>
81   - __global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
82   -
83   -
84   - // calculate the 2D coordinates for this current thread.
85   - int xi = blockIdx.x * blockDim.x + threadIdx.x;
86   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
87   -
88   - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel
89   -
90   - int i = yi * x + xi; // convert 2D coordinates to 1D
91   -
92   - float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch
93   - gpuDir[i] = 0; //initialize the vote direction to zero
94   - float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter
95   - int id_x = 0; // define two local variables for the x and y position of the maximum
96   - int id_y = 0;
97   -
98   - int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter
99   - int rmax_sq = rmax * rmax;
100   - int tx_rmax = threadIdx.x + rmax;
101   - float atan_angle;
102   - float vote_c;
103   - int xidx, yidx, yr_sq, xr_sq;
104   - for(int yr = -rmax; yr <= rmax; yr++){
105   - yidx = yi + yr; //compute the index into the image
106   - if (yidx >= 0 && yidx < y){ //if the current y-index is inside the image
107   - yr_sq = yr * yr; //compute the square of yr, to save time later
108   - for(int xr = -rmax; xr <= rmax; xr++){
109   - xidx = xi + xr;
110   - if(xidx >= 0 && xidx < x){
111   - xr_sq = xr * xr;
112   - unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
113   -
114   - // calculate the angle between the voter and the current pixel in x and y directions
115   - atan_angle = gpuTable[ind_t];
116   - //atan_angle = atan2((T)yr, (T)xr);
117   -
118   - // check if the current pixel is located in the voting area of this voter.
119   - if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) <phi)){
120   -
121   - vote_c = gpuVote[yidx * x + xidx]; // find the vote value for the current counter
122   - // compare the vote value of this pixel with the max value to find the maxima and its index.
123   - if (vote_c>max) {
124   -
125   - max = vote_c;
126   - id_x = xr;
127   - id_y = yr;
128   - }
129   - }
130   - }
131   - }
132   - }
133   - }
134   -
135   - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
136   - float new_angle = gpuTable[ind_m];
137   -
138   - if(xi < x && yi < y)
139   - gpuDir[i] = new_angle;
140   - } //end kernel
141 78  
142 79  
143 80 // this kernel updates the gradient direction by the calculated voting direction.
... ... @@ -168,9 +105,7 @@ namespace stim{
168 105 HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );
169 106  
170 107 unsigned int max_threads = stim::maxThreadsPerBlock();
171   - //dim3 threads(min(x, max_threads), 1);
172   - //dim3 blocks(x/threads.x, y);
173   -
  108 +
174 109 dim3 threads( sqrt(max_threads), sqrt(max_threads) );
175 110 dim3 blocks(x/threads.x + 1, y/threads.y + 1);
176 111  
... ... @@ -188,12 +123,12 @@ namespace stim{
188 123  
189 124 //call the kernel to calculate the new voting direction
190 125 cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
191   - stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
  126 + //stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
192 127  
193 128 //exit(0);
194 129  
195   - threads = dim3( sqrt(max_threads), sqrt(max_threads) );
196   - blocks = dim3(x/threads.x + 1, y/threads.y + 1);
  130 + //threads = dim3( sqrt(max_threads), sqrt(max_threads) );
  131 + //blocks = dim3(x/threads.x + 1, y/threads.y + 1);
197 132  
198 133 //call the kernel to update the gradient direction
199 134 cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
... ...
stim/cuda/ivote/david_update_dir_global.cuh renamed to stim/cuda/ivote/update_dir_threshold_global.cuh
1   -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
2   -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
  1 +#ifndef STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBALD_H
  2 +#define STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H
3 3  
4 4 # include <iostream>
5 5 # include <cuda.h>
6 6 #include <stim/cuda/cudatools.h>
7 7 #include <stim/cuda/sharedmem.cuh>
8   -#include <math.h>
9   -#include "cpyToshare.cuh"
10   -
11   -#define RMAX_TEST 8
  8 +#include "cpyToshare.cuh"
12 9  
13 10 namespace stim{
14 11 namespace cuda{
15 12  
16 13 // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
17 14 template<typename T>
18   - __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
19   - extern __shared__ T atan2_table[];
20   -
21   - //calculate the start point for this block
22   - //int bxi = blockIdx.x * blockDim.x;
23   -
24   - stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x);
  15 + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){
25 16  
26   - __syncthreads();
27 17  
28   - // calculate the 2D coordinates for this current thread.
29   - //int xi = bxi + threadIdx.x;
  18 +
  19 + // calculate the coordinate for this current thread.
30 20 int xi = blockIdx.x * blockDim.x + threadIdx.x;
31   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
32   - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel
33   -
34   - int i = yi * x + xi; // convert 2D coordinates to 1D
  21 + // calculate the voting direction based on the grtadient direction
  22 + float theta = gpuTh[3*xi];
35 23  
36   - float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch
37   - gpuDir[i] = 0; //initialize the vote direction to zero
38   - float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter
39   - int id_x = 0; // define two local variables for the x and y position of the maximum
40   - int id_y = 0;
  24 + //calculate the position and x, y coordinations of this voter in the original image
  25 + unsigned int i_v = gpuTh[3*xi+2];
  26 + unsigned int y_v = i_v/x;
  27 + unsigned int x_v = i_v - (y_v*x);
41 28  
42   - int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter
  29 + //initialize the vote direction to zero
  30 + gpuDir[xi] = 0;
  31 +
  32 + // define a local variable to maximum value of the vote image in the voting area for this voter
  33 + float max = 0;
  34 +
  35 + // define two local variables for the x and y coordinations where the maximum happened
  36 + int id_x = 0;
  37 + int id_y = 0;
  38 +
  39 + // compute the size of window which will be checked for finding the voting area for this voter
  40 + int x_table = 2*rmax +1;
43 41 int rmax_sq = rmax * rmax;
44 42 int tx_rmax = threadIdx.x + rmax;
45   - float atan_angle;
46   - float vote_c;
47   - unsigned int ind_t;
48   - for(int yr = -rmax; yr <= rmax; yr++){ //for each counter in the y direction
49   - if (yi+yr >= 0 && yi + yr < y){ //if the counter exists (we aren't looking outside of the image)
50   - for(int xr = -rmax; xr <= rmax; xr++){ //for each counter in the x direction
51   - if((xr * xr + yr *yr)< rmax_sq){ //if the counter is within range of the voter
52   -
53   - ind_t = (rmax - yr) * x_table + rmax - xr; //calculate the index to the atan2 table
54   - atan_angle = atan2_table[ind_t]; //retrieve the direction vector from the table
55   -
56   - //atan_angle = atan2((float)yr, (float)xr);
57   -
58   - if (abs(atan_angle - theta) <phi){ // check if the current pixel is located in the voting angle of this voter.
59   - vote_c = gpuVote[(yi+yr)*x + (xi+xr)]; // find the vote value for the current counter
60   - if(vote_c>max) { // compare the vote value of this pixel with the max value to find the maxima and its index.
61   - max = vote_c;
62   - id_x = xr;
63   - id_y = yr;
64   - }
  43 + if(xi < th_size){
  44 +
  45 + for(int yr = -rmax; yr <= rmax; yr++){
  46 +
  47 + for(int xr = -rmax; xr <= rmax; xr++){
  48 +
  49 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  50 +
  51 + // find the angle between the voter and the current pixel in x and y directions
  52 + float atan_angle = gpuTable[ind_t];
  53 +
  54 + // check if the current pixel is located in the voting area of this voter.
  55 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  56 + // find the vote value for the current counter
  57 + float vote_c = gpuVote[(y_v+yr)*x + (x_v+xr)];
  58 + // compare the vote value of this pixel with the max value to find the maxima and its index.
  59 + if (vote_c>max) {
  60 +
  61 + max = vote_c;
  62 + id_x = xr;
  63 + id_y = yr;
65 64 }
66 65 }
67 66 }
68 67 }
69   - }
  68 +
70 69  
71   - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
72   - float new_angle = gpuTable[ind_m];
  70 + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
  71 + float new_angle = gpuTable[ind_m];
  72 + gpuDir[xi] = new_angle;
  73 + }
73 74  
74   - if(xi < x && yi < y)
75   - gpuDir[i] = new_angle;
76   - } //end kernel
  75 + }
77 76  
78 77 // this kernel updates the gradient direction by the calculated voting direction.
79 78 template<typename T>
80   - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
  79 + __global__ void cuda_update_grad(T* gpuTh, T* gpuDir, int th_size, int x, int y){
81 80  
82   - // calculate the 2D coordinates for this current thread.
  81 + // calculate the coordinate for this current thread.
83 82 int xi = blockIdx.x * blockDim.x + threadIdx.x;
84   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
85   -
86   - // convert 2D coordinates to 1D
87   - int i = yi * x + xi;
88 83  
  84 +
89 85 //update the gradient image with the vote direction
90   - gpuGrad[2*i] = gpuDir[i];
  86 + gpuTh[3*xi] = gpuDir[xi];
91 87 }
92 88  
93 89 template<typename T>
94   - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
95   -
96   -
  90 + void gpu_update_dir(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
97 91  
98 92 //calculate the number of bytes in the array
99   - unsigned int bytes = x * y * sizeof(T);
  93 + unsigned int bytes_th = th_size* sizeof(T);
100 94  
101 95 unsigned int max_threads = stim::maxThreadsPerBlock();
102   -
103   - dim3 threads(sqrt(max_threads), sqrt(max_threads));
104   - dim3 blocks(x/threads.x + 1, y/threads.y + 1);
105   -
106   -
  96 + dim3 threads(max_threads);
  97 + dim3 blocks(th_size/threads.x+1);
107 98  
108 99 // allocate space on the GPU for the updated vote direction
109 100 T* gpuDir;
110   - cudaMalloc(&gpuDir, bytes);
111   -
112   - size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2);
113   - std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl;
  101 + cudaMalloc(&gpuDir, bytes_th);
114 102  
115 103 //call the kernel to calculate the new voting direction
116   - cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  104 + cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
117 105  
118 106 //call the kernel to update the gradient direction
119   - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
  107 + cuda_update_grad <<< blocks, threads >>>(gpuTh, gpuDir, th_size, x , y);
120 108  
121 109 //free allocated memory
122 110 cudaFree(gpuDir);
... ...
stim/cuda/ivote/vote_atomic_bb.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
  2 +#define STIM_CUDA_VOTE_ATOMIC_BB_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include <stim/visualization/aabb2.h>
  9 +#include <stim/visualization/colormap.h>
  10 +#include <math.h>
  11 +
  12 +namespace stim{
  13 + namespace cuda{
  14 +
  15 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  16 + template<typename T>
  17 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  18 +
  19 + extern __shared__ T S[];
  20 + T* shared_atan = S;
  21 + size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
  22 + stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
  23 +
  24 + // calculate the 2D coordinates for this current thread.
  25 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  26 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  27 +
  28 + if(xi >= x || yi >= y) return;
  29 + // convert 2D coordinates to 1D
  30 + int i = yi * x + xi;
  31 +
  32 + // calculate the voting direction based on the grtadient direction
  33 + float theta = gpuGrad[2*i];
  34 + //calculate the amount of vote for the voter
  35 + float mag = gpuGrad[2*i + 1];
  36 +
  37 +
  38 + stim::aabb2<int> bb(xi, yi); //initialize a bounding box at the current point
  39 + bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta)));
  40 + bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box
  41 + bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box
  42 +
  43 + // compute the size of window which will be checked for finding the proper voters for this pixel
  44 + int x_table = 2*rmax +1;
  45 + int rmax_sq = rmax * rmax;
  46 +
  47 + int lut_i;
  48 + T dx_sq, dy_sq;
  49 +
  50 + bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image
  51 + bb.trim_high(x-1, y-1);
  52 +
  53 + int by, bx;
  54 + int dx, dy;
  55 +
  56 + unsigned int ind_g; //initialize the maximum vote value to zero
  57 + T alpha;
  58 +
  59 + for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box
  60 + dy = by - yi; //calculate the y coordinate of the current point relative to yi
  61 + dy_sq = dy * dy;
  62 + for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
  63 + dx = bx - xi;
  64 + dx_sq = dx * dx;
  65 + lut_i = (rmax - dy) * x_table + rmax - dx;
  66 + alpha = shared_atan[lut_i];
  67 + if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
  68 + ind_g = (by)*x + (bx);
  69 + atomicAdd(&gpuVote[ind_g], mag);
  70 +
  71 + }
  72 + }
  73 + }
  74 +
  75 + }
  76 +
  77 +
  78 + template<typename T>
  79 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  80 +
  81 +
  82 + unsigned int max_threads = stim::maxThreadsPerBlock();
  83 + dim3 threads( sqrt(max_threads), sqrt(max_threads) );
  84 + dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  85 + size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
  86 + size_t shared_mem_req = table_bytes;// + template_bytes;
  87 + std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
  88 + size_t shared_mem = stim::sharedMemPerBlock();
  89 + if(shared_mem_req > shared_mem){
  90 + std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
  91 + exit(1);
  92 + }
  93 +
  94 + //call the kernel to do the voting
  95 + cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  96 +
  97 + }
  98 +
  99 +
  100 + template<typename T>
  101 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  102 +
  103 + //calculate the number of bytes in the array
  104 + unsigned int bytes = x * y * sizeof(T);
  105 +
  106 + //calculate the number of bytes in the atan2 table
  107 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  108 +
  109 + //allocate space on the GPU for the Vote Image
  110 + T* gpuVote;
  111 + cudaMalloc(&gpuVote, bytes);
  112 +
  113 + //allocate space on the GPU for the input Gradient image
  114 + T* gpuGrad;
  115 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  116 +
  117 + //copy the Gradient Magnitude data to the GPU
  118 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  119 +
  120 + //allocate space on the GPU for the atan2 table
  121 + T* gpuTable;
  122 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  123 +
  124 + //copy the atan2 values to the GPU
  125 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  126 +
  127 + //call the GPU version of the vote calculation function
  128 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  129 +
  130 + //copy the Vote Data back to the CPU
  131 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  132 +
  133 + //free allocated memory
  134 + cudaFree(gpuTable);
  135 + cudaFree(gpuVote);
  136 + cudaFree(gpuGrad);
  137 + }
  138 +
  139 + }
  140 +}
  141 +
  142 +#endif
0 143 \ No newline at end of file
... ...
stim/cuda/ivote/vote_atomic_shared.cuh
... ... @@ -5,7 +5,7 @@
5 5 # include <cuda.h>
6 6 #include <stim/cuda/cudatools.h>
7 7 #include <stim/cuda/sharedmem.cuh>
8   -#include "cpyToshare.cuh"
  8 +
9 9 //#include "writebackshared.cuh"
10 10 namespace stim{
11 11 namespace cuda{
... ...
stim/cuda/ivote/vote_shared.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_SHARED_H
  2 +#define STIM_CUDA_VOTE_SHARED
  3 +# include <iostream>
  4 +# include <cuda.h>
  5 +#include <stim/cuda/cudatools.h>
  6 +#include <stim/cuda/sharedmem.cuh>
  7 +#include "cpyToshare.cuh"
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  13 + template<typename T>
  14 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  15 +
  16 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  17 + extern __shared__ float s_grad[];
  18 +
  19 + //calculate the start point for this block
  20 + int bxi = blockIdx.x * blockDim.x;
  21 +
  22 + // calculate the 2D coordinates for this current thread.
  23 + int xi = bxi + threadIdx.x;
  24 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  25 + // convert 2D coordinates to 1D
  26 + int i = yi * x + xi;
  27 +
  28 + // define a local variable to sum the votes from the voters
  29 + float sum = 0;
  30 +
  31 + //calculate the width of the shared memory block
  32 + int swidth = 2 * rmax + blockDim.x;
  33 +
  34 + // compute the size of window which will be checked for finding the proper voters for this pixel
  35 + int x_table = 2*rmax +1;
  36 + int rmax_sq = rmax * rmax;
  37 + int tx_rmax = threadIdx.x + rmax;
  38 + int bxs = bxi - rmax;
  39 +
  40 + //for every line (along y)
  41 + for(int yr = -rmax; yr <= rmax; yr++){
  42 + if (yi+yr<y && yi+yr>=0){
  43 + //copy the portion of the image necessary for this block to shared memory
  44 + __syncthreads();
  45 + cpyG2S1D2ch<float>(s_grad, gpuGrad, bxs, yi + yr , 2*swidth, 1, threadIdx, blockDim, x, y);
  46 + __syncthreads();
  47 +
  48 + if(xi < x && yi < y){
  49 +
  50 + for(int xr = -rmax; xr <= rmax; xr++){
  51 +
  52 + //find the location of this voter in the atan2 table
  53 + int id_t = (yr + rmax) * x_table + xr + rmax;
  54 +
  55 + // calculate the angle between the pixel and the current voter in x and y directions
  56 + float atan_angle = gpuTable[id_t];
  57 +
  58 + // calculate the voting direction based on the grtadient direction
  59 + int idx_share = xr + tx_rmax ;
  60 + float theta = s_grad[idx_share*2];
  61 + float mag = s_grad[idx_share*2 + 1];
  62 +
  63 +
  64 + // check if the current voter is located in the voting area of this pixel.
  65 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  66 + sum += mag;
  67 +
  68 + }
  69 + }
  70 +
  71 + }
  72 + }
  73 + }
  74 + if(xi < x && yi < y)
  75 + gpuVote[i] = sum;
  76 +
  77 + }
  78 +
  79 + template<typename T>
  80 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  81 +
  82 +
  83 + unsigned int max_threads = stim::maxThreadsPerBlock();
  84 + dim3 threads(max_threads, 1);
  85 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  86 +
  87 +
  88 + // specify share memory
  89 + unsigned int share_bytes = (2*rmax + threads.x)*1*2*sizeof(T);
  90 +
  91 + //call the kernel to do the voting
  92 + cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  93 +
  94 + }
  95 +
  96 +
  97 + template<typename T>
  98 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  99 +
  100 + //calculate the number of bytes in the array
  101 + unsigned int bytes = x * y * sizeof(T);
  102 +
  103 + //calculate the number of bytes in the atan2 table
  104 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  105 +
  106 + //allocate space on the GPU for the Vote Image
  107 + T* gpuVote;
  108 + cudaMalloc(&gpuVote, bytes);
  109 +
  110 + //allocate space on the GPU for the input Gradient image
  111 + T* gpuGrad;
  112 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  113 +
  114 + //copy the Gradient Magnitude data to the GPU
  115 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  116 +
  117 + //allocate space on the GPU for the atan2 table
  118 + T* gpuTable;
  119 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  120 +
  121 + //copy the atan2 values to the GPU
  122 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  123 +
  124 + //call the GPU version of the vote calculation function
  125 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  126 +
  127 + //copy the Vote Data back to the CPU
  128 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  129 +
  130 + //free allocated memory
  131 + cudaFree(gpuTable);
  132 + cudaFree(gpuVote);
  133 + cudaFree(gpuGrad);
  134 + }
  135 +
  136 + }
  137 +}
  138 +
  139 +#endif
0 140 \ No newline at end of file
... ...
stim/cuda/ivote/vote_threshold_global.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
  2 +#define STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
  3 +# include <iostream>
  4 +# include <cuda.h>
  5 +#include <stim/cuda/cudatools.h>
  6 +#include <stim/cuda/sharedmem.cuh>
  7 +#include "cpyToshare.cuh"
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  13 + template<typename T>
  14 + __global__ void cuda_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){
  15 +
  16 +
  17 + // calculate the x coordinate for this current thread.
  18 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  19 +
  20 + // calculate the voting direction based on the grtadient direction
  21 + float theta = gpuTh[3*xi];
  22 + //find the gradient magnitude for the current voter
  23 + float mag = gpuTh[3*xi + 1];
  24 + //calculate the position and x, y coordinations of this voter in the original image
  25 + unsigned int i_v = gpuTh[3*xi+2];
  26 + unsigned int y_v = i_v/x;
  27 + unsigned int x_v = i_v - (y_v*x);
  28 +
  29 + // compute the size of window which will be checked for finding the proper voters for this pixel
  30 + int x_table = 2*rmax +1;
  31 + int rmax_sq = rmax * rmax;
  32 + if(xi < th_size){
  33 + for(int yr = -rmax; yr <= rmax; yr++){
  34 + for(int xr = -rmax; xr <= rmax; xr++){
  35 + if ((y_v+yr)>=0 && (y_v+yr)<y && (x_v+xr)>=0 && (x_v+xr)<x){
  36 +
  37 + //find the location of the current pixel in the atan2 table
  38 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  39 +
  40 + // calculate the angle between the voter and the current pixel in x and y directions
  41 + float atan_angle = gpuTable[ind_t];
  42 +
  43 + // check if the current pixel is located in the voting area of this voter.
  44 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  45 + // calculate the 1D index for the current pixel in global memory
  46 + unsigned int ind_g = (y_v+yr)*x + (x_v+xr);
  47 + atomicAdd(&gpuVote[ind_g], mag);
  48 +
  49 + }
  50 + }
  51 + }
  52 + }
  53 + }
  54 + }
  55 +
  56 + template<typename T>
  57 + void gpu_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
  58 +
  59 +
  60 + unsigned int max_threads = stim::maxThreadsPerBlock();
  61 + dim3 threads(max_threads);
  62 + dim3 blocks(th_size/threads.x + 1);
  63 +
  64 + //call the kernel to do the voting
  65 + cuda_vote <<< blocks, threads>>>(gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
  66 +
  67 + }
  68 +
  69 +
  70 + template<typename T>
  71 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  72 +
  73 + //calculate the number of bytes in the array
  74 + unsigned int bytes = x * y * sizeof(T);
  75 +
  76 + //calculate the number of bytes in the atan2 table
  77 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  78 +
  79 + //allocate space on the GPU for the Vote Image
  80 + T* gpuVote;
  81 + cudaMalloc(&gpuVote, bytes);
  82 +
  83 + //allocate space on the GPU for the input Gradient image
  84 + T* gpuGrad;
  85 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  86 +
  87 + //copy the Gradient Magnitude data to the GPU
  88 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  89 +
  90 + //allocate space on the GPU for the atan2 table
  91 + T* gpuTable;
  92 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  93 +
  94 + //copy the atan2 values to the GPU
  95 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  96 +
  97 + //call the GPU version of the vote calculation function
  98 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  99 +
  100 + //copy the Vote Data back to the CPU
  101 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  102 +
  103 + //free allocated memory
  104 + cudaFree(gpuTable);
  105 + cudaFree(gpuVote);
  106 + cudaFree(gpuGrad);
  107 + }
  108 +
  109 + }
  110 +}
  111 +
  112 +#endif
0 113 \ No newline at end of file
... ...
stim/cuda/ivote_atomic.cuh renamed to stim/cuda/ivote_atomic_bb.cuh
1   -#ifndef STIM_CUDA_IVOTE_ATOMIC_H
2   -#define STIM_CUDA_IVOTE_ATOMIC_H
  1 +#ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H
  2 +#define STIM_CUDA_IVOTE_ATOMIC_BB_H
3 3  
4 4 #include <stim/cuda/ivote/down_sample.cuh>
5 5 #include <stim/cuda/ivote/local_max.cuh>
6   -#include <stim/cuda/ivote/update_dir_global.cuh>
7   -//#include <stim/cuda/ivote/vote_shared_32-32.cuh>
8   -#include <stim/cuda/ivote/vote_atomic_shared.cuh>
9   -//#include <stim/cuda/ivote/re_sample.cuh>
  6 +#include <stim/cuda/ivote/update_dir_bb.cuh>
  7 +#include <stim/cuda/ivote/vote_atomic_bb.cuh>
  8 +
10 9 namespace stim{
11 10 namespace cuda{
12 11  
... ...