Commit 03428452ea14e91183646fafa962d61255abd52f

Authored by Laila Saadatifard
1 parent 4252d827

update the ivote 2d for the last version of fast running code using bounding box…

…es for vote and update dir, and shared memory and atomic operation
stim/cuda/ivote/local_max.cuh
... ... @@ -14,7 +14,7 @@ namespace stim{
14 14  
15 15 // calculate the 2D coordinates for this current thread.
16 16 int xi = blockIdx.x * blockDim.x + threadIdx.x;
17   - int yi = blockIdx.y;
  17 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
18 18  
19 19 if(xi >= x || yi >= y)
20 20 return;
... ... @@ -63,8 +63,10 @@ namespace stim{
63 63 void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
64 64  
65 65 unsigned int max_threads = stim::maxThreadsPerBlock();
66   - dim3 threads(max_threads, 1);
67   - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  66 + /*dim3 threads(max_threads, 1);
  67 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/
  68 + dim3 threads( sqrt(max_threads), sqrt(max_threads) );
  69 + dim3 blocks(x/threads.x + 1, y/threads.y + 1);
68 70  
69 71 //call the kernel to find the local maximum.
70 72 cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
... ...
stim/cuda/ivote/update_dir_global.cuh
... ... @@ -7,8 +7,7 @@
7 7 #include <stim/cuda/sharedmem.cuh>
8 8 #include <stim/visualization/aabb2.h>
9 9 #include <stim/visualization/colormap.h>
10   -#include <math.h>
11   -#include "cpyToshare.cuh"
  10 +#include <math.h>
12 11  
13 12 //#define RMAX_TEST 8
14 13  
... ... @@ -76,68 +75,6 @@ namespace stim{
76 75 gpuDir[i] = atan2((T)max_dy, (T)max_dx);
77 76 }
78 77  
79   - // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
80   - template<typename T>
81   - __global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
82   -
83   -
84   - // calculate the 2D coordinates for this current thread.
85   - int xi = blockIdx.x * blockDim.x + threadIdx.x;
86   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
87   -
88   - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel
89   -
90   - int i = yi * x + xi; // convert 2D coordinates to 1D
91   -
92   - float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch
93   - gpuDir[i] = 0; //initialize the vote direction to zero
94   - float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter
95   - int id_x = 0; // define two local variables for the x and y position of the maximum
96   - int id_y = 0;
97   -
98   - int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter
99   - int rmax_sq = rmax * rmax;
100   - int tx_rmax = threadIdx.x + rmax;
101   - float atan_angle;
102   - float vote_c;
103   - int xidx, yidx, yr_sq, xr_sq;
104   - for(int yr = -rmax; yr <= rmax; yr++){
105   - yidx = yi + yr; //compute the index into the image
106   - if (yidx >= 0 && yidx < y){ //if the current y-index is inside the image
107   - yr_sq = yr * yr; //compute the square of yr, to save time later
108   - for(int xr = -rmax; xr <= rmax; xr++){
109   - xidx = xi + xr;
110   - if(xidx >= 0 && xidx < x){
111   - xr_sq = xr * xr;
112   - unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
113   -
114   - // calculate the angle between the voter and the current pixel in x and y directions
115   - atan_angle = gpuTable[ind_t];
116   - //atan_angle = atan2((T)yr, (T)xr);
117   -
118   - // check if the current pixel is located in the voting area of this voter.
119   - if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) <phi)){
120   -
121   - vote_c = gpuVote[yidx * x + xidx]; // find the vote value for the current counter
122   - // compare the vote value of this pixel with the max value to find the maxima and its index.
123   - if (vote_c>max) {
124   -
125   - max = vote_c;
126   - id_x = xr;
127   - id_y = yr;
128   - }
129   - }
130   - }
131   - }
132   - }
133   - }
134   -
135   - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
136   - float new_angle = gpuTable[ind_m];
137   -
138   - if(xi < x && yi < y)
139   - gpuDir[i] = new_angle;
140   - } //end kernel
141 78  
142 79  
143 80 // this kernel updates the gradient direction by the calculated voting direction.
... ... @@ -168,9 +105,7 @@ namespace stim{
168 105 HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );
169 106  
170 107 unsigned int max_threads = stim::maxThreadsPerBlock();
171   - //dim3 threads(min(x, max_threads), 1);
172   - //dim3 blocks(x/threads.x, y);
173   -
  108 +
174 109 dim3 threads( sqrt(max_threads), sqrt(max_threads) );
175 110 dim3 blocks(x/threads.x + 1, y/threads.y + 1);
176 111  
... ... @@ -188,12 +123,12 @@ namespace stim{
188 123  
189 124 //call the kernel to calculate the new voting direction
190 125 cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
191   - stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
  126 + //stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
192 127  
193 128 //exit(0);
194 129  
195   - threads = dim3( sqrt(max_threads), sqrt(max_threads) );
196   - blocks = dim3(x/threads.x + 1, y/threads.y + 1);
  130 + //threads = dim3( sqrt(max_threads), sqrt(max_threads) );
  131 + //blocks = dim3(x/threads.x + 1, y/threads.y + 1);
197 132  
198 133 //call the kernel to update the gradient direction
199 134 cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
... ...
stim/cuda/ivote/update_dir_threshold_global.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBALD_H
  2 +#define STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include "cpyToshare.cuh"
  9 +
  10 +namespace stim{
  11 + namespace cuda{
  12 +
  13 + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
  14 + template<typename T>
  15 + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){
  16 +
  17 +
  18 +
  19 + // calculate the coordinate for this current thread.
  20 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  21 + // calculate the voting direction based on the grtadient direction
  22 + float theta = gpuTh[3*xi];
  23 +
  24 + //calculate the position and x, y coordinations of this voter in the original image
  25 + unsigned int i_v = gpuTh[3*xi+2];
  26 + unsigned int y_v = i_v/x;
  27 + unsigned int x_v = i_v - (y_v*x);
  28 +
  29 + //initialize the vote direction to zero
  30 + gpuDir[xi] = 0;
  31 +
  32 + // define a local variable to maximum value of the vote image in the voting area for this voter
  33 + float max = 0;
  34 +
  35 + // define two local variables for the x and y coordinations where the maximum happened
  36 + int id_x = 0;
  37 + int id_y = 0;
  38 +
  39 + // compute the size of window which will be checked for finding the voting area for this voter
  40 + int x_table = 2*rmax +1;
  41 + int rmax_sq = rmax * rmax;
  42 + int tx_rmax = threadIdx.x + rmax;
  43 + if(xi < th_size){
  44 +
  45 + for(int yr = -rmax; yr <= rmax; yr++){
  46 +
  47 + for(int xr = -rmax; xr <= rmax; xr++){
  48 +
  49 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  50 +
  51 + // find the angle between the voter and the current pixel in x and y directions
  52 + float atan_angle = gpuTable[ind_t];
  53 +
  54 + // check if the current pixel is located in the voting area of this voter.
  55 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  56 + // find the vote value for the current counter
  57 + float vote_c = gpuVote[(y_v+yr)*x + (x_v+xr)];
  58 + // compare the vote value of this pixel with the max value to find the maxima and its index.
  59 + if (vote_c>max) {
  60 +
  61 + max = vote_c;
  62 + id_x = xr;
  63 + id_y = yr;
  64 + }
  65 + }
  66 + }
  67 + }
  68 +
  69 +
  70 + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
  71 + float new_angle = gpuTable[ind_m];
  72 + gpuDir[xi] = new_angle;
  73 + }
  74 +
  75 + }
  76 +
  77 + // this kernel updates the gradient direction by the calculated voting direction.
  78 + template<typename T>
  79 + __global__ void cuda_update_grad(T* gpuTh, T* gpuDir, int th_size, int x, int y){
  80 +
  81 + // calculate the coordinate for this current thread.
  82 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  83 +
  84 +
  85 + //update the gradient image with the vote direction
  86 + gpuTh[3*xi] = gpuDir[xi];
  87 + }
  88 +
  89 + template<typename T>
  90 + void gpu_update_dir(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
  91 +
  92 + //calculate the number of bytes in the array
  93 + unsigned int bytes_th = th_size* sizeof(T);
  94 +
  95 + unsigned int max_threads = stim::maxThreadsPerBlock();
  96 + dim3 threads(max_threads);
  97 + dim3 blocks(th_size/threads.x+1);
  98 +
  99 + // allocate space on the GPU for the updated vote direction
  100 + T* gpuDir;
  101 + cudaMalloc(&gpuDir, bytes_th);
  102 +
  103 + //call the kernel to calculate the new voting direction
  104 + cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
  105 +
  106 + //call the kernel to update the gradient direction
  107 + cuda_update_grad <<< blocks, threads >>>(gpuTh, gpuDir, th_size, x , y);
  108 +
  109 + //free allocated memory
  110 + cudaFree(gpuDir);
  111 +
  112 + }
  113 +
  114 + template<typename T>
  115 + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  116 +
  117 + //calculate the number of bytes in the array
  118 + unsigned int bytes = x * y * sizeof(T);
  119 +
  120 + //calculate the number of bytes in the atan2 table
  121 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  122 +
  123 + //allocate space on the GPU for the Vote Image
  124 + T* gpuVote;
  125 + cudaMalloc(&gpuVote, bytes);
  126 +
  127 + //copy the input vote image to the GPU
  128 + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
  129 +
  130 + //allocate space on the GPU for the input Gradient image
  131 + T* gpuGrad;
  132 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  133 +
  134 + //copy the Gradient data to the GPU
  135 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  136 +
  137 + //allocate space on the GPU for the atan2 table
  138 + T* gpuTable;
  139 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  140 +
  141 + //copy the atan2 values to the GPU
  142 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  143 +
  144 + //call the GPU version of the update direction function
  145 + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  146 +
  147 + //copy the new gradient image back to the CPU
  148 + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
  149 +
  150 + //free allocated memory
  151 + cudaFree(gpuTable);
  152 + cudaFree(gpuVote);
  153 + cudaFree(gpuGrad);
  154 + }
  155 +
  156 + }
  157 +}
  158 +
  159 +#endif
0 160 \ No newline at end of file
... ...
stim/cuda/ivote/vote_atomic_global.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_ATOMIC_GLOBAL_H
  2 +#define STIM_CUDA_VOTE_ATOMIC_GLOBAL_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include <stim/visualization/aabb2.h>
  9 +#include <stim/visualization/colormap.h>
  10 +#include <math.h>
  11 +
  12 +namespace stim{
  13 + namespace cuda{
  14 +
  15 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  16 + template<typename T>
  17 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  18 +
  19 + extern __shared__ T S[];
  20 + T* shared_atan = S;
  21 + size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
  22 + stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
  23 +
  24 + // calculate the 2D coordinates for this current thread.
  25 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  26 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  27 +
  28 + if(xi >= x || yi >= y) return;
  29 + // convert 2D coordinates to 1D
  30 + int i = yi * x + xi;
  31 +
  32 + // calculate the voting direction based on the grtadient direction
  33 + float theta = gpuGrad[2*i];
  34 + //calculate the amount of vote for the voter
  35 + float mag = gpuGrad[2*i + 1];
  36 +
  37 +
  38 + stim::aabb2<int> bb(xi, yi); //initialize a bounding box at the current point
  39 + bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta)));
  40 + bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box
  41 + bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box
  42 +
  43 + // compute the size of window which will be checked for finding the proper voters for this pixel
  44 + int x_table = 2*rmax +1;
  45 + int rmax_sq = rmax * rmax;
  46 +
  47 + int lut_i;
  48 + T dx_sq, dy_sq;
  49 +
  50 + bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image
  51 + bb.trim_high(x-1, y-1);
  52 +
  53 + int by, bx;
  54 + int dx, dy;
  55 +
  56 + unsigned int ind_g; //initialize the maximum vote value to zero
  57 + T alpha;
  58 +
  59 + for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box
  60 + dy = by - yi; //calculate the y coordinate of the current point relative to yi
  61 + dy_sq = dy * dy;
  62 + for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
  63 + dx = bx - xi;
  64 + dx_sq = dx * dx;
  65 + lut_i = (rmax - dy) * x_table + rmax - dx;
  66 + alpha = shared_atan[lut_i];
  67 + if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
  68 + ind_g = (by)*x + (bx);
  69 + atomicAdd(&gpuVote[ind_g], mag);
  70 +
  71 + }
  72 + }
  73 + }
  74 +
  75 + }
  76 +
  77 +
  78 + template<typename T>
  79 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  80 +
  81 +
  82 + unsigned int max_threads = stim::maxThreadsPerBlock();
  83 + dim3 threads( sqrt(max_threads), sqrt(max_threads) );
  84 + dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  85 + size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
  86 + size_t shared_mem_req = table_bytes;// + template_bytes;
  87 + std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
  88 + size_t shared_mem = stim::sharedMemPerBlock();
  89 + if(shared_mem_req > shared_mem){
  90 + std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
  91 + exit(1);
  92 + }
  93 +
  94 + //call the kernel to do the voting
  95 + cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  96 +
  97 + }
  98 +
  99 +
  100 + template<typename T>
  101 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  102 +
  103 + //calculate the number of bytes in the array
  104 + unsigned int bytes = x * y * sizeof(T);
  105 +
  106 + //calculate the number of bytes in the atan2 table
  107 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  108 +
  109 + //allocate space on the GPU for the Vote Image
  110 + T* gpuVote;
  111 + cudaMalloc(&gpuVote, bytes);
  112 +
  113 + //allocate space on the GPU for the input Gradient image
  114 + T* gpuGrad;
  115 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  116 +
  117 + //copy the Gradient Magnitude data to the GPU
  118 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  119 +
  120 + //allocate space on the GPU for the atan2 table
  121 + T* gpuTable;
  122 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  123 +
  124 + //copy the atan2 values to the GPU
  125 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  126 +
  127 + //call the GPU version of the vote calculation function
  128 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  129 +
  130 + //copy the Vote Data back to the CPU
  131 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  132 +
  133 + //free allocated memory
  134 + cudaFree(gpuTable);
  135 + cudaFree(gpuVote);
  136 + cudaFree(gpuGrad);
  137 + }
  138 +
  139 + }
  140 +}
  141 +
  142 +#endif
0 143 \ No newline at end of file
... ...
stim/cuda/ivote/vote_atomic_shared.cuh
... ... @@ -5,7 +5,7 @@
5 5 # include <cuda.h>
6 6 #include <stim/cuda/cudatools.h>
7 7 #include <stim/cuda/sharedmem.cuh>
8   -#include "cpyToshare.cuh"
  8 +
9 9 //#include "writebackshared.cuh"
10 10 namespace stim{
11 11 namespace cuda{
... ...
stim/cuda/ivote/vote_shared.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_SHARED_H
  2 +#define STIM_CUDA_VOTE_SHARED
  3 +# include <iostream>
  4 +# include <cuda.h>
  5 +#include <stim/cuda/cudatools.h>
  6 +#include <stim/cuda/sharedmem.cuh>
  7 +#include "cpyToshare.cuh"
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  13 + template<typename T>
  14 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  15 +
  16 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  17 + extern __shared__ float s_grad[];
  18 +
  19 + //calculate the start point for this block
  20 + int bxi = blockIdx.x * blockDim.x;
  21 +
  22 + // calculate the 2D coordinates for this current thread.
  23 + int xi = bxi + threadIdx.x;
  24 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  25 + // convert 2D coordinates to 1D
  26 + int i = yi * x + xi;
  27 +
  28 + // define a local variable to sum the votes from the voters
  29 + float sum = 0;
  30 +
  31 + //calculate the width of the shared memory block
  32 + int swidth = 2 * rmax + blockDim.x;
  33 +
  34 + // compute the size of window which will be checked for finding the proper voters for this pixel
  35 + int x_table = 2*rmax +1;
  36 + int rmax_sq = rmax * rmax;
  37 + int tx_rmax = threadIdx.x + rmax;
  38 + int bxs = bxi - rmax;
  39 +
  40 + //for every line (along y)
  41 + for(int yr = -rmax; yr <= rmax; yr++){
  42 + if (yi+yr<y && yi+yr>=0){
  43 + //copy the portion of the image necessary for this block to shared memory
  44 + __syncthreads();
  45 + cpyG2S1D2ch<float>(s_grad, gpuGrad, bxs, yi + yr , 2*swidth, 1, threadIdx, blockDim, x, y);
  46 + __syncthreads();
  47 +
  48 + if(xi < x && yi < y){
  49 +
  50 + for(int xr = -rmax; xr <= rmax; xr++){
  51 +
  52 + //find the location of this voter in the atan2 table
  53 + int id_t = (yr + rmax) * x_table + xr + rmax;
  54 +
  55 + // calculate the angle between the pixel and the current voter in x and y directions
  56 + float atan_angle = gpuTable[id_t];
  57 +
  58 + // calculate the voting direction based on the grtadient direction
  59 + int idx_share = xr + tx_rmax ;
  60 + float theta = s_grad[idx_share*2];
  61 + float mag = s_grad[idx_share*2 + 1];
  62 +
  63 +
  64 + // check if the current voter is located in the voting area of this pixel.
  65 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  66 + sum += mag;
  67 +
  68 + }
  69 + }
  70 +
  71 + }
  72 + }
  73 + }
  74 + if(xi < x && yi < y)
  75 + gpuVote[i] = sum;
  76 +
  77 + }
  78 +
  79 + template<typename T>
  80 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  81 +
  82 +
  83 + unsigned int max_threads = stim::maxThreadsPerBlock();
  84 + dim3 threads(max_threads, 1);
  85 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  86 +
  87 +
  88 + // specify share memory
  89 + unsigned int share_bytes = (2*rmax + threads.x)*1*2*sizeof(T);
  90 +
  91 + //call the kernel to do the voting
  92 + cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  93 +
  94 + }
  95 +
  96 +
  97 + template<typename T>
  98 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  99 +
  100 + //calculate the number of bytes in the array
  101 + unsigned int bytes = x * y * sizeof(T);
  102 +
  103 + //calculate the number of bytes in the atan2 table
  104 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  105 +
  106 + //allocate space on the GPU for the Vote Image
  107 + T* gpuVote;
  108 + cudaMalloc(&gpuVote, bytes);
  109 +
  110 + //allocate space on the GPU for the input Gradient image
  111 + T* gpuGrad;
  112 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  113 +
  114 + //copy the Gradient Magnitude data to the GPU
  115 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  116 +
  117 + //allocate space on the GPU for the atan2 table
  118 + T* gpuTable;
  119 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  120 +
  121 + //copy the atan2 values to the GPU
  122 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  123 +
  124 + //call the GPU version of the vote calculation function
  125 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  126 +
  127 + //copy the Vote Data back to the CPU
  128 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  129 +
  130 + //free allocated memory
  131 + cudaFree(gpuTable);
  132 + cudaFree(gpuVote);
  133 + cudaFree(gpuGrad);
  134 + }
  135 +
  136 + }
  137 +}
  138 +
  139 +#endif
0 140 \ No newline at end of file
... ...
stim/cuda/ivote/vote_threshold_global.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
  2 +#define STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
  3 +# include <iostream>
  4 +# include <cuda.h>
  5 +#include <stim/cuda/cudatools.h>
  6 +#include <stim/cuda/sharedmem.cuh>
  7 +#include "cpyToshare.cuh"
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  13 + template<typename T>
  14 + __global__ void cuda_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){
  15 +
  16 +
  17 + // calculate the x coordinate for this current thread.
  18 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  19 +
  20 + // calculate the voting direction based on the grtadient direction
  21 + float theta = gpuTh[3*xi];
  22 + //find the gradient magnitude for the current voter
  23 + float mag = gpuTh[3*xi + 1];
  24 + //calculate the position and x, y coordinations of this voter in the original image
  25 + unsigned int i_v = gpuTh[3*xi+2];
  26 + unsigned int y_v = i_v/x;
  27 + unsigned int x_v = i_v - (y_v*x);
  28 +
  29 + // compute the size of window which will be checked for finding the proper voters for this pixel
  30 + int x_table = 2*rmax +1;
  31 + int rmax_sq = rmax * rmax;
  32 + if(xi < th_size){
  33 + for(int yr = -rmax; yr <= rmax; yr++){
  34 + for(int xr = -rmax; xr <= rmax; xr++){
  35 + if ((y_v+yr)>=0 && (y_v+yr)<y && (x_v+xr)>=0 && (x_v+xr)<x){
  36 +
  37 + //find the location of the current pixel in the atan2 table
  38 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  39 +
  40 + // calculate the angle between the voter and the current pixel in x and y directions
  41 + float atan_angle = gpuTable[ind_t];
  42 +
  43 + // check if the current pixel is located in the voting area of this voter.
  44 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  45 + // calculate the 1D index for the current pixel in global memory
  46 + unsigned int ind_g = (y_v+yr)*x + (x_v+xr);
  47 + atomicAdd(&gpuVote[ind_g], mag);
  48 +
  49 + }
  50 + }
  51 + }
  52 + }
  53 + }
  54 + }
  55 +
  56 + template<typename T>
  57 + void gpu_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
  58 +
  59 +
  60 + unsigned int max_threads = stim::maxThreadsPerBlock();
  61 + dim3 threads(max_threads);
  62 + dim3 blocks(th_size/threads.x + 1);
  63 +
  64 + //call the kernel to do the voting
  65 + cuda_vote <<< blocks, threads>>>(gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
  66 +
  67 + }
  68 +
  69 +
  70 + template<typename T>
  71 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  72 +
  73 + //calculate the number of bytes in the array
  74 + unsigned int bytes = x * y * sizeof(T);
  75 +
  76 + //calculate the number of bytes in the atan2 table
  77 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  78 +
  79 + //allocate space on the GPU for the Vote Image
  80 + T* gpuVote;
  81 + cudaMalloc(&gpuVote, bytes);
  82 +
  83 + //allocate space on the GPU for the input Gradient image
  84 + T* gpuGrad;
  85 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  86 +
  87 + //copy the Gradient Magnitude data to the GPU
  88 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  89 +
  90 + //allocate space on the GPU for the atan2 table
  91 + T* gpuTable;
  92 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  93 +
  94 + //copy the atan2 values to the GPU
  95 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  96 +
  97 + //call the GPU version of the vote calculation function
  98 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  99 +
  100 + //copy the Vote Data back to the CPU
  101 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  102 +
  103 + //free allocated memory
  104 + cudaFree(gpuTable);
  105 + cudaFree(gpuVote);
  106 + cudaFree(gpuGrad);
  107 + }
  108 +
  109 + }
  110 +}
  111 +
  112 +#endif
0 113 \ No newline at end of file
... ...
stim/cuda/ivote_atomic.cuh
... ... @@ -5,7 +5,7 @@
5 5 #include <stim/cuda/ivote/local_max.cuh>
6 6 #include <stim/cuda/ivote/update_dir_global.cuh>
7 7 //#include <stim/cuda/ivote/vote_shared_32-32.cuh>
8   -#include <stim/cuda/ivote/vote_atomic_shared.cuh>
  8 +#include <stim/cuda/ivote/vote_atomic_global.cuh>
9 9 //#include <stim/cuda/ivote/re_sample.cuh>
10 10 namespace stim{
11 11 namespace cuda{
... ...