Commit 63d349748fe796c11620ca470fb3e49bee47ff5c

Authored by Pavel Govyadinov
2 parents 03c403fa 11cd127f

Merge branch 'ivote-profiling' into 'master'

Leila's ivote profiling push

This is a test

See merge request !1
stim/cuda/arraymath.cuh
... ... @@ -10,7 +10,8 @@
10 10 #include <stim/cuda/arraymath/array_atan.cuh>
11 11 #include <stim/cuda/arraymath/array_abs.cuh>
12 12 #include <stim/cuda/arraymath/array_cart2polar.cuh>
13   -
  13 +#include <stim/cuda/arraymath/array_threshold.cuh>
  14 +#include <stim/cuda/arraymath/array_add3.cuh>
14 15 namespace stim{
15 16 namespace cuda{
16 17  
... ...
stim/cuda/ivote/local_max.cuh
... ... @@ -29,17 +29,33 @@ namespace stim{
29 29 //compare to the threshold
30 30 if(val < final_t) return;
31 31  
  32 + //define an array to store indices with same vote value
  33 + /*int * IdxEq;
  34 + IdxEq = new int [2*conn];
  35 + int n = 0;*/
  36 +
32 37 for(int xl = xi - conn; xl < xi + conn; xl++){
33 38 for(int yl = yi - conn; yl < yi + conn; yl++){
34 39 if(xl >= 0 && xl < x && yl >= 0 && yl < y){
35 40 int il = yl * x + xl;
36 41 if(gpuVote[il] > val){
37 42 return;
38   - }
39   - }
  43 + }
  44 + if (gpuVote[il] == val){
  45 + /*IdxEq[n] = il;
  46 + n = n+1;*/
  47 + if( il > i){
  48 + return;
  49 + }
  50 + }
  51 + }
40 52 }
41 53 }
42   -
  54 + /*if (n!=0){
  55 + if(IdxEq[n/2] !=i){
  56 + return;
  57 + }
  58 + } */
43 59 gpuCenters[i] = 1;
44 60 }
45 61  
... ...
stim/cuda/ivote/update_dir_global.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
  2 +#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include "cpyToshare.cuh"
  9 +
  10 +#define RMAX_TEST 8
  11 +
  12 +namespace stim{
  13 + namespace cuda{
  14 +
  15 + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
  16 + template<typename T>
  17 + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  18 +
  19 +
  20 + //calculate the start point for this block
  21 + int bxi = blockIdx.x * blockDim.x;
  22 +
  23 + // calculate the 2D coordinates for this current thread.
  24 + int xi = bxi + threadIdx.x;
  25 + if(xi >= x) return; //if the index is outside of the image, terminate the kernel
  26 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  27 + int i = yi * x + xi; // convert 2D coordinates to 1D
  28 +
  29 + float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch
  30 + gpuDir[i] = 0; //initialize the vote direction to zero
  31 + float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter
  32 + int id_x = 0; // define two local variables for the x and y position of the maximum
  33 + int id_y = 0;
  34 +
  35 + int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter
  36 + int rmax_sq = rmax * rmax;
  37 + int tx_rmax = threadIdx.x + rmax;
  38 + float atan_angle;
  39 + float vote_c;
  40 + for(int yr = -RMAX_TEST; yr <= RMAX_TEST; yr++){
  41 + if (yi+yr >= 0 && yi + yr < y){
  42 + for(int xr = -RMAX_TEST; xr <= RMAX_TEST; xr++){
  43 +
  44 + unsigned int ind_t = (RMAX_TEST - yr) * x_table + RMAX_TEST - xr;
  45 +
  46 + // calculate the angle between the voter and the current pixel in x and y directions
  47 + atan_angle = gpuTable[ind_t];
  48 +
  49 + // find the vote value for the current counter
  50 + vote_c = gpuVote[(yi+yr)*x + (xi+xr)];
  51 +
  52 + // check if the current pixel is located in the voting area of this voter.
  53 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  54 +
  55 + // compare the vote value of this pixel with the max value to find the maxima and its index.
  56 + if (vote_c>max) {
  57 +
  58 + max = vote_c;
  59 + id_x = xr;
  60 + id_y = yr;
  61 + }
  62 + }
  63 + }
  64 + }
  65 + }
  66 +
  67 + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
  68 + float new_angle = gpuTable[ind_m];
  69 +
  70 + if(xi < x && yi < y)
  71 + gpuDir[i] = new_angle;
  72 + } //end kernel
  73 +
  74 + // this kernel updates the gradient direction by the calculated voting direction.
  75 + template<typename T>
  76 + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
  77 +
  78 + // calculate the 2D coordinates for this current thread.
  79 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  80 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  81 +
  82 + // convert 2D coordinates to 1D
  83 + int i = yi * x + xi;
  84 +
  85 + //update the gradient image with the vote direction
  86 + gpuGrad[2*i] = gpuDir[i];
  87 + }
  88 +
  89 + template<typename T>
  90 + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  91 +
  92 + //calculate the number of bytes in the array
  93 + unsigned int bytes = x * y * sizeof(T);
  94 +
  95 + unsigned int max_threads = stim::maxThreadsPerBlock();
  96 + dim3 threads(max_threads, 1);
  97 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  98 +
  99 + // allocate space on the GPU for the updated vote direction
  100 + T* gpuDir;
  101 + cudaMalloc(&gpuDir, bytes);
  102 +
  103 + //call the kernel to calculate the new voting direction
  104 + cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  105 +
  106 + //call the kernel to update the gradient direction
  107 + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
  108 +
  109 + //free allocated memory
  110 + cudaFree(gpuDir);
  111 +
  112 + }
  113 +
  114 + template<typename T>
  115 + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  116 +
  117 + //calculate the number of bytes in the array
  118 + unsigned int bytes = x * y * sizeof(T);
  119 +
  120 + //calculate the number of bytes in the atan2 table
  121 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  122 +
  123 + //allocate space on the GPU for the Vote Image
  124 + T* gpuVote;
  125 + cudaMalloc(&gpuVote, bytes);
  126 +
  127 + //copy the input vote image to the GPU
  128 + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
  129 +
  130 + //allocate space on the GPU for the input Gradient image
  131 + T* gpuGrad;
  132 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  133 +
  134 + //copy the Gradient data to the GPU
  135 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  136 +
  137 + //allocate space on the GPU for the atan2 table
  138 + T* gpuTable;
  139 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  140 +
  141 + //copy the atan2 values to the GPU
  142 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  143 +
  144 + //call the GPU version of the update direction function
  145 + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  146 +
  147 + //copy the new gradient image back to the CPU
  148 + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
  149 +
  150 + //free allocated memory
  151 + cudaFree(gpuTable);
  152 + cudaFree(gpuVote);
  153 + cudaFree(gpuGrad);
  154 + }
  155 +
  156 + }
  157 +}
  158 +
  159 +#endif
0 160 \ No newline at end of file
... ...
stim/cuda/ivote/update_dir_shared.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_UPDATE_DIR_SHARED_H
  2 +#define STIM_CUDA_UPDATE_DIR_SHARED_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include "cpyToshare.cuh"
  9 +
  10 +namespace stim{
  11 + namespace cuda{
  12 +
  13 + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
  14 + template<typename T>
  15 + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  16 +
  17 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  18 + extern __shared__ float s_vote[];
  19 +
  20 + //calculate the start point for this block
  21 + int bxi = blockIdx.x * blockDim.x;
  22 +
  23 + // calculate the 2D coordinates for this current thread.
  24 + int xi = bxi + threadIdx.x;
  25 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  26 + // convert 2D coordinates to 1D
  27 + int i = yi * x + xi;
  28 +
  29 + // calculate the voting direction based on the grtadient direction
  30 + float theta = gpuGrad[2*i];
  31 +
  32 + //initialize the vote direction to zero
  33 + gpuDir[i] = 0;
  34 +
  35 + // define a local variable to maximum value of the vote image in the voting area for this voter
  36 + float max = 0;
  37 +
  38 + // define two local variables for the x and y coordinations where the maximum happened
  39 + int id_x = 0;
  40 + int id_y = 0;
  41 +
  42 + //calculate the width of the shared memory block
  43 + int swidth = 2 * rmax + blockDim.x;
  44 +
  45 + // compute the size of window which will be checked for finding the voting area for this voter
  46 + int x_table = 2*rmax +1;
  47 + int rmax_sq = rmax * rmax;
  48 + int tx_rmax = threadIdx.x + rmax;
  49 + int bxs = bxi - rmax;
  50 +
  51 + for(int yr = -rmax; yr <= rmax; yr++){
  52 + //if (yi+yr >= 0 && yi + yr < y){
  53 + //copy the portion of the image necessary for this block to shared memory
  54 + __syncthreads();
  55 + cpyG2S1D<float>(s_vote, gpuVote, bxs, yi + yr , swidth, 1, threadIdx, blockDim, x, y);
  56 + __syncthreads();
  57 +
  58 + //if the current thread is outside of the image, it doesn't have to be computed
  59 + if(xi < x && yi < y){
  60 +
  61 + for(int xr = -rmax; xr <= rmax; xr++){
  62 +
  63 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  64 +
  65 + // calculate the angle between the voter and the current pixel in x and y directions
  66 + float atan_angle = gpuTable[ind_t];
  67 +
  68 + // calculate the voting direction based on the grtadient direction
  69 + int idx_share_update = xr + tx_rmax ;
  70 + float share_vote = s_vote[idx_share_update];
  71 +
  72 + // check if the current pixel is located in the voting area of this voter.
  73 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  74 +
  75 + // compare the vote value of this pixel with the max value to find the maxima and its index.
  76 + if (share_vote>max) {
  77 +
  78 + max = share_vote;
  79 + id_x = xr;
  80 + id_y = yr;
  81 + }
  82 + }
  83 + }
  84 + }
  85 + //}
  86 + }
  87 +
  88 + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
  89 + float new_angle = gpuTable[ind_m];
  90 +
  91 + if(xi < x && yi < y)
  92 + gpuDir[i] = new_angle;
  93 +
  94 + }
  95 +
  96 + // this kernel updates the gradient direction by the calculated voting direction.
  97 + template<typename T>
  98 + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
  99 +
  100 + // calculate the 2D coordinates for this current thread.
  101 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  102 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  103 +
  104 + // convert 2D coordinates to 1D
  105 + int i = yi * x + xi;
  106 +
  107 + //update the gradient image with the vote direction
  108 + gpuGrad[2*i] = gpuDir[i];
  109 + }
  110 +
  111 + template<typename T>
  112 + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  113 +
  114 + //calculate the number of bytes in the array
  115 + unsigned int bytes = x * y * sizeof(T);
  116 +
  117 + unsigned int max_threads = stim::maxThreadsPerBlock();
  118 + dim3 threads(max_threads, 1);
  119 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  120 +
  121 + // specify share memory
  122 + unsigned int share_bytes = (2*rmax + threads.x)*(1)*4;
  123 +
  124 + // allocate space on the GPU for the updated vote direction
  125 + T* gpuDir;
  126 + cudaMalloc(&gpuDir, bytes);
  127 +
  128 + //call the kernel to calculate the new voting direction
  129 + cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  130 +
  131 + //call the kernel to update the gradient direction
  132 + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
  133 +
  134 + //free allocated memory
  135 + cudaFree(gpuDir);
  136 +
  137 + }
  138 +
  139 + template<typename T>
  140 + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  141 +
  142 + //calculate the number of bytes in the array
  143 + unsigned int bytes = x * y * sizeof(T);
  144 +
  145 + //calculate the number of bytes in the atan2 table
  146 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  147 +
  148 + //allocate space on the GPU for the Vote Image
  149 + T* gpuVote;
  150 + cudaMalloc(&gpuVote, bytes);
  151 +
  152 + //copy the input vote image to the GPU
  153 + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
  154 +
  155 + //allocate space on the GPU for the input Gradient image
  156 + T* gpuGrad;
  157 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  158 +
  159 + //copy the Gradient data to the GPU
  160 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  161 +
  162 + //allocate space on the GPU for the atan2 table
  163 + T* gpuTable;
  164 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  165 +
  166 + //copy the atan2 values to the GPU
  167 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  168 +
  169 + //call the GPU version of the update direction function
  170 + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  171 +
  172 + //copy the new gradient image back to the CPU
  173 + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
  174 +
  175 + //free allocated memory
  176 + cudaFree(gpuTable);
  177 + cudaFree(gpuVote);
  178 + cudaFree(gpuGrad);
  179 + }
  180 +
  181 + }
  182 +}
  183 +
  184 +#endif
0 185 \ No newline at end of file
... ...
stim/cuda/ivote/vote_atomic.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_ATOMIC_H
  2 +#define STIM_CUDA_VOTE_ATOMIC_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include "cpyToshare.cuh"
  9 +
  10 +namespace stim{
  11 + namespace cuda{
  12 +
  13 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  14 + template<typename T>
  15 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  16 +
  17 +
  18 + // calculate the 2D coordinates for this current thread.
  19 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  20 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  21 + // convert 2D coordinates to 1D
  22 + int i = yi * x + xi;
  23 +
  24 + // calculate the voting direction based on the grtadient direction
  25 + float theta = gpuGrad[2*i];
  26 + //calculate the amount of vote for the voter
  27 + float mag = gpuGrad[2*i + 1];
  28 +
  29 + // compute the size of window which will be checked for finding the proper voters for this pixel
  30 + int x_table = 2*rmax +1;
  31 + int rmax_sq = rmax * rmax;
  32 + if(xi < x && yi < y){
  33 + //for every line (along y)
  34 + for(int yr = -rmax; yr <= rmax; yr++){
  35 + for(int xr = -rmax; xr <= rmax; xr++){
  36 + if ((yi+yr)>=0 && (yi+yr)<y && (xi+xr)>=0 && (xi+xr)<x){
  37 +
  38 + //find the location of the current pixel in the atan2 table
  39 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  40 +
  41 + // calculate the angle between the voter and the current pixel in x and y directions
  42 + float atan_angle = gpuTable[ind_t];
  43 +
  44 + // check if the current pixel is located in the voting area of this voter.
  45 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  46 + // calculate the 1D index for the current pixel in global memory
  47 + unsigned int ind_g = (yi+yr)*x + (xi+xr);
  48 + atomicAdd(&gpuVote[ind_g], mag);
  49 +
  50 + }
  51 + }
  52 + }
  53 + }
  54 + }
  55 + }
  56 +
  57 + template<typename T>
  58 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  59 +
  60 +
  61 + unsigned int max_threads = stim::maxThreadsPerBlock();
  62 + dim3 threads(max_threads, 1);
  63 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  64 +
  65 + // specify share memory
  66 + //unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
  67 +
  68 + //call the kernel to do the voting
  69 + cuda_vote <<< blocks, threads>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  70 +
  71 + }
  72 +
  73 +
  74 + template<typename T>
  75 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  76 +
  77 + //calculate the number of bytes in the array
  78 + unsigned int bytes = x * y * sizeof(T);
  79 +
  80 + //calculate the number of bytes in the atan2 table
  81 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  82 +
  83 + //allocate space on the GPU for the Vote Image
  84 + T* gpuVote;
  85 + cudaMalloc(&gpuVote, bytes);
  86 +
  87 + //allocate space on the GPU for the input Gradient image
  88 + T* gpuGrad;
  89 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  90 +
  91 + //copy the Gradient Magnitude data to the GPU
  92 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  93 +
  94 + //allocate space on the GPU for the atan2 table
  95 + T* gpuTable;
  96 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  97 +
  98 + //copy the atan2 values to the GPU
  99 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  100 +
  101 + //call the GPU version of the vote calculation function
  102 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  103 +
  104 + //copy the Vote Data back to the CPU
  105 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  106 +
  107 + //free allocated memory
  108 + cudaFree(gpuTable);
  109 + cudaFree(gpuVote);
  110 + cudaFree(gpuGrad);
  111 + }
  112 +
  113 + }
  114 +}
  115 +
  116 +#endif
0 117 \ No newline at end of file
... ...
stim/cuda/ivote/vote_atomic_shared.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_ATOMIC_SHARED_H
  2 +#define STIM_CUDA_VOTE_ATOMIC_SHARED_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include "cpyToshare.cuh"
  9 +//#include "writebackshared.cuh"
  10 +namespace stim{
  11 + namespace cuda{
  12 +
  13 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  14 + template<typename T>
  15 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  16 +
  17 + //generate a pointer to the shared memory
  18 + extern __shared__ float s_vote[];
  19 + // calculate the 2D coordinates for this current thread.
  20 + int bxi = blockIdx.x * blockDim.x;
  21 + int byi = blockIdx.y * blockDim.y;
  22 + int xi = bxi + threadIdx.x;
  23 + int yi = byi + threadIdx.y;
  24 + // convert 2D coordinates to 1D
  25 + int i = yi * x + xi;
  26 +
  27 + // calculate the voting direction based on the gradient direction
  28 + float theta = gpuGrad[2*i];
  29 + //calculate the amount of vote for the voter
  30 + float mag = gpuGrad[2*i + 1];
  31 +
  32 + //find the starting points and size of window, wich will be copied to the shared memory
  33 + int bxs = bxi - rmax;
  34 + int bys = byi - rmax;
  35 + int xwidth = 2*rmax + blockDim.x;
  36 + int ywidth = 2*rmax + blockDim.y;
  37 + //compute the coordinations of this pixel in the 2D-shared memory.
  38 + int sx_rx = threadIdx.x + rmax;
  39 + int sy_ry = threadIdx.y + rmax;
  40 + // compute the size of window which will be checked for finding the counters for this voter
  41 + int x_table = 2*rmax +1;
  42 + int rmax_sq = rmax * rmax;
  43 + //calculate some parameters for indexing shared memory
  44 + //calculate the total number of threads available
  45 + unsigned int tThreads = blockDim.x * blockDim.y;
  46 + //calculate the current 1D thread ID
  47 + unsigned int ti = threadIdx.y * (blockDim.x) + threadIdx.x;
  48 + //calculate the number of iteration required
  49 + unsigned int In = xwidth*ywidth/tThreads + 1;
  50 + if(xi < x && yi < y){
  51 + __syncthreads();
  52 + //initialize the shared memory to zero
  53 + for (unsigned int i = 0; i < In; i++){
  54 + unsigned int sIdx0 = i * tThreads + ti;
  55 + if (sIdx0< xwidth*ywidth) {
  56 + s_vote[sIdx0] = 0;
  57 + }
  58 + }
  59 + __syncthreads();
  60 + //for every line (along y)
  61 + for(int yr = -rmax; yr <= rmax; yr++){
  62 + //compute the position of the current voter in the shared memory along the y axis.
  63 + unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
  64 + for(int xr = -rmax; xr <= rmax; xr++){
  65 +
  66 + //find the location of the current pixel in the atan2 table
  67 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  68 +
  69 + // calculate the angle between the voter and the current pixel in x and y directions
  70 + float atan_angle = gpuTable[ind_t];
  71 +
  72 + // check if the current pixel is located in the voting area of this voter.
  73 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  74 + //compute the position of the current voter in the 2D-shared memory along the x axis.
  75 + unsigned int sIdx_x = (sx_rx + xr);
  76 + //find the 1D index of this voter in the 2D-shared memory.
  77 + unsigned int s_Idx = (sIdx_y1d + sIdx_x);
  78 +
  79 + atomicAdd(&s_vote[s_Idx], mag);
  80 + }
  81 + }
  82 + }
  83 + //write shared memory back to global memory
  84 +
  85 + __syncthreads();
  86 + for (unsigned int i = 0; i < In; i++){
  87 +
  88 + unsigned int sIdx = i * tThreads + ti;
  89 + if (sIdx>= xwidth*ywidth) return;
  90 +
  91 + unsigned int sy = sIdx/xwidth;
  92 + unsigned int sx = sIdx - (sy * xwidth);
  93 +
  94 + unsigned int gx = bxs + sx;
  95 + unsigned int gy = bys + sy;
  96 + if (gx<x&& gy<y){
  97 + unsigned int gIdx = gy * x + gx;
  98 + //write shared to global memory
  99 + atomicAdd(&gpuVote[gIdx], s_vote[sIdx]);
  100 +
  101 + }
  102 + }
  103 +
  104 + }
  105 + }
  106 +
  107 + template<typename T>
  108 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  109 +
  110 +
  111 + unsigned int max_threads = stim::maxThreadsPerBlock();
  112 + dim3 threads(sqrt(max_threads), sqrt(max_threads));
  113 + dim3 blocks(x/threads.x + 1 , y/threads.y+1);
  114 +
  115 + // specify share memory
  116 + unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*sizeof(T);
  117 +
  118 + //call the kernel to do the voting
  119 + cuda_vote <<< blocks, threads, share_bytes>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  120 +
  121 + }
  122 +
  123 +
  124 + template<typename T>
  125 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  126 +
  127 + //calculate the number of bytes in the array
  128 + unsigned int bytes = x * y * sizeof(T);
  129 +
  130 + //calculate the number of bytes in the atan2 table
  131 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  132 +
  133 + //allocate space on the GPU for the Vote Image
  134 + T* gpuVote;
  135 + cudaMalloc(&gpuVote, bytes);
  136 +
  137 + //allocate space on the GPU for the input Gradient image
  138 + T* gpuGrad;
  139 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  140 +
  141 + //copy the Gradient Magnitude data to the GPU
  142 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  143 +
  144 + //allocate space on the GPU for the atan2 table
  145 + T* gpuTable;
  146 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  147 +
  148 + //copy the atan2 values to the GPU
  149 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  150 +
  151 + //call the GPU version of the vote calculation function
  152 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  153 +
  154 + //copy the Vote Data back to the CPU
  155 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  156 +
  157 + //free allocated memory
  158 + cudaFree(gpuTable);
  159 + cudaFree(gpuVote);
  160 + cudaFree(gpuGrad);
  161 + }
  162 +
  163 + }
  164 +}
  165 +
  166 +#endif
0 167 \ No newline at end of file
... ...
stim/cuda/ivote/vote_shared_32-32.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_SHARED_H
  2 +#define STIM_CUDA_VOTE_SHARED
  3 +# include <iostream>
  4 +# include <cuda.h>
  5 +#include <stim/cuda/cudatools.h>
  6 +#include <stim/cuda/sharedmem.cuh>
  7 +#include "cpyToshare.cuh"
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  13 + template<typename T>
  14 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  15 +
  16 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  17 + extern __shared__ float s_grad[];
  18 +
  19 + //calculate the start point for this block
  20 + int bxi = blockIdx.x * blockDim.x;
  21 + int byi = blockIdx.y * blockDim.y;
  22 + // calculate the 2D coordinates for this current thread.
  23 + int xi = bxi + threadIdx.x;
  24 + int yi = byi + threadIdx.y;
  25 + // convert 2D coordinates to 1D
  26 + int i = yi * x + xi;
  27 +
  28 + // define a local variable to sum the votes from the voters
  29 + float sum = 0;
  30 +
  31 + //calculate the width of the shared memory block
  32 + int xwidth = 2 * rmax + blockDim.x;
  33 + int ywidth = 2 * rmax + blockDim.y;
  34 + // compute the size of window which will be checked for finding the proper voters for this pixel
  35 + int x_table = 2*rmax +1;
  36 + int rmax_sq = rmax * rmax;
  37 + int tx_rmax = threadIdx.x + rmax;
  38 + int bxs = bxi - rmax;
  39 + int bys = byi - rmax;
  40 + //compute the coordinations of this pixel in the 2D-shared memory.
  41 + int sx_rx = threadIdx.x + rmax;
  42 + int sy_ry = threadIdx.y + rmax;
  43 + //copy the portion of the image necessary for this block to shared memory
  44 + __syncthreads();
  45 + cpyG2S2D2ch<float>(s_grad, gpuGrad, bxs, bys, 2*xwidth, ywidth, threadIdx, blockDim, x, y);
  46 + __syncthreads();
  47 +
  48 + for(int yr = -rmax; yr <= rmax; yr++){
  49 + int yi_v = (yi + yr) ;
  50 + //compute the position of the current voter in the shared memory along the y axis.
  51 + unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
  52 + //if (yi+yr<y && yi+yr>=0){
  53 + if(xi < x && yi < y){
  54 +
  55 + for(int xr = -rmax; xr <= rmax; xr++){
  56 +
  57 + //compute the position of the current voter in the 2D-shared memory along the x axis.
  58 + unsigned int sIdx_x = (sx_rx + xr);
  59 + //find the 1D index of this voter in the 2D-shared memory.
  60 + unsigned int s_Idx = (sIdx_y1d + sIdx_x);
  61 + unsigned int s_Idx2 = s_Idx * 2;
  62 +
  63 + //find the location of this voter in the atan2 table
  64 + int id_t = (yr + rmax) * x_table + xr + rmax;
  65 +
  66 + // calculate the angle between the pixel and the current voter in x and y directions
  67 + float atan_angle = gpuTable[id_t];
  68 +
  69 + // calculate the voting direction based on the grtadient direction
  70 + //int idx_share = xr + tx_rmax ;
  71 + float theta = s_grad[s_Idx2];
  72 + float mag = s_grad[s_Idx2 + 1];
  73 +
  74 +
  75 + // check if the current voter is located in the voting area of this pixel.
  76 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  77 + sum += mag;
  78 +
  79 + }
  80 + }
  81 +
  82 + }
  83 + //}
  84 + }
  85 + if(xi < x && yi < y)
  86 + gpuVote[i] = sum;
  87 +
  88 + }
  89 +
  90 + template<typename T>
  91 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  92 +
  93 +
  94 + unsigned int max_threads = stim::maxThreadsPerBlock();
  95 + dim3 threads(sqrt(max_threads), sqrt(max_threads));
  96 + dim3 blocks(x/threads.x + 1 , y/threads.y+1);
  97 +
  98 +
  99 + // specify share memory
  100 + unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*2*sizeof(T);
  101 +
  102 + //call the kernel to do the voting
  103 + cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  104 +
  105 + }
  106 +
  107 +
  108 + template<typename T>
  109 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  110 +
  111 + //calculate the number of bytes in the array
  112 + unsigned int bytes = x * y * sizeof(T);
  113 +
  114 + //calculate the number of bytes in the atan2 table
  115 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  116 +
  117 + //allocate space on the GPU for the Vote Image
  118 + T* gpuVote;
  119 + cudaMalloc(&gpuVote, bytes);
  120 +
  121 + //allocate space on the GPU for the input Gradient image
  122 + T* gpuGrad;
  123 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  124 +
  125 + //copy the Gradient Magnitude data to the GPU
  126 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  127 +
  128 + //allocate space on the GPU for the atan2 table
  129 + T* gpuTable;
  130 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  131 +
  132 + //copy the atan2 values to the GPU
  133 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  134 +
  135 + //call the GPU version of the vote calculation function
  136 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  137 +
  138 + //copy the Vote Data back to the CPU
  139 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  140 +
  141 + //free allocated memory
  142 + cudaFree(gpuTable);
  143 + cudaFree(gpuVote);
  144 + cudaFree(gpuGrad);
  145 + }
  146 +
  147 + }
  148 +}
  149 +
  150 +#endif
0 151 \ No newline at end of file
... ...
stim/cuda/ivote_atomic.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_IVOTE_ATOMIC_H
  2 +#define STIM_CUDA_IVOTE_ATOMIC_H
  3 +
  4 +#include <stim/cuda/ivote/down_sample.cuh>
  5 +#include <stim/cuda/ivote/local_max.cuh>
  6 +#include <stim/cuda/ivote/update_dir_global.cuh>
  7 +//#include <stim/cuda/ivote/vote_shared_32-32.cuh>
  8 +#include <stim/cuda/ivote/vote_atomic_shared.cuh>
  9 +#include <stim/cuda/ivote/re_sample.cuh>
  10 +namespace stim{
  11 + namespace cuda{
  12 +
  13 + }
  14 +}
  15 +
  16 +
  17 +
  18 +#endif
0 19 \ No newline at end of file
... ...