Commit 213d109cb4b746856b5b3e7760ee6d8ef7ed90e7

Authored by Pavel Govyadinov
2 parents 4fa3f483 ca99f951

Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib

stim/cuda/arraymath.cuh
... ... @@ -10,7 +10,8 @@
10 10 #include <stim/cuda/arraymath/array_atan.cuh>
11 11 #include <stim/cuda/arraymath/array_abs.cuh>
12 12 #include <stim/cuda/arraymath/array_cart2polar.cuh>
13   -
  13 +//#include <stim/cuda/arraymath/array_threshold.cuh>
  14 +//#include <stim/cuda/arraymath/array_add3.cuh>
14 15 namespace stim{
15 16 namespace cuda{
16 17  
... ...
stim/cuda/ivote/david_update_dir_global.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
  2 +#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include <math.h>
  9 +#include "cpyToshare.cuh"
  10 +
  11 +#define RMAX_TEST 8
  12 +
  13 +namespace stim{
  14 + namespace cuda{
  15 +
  16 + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
  17 + template<typename T>
  18 + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  19 + extern __shared__ T atan2_table[];
  20 +
  21 + //calculate the start point for this block
  22 + //int bxi = blockIdx.x * blockDim.x;
  23 +
  24 + stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x);
  25 +
  26 + __syncthreads();
  27 +
  28 + // calculate the 2D coordinates for this current thread.
  29 + //int xi = bxi + threadIdx.x;
  30 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  31 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  32 + if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel
  33 +
  34 + int i = yi * x + xi; // convert 2D coordinates to 1D
  35 +
  36 + float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch
  37 + gpuDir[i] = 0; //initialize the vote direction to zero
  38 + float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter
  39 + int id_x = 0; // define two local variables for the x and y position of the maximum
  40 + int id_y = 0;
  41 +
  42 + int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter
  43 + int rmax_sq = rmax * rmax;
  44 + int tx_rmax = threadIdx.x + rmax;
  45 + float atan_angle;
  46 + float vote_c;
  47 + unsigned int ind_t;
  48 + for(int yr = -rmax; yr <= rmax; yr++){ //for each counter in the y direction
  49 + if (yi+yr >= 0 && yi + yr < y){ //if the counter exists (we aren't looking outside of the image)
  50 + for(int xr = -rmax; xr <= rmax; xr++){ //for each counter in the x direction
  51 + if((xr * xr + yr *yr)< rmax_sq){ //if the counter is within range of the voter
  52 +
  53 + ind_t = (rmax - yr) * x_table + rmax - xr; //calculate the index to the atan2 table
  54 + atan_angle = atan2_table[ind_t]; //retrieve the direction vector from the table
  55 +
  56 + //atan_angle = atan2((float)yr, (float)xr);
  57 +
  58 + if (abs(atan_angle - theta) <phi){ // check if the current pixel is located in the voting angle of this voter.
  59 + vote_c = gpuVote[(yi+yr)*x + (xi+xr)]; // find the vote value for the current counter
  60 + if(vote_c>max) { // compare the vote value of this pixel with the max value to find the maxima and its index.
  61 + max = vote_c;
  62 + id_x = xr;
  63 + id_y = yr;
  64 + }
  65 + }
  66 + }
  67 + }
  68 + }
  69 + }
  70 +
  71 + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
  72 + float new_angle = gpuTable[ind_m];
  73 +
  74 + if(xi < x && yi < y)
  75 + gpuDir[i] = new_angle;
  76 + } //end kernel
  77 +
  78 + // this kernel updates the gradient direction by the calculated voting direction.
  79 + template<typename T>
  80 + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
  81 +
  82 + // calculate the 2D coordinates for this current thread.
  83 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  84 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  85 +
  86 + // convert 2D coordinates to 1D
  87 + int i = yi * x + xi;
  88 +
  89 + //update the gradient image with the vote direction
  90 + gpuGrad[2*i] = gpuDir[i];
  91 + }
  92 +
  93 + template<typename T>
  94 + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  95 +
  96 +
  97 +
  98 + //calculate the number of bytes in the array
  99 + unsigned int bytes = x * y * sizeof(T);
  100 +
  101 + unsigned int max_threads = stim::maxThreadsPerBlock();
  102 +
  103 + dim3 threads(sqrt(max_threads), sqrt(max_threads));
  104 + dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  105 +
  106 +
  107 +
  108 + // allocate space on the GPU for the updated vote direction
  109 + T* gpuDir;
  110 + cudaMalloc(&gpuDir, bytes);
  111 +
  112 + size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2);
  113 + std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl;
  114 +
  115 + //call the kernel to calculate the new voting direction
  116 + cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  117 +
  118 + //call the kernel to update the gradient direction
  119 + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
  120 +
  121 + //free allocated memory
  122 + cudaFree(gpuDir);
  123 +
  124 + }
  125 +
  126 + template<typename T>
  127 + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  128 +
  129 + //calculate the number of bytes in the array
  130 + unsigned int bytes = x * y * sizeof(T);
  131 +
  132 + //calculate the number of bytes in the atan2 table
  133 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  134 +
  135 + //allocate space on the GPU for the Vote Image
  136 + T* gpuVote;
  137 + cudaMalloc(&gpuVote, bytes);
  138 +
  139 + //copy the input vote image to the GPU
  140 + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
  141 +
  142 + //allocate space on the GPU for the input Gradient image
  143 + T* gpuGrad;
  144 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  145 +
  146 + //copy the Gradient data to the GPU
  147 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  148 +
  149 + //allocate space on the GPU for the atan2 table
  150 + T* gpuTable;
  151 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  152 +
  153 + //copy the atan2 values to the GPU
  154 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  155 +
  156 + //call the GPU version of the update direction function
  157 + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  158 +
  159 + //copy the new gradient image back to the CPU
  160 + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
  161 +
  162 + //free allocated memory
  163 + cudaFree(gpuTable);
  164 + cudaFree(gpuVote);
  165 + cudaFree(gpuGrad);
  166 + }
  167 +
  168 + }
  169 +}
  170 +
  171 +#endif
0 172 \ No newline at end of file
... ...
stim/cuda/ivote/local_max.cuh
... ... @@ -29,17 +29,33 @@ namespace stim{
29 29 //compare to the threshold
30 30 if(val < final_t) return;
31 31  
  32 + //define an array to store indices with same vote value
  33 + /*int * IdxEq;
  34 + IdxEq = new int [2*conn];
  35 + int n = 0;*/
  36 +
32 37 for(int xl = xi - conn; xl < xi + conn; xl++){
33 38 for(int yl = yi - conn; yl < yi + conn; yl++){
34 39 if(xl >= 0 && xl < x && yl >= 0 && yl < y){
35 40 int il = yl * x + xl;
36 41 if(gpuVote[il] > val){
37 42 return;
38   - }
39   - }
  43 + }
  44 + if (gpuVote[il] == val){
  45 + /*IdxEq[n] = il;
  46 + n = n+1;*/
  47 + if( il > i){
  48 + return;
  49 + }
  50 + }
  51 + }
40 52 }
41 53 }
42   -
  54 + /*if (n!=0){
  55 + if(IdxEq[n/2] !=i){
  56 + return;
  57 + }
  58 + } */
43 59 gpuCenters[i] = 1;
44 60 }
45 61  
... ...
stim/cuda/ivote/update_dir_global.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
  2 +#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include <stim/visualization/aabb2.h>
  9 +#include <stim/visualization/colormap.h>
  10 +#include <math.h>
  11 +#include "cpyToshare.cuh"
  12 +
  13 +//#define RMAX_TEST 8
  14 +
  15 +namespace stim{
  16 + namespace cuda{
  17 +
  18 + template<typename T>
  19 + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  20 + extern __shared__ T S[];
  21 + T* shared_atan = S;
  22 + size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
  23 + stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
  24 +
  25 + //T* shared_vote = &S[n_table];
  26 + //size_t template_size_x = (blockDim.x + 2 * rmax);
  27 + //size_t template_size_y = (blockDim.y + 2 * rmax);
  28 + //stim::cuda::threadedMemcpy2D((char*)shared_vote, (char*)gpuVote, template_size_x, template_size_y, x, threadIdx.y * blockDim.x + threadIdx.x, blockDim.x * blockDim.y);
  29 +
  30 + int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate the 2D coordinates for this current thread.
  31 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  32 +
  33 + if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel
  34 +
  35 + int i = yi * x + xi; //convert 2D coordinates to 1D
  36 + float theta = gpuGrad[2*i]; //calculate the voting direction based on the grtadient direction - global memory fetch
  37 +
  38 + stim::aabb2<int> bb(xi, yi); //initialize a bounding box at the current point
  39 + bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta)));
  40 + bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box
  41 + bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box
  42 +
  43 + int x_table = 2*rmax +1;
  44 + int lut_i;
  45 + T rmax_sq = rmax * rmax;
  46 + T dx_sq, dy_sq;
  47 +
  48 + bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image
  49 + bb.trim_high(x-1, y-1);
  50 +
  51 + int by, bx;
  52 + int dx, dy; //coordinate relative to (xi, yi)
  53 + T v;
  54 + T max_v = 0; //initialize the maximum vote value to zero
  55 + T alpha;
  56 + int max_dx = bb.low[0];
  57 + int max_dy = bb.low[1];
  58 + for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box
  59 + dy = by - yi; //calculate the y coordinate of the current point relative to yi
  60 + dy_sq = dy * dy;
  61 + for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
  62 + dx = bx - xi;
  63 + dx_sq = dx * dx;
  64 + lut_i = (rmax - dy) * x_table + rmax - dx;
  65 + alpha = shared_atan[lut_i];
  66 + if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
  67 + v = gpuVote[by * x + bx]; // find the vote value for the current counter
  68 + if(v > max_v){
  69 + max_v = v;
  70 + max_dx = dx;
  71 + max_dy = dy;
  72 + }
  73 + }
  74 + }
  75 + }
  76 + gpuDir[i] = atan2((T)max_dy, (T)max_dx);
  77 + }
  78 +
  79 + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
  80 + template<typename T>
  81 + __global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  82 +
  83 +
  84 + // calculate the 2D coordinates for this current thread.
  85 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  86 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  87 +
  88 + if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel
  89 +
  90 + int i = yi * x + xi; // convert 2D coordinates to 1D
  91 +
  92 + float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch
  93 + gpuDir[i] = 0; //initialize the vote direction to zero
  94 + float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter
  95 + int id_x = 0; // define two local variables for the x and y position of the maximum
  96 + int id_y = 0;
  97 +
  98 + int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter
  99 + int rmax_sq = rmax * rmax;
  100 + int tx_rmax = threadIdx.x + rmax;
  101 + float atan_angle;
  102 + float vote_c;
  103 + int xidx, yidx, yr_sq, xr_sq;
  104 + for(int yr = -rmax; yr <= rmax; yr++){
  105 + yidx = yi + yr; //compute the index into the image
  106 + if (yidx >= 0 && yidx < y){ //if the current y-index is inside the image
  107 + yr_sq = yr * yr; //compute the square of yr, to save time later
  108 + for(int xr = -rmax; xr <= rmax; xr++){
  109 + xidx = xi + xr;
  110 + if(xidx >= 0 && xidx < x){
  111 + xr_sq = xr * xr;
  112 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  113 +
  114 + // calculate the angle between the voter and the current pixel in x and y directions
  115 + atan_angle = gpuTable[ind_t];
  116 + //atan_angle = atan2((T)yr, (T)xr);
  117 +
  118 + // check if the current pixel is located in the voting area of this voter.
  119 + if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  120 +
  121 + vote_c = gpuVote[yidx * x + xidx]; // find the vote value for the current counter
  122 + // compare the vote value of this pixel with the max value to find the maxima and its index.
  123 + if (vote_c>max) {
  124 +
  125 + max = vote_c;
  126 + id_x = xr;
  127 + id_y = yr;
  128 + }
  129 + }
  130 + }
  131 + }
  132 + }
  133 + }
  134 +
  135 + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
  136 + float new_angle = gpuTable[ind_m];
  137 +
  138 + if(xi < x && yi < y)
  139 + gpuDir[i] = new_angle;
  140 + } //end kernel
  141 +
  142 +
  143 + // this kernel updates the gradient direction by the calculated voting direction.
  144 + template<typename T>
  145 + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
  146 +
  147 + // calculate the 2D coordinates for this current thread.
  148 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  149 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  150 +
  151 + if(xi >= x || yi >= y) return;
  152 +
  153 + // convert 2D coordinates to 1D
  154 + int i = yi * x + xi;
  155 +
  156 + //update the gradient image with the vote direction
  157 + gpuGrad[2*i] = gpuDir[i];
  158 + }
  159 +
  160 + template<typename T>
  161 + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  162 +
  163 + //calculate the number of bytes in the array
  164 + unsigned int bytes = x * y * sizeof(T);
  165 +
  166 + // allocate space on the GPU for the updated vote direction
  167 + T* gpuDir;
  168 + HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );
  169 +
  170 + unsigned int max_threads = stim::maxThreadsPerBlock();
  171 + //dim3 threads(min(x, max_threads), 1);
  172 + //dim3 blocks(x/threads.x, y);
  173 +
  174 + dim3 threads( sqrt(max_threads), sqrt(max_threads) );
  175 + dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  176 +
  177 + size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
  178 + //size_t curtain = 2 * rmax;
  179 + //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain);
  180 + size_t shared_mem_req = table_bytes;// + template_bytes;
  181 + std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
  182 +
  183 + size_t shared_mem = stim::sharedMemPerBlock();
  184 + if(shared_mem_req > shared_mem){
  185 + std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
  186 + exit(1);
  187 + }
  188 +
  189 + //call the kernel to calculate the new voting direction
  190 + cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  191 + stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
  192 +
  193 + //exit(0);
  194 +
  195 + threads = dim3( sqrt(max_threads), sqrt(max_threads) );
  196 + blocks = dim3(x/threads.x + 1, y/threads.y + 1);
  197 +
  198 + //call the kernel to update the gradient direction
  199 + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
  200 + //free allocated memory
  201 + HANDLE_ERROR( cudaFree(gpuDir) );
  202 +
  203 + }
  204 +
  205 + template<typename T>
  206 + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  207 +
  208 + //calculate the number of bytes in the array
  209 + unsigned int bytes = x * y * sizeof(T);
  210 +
  211 + //calculate the number of bytes in the atan2 table
  212 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  213 +
  214 + //allocate space on the GPU for the Vote Image
  215 + T* gpuVote;
  216 + cudaMalloc(&gpuVote, bytes);
  217 +
  218 + //copy the input vote image to the GPU
  219 + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
  220 +
  221 + //allocate space on the GPU for the input Gradient image
  222 + T* gpuGrad;
  223 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  224 +
  225 + //copy the Gradient data to the GPU
  226 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  227 +
  228 + //allocate space on the GPU for the atan2 table
  229 + T* gpuTable;
  230 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  231 +
  232 + //copy the atan2 values to the GPU
  233 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  234 +
  235 + //call the GPU version of the update direction function
  236 + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  237 +
  238 + //copy the new gradient image back to the CPU
  239 + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
  240 +
  241 + //free allocated memory
  242 + cudaFree(gpuTable);
  243 + cudaFree(gpuVote);
  244 + cudaFree(gpuGrad);
  245 + }
  246 +
  247 + }
  248 +}
  249 +
  250 +#endif
0 251 \ No newline at end of file
... ...
stim/cuda/ivote/update_dir_shared.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_UPDATE_DIR_SHARED_H
  2 +#define STIM_CUDA_UPDATE_DIR_SHARED_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include "cpyToshare.cuh"
  9 +
  10 +namespace stim{
  11 + namespace cuda{
  12 +
  13 + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
  14 + template<typename T>
  15 + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  16 +
  17 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  18 + extern __shared__ float s_vote[];
  19 +
  20 + //calculate the start point for this block
  21 + int bxi = blockIdx.x * blockDim.x;
  22 +
  23 + // calculate the 2D coordinates for this current thread.
  24 + int xi = bxi + threadIdx.x;
  25 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  26 + // convert 2D coordinates to 1D
  27 + int i = yi * x + xi;
  28 +
  29 + // calculate the voting direction based on the grtadient direction
  30 + float theta = gpuGrad[2*i];
  31 +
  32 + //initialize the vote direction to zero
  33 + gpuDir[i] = 0;
  34 +
  35 + // define a local variable to maximum value of the vote image in the voting area for this voter
  36 + float max = 0;
  37 +
  38 + // define two local variables for the x and y coordinations where the maximum happened
  39 + int id_x = 0;
  40 + int id_y = 0;
  41 +
  42 + //calculate the width of the shared memory block
  43 + int swidth = 2 * rmax + blockDim.x;
  44 +
  45 + // compute the size of window which will be checked for finding the voting area for this voter
  46 + int x_table = 2*rmax +1;
  47 + int rmax_sq = rmax * rmax;
  48 + int tx_rmax = threadIdx.x + rmax;
  49 + int bxs = bxi - rmax;
  50 +
  51 + for(int yr = -rmax; yr <= rmax; yr++){
  52 + //if (yi+yr >= 0 && yi + yr < y){
  53 + //copy the portion of the image necessary for this block to shared memory
  54 + __syncthreads();
  55 + cpyG2S1D<float>(s_vote, gpuVote, bxs, yi + yr , swidth, 1, threadIdx, blockDim, x, y);
  56 + __syncthreads();
  57 +
  58 + //if the current thread is outside of the image, it doesn't have to be computed
  59 + if(xi < x && yi < y){
  60 +
  61 + for(int xr = -rmax; xr <= rmax; xr++){
  62 +
  63 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  64 +
  65 + // calculate the angle between the voter and the current pixel in x and y directions
  66 + float atan_angle = gpuTable[ind_t];
  67 +
  68 + // calculate the voting direction based on the grtadient direction
  69 + int idx_share_update = xr + tx_rmax ;
  70 + float share_vote = s_vote[idx_share_update];
  71 +
  72 + // check if the current pixel is located in the voting area of this voter.
  73 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  74 +
  75 + // compare the vote value of this pixel with the max value to find the maxima and its index.
  76 + if (share_vote>max) {
  77 +
  78 + max = share_vote;
  79 + id_x = xr;
  80 + id_y = yr;
  81 + }
  82 + }
  83 + }
  84 + }
  85 + //}
  86 + }
  87 +
  88 + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
  89 + float new_angle = gpuTable[ind_m];
  90 +
  91 + if(xi < x && yi < y)
  92 + gpuDir[i] = new_angle;
  93 +
  94 + }
  95 +
  96 + // this kernel updates the gradient direction by the calculated voting direction.
  97 + template<typename T>
  98 + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
  99 +
  100 + // calculate the 2D coordinates for this current thread.
  101 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  102 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  103 +
  104 + // convert 2D coordinates to 1D
  105 + int i = yi * x + xi;
  106 +
  107 + //update the gradient image with the vote direction
  108 + gpuGrad[2*i] = gpuDir[i];
  109 + }
  110 +
  111 + template<typename T>
  112 + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  113 +
  114 + //calculate the number of bytes in the array
  115 + unsigned int bytes = x * y * sizeof(T);
  116 +
  117 + unsigned int max_threads = stim::maxThreadsPerBlock();
  118 + dim3 threads(max_threads, 1);
  119 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  120 +
  121 + // specify share memory
  122 + unsigned int share_bytes = (2*rmax + threads.x)*(1)*4;
  123 +
  124 + // allocate space on the GPU for the updated vote direction
  125 + T* gpuDir;
  126 + cudaMalloc(&gpuDir, bytes);
  127 +
  128 + //call the kernel to calculate the new voting direction
  129 + cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  130 +
  131 + //call the kernel to update the gradient direction
  132 + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
  133 +
  134 + //free allocated memory
  135 + cudaFree(gpuDir);
  136 +
  137 + }
  138 +
  139 + template<typename T>
  140 + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  141 +
  142 + //calculate the number of bytes in the array
  143 + unsigned int bytes = x * y * sizeof(T);
  144 +
  145 + //calculate the number of bytes in the atan2 table
  146 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  147 +
  148 + //allocate space on the GPU for the Vote Image
  149 + T* gpuVote;
  150 + cudaMalloc(&gpuVote, bytes);
  151 +
  152 + //copy the input vote image to the GPU
  153 + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
  154 +
  155 + //allocate space on the GPU for the input Gradient image
  156 + T* gpuGrad;
  157 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  158 +
  159 + //copy the Gradient data to the GPU
  160 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  161 +
  162 + //allocate space on the GPU for the atan2 table
  163 + T* gpuTable;
  164 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  165 +
  166 + //copy the atan2 values to the GPU
  167 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  168 +
  169 + //call the GPU version of the update direction function
  170 + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  171 +
  172 + //copy the new gradient image back to the CPU
  173 + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
  174 +
  175 + //free allocated memory
  176 + cudaFree(gpuTable);
  177 + cudaFree(gpuVote);
  178 + cudaFree(gpuGrad);
  179 + }
  180 +
  181 + }
  182 +}
  183 +
  184 +#endif
0 185 \ No newline at end of file
... ...
stim/cuda/ivote/vote_atomic.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_ATOMIC_H
  2 +#define STIM_CUDA_VOTE_ATOMIC_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include "cpyToshare.cuh"
  9 +
  10 +namespace stim{
  11 + namespace cuda{
  12 +
  13 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  14 + template<typename T>
  15 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  16 +
  17 +
  18 + // calculate the 2D coordinates for this current thread.
  19 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  20 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  21 + // convert 2D coordinates to 1D
  22 + int i = yi * x + xi;
  23 +
  24 + // calculate the voting direction based on the grtadient direction
  25 + float theta = gpuGrad[2*i];
  26 + //calculate the amount of vote for the voter
  27 + float mag = gpuGrad[2*i + 1];
  28 +
  29 + // compute the size of window which will be checked for finding the proper voters for this pixel
  30 + int x_table = 2*rmax +1;
  31 + int rmax_sq = rmax * rmax;
  32 + if(xi < x && yi < y){
  33 + //for every line (along y)
  34 + for(int yr = -rmax; yr <= rmax; yr++){
  35 + for(int xr = -rmax; xr <= rmax; xr++){
  36 + if ((yi+yr)>=0 && (yi+yr)<y && (xi+xr)>=0 && (xi+xr)<x){
  37 +
  38 + //find the location of the current pixel in the atan2 table
  39 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  40 +
  41 + // calculate the angle between the voter and the current pixel in x and y directions
  42 + float atan_angle = gpuTable[ind_t];
  43 +
  44 + // check if the current pixel is located in the voting area of this voter.
  45 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  46 + // calculate the 1D index for the current pixel in global memory
  47 + unsigned int ind_g = (yi+yr)*x + (xi+xr);
  48 + atomicAdd(&gpuVote[ind_g], mag);
  49 +
  50 + }
  51 + }
  52 + }
  53 + }
  54 + }
  55 + }
  56 +
  57 + template<typename T>
  58 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  59 +
  60 +
  61 + unsigned int max_threads = stim::maxThreadsPerBlock();
  62 + dim3 threads(max_threads, 1);
  63 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  64 +
  65 + // specify share memory
  66 + //unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
  67 +
  68 + //call the kernel to do the voting
  69 + cuda_vote <<< blocks, threads>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  70 +
  71 + }
  72 +
  73 +
  74 + template<typename T>
  75 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  76 +
  77 + //calculate the number of bytes in the array
  78 + unsigned int bytes = x * y * sizeof(T);
  79 +
  80 + //calculate the number of bytes in the atan2 table
  81 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  82 +
  83 + //allocate space on the GPU for the Vote Image
  84 + T* gpuVote;
  85 + cudaMalloc(&gpuVote, bytes);
  86 +
  87 + //allocate space on the GPU for the input Gradient image
  88 + T* gpuGrad;
  89 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  90 +
  91 + //copy the Gradient Magnitude data to the GPU
  92 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  93 +
  94 + //allocate space on the GPU for the atan2 table
  95 + T* gpuTable;
  96 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  97 +
  98 + //copy the atan2 values to the GPU
  99 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  100 +
  101 + //call the GPU version of the vote calculation function
  102 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  103 +
  104 + //copy the Vote Data back to the CPU
  105 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  106 +
  107 + //free allocated memory
  108 + cudaFree(gpuTable);
  109 + cudaFree(gpuVote);
  110 + cudaFree(gpuGrad);
  111 + }
  112 +
  113 + }
  114 +}
  115 +
  116 +#endif
0 117 \ No newline at end of file
... ...
stim/cuda/ivote/vote_atomic_shared.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_ATOMIC_SHARED_H
  2 +#define STIM_CUDA_VOTE_ATOMIC_SHARED_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include "cpyToshare.cuh"
  9 +//#include "writebackshared.cuh"
  10 +namespace stim{
  11 + namespace cuda{
  12 +
  13 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  14 + template<typename T>
  15 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  16 +
  17 + //generate a pointer to the shared memory
  18 + extern __shared__ float s_vote[];
  19 + // calculate the 2D coordinates for this current thread.
  20 + int bxi = blockIdx.x * blockDim.x;
  21 + int byi = blockIdx.y * blockDim.y;
  22 + int xi = bxi + threadIdx.x;
  23 + int yi = byi + threadIdx.y;
  24 + // convert 2D coordinates to 1D
  25 + int i = yi * x + xi;
  26 +
  27 + // calculate the voting direction based on the gradient direction
  28 + float theta = gpuGrad[2*i];
  29 + //calculate the amount of vote for the voter
  30 + float mag = gpuGrad[2*i + 1];
  31 +
  32 + //find the starting points and size of window, wich will be copied to the shared memory
  33 + int bxs = bxi - rmax;
  34 + int bys = byi - rmax;
  35 + int xwidth = 2*rmax + blockDim.x;
  36 + int ywidth = 2*rmax + blockDim.y;
  37 + //compute the coordinations of this pixel in the 2D-shared memory.
  38 + int sx_rx = threadIdx.x + rmax;
  39 + int sy_ry = threadIdx.y + rmax;
  40 + // compute the size of window which will be checked for finding the counters for this voter
  41 + int x_table = 2*rmax +1;
  42 + int rmax_sq = rmax * rmax;
  43 + //calculate some parameters for indexing shared memory
  44 + //calculate the total number of threads available
  45 + unsigned int tThreads = blockDim.x * blockDim.y;
  46 + //calculate the current 1D thread ID
  47 + unsigned int ti = threadIdx.y * (blockDim.x) + threadIdx.x;
  48 + //calculate the number of iteration required
  49 + unsigned int In = xwidth*ywidth/tThreads + 1;
  50 + if(xi < x && yi < y){
  51 + __syncthreads();
  52 + //initialize the shared memory to zero
  53 + for (unsigned int i = 0; i < In; i++){
  54 + unsigned int sIdx0 = i * tThreads + ti;
  55 + if (sIdx0< xwidth*ywidth) {
  56 + s_vote[sIdx0] = 0;
  57 + }
  58 + }
  59 + __syncthreads();
  60 + //for every line (along y)
  61 + for(int yr = -rmax; yr <= rmax; yr++){
  62 + //compute the position of the current voter in the shared memory along the y axis.
  63 + unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
  64 + for(int xr = -rmax; xr <= rmax; xr++){
  65 +
  66 + //find the location of the current pixel in the atan2 table
  67 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  68 +
  69 + // calculate the angle between the voter and the current pixel in x and y directions
  70 + float atan_angle = gpuTable[ind_t];
  71 +
  72 + // check if the current pixel is located in the voting area of this voter.
  73 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  74 + //compute the position of the current voter in the 2D-shared memory along the x axis.
  75 + unsigned int sIdx_x = (sx_rx + xr);
  76 + //find the 1D index of this voter in the 2D-shared memory.
  77 + unsigned int s_Idx = (sIdx_y1d + sIdx_x);
  78 +
  79 + atomicAdd(&s_vote[s_Idx], mag);
  80 + }
  81 + }
  82 + }
  83 + //write shared memory back to global memory
  84 +
  85 + __syncthreads();
  86 + for (unsigned int i = 0; i < In; i++){
  87 +
  88 + unsigned int sIdx = i * tThreads + ti;
  89 + if (sIdx>= xwidth*ywidth) return;
  90 +
  91 + unsigned int sy = sIdx/xwidth;
  92 + unsigned int sx = sIdx - (sy * xwidth);
  93 +
  94 + unsigned int gx = bxs + sx;
  95 + unsigned int gy = bys + sy;
  96 + if (gx<x&& gy<y){
  97 + unsigned int gIdx = gy * x + gx;
  98 + //write shared to global memory
  99 + atomicAdd(&gpuVote[gIdx], s_vote[sIdx]);
  100 +
  101 + }
  102 + }
  103 +
  104 + }
  105 + }
  106 +
  107 + template<typename T>
  108 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  109 +
  110 +
  111 + unsigned int max_threads = stim::maxThreadsPerBlock();
  112 + dim3 threads(sqrt(max_threads), sqrt(max_threads));
  113 + dim3 blocks(x/threads.x + 1 , y/threads.y+1);
  114 +
  115 + // specify share memory
  116 + unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*sizeof(T);
  117 +
  118 + //call the kernel to do the voting
  119 + cuda_vote <<< blocks, threads, share_bytes>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  120 +
  121 + }
  122 +
  123 +
  124 + template<typename T>
  125 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  126 +
  127 + //calculate the number of bytes in the array
  128 + unsigned int bytes = x * y * sizeof(T);
  129 +
  130 + //calculate the number of bytes in the atan2 table
  131 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  132 +
  133 + //allocate space on the GPU for the Vote Image
  134 + T* gpuVote;
  135 + cudaMalloc(&gpuVote, bytes);
  136 +
  137 + //allocate space on the GPU for the input Gradient image
  138 + T* gpuGrad;
  139 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  140 +
  141 + //copy the Gradient Magnitude data to the GPU
  142 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  143 +
  144 + //allocate space on the GPU for the atan2 table
  145 + T* gpuTable;
  146 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  147 +
  148 + //copy the atan2 values to the GPU
  149 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  150 +
  151 + //call the GPU version of the vote calculation function
  152 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  153 +
  154 + //copy the Vote Data back to the CPU
  155 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  156 +
  157 + //free allocated memory
  158 + cudaFree(gpuTable);
  159 + cudaFree(gpuVote);
  160 + cudaFree(gpuGrad);
  161 + }
  162 +
  163 + }
  164 +}
  165 +
  166 +#endif
0 167 \ No newline at end of file
... ...
stim/cuda/ivote/vote_shared_32-32.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_SHARED_H
  2 +#define STIM_CUDA_VOTE_SHARED
  3 +# include <iostream>
  4 +# include <cuda.h>
  5 +#include <stim/cuda/cudatools.h>
  6 +#include <stim/cuda/sharedmem.cuh>
  7 +#include "cpyToshare.cuh"
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  13 + template<typename T>
  14 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  15 +
  16 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  17 + extern __shared__ float s_grad[];
  18 +
  19 + //calculate the start point for this block
  20 + int bxi = blockIdx.x * blockDim.x;
  21 + int byi = blockIdx.y * blockDim.y;
  22 + // calculate the 2D coordinates for this current thread.
  23 + int xi = bxi + threadIdx.x;
  24 + int yi = byi + threadIdx.y;
  25 + // convert 2D coordinates to 1D
  26 + int i = yi * x + xi;
  27 +
  28 + // define a local variable to sum the votes from the voters
  29 + float sum = 0;
  30 +
  31 + //calculate the width of the shared memory block
  32 + int xwidth = 2 * rmax + blockDim.x;
  33 + int ywidth = 2 * rmax + blockDim.y;
  34 + // compute the size of window which will be checked for finding the proper voters for this pixel
  35 + int x_table = 2*rmax +1;
  36 + int rmax_sq = rmax * rmax;
  37 + int tx_rmax = threadIdx.x + rmax;
  38 + int bxs = bxi - rmax;
  39 + int bys = byi - rmax;
  40 + //compute the coordinations of this pixel in the 2D-shared memory.
  41 + int sx_rx = threadIdx.x + rmax;
  42 + int sy_ry = threadIdx.y + rmax;
  43 + //copy the portion of the image necessary for this block to shared memory
  44 + __syncthreads();
  45 + cpyG2S2D2ch<float>(s_grad, gpuGrad, bxs, bys, 2*xwidth, ywidth, threadIdx, blockDim, x, y);
  46 + __syncthreads();
  47 +
  48 + for(int yr = -rmax; yr <= rmax; yr++){
  49 + int yi_v = (yi + yr) ;
  50 + //compute the position of the current voter in the shared memory along the y axis.
  51 + unsigned int sIdx_y1d = (sy_ry + yr)* xwidth;
  52 + //if (yi+yr<y && yi+yr>=0){
  53 + if(xi < x && yi < y){
  54 +
  55 + for(int xr = -rmax; xr <= rmax; xr++){
  56 +
  57 + //compute the position of the current voter in the 2D-shared memory along the x axis.
  58 + unsigned int sIdx_x = (sx_rx + xr);
  59 + //find the 1D index of this voter in the 2D-shared memory.
  60 + unsigned int s_Idx = (sIdx_y1d + sIdx_x);
  61 + unsigned int s_Idx2 = s_Idx * 2;
  62 +
  63 + //find the location of this voter in the atan2 table
  64 + int id_t = (yr + rmax) * x_table + xr + rmax;
  65 +
  66 + // calculate the angle between the pixel and the current voter in x and y directions
  67 + float atan_angle = gpuTable[id_t];
  68 +
  69 + // calculate the voting direction based on the grtadient direction
  70 + //int idx_share = xr + tx_rmax ;
  71 + float theta = s_grad[s_Idx2];
  72 + float mag = s_grad[s_Idx2 + 1];
  73 +
  74 +
  75 + // check if the current voter is located in the voting area of this pixel.
  76 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  77 + sum += mag;
  78 +
  79 + }
  80 + }
  81 +
  82 + }
  83 + //}
  84 + }
  85 + if(xi < x && yi < y)
  86 + gpuVote[i] = sum;
  87 +
  88 + }
  89 +
  90 + template<typename T>
  91 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  92 +
  93 +
  94 + unsigned int max_threads = stim::maxThreadsPerBlock();
  95 + dim3 threads(sqrt(max_threads), sqrt(max_threads));
  96 + dim3 blocks(x/threads.x + 1 , y/threads.y+1);
  97 +
  98 +
  99 + // specify share memory
  100 + unsigned int share_bytes = (2*rmax + threads.x)*(2*rmax + threads.y)*2*sizeof(T);
  101 +
  102 + //call the kernel to do the voting
  103 + cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  104 +
  105 + }
  106 +
  107 +
  108 + template<typename T>
  109 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  110 +
  111 + //calculate the number of bytes in the array
  112 + unsigned int bytes = x * y * sizeof(T);
  113 +
  114 + //calculate the number of bytes in the atan2 table
  115 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  116 +
  117 + //allocate space on the GPU for the Vote Image
  118 + T* gpuVote;
  119 + cudaMalloc(&gpuVote, bytes);
  120 +
  121 + //allocate space on the GPU for the input Gradient image
  122 + T* gpuGrad;
  123 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  124 +
  125 + //copy the Gradient Magnitude data to the GPU
  126 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  127 +
  128 + //allocate space on the GPU for the atan2 table
  129 + T* gpuTable;
  130 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  131 +
  132 + //copy the atan2 values to the GPU
  133 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  134 +
  135 + //call the GPU version of the vote calculation function
  136 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  137 +
  138 + //copy the Vote Data back to the CPU
  139 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  140 +
  141 + //free allocated memory
  142 + cudaFree(gpuTable);
  143 + cudaFree(gpuVote);
  144 + cudaFree(gpuGrad);
  145 + }
  146 +
  147 + }
  148 +}
  149 +
  150 +#endif
0 151 \ No newline at end of file
... ...
stim/cuda/ivote_atomic.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_IVOTE_ATOMIC_H
  2 +#define STIM_CUDA_IVOTE_ATOMIC_H
  3 +
  4 +#include <stim/cuda/ivote/down_sample.cuh>
  5 +#include <stim/cuda/ivote/local_max.cuh>
  6 +#include <stim/cuda/ivote/update_dir_global.cuh>
  7 +//#include <stim/cuda/ivote/vote_shared_32-32.cuh>
  8 +#include <stim/cuda/ivote/vote_atomic_shared.cuh>
  9 +//#include <stim/cuda/ivote/re_sample.cuh>
  10 +namespace stim{
  11 + namespace cuda{
  12 +
  13 + }
  14 +}
  15 +
  16 +
  17 +
  18 +#endif
0 19 \ No newline at end of file
... ...
stim/cuda/sharedmem.cuh
... ... @@ -35,10 +35,8 @@ namespace stim{
35 35 }
36 36 }
37 37  
38   - // Copies values from global memory to shared memory, optimizing threads
39   - template<typename T>
40   - __device__ void sharedMemcpy(T* dest, T* src, size_t N, size_t tid, size_t nt){
41   -
  38 + // Threaded copying of data on a CUDA device.
  39 + __device__ void threadedMemcpy(char* dest, char* src, size_t N, size_t tid, size_t nt){
42 40 size_t I = N / nt + 1; //calculate the number of iterations required to make the copy
43 41 size_t xi = tid; //initialize the source and destination index to the thread ID
44 42 for(size_t i = 0; i < I; i++){ //for each iteration
... ... @@ -48,7 +46,37 @@ namespace stim{
48 46 }
49 47 }
50 48  
51   -
  49 + /// Threaded copying of 2D data on a CUDA device
  50 + /// @param dest is a linear destination array of size nx * ny
  51 + /// @param src is a 2D image stored as a linear array with a pitch of X
  52 + /// @param X is the number of bytes in a row of src
  53 + /// @param tid is a 1D id for the current thread
  54 + /// @param nt is the number of threads in the block
  55 + template<typename T>
  56 + __device__ void threadedMemcpy2D(T* dest, size_t nx, size_t ny,
  57 + T* src, size_t x, size_t y, size_t sX, size_t sY,
  58 + size_t tid, size_t nt){
  59 +
  60 + size_t vals = nx * ny; //calculate the total number of bytes to be copied
  61 + size_t I = vals / nt + 1; //calculate the number of iterations required to perform the copy
  62 +
  63 + size_t src_i, dest_i;
  64 + size_t dest_x, dest_y, src_x, src_y;
  65 + for(size_t i = 0; i < I; i++){ //for each iteration
  66 + dest_i = i * nt + tid; //calculate the index into the destination array
  67 + dest_y = dest_i / nx;
  68 + dest_x = dest_i - dest_y * nx;
  69 +
  70 + if(dest_y < ny && dest_x < nx){
  71 +
  72 + src_x = x + dest_x;
  73 + src_y = y + dest_y;
  74 +
  75 + src_i = src_y * sX + src_x;
  76 + dest[dest_i] = src[src_i];
  77 + }
  78 + }
  79 + }
52 80 }
53 81 }
54 82  
... ...
stim/cuda/templates/conv2sep.cuh
... ... @@ -30,7 +30,8 @@ namespace stim{
30 30 int byi = blockIdx.y;
31 31  
32 32 //copy the portion of the image necessary for this block to shared memory
33   - stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
  33 + //stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
  34 + stim::cuda::sharedMemcpy_tex2D<float>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
34 35  
35 36 //calculate the thread index
36 37 int ti = threadIdx.x;
... ... @@ -88,7 +89,8 @@ namespace stim{
88 89 int byi = blockIdx.y * blockDim.y;
89 90  
90 91 //copy the portion of the image necessary for this block to shared memory
91   - stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
  92 + //stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
  93 + stim::cuda::sharedMemcpy_tex2D<float>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
92 94  
93 95 //calculate the thread index
94 96 int ti = threadIdx.y;
... ...
stim/image/image.h
... ... @@ -160,11 +160,26 @@ public:
160 160 exit(1);
161 161 }
162 162 allocate(cvImage.cols, cvImage.rows, cvImage.channels()); //allocate space for the image
163   - T* cv_ptr = (T*)cvImage.data;
  163 + unsigned char* cv_ptr = (unsigned char*)cvImage.data;
164 164 if(C() == 1) //if this is a single-color image, just copy the data
165 165 memcpy(img, cv_ptr, bytes());
166 166 if(C() == 3) //if this is a 3-color image, OpenCV uses BGR interleaving
167   - set_interleaved_bgr(cv_ptr, X(), Y());
  167 + from_opencv(cv_ptr, X(), Y());
  168 + }
  169 +
  170 + void from_opencv(unsigned char* buffer, size_t width, size_t height){
  171 + allocate(width, height, 3);
  172 + T value;
  173 + size_t i;
  174 + for(size_t c = 0; c < C(); c++){ //copy directly
  175 + for(size_t y = 0; y < Y(); y++){
  176 + for(size_t x = 0; x < X(); x++){
  177 + i = y * X() * C() + x * C() + (2-c);
  178 + value = buffer[i];
  179 + img[idx(x, y, c)] = value;
  180 + }
  181 + }
  182 + }
168 183 }
169 184  
170 185 //save a file
... ... @@ -180,23 +195,35 @@ public:
180 195 cv::imwrite(filename, cvImage);
181 196 }
182 197  
  198 + void set_interleaved(T* buffer, size_t width, size_t height, size_t channels){
  199 + allocate(width, height, channels);
  200 + memcpy(img, buffer, bytes());
  201 + }
  202 +
183 203 //create an image from an interleaved buffer
184 204 void set_interleaved_rgb(T* buffer, size_t width, size_t height){
185   - allocate(width, height, 3);
186   - memcpy(img, buffer, bytes());
  205 + set_interleaved(buffer, width, height, 3);
187 206 }
188 207  
189 208 void set_interleaved_bgr(T* buffer, size_t width, size_t height){
190 209 allocate(width, height, 3);
  210 + T value;
  211 + size_t i;
191 212 for(size_t c = 0; c < C(); c++){ //copy directly
192 213 for(size_t y = 0; y < Y(); y++){
193 214 for(size_t x = 0; x < X(); x++){
194   - img[idx(x, y, c)] = buffer[y * X() * C() + x * C() + (2-c)];
  215 + i = y * X() * C() + x * C() + (2-c);
  216 + value = buffer[i];
  217 + img[idx(x, y, c)] = value;
195 218 }
196 219 }
197 220 }
198 221 }
199 222  
  223 + void set_interleaved(T* buffer, size_t width, size_t height){
  224 + set_interleaved_rgb(buffer, width, height);
  225 + }
  226 +
200 227 void get_interleaved_bgr(T* data){
201 228  
202 229 //for each channel
... ...
stim/optics/scalarfield.h
... ... @@ -71,8 +71,8 @@ public:
71 71 void to_cpu(){
72 72 if(loc == CPUmem) return;
73 73 else{
74   - stim::complex<T>* host_E = (stim::complex<T>*) malloc(e_bytes()); //allocate space in main memory
75   - HANDLE_ERROR( cudaMemcpy(host_E, E, e_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU
  74 + stim::complex<T>* host_E = (stim::complex<T>*) malloc(grid_bytes()); //allocate space in main memory
  75 + HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU
76 76 HANDLE_ERROR( cudaFree(E) ); //free device memory
77 77 E = host_E; //swap pointers
78 78 }
... ...
stim/visualization/aabb2.h 0 → 100644
  1 +#ifndef STIM_AABB2_H
  2 +#define STIM_AABB2_H
  3 +
  4 +#include <stim/cuda/cudatools/callable.h>
  5 +
  6 +namespace stim{
  7 +
  8 +/// Structure for a 2D axis aligned bounding box
  9 +template<typename T>
  10 +struct aabb2{
  11 +
  12 +//protected:
  13 +
  14 + T low[2]; //top left corner position
  15 + T high[2]; //dimensions along x and y
  16 +
  17 +//public:
  18 +
  19 + CUDA_CALLABLE aabb2(T x, T y){ //initialize an axis aligned bounding box of size 0 at the given position
  20 + low[0] = high[0] = x; //set the position to the user specified coordinates
  21 + low[1] = high[1] = y;
  22 + }
  23 +
  24 + //insert a point into the bounding box, growing the box appropriately
  25 + CUDA_CALLABLE void insert(T x, T y){
  26 + if(x < low[0]) low[0] = x;
  27 + if(y < low[1]) low[1] = y;
  28 +
  29 + if(x > high[0]) high[0] = x;
  30 + if(y > high[1]) high[1] = y;
  31 + }
  32 +
  33 + //trim the bounding box so that the lower bounds are (x, y)
  34 + CUDA_CALLABLE void trim_low(T x, T y){
  35 + if(low[0] < x) low[0] = x;
  36 + if(low[1] < y) low[1] = y;
  37 + }
  38 +
  39 + CUDA_CALLABLE void trim_high(T x, T y){
  40 + if(high[0] > x) high[0] = x;
  41 + if(high[1] > y) high[1] = y;
  42 + }
  43 +
  44 +};
  45 +
  46 +}
  47 +
  48 +
  49 +#endif
0 50 \ No newline at end of file
... ...