Commit 276f9b23d700babac82af74c461e6055fbd69eb0

Authored by David Mayerich
1 parent c282a801

forced matching to MATLAB, removed compiler warnings

stim/cuda/arraymath/array_cart2polar.cuh
... ... @@ -4,14 +4,14 @@
4 4 namespace stim{
5 5 namespace cuda{
6 6 template<typename T>
7   - __global__ void cuda_cart2polar(T* a, int x, int y, float rotation){
  7 + __global__ void cuda_cart2polar(T* a, size_t x, size_t y, float rotation){
8 8  
9 9  
10 10 // calculate the 2D coordinates for this current thread.
11   - int xi = blockIdx.x * blockDim.x + threadIdx.x;
12   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
  11 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
  12 + size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
13 13 // convert 2D coordinates to 1D
14   - int i = yi * x + xi;
  14 + size_t i = yi * x + xi;
15 15  
16 16  
17 17 if(xi >= x || yi >= y) return;
... ... @@ -27,11 +27,11 @@ namespace stim{
27 27  
28 28  
29 29 template<typename T>
30   - void gpu_cart2polar(T* gpuGrad, unsigned int x, unsigned int y, float rotation = 0){
  30 + void gpu_cart2polar(T* gpuGrad, size_t x, size_t y, float rotation = 0){
31 31  
32 32 unsigned int max_threads = stim::maxThreadsPerBlock();
33 33 dim3 threads(max_threads, 1);
34   - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  34 + dim3 blocks(((unsigned int)x/threads.x) + ((unsigned int)x %threads.x == 0 ? 0:1) , (unsigned int)y);
35 35  
36 36 //call the kernel to do the multiplication
37 37 cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y, rotation);
... ... @@ -40,11 +40,11 @@ namespace stim{
40 40  
41 41  
42 42 template<typename T>
43   - void cpu_cart2polar(T* a, unsigned int x, unsigned int y){
  43 + void cpu_cart2polar(T* a, size_t x, size_t y){
44 44  
45 45 //calculate the number of bytes in the array
46   - unsigned int N = x *y;
47   - unsigned int bytes = N * sizeof(T) * 2;
  46 + size_t N = x * y;
  47 + size_t bytes = N * sizeof(T) * 2;
48 48  
49 49 //allocate memory on the GPU for the array
50 50 T* gpuA;
... ...
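
Note: the int-to-size_t change in array_cart2polar.cuh keeps the flattened 1D index from overflowing a 32-bit signed integer once images approach 2^31 elements. A minimal sketch of the pattern (a standalone illustrative kernel, not part of the library):

    // Illustrative only: size_t indexing for a 2D image flattened to 1D
    template<typename T>
    __global__ void scale2d(T* a, size_t x, size_t y, T s){
        size_t xi = blockIdx.x * blockDim.x + threadIdx.x;   // column of this thread
        size_t yi = blockIdx.y * blockDim.y + threadIdx.y;   // row of this thread
        if(xi >= x || yi >= y) return;                       // skip threads outside the image
        size_t i = yi * x + xi;                              // 64-bit flattened index; yi * x cannot wrap
        a[i] *= s;                                           // touch each element exactly once
    }
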
stim/cuda/ivote/local_max.cuh
... ... @@ -10,17 +10,17 @@ namespace stim{
10 10  
11 11 // this kernel calculates the local maximum for finding the cell centers
12 12 template<typename T>
13   - __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x, int y){
  13 + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, size_t x, size_t y){
14 14  
15 15 // calculate the 2D coordinates for this current thread.
16   - int xi = blockIdx.x * blockDim.x + threadIdx.x;
17   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
  16 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
  17 + size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
18 18  
19 19 if(xi >= x || yi >= y)
20 20 return;
21 21  
22 22 // convert 2D coordinates to 1D
23   - int i = yi * x + xi;
  23 + size_t i = yi * x + xi;
24 24  
25 25 gpuCenters[i] = 0; //initialize the value at this location to zero
26 26  
... ... @@ -61,13 +61,13 @@ namespace stim{
61 61 }
62 62  
63 63 template<typename T>
64   - void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, unsigned int x, unsigned int y){
  64 + void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, size_t x, size_t y){
65 65  
66 66 unsigned int max_threads = stim::maxThreadsPerBlock();
67 67 /*dim3 threads(max_threads, 1);
68 68 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/
69   - dim3 threads( sqrt(max_threads), sqrt(max_threads) );
70   - dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  69 + dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
  70 + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
71 71  
72 72 //call the kernel to find the local maximum.
73 73 cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, conn, x, y);
... ...
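
Note: the explicit unsigned int casts in gpu_local_max() silence implicit-conversion warnings when a dim3 launch configuration is built from size_t and double values. A hedged sketch of the pattern (the helper below is illustrative, not part of the library):

    // Illustrative helper: square 2D launch configuration for an x-by-y image
    inline void square_launch(size_t x, size_t y, dim3& threads, dim3& blocks){
        unsigned int max_threads = stim::maxThreadsPerBlock();        // device limit, e.g. 1024
        unsigned int side = (unsigned int)sqrt((double)max_threads);  // square block, e.g. 32 x 32
        threads = dim3(side, side);
        blocks  = dim3((unsigned int)(x / threads.x) + 1,             // +1 block covers any remainder;
                       (unsigned int)(y / threads.y) + 1);            // extra threads exit at the bounds check
    }
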
stim/cuda/ivote/update_dir_bb.cuh
... ... @@ -81,26 +81,26 @@ namespace stim{
81 81  
82 82 // this kernel updates the gradient direction by the calculated voting direction.
83 83 template<typename T>
84   - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
  84 + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, size_t x, size_t y){
85 85  
86 86 // calculate the 2D coordinates for this current thread.
87   - int xi = blockIdx.x * blockDim.x + threadIdx.x;
88   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
  87 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
  88 + size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
89 89  
90 90 if(xi >= x || yi >= y) return;
91 91  
92 92 // convert 2D coordinates to 1D
93   - int i = yi * x + xi;
  93 + size_t i = yi * x + xi;
94 94  
95 95 //update the gradient image with the vote direction
96 96 gpuGrad[2*i] = gpuDir[i];
97 97 }
98 98  
99 99 template<typename T>
100   - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  100 + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y){
101 101  
102 102 //calculate the number of bytes in the array
103   - unsigned int bytes = x * y * sizeof(T);
  103 + size_t bytes = x * y * sizeof(T);
104 104  
105 105 // allocate space on the GPU for the updated vote direction
106 106 T* gpuDir;
... ... @@ -108,14 +108,14 @@ namespace stim{
108 108  
109 109 unsigned int max_threads = stim::maxThreadsPerBlock();
110 110  
111   - dim3 threads( sqrt(max_threads), sqrt(max_threads) );
112   - dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  111 + dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
  112 + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
113 113  
114 114 size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
115 115 //size_t curtain = 2 * rmax;
116 116 //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain);
117 117 size_t shared_mem_req = table_bytes;// + template_bytes;
118   - std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
  118 + if (DEBUG) std::cout << "Shared Memory required: " << shared_mem_req << std::endl;
119 119  
120 120 size_t shared_mem = stim::sharedMemPerBlock();
121 121 if(shared_mem_req > shared_mem){
... ... @@ -124,16 +124,10 @@ namespace stim{
124 124 }
125 125  
126 126 //call the kernel to calculate the new voting direction
127   - cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
128   - //stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
129   -
130   - //exit(0);
131   -
132   - //threads = dim3( sqrt(max_threads), sqrt(max_threads) );
133   - //blocks = dim3(x/threads.x + 1, y/threads.y + 1);
  127 + cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, (int)x , (int)y);
134 128  
135 129 //call the kernel to update the gradient direction
136   - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
  130 + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, (int)x , (int)y);
137 131 //free allocated memory
138 132 HANDLE_ERROR( cudaFree(gpuDir) );
139 133  
... ...
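
Note: the shared-memory requirement in gpu_update_dir() grows with the square of the voting radius, which is why the diagnostic print (now gated on DEBUG) and the sharedMemPerBlock() check matter. A worked example, assuming T = float:

    // rmax = 100  ->  atan2 table side = 2 * rmax + 1 = 201
    // table_bytes = sizeof(float) * 201 * 201 = 4 * 40401 = 161604 bytes (~158 KB)
    // This exceeds the common 48 KB shared-memory-per-block limit, so the guard
    // reports the failure instead of launching a kernel that cannot be configured.
    unsigned int rmax = 100;
    size_t table_bytes = sizeof(float) * (rmax * 2 + 1) * (rmax * 2 + 1);   // 161604
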
stim/cuda/ivote/vote_atomic_bb.cuh
... ... @@ -9,12 +9,14 @@
9 9 #include <stim/visualization/colormap.h>
10 10 #include <math.h>
11 11  
  12 +
  13 +
12 14 namespace stim{
13 15 namespace cuda{
14 16  
15 17 // this kernel calculates the vote value by adding up the gradient magnitudes of every voter whose voting area contains this pixel
16 18 template<typename T>
17   - __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  19 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, size_t x, size_t y, bool gradmag = true){
18 20  
19 21 extern __shared__ T S[];
20 22 T* shared_atan = S;
... ... @@ -22,12 +24,12 @@ namespace stim{
22 24 stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
23 25  
24 26 // calculate the 2D coordinates for this current thread.
25   - int xi = blockIdx.x * blockDim.x + threadIdx.x;
26   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
  27 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
  28 + size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
27 29  
28 30 if(xi >= x || yi >= y) return;
29 31 // convert 2D coordinates to 1D
30   - int i = yi * x + xi;
  32 + size_t i = yi * x + xi;
31 33  
32 34 // calculate the voting direction based on the gradient direction
33 35 float theta = gpuGrad[2*i];
... ... @@ -50,7 +52,7 @@ namespace stim{
50 52 bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image
51 53 bb.trim_high(x-1, y-1);
52 54  
53   - int by, bx;
  55 + size_t by, bx;
54 56 int dx, dy;
55 57  
56 58 unsigned int ind_g; //initialize the maximum vote value to zero
... ... @@ -66,7 +68,8 @@ namespace stim{
66 68 alpha = shared_atan[lut_i];
67 69 if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
68 70 ind_g = (by)*x + (bx);
69   - atomicAdd(&gpuVote[ind_g], mag);
  71 + if(gradmag) atomicAdd(&gpuVote[ind_g], mag); //add the gradient magnitude (if the gradmag flag is enabled)
  72 + else atomicAdd(&gpuVote[ind_g], 1.0f); //otherwise just add 1
70 73  
71 74 }
72 75 }
... ... @@ -75,16 +78,22 @@ namespace stim{
75 78 }
76 79  
77 80  
  81 + /// Iterative voting for an image
  82 + /// @param gpuVote is the resulting vote image
  83 + /// @param gpuGrad is the gradient of the input image
  84 + /// @param gpuTable is the pre-computed atan2() table
  85 + /// @param phi is the angle of the vote region
  86 + /// @param rmax is the estimated radius of the blob (defines the "width" of the vote region)
  87 + /// @param x and y are the spatial dimensions of the gradient image
  88 + /// @param gradmag defines whether or not the gradient magnitude is taken into account during the vote
78 89 template<typename T>
79   - void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
80   -
81   -
  90 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y, bool gradmag = true){
82 91 unsigned int max_threads = stim::maxThreadsPerBlock();
83   - dim3 threads( sqrt(max_threads), sqrt(max_threads) );
84   - dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  92 + dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
  93 + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
85 94 size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
86 95 size_t shared_mem_req = table_bytes;// + template_bytes;
87   - std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
  96 + if (DEBUG) std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
88 97 size_t shared_mem = stim::sharedMemPerBlock();
89 98 if(shared_mem_req > shared_mem){
90 99 std::cout<<"Error: insufficient shared memory for this implementation of cuda_vote()."<<std::endl;
... ... @@ -92,7 +101,7 @@ namespace stim{
92 101 }
93 102  
94 103 //call the kernel to do the voting
95   - cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  104 + cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y, gradmag);
96 105  
97 106 }
98 107  
... ...
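
Note: the new gradmag parameter on cuda_vote()/gpu_vote() selects between magnitude-weighted voting and a simple count of voters, and its default preserves the previous behavior. A usage sketch (the buffer names are placeholders for device pointers the caller already owns):

    // Weighted voting (default): each voter contributes its gradient magnitude.
    stim::cuda::gpu_vote<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);

    // Unweighted voting: each qualifying voter contributes exactly 1.0f to the vote image.
    stim::cuda::gpu_vote<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y, false);
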
stim/cuda/ivote_atomic_bb.cuh
1 1 #ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H
2 2 #define STIM_CUDA_IVOTE_ATOMIC_BB_H
3 3  
  4 +extern bool DEBUG;
4 5 #include <stim/cuda/ivote/down_sample.cuh>
5 6 #include <stim/cuda/ivote/local_max.cuh>
6 7 #include <stim/cuda/ivote/update_dir_bb.cuh>
... ...
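
Note: because ivote_atomic_bb.cuh only declares extern bool DEBUG, exactly one translation unit in the consuming application must define the symbol or linking will fail. A minimal sketch (the file name is hypothetical):

    // main.cu (hypothetical application source)
    #include <stim/cuda/ivote_atomic_bb.cuh>

    bool DEBUG = false;   // satisfies the extern declaration; set to true to print shared-memory diagnostics
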
stim/cuda/templates/gradient.cuh
... ... @@ -9,25 +9,25 @@ namespace stim{
9 9 namespace cuda{
10 10  
11 11 template<typename T>
12   - __global__ void gradient_2d(T* out, T* in, int x, int y){
  12 + __global__ void gradient_2d(T* out, T* in, size_t x, size_t y){
13 13  
14 14  
15 15 // calculate the 2D coordinates for this current thread.
16   - int xi = blockIdx.x * blockDim.x + threadIdx.x;
17   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
  16 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
  17 + size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
18 18 // convert 2D coordinates to 1D
19   - int i = yi * x + xi;
  19 + size_t i = yi * x + xi;
20 20  
21 21 //return if the pixel is outside of the image
22 22 if(xi >= x || yi >= y) return;
23 23  
24 24 //calculate indices for the forward difference
25   - int i_xp = yi * x + (xi + 1);
26   - int i_yp = (yi + 1) * x + xi;
  25 + size_t i_xp = yi * x + (xi + 1);
  26 + size_t i_yp = (yi + 1) * x + xi;
27 27  
28 28 //calculate indices for the backward difference
29   - int i_xn = yi * x + (xi - 1);
30   - int i_yn = (yi - 1) * x + xi;
  29 + size_t i_xn = yi * x + (xi - 1);
  30 + size_t i_yn = (yi - 1) * x + xi;
31 31  
32 32 //use forward differences if a coordinate is zero
33 33 if(xi == 0)
... ... @@ -47,12 +47,12 @@ namespace stim{
47 47 }
48 48  
49 49 template<typename T>
50   - void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){
  50 + void gpu_gradient_2d(T* gpuGrad, T* gpuI, size_t x, size_t y){
51 51  
52 52 //get the maximum number of threads per block for the CUDA device
53 53 unsigned int max_threads = stim::maxThreadsPerBlock();
54 54 dim3 threads(max_threads, 1);
55   - dim3 blocks(x/threads.x + 1 , y);
  55 + dim3 blocks((unsigned int)(x/threads.x) + 1 , (unsigned int)y);
56 56  
57 57  
58 58 //call the GPU kernel to determine the gradient
... ...
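
Note: the four indices computed in gradient_2d() support the usual finite-difference stencils; the sketch below shows how they are typically combined, but the library's exact weighting is not visible in this hunk, so treat it as illustrative:

    T dx;
    if(xi == 0)           dx = in[i_xp] - in[i];                 // forward difference at the left edge
    else if(xi == x - 1)  dx = in[i] - in[i_xn];                 // backward difference at the right edge
    else                  dx = (in[i_xp] - in[i_xn]) / (T)2;     // central difference in the interior
    // the same pattern applies along y using i_yp and i_yn
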
stim/image/image.h
... ... @@ -122,9 +122,9 @@ public:
122 122 }
123 123  
124 124 stim::image<T>& operator=(const stim::image<T>& I){
125   - init();
126 125 if(&I == this) //handle self-assignment
127 126 return *this;
  127 + init();
128 128 allocate(I.X(), I.Y(), I.C());
129 129 memcpy(img, I.img, bytes());
130 130 return *this;
... ... @@ -423,6 +423,33 @@ public:
423 423 return result;
424 424 }
425 425  
  426 + /// Adds padding of the specified number of pixels by replicating the boundary pixels
  427 + image<T> pad_replicate(size_t p) {
  428 + image<T> result(width() + p * 2, height() + p * 2, channels()); //create an output image
  429 + result = 0;
  430 + //result = value; //assign the border value to all pixels in the new image
  431 + for (size_t y = 0; y < height(); y++) { //for each pixel in the original image
  432 + for (size_t x = 0; x < width(); x++) {
  433 + size_t n = (y + p) * (width() + p * 2) + x + p; //calculate the index of the corresponding pixel in the result image
  434 + size_t n0 = idx(x, y); //calculate the index for this pixel in the original image
  435 + result.data()[n] = img[n0]; // copy the original image to the result image after the border area
  436 + }
  437 + }
  438 + size_t l = p;
  439 + size_t r = p + width() - 1;
  440 + size_t t = p;
  441 + size_t b = p + height() - 1;
  442 + for (size_t y = 0; y < p; y++) for (size_t x = l; x <= r; x++) result(x, y) = result(x, t); //pad the top
  443 + for (size_t y = b + 1; y < result.height(); y++) for (size_t x = l; x <= r; x++) result(x, y) = result(x, b); //pad the bottom
  444 + for (size_t y = t; y <= b; y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, y); //pad the left
  445 + for (size_t y = t; y <= b; y++) for (size_t x = r+1; x < result.width(); x++) result(x, y) = result(r, y); //pad the right
  446 + for (size_t y = 0; y < t; y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, t); //pad the top left
  447 + for (size_t y = 0; y < t; y++) for (size_t x = r+1; x < result.width(); x++) result(x, y) = result(r, t); //pad the top right
  448 + for (size_t y = b+1; y < result.height(); y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, b); //pad the bottom left
  449 + for (size_t y = b+1; y < result.height(); y++) for (size_t x = r + 1; x < result.width(); x++) result(x, y) = result(r, b); //pad the bottom right
  450 + return result;
  451 + }
  452 +
426 453 /// Copy the given data to the specified channel
427 454  
428 455 /// @param c is the channel number that the data will be copied to
... ... @@ -546,7 +573,7 @@ public:
546 573 }
547 574  
548 575 //crop regions given by an array of 1D index values
549   - std::vector<image<T>> crop_idx(size_t w, size_t h, std::vector<size_t> idx) {
  576 + std::vector< image<T> > crop_idx(size_t w, size_t h, std::vector<size_t> idx) {
550 577 std::vector<image<T>> result(idx.size()); //create an array of image files to return
551 578 for (size_t i = 0; i < idx.size(); i++) { //for each specified index point
552 579 size_t y = idx[i] / X(); //calculate the y coordinate from the 1D index (center of ROI)
... ...
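
Note: the new pad_replicate() returns an enlarged image whose border repeats the nearest edge (or corner) pixel of the original, and the operator= fix means self-assignment no longer re-initializes the image before copying. A usage sketch, assuming img is an already-loaded stim::image<float>:

    size_t p = 5;                                          // pad by 5 pixels on every side
    stim::image<float> padded = img.pad_replicate(p);
    // padded.width()  == img.width()  + 2 * p
    // padded.height() == img.height() + 2 * p
    // the border replicates the closest boundary pixel of the original image
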
stim/math/filters/gauss2.cuh
... ... @@ -18,8 +18,8 @@ namespace stim {
18 18 ///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel
19 19 template<typename T, typename K>
20 20 stim::image<T> cpu_gauss2(const stim::image<T>& in, K stdx, K stdy, size_t nstds = 3) {
21   - size_t kx = stdx * nstds * 2 + 1; //calculate the kernel sizes
22   - size_t ky = stdy * nstds * 2 + 1;
  21 + size_t kx = (size_t)(stdx * nstds * 2) + 1; //calculate the kernel sizes
  22 + size_t ky = (size_t)(stdy * nstds * 2) + 1;
23 23 size_t X = in.width() - kx + 1; //calculate the size of the output image
24 24 size_t Y = in.height() - ky + 1;
25 25 stim::image<T> r(X, Y, in.channels()); //create an output image
... ...
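
Note: the added casts in cpu_gauss2() make the float-to-integer truncation of the kernel size explicit rather than an implicit, warning-generating conversion. A worked example:

    // stdx = 2.5, nstds = 3:  stdx * nstds * 2 = 15.0  ->  kx = (size_t)15.0 + 1 = 16
    // stdx = 2.4, nstds = 3:  stdx * nstds * 2 = 14.4  ->  kx = (size_t)14.4 + 1 = 15  (truncated, not rounded)
    float stdx = 2.4f;
    size_t nstds = 3;
    size_t kx = (size_t)(stdx * nstds * 2) + 1;   // 15
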
stim/math/filters/sepconv2.cuh
... ... @@ -20,12 +20,12 @@ namespace stim {
20 20 cudaDeviceProp p;
21 21 HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
22 22 size_t tmax = p.maxThreadsPerBlock;
23   - dim3 nt(sqrt(tmax), sqrt(tmax)); //calculate the block dimensions
  23 + dim3 nt((unsigned int)sqrt(tmax), (unsigned int)sqrt(tmax)); //calculate the block dimensions
24 24 size_t X = sx - kx + 1; //calculate the x size of the output image
25 25 T* temp; //declare a temporary variable to store the intermediate image
26 26 HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T))); //allocate memory for the intermediate image
27 27  
28   - dim3 nb(X / nt.x + 1, sy / nt.y + 1); //calculate the grid dimensions
  28 + dim3 nb((unsigned int)(X / nt.x) + 1, (unsigned int)(sy / nt.y) + 1); //calculate the grid dimensions
29 29 size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T); //shared memory bytes required to store block data
30 30 if (sm > p.sharedMemPerBlock) {
31 31 std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
... ... @@ -34,7 +34,7 @@ namespace stim {
34 34 kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1); //launch the kernel to compute the intermediate image
35 35  
36 36 size_t Y = sy - ky + 1; //calculate the y size of the output image
37   - nb.y = Y / nt.y + 1; //update the grid dimensions to reflect the Y-axis size of the output image
  37 + nb.y = (unsigned int)(Y / nt.y) + 1; //update the grid dimensions to reflect the Y-axis size of the output image
38 38 sm = nt.x * (nt.y + ky - 1) * sizeof(T); //calculate the amount of shared memory needed for the second pass
39 39 if (sm > p.sharedMemPerBlock) {
40 40 std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
... ...
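
Note: in the separable convolution, only nb.y changes between the two passes because the first (x) pass already produces the final output width X. A worked example of the resulting grid sizes, assuming a 1024 x 768 image, a 9 x 9 separable kernel, and 32 x 32 thread blocks:

    // pass 1 (convolve along x): X = 1024 - 9 + 1 = 1016,  nb = (1016/32 + 1, 768/32 + 1) = (32, 25)
    // pass 2 (convolve along y): Y =  768 - 9 + 1 =  760,  nb = (32, 760/32 + 1)          = (32, 24)
    size_t sx = 1024, sy = 768, kx = 9, ky = 9;
    size_t X = sx - kx + 1;   // 1016 columns in the intermediate and final images
    size_t Y = sy - ky + 1;   // 760 rows in the final image
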
stim/parser/arguments.h
... ... @@ -535,7 +535,13 @@ namespace stim{
535 535  
536 536 /// @param a is the index of the requested argument
537 537 std::string arg(size_t a){
538   - return args[a];
  538 + if (a < nargs()) {
  539 + return args[a];
  540 + }
  541 + else {
  542 + std::cout << "stim::arguments ERROR: argument index exceeds number of available arguments" << std::endl;
  543 + exit(1);
  544 + }
539 545 }
540 546  
541 547 /// Returns an std::vector of argument strings
... ...
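
Note: with the bounds check in stim::arguments::arg(), an out-of-range index now fails loudly instead of reading past the end of the internal argument vector. A usage sketch, assuming args is an already-parsed stim::arguments object:

    for (size_t i = 0; i < args.nargs(); i++)       // iterate only over arguments that exist
        std::cout << args.arg(i) << std::endl;      // in range: returns the argument string
    // args.arg(args.nargs()) would now print an error and call exit(1)
    // instead of indexing past the end of the vector
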