Commit 276f9b23d700babac82af74c461e6055fbd69eb0

Authored by David Mayerich
1 parent c282a801

forced matching to Matlab, removed warnings

stim/cuda/arraymath/array_cart2polar.cuh
@@ -4,14 +4,14 @@ @@ -4,14 +4,14 @@
4 namespace stim{ 4 namespace stim{
5 namespace cuda{ 5 namespace cuda{
6 template<typename T> 6 template<typename T>
7 - __global__ void cuda_cart2polar(T* a, int x, int y, float rotation){ 7 + __global__ void cuda_cart2polar(T* a, size_t x, size_t y, float rotation){
8 8
9 9
10 // calculate the 2D coordinates for this current thread. 10 // calculate the 2D coordinates for this current thread.
11 - int xi = blockIdx.x * blockDim.x + threadIdx.x;  
12 - int yi = blockIdx.y * blockDim.y + threadIdx.y; 11 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
  12 + size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
13 // convert 2D coordinates to 1D 13 // convert 2D coordinates to 1D
14 - int i = yi * x + xi; 14 + size_t i = yi * x + xi;
15 15
16 16
17 if(xi >= x|| yi >= y) return; 17 if(xi >= x|| yi >= y) return;
@@ -27,11 +27,11 @@ namespace stim{ @@ -27,11 +27,11 @@ namespace stim{
27 27
28 28
29 template<typename T> 29 template<typename T>
30 - void gpu_cart2polar(T* gpuGrad, unsigned int x, unsigned int y, float rotation = 0){ 30 + void gpu_cart2polar(T* gpuGrad, size_t x, size_t y, float rotation = 0){
31 31
32 unsigned int max_threads = stim::maxThreadsPerBlock(); 32 unsigned int max_threads = stim::maxThreadsPerBlock();
33 dim3 threads(max_threads, 1); 33 dim3 threads(max_threads, 1);
34 - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); 34 + dim3 blocks(((unsigned int)x/threads.x) + ((unsigned int)x %threads.x == 0 ? 0:1) , (unsigned int)y);
35 35
36 //call the kernel to do the multiplication 36 //call the kernel to do the multiplication
37 cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y, rotation); 37 cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y, rotation);
@@ -40,11 +40,11 @@ namespace stim{ @@ -40,11 +40,11 @@ namespace stim{
40 40
41 41
42 template<typename T> 42 template<typename T>
43 - void cpu_cart2polar(T* a, unsigned int x, unsigned int y){ 43 + void cpu_cart2polar(T* a, size_t x, size_t y){
44 44
45 //calculate the number of bytes in the array 45 //calculate the number of bytes in the array
46 - unsigned int N = x *y;  
47 - unsigned int bytes = N * sizeof(T) * 2; 46 + size_t N = x *y;
  47 + size_t bytes = N * sizeof(T) * 2;
48 48
49 //allocate memory on the GPU for the array 49 //allocate memory on the GPU for the array
50 T* gpuA; 50 T* gpuA;
stim/cuda/ivote/local_max.cuh
@@ -10,17 +10,17 @@ namespace stim{ @@ -10,17 +10,17 @@ namespace stim{
10 10
11 // this kernel calculates the local maximum for finding the cell centers 11 // this kernel calculates the local maximum for finding the cell centers
12 template<typename T> 12 template<typename T>
13 - __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x, int y){ 13 + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, size_t x, size_t y){
14 14
15 // calculate the 2D coordinates for this current thread. 15 // calculate the 2D coordinates for this current thread.
16 - int xi = blockIdx.x * blockDim.x + threadIdx.x;  
17 - int yi = blockIdx.y * blockDim.y + threadIdx.y; 16 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
  17 + size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
18 18
19 if(xi >= x || yi >= y) 19 if(xi >= x || yi >= y)
20 return; 20 return;
21 21
22 // convert 2D coordinates to 1D 22 // convert 2D coordinates to 1D
23 - int i = yi * x + xi; 23 + size_t i = yi * x + xi;
24 24
25 gpuCenters[i] = 0; //initialize the value at this location to zero 25 gpuCenters[i] = 0; //initialize the value at this location to zero
26 26
@@ -61,13 +61,13 @@ namespace stim{ @@ -61,13 +61,13 @@ namespace stim{
61 } 61 }
62 62
63 template<typename T> 63 template<typename T>
64 - void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, unsigned int x, unsigned int y){ 64 + void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, size_t x, size_t y){
65 65
66 unsigned int max_threads = stim::maxThreadsPerBlock(); 66 unsigned int max_threads = stim::maxThreadsPerBlock();
67 /*dim3 threads(max_threads, 1); 67 /*dim3 threads(max_threads, 1);
68 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/ 68 dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/
69 - dim3 threads( sqrt(max_threads), sqrt(max_threads) );  
70 - dim3 blocks(x/threads.x + 1, y/threads.y + 1); 69 + dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
  70 + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
71 71
72 //call the kernel to find the local maximum. 72 //call the kernel to find the local maximum.
73 cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, conn, x, y); 73 cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, conn, x, y);
stim/cuda/ivote/update_dir_bb.cuh
@@ -81,26 +81,26 @@ namespace stim{ @@ -81,26 +81,26 @@ namespace stim{
81 81
82 // this kernel updates the gradient direction by the calculated voting direction. 82 // this kernel updates the gradient direction by the calculated voting direction.
83 template<typename T> 83 template<typename T>
84 - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ 84 + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, size_t x, size_t y){
85 85
86 // calculate the 2D coordinates for this current thread. 86 // calculate the 2D coordinates for this current thread.
87 - int xi = blockIdx.x * blockDim.x + threadIdx.x;  
88 - int yi = blockIdx.y * blockDim.y + threadIdx.y; 87 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
  88 + size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
89 89
90 if(xi >= x || yi >= y) return; 90 if(xi >= x || yi >= y) return;
91 91
92 // convert 2D coordinates to 1D 92 // convert 2D coordinates to 1D
93 - int i = yi * x + xi; 93 + size_t i = yi * x + xi;
94 94
95 //update the gradient image with the vote direction 95 //update the gradient image with the vote direction
96 gpuGrad[2*i] = gpuDir[i]; 96 gpuGrad[2*i] = gpuDir[i];
97 } 97 }
98 98
99 template<typename T> 99 template<typename T>
100 - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ 100 + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y){
101 101
102 //calculate the number of bytes in the array 102 //calculate the number of bytes in the array
103 - unsigned int bytes = x * y * sizeof(T); 103 + size_t bytes = x * y * sizeof(T);
104 104
105 // allocate space on the GPU for the updated vote direction 105 // allocate space on the GPU for the updated vote direction
106 T* gpuDir; 106 T* gpuDir;
@@ -108,14 +108,14 @@ namespace stim{ @@ -108,14 +108,14 @@ namespace stim{
108 108
109 unsigned int max_threads = stim::maxThreadsPerBlock(); 109 unsigned int max_threads = stim::maxThreadsPerBlock();
110 110
111 - dim3 threads( sqrt(max_threads), sqrt(max_threads) );  
112 - dim3 blocks(x/threads.x + 1, y/threads.y + 1); 111 + dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
  112 + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
113 113
114 size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); 114 size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
115 //size_t curtain = 2 * rmax; 115 //size_t curtain = 2 * rmax;
116 //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain); 116 //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain);
117 size_t shared_mem_req = table_bytes;// + template_bytes; 117 size_t shared_mem_req = table_bytes;// + template_bytes;
118 - std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl; 118 + if (DEBUG) std::cout << "Shared Memory required: " << shared_mem_req << std::endl;
119 119
120 size_t shared_mem = stim::sharedMemPerBlock(); 120 size_t shared_mem = stim::sharedMemPerBlock();
121 if(shared_mem_req > shared_mem){ 121 if(shared_mem_req > shared_mem){
@@ -124,16 +124,10 @@ namespace stim{ @@ -124,16 +124,10 @@ namespace stim{
124 } 124 }
125 125
126 //call the kernel to calculate the new voting direction 126 //call the kernel to calculate the new voting direction
127 - cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);  
128 - //stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);  
129 -  
130 - //exit(0);  
131 -  
132 - //threads = dim3( sqrt(max_threads), sqrt(max_threads) );  
133 - //blocks = dim3(x/threads.x + 1, y/threads.y + 1); 127 + cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, (int)x , (int)y);
134 128
135 //call the kernel to update the gradient direction 129 //call the kernel to update the gradient direction
136 - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); 130 + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, (int)x , (int)y);
137 //free allocated memory 131 //free allocated memory
138 HANDLE_ERROR( cudaFree(gpuDir) ); 132 HANDLE_ERROR( cudaFree(gpuDir) );
139 133
stim/cuda/ivote/vote_atomic_bb.cuh
@@ -9,12 +9,14 @@ @@ -9,12 +9,14 @@
9 #include <stim/visualization/colormap.h> 9 #include <stim/visualization/colormap.h>
10 #include <math.h> 10 #include <math.h>
11 11
  12 +
  13 +
12 namespace stim{ 14 namespace stim{
13 namespace cuda{ 15 namespace cuda{
14 16
15 // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area 17 // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
16 template<typename T> 18 template<typename T>
17 - __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ 19 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, size_t x, size_t y, bool gradmag = true){
18 20
19 extern __shared__ T S[]; 21 extern __shared__ T S[];
20 T* shared_atan = S; 22 T* shared_atan = S;
@@ -22,12 +24,12 @@ namespace stim{ @@ -22,12 +24,12 @@ namespace stim{
22 stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); 24 stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
23 25
24 // calculate the 2D coordinates for this current thread. 26 // calculate the 2D coordinates for this current thread.
25 - int xi = blockIdx.x * blockDim.x + threadIdx.x;  
26 - int yi = blockIdx.y * blockDim.y + threadIdx.y; 27 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
  28 + size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
27 29
28 if(xi >= x || yi >= y) return; 30 if(xi >= x || yi >= y) return;
29 // convert 2D coordinates to 1D 31 // convert 2D coordinates to 1D
30 - int i = yi * x + xi; 32 + size_t i = yi * x + xi;
31 33
32 // calculate the voting direction based on the grtadient direction 34 // calculate the voting direction based on the grtadient direction
33 float theta = gpuGrad[2*i]; 35 float theta = gpuGrad[2*i];
@@ -50,7 +52,7 @@ namespace stim{ @@ -50,7 +52,7 @@ namespace stim{
50 bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image 52 bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image
51 bb.trim_high(x-1, y-1); 53 bb.trim_high(x-1, y-1);
52 54
53 - int by, bx; 55 + size_t by, bx;
54 int dx, dy; 56 int dx, dy;
55 57
56 unsigned int ind_g; //initialize the maximum vote value to zero 58 unsigned int ind_g; //initialize the maximum vote value to zero
@@ -66,7 +68,8 @@ namespace stim{ @@ -66,7 +68,8 @@ namespace stim{
66 alpha = shared_atan[lut_i]; 68 alpha = shared_atan[lut_i];
67 if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ 69 if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
68 ind_g = (by)*x + (bx); 70 ind_g = (by)*x + (bx);
69 - atomicAdd(&gpuVote[ind_g], mag); 71 + if(gradmag) atomicAdd(&gpuVote[ind_g], mag); //add the gradient magnitude (if the gradmag flag is enabled)
  72 + else atomicAdd(&gpuVote[ind_g], 1.0f); //otherwise just add 1
70 73
71 } 74 }
72 } 75 }
@@ -75,16 +78,22 @@ namespace stim{ @@ -75,16 +78,22 @@ namespace stim{
75 } 78 }
76 79
77 80
  81 + /// Iterative voting for an image
  82 + /// @param gpuVote is the resulting vote image
  83 + /// @param gpuGrad is the gradient of the input image
  84 + /// @param gpuTable is the pre-computed atan2() table
  85 + /// @param phi is the angle of the vote region
  86 + /// @param rmax is the estimated radius of the blob (defines the "width" of the vote region)
  87 + /// @param x and y are the spatial dimensions of the gradient image
  88 + /// @param gradmag defines whether or not the gradient magnitude is taken into account during the vote
78 template<typename T> 89 template<typename T>
79 - void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){  
80 -  
81 - 90 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y, bool gradmag = true){
82 unsigned int max_threads = stim::maxThreadsPerBlock(); 91 unsigned int max_threads = stim::maxThreadsPerBlock();
83 - dim3 threads( sqrt(max_threads), sqrt(max_threads) );  
84 - dim3 blocks(x/threads.x + 1, y/threads.y + 1); 92 + dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
  93 + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
85 size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); 94 size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
86 size_t shared_mem_req = table_bytes;// + template_bytes; 95 size_t shared_mem_req = table_bytes;// + template_bytes;
87 - std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl; 96 + if (DEBUG) std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
88 size_t shared_mem = stim::sharedMemPerBlock(); 97 size_t shared_mem = stim::sharedMemPerBlock();
89 if(shared_mem_req > shared_mem){ 98 if(shared_mem_req > shared_mem){
90 std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl; 99 std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
@@ -92,7 +101,7 @@ namespace stim{ @@ -92,7 +101,7 @@ namespace stim{
92 } 101 }
93 102
94 //call the kernel to do the voting 103 //call the kernel to do the voting
95 - cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); 104 + cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y, gradmag);
96 105
97 } 106 }
98 107
stim/cuda/ivote_atomic_bb.cuh
1 #ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H 1 #ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H
2 #define STIM_CUDA_IVOTE_ATOMIC_BB_H 2 #define STIM_CUDA_IVOTE_ATOMIC_BB_H
3 3
  4 +extern bool DEBUG;
4 #include <stim/cuda/ivote/down_sample.cuh> 5 #include <stim/cuda/ivote/down_sample.cuh>
5 #include <stim/cuda/ivote/local_max.cuh> 6 #include <stim/cuda/ivote/local_max.cuh>
6 #include <stim/cuda/ivote/update_dir_bb.cuh> 7 #include <stim/cuda/ivote/update_dir_bb.cuh>
stim/cuda/templates/gradient.cuh
@@ -9,25 +9,25 @@ namespace stim{ @@ -9,25 +9,25 @@ namespace stim{
9 namespace cuda{ 9 namespace cuda{
10 10
11 template<typename T> 11 template<typename T>
12 - __global__ void gradient_2d(T* out, T* in, int x, int y){ 12 + __global__ void gradient_2d(T* out, T* in, size_t x, size_t y){
13 13
14 14
15 // calculate the 2D coordinates for this current thread. 15 // calculate the 2D coordinates for this current thread.
16 - int xi = blockIdx.x * blockDim.x + threadIdx.x;  
17 - int yi = blockIdx.y * blockDim.y + threadIdx.y; 16 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
  17 + size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
18 // convert 2D coordinates to 1D 18 // convert 2D coordinates to 1D
19 - int i = yi * x + xi; 19 + size_t i = yi * x + xi;
20 20
21 //return if the pixel is outside of the image 21 //return if the pixel is outside of the image
22 if(xi >= x || yi >= y) return; 22 if(xi >= x || yi >= y) return;
23 23
24 //calculate indices for the forward difference 24 //calculate indices for the forward difference
25 - int i_xp = yi * x + (xi + 1);  
26 - int i_yp = (yi + 1) * x + xi; 25 + size_t i_xp = yi * x + (xi + 1);
  26 + size_t i_yp = (yi + 1) * x + xi;
27 27
28 //calculate indices for the backward difference 28 //calculate indices for the backward difference
29 - int i_xn = yi * x + (xi - 1);  
30 - int i_yn = (yi - 1) * x + xi; 29 + size_t i_xn = yi * x + (xi - 1);
  30 + size_t i_yn = (yi - 1) * x + xi;
31 31
32 //use forward differences if a coordinate is zero 32 //use forward differences if a coordinate is zero
33 if(xi == 0) 33 if(xi == 0)
@@ -47,12 +47,12 @@ namespace stim{ @@ -47,12 +47,12 @@ namespace stim{
47 } 47 }
48 48
49 template<typename T> 49 template<typename T>
50 - void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ 50 + void gpu_gradient_2d(T* gpuGrad, T* gpuI, size_t x, size_t y){
51 51
52 //get the maximum number of threads per block for the CUDA device 52 //get the maximum number of threads per block for the CUDA device
53 unsigned int max_threads = stim::maxThreadsPerBlock(); 53 unsigned int max_threads = stim::maxThreadsPerBlock();
54 dim3 threads(max_threads, 1); 54 dim3 threads(max_threads, 1);
55 - dim3 blocks(x/threads.x + 1 , y); 55 + dim3 blocks((unsigned int)(x/threads.x) + 1 , (unsigned int)y);
56 56
57 57
58 //call the GPU kernel to determine the gradient 58 //call the GPU kernel to determine the gradient
stim/image/image.h
@@ -122,9 +122,9 @@ public: @@ -122,9 +122,9 @@ public:
122 } 122 }
123 123
124 stim::image<T>& operator=(const stim::image<T>& I){ 124 stim::image<T>& operator=(const stim::image<T>& I){
125 - init();  
126 if(&I == this) //handle self-assignment 125 if(&I == this) //handle self-assignment
127 return *this; 126 return *this;
  127 + init();
128 allocate(I.X(), I.Y(), I.C()); 128 allocate(I.X(), I.Y(), I.C());
129 memcpy(img, I.img, bytes()); 129 memcpy(img, I.img, bytes());
130 return *this; 130 return *this;
@@ -423,6 +423,33 @@ public: @@ -423,6 +423,33 @@ public:
423 return result; 423 return result;
424 } 424 }
425 425
  426 + /// Adds curcular padding for the specified number of pixels - in this case replicating the boundary pixels
  427 + image<T> pad_replicate(size_t p) {
  428 + image<T> result(width() + p * 2, height() + p * 2, channels()); //create an output image
  429 + result = 0;
  430 + //result = value; //assign the border value to all pixels in the new image
  431 + for (size_t y = 0; y < height(); y++) { //for each pixel in the original image
  432 + for (size_t x = 0; x < width(); x++) {
  433 + size_t n = (y + p) * (width() + p * 2) + x + p; //calculate the index of the corresponding pixel in the result image
  434 + size_t n0 = idx(x, y); //calculate the index for this pixel in the original image
  435 + result.data()[n] = img[n0]; // copy the original image to the result image afer the border area
  436 + }
  437 + }
  438 + size_t l = p;
  439 + size_t r = p + width() - 1;
  440 + size_t t = p;
  441 + size_t b = p + height() - 1;
  442 + for (size_t y = 0; y < p; y++) for (size_t x = l; x <= r; x++) result(x, y) = result(x, t); //pad the top
  443 + for (size_t y = b + 1; y < result.height(); y++) for (size_t x = l; x <= r; x++) result(x, y) = result(x, b); //pad the bottom
  444 + for (size_t y = t; y <= b; y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, y); //pad the left
  445 + for (size_t y = t; y <= b; y++) for (size_t x = r+1; x < result.width(); x++) result(x, y) = result(r, y); //pad the right
  446 + for (size_t y = 0; y < t; y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, t); //pad the top left
  447 + for (size_t y = 0; y < t; y++) for (size_t x = r+1; x < result.width(); x++) result(x, y) = result(r, t); //pad the top right
  448 + for (size_t y = b+1; y < result.height(); y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, b); //pad the bottom left
  449 + for (size_t y = b+1; y < result.height(); y++) for (size_t x = r + 1; x < result.width(); x++) result(x, y) = result(r, b); //pad the bottom right
  450 + return result;
  451 + }
  452 +
426 /// Copy the given data to the specified channel 453 /// Copy the given data to the specified channel
427 454
428 /// @param c is the channel number that the data will be copied to 455 /// @param c is the channel number that the data will be copied to
@@ -546,7 +573,7 @@ public: @@ -546,7 +573,7 @@ public:
546 } 573 }
547 574
548 //crop regions given by an array of 1D index values 575 //crop regions given by an array of 1D index values
549 - std::vector<image<T>> crop_idx(size_t w, size_t h, std::vector<size_t> idx) { 576 + std::vector< image<T> > crop_idx(size_t w, size_t h, std::vector<size_t> idx) {
550 std::vector<image<T>> result(idx.size()); //create an array of image files to return 577 std::vector<image<T>> result(idx.size()); //create an array of image files to return
551 for (size_t i = 0; i < idx.size(); i++) { //for each specified index point 578 for (size_t i = 0; i < idx.size(); i++) { //for each specified index point
552 size_t y = idx[i] / X(); //calculate the y coordinate from the 1D index (center of ROI) 579 size_t y = idx[i] / X(); //calculate the y coordinate from the 1D index (center of ROI)
stim/math/filters/gauss2.cuh
@@ -18,8 +18,8 @@ namespace stim { @@ -18,8 +18,8 @@ namespace stim {
18 ///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel 18 ///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel
19 template<typename T, typename K> 19 template<typename T, typename K>
20 stim::image<T> cpu_gauss2(const stim::image<T>& in, K stdx, K stdy, size_t nstds = 3) { 20 stim::image<T> cpu_gauss2(const stim::image<T>& in, K stdx, K stdy, size_t nstds = 3) {
21 - size_t kx = stdx * nstds * 2 + 1; //calculate the kernel sizes  
22 - size_t ky = stdy * nstds * 2 + 1; 21 + size_t kx = (size_t)(stdx * nstds * 2) + 1; //calculate the kernel sizes
  22 + size_t ky = (size_t)(stdy * nstds * 2) + 1;
23 size_t X = in.width() - kx + 1; //calculate the size of the output image 23 size_t X = in.width() - kx + 1; //calculate the size of the output image
24 size_t Y = in.height() - ky + 1; 24 size_t Y = in.height() - ky + 1;
25 stim::image<T> r(X, Y, in.channels()); //create an output image 25 stim::image<T> r(X, Y, in.channels()); //create an output image
stim/math/filters/sepconv2.cuh
@@ -20,12 +20,12 @@ namespace stim { @@ -20,12 +20,12 @@ namespace stim {
20 cudaDeviceProp p; 20 cudaDeviceProp p;
21 HANDLE_ERROR(cudaGetDeviceProperties(&p, 0)); 21 HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
22 size_t tmax = p.maxThreadsPerBlock; 22 size_t tmax = p.maxThreadsPerBlock;
23 - dim3 nt(sqrt(tmax), sqrt(tmax)); //calculate the block dimensions 23 + dim3 nt((unsigned int)sqrt(tmax), (unsigned int)sqrt(tmax)); //calculate the block dimensions
24 size_t X = sx - kx + 1; //calculate the x size of the output image 24 size_t X = sx - kx + 1; //calculate the x size of the output image
25 T* temp; //declare a temporary variable to store the intermediate image 25 T* temp; //declare a temporary variable to store the intermediate image
26 HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T))); //allocate memory for the intermediate image 26 HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T))); //allocate memory for the intermediate image
27 27
28 - dim3 nb(X / nt.x + 1, sy / nt.y + 1); //calculate the grid dimensions 28 + dim3 nb((unsigned int)(X / nt.x) + 1, (unsigned int)(sy / nt.y) + 1); //calculate the grid dimensions
29 size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T); //shared memory bytes required to store block data 29 size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T); //shared memory bytes required to store block data
30 if (sm > p.sharedMemPerBlock) { 30 if (sm > p.sharedMemPerBlock) {
31 std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; 31 std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
@@ -34,7 +34,7 @@ namespace stim { @@ -34,7 +34,7 @@ namespace stim {
34 kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1); //launch the kernel to compute the intermediate image 34 kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1); //launch the kernel to compute the intermediate image
35 35
36 size_t Y = sy - ky + 1; //calculate the y size of the output image 36 size_t Y = sy - ky + 1; //calculate the y size of the output image
37 - nb.y = Y / nt.y + 1; //update the grid dimensions to reflect the Y-axis size of the output image 37 + nb.y = (unsigned int)(Y / nt.y) + 1; //update the grid dimensions to reflect the Y-axis size of the output image
38 sm = nt.x * (nt.y + ky - 1) * sizeof(T); //calculate the amount of shared memory needed for the second pass 38 sm = nt.x * (nt.y + ky - 1) * sizeof(T); //calculate the amount of shared memory needed for the second pass
39 if (sm > p.sharedMemPerBlock) { 39 if (sm > p.sharedMemPerBlock) {
40 std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; 40 std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
stim/parser/arguments.h
@@ -535,7 +535,13 @@ namespace stim{ @@ -535,7 +535,13 @@ namespace stim{
535 535
536 /// @param a is the index of the requested argument 536 /// @param a is the index of the requested argument
537 std::string arg(size_t a){ 537 std::string arg(size_t a){
538 - return args[a]; 538 + if (a < nargs()) {
  539 + return args[a];
  540 + }
  541 + else {
  542 + std::cout << "stim::arguments ERROR: argument index exceeds number of available arguments" << std::endl;
  543 + exit(1);
  544 + }
539 } 545 }
540 546
541 /// Returns an std::vector of argument strings 547 /// Returns an std::vector of argument strings