Commit 276f9b23d700babac82af74c461e6055fbd69eb0
1 parent
c282a801
forced matching to Matlab, removed warnings
Showing
10 changed files
with
101 additions
and
64 deletions
Show diff stats
stim/cuda/arraymath/array_cart2polar.cuh
... | ... | @@ -4,14 +4,14 @@ |
4 | 4 | namespace stim{ |
5 | 5 | namespace cuda{ |
6 | 6 | template<typename T> |
7 | - __global__ void cuda_cart2polar(T* a, int x, int y, float rotation){ | |
7 | + __global__ void cuda_cart2polar(T* a, size_t x, size_t y, float rotation){ | |
8 | 8 | |
9 | 9 | |
10 | 10 | // calculate the 2D coordinates for this current thread. |
11 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
12 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
11 | + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; | |
12 | + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; | |
13 | 13 | // convert 2D coordinates to 1D |
14 | - int i = yi * x + xi; | |
14 | + size_t i = yi * x + xi; | |
15 | 15 | |
16 | 16 | |
17 | 17 | if(xi >= x|| yi >= y) return; |
... | ... | @@ -27,11 +27,11 @@ namespace stim{ |
27 | 27 | |
28 | 28 | |
29 | 29 | template<typename T> |
30 | - void gpu_cart2polar(T* gpuGrad, unsigned int x, unsigned int y, float rotation = 0){ | |
30 | + void gpu_cart2polar(T* gpuGrad, size_t x, size_t y, float rotation = 0){ | |
31 | 31 | |
32 | 32 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
33 | 33 | dim3 threads(max_threads, 1); |
34 | - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
34 | + dim3 blocks(((unsigned int)x/threads.x) + ((unsigned int)x %threads.x == 0 ? 0:1) , (unsigned int)y); | |
35 | 35 | |
36 | 36 | //call the kernel to do the multiplication |
37 | 37 | cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y, rotation); |
... | ... | @@ -40,11 +40,11 @@ namespace stim{ |
40 | 40 | |
41 | 41 | |
42 | 42 | template<typename T> |
43 | - void cpu_cart2polar(T* a, unsigned int x, unsigned int y){ | |
43 | + void cpu_cart2polar(T* a, size_t x, size_t y){ | |
44 | 44 | |
45 | 45 | //calculate the number of bytes in the array |
46 | - unsigned int N = x *y; | |
47 | - unsigned int bytes = N * sizeof(T) * 2; | |
46 | + size_t N = x *y; | |
47 | + size_t bytes = N * sizeof(T) * 2; | |
48 | 48 | |
49 | 49 | //allocate memory on the GPU for the array |
50 | 50 | T* gpuA; | ... | ... |
stim/cuda/ivote/local_max.cuh
... | ... | @@ -10,17 +10,17 @@ namespace stim{ |
10 | 10 | |
11 | 11 | // this kernel calculates the local maximum for finding the cell centers |
12 | 12 | template<typename T> |
13 | - __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x, int y){ | |
13 | + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, size_t x, size_t y){ | |
14 | 14 | |
15 | 15 | // calculate the 2D coordinates for this current thread. |
16 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
17 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
16 | + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; | |
17 | + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; | |
18 | 18 | |
19 | 19 | if(xi >= x || yi >= y) |
20 | 20 | return; |
21 | 21 | |
22 | 22 | // convert 2D coordinates to 1D |
23 | - int i = yi * x + xi; | |
23 | + size_t i = yi * x + xi; | |
24 | 24 | |
25 | 25 | gpuCenters[i] = 0; //initialize the value at this location to zero |
26 | 26 | |
... | ... | @@ -61,13 +61,13 @@ namespace stim{ |
61 | 61 | } |
62 | 62 | |
63 | 63 | template<typename T> |
64 | - void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, unsigned int x, unsigned int y){ | |
64 | + void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, size_t x, size_t y){ | |
65 | 65 | |
66 | 66 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
67 | 67 | /*dim3 threads(max_threads, 1); |
68 | 68 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/ |
69 | - dim3 threads( sqrt(max_threads), sqrt(max_threads) ); | |
70 | - dim3 blocks(x/threads.x + 1, y/threads.y + 1); | |
69 | + dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) ); | |
70 | + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1); | |
71 | 71 | |
72 | 72 | //call the kernel to find the local maximum. |
73 | 73 | cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, conn, x, y); | ... | ... |
stim/cuda/ivote/update_dir_bb.cuh
... | ... | @@ -81,26 +81,26 @@ namespace stim{ |
81 | 81 | |
82 | 82 | // this kernel updates the gradient direction by the calculated voting direction. |
83 | 83 | template<typename T> |
84 | - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ | |
84 | + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, size_t x, size_t y){ | |
85 | 85 | |
86 | 86 | // calculate the 2D coordinates for this current thread. |
87 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
88 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
87 | + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; | |
88 | + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; | |
89 | 89 | |
90 | 90 | if(xi >= x || yi >= y) return; |
91 | 91 | |
92 | 92 | // convert 2D coordinates to 1D |
93 | - int i = yi * x + xi; | |
93 | + size_t i = yi * x + xi; | |
94 | 94 | |
95 | 95 | //update the gradient image with the vote direction |
96 | 96 | gpuGrad[2*i] = gpuDir[i]; |
97 | 97 | } |
98 | 98 | |
99 | 99 | template<typename T> |
100 | - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
100 | + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y){ | |
101 | 101 | |
102 | 102 | //calculate the number of bytes in the array |
103 | - unsigned int bytes = x * y * sizeof(T); | |
103 | + size_t bytes = x * y * sizeof(T); | |
104 | 104 | |
105 | 105 | // allocate space on the GPU for the updated vote direction |
106 | 106 | T* gpuDir; |
... | ... | @@ -108,14 +108,14 @@ namespace stim{ |
108 | 108 | |
109 | 109 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
110 | 110 | |
111 | - dim3 threads( sqrt(max_threads), sqrt(max_threads) ); | |
112 | - dim3 blocks(x/threads.x + 1, y/threads.y + 1); | |
111 | + dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) ); | |
112 | + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1); | |
113 | 113 | |
114 | 114 | size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); |
115 | 115 | //size_t curtain = 2 * rmax; |
116 | 116 | //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain); |
117 | 117 | size_t shared_mem_req = table_bytes;// + template_bytes; |
118 | - std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl; | |
118 | + if (DEBUG) std::cout << "Shared Memory required: " << shared_mem_req << std::endl; | |
119 | 119 | |
120 | 120 | size_t shared_mem = stim::sharedMemPerBlock(); |
121 | 121 | if(shared_mem_req > shared_mem){ |
... | ... | @@ -124,16 +124,10 @@ namespace stim{ |
124 | 124 | } |
125 | 125 | |
126 | 126 | //call the kernel to calculate the new voting direction |
127 | - cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
128 | - //stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer); | |
129 | - | |
130 | - //exit(0); | |
131 | - | |
132 | - //threads = dim3( sqrt(max_threads), sqrt(max_threads) ); | |
133 | - //blocks = dim3(x/threads.x + 1, y/threads.y + 1); | |
127 | + cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, (int)x , (int)y); | |
134 | 128 | |
135 | 129 | //call the kernel to update the gradient direction |
136 | - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | |
130 | + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, (int)x , (int)y); | |
137 | 131 | //free allocated memory |
138 | 132 | HANDLE_ERROR( cudaFree(gpuDir) ); |
139 | 133 | ... | ... |
stim/cuda/ivote/vote_atomic_bb.cuh
... | ... | @@ -9,12 +9,14 @@ |
9 | 9 | #include <stim/visualization/colormap.h> |
10 | 10 | #include <math.h> |
11 | 11 | |
12 | + | |
13 | + | |
12 | 14 | namespace stim{ |
13 | 15 | namespace cuda{ |
14 | 16 | |
15 | 17 | // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area |
16 | 18 | template<typename T> |
17 | - __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | |
19 | + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, size_t x, size_t y, bool gradmag = true){ | |
18 | 20 | |
19 | 21 | extern __shared__ T S[]; |
20 | 22 | T* shared_atan = S; |
... | ... | @@ -22,12 +24,12 @@ namespace stim{ |
22 | 24 | stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); |
23 | 25 | |
24 | 26 | // calculate the 2D coordinates for this current thread. |
25 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
26 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
27 | + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; | |
28 | + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; | |
27 | 29 | |
28 | 30 | if(xi >= x || yi >= y) return; |
29 | 31 | // convert 2D coordinates to 1D |
30 | - int i = yi * x + xi; | |
32 | + size_t i = yi * x + xi; | |
31 | 33 | |
32 | 34 | // calculate the voting direction based on the grtadient direction |
33 | 35 | float theta = gpuGrad[2*i]; |
... | ... | @@ -50,7 +52,7 @@ namespace stim{ |
50 | 52 | bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image |
51 | 53 | bb.trim_high(x-1, y-1); |
52 | 54 | |
53 | - int by, bx; | |
55 | + size_t by, bx; | |
54 | 56 | int dx, dy; |
55 | 57 | |
56 | 58 | unsigned int ind_g; //initialize the maximum vote value to zero |
... | ... | @@ -66,7 +68,8 @@ namespace stim{ |
66 | 68 | alpha = shared_atan[lut_i]; |
67 | 69 | if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ |
68 | 70 | ind_g = (by)*x + (bx); |
69 | - atomicAdd(&gpuVote[ind_g], mag); | |
71 | + if(gradmag) atomicAdd(&gpuVote[ind_g], mag); //add the gradient magnitude (if the gradmag flag is enabled) | |
72 | + else atomicAdd(&gpuVote[ind_g], 1.0f); //otherwise just add 1 | |
70 | 73 | |
71 | 74 | } |
72 | 75 | } |
... | ... | @@ -75,16 +78,22 @@ namespace stim{ |
75 | 78 | } |
76 | 79 | |
77 | 80 | |
81 | + /// Iterative voting for an image | |
82 | + /// @param gpuVote is the resulting vote image | |
83 | + /// @param gpuGrad is the gradient of the input image | |
84 | + /// @param gpuTable is the pre-computed atan2() table | |
85 | + /// @param phi is the angle of the vote region | |
86 | + /// @param rmax is the estimated radius of the blob (defines the "width" of the vote region) | |
87 | + /// @param x and y are the spatial dimensions of the gradient image | |
88 | + /// @param gradmag defines whether or not the gradient magnitude is taken into account during the vote | |
78 | 89 | template<typename T> |
79 | - void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
80 | - | |
81 | - | |
90 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y, bool gradmag = true){ | |
82 | 91 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
83 | - dim3 threads( sqrt(max_threads), sqrt(max_threads) ); | |
84 | - dim3 blocks(x/threads.x + 1, y/threads.y + 1); | |
92 | + dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) ); | |
93 | + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1); | |
85 | 94 | size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); |
86 | 95 | size_t shared_mem_req = table_bytes;// + template_bytes; |
87 | - std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl; | |
96 | + if (DEBUG) std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl; | |
88 | 97 | size_t shared_mem = stim::sharedMemPerBlock(); |
89 | 98 | if(shared_mem_req > shared_mem){ |
90 | 99 | std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl; |
... | ... | @@ -92,7 +101,7 @@ namespace stim{ |
92 | 101 | } |
93 | 102 | |
94 | 103 | //call the kernel to do the voting |
95 | - cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
104 | + cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y, gradmag); | |
96 | 105 | |
97 | 106 | } |
98 | 107 | ... | ... |
stim/cuda/ivote_atomic_bb.cuh
stim/cuda/templates/gradient.cuh
... | ... | @@ -9,25 +9,25 @@ namespace stim{ |
9 | 9 | namespace cuda{ |
10 | 10 | |
11 | 11 | template<typename T> |
12 | - __global__ void gradient_2d(T* out, T* in, int x, int y){ | |
12 | + __global__ void gradient_2d(T* out, T* in, size_t x, size_t y){ | |
13 | 13 | |
14 | 14 | |
15 | 15 | // calculate the 2D coordinates for this current thread. |
16 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
17 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
16 | + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; | |
17 | + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; | |
18 | 18 | // convert 2D coordinates to 1D |
19 | - int i = yi * x + xi; | |
19 | + size_t i = yi * x + xi; | |
20 | 20 | |
21 | 21 | //return if the pixel is outside of the image |
22 | 22 | if(xi >= x || yi >= y) return; |
23 | 23 | |
24 | 24 | //calculate indices for the forward difference |
25 | - int i_xp = yi * x + (xi + 1); | |
26 | - int i_yp = (yi + 1) * x + xi; | |
25 | + size_t i_xp = yi * x + (xi + 1); | |
26 | + size_t i_yp = (yi + 1) * x + xi; | |
27 | 27 | |
28 | 28 | //calculate indices for the backward difference |
29 | - int i_xn = yi * x + (xi - 1); | |
30 | - int i_yn = (yi - 1) * x + xi; | |
29 | + size_t i_xn = yi * x + (xi - 1); | |
30 | + size_t i_yn = (yi - 1) * x + xi; | |
31 | 31 | |
32 | 32 | //use forward differences if a coordinate is zero |
33 | 33 | if(xi == 0) |
... | ... | @@ -47,12 +47,12 @@ namespace stim{ |
47 | 47 | } |
48 | 48 | |
49 | 49 | template<typename T> |
50 | - void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ | |
50 | + void gpu_gradient_2d(T* gpuGrad, T* gpuI, size_t x, size_t y){ | |
51 | 51 | |
52 | 52 | //get the maximum number of threads per block for the CUDA device |
53 | 53 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
54 | 54 | dim3 threads(max_threads, 1); |
55 | - dim3 blocks(x/threads.x + 1 , y); | |
55 | + dim3 blocks((unsigned int)(x/threads.x) + 1 , (unsigned int)y); | |
56 | 56 | |
57 | 57 | |
58 | 58 | //call the GPU kernel to determine the gradient | ... | ... |
stim/image/image.h
... | ... | @@ -122,9 +122,9 @@ public: |
122 | 122 | } |
123 | 123 | |
124 | 124 | stim::image<T>& operator=(const stim::image<T>& I){ |
125 | - init(); | |
126 | 125 | if(&I == this) //handle self-assignment |
127 | 126 | return *this; |
127 | + init(); | |
128 | 128 | allocate(I.X(), I.Y(), I.C()); |
129 | 129 | memcpy(img, I.img, bytes()); |
130 | 130 | return *this; |
... | ... | @@ -423,6 +423,33 @@ public: |
423 | 423 | return result; |
424 | 424 | } |
425 | 425 | |
426 | + /// Adds curcular padding for the specified number of pixels - in this case replicating the boundary pixels | |
427 | + image<T> pad_replicate(size_t p) { | |
428 | + image<T> result(width() + p * 2, height() + p * 2, channels()); //create an output image | |
429 | + result = 0; | |
430 | + //result = value; //assign the border value to all pixels in the new image | |
431 | + for (size_t y = 0; y < height(); y++) { //for each pixel in the original image | |
432 | + for (size_t x = 0; x < width(); x++) { | |
433 | + size_t n = (y + p) * (width() + p * 2) + x + p; //calculate the index of the corresponding pixel in the result image | |
434 | + size_t n0 = idx(x, y); //calculate the index for this pixel in the original image | |
435 | + result.data()[n] = img[n0]; // copy the original image to the result image afer the border area | |
436 | + } | |
437 | + } | |
438 | + size_t l = p; | |
439 | + size_t r = p + width() - 1; | |
440 | + size_t t = p; | |
441 | + size_t b = p + height() - 1; | |
442 | + for (size_t y = 0; y < p; y++) for (size_t x = l; x <= r; x++) result(x, y) = result(x, t); //pad the top | |
443 | + for (size_t y = b + 1; y < result.height(); y++) for (size_t x = l; x <= r; x++) result(x, y) = result(x, b); //pad the bottom | |
444 | + for (size_t y = t; y <= b; y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, y); //pad the left | |
445 | + for (size_t y = t; y <= b; y++) for (size_t x = r+1; x < result.width(); x++) result(x, y) = result(r, y); //pad the right | |
446 | + for (size_t y = 0; y < t; y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, t); //pad the top left | |
447 | + for (size_t y = 0; y < t; y++) for (size_t x = r+1; x < result.width(); x++) result(x, y) = result(r, t); //pad the top right | |
448 | + for (size_t y = b+1; y < result.height(); y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, b); //pad the bottom left | |
449 | + for (size_t y = b+1; y < result.height(); y++) for (size_t x = r + 1; x < result.width(); x++) result(x, y) = result(r, b); //pad the bottom right | |
450 | + return result; | |
451 | + } | |
452 | + | |
426 | 453 | /// Copy the given data to the specified channel |
427 | 454 | |
428 | 455 | /// @param c is the channel number that the data will be copied to |
... | ... | @@ -546,7 +573,7 @@ public: |
546 | 573 | } |
547 | 574 | |
548 | 575 | //crop regions given by an array of 1D index values |
549 | - std::vector<image<T>> crop_idx(size_t w, size_t h, std::vector<size_t> idx) { | |
576 | + std::vector< image<T> > crop_idx(size_t w, size_t h, std::vector<size_t> idx) { | |
550 | 577 | std::vector<image<T>> result(idx.size()); //create an array of image files to return |
551 | 578 | for (size_t i = 0; i < idx.size(); i++) { //for each specified index point |
552 | 579 | size_t y = idx[i] / X(); //calculate the y coordinate from the 1D index (center of ROI) | ... | ... |
stim/math/filters/gauss2.cuh
... | ... | @@ -18,8 +18,8 @@ namespace stim { |
18 | 18 | ///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel |
19 | 19 | template<typename T, typename K> |
20 | 20 | stim::image<T> cpu_gauss2(const stim::image<T>& in, K stdx, K stdy, size_t nstds = 3) { |
21 | - size_t kx = stdx * nstds * 2 + 1; //calculate the kernel sizes | |
22 | - size_t ky = stdy * nstds * 2 + 1; | |
21 | + size_t kx = (size_t)(stdx * nstds * 2) + 1; //calculate the kernel sizes | |
22 | + size_t ky = (size_t)(stdy * nstds * 2) + 1; | |
23 | 23 | size_t X = in.width() - kx + 1; //calculate the size of the output image |
24 | 24 | size_t Y = in.height() - ky + 1; |
25 | 25 | stim::image<T> r(X, Y, in.channels()); //create an output image | ... | ... |
stim/math/filters/sepconv2.cuh
... | ... | @@ -20,12 +20,12 @@ namespace stim { |
20 | 20 | cudaDeviceProp p; |
21 | 21 | HANDLE_ERROR(cudaGetDeviceProperties(&p, 0)); |
22 | 22 | size_t tmax = p.maxThreadsPerBlock; |
23 | - dim3 nt(sqrt(tmax), sqrt(tmax)); //calculate the block dimensions | |
23 | + dim3 nt((unsigned int)sqrt(tmax), (unsigned int)sqrt(tmax)); //calculate the block dimensions | |
24 | 24 | size_t X = sx - kx + 1; //calculate the x size of the output image |
25 | 25 | T* temp; //declare a temporary variable to store the intermediate image |
26 | 26 | HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T))); //allocate memory for the intermediate image |
27 | 27 | |
28 | - dim3 nb(X / nt.x + 1, sy / nt.y + 1); //calculate the grid dimensions | |
28 | + dim3 nb((unsigned int)(X / nt.x) + 1, (unsigned int)(sy / nt.y) + 1); //calculate the grid dimensions | |
29 | 29 | size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T); //shared memory bytes required to store block data |
30 | 30 | if (sm > p.sharedMemPerBlock) { |
31 | 31 | std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; |
... | ... | @@ -34,7 +34,7 @@ namespace stim { |
34 | 34 | kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1); //launch the kernel to compute the intermediate image |
35 | 35 | |
36 | 36 | size_t Y = sy - ky + 1; //calculate the y size of the output image |
37 | - nb.y = Y / nt.y + 1; //update the grid dimensions to reflect the Y-axis size of the output image | |
37 | + nb.y = (unsigned int)(Y / nt.y) + 1; //update the grid dimensions to reflect the Y-axis size of the output image | |
38 | 38 | sm = nt.x * (nt.y + ky - 1) * sizeof(T); //calculate the amount of shared memory needed for the second pass |
39 | 39 | if (sm > p.sharedMemPerBlock) { |
40 | 40 | std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; | ... | ... |
stim/parser/arguments.h
... | ... | @@ -535,7 +535,13 @@ namespace stim{ |
535 | 535 | |
536 | 536 | /// @param a is the index of the requested argument |
537 | 537 | std::string arg(size_t a){ |
538 | - return args[a]; | |
538 | + if (a < nargs()) { | |
539 | + return args[a]; | |
540 | + } | |
541 | + else { | |
542 | + std::cout << "stim::arguments ERROR: argument index exceeds number of available arguments" << std::endl; | |
543 | + exit(1); | |
544 | + } | |
539 | 545 | } |
540 | 546 | |
541 | 547 | /// Returns an std::vector of argument strings | ... | ... |