Commit 276f9b23d700babac82af74c461e6055fbd69eb0
1 parent
c282a801
forced matching to Matlab, removed warnings
Showing
10 changed files
with
101 additions
and
64 deletions
Show diff stats
stim/cuda/arraymath/array_cart2polar.cuh
@@ -4,14 +4,14 @@ | @@ -4,14 +4,14 @@ | ||
4 | namespace stim{ | 4 | namespace stim{ |
5 | namespace cuda{ | 5 | namespace cuda{ |
6 | template<typename T> | 6 | template<typename T> |
7 | - __global__ void cuda_cart2polar(T* a, int x, int y, float rotation){ | 7 | + __global__ void cuda_cart2polar(T* a, size_t x, size_t y, float rotation){ |
8 | 8 | ||
9 | 9 | ||
10 | // calculate the 2D coordinates for this current thread. | 10 | // calculate the 2D coordinates for this current thread. |
11 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
12 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | 11 | + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; |
12 | + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
13 | // convert 2D coordinates to 1D | 13 | // convert 2D coordinates to 1D |
14 | - int i = yi * x + xi; | 14 | + size_t i = yi * x + xi; |
15 | 15 | ||
16 | 16 | ||
17 | if(xi >= x|| yi >= y) return; | 17 | if(xi >= x|| yi >= y) return; |
@@ -27,11 +27,11 @@ namespace stim{ | @@ -27,11 +27,11 @@ namespace stim{ | ||
27 | 27 | ||
28 | 28 | ||
29 | template<typename T> | 29 | template<typename T> |
30 | - void gpu_cart2polar(T* gpuGrad, unsigned int x, unsigned int y, float rotation = 0){ | 30 | + void gpu_cart2polar(T* gpuGrad, size_t x, size_t y, float rotation = 0){ |
31 | 31 | ||
32 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 32 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
33 | dim3 threads(max_threads, 1); | 33 | dim3 threads(max_threads, 1); |
34 | - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | 34 | + dim3 blocks(((unsigned int)x/threads.x) + ((unsigned int)x %threads.x == 0 ? 0:1) , (unsigned int)y); |
35 | 35 | ||
36 | //call the kernel to do the multiplication | 36 | //call the kernel to do the multiplication |
37 | cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y, rotation); | 37 | cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y, rotation); |
@@ -40,11 +40,11 @@ namespace stim{ | @@ -40,11 +40,11 @@ namespace stim{ | ||
40 | 40 | ||
41 | 41 | ||
42 | template<typename T> | 42 | template<typename T> |
43 | - void cpu_cart2polar(T* a, unsigned int x, unsigned int y){ | 43 | + void cpu_cart2polar(T* a, size_t x, size_t y){ |
44 | 44 | ||
45 | //calculate the number of bytes in the array | 45 | //calculate the number of bytes in the array |
46 | - unsigned int N = x *y; | ||
47 | - unsigned int bytes = N * sizeof(T) * 2; | 46 | + size_t N = x *y; |
47 | + size_t bytes = N * sizeof(T) * 2; | ||
48 | 48 | ||
49 | //allocate memory on the GPU for the array | 49 | //allocate memory on the GPU for the array |
50 | T* gpuA; | 50 | T* gpuA; |
stim/cuda/ivote/local_max.cuh
@@ -10,17 +10,17 @@ namespace stim{ | @@ -10,17 +10,17 @@ namespace stim{ | ||
10 | 10 | ||
11 | // this kernel calculates the local maximum for finding the cell centers | 11 | // this kernel calculates the local maximum for finding the cell centers |
12 | template<typename T> | 12 | template<typename T> |
13 | - __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x, int y){ | 13 | + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, size_t x, size_t y){ |
14 | 14 | ||
15 | // calculate the 2D coordinates for this current thread. | 15 | // calculate the 2D coordinates for this current thread. |
16 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
17 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | 16 | + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; |
17 | + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
18 | 18 | ||
19 | if(xi >= x || yi >= y) | 19 | if(xi >= x || yi >= y) |
20 | return; | 20 | return; |
21 | 21 | ||
22 | // convert 2D coordinates to 1D | 22 | // convert 2D coordinates to 1D |
23 | - int i = yi * x + xi; | 23 | + size_t i = yi * x + xi; |
24 | 24 | ||
25 | gpuCenters[i] = 0; //initialize the value at this location to zero | 25 | gpuCenters[i] = 0; //initialize the value at this location to zero |
26 | 26 | ||
@@ -61,13 +61,13 @@ namespace stim{ | @@ -61,13 +61,13 @@ namespace stim{ | ||
61 | } | 61 | } |
62 | 62 | ||
63 | template<typename T> | 63 | template<typename T> |
64 | - void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, unsigned int x, unsigned int y){ | 64 | + void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, size_t x, size_t y){ |
65 | 65 | ||
66 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 66 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
67 | /*dim3 threads(max_threads, 1); | 67 | /*dim3 threads(max_threads, 1); |
68 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/ | 68 | dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/ |
69 | - dim3 threads( sqrt(max_threads), sqrt(max_threads) ); | ||
70 | - dim3 blocks(x/threads.x + 1, y/threads.y + 1); | 69 | + dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) ); |
70 | + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1); | ||
71 | 71 | ||
72 | //call the kernel to find the local maximum. | 72 | //call the kernel to find the local maximum. |
73 | cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, conn, x, y); | 73 | cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, conn, x, y); |
stim/cuda/ivote/update_dir_bb.cuh
@@ -81,26 +81,26 @@ namespace stim{ | @@ -81,26 +81,26 @@ namespace stim{ | ||
81 | 81 | ||
82 | // this kernel updates the gradient direction by the calculated voting direction. | 82 | // this kernel updates the gradient direction by the calculated voting direction. |
83 | template<typename T> | 83 | template<typename T> |
84 | - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ | 84 | + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, size_t x, size_t y){ |
85 | 85 | ||
86 | // calculate the 2D coordinates for this current thread. | 86 | // calculate the 2D coordinates for this current thread. |
87 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
88 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | 87 | + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; |
88 | + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
89 | 89 | ||
90 | if(xi >= x || yi >= y) return; | 90 | if(xi >= x || yi >= y) return; |
91 | 91 | ||
92 | // convert 2D coordinates to 1D | 92 | // convert 2D coordinates to 1D |
93 | - int i = yi * x + xi; | 93 | + size_t i = yi * x + xi; |
94 | 94 | ||
95 | //update the gradient image with the vote direction | 95 | //update the gradient image with the vote direction |
96 | gpuGrad[2*i] = gpuDir[i]; | 96 | gpuGrad[2*i] = gpuDir[i]; |
97 | } | 97 | } |
98 | 98 | ||
99 | template<typename T> | 99 | template<typename T> |
100 | - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | 100 | + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y){ |
101 | 101 | ||
102 | //calculate the number of bytes in the array | 102 | //calculate the number of bytes in the array |
103 | - unsigned int bytes = x * y * sizeof(T); | 103 | + size_t bytes = x * y * sizeof(T); |
104 | 104 | ||
105 | // allocate space on the GPU for the updated vote direction | 105 | // allocate space on the GPU for the updated vote direction |
106 | T* gpuDir; | 106 | T* gpuDir; |
@@ -108,14 +108,14 @@ namespace stim{ | @@ -108,14 +108,14 @@ namespace stim{ | ||
108 | 108 | ||
109 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 109 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
110 | 110 | ||
111 | - dim3 threads( sqrt(max_threads), sqrt(max_threads) ); | ||
112 | - dim3 blocks(x/threads.x + 1, y/threads.y + 1); | 111 | + dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) ); |
112 | + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1); | ||
113 | 113 | ||
114 | size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); | 114 | size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); |
115 | //size_t curtain = 2 * rmax; | 115 | //size_t curtain = 2 * rmax; |
116 | //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain); | 116 | //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain); |
117 | size_t shared_mem_req = table_bytes;// + template_bytes; | 117 | size_t shared_mem_req = table_bytes;// + template_bytes; |
118 | - std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl; | 118 | + if (DEBUG) std::cout << "Shared Memory required: " << shared_mem_req << std::endl; |
119 | 119 | ||
120 | size_t shared_mem = stim::sharedMemPerBlock(); | 120 | size_t shared_mem = stim::sharedMemPerBlock(); |
121 | if(shared_mem_req > shared_mem){ | 121 | if(shared_mem_req > shared_mem){ |
@@ -124,16 +124,10 @@ namespace stim{ | @@ -124,16 +124,10 @@ namespace stim{ | ||
124 | } | 124 | } |
125 | 125 | ||
126 | //call the kernel to calculate the new voting direction | 126 | //call the kernel to calculate the new voting direction |
127 | - cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | ||
128 | - //stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer); | ||
129 | - | ||
130 | - //exit(0); | ||
131 | - | ||
132 | - //threads = dim3( sqrt(max_threads), sqrt(max_threads) ); | ||
133 | - //blocks = dim3(x/threads.x + 1, y/threads.y + 1); | 127 | + cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, (int)x , (int)y); |
134 | 128 | ||
135 | //call the kernel to update the gradient direction | 129 | //call the kernel to update the gradient direction |
136 | - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | 130 | + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, (int)x , (int)y); |
137 | //free allocated memory | 131 | //free allocated memory |
138 | HANDLE_ERROR( cudaFree(gpuDir) ); | 132 | HANDLE_ERROR( cudaFree(gpuDir) ); |
139 | 133 |
stim/cuda/ivote/vote_atomic_bb.cuh
@@ -9,12 +9,14 @@ | @@ -9,12 +9,14 @@ | ||
9 | #include <stim/visualization/colormap.h> | 9 | #include <stim/visualization/colormap.h> |
10 | #include <math.h> | 10 | #include <math.h> |
11 | 11 | ||
12 | + | ||
13 | + | ||
12 | namespace stim{ | 14 | namespace stim{ |
13 | namespace cuda{ | 15 | namespace cuda{ |
14 | 16 | ||
15 | // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | 17 | // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area |
16 | template<typename T> | 18 | template<typename T> |
17 | - __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | 19 | + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, size_t x, size_t y, bool gradmag = true){ |
18 | 20 | ||
19 | extern __shared__ T S[]; | 21 | extern __shared__ T S[]; |
20 | T* shared_atan = S; | 22 | T* shared_atan = S; |
@@ -22,12 +24,12 @@ namespace stim{ | @@ -22,12 +24,12 @@ namespace stim{ | ||
22 | stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); | 24 | stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); |
23 | 25 | ||
24 | // calculate the 2D coordinates for this current thread. | 26 | // calculate the 2D coordinates for this current thread. |
25 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
26 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | 27 | + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; |
28 | + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
27 | 29 | ||
28 | if(xi >= x || yi >= y) return; | 30 | if(xi >= x || yi >= y) return; |
29 | // convert 2D coordinates to 1D | 31 | // convert 2D coordinates to 1D |
30 | - int i = yi * x + xi; | 32 | + size_t i = yi * x + xi; |
31 | 33 | ||
32 | // calculate the voting direction based on the grtadient direction | 34 | // calculate the voting direction based on the grtadient direction |
33 | float theta = gpuGrad[2*i]; | 35 | float theta = gpuGrad[2*i]; |
@@ -50,7 +52,7 @@ namespace stim{ | @@ -50,7 +52,7 @@ namespace stim{ | ||
50 | bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image | 52 | bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image |
51 | bb.trim_high(x-1, y-1); | 53 | bb.trim_high(x-1, y-1); |
52 | 54 | ||
53 | - int by, bx; | 55 | + size_t by, bx; |
54 | int dx, dy; | 56 | int dx, dy; |
55 | 57 | ||
56 | unsigned int ind_g; //initialize the maximum vote value to zero | 58 | unsigned int ind_g; //initialize the maximum vote value to zero |
@@ -66,7 +68,8 @@ namespace stim{ | @@ -66,7 +68,8 @@ namespace stim{ | ||
66 | alpha = shared_atan[lut_i]; | 68 | alpha = shared_atan[lut_i]; |
67 | if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ | 69 | if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ |
68 | ind_g = (by)*x + (bx); | 70 | ind_g = (by)*x + (bx); |
69 | - atomicAdd(&gpuVote[ind_g], mag); | 71 | + if(gradmag) atomicAdd(&gpuVote[ind_g], mag); //add the gradient magnitude (if the gradmag flag is enabled) |
72 | + else atomicAdd(&gpuVote[ind_g], 1.0f); //otherwise just add 1 | ||
70 | 73 | ||
71 | } | 74 | } |
72 | } | 75 | } |
@@ -75,16 +78,22 @@ namespace stim{ | @@ -75,16 +78,22 @@ namespace stim{ | ||
75 | } | 78 | } |
76 | 79 | ||
77 | 80 | ||
81 | + /// Iterative voting for an image | ||
82 | + /// @param gpuVote is the resulting vote image | ||
83 | + /// @param gpuGrad is the gradient of the input image | ||
84 | + /// @param gpuTable is the pre-computed atan2() table | ||
85 | + /// @param phi is the angle of the vote region | ||
86 | + /// @param rmax is the estimated radius of the blob (defines the "width" of the vote region) | ||
87 | + /// @param x and y are the spatial dimensions of the gradient image | ||
88 | + /// @param gradmag defines whether or not the gradient magnitude is taken into account during the vote | ||
78 | template<typename T> | 89 | template<typename T> |
79 | - void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | ||
80 | - | ||
81 | - | 90 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y, bool gradmag = true){ |
82 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 91 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
83 | - dim3 threads( sqrt(max_threads), sqrt(max_threads) ); | ||
84 | - dim3 blocks(x/threads.x + 1, y/threads.y + 1); | 92 | + dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) ); |
93 | + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1); | ||
85 | size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); | 94 | size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); |
86 | size_t shared_mem_req = table_bytes;// + template_bytes; | 95 | size_t shared_mem_req = table_bytes;// + template_bytes; |
87 | - std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl; | 96 | + if (DEBUG) std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl; |
88 | size_t shared_mem = stim::sharedMemPerBlock(); | 97 | size_t shared_mem = stim::sharedMemPerBlock(); |
89 | if(shared_mem_req > shared_mem){ | 98 | if(shared_mem_req > shared_mem){ |
90 | std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl; | 99 | std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl; |
@@ -92,7 +101,7 @@ namespace stim{ | @@ -92,7 +101,7 @@ namespace stim{ | ||
92 | } | 101 | } |
93 | 102 | ||
94 | //call the kernel to do the voting | 103 | //call the kernel to do the voting |
95 | - cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | 104 | + cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y, gradmag); |
96 | 105 | ||
97 | } | 106 | } |
98 | 107 |
stim/cuda/ivote_atomic_bb.cuh
1 | #ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H | 1 | #ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H |
2 | #define STIM_CUDA_IVOTE_ATOMIC_BB_H | 2 | #define STIM_CUDA_IVOTE_ATOMIC_BB_H |
3 | 3 | ||
4 | +extern bool DEBUG; | ||
4 | #include <stim/cuda/ivote/down_sample.cuh> | 5 | #include <stim/cuda/ivote/down_sample.cuh> |
5 | #include <stim/cuda/ivote/local_max.cuh> | 6 | #include <stim/cuda/ivote/local_max.cuh> |
6 | #include <stim/cuda/ivote/update_dir_bb.cuh> | 7 | #include <stim/cuda/ivote/update_dir_bb.cuh> |
stim/cuda/templates/gradient.cuh
@@ -9,25 +9,25 @@ namespace stim{ | @@ -9,25 +9,25 @@ namespace stim{ | ||
9 | namespace cuda{ | 9 | namespace cuda{ |
10 | 10 | ||
11 | template<typename T> | 11 | template<typename T> |
12 | - __global__ void gradient_2d(T* out, T* in, int x, int y){ | 12 | + __global__ void gradient_2d(T* out, T* in, size_t x, size_t y){ |
13 | 13 | ||
14 | 14 | ||
15 | // calculate the 2D coordinates for this current thread. | 15 | // calculate the 2D coordinates for this current thread. |
16 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
17 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | 16 | + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; |
17 | + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
18 | // convert 2D coordinates to 1D | 18 | // convert 2D coordinates to 1D |
19 | - int i = yi * x + xi; | 19 | + size_t i = yi * x + xi; |
20 | 20 | ||
21 | //return if the pixel is outside of the image | 21 | //return if the pixel is outside of the image |
22 | if(xi >= x || yi >= y) return; | 22 | if(xi >= x || yi >= y) return; |
23 | 23 | ||
24 | //calculate indices for the forward difference | 24 | //calculate indices for the forward difference |
25 | - int i_xp = yi * x + (xi + 1); | ||
26 | - int i_yp = (yi + 1) * x + xi; | 25 | + size_t i_xp = yi * x + (xi + 1); |
26 | + size_t i_yp = (yi + 1) * x + xi; | ||
27 | 27 | ||
28 | //calculate indices for the backward difference | 28 | //calculate indices for the backward difference |
29 | - int i_xn = yi * x + (xi - 1); | ||
30 | - int i_yn = (yi - 1) * x + xi; | 29 | + size_t i_xn = yi * x + (xi - 1); |
30 | + size_t i_yn = (yi - 1) * x + xi; | ||
31 | 31 | ||
32 | //use forward differences if a coordinate is zero | 32 | //use forward differences if a coordinate is zero |
33 | if(xi == 0) | 33 | if(xi == 0) |
@@ -47,12 +47,12 @@ namespace stim{ | @@ -47,12 +47,12 @@ namespace stim{ | ||
47 | } | 47 | } |
48 | 48 | ||
49 | template<typename T> | 49 | template<typename T> |
50 | - void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ | 50 | + void gpu_gradient_2d(T* gpuGrad, T* gpuI, size_t x, size_t y){ |
51 | 51 | ||
52 | //get the maximum number of threads per block for the CUDA device | 52 | //get the maximum number of threads per block for the CUDA device |
53 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 53 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
54 | dim3 threads(max_threads, 1); | 54 | dim3 threads(max_threads, 1); |
55 | - dim3 blocks(x/threads.x + 1 , y); | 55 | + dim3 blocks((unsigned int)(x/threads.x) + 1 , (unsigned int)y); |
56 | 56 | ||
57 | 57 | ||
58 | //call the GPU kernel to determine the gradient | 58 | //call the GPU kernel to determine the gradient |
stim/image/image.h
@@ -122,9 +122,9 @@ public: | @@ -122,9 +122,9 @@ public: | ||
122 | } | 122 | } |
123 | 123 | ||
124 | stim::image<T>& operator=(const stim::image<T>& I){ | 124 | stim::image<T>& operator=(const stim::image<T>& I){ |
125 | - init(); | ||
126 | if(&I == this) //handle self-assignment | 125 | if(&I == this) //handle self-assignment |
127 | return *this; | 126 | return *this; |
127 | + init(); | ||
128 | allocate(I.X(), I.Y(), I.C()); | 128 | allocate(I.X(), I.Y(), I.C()); |
129 | memcpy(img, I.img, bytes()); | 129 | memcpy(img, I.img, bytes()); |
130 | return *this; | 130 | return *this; |
@@ -423,6 +423,33 @@ public: | @@ -423,6 +423,33 @@ public: | ||
423 | return result; | 423 | return result; |
424 | } | 424 | } |
425 | 425 | ||
426 | + /// Adds border padding for the specified number of pixels - in this case replicating the boundary pixels | ||
427 | + image<T> pad_replicate(size_t p) { | ||
428 | + image<T> result(width() + p * 2, height() + p * 2, channels()); //create an output image | ||
429 | + result = 0; | ||
430 | + //result = value; //assign the border value to all pixels in the new image | ||
431 | + for (size_t y = 0; y < height(); y++) { //for each pixel in the original image | ||
432 | + for (size_t x = 0; x < width(); x++) { | ||
433 | + size_t n = (y + p) * (width() + p * 2) + x + p; //calculate the index of the corresponding pixel in the result image | ||
434 | + size_t n0 = idx(x, y); //calculate the index for this pixel in the original image | ||
435 | + result.data()[n] = img[n0]; // copy the original image to the result image after the border area | ||
436 | + } | ||
437 | + } | ||
438 | + size_t l = p; | ||
439 | + size_t r = p + width() - 1; | ||
440 | + size_t t = p; | ||
441 | + size_t b = p + height() - 1; | ||
442 | + for (size_t y = 0; y < p; y++) for (size_t x = l; x <= r; x++) result(x, y) = result(x, t); //pad the top | ||
443 | + for (size_t y = b + 1; y < result.height(); y++) for (size_t x = l; x <= r; x++) result(x, y) = result(x, b); //pad the bottom | ||
444 | + for (size_t y = t; y <= b; y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, y); //pad the left | ||
445 | + for (size_t y = t; y <= b; y++) for (size_t x = r+1; x < result.width(); x++) result(x, y) = result(r, y); //pad the right | ||
446 | + for (size_t y = 0; y < t; y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, t); //pad the top left | ||
447 | + for (size_t y = 0; y < t; y++) for (size_t x = r+1; x < result.width(); x++) result(x, y) = result(r, t); //pad the top right | ||
448 | + for (size_t y = b+1; y < result.height(); y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, b); //pad the bottom left | ||
449 | + for (size_t y = b+1; y < result.height(); y++) for (size_t x = r + 1; x < result.width(); x++) result(x, y) = result(r, b); //pad the bottom right | ||
450 | + return result; | ||
451 | + } | ||
452 | + | ||
426 | /// Copy the given data to the specified channel | 453 | /// Copy the given data to the specified channel |
427 | 454 | ||
428 | /// @param c is the channel number that the data will be copied to | 455 | /// @param c is the channel number that the data will be copied to |
@@ -546,7 +573,7 @@ public: | @@ -546,7 +573,7 @@ public: | ||
546 | } | 573 | } |
547 | 574 | ||
548 | //crop regions given by an array of 1D index values | 575 | //crop regions given by an array of 1D index values |
549 | - std::vector<image<T>> crop_idx(size_t w, size_t h, std::vector<size_t> idx) { | 576 | + std::vector< image<T> > crop_idx(size_t w, size_t h, std::vector<size_t> idx) { |
550 | std::vector<image<T>> result(idx.size()); //create an array of image files to return | 577 | std::vector<image<T>> result(idx.size()); //create an array of image files to return |
551 | for (size_t i = 0; i < idx.size(); i++) { //for each specified index point | 578 | for (size_t i = 0; i < idx.size(); i++) { //for each specified index point |
552 | size_t y = idx[i] / X(); //calculate the y coordinate from the 1D index (center of ROI) | 579 | size_t y = idx[i] / X(); //calculate the y coordinate from the 1D index (center of ROI) |
stim/math/filters/gauss2.cuh
@@ -18,8 +18,8 @@ namespace stim { | @@ -18,8 +18,8 @@ namespace stim { | ||
18 | ///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel | 18 | ///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel |
19 | template<typename T, typename K> | 19 | template<typename T, typename K> |
20 | stim::image<T> cpu_gauss2(const stim::image<T>& in, K stdx, K stdy, size_t nstds = 3) { | 20 | stim::image<T> cpu_gauss2(const stim::image<T>& in, K stdx, K stdy, size_t nstds = 3) { |
21 | - size_t kx = stdx * nstds * 2 + 1; //calculate the kernel sizes | ||
22 | - size_t ky = stdy * nstds * 2 + 1; | 21 | + size_t kx = (size_t)(stdx * nstds * 2) + 1; //calculate the kernel sizes |
22 | + size_t ky = (size_t)(stdy * nstds * 2) + 1; | ||
23 | size_t X = in.width() - kx + 1; //calculate the size of the output image | 23 | size_t X = in.width() - kx + 1; //calculate the size of the output image |
24 | size_t Y = in.height() - ky + 1; | 24 | size_t Y = in.height() - ky + 1; |
25 | stim::image<T> r(X, Y, in.channels()); //create an output image | 25 | stim::image<T> r(X, Y, in.channels()); //create an output image |
stim/math/filters/sepconv2.cuh
@@ -20,12 +20,12 @@ namespace stim { | @@ -20,12 +20,12 @@ namespace stim { | ||
20 | cudaDeviceProp p; | 20 | cudaDeviceProp p; |
21 | HANDLE_ERROR(cudaGetDeviceProperties(&p, 0)); | 21 | HANDLE_ERROR(cudaGetDeviceProperties(&p, 0)); |
22 | size_t tmax = p.maxThreadsPerBlock; | 22 | size_t tmax = p.maxThreadsPerBlock; |
23 | - dim3 nt(sqrt(tmax), sqrt(tmax)); //calculate the block dimensions | 23 | + dim3 nt((unsigned int)sqrt(tmax), (unsigned int)sqrt(tmax)); //calculate the block dimensions |
24 | size_t X = sx - kx + 1; //calculate the x size of the output image | 24 | size_t X = sx - kx + 1; //calculate the x size of the output image |
25 | T* temp; //declare a temporary variable to store the intermediate image | 25 | T* temp; //declare a temporary variable to store the intermediate image |
26 | HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T))); //allocate memory for the intermediate image | 26 | HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T))); //allocate memory for the intermediate image |
27 | 27 | ||
28 | - dim3 nb(X / nt.x + 1, sy / nt.y + 1); //calculate the grid dimensions | 28 | + dim3 nb((unsigned int)(X / nt.x) + 1, (unsigned int)(sy / nt.y) + 1); //calculate the grid dimensions |
29 | size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T); //shared memory bytes required to store block data | 29 | size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T); //shared memory bytes required to store block data |
30 | if (sm > p.sharedMemPerBlock) { | 30 | if (sm > p.sharedMemPerBlock) { |
31 | std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; | 31 | std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; |
@@ -34,7 +34,7 @@ namespace stim { | @@ -34,7 +34,7 @@ namespace stim { | ||
34 | kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1); //launch the kernel to compute the intermediate image | 34 | kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1); //launch the kernel to compute the intermediate image |
35 | 35 | ||
36 | size_t Y = sy - ky + 1; //calculate the y size of the output image | 36 | size_t Y = sy - ky + 1; //calculate the y size of the output image |
37 | - nb.y = Y / nt.y + 1; //update the grid dimensions to reflect the Y-axis size of the output image | 37 | + nb.y = (unsigned int)(Y / nt.y) + 1; //update the grid dimensions to reflect the Y-axis size of the output image |
38 | sm = nt.x * (nt.y + ky - 1) * sizeof(T); //calculate the amount of shared memory needed for the second pass | 38 | sm = nt.x * (nt.y + ky - 1) * sizeof(T); //calculate the amount of shared memory needed for the second pass |
39 | if (sm > p.sharedMemPerBlock) { | 39 | if (sm > p.sharedMemPerBlock) { |
40 | std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; | 40 | std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; |
stim/parser/arguments.h
@@ -535,7 +535,13 @@ namespace stim{ | @@ -535,7 +535,13 @@ namespace stim{ | ||
535 | 535 | ||
536 | /// @param a is the index of the requested argument | 536 | /// @param a is the index of the requested argument |
537 | std::string arg(size_t a){ | 537 | std::string arg(size_t a){ |
538 | - return args[a]; | 538 | + if (a < nargs()) { |
539 | + return args[a]; | ||
540 | + } | ||
541 | + else { | ||
542 | + std::cout << "stim::arguments ERROR: argument index exceeds number of available arguments" << std::endl; | ||
543 | + exit(1); | ||
544 | + } | ||
539 | } | 545 | } |
540 | 546 | ||
541 | /// Returns an std::vector of argument strings | 547 | /// Returns an std::vector of argument strings |