diff --git a/stim/cuda/arraymath/array_cart2polar.cuh b/stim/cuda/arraymath/array_cart2polar.cuh index 7f4d0e2..46a78bb 100644 --- a/stim/cuda/arraymath/array_cart2polar.cuh +++ b/stim/cuda/arraymath/array_cart2polar.cuh @@ -4,14 +4,14 @@ namespace stim{ namespace cuda{ template - __global__ void cuda_cart2polar(T* a, int x, int y, float rotation){ + __global__ void cuda_cart2polar(T* a, size_t x, size_t y, float rotation){ // calculate the 2D coordinates for this current thread. - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; // convert 2D coordinates to 1D - int i = yi * x + xi; + size_t i = yi * x + xi; if(xi >= x|| yi >= y) return; @@ -27,11 +27,11 @@ namespace stim{ template - void gpu_cart2polar(T* gpuGrad, unsigned int x, unsigned int y, float rotation = 0){ + void gpu_cart2polar(T* gpuGrad, size_t x, size_t y, float rotation = 0){ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(max_threads, 1); - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); + dim3 blocks(((unsigned int)x/threads.x) + ((unsigned int)x %threads.x == 0 ? 0:1) , (unsigned int)y); //call the kernel to do the multiplication cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y, rotation); @@ -40,11 +40,11 @@ namespace stim{ template - void cpu_cart2polar(T* a, unsigned int x, unsigned int y){ + void cpu_cart2polar(T* a, size_t x, size_t y){ //calculate the number of bytes in the array - unsigned int N = x *y; - unsigned int bytes = N * sizeof(T) * 2; + size_t N = x *y; + size_t bytes = N * sizeof(T) * 2; //allocate memory on the GPU for the array T* gpuA; diff --git a/stim/cuda/ivote/local_max.cuh b/stim/cuda/ivote/local_max.cuh index b65f8a0..38b7096 100644 --- a/stim/cuda/ivote/local_max.cuh +++ b/stim/cuda/ivote/local_max.cuh @@ -10,17 +10,17 @@ namespace stim{ // this kernel calculates the local maximum for finding the cell centers template - __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x, int y){ + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, size_t x, size_t y){ // calculate the 2D coordinates for this current thread. - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; if(xi >= x || yi >= y) return; // convert 2D coordinates to 1D - int i = yi * x + xi; + size_t i = yi * x + xi; gpuCenters[i] = 0; //initialize the value at this location to zero @@ -61,13 +61,13 @@ namespace stim{ } template - void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, unsigned int x, unsigned int y){ + void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, size_t x, size_t y){ unsigned int max_threads = stim::maxThreadsPerBlock(); /*dim3 threads(max_threads, 1); dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/ - dim3 threads( sqrt(max_threads), sqrt(max_threads) ); - dim3 blocks(x/threads.x + 1, y/threads.y + 1); + dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) ); + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1); //call the kernel to find the local maximum. cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, conn, x, y); diff --git a/stim/cuda/ivote/update_dir_bb.cuh b/stim/cuda/ivote/update_dir_bb.cuh index f3c324d..43869ec 100644 --- a/stim/cuda/ivote/update_dir_bb.cuh +++ b/stim/cuda/ivote/update_dir_bb.cuh @@ -81,26 +81,26 @@ namespace stim{ // this kernel updates the gradient direction by the calculated voting direction. template - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, size_t x, size_t y){ // calculate the 2D coordinates for this current thread. - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; if(xi >= x || yi >= y) return; // convert 2D coordinates to 1D - int i = yi * x + xi; + size_t i = yi * x + xi; //update the gradient image with the vote direction gpuGrad[2*i] = gpuDir[i]; } template - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y){ //calculate the number of bytes in the array - unsigned int bytes = x * y * sizeof(T); + size_t bytes = x * y * sizeof(T); // allocate space on the GPU for the updated vote direction T* gpuDir; @@ -108,14 +108,14 @@ namespace stim{ unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads( sqrt(max_threads), sqrt(max_threads) ); - dim3 blocks(x/threads.x + 1, y/threads.y + 1); + dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) ); + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1); size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); //size_t curtain = 2 * rmax; //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain); size_t shared_mem_req = table_bytes;// + template_bytes; - std::cout<<"Shared Memory required: "< shared_mem){ @@ -124,16 +124,10 @@ namespace stim{ } //call the kernel to calculate the new voting direction - cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); - //stim::gpu2image(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer); - - //exit(0); - - //threads = dim3( sqrt(max_threads), sqrt(max_threads) ); - //blocks = dim3(x/threads.x + 1, y/threads.y + 1); + cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, (int)x , (int)y); //call the kernel to update the gradient direction - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, (int)x , (int)y); //free allocated memory HANDLE_ERROR( cudaFree(gpuDir) ); diff --git a/stim/cuda/ivote/vote_atomic_bb.cuh b/stim/cuda/ivote/vote_atomic_bb.cuh index 32de21f..5a05001 100644 --- a/stim/cuda/ivote/vote_atomic_bb.cuh +++ b/stim/cuda/ivote/vote_atomic_bb.cuh @@ -9,12 +9,14 @@ #include #include + + namespace stim{ namespace cuda{ // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area template - __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, size_t x, size_t y, bool gradmag = true){ extern __shared__ T S[]; T* shared_atan = S; @@ -22,12 +24,12 @@ namespace stim{ stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); // calculate the 2D coordinates for this current thread. - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; if(xi >= x || yi >= y) return; // convert 2D coordinates to 1D - int i = yi * x + xi; + size_t i = yi * x + xi; // calculate the voting direction based on the grtadient direction float theta = gpuGrad[2*i]; @@ -50,7 +52,7 @@ namespace stim{ bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image bb.trim_high(x-1, y-1); - int by, bx; + size_t by, bx; int dx, dy; unsigned int ind_g; //initialize the maximum vote value to zero @@ -66,7 +68,8 @@ namespace stim{ alpha = shared_atan[lut_i]; if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ ind_g = (by)*x + (bx); - atomicAdd(&gpuVote[ind_g], mag); + if(gradmag) atomicAdd(&gpuVote[ind_g], mag); //add the gradient magnitude (if the gradmag flag is enabled) + else atomicAdd(&gpuVote[ind_g], 1.0f); //otherwise just add 1 } } @@ -75,16 +78,22 @@ namespace stim{ } + /// Iterative voting for an image + /// @param gpuVote is the resulting vote image + /// @param gpuGrad is the gradient of the input image + /// @param gpuTable is the pre-computed atan2() table + /// @param phi is the angle of the vote region + /// @param rmax is the estimated radius of the blob (defines the "width" of the vote region) + /// @param x and y are the spatial dimensions of the gradient image + /// @param gradmag defines whether or not the gradient magnitude is taken into account during the vote template - void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ - - + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y, bool gradmag = true){ unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads( sqrt(max_threads), sqrt(max_threads) ); - dim3 blocks(x/threads.x + 1, y/threads.y + 1); + dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) ); + dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1); size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); size_t shared_mem_req = table_bytes;// + template_bytes; - std::cout<<"Shared Memory required: "< shared_mem){ std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y, gradmag); } diff --git a/stim/cuda/ivote_atomic_bb.cuh b/stim/cuda/ivote_atomic_bb.cuh index 14e002d..07f3224 100644 --- a/stim/cuda/ivote_atomic_bb.cuh +++ b/stim/cuda/ivote_atomic_bb.cuh @@ -1,6 +1,7 @@ #ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H #define STIM_CUDA_IVOTE_ATOMIC_BB_H +extern bool DEBUG; #include #include #include diff --git a/stim/cuda/templates/gradient.cuh b/stim/cuda/templates/gradient.cuh index febd57b..c3816ec 100644 --- a/stim/cuda/templates/gradient.cuh +++ b/stim/cuda/templates/gradient.cuh @@ -9,25 +9,25 @@ namespace stim{ namespace cuda{ template - __global__ void gradient_2d(T* out, T* in, int x, int y){ + __global__ void gradient_2d(T* out, T* in, size_t x, size_t y){ // calculate the 2D coordinates for this current thread. - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; // convert 2D coordinates to 1D - int i = yi * x + xi; + size_t i = yi * x + xi; //return if the pixel is outside of the image if(xi >= x || yi >= y) return; //calculate indices for the forward difference - int i_xp = yi * x + (xi + 1); - int i_yp = (yi + 1) * x + xi; + size_t i_xp = yi * x + (xi + 1); + size_t i_yp = (yi + 1) * x + xi; //calculate indices for the backward difference - int i_xn = yi * x + (xi - 1); - int i_yn = (yi - 1) * x + xi; + size_t i_xn = yi * x + (xi - 1); + size_t i_yn = (yi - 1) * x + xi; //use forward differences if a coordinate is zero if(xi == 0) @@ -47,12 +47,12 @@ namespace stim{ } template - void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ + void gpu_gradient_2d(T* gpuGrad, T* gpuI, size_t x, size_t y){ //get the maximum number of threads per block for the CUDA device unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(max_threads, 1); - dim3 blocks(x/threads.x + 1 , y); + dim3 blocks((unsigned int)(x/threads.x) + 1 , (unsigned int)y); //call the GPU kernel to determine the gradient diff --git a/stim/image/image.h b/stim/image/image.h index fbb5499..bd96c67 100644 --- a/stim/image/image.h +++ b/stim/image/image.h @@ -122,9 +122,9 @@ public: } stim::image& operator=(const stim::image& I){ - init(); if(&I == this) //handle self-assignment return *this; + init(); allocate(I.X(), I.Y(), I.C()); memcpy(img, I.img, bytes()); return *this; @@ -423,6 +423,33 @@ public: return result; } + /// Adds curcular padding for the specified number of pixels - in this case replicating the boundary pixels + image pad_replicate(size_t p) { + image result(width() + p * 2, height() + p * 2, channels()); //create an output image + result = 0; + //result = value; //assign the border value to all pixels in the new image + for (size_t y = 0; y < height(); y++) { //for each pixel in the original image + for (size_t x = 0; x < width(); x++) { + size_t n = (y + p) * (width() + p * 2) + x + p; //calculate the index of the corresponding pixel in the result image + size_t n0 = idx(x, y); //calculate the index for this pixel in the original image + result.data()[n] = img[n0]; // copy the original image to the result image afer the border area + } + } + size_t l = p; + size_t r = p + width() - 1; + size_t t = p; + size_t b = p + height() - 1; + for (size_t y = 0; y < p; y++) for (size_t x = l; x <= r; x++) result(x, y) = result(x, t); //pad the top + for (size_t y = b + 1; y < result.height(); y++) for (size_t x = l; x <= r; x++) result(x, y) = result(x, b); //pad the bottom + for (size_t y = t; y <= b; y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, y); //pad the left + for (size_t y = t; y <= b; y++) for (size_t x = r+1; x < result.width(); x++) result(x, y) = result(r, y); //pad the right + for (size_t y = 0; y < t; y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, t); //pad the top left + for (size_t y = 0; y < t; y++) for (size_t x = r+1; x < result.width(); x++) result(x, y) = result(r, t); //pad the top right + for (size_t y = b+1; y < result.height(); y++) for (size_t x = 0; x < l; x++) result(x, y) = result(l, b); //pad the bottom left + for (size_t y = b+1; y < result.height(); y++) for (size_t x = r + 1; x < result.width(); x++) result(x, y) = result(r, b); //pad the bottom right + return result; + } + /// Copy the given data to the specified channel /// @param c is the channel number that the data will be copied to @@ -546,7 +573,7 @@ public: } //crop regions given by an array of 1D index values - std::vector> crop_idx(size_t w, size_t h, std::vector idx) { + std::vector< image > crop_idx(size_t w, size_t h, std::vector idx) { std::vector> result(idx.size()); //create an array of image files to return for (size_t i = 0; i < idx.size(); i++) { //for each specified index point size_t y = idx[i] / X(); //calculate the y coordinate from the 1D index (center of ROI) diff --git a/stim/math/filters/gauss2.cuh b/stim/math/filters/gauss2.cuh index 7e50ece..bd23d42 100644 --- a/stim/math/filters/gauss2.cuh +++ b/stim/math/filters/gauss2.cuh @@ -18,8 +18,8 @@ namespace stim { ///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel template stim::image cpu_gauss2(const stim::image& in, K stdx, K stdy, size_t nstds = 3) { - size_t kx = stdx * nstds * 2 + 1; //calculate the kernel sizes - size_t ky = stdy * nstds * 2 + 1; + size_t kx = (size_t)(stdx * nstds * 2) + 1; //calculate the kernel sizes + size_t ky = (size_t)(stdy * nstds * 2) + 1; size_t X = in.width() - kx + 1; //calculate the size of the output image size_t Y = in.height() - ky + 1; stim::image r(X, Y, in.channels()); //create an output image diff --git a/stim/math/filters/sepconv2.cuh b/stim/math/filters/sepconv2.cuh index 5946c6a..f733a76 100644 --- a/stim/math/filters/sepconv2.cuh +++ b/stim/math/filters/sepconv2.cuh @@ -20,12 +20,12 @@ namespace stim { cudaDeviceProp p; HANDLE_ERROR(cudaGetDeviceProperties(&p, 0)); size_t tmax = p.maxThreadsPerBlock; - dim3 nt(sqrt(tmax), sqrt(tmax)); //calculate the block dimensions + dim3 nt((unsigned int)sqrt(tmax), (unsigned int)sqrt(tmax)); //calculate the block dimensions size_t X = sx - kx + 1; //calculate the x size of the output image T* temp; //declare a temporary variable to store the intermediate image HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T))); //allocate memory for the intermediate image - dim3 nb(X / nt.x + 1, sy / nt.y + 1); //calculate the grid dimensions + dim3 nb((unsigned int)(X / nt.x) + 1, (unsigned int)(sy / nt.y) + 1); //calculate the grid dimensions size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T); //shared memory bytes required to store block data if (sm > p.sharedMemPerBlock) { std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; @@ -34,7 +34,7 @@ namespace stim { kernel_conv2 <<>> (temp, in, k0, sx, sy, kx, 1); //launch the kernel to compute the intermediate image size_t Y = sy - ky + 1; //calculate the y size of the output image - nb.y = Y / nt.y + 1; //update the grid dimensions to reflect the Y-axis size of the output image + nb.y = (unsigned int)(Y / nt.y) + 1; //update the grid dimensions to reflect the Y-axis size of the output image sm = nt.x * (nt.y + ky - 1) * sizeof(T); //calculate the amount of shared memory needed for the second pass if (sm > p.sharedMemPerBlock) { std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; diff --git a/stim/parser/arguments.h b/stim/parser/arguments.h index 7a84ce3..06e258b 100644 --- a/stim/parser/arguments.h +++ b/stim/parser/arguments.h @@ -535,7 +535,13 @@ namespace stim{ /// @param a is the index of the requested argument std::string arg(size_t a){ - return args[a]; + if (a < nargs()) { + return args[a]; + } + else { + std::cout << "stim::arguments ERROR: argument index exceeds number of available arguments" << std::endl; + exit(1); + } } /// Returns an std::vector of argument strings -- libgit2 0.21.4