diff --git a/stim/cuda/ivote/down_sample.cuh b/stim/cuda/ivote/down_sample.cuh index 598d095..3e26b50 100644 --- a/stim/cuda/ivote/down_sample.cuh +++ b/stim/cuda/ivote/down_sample.cuh @@ -25,8 +25,8 @@ namespace stim{ if(xi< x_ds && yi< y_ds){ - int x_org = xi * sigma_ds; - int y_org = yi * sigma_ds; + int x_org = xi * sigma_ds ; + int y_org = yi * sigma_ds ; int i_org = y_org * x + x_org; gpuI[i] = gpuI0[i_org]; } diff --git a/stim/cuda/ivote/local_max.cuh b/stim/cuda/ivote/local_max.cuh index 9519735..057f544 100644 --- a/stim/cuda/ivote/local_max.cuh +++ b/stim/cuda/ivote/local_max.cuh @@ -12,20 +12,48 @@ namespace stim{ // this kernel calculates the local maximum for finding the cell centers template - __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){ // calculate the 2D coordinates for this current thread. int xi = blockIdx.x * blockDim.x + threadIdx.x; int yi = blockIdx.y; + + if(xi >= x || yi >= y) + return; + + // convert 2D coordinates to 1D - int i = yi * x + xi; - - - + int i = yi * x + xi; + + // START DAVID + + gpuCenters[i] = 0; //initialize the value at this location to zero + + T val = gpuVote[i]; + + //compare to the threshold + if(val < final_t) return; + + for(int xl = xi - conn; xl < xi + conn; xl++){ + for(int yl = yi - conn; yl < yi + conn; yl++){ + if(xl >= 0 && xl < x && yl >= 0 && yl < y){ + int il = yl * x + xl; + if(gpuVote[il] > val){ + return; + } + } + } + } + + gpuCenters[i] = 1; + + // END DAVID + /* //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. int xl = xi - conn; int yl = yi - conn; + // use zero for the lowest limits if the xi or yi is less than conn. if (xi <= conn) xl = 0; @@ -82,6 +110,7 @@ namespace stim{ // set the center value for this pixel to high if it's a local max ,and to low if not. gpuCenters[i] = l_value ; } + */ } diff --git a/stim/cuda/ivote/update_dir.cuh b/stim/cuda/ivote/update_dir.cuh index 5ec9f0a..a4e5d62 100644 --- a/stim/cuda/ivote/update_dir.cuh +++ b/stim/cuda/ivote/update_dir.cuh @@ -12,7 +12,7 @@ namespace stim{ // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. template - __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ //generate a pointer to shared memory (size will be specified as a kernel parameter) extern __shared__ float s_vote[]; @@ -20,12 +20,9 @@ namespace stim{ //calculate the start point for this block int bxi = blockIdx.x * blockDim.x; - //calculate the width of the shared memory block - int swidth = 2 * rmax + blockDim.x; - // calculate the 2D coordinates for this current thread. int xi = bxi + threadIdx.x; - int yi = blockIdx.y; + int yi = blockIdx.y * blockDim.y + threadIdx.y; // convert 2D coordinates to 1D int i = yi * x + xi; @@ -43,14 +40,17 @@ namespace stim{ int id_x = 0; int id_y = 0; + //calculate the width of the shared memory block + int swidth = 2 * rmax + blockDim.x; + // compute the size of window which will be checked for finding the voting area for this voter - unsigned int x_table = 2*rmax +1; - unsigned int rmax_sq = rmax * rmax; - int r = (int)rmax; + int x_table = 2*rmax +1; + int rmax_sq = rmax * rmax; int tx_rmax = threadIdx.x + rmax; int bxs = bxi - rmax; - for(int yr = -r; yr <= r; yr++){ + + for(int yr = -rmax; yr <= rmax; yr++){ //copy the portion of the image necessary for this block to shared memory __syncthreads(); @@ -60,7 +60,7 @@ namespace stim{ //if the current thread is outside of the image, it doesn't have to be computed if(xi < x && yi < y){ - for(int xr = -r; xr <= r; xr++){ + for(int xr = -rmax; xr <= rmax; xr++){ unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; @@ -93,19 +93,21 @@ namespace stim{ float new_angle = gpuTable[ind_m]; - gpuDir[i] = new_angle; + if(xi < x && yi < y) + gpuDir[i] = new_angle; } // this kernel updates the gradient direction by the calculated voting direction. template - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){ + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ //************ when the number of threads are (1024,1) ************* // calculate the 2D coordinates for this current thread. int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + // convert 2D coordinates to 1D int i = yi * x + xi; @@ -126,7 +128,8 @@ namespace stim{ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(max_threads, 1); dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); - + //dim3 threads(1, 1); + //dim3 blocks(x, y); // Allocate CUDA array in device memory //define a channel descriptor for a single 32-bit channel diff --git a/stim/cuda/ivote/vote.cuh b/stim/cuda/ivote/vote.cuh index 7e98894..54daf54 100644 --- a/stim/cuda/ivote/vote.cuh +++ b/stim/cuda/ivote/vote.cuh @@ -13,7 +13,7 @@ namespace stim{ // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area template - __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, int rmax, int x, int y){ //generate a pointer to shared memory (size will be specified as a kernel parameter) extern __shared__ float2 s_grad[]; @@ -21,42 +21,41 @@ namespace stim{ //calculate the start point for this block int bxi = blockIdx.x * blockDim.x; - //calculate the width of the shared memory block - int swidth = 2 * rmax + blockDim.x; - // calculate the 2D coordinates for this current thread. int xi = bxi + threadIdx.x; - int yi = blockIdx.y; + int yi = blockIdx.y * blockDim.y + threadIdx.y; // convert 2D coordinates to 1D int i = yi * x + xi; - + + // define a local variable to sum the votes from the voters float sum = 0; + + //calculate the width of the shared memory block + int swidth = 2 * rmax + blockDim.x; + // compute the size of window which will be checked for finding the proper voters for this pixel - unsigned int x_table = 2*rmax +1; - - unsigned int rmax_sq = rmax * rmax; - int r = (int)rmax; + int x_table = 2*rmax +1; + int rmax_sq = rmax * rmax; int tx_rmax = threadIdx.x + rmax; int bxs = bxi - rmax; - - for(int yr = -r; yr <= r; yr++){ + //for every line (along y) + for(int yr = -rmax; yr <= rmax; yr++){ //copy the portion of the image necessary for this block to shared memory __syncthreads(); stim::cuda::sharedMemcpy_tex2D(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim); __syncthreads(); - //if the current thread is outside of the image, it doesn't have to be computed if(xi < x && yi < y){ - - for(int xr = -r; xr <= r; xr++){ + + for(int xr = -rmax; xr <= rmax; xr++){ //find the location of this voter in the atan2 table - unsigned int id_t = (yr + rmax) * x_table + xr + rmax; + int id_t = (yr + rmax) * x_table + xr + rmax; // calculate the angle between the pixel and the current voter in x and y directions float atan_angle = gpuTable[id_t]; @@ -74,10 +73,12 @@ namespace stim{ } } + } } - - gpuVote[i] = sum; + if(xi < x && yi < y) + gpuVote[i] = sum; + } template @@ -92,7 +93,9 @@ namespace stim{ //unsigned int thread_dim = sqrt(max_threads); dim3 threads(max_threads, 1); dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); - + //dim3 threads(1,1); + //dim3 blocks(x, y); + // Allocate CUDA array in device memory //define a channel descriptor for a single 32-bit channel diff --git a/stim/cuda/templates/gradient.cuh b/stim/cuda/templates/gradient.cuh index c55ec91..25e225e 100644 --- a/stim/cuda/templates/gradient.cuh +++ b/stim/cuda/templates/gradient.cuh @@ -9,7 +9,7 @@ namespace stim{ namespace cuda{ template - __global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){ + __global__ void gradient_2d(T* out, T* in, int x, int y){ //calculate the 1D image index for this thread int i = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/stim/image/image.h b/stim/image/image.h index 8bbb507..439cbf6 100644 --- a/stim/image/image.h +++ b/stim/image/image.h @@ -48,7 +48,7 @@ public: //create an image from an interleaved buffer void set_interleaved(T* buffer, unsigned int width, unsigned int height, unsigned int channels = 1){ - unsigned char* non_interleaved = (unsigned char*)malloc(width * height * 3); + T* non_interleaved = (T*)malloc(width * height * 3 * sizeof(T)); unsigned int S = width * height; for(unsigned int i = 0; i < S; i++){ @@ -57,7 +57,7 @@ public: } } - img = cimg_library::CImg(non_interleaved, width, height, 1, channels); + img = cimg_library::CImg(non_interleaved, width, height, 1, channels); } //fills an allocated region of memory with non-interleaved data -- libgit2 0.21.4