diff --git a/stim/cuda/arraymath/array_cart2polar.cuh b/stim/cuda/arraymath/array_cart2polar.cuh index 8889b03..37e44db 100644 --- a/stim/cuda/arraymath/array_cart2polar.cuh +++ b/stim/cuda/arraymath/array_cart2polar.cuh @@ -4,42 +4,47 @@ namespace stim{ namespace cuda{ template - __global__ void cuda_cart2polar(T* a, unsigned int N){ + __global__ void cuda_cart2polar(T* a, int x, int y){ - //calculate the 1D index for this thread - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if(i < N){ - float x = a[i * 2 + 0]; - float y = a[i * 2 + 1]; - float theta = atan2( y, x ) ; - float r = sqrt(x * x + y * y); + // calculate the 2D coordinates for this current thread. + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + // convert 2D coordinates to 1D + int i = yi * x + xi; + + + if(xi >= x|| yi >= y) return; + + float xl = a[i * 2 + 0]; + float yl = a[i * 2 + 1]; + float theta = atan2( yl, xl ) ; + float r = sqrt(xl * xl + yl * yl); a[i * 2 + 0] = theta; a[i * 2 + 1] = r; - } + } template - void gpu_cart2polar(T* gpuGrad, unsigned int N){ - - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); + void gpu_cart2polar(T* gpuGrad, unsigned int x, unsigned int y){ - //calculate the number of blocks - int blocks = N / threads + (N % threads == 0 ? 0:1); + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(max_threads, 1); + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); + //call the kernel to do the multiplication - cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N); + cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y); } template - void cpu_cart2polar(T* a, unsigned int N){ + void cpu_cart2polar(T* a, unsigned int x, unsigned int y){ //calculate the number of bytes in the array + unsigned int N = x *y; unsigned int bytes = N * sizeof(T) * 2; //allocate memory on the GPU for the array @@ -50,7 +55,7 @@ namespace stim{ HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); //call the GPU version of this function - gpu_cart2polar(gpuA, N); + gpu_cart2polar(gpuA, x, y); //copy the array back to the CPU HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); diff --git a/stim/cuda/ivote/local_max.cuh b/stim/cuda/ivote/local_max.cuh index 057f544..a2bd2fe 100644 --- a/stim/cuda/ivote/local_max.cuh +++ b/stim/cuda/ivote/local_max.cuh @@ -1,7 +1,6 @@ #ifndef STIM_CUDA_LOCAL_MAX_H #define STIM_CUDA_LOCAL_MAX_H - # include # include #include @@ -9,7 +8,6 @@ namespace stim{ namespace cuda{ - // this kernel calculates the local maximum for finding the cell centers template __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){ @@ -20,13 +18,10 @@ namespace stim{ if(xi >= x || yi >= y) return; - - + // convert 2D coordinates to 1D int i = yi * x + xi; - // START DAVID - gpuCenters[i] = 0; //initialize the value at this location to zero T val = gpuVote[i]; @@ -46,100 +41,22 @@ namespace stim{ } gpuCenters[i] = 1; - - // END DAVID - /* - //calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. - int xl = xi - conn; - int yl = yi - conn; - - - // use zero for the lowest limits if the xi or yi is less than conn. - if (xi <= conn) - xl = 0; - if (yi <= conn) - yl = 0; - - //calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'. - int xh = xi + conn; - int yh = yi + conn; - - // use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn. - if (xi >= x - conn) - xh = x; - if (yi>= y - conn) - yh = y; - - // calculate the limits for finding the local maximum location in the connected neighbors for the current pixel - int n_l = yl * x + xl; - int n_h = yh * x + xh; - - //initial the centers image to zero - gpuCenters[i] = 0; - - - int n = n_l; - - float l_value = 0; - - if (i < x * y) - - // check if the vote value for this pixel is greater than threshold, so this pixel may be a local max. - if (gpuVote[i]>final_t){ - - // compare the vote value for this pixel with the vote values with its neighbors. - while (n<=n_h){ - - // check if this vote value is a local max in its neighborhood. - if (gpuVote[i] < gpuVote[n]){ - l_value = 0; - n =n_h+1; - } - else if (n == n_h){ - l_value = 1; - n = n+1; - } - // check if the current neighbor is the last one at the current row - else if ((n - n_l - 2*conn)% x ==0){ - n = n + x - 2*conn -1; - n ++; - } - else - n ++; - } - // set the center value for this pixel to high if it's a local max ,and to low if not. - gpuCenters[i] = l_value ; - } - */ - } - - - + template void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ - - - unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(max_threads, 1); dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); - - //call the kernel to find the local maximum. cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); - - } - - template void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ - //calculate the number of bytes in the array unsigned int bytes = x * y * sizeof(T); @@ -147,28 +64,22 @@ namespace stim{ T* gpuCenters; cudaMalloc(&gpuCenters, bytes); - //allocate space on the GPU for the input Vote Image T* gpuVote; cudaMalloc(&gpuVote, bytes); - //copy the Vote image data to the GPU HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); - - + //call the GPU version of the local max function gpu_local_max(gpuCenters, gpuVote, final_t, conn, x, y); - - + //copy the cell centers data to the CPU cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ; - - + //free allocated memory cudaFree(gpuCenters); cudaFree(gpuVote); - cudaFree(gpuGrad); } } diff --git a/stim/cuda/ivote/update_dir.cuh b/stim/cuda/ivote/update_dir.cuh index a4e5d62..5b3d37e 100644 --- a/stim/cuda/ivote/update_dir.cuh +++ b/stim/cuda/ivote/update_dir.cuh @@ -1,7 +1,6 @@ #ifndef STIM_CUDA_UPDATE_DIR_H #define STIM_CUDA_UPDATE_DIR_H - # include # include #include @@ -48,8 +47,7 @@ namespace stim{ int rmax_sq = rmax * rmax; int tx_rmax = threadIdx.x + rmax; int bxs = bxi - rmax; - - + for(int yr = -rmax; yr <= rmax; yr++){ //copy the portion of the image necessary for this block to shared memory @@ -66,8 +64,7 @@ namespace stim{ // calculate the angle between the voter and the current pixel in x and y directions float atan_angle = gpuTable[ind_t]; - - + // calculate the voting direction based on the grtadient direction int idx_share_update = xr + tx_rmax ; float share_vote = s_vote[idx_share_update]; @@ -86,11 +83,8 @@ namespace stim{ } } } - - - //float new_angle = atan2(dy, dx); + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); - float new_angle = gpuTable[ind_m]; if(xi < x && yi < y) @@ -102,35 +96,27 @@ namespace stim{ template __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ - //************ when the number of threads are (1024,1) ************* - // calculate the 2D coordinates for this current thread. int xi = blockIdx.x * blockDim.x + threadIdx.x; int yi = blockIdx.y * blockDim.y + threadIdx.y; // convert 2D coordinates to 1D int i = yi * x + xi; - - + //update the gradient image with the vote direction gpuGrad[2*i] = gpuDir[i]; } - - + template void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ //get the number of pixels in the image unsigned int pixels = x * y; unsigned int bytes = sizeof(T) * pixels; - - + unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(max_threads, 1); dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); - //dim3 threads(1, 1); - //dim3 blocks(x, y); - // Allocate CUDA array in device memory //define a channel descriptor for a single 32-bit channel cudaChannelFormatDesc channelDesc = @@ -174,14 +160,12 @@ namespace stim{ //call the kernel to update the gradient direction cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); - - + //free allocated memory cudaFree(gpuDir); } - - + template void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ @@ -211,12 +195,10 @@ namespace stim{ //copy the atan2 values to the GPU HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); - - + //call the GPU version of the update direction function gpu_update_dir(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); - - + //copy the new gradient image back to the CPU cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; @@ -229,6 +211,4 @@ namespace stim{ } } - - #endif \ No newline at end of file diff --git a/stim/cuda/ivote/vote.cuh b/stim/cuda/ivote/vote.cuh index 54daf54..fdc3677 100644 --- a/stim/cuda/ivote/vote.cuh +++ b/stim/cuda/ivote/vote.cuh @@ -1,13 +1,11 @@ #ifndef STIM_CUDA_VOTE_H #define STIM_CUDA_VOTE_H - # include # include #include #include - namespace stim{ namespace cuda{ @@ -26,13 +24,10 @@ namespace stim{ int yi = blockIdx.y * blockDim.y + threadIdx.y; // convert 2D coordinates to 1D int i = yi * x + xi; - - - + // define a local variable to sum the votes from the voters float sum = 0; - //calculate the width of the shared memory block int swidth = 2 * rmax + blockDim.x; @@ -59,8 +54,7 @@ namespace stim{ // calculate the angle between the pixel and the current voter in x and y directions float atan_angle = gpuTable[id_t]; - - + // calculate the voting direction based on the grtadient direction int idx_share = xr + tx_rmax ; float2 g = s_grad[idx_share]; @@ -71,7 +65,6 @@ namespace stim{ sum += g.y; } - } } @@ -87,15 +80,11 @@ namespace stim{ //get the number of pixels in the image unsigned int pixels = x * y; unsigned int bytes = sizeof(T) * pixels; - - + unsigned int max_threads = stim::maxThreadsPerBlock(); - //unsigned int thread_dim = sqrt(max_threads); dim3 threads(max_threads, 1); dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); - //dim3 threads(1,1); - //dim3 blocks(x, y); - + // Allocate CUDA array in device memory //define a channel descriptor for a single 32-bit channel @@ -131,7 +120,6 @@ namespace stim{ // specify share memory unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4; - //call the kernel to do the voting cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y); @@ -165,14 +153,10 @@ namespace stim{ //copy the atan2 values to the GPU HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); - - //cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table ); - - + //call the GPU version of the vote calculation function gpu_vote(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); - - + //copy the Vote Data back to the CPU cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; @@ -185,6 +169,4 @@ namespace stim{ } } - - #endif \ No newline at end of file diff --git a/stim/cuda/templates/conv2sep.cuh b/stim/cuda/templates/conv2sep.cuh index d63d4e0..b7156c1 100644 --- a/stim/cuda/templates/conv2sep.cuh +++ b/stim/cuda/templates/conv2sep.cuh @@ -49,7 +49,7 @@ namespace stim{ __syncthreads(); //if the current pixel is outside of the image - if(xi > x || yi > y) + if(xi >= x || yi >= y) return; @@ -61,7 +61,7 @@ namespace stim{ //for each element of the kernel for(int ki = -kr; ki <= kr; ki++){ - sum += g[ki + kr] * s[si + ki]; + sum += s[si + ki] * g[ki + kr]; } //calculate the 1D image index for this thread diff --git a/stim/cuda/templates/gradient.cuh b/stim/cuda/templates/gradient.cuh index 25e225e..d0718b2 100644 --- a/stim/cuda/templates/gradient.cuh +++ b/stim/cuda/templates/gradient.cuh @@ -11,12 +11,12 @@ namespace stim{ template __global__ void gradient_2d(T* out, T* in, int x, int y){ - //calculate the 1D image index for this thread - int i = blockIdx.x * blockDim.x + threadIdx.x; - - //convert this to 2D pixel coordinates - int yi = i / x; - int xi = i - (yi * x); + + // calculate the 2D coordinates for this current thread. + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + // convert 2D coordinates to 1D + int i = yi * x + xi; //return if the pixel is outside of the image if(xi >= x || yi >= y) return; @@ -25,16 +25,16 @@ namespace stim{ int i_xp = yi * x + (xi + 1); int i_yp = (yi + 1) * x + xi; + //calculate indices for the backward difference + int i_xn = yi * x + (xi - 1); + int i_yn = (yi - 1) * x + xi; + //use forward differences if a coordinate is zero if(xi == 0) out[i * 2 + 0] = in[i_xp] - in[i]; if(yi == 0) out[i * 2 + 1] = in[i_yp] - in[i]; - //calculate indices for the backward difference - int i_xn = yi * x + (xi - 1); - int i_yn = (yi - 1) * x + xi; - //use backward differences if the coordinate is at the maximum edge if(xi == x-1) out[i * 2 + 0] = in[i] - in[i_xn]; @@ -51,28 +51,16 @@ namespace stim{ } template - //void gpu_gradient_2d(T* gpuOut, T* gpuIn, unsigned int x, unsigned int y){ void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ //get the number of pixels in the image unsigned int pixels = x * y; - //allocate space on the GPU for the input image - //T* gpuI; - //HANDLE_ERROR(cudaMalloc(&gpuI, bytes)); - - //cudaMemcpy(gpuI, gpuI0, bytes, cudaMemcpyDeviceToDevice); - - - //allocate space on the GPU for the output gradient image - //T* gpuGrad; - //cudaMalloc(&gpuGrad, bytes * 2); //the output image will have two channels (x, y) - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); - - //calculate the number of blocks - int blocks = pixels / threads + (pixels%threads == 0 ? 0:1); + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(max_threads, 1); + dim3 blocks(x/threads.x + 1 , y); + //call the GPU kernel to determine the gradient gradient_2d <<< blocks, threads >>>(gpuGrad, gpuI, x, y); @@ -105,6 +93,7 @@ namespace stim{ //free allocated memory cudaFree(gpuOut); + cudaFree(gpuIn); } } -- libgit2 0.21.4