diff --git a/stim/cuda/arraymath.cuh b/stim/cuda/arraymath.cuh index 548d442..fa9677b 100644 --- a/stim/cuda/arraymath.cuh +++ b/stim/cuda/arraymath.cuh @@ -10,8 +10,8 @@ #include #include #include -#include -#include +//#include +//#include namespace stim{ namespace cuda{ diff --git a/stim/cuda/ivote/david_update_dir_global.cuh b/stim/cuda/ivote/david_update_dir_global.cuh new file mode 100644 index 0000000..3ae624c --- /dev/null +++ b/stim/cuda/ivote/david_update_dir_global.cuh @@ -0,0 +1,171 @@ +#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H +#define STIM_CUDA_UPDATE_DIR_GLOBAL_H + +# include +# include +#include +#include +#include +#include "cpyToshare.cuh" + +#define RMAX_TEST 8 + +namespace stim{ + namespace cuda{ + + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. + template + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ + extern __shared__ T atan2_table[]; + + //calculate the start point for this block + //int bxi = blockIdx.x * blockDim.x; + + stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x); + + __syncthreads(); + + // calculate the 2D coordinates for this current thread. + //int xi = bxi + threadIdx.x; + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel + + int i = yi * x + xi; // convert 2D coordinates to 1D + + float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch + gpuDir[i] = 0; //initialize the vote direction to zero + float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter + int id_x = 0; // define two local variables for the x and y position of the maximum + int id_y = 0; + + int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter + int rmax_sq = rmax * rmax; + int tx_rmax = threadIdx.x + rmax; + float atan_angle; + float vote_c; + unsigned int ind_t; + for(int yr = -rmax; yr <= rmax; yr++){ //for each counter in the y direction + if (yi+yr >= 0 && yi + yr < y){ //if the counter exists (we aren't looking outside of the image) + for(int xr = -rmax; xr <= rmax; xr++){ //for each counter in the x direction + if((xr * xr + yr *yr)< rmax_sq){ //if the counter is within range of the voter + + ind_t = (rmax - yr) * x_table + rmax - xr; //calculate the index to the atan2 table + atan_angle = atan2_table[ind_t]; //retrieve the direction vector from the table + + //atan_angle = atan2((float)yr, (float)xr); + + if (abs(atan_angle - theta) max) { // compare the vote value of this pixel with the max value to find the maxima and its index. + max = vote_c; + id_x = xr; + id_y = yr; + } + } + } + } + } + } + + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); + float new_angle = gpuTable[ind_m]; + + if(xi < x && yi < y) + gpuDir[i] = new_angle; + } //end kernel + + // this kernel updates the gradient direction by the calculated voting direction. + template + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ + + // calculate the 2D coordinates for this current thread. + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + + // convert 2D coordinates to 1D + int i = yi * x + xi; + + //update the gradient image with the vote direction + gpuGrad[2*i] = gpuDir[i]; + } + + template + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + unsigned int max_threads = stim::maxThreadsPerBlock(); + + dim3 threads(sqrt(max_threads), sqrt(max_threads)); + dim3 blocks(x/threads.x + 1, y/threads.y + 1); + + + + // allocate space on the GPU for the updated vote direction + T* gpuDir; + cudaMalloc(&gpuDir, bytes); + + size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2); + std::cout<<"Shared memory for atan2 table: "<>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + //call the kernel to update the gradient direction + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); + + //free allocated memory + cudaFree(gpuDir); + + } + + template + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * sizeof(T); + + //calculate the number of bytes in the atan2 table + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); + + //allocate space on the GPU for the Vote Image + T* gpuVote; + cudaMalloc(&gpuVote, bytes); + + //copy the input vote image to the GPU + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the input Gradient image + T* gpuGrad; + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); + + //copy the Gradient data to the GPU + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); + + //allocate space on the GPU for the atan2 table + T* gpuTable; + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); + + //copy the atan2 values to the GPU + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); + + //call the GPU version of the update direction function + gpu_update_dir(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + + //copy the new gradient image back to the CPU + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; + + //free allocated memory + cudaFree(gpuTable); + cudaFree(gpuVote); + cudaFree(gpuGrad); + } + + } +} + +#endif \ No newline at end of file diff --git a/stim/cuda/ivote/update_dir_global.cuh b/stim/cuda/ivote/update_dir_global.cuh index 71eeebf..f489a8b 100644 --- a/stim/cuda/ivote/update_dir_global.cuh +++ b/stim/cuda/ivote/update_dir_global.cuh @@ -5,25 +5,88 @@ # include #include #include +#include +#include +#include #include "cpyToshare.cuh" -#define RMAX_TEST 8 +//#define RMAX_TEST 8 namespace stim{ namespace cuda{ - - // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. + template __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ + extern __shared__ T S[]; + T* shared_atan = S; + size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1); + stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); + + //T* shared_vote = &S[n_table]; + //size_t template_size_x = (blockDim.x + 2 * rmax); + //size_t template_size_y = (blockDim.y + 2 * rmax); + //stim::cuda::threadedMemcpy2D((char*)shared_vote, (char*)gpuVote, template_size_x, template_size_y, x, threadIdx.y * blockDim.x + threadIdx.x, blockDim.x * blockDim.y); + + int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate the 2D coordinates for this current thread. + int yi = blockIdx.y * blockDim.y + threadIdx.y; + if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel + + int i = yi * x + xi; //convert 2D coordinates to 1D + float theta = gpuGrad[2*i]; //calculate the voting direction based on the grtadient direction - global memory fetch - //calculate the start point for this block - int bxi = blockIdx.x * blockDim.x; + stim::aabb2 bb(xi, yi); //initialize a bounding box at the current point + bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta))); + bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box + bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box + + int x_table = 2*rmax +1; + int lut_i; + T rmax_sq = rmax * rmax; + T dx_sq, dy_sq; + + bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image + bb.trim_high(x-1, y-1); + + int by, bx; + int dx, dy; //coordinate relative to (xi, yi) + T v; + T max_v = 0; //initialize the maximum vote value to zero + T alpha; + int max_dx = bb.low[0]; + int max_dy = bb.low[1]; + for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box + dy = by - yi; //calculate the y coordinate of the current point relative to yi + dy_sq = dy * dy; + for(bx = bb.low[0]; bx <= bb.high[0]; bx++){ + dx = bx - xi; + dx_sq = dx * dx; + lut_i = (rmax - dy) * x_table + rmax - dx; + alpha = shared_atan[lut_i]; + if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ + v = gpuVote[by * x + bx]; // find the vote value for the current counter + if(v > max_v){ + max_v = v; + max_dx = dx; + max_dy = dy; + } + } + } + } + gpuDir[i] = atan2((T)max_dy, (T)max_dx); + } + + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. + template + __global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ + // calculate the 2D coordinates for this current thread. - int xi = bxi + threadIdx.x; - if(xi >= x) return; //if the index is outside of the image, terminate the kernel - int yi = blockIdx.y * blockDim.y + threadIdx.y; + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + + if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel + int i = yi * x + xi; // convert 2D coordinates to 1D float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch @@ -37,27 +100,32 @@ namespace stim{ int tx_rmax = threadIdx.x + rmax; float atan_angle; float vote_c; - for(int yr = -RMAX_TEST; yr <= RMAX_TEST; yr++){ - if (yi+yr >= 0 && yi + yr < y){ - for(int xr = -RMAX_TEST; xr <= RMAX_TEST; xr++){ - - unsigned int ind_t = (RMAX_TEST - yr) * x_table + RMAX_TEST - xr; - - // calculate the angle between the voter and the current pixel in x and y directions - atan_angle = gpuTable[ind_t]; - - // find the vote value for the current counter - vote_c = gpuVote[(yi+yr)*x + (xi+xr)]; - - // check if the current pixel is located in the voting area of this voter. - if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) max) { - - max = vote_c; - id_x = xr; - id_y = yr; + int xidx, yidx, yr_sq, xr_sq; + for(int yr = -rmax; yr <= rmax; yr++){ + yidx = yi + yr; //compute the index into the image + if (yidx >= 0 && yidx < y){ //if the current y-index is inside the image + yr_sq = yr * yr; //compute the square of yr, to save time later + for(int xr = -rmax; xr <= rmax; xr++){ + xidx = xi + xr; + if(xidx >= 0 && xidx < x){ + xr_sq = xr * xr; + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; + + // calculate the angle between the voter and the current pixel in x and y directions + atan_angle = gpuTable[ind_t]; + //atan_angle = atan2((T)yr, (T)xr); + + // check if the current pixel is located in the voting area of this voter. + if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) max) { + + max = vote_c; + id_x = xr; + id_y = yr; + } } } } @@ -70,6 +138,7 @@ namespace stim{ if(xi < x && yi < y) gpuDir[i] = new_angle; } //end kernel + // this kernel updates the gradient direction by the calculated voting direction. template @@ -78,6 +147,8 @@ namespace stim{ // calculate the 2D coordinates for this current thread. int xi = blockIdx.x * blockDim.x + threadIdx.x; int yi = blockIdx.y * blockDim.y + threadIdx.y; + + if(xi >= x || yi >= y) return; // convert 2D coordinates to 1D int i = yi * x + xi; @@ -91,23 +162,43 @@ namespace stim{ //calculate the number of bytes in the array unsigned int bytes = x * y * sizeof(T); - - unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(max_threads, 1); - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); // allocate space on the GPU for the updated vote direction T* gpuDir; - cudaMalloc(&gpuDir, bytes); + HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) ); + + unsigned int max_threads = stim::maxThreadsPerBlock(); + //dim3 threads(min(x, max_threads), 1); + //dim3 blocks(x/threads.x, y); + + dim3 threads( sqrt(max_threads), sqrt(max_threads) ); + dim3 blocks(x/threads.x + 1, y/threads.y + 1); + + size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); + //size_t curtain = 2 * rmax; + //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain); + size_t shared_mem_req = table_bytes;// + template_bytes; + std::cout<<"Shared Memory required: "< shared_mem){ + std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); + stim::gpu2image(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer); + + //exit(0); + + threads = dim3( sqrt(max_threads), sqrt(max_threads) ); + blocks = dim3(x/threads.x + 1, y/threads.y + 1); //call the kernel to update the gradient direction cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); - //free allocated memory - cudaFree(gpuDir); + HANDLE_ERROR( cudaFree(gpuDir) ); } diff --git a/stim/cuda/ivote_atomic.cuh b/stim/cuda/ivote_atomic.cuh index c7a827f..4890a9d 100644 --- a/stim/cuda/ivote_atomic.cuh +++ b/stim/cuda/ivote_atomic.cuh @@ -6,7 +6,7 @@ #include //#include #include -#include +//#include namespace stim{ namespace cuda{ diff --git a/stim/cuda/sharedmem.cuh b/stim/cuda/sharedmem.cuh index b8d4105..a5f05cc 100644 --- a/stim/cuda/sharedmem.cuh +++ b/stim/cuda/sharedmem.cuh @@ -35,10 +35,8 @@ namespace stim{ } } - // Copies values from global memory to shared memory, optimizing threads - template - __device__ void sharedMemcpy(T* dest, T* src, size_t N, size_t tid, size_t nt){ - + // Threaded copying of data on a CUDA device. + __device__ void threadedMemcpy(char* dest, char* src, size_t N, size_t tid, size_t nt){ size_t I = N / nt + 1; //calculate the number of iterations required to make the copy size_t xi = tid; //initialize the source and destination index to the thread ID for(size_t i = 0; i < I; i++){ //for each iteration @@ -48,7 +46,37 @@ namespace stim{ } } - + /// Threaded copying of 2D data on a CUDA device + /// @param dest is a linear destination array of size nx * ny + /// @param src is a 2D image stored as a linear array with a pitch of X + /// @param X is the number of bytes in a row of src + /// @param tid is a 1D id for the current thread + /// @param nt is the number of threads in the block + template + __device__ void threadedMemcpy2D(T* dest, size_t nx, size_t ny, + T* src, size_t x, size_t y, size_t sX, size_t sY, + size_t tid, size_t nt){ + + size_t vals = nx * ny; //calculate the total number of bytes to be copied + size_t I = vals / nt + 1; //calculate the number of iterations required to perform the copy + + size_t src_i, dest_i; + size_t dest_x, dest_y, src_x, src_y; + for(size_t i = 0; i < I; i++){ //for each iteration + dest_i = i * nt + tid; //calculate the index into the destination array + dest_y = dest_i / nx; + dest_x = dest_i - dest_y * nx; + + if(dest_y < ny && dest_x < nx){ + + src_x = x + dest_x; + src_y = y + dest_y; + + src_i = src_y * sX + src_x; + dest[dest_i] = src[src_i]; + } + } + } } } diff --git a/stim/cuda/templates/conv2sep.cuh b/stim/cuda/templates/conv2sep.cuh index bb33af7..d2e111a 100644 --- a/stim/cuda/templates/conv2sep.cuh +++ b/stim/cuda/templates/conv2sep.cuh @@ -30,7 +30,8 @@ namespace stim{ int byi = blockIdx.y; //copy the portion of the image necessary for this block to shared memory - stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); + //stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); + stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); //calculate the thread index int ti = threadIdx.x; @@ -88,7 +89,8 @@ namespace stim{ int byi = blockIdx.y * blockDim.y; //copy the portion of the image necessary for this block to shared memory - stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); + //stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); + stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); //calculate the thread index int ti = threadIdx.y; diff --git a/stim/image/image.h b/stim/image/image.h index beae621..5fc8120 100644 --- a/stim/image/image.h +++ b/stim/image/image.h @@ -160,11 +160,26 @@ public: exit(1); } allocate(cvImage.cols, cvImage.rows, cvImage.channels()); //allocate space for the image - T* cv_ptr = (T*)cvImage.data; + unsigned char* cv_ptr = (unsigned char*)cvImage.data; if(C() == 1) //if this is a single-color image, just copy the data memcpy(img, cv_ptr, bytes()); if(C() == 3) //if this is a 3-color image, OpenCV uses BGR interleaving - set_interleaved_bgr(cv_ptr, X(), Y()); + from_opencv(cv_ptr, X(), Y()); + } + + void from_opencv(unsigned char* buffer, size_t width, size_t height){ + allocate(width, height, 3); + T value; + size_t i; + for(size_t c = 0; c < C(); c++){ //copy directly + for(size_t y = 0; y < Y(); y++){ + for(size_t x = 0; x < X(); x++){ + i = y * X() * C() + x * C() + (2-c); + value = buffer[i]; + img[idx(x, y, c)] = value; + } + } + } } //save a file @@ -180,23 +195,35 @@ public: cv::imwrite(filename, cvImage); } + void set_interleaved(T* buffer, size_t width, size_t height, size_t channels){ + allocate(width, height, channels); + memcpy(img, buffer, bytes()); + } + //create an image from an interleaved buffer void set_interleaved_rgb(T* buffer, size_t width, size_t height){ - allocate(width, height, 3); - memcpy(img, buffer, bytes()); + set_interleaved(buffer, width, height, 3); } void set_interleaved_bgr(T* buffer, size_t width, size_t height){ allocate(width, height, 3); + T value; + size_t i; for(size_t c = 0; c < C(); c++){ //copy directly for(size_t y = 0; y < Y(); y++){ for(size_t x = 0; x < X(); x++){ - img[idx(x, y, c)] = buffer[y * X() * C() + x * C() + (2-c)]; + i = y * X() * C() + x * C() + (2-c); + value = buffer[i]; + img[idx(x, y, c)] = value; } } } } + void set_interleaved(T* buffer, size_t width, size_t height){ + set_interleaved_rgb(buffer, width, height); + } + void get_interleaved_bgr(T* data){ //for each channel diff --git a/stim/optics/scalarfield.h b/stim/optics/scalarfield.h index 1c73002..4d7cf37 100644 --- a/stim/optics/scalarfield.h +++ b/stim/optics/scalarfield.h @@ -71,8 +71,8 @@ public: void to_cpu(){ if(loc == CPUmem) return; else{ - stim::complex* host_E = (stim::complex*) malloc(e_bytes()); //allocate space in main memory - HANDLE_ERROR( cudaMemcpy(host_E, E, e_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU + stim::complex* host_E = (stim::complex*) malloc(grid_bytes()); //allocate space in main memory + HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU HANDLE_ERROR( cudaFree(E) ); //free device memory E = host_E; //swap pointers } diff --git a/stim/visualization/aabb2.h b/stim/visualization/aabb2.h new file mode 100644 index 0000000..fe38f74 --- /dev/null +++ b/stim/visualization/aabb2.h @@ -0,0 +1,49 @@ +#ifndef STIM_AABB2_H +#define STIM_AABB2_H + +#include + +namespace stim{ + +/// Structure for a 2D axis aligned bounding box +template +struct aabb2{ + +//protected: + + T low[2]; //top left corner position + T high[2]; //dimensions along x and y + +//public: + + CUDA_CALLABLE aabb2(T x, T y){ //initialize an axis aligned bounding box of size 0 at the given position + low[0] = high[0] = x; //set the position to the user specified coordinates + low[1] = high[1] = y; + } + + //insert a point into the bounding box, growing the box appropriately + CUDA_CALLABLE void insert(T x, T y){ + if(x < low[0]) low[0] = x; + if(y < low[1]) low[1] = y; + + if(x > high[0]) high[0] = x; + if(y > high[1]) high[1] = y; + } + + //trim the bounding box so that the lower bounds are (x, y) + CUDA_CALLABLE void trim_low(T x, T y){ + if(low[0] < x) low[0] = x; + if(low[1] < y) low[1] = y; + } + + CUDA_CALLABLE void trim_high(T x, T y){ + if(high[0] > x) high[0] = x; + if(high[1] > y) high[1] = y; + } + +}; + +} + + +#endif \ No newline at end of file -- libgit2 0.21.4