From 310a169858bcdd5efb908f5548e3fd086cd76488 Mon Sep 17 00:00:00 2001 From: leila saadatifard Date: Fri, 7 Apr 2017 13:57:45 -0500 Subject: [PATCH] update the ivote3 project --- Matlab_3D/local_max.m | 4 ++-- cpp/cudafunc.cu | 51 ++++++++++++++++++++++----------------------------- cpp/gaussian_blur3.cuh | 60 +++++++++++++++++++++++++++++++++++++++++------------------- cpp/gradient3.cuh | 18 ++++++++---------- cpp/main.cpp | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------- cpp/update_dir3_aabb.cuh | 29 +++++++++++++++-------------- cpp/vote3_atomic_aabb.cuh | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------------------------------------------ 7 files changed, 257 insertions(+), 243 deletions(-) diff --git a/Matlab_3D/local_max.m b/Matlab_3D/local_max.m index e00cd19..9be5ad4 100644 --- a/Matlab_3D/local_max.m +++ b/Matlab_3D/local_max.m @@ -26,8 +26,8 @@ for li = 1: nl ind_m = sub2ind(size(Ivote), cx, cy, cz); lm_Ivote = A(ind_m); [~, lm_indx] = max(lm_Ivote(:)); - if ind_f(li) == ind_m(lm_indx); - Icenter(ind_f(li)) = 255; + if ind_f(li) == ind_m(lm_indx) + Icenter(ind_f(li)) = A(ind_f(li)); %255 end end diff --git a/cpp/cudafunc.cu b/cpp/cudafunc.cu index 4c5dbd6..db74079 100644 --- a/cpp/cudafunc.cu +++ b/cpp/cudafunc.cu @@ -5,71 +5,65 @@ #include "vote3_atomic_aabb.cuh" #include "update_dir3_aabb.cuh" #include "local_max3.cuh" - +#include void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], - int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ + int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){ - cudaSetDevice(0); - - unsigned int bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data + size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data float* gpuI0; //assign memory on gpu for the input data cudaMalloc(&gpuI0, bytes); cudaMemcpy(gpuI0, img, bytes, cudaMemcpyHostToDevice); //copy the image data to the GPU. - gpu_gaussian_blur3(gpuI0, sigma, x, y, z); //call the blurring function from the gpu. cudaDeviceSynchronize(); float* gpu_grad; //assign memory on the gpu for the gradient along the X, y, z. cudaMalloc(&gpu_grad, bytes*3); - gpu_gradient3(gpu_grad, gpuI0, 1, x, y, z); //call the gradient function from the gpu. + gpu_gradient3(gpu_grad, gpuI0, x, y, z); //call the gradient function from the gpu. cudaFree(gpuI0); float* gpu_vote; cudaMalloc(&gpu_vote, bytes); float cos_phi = cos(phi); - + //call the vote function. for (int i = 0; i < iter; i++){ cudaMemset(gpu_vote, 0, bytes); gpu_vote3(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z); cudaDeviceSynchronize(); - - //if (phi >= d_phi){ + if (i == 0) { + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); + std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary); + fvote.write((char*)img, bytes); + fvote.close(); + } + if (i == 1) { + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); + std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary); + fvote.write((char*)img, bytes); + fvote.close(); + } gpu_update_dir3(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z); - cudaDeviceSynchronize(); - phi = phi - d_phi; - cos_phi = cos(phi); - //} - + cudaDeviceSynchronize(); + phi = phi - d_phi; + cos_phi = cos(phi); } cudaFree(gpu_grad); cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); - //allocate space on the gpu for the final detected cells. - //float* gpu_output; - //cudaMalloc(&gpu_output, bytes); - - ////call the local max function - //gpu_local_max3(gpu_output, gpu_vote, t, conn, x, y, z); - - ////copy the final result to the cpu. - //cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost); - // - // cudaFree(gpu_vote); //cudaFree(gpu_output); } -void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ +void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){ unsigned int bytes = x * y * z * sizeof(float); cudaSetDevice(0); @@ -89,5 +83,4 @@ void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, u cudaFree(gpuV); cudaFree(gpuOut); -} - +} \ No newline at end of file diff --git a/cpp/gaussian_blur3.cuh b/cpp/gaussian_blur3.cuh index fa083ca..f668a50 100644 --- a/cpp/gaussian_blur3.cuh +++ b/cpp/gaussian_blur3.cuh @@ -8,6 +8,7 @@ #define pi 3.14159 + template __global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){ @@ -22,7 +23,7 @@ int i = zi * x * y + yi * x + xi; // calculate the kernel size - int k_size = sigma * 4; + T k_size = sigma * 4; //if the current pixel is outside of the image if(xi >= x || yi >= y || zi>=z) @@ -33,14 +34,21 @@ T G; T sum = 0; //running weighted sum across the kernel out[i] = 0; - T sigma_sq = 2 * sigma * sigma; - T a = 1.0 / (sigma * sqrt(2 * pi)); + T sigma_sq = 2.0 * sigma * sigma; + T a = 1.0 / (sigma * sqrt(2.0 * pi)); + + //handle the boundary in x direction + int kl = -(int)k_size; + //if (xi < k_size) kl = -xi; + int kh = (int)k_size; + //if (xi >= x - (int)k_size) kh = x - xi - 1; + //for each element of the kernel - for(int ki = -k_size; ki <= k_size; ki++){ + for(int ki = kl; ki <= kh; ki++){ //calculate the gaussian value - G = a * exp(-(ki*ki) / (sigma_sq)); + G = a * exp(-(T)(ki*ki) / (sigma_sq)); //calculate the global coordinates for this point in the kernel gx = (xi + ki) % x; gi = zi * x * y + yi * x + gx; @@ -65,7 +73,7 @@ int i = zi * x * y + yi * x + xi; // calculate the kernel size - int k_size = sigma * 4; + T k_size = sigma * 4; //if the current pixel is outside of the image if(xi >= x || yi >= y || zi>=z) @@ -76,14 +84,22 @@ T G; T sum = 0; //running weighted sum across the kernel out[i] = 0; - T sigma_sq = 2 * sigma * sigma; - T a = 1.0 / (sigma * sqrt(2 * pi)); + T sigma_sq = 2.0 * sigma * sigma; + T a = 1.0 / (sigma * sqrt(2.0 * pi)); + + //handle the boundary in y direction + int kl = -(int)k_size; + //if (yi < k_size) kl = -yi; + int kh = (int)k_size; + //if (yi >= y - (int)k_size) kh = y - yi - 1; + + //for each element of the kernel - for(int ki = -k_size; ki <= k_size; ki++){ + for(int ki = kl; ki <= kh; ki++){ //calculate the gaussian value - G = a * exp(-(ki*ki) / sigma_sq); + G = a * exp(-(T)(ki*ki) / sigma_sq); //calculate the global coordinates for this point in the kernel gy = (yi + ki ) % y; gi = zi * x * y + gy * x + xi; @@ -108,7 +124,7 @@ int i = zi * x * y + yi * x + xi; // calculate the kernel size - int k_size = sigma * 4; + T k_size = sigma * 4; //if the current pixel is outside of the image if(xi >= x || yi >= y || zi>=z) @@ -119,14 +135,20 @@ T G; T sum = 0; //running weighted sum across the kernel out[i] = 0; - T sigma_sq = 2 * sigma * sigma; - T a = 1.0 / (sigma * sqrt(2 * pi)); + T sigma_sq = 2.0 * sigma * sigma; + T a = 1.0 / (sigma * sqrt(2.0 * pi)); + + //handle the boundary in z direction + int kl = -(int)k_size; + //if (zi < k_size) kl = -zi; + int kh = (int)k_size; + //if (zi >= z - (int)k_size) kh = z - zi - 1; //for each element of the kernel - for(int ki = -k_size; ki <= k_size; ki++){ + for(int ki = kl; ki <= kh; ki++){ //calculate the gaussian value - G = a * exp(-(ki*ki) / sigma_sq); + G = a * exp(-(T)(ki*ki) / sigma_sq); //calculate the global coordinates for this point in the kernel gz = (zi + ki) % z; gi = gz * x * y + yi * x + xi; @@ -138,13 +160,13 @@ } template - void gpu_gaussian_blur3(T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){ + void gpu_gaussian_blur3(T* image, T sigma[], size_t x, size_t y, size_t z){ //get the number of pixels in the image - unsigned int pixels = x * y * z; - unsigned int bytes = sizeof(T) * pixels; + size_t pixels = x * y * z; + size_t bytes = sizeof(T) * pixels; - int max_threads = stim::maxThreadsPerBlock(); + unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(sqrt (max_threads),sqrt (max_threads)); dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); diff --git a/cpp/gradient3.cuh b/cpp/gradient3.cuh index e5407d2..92f0ccf 100644 --- a/cpp/gradient3.cuh +++ b/cpp/gradient3.cuh @@ -7,7 +7,7 @@ #include template -__global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){ +__global__ void gradient3(T* out, T* in, int x, int y, int z){ //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; @@ -55,30 +55,28 @@ __global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){ if(zi > 0 && zi < z-1) out[i * 3 + 2] = (in[i_zp] - in[i_zn]) / 2; - out[i * 3 + 2] *= 1/anisotropy; - } template -void gpu_gradient3(T* gpuGrad, T* gpuI, float anisotropy, unsigned int x, unsigned int y, unsigned int z){ +void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){ - int max_threads = stim::maxThreadsPerBlock(); + unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(sqrt (max_threads),sqrt (max_threads)); dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); //call the GPU kernel to determine the gradient - gradient3 <<< blocks, threads >>>(gpuGrad, gpuI, anisotropy, x, y, z); + gradient3 <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z); } template -void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int y, unsigned int z){ +void cpu_gradient3(T* out, T* in, size_t x, size_t y, size_t z){ //get the number of pixels in the image - unsigned int pixels = x * y * z; - unsigned int bytes = pixels * sizeof(T); + size_t pixels = x * y * z; + size_t bytes = pixels * sizeof(T); //allocate space on the GPU for the input image T* gpuIn; @@ -92,7 +90,7 @@ void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int cudaMalloc(&gpuOut, bytes * 3); //the output image will have two channels (x, y) //call the GPU version of this function - gpu_gradient3(gpuOut, gpuIn, anisotropy, x, y, z); + gpu_gradient3(gpuOut, gpuIn, x, y, z); //copy the results to the CPU cudaMemcpy(out, gpuOut, bytes * 3, cudaMemcpyDeviceToHost); diff --git a/cpp/main.cpp b/cpp/main.cpp index 196e41a..1f0d7a2 100644 --- a/cpp/main.cpp +++ b/cpp/main.cpp @@ -19,21 +19,22 @@ float phi; size_t x, y, z; -void ivote3(float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], - unsigned int x, unsigned int y, unsigned int z); -void lmax(float* center, float* vote, float t1, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z); - -void invert_data(float* cpuI, unsigned int x, unsigned int y, unsigned int z){ - for(int ix = 0; ix < x; ix++){ - for (int iy = 0; iy < y; iy++){ - for (int iz = 0; iz < z; iz++){ - int idx = iz * x * y + iy * x + ix; - cpuI[idx] = 255 - cpuI[idx]; - } +void ivote3(float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], + size_t x, size_t y, size_t z); +void lmax(float* center, float* vote, float t1, unsigned int conn[], size_t x, size_t y, size_t z); + +void invert_data(float* cpuI, size_t x, size_t y, size_t z) { + size_t idx; + for (size_t ix = 0; ix < x; ix++) { + for (size_t iy = 0; iy < y; iy++) { + for (size_t iz = 0; iz < z; iz++) { + idx = iz * x * y + iy * x + ix; + cpuI[idx] = 255 - cpuI[idx]; } } } - +} + void advertise() { @@ -57,12 +58,12 @@ void init_args(int argc, char* argv[]) { args.add("x", "size of the dataset along X axis", "positive value"); args.add("y", "size of the dataset along Y axis", "positive value"); args.add("z", "size of the dataset along Z axis", "positive value"); - args.add("t", "threshold value for the final result", "positive valu"); + args.add("t", "threshold value for the final result", "0", "positive valu"); args.add("invert", "to invert the input data set", "string"); args.add("rmax", "maximum possible radius of the cells in the input image", "10", "[positive value]"); args.add("phi", "starting angle for the vote region (in degrees)", "25.0", "0 <= phi < 180"); args.add("iter", "number of iterations for voting", "8", "i > 0"); - args.add("sigma", "the gaussian blur standard deviation", "5", "s >=0 (s = 0, no blurring)"); + args.add("sigma", "the gaussian blur standard deviation", "3", "s >=0 (s = 0, no blurring)"); args.add("conn", "the number of connected neighbors for calculating the local maxima", "5", "[positive value]"); //parse the command line arguments. args.parse(argc, argv); @@ -95,94 +96,93 @@ void init_args(int argc, char* argv[]) { t = args["t"].as_float(); sigma = args["sigma"].as_float(); phi = (float)args["phi"].as_float() * (float)stim::PI / 180; - + } -int main(int argc, char** argv){ +int main(int argc, char** argv) { cudaDeviceProp prop; int count; cudaGetDeviceCount(&count); - for (int i=0; i0){ - nod++; - list << ix << " " << iy << " "<< iz << " " << cpu_out[idx] << '\n' ; - - } - } + + std::ofstream list(args.arg(2)); + // set the number of detected cells to zero. + int nod = 0; + if (list.is_open()) { + + for (int iz = 0; iz0) { + nod++; + list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n'; + } } - - list.close(); + } } + list.close(); + } + //} - cudaDeviceReset(); - -} + cudaDeviceReset(); +} \ No newline at end of file diff --git a/cpp/update_dir3_aabb.cuh b/cpp/update_dir3_aabb.cuh index bd64026..5491985 100644 --- a/cpp/update_dir3_aabb.cuh +++ b/cpp/update_dir3_aabb.cuh @@ -46,7 +46,7 @@ float step = 360.0/n; stim::circle cir(center, d, norm); stim::aabb3 bb(xi,yi,zi); - bb.insert(xc,yc,zc); + bb.insert((int) xc, (int)yc, (int)zc); for(float j = 0; j <360.0; j += step){ stim::vec3 out = cir.p(j); bb.insert(out[0], out[1], out[2]); @@ -67,7 +67,7 @@ float id_x = g[0]; // define local variables for the x, y, and z coordinations point to the vote direction float id_y = g[1]; float id_z = g[2]; - + for (bz=bb.low[2]; bz<=bb.high[2]; bz++){ dz = bz - zi; //compute the distance bw the voter and the current counter along z axis dz_sq = dz * dz; @@ -85,9 +85,9 @@ l_vote = gpu_vote[idx_c]; if (l_vote>max) { max = l_vote; - id_x = dx; - id_y = dy; - id_z = dz; + id_x = (float)dx; + id_y = (float)dy; + id_z = (float)dz; } } } @@ -103,25 +103,26 @@ // this kernel updates the gradient direction by the calculated voting direction. template - __global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){ + __global__ void update_grad3(T* gpu_grad, T* gpu_dir, size_t x, size_t y, size_t z){ - int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread + + size_t grid_y = y / blockDim.y; //find the grid size along y + size_t blockidx_y = blockIdx.y % grid_y; + size_t yi = blockidx_y * blockDim.y + threadIdx.y; + size_t zi = blockIdx.y / grid_y; - int grid_y = y / blockDim.y; //find the grid size along y - int blockidx_y = blockIdx.y % grid_y; - int yi = blockidx_y * blockDim.y + threadIdx.y; - int zi = blockIdx.y / grid_y; - int i = zi * x * y + yi * x + xi; - if(xi >= x || yi >= y || zi >= z) return; + size_t i = zi * x * y + yi * x + xi; + gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0]; //update the gradient image with the new direction direction gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1]; gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2]; } template - void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ + void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(sqrt (max_threads),sqrt (max_threads)); diff --git a/cpp/vote3_atomic_aabb.cuh b/cpp/vote3_atomic_aabb.cuh index 763eba6..f5f21d8 100644 --- a/cpp/vote3_atomic_aabb.cuh +++ b/cpp/vote3_atomic_aabb.cuh @@ -13,121 +13,121 @@ #include #include - // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area - template - __global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ - - int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread - - int grid_y = y / blockDim.y; //find the grid size along y - int blockidx_y = blockIdx.y % grid_y; - int yi = blockidx_y * blockDim.y + threadIdx.y; - int zi = blockIdx.y / grid_y; - - if(xi>=x || yi>=y || zi>=z) return; - - int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter - - - float rx_sq = rx * rx; // compute the square for rmax - float ry_sq = ry * ry; - float rz_sq = rz * rz; - - stim::vec3 g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]); // form a vec3 variable for the gradient vector - stim::vec3 g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector - float n =8; //set the number of points to find the boundaries of the conical voting area - float xc = rx * cos(g_sph[1]) * sin(g_sph[2]); //calculate the center point of the surface of the voting area for the voter - float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ; - float zc = rz * cos(g_sph[2]) ; - float r = sqrt(xc*xc + yc*yc + zc*zc); - xc+=xi; - yc+=yi; - zc+=zi; - stim::vec3 center(xc,yc,zc); - - float d = 2 * r * tan(phi); //find the diameter of the conical voting area - stim::vec3 norm = g.norm(); //compute the normalize gradient vector - float step = 360.0/n; - stim::circle cir(center, d, norm); - stim::aabb3 bb(xi,yi,zi); - bb.insert((int)xc, (int)yc, (int)zc); - for(float j = 0; j <360.0; j += step){ - stim::vec3 out = cir.p(j); - bb.insert(out[0], out[1], out[2]); +// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area +template +__global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z) { + + int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread + + int grid_y = y / blockDim.y; //find the grid size along y + int blockidx_y = blockIdx.y % grid_y; + int yi = blockidx_y * blockDim.y + threadIdx.y; + int zi = blockIdx.y / grid_y; + + if (xi >= x || yi >= y || zi >= z) return; + + int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter + + + float rx_sq = rx * rx; // compute the square for rmax + float ry_sq = ry * ry; + float rz_sq = rz * rz; + + stim::vec3 g(gpu_grad[3 * i], gpu_grad[3 * i + 1], gpu_grad[3 * i + 2]); // form a vec3 variable for the gradient vector + stim::vec3 g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector + float n = 8; //set the number of points to find the boundaries of the conical voting area + float xc = rx * cos(g_sph[1]) * sin(g_sph[2]); //calculate the center point of the surface of the voting area for the voter + float yc = ry * sin(g_sph[1]) * sin(g_sph[2]); + float zc = rz * cos(g_sph[2]); + float r = sqrt(xc*xc + yc*yc + zc*zc); + xc += xi; + yc += yi; + zc += zi; + stim::vec3 center(xc, yc, zc); + + float d = 2 * r * tan(phi); //find the diameter of the conical voting area + stim::vec3 norm = g.norm(); //compute the normalize gradient vector + float step = 360.0 / n; + stim::circle cir(center, d, norm); + stim::aabb3 bb(xi, yi, zi); + bb.insert((int)xc, (int)yc, (int)zc); + for (float j = 0; j <360.0; j += step) { + stim::vec3 out = cir.p(j); + bb.insert(out[0], out[1], out[2]); + } + bb.trim_low(xi - rx, yi - ry, zi - rz); + bb.trim_low(0, 0, 0); + bb.trim_high(xi + rx, yi + ry, zi + rz); + bb.trim_high(x - 1, y - 1, z - 1); + int bx, by, bz; + int dx, dy, dz; + float dx_sq, dy_sq, dz_sq; + float dist, cos_diff; + int idx_c; + for (bz = bb.low[2]; bz <= bb.high[2]; bz++) { + dz = bz - zi; //compute the distance bw the voter and the current counter along z axis + dz_sq = dz * dz; + for (by = bb.low[1]; by <= bb.high[1]; by++) { + dy = by - yi; //compute the distance bw the voter and the current counter along y axis + dy_sq = dy * dy; + for (bx = bb.low[0]; bx <= bb.high[0]; bx++) { + dx = bx - xi; //compute the distance bw the voter and the current counter along x axis + dx_sq = dx * dx; + + dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter + cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz) / dist; // calculate the cosine of angle between the voter and the current counter + if (((dx_sq / rx_sq + dy_sq / ry_sq + dz_sq / rz_sq) <= 1) && (cos_diff >= cos_phi)) { //check if the current counter located in the voting area of the voter + idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter + atomicAdd(&gpu_vote[idx_c], g_sph[0]); + } } - bb.trim_low(xi-rx, yi-ry, zi-rz); - bb.trim_low(0,0,0); - bb.trim_high(xi+rx, yi+ry, zi+rz); - bb.trim_high(x-1, y-1, z-1); - int bx,by,bz; - int dx, dy, dz; - float dx_sq, dy_sq, dz_sq; - float dist, cos_diff; - int idx_c; - for (bz=bb.low[2]; bz<=bb.high[2]; bz++){ - dz = bz - zi; //compute the distance bw the voter and the current counter along z axis - dz_sq = dz * dz; - for (by=bb.low[1]; by<=bb.high[1]; by++){ - dy = by - yi; //compute the distance bw the voter and the current counter along y axis - dy_sq = dy * dy; - for (bx=bb.low[0]; bx<=bb.high[0]; bx++){ - dx = bx - xi; //compute the distance bw the voter and the current counter along x axis - dx_sq = dx * dx; - - dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter - cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz)/dist; // calculate the cosine of angle between the voter and the current counter - if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){ //check if the current counter located in the voting area of the voter - idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter - atomicAdd (&gpu_vote[idx_c] , g_sph[0]); - } - } - } - } } + } +} - template - void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ +template +void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z) { - - unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(sqrt (max_threads),sqrt (max_threads)); - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); - vote3 <<< blocks, threads >>>(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x , y, z); //call the kernel to do the voting - } + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(sqrt(max_threads), sqrt(max_threads)); + dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); + vote3 << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z); //call the kernel to do the voting +} - template - void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ - - //calculate the number of bytes in the array - unsigned int bytes = x * y * z * sizeof(T); - - - //allocate space on the GPU for the Vote Image - T* gpu_vote; - cudaMalloc(&gpu_vote, bytes); - - //allocate space on the GPU for the input Gradient image - T* gpu_grad; - cudaMalloc(&gpu_grad, bytes*3); - - - //copy the Gradient data to the GPU - cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); - - - //call the GPU version of the vote calculation function - gpu_vote3(gpu_vote, gpu_grad, cos_phi, r, x , y, z); - - //copy the Vote Data back to the CPU - cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ; - - //free allocated memory - cudaFree(gpu_vote); - cudaFree(gpu_grad); - - } + +template +void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z) { + + //calculate the number of bytes in the array + unsigned int bytes = x * y * z * sizeof(T); + + + //allocate space on the GPU for the Vote Image + T* gpu_vote; + cudaMalloc(&gpu_vote, bytes); + + //allocate space on the GPU for the input Gradient image + T* gpu_grad; + cudaMalloc(&gpu_grad, bytes * 3); + + + //copy the Gradient data to the GPU + cudaMemcpy(gpu_grad, cpu_grad, bytes * 3, cudaMemcpyHostToDevice); + + + //call the GPU version of the vote calculation function + gpu_vote3(gpu_vote, gpu_grad, cos_phi, r, x, y, z); + + //copy the Vote Data back to the CPU + cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost); + + //free allocated memory + cudaFree(gpu_vote); + cudaFree(gpu_grad); + +} #endif \ No newline at end of file -- libgit2 0.21.4