From a744d027c4d9d6583db594dc75811c6ff90eaef6 Mon Sep 17 00:00:00 2001 From: laila Saadatifard Date: Thu, 7 Jul 2016 10:44:37 -0500 Subject: [PATCH] upload the ivote3 functions using the new method for finding voting area (boundaries of voting area)' --- Matlab_3D/gt2.vol | Bin 0 -> 8388608 bytes Matlab_3D/validation.m | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------------------------------------------- cpp/CMakeLists.txt | 3 +++ cpp/args.txt | 1 + cpp/cudafunc.cu | 30 ++++++++++++++++++++---------- cpp/main.cpp | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------- cpp/update_dir3_aabb.cuh | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ cpp/vote3_atomic.cuh | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ cpp/vote3_atomic_aabb.cuh | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 657 insertions(+), 122 deletions(-) create mode 100644 Matlab_3D/gt2.vol create mode 100644 cpp/args.txt create mode 100644 cpp/update_dir3_aabb.cuh create mode 100644 cpp/vote3_atomic.cuh create mode 100644 cpp/vote3_atomic_aabb.cuh diff --git a/Matlab_3D/gt2.vol b/Matlab_3D/gt2.vol new file mode 100644 index 0000000..1399c1a Binary files /dev/null and b/Matlab_3D/gt2.vol differ diff --git a/Matlab_3D/validation.m b/Matlab_3D/validation.m index 99b6aba..6e0056c 100644 --- a/Matlab_3D/validation.m +++ b/Matlab_3D/validation.m @@ -1,5 +1,6 @@ clear all; -disp('***************** NEW RUN *********************'); +%for t=100:100:5000 +t=2350; X = 128; Y = 128; Z = 128; @@ -7,15 +8,15 @@ D = 10; t0=1; r1=12; r2=10; -t=2300; -itr=8; -vote=10; +% t=2300; +itr=5; +vote=7; std = [5 5]; gt_filename = 'gt2.vol'; % out_filename = sprintf('128-128-128/0-nissl-std%d.%d-t0%d-r%d.%d-t%d-out%d.%d.vol',std(1), std(2),t0,r1,r2,t,itr,vote); -out_filename = sprintf('D:/build/ivote3-bld/shared2D-v8/centers.%d.vol',t); +out_filename = sprintf('D:/build/ivote3-bld/0-out.%d.vol',t); % txt_filename = sprintf('128-128-128/0-validation-nissl-std%d.%d-r%d.%d-t%d-out%d.%d-D%d.txt',std(1), std(2),r1,r2,t,itr,vote,D); -txt_filename = sprintf('0-t%d--1.txt',t); +txt_filename = sprintf('D:/build/ivote3-bld/0-t%d-atomic-aabb.txt',t); spec = sprintf('Nissl-std%d.%d-r%d.%d-t%d-out%d.%d',std(1), std(2),r1,r2,t,itr,vote); fid0 = fopen(gt_filename); gt = fread(fid0,[X Y*Z], 'single'); @@ -79,73 +80,73 @@ for k=1:size(id_ref,1) oseg = oseg+1; end end -%**** -% check if the oversegmented detected cells could be assigned to the gt's element with no detected cell! -%find the indices of gt's element with no detected cell -u_idx1 = find(b==0); -%find the coordinates of gt's -ref1 = ref(u_idx1,:); -% find the indices of detected cells which are assigned to a gt's element -ub = unique(b(b>0)); -% set the number of detected cell which are not assigned to a gt's element to zero -mm=0; -% find the indices of detected cell which are not assigned to a gt's element -for j=1:size(cent,1) - z = sum(ub(:)==j); - if z==0 - mm = mm+1; - cent_id1(mm) = j; - end -end -% find the coordinated of of detected cells -cent1 = cent(cent_id1,:); - -% check if there are osme low enough distances -[idx1, dist1] = knnsearch(cent1,ref1); -ind_eq_idx_rc = zeros(numel(idx1), 10); -%set number of cells with the same detected cell center to zero. -noc_rc=0; -for i_rc=1:numel(idx1) - % check the current element hasn't considered yet - s_ind_rc = sum(ind_eq_idx_rc==i_rc); - if s_ind_rc==0 - % check how many of the elemnts assigned to the same detected cell - s_rc = sum(idx1==idx1(i_rc)); - if s_rc>1 - noc_rc = noc_rc+1; - %save the index and number of gt's elements with the same assigned detected cells. - id_ref_rc(noc_rc,1) = i_rc; - id_ref_rc(noc_rc,2) = s_rc; - ind1_rc = find(idx1==idx1(i_rc)); - ind_eq_idx_rc(i_rc, 1:numel(ind1_rc)) = ind1_rc; - end - end -end -% determine those indices which hs assigned to only one detected cell -b_rc = idx1; -u_eq_idx_rc = unique(ind_eq_idx_rc(ind_eq_idx_rc>0)); -b_rc(u_eq_idx_rc)=0; -% set the number of over sefgmented cells to zero. -oseg_rc=0; -for k_rc=1:size(id_ref_rc,1) - k1_rc = id_ref_rc(k_rc,1); - k2_rc = id_ref_rc(k_rc,2); - %find the minimum distance to the detected cell happened for which of the gt's element - l_rc = ind_eq_idx_rc(k1_rc,1:k2_rc); - [~, local_id_min_rc] = min(dist1(l_rc)); - % set the element with minimum distance to the corresponding detected cell - b_rc(l_rc(local_id_min_rc)) = idx1(k1_rc); - %remove the proper element from the list of indices with same designated detected cells - u_eq_idx_rc (u_eq_idx_rc == l_rc(local_id_min_rc))=0; - % check if the indices remained in the list has distance less than the value is set for validation - ly_rc = l_rc (l_rc~=l_rc(local_id_min_rc)); - distl_rc = dist1(ly_rc); - sl_rc = sum(distl_rc0 - oseg_rc = oseg_rc+1; - end -end +% %**** +% % check if the oversegmented detected cells could be assigned to the gt's element with no detected cell! +% %find the indices of gt's element with no detected cell +% u_idx1 = find(b==0); +% %find the coordinates of gt's +% ref1 = ref(u_idx1,:); +% % find the indices of detected cells which are assigned to a gt's element +% ub = unique(b(b>0)); +% % set the number of detected cell which are not assigned to a gt's element to zero +% mm=0; +% % find the indices of detected cell which are not assigned to a gt's element +% for j=1:size(cent,1) +% z = sum(ub(:)==j); +% if z==0 +% mm = mm+1; +% cent_id1(mm) = j; +% end +% end +% % find the coordinated of of detected cells +% cent1 = cent(cent_id1,:); +% +% % check if there are osme low enough distances +% [idx1, dist1] = knnsearch(cent1,ref1); +% ind_eq_idx_rc = zeros(numel(idx1), 10); +% %set number of cells with the same detected cell center to zero. +% noc_rc=0; +% for i_rc=1:numel(idx1) +% % check the current element hasn't considered yet +% s_ind_rc = sum(ind_eq_idx_rc==i_rc); +% if s_ind_rc==0 +% % check how many of the elemnts assigned to the same detected cell +% s_rc = sum(idx1==idx1(i_rc)); +% if s_rc>1 +% noc_rc = noc_rc+1; +% %save the index and number of gt's elements with the same assigned detected cells. +% id_ref_rc(noc_rc,1) = i_rc; +% id_ref_rc(noc_rc,2) = s_rc; +% ind1_rc = find(idx1==idx1(i_rc)); +% ind_eq_idx_rc(i_rc, 1:numel(ind1_rc)) = ind1_rc; +% end +% end +% end +% % determine those indices which hs assigned to only one detected cell +% b_rc = idx1; +% u_eq_idx_rc = unique(ind_eq_idx_rc(ind_eq_idx_rc>0)); +% b_rc(u_eq_idx_rc)=0; +% % set the number of over sefgmented cells to zero. +% oseg_rc=0; +% for k_rc=1:size(id_ref_rc,1) +% k1_rc = id_ref_rc(k_rc,1); +% k2_rc = id_ref_rc(k_rc,2); +% %find the minimum distance to the detected cell happened for which of the gt's element +% l_rc = ind_eq_idx_rc(k1_rc,1:k2_rc); +% [~, local_id_min_rc] = min(dist1(l_rc)); +% % set the element with minimum distance to the corresponding detected cell +% b_rc(l_rc(local_id_min_rc)) = idx1(k1_rc); +% %remove the proper element from the list of indices with same designated detected cells +% u_eq_idx_rc (u_eq_idx_rc == l_rc(local_id_min_rc))=0; +% % check if the indices remained in the list has distance less than the value is set for validation +% ly_rc = l_rc (l_rc~=l_rc(local_id_min_rc)); +% distl_rc = dist1(ly_rc); +% sl_rc = sum(distl_rc0 +% oseg_rc = oseg_rc+1; +% end +% end %****** % b include the gt's element and its detected cells, for those element with no cell detection, b has zero value @@ -154,11 +155,11 @@ b_dist = dist; % remove the disatances for those elements with no detected cells from distance array b_dist(b_ind)=-1; -b_ind_rc = find(b_rc==0); -b_dist_rc = dist1; -b_dist_rc(b_ind_rc)=-1; +% b_ind_rc = find(b_rc==0); +% b_dist_rc = dist1; +% b_dist_rc(b_ind_rc)=-1; % calculate Ttrue Positive, number of detected cells that have low enough distance to one of the gt's elements. -TP = sum(b_dist>=0) - sum(b_dist>D) + (sum(b_dist_rc>=0) - sum(b_dist_rc>D)); +TP = sum(b_dist>=0) - sum(b_dist>D); % + (sum(b_dist_rc>=0) - sum(b_dist_rc>D)); % calculate False Negative, number of gt's elements with no detected cells. FN = size(ref, 1) - TP; % calculate False Positive, number of detected cells, which their distance to the gt's element is long @@ -179,3 +180,5 @@ fprintf(fid_text, 'Precision = %f\n', Precision); fprintf(fid_text, 'Accuracy = %f\n', Accuracy); fprintf(fid_text, 'Recall = %f\n', Recall); fclose(fid_text); +clear all; +%end \ No newline at end of file diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d21dd51..387d47e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -10,6 +10,9 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}") #set up CUDA find_package(CUDA REQUIRED) +#set up opencv +find_package(opencv) + #find the STIM library find_package(STIM REQUIRED) diff --git a/cpp/args.txt b/cpp/args.txt new file mode 100644 index 0000000..6302d1b --- /dev/null +++ b/cpp/args.txt @@ -0,0 +1 @@ +nissl-float-128.128.128.vol 0-out --t 2350 --x 128 --y 128 --z 128 --anisotropy 1 --invert \ No newline at end of file diff --git a/cpp/cudafunc.cu b/cpp/cudafunc.cu index 5a89eaf..fe5734d 100644 --- a/cpp/cudafunc.cu +++ b/cpp/cudafunc.cu @@ -1,7 +1,15 @@ +/*#include "circle_check.cuh" + +void test_3(float* gpu_out, float* gpu_grad, float rmax, float phi, int n, int x, int y, int z){ +gpu_test3(gpu_out, gpu_grad, rmax, phi, n, x, y, z); +} +*/ + + #include "gaussian_blur3.cuh" #include "gradient3.cuh" #include "mag3.cuh" -#include "vote3.cuh" +#include "vote3_atomic_aabb.cuh" #include "update_dir3.cuh" #include "local_max3.cuh" @@ -10,11 +18,11 @@ void ivote3(float* img, float sigma[], float anisotropy, float phi, float d_phi, int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ - cudaSetDevice(0); + cudaSetDevice(1); // compute the number of bytes in the input data unsigned int bytes = x * y * z * sizeof(float); - //assign memory on gpu for the input data. + //assign memory on gpu for the input data.z float* gpuI0; cudaMalloc(&gpuI0, bytes); @@ -41,18 +49,17 @@ void ivote3(float* img, float sigma[], float anisotropy, float phi, float d_phi, //call the vote function. for (int i = 0; i < iter; i++){ - + + cudaMemset(gpu_vote, 0, bytes); gpu_vote3(gpu_vote, gpu_grad, cos_phi, r, x, y, z); cudaDeviceSynchronize(); - /*if (i==7) - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);*/ - - if (phi >= d_phi){ + + //if (phi >= d_phi){ gpu_update_dir3(gpu_grad, gpu_vote, cos_phi, r, x, y, z); cudaDeviceSynchronize(); phi = phi - d_phi; cos_phi = cos(phi); - } + //} } @@ -78,6 +85,8 @@ void ivote3(float* img, float sigma[], float anisotropy, float phi, float d_phi, void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ unsigned int bytes = x * y * z * sizeof(float); + cudaSetDevice(1); + //assign memory on gpu for the input data. float* gpuV; cudaMalloc(&gpuV, bytes); @@ -96,4 +105,5 @@ void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, u cudaFree(gpuV); cudaFree(gpuOut); -} \ No newline at end of file +} + diff --git a/cpp/main.cpp b/cpp/main.cpp index 47d76d8..d2a9636 100644 --- a/cpp/main.cpp +++ b/cpp/main.cpp @@ -9,6 +9,84 @@ #include #define pi 3.14159 +#define M_PI 3.14159 +#include +#include +#include +#include +//#include +//#include +//#include + + +/*void test_3(float* gpu_out, float* gpu_grad, float rmax, float phi, int n, int x, int y, int z); + +int main(){ + + int n=20; + float rmax; + float phi_deg; + float phi; + rmax=4; + phi_deg = 15; + phi = phi_deg * pi/180; + int x,y,z; + x=y=z=1; + unsigned int size = x*y*z*sizeof (float); + float* cpu_grad = (float*) malloc(3*size); + float* gpu_grad; + cudaMalloc(&gpu_grad, 3*size); + cpu_grad[0]=1; + cpu_grad[1]=0; + cpu_grad[2]=-0.5; + cudaMemcpy(gpu_grad, cpu_grad, 3*size, cudaMemcpyHostToDevice); + float* cpu_out = (float*) malloc(3*size*(n+1)); + float* gpu_out; + cudaMalloc(&gpu_out, 3*size*(n+1)); + test_3(gpu_out, gpu_grad, rmax, phi, n, x, y, z); + cudaMemcpy(cpu_out, gpu_out, 3*size*(n+1), cudaMemcpyDeviceToHost); + std::ofstream list("circle_check_cuda1.txt"); + if (list.is_open()){ + for (int j=0; j<=n; ++j) + list << cpu_out[3*j] << '\t' << cpu_out[3*j +1] << '\t' << cpu_out[3*j + 2] << '\n'; + } + list.close(); + */ + /* + int n=10; + stim::circle cir; + float* c0= (float*) malloc(3*sizeof(float)); + c0[0] =-4; + c0[1]=0; + c0[2] = 3; + stim::vec3 c(c0[0],c0[1],c0[2]); + float len = c.len(); + stim::vec3 norm(c0[0]/len,c0[1]/len,c0[2]/len); + std::cout<< len << '\n'; + std::cout<< norm << '\n'; + cir.center(c); + cir.normal(norm); + cir.scale(2); + stim::vec3 out = cir.p(45); + std::vector> out2 = cir.getPoints(n); + + std::cout<< out << '\n'; + std::cout <>::const_iterator i = out2.begin(); i != out2.end(); ++i) + std::cout << *i << '\n'; + std::ofstream list("circle_check.txt"); + if (list.is_open()){ + for (std::vector>::const_iterator j = out2.begin(); j != out2.end(); ++j) + list << *j << '\n'; + } + list.close(); + std::cin >> n; + +} + +*/ void ivote3(float* img, float std[], float anisotropy, float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z); @@ -98,7 +176,7 @@ int main(int argc, char** argv){ float t = args["t"].as_float(); //set the anisotropy float anisotropy = args["anisotropy"].as_float(); - unsigned int rmax = 10 ; + unsigned int rmax = 10; unsigned int r[3] = { 12, rmax, rmax}; float std = 5; float sigma[3] = { std, std, std}; @@ -137,50 +215,51 @@ int main(int argc, char** argv){ std::ofstream fblur("shared2D-v8/vote8.vol", std::ofstream::out | std::ofstream::binary); fblur.write((char*)cpuI, bytes); fblur.close(); - /* - stim::imageimgrad3; - imgrad3.set_interleaved3(cpu_out, 128,128,128,3); - std::ofstream fgx("syn/gx-128.vol", std::ofstream::out | std::ofstream::binary); - fgx.write((char*)imgrad3.channel(0).data(), bytes); - fgx.close(); - */ + + //stim::imageimgrad3; + //imgrad3.set_interleaved3(cpu_out, 128,128,128,3); + //std::ofstream fgx("syn/gx-128.vol", std::ofstream::out | std::ofstream::binary); + //fgx.write((char*)imgrad3.channel(0).data(), bytes); + //fgx.close(); + //write the output file. - //for (int t0=2000; t0<=2500; t0+=100){ + //for (int t0=0; t0<=5000; t0+=100){ + // float t1 = t0; int t0 = t; lmax(cpu_out, cpuI, t, conn, x, y, z); //std::ofstream fo("shared2D-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary); - std::ofstream fo("shared2D-v8/" + OutName.str()+std::to_string(t0)+".vol", std::ofstream::out | std::ofstream::binary); + std::ofstream fo( OutName.str()+std::to_string(t0)+".vol", std::ofstream::out | std::ofstream::binary); fo.write((char*)cpu_out, bytes); fo.close(); // creat a file for saving the list centers - std::ofstream list("shared2D-v8/" + OutName.str()+std::to_string(t0)+".obj"); - // set the number of detected cells to zero. - int nod = 0; - if (list.is_open()){ - - for (int iz=0; iz +# include +#include +#include "cpyToshare.cuh" +#define M_PI 3.14159 +#include +#include +#include +#include +#include + + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. + template + __global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ + //extern __shared__ float s_vote[]; + + int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread + + int grid_y = y / blockDim.y; //find the grid size along y + int blockidx_y = blockIdx.y % grid_y; + int yi = blockidx_y * blockDim.y + threadIdx.y; + int zi = blockIdx.y / grid_y; + if(xi >= x|| yi >= y || zi>= z) return; + int i = zi * x * y + yi * x + xi; //compute the global 1D index for this pixel + + + // find the starting points for this block along the x and y directions + //int bxi = blockIdx.x * blockDim.x; + //int byi = blockidx_y * blockDim.y; + //find the starting points and the size of the window, which will be copied to the 2D-shared memory + //int bxs = bxi - rx; + //int bys = byi - ry; + //int xwidth = 2 * rx + blockDim.x; + //int ywidth = 2 * ry + blockDim.y; + //compute the coordinations of this pixel in the 2D-shared memory. + //int sx_rx = threadIdx.x + rx; + //int sy_ry = threadIdx.y + ry; + + float rx_sq = rx * rx; // compute the square for rmax + float ry_sq = ry * ry; + float rz_sq = rz * rz; + + stim::vec3 g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]); // form a vec3 variable for the gradient vector + stim::vec3 g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector + int n =4; //set the number of points to find the boundaries of the conical voting area + float xc = rx * cos(g_sph[1]) * sin(g_sph[2]) ; //calculate the center point of the surface of the voting area for the voter + float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ; + float zc = rz * cos(g_sph[2]) ; + float r = sqrt(xc*xc + yc*yc + zc*zc); + xc+=xi; + yc+=yi; + zc+=zi; + stim::vec3 center(xc,yc,zc); + float d = 2 * r * tan(acos(cos_phi) ); //find the diameter of the conical voting area + stim::vec3 norm = g.norm(); //compute the normalize gradient vector + float step = 360.0/(float) n; + stim::circle cir(center, d, norm); + stim::aabb3 bb(xi,yi,zi); + bb.insert(xc,yc,zc); + for(float j = 0; j <360.0; j += step){ + stim::vec3 out = cir.p(j); + bb.insert(out[0], out[1], out[2]); + } + + bb.trim_low(0,0,0); + bb.trim_high(x-1, y-1, z-1); + int bx,by,bz; + int dx, dy, dz; + float dx_sq, dy_sq, dz_sq; + + float dist, cos_diff; + int idx_c; + + float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter + float l_vote = 0; + + float id_x = g[0]; // define local variables for the x, y, and z coordinations point to the vote direction + float id_y = g[1]; + float id_z = g[2]; + + for (bz=bb.low[2]; bz<=bb.high[2]; bz++){ + dz = bz - zi; //compute the distance bw the voter and the current counter along z axis + dz_sq = dz * dz; + for (by=bb.low[1]; by<=bb.high[1]; by++){ + dy = by - yi; //compute the distance bw the voter and the current counter along y axis + dy_sq = dy * dy; + for (bx=bb.low[0]; bx<=bb.high[0]; bx++){ + dx = bx - xi; //compute the distance bw the voter and the current counter along x axis + dx_sq = dx * dx; + + dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter + cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz)/dist; // calculate the cosine of angle between the voter and the current counter + if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){ //check if the current counter located in the voting area of the voter + idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter + l_vote = gpu_vote[idx_c]; + if (l_vote>max) { + max = l_vote; + id_x = dx; + id_y = dy; + id_z = dz; + } + } + } + } + } + float m_id = sqrt (id_x*id_x + id_y*id_y + id_z*id_z); + gpu_dir[i * 3 + 0] = g_sph[0] * (id_x/m_id); + gpu_dir[i * 3 + 1] = g_sph[0] * (id_y/m_id); + gpu_dir[i * 3 + 2] = g_sph[0] * (id_z/m_id); + } + + + + // this kernel updates the gradient direction by the calculated voting direction. + template + __global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){ + + //calculate x,y,z coordinates for this thread + int xi = blockIdx.x * blockDim.x + threadIdx.x; + //find the grid size along y + int grid_y = y / blockDim.y; + int blockidx_y = blockIdx.y % grid_y; + int yi = blockidx_y * blockDim.y + threadIdx.y; + int zi = blockIdx.y / grid_y; + int i = zi * x * y + yi * x + xi; + + if(xi >= x || yi >= y || zi >= z) return; + //update the gradient image with the new direction direction + gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0]; + gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1]; + gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2]; + } + + template + void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ + + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(sqrt (max_threads),sqrt (max_threads)); + dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); + //unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*sizeof(T); + // allocate space on the GPU for the updated vote direction + T* gpu_dir; + cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3); + + //call the kernel to calculate the new voting direction + update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z); + + + //call the kernel to update the gradient direction + update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z); + + //free allocated memory + cudaFree(gpu_dir); + + } + + template + void cpu_update_dir3(T* cpu_grad, T* cpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * z * sizeof(T); + + //allocate space on the GPU for the Vote data + T* gpu_vote; + cudaMalloc(&gpu_vote, bytes); + + //copy the input vote data to the GPU + cudaMemcpy(gpu_vote, cpu_vote, bytes, cudaMemcpyHostToDevice); + + //allocate space on the GPU for the Gradient data + T* gpu_grad; + cudaMalloc(&gpu_grad, bytes*3); + + //copy the Gradient data to the GPU + cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); + + //call the GPU version of the update direction function + gpu_update_dir3(gpu_grad, gpu_vote, cos_phi, r, x , y, z); + + //copy the new gradient image back to the CPU + cudaMemcpy(cpu_grad, gpu_grad, bytes*3, cudaMemcpyDeviceToHost) ; + + //free allocated memory + cudaFree(gpu_vote); + cudaFree(gpu_grad); + } + + + +#endif \ No newline at end of file diff --git a/cpp/vote3_atomic.cuh b/cpp/vote3_atomic.cuh new file mode 100644 index 0000000..8d7f609 --- /dev/null +++ b/cpp/vote3_atomic.cuh @@ -0,0 +1,113 @@ +#ifndef STIM_CUDA_VOTE3_ATOMIC_H +#define STIM_CUDA_VOTE3_ATOMIC_H + +#include +#include +#include +#include +#include "cpyToshare.cuh" + + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area + template + __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ + + int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread + + int grid_y = y / blockDim.y; //find the grid size along y + int blockidx_y = blockIdx.y % grid_y; + int yi = blockidx_y * blockDim.y + threadIdx.y; + int zi = blockIdx.y / grid_y; + + if(xi>=x || yi>=y || zi>=z) return; + + int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter + float gx_v = gpu_grad[3*i]; // find the gradient information in cartesian coordinate for the voter - global memory fetch + float gy_v = gpu_grad[3*i+1]; + float gz_v = gpu_grad[3*i+2]; + + float mag_v = sqrt(gx_v*gx_v + gy_v*gy_v + gz_v*gz_v); // compute the gradient magnitude for the voter + + float gx_v_n = gx_v/mag_v; // normalize the gradient vector for the voter + float gy_v_n = gy_v/mag_v; + float gz_v_n = gz_v/mag_v; + + float rx_sq = rx * rx; // compute the square for rmax + float ry_sq = ry * ry; + float rz_sq = rz * rz; + float x_sq, y_sq, z_sq, d_c, cos_diff; + int xi_c, yi_c, zi_c, idx_c; + + for (int z_c=-rz; z_c<=rz; z_c++){ + zi_c = zi + z_c; // calculate the z position for the current counter + if (zi_c =0){ // make sure the current counter is inside the image + z_sq = z_c * z_c; + for (int y_c=-ry; y_c<=ry; y_c++){ + yi_c = yi + y_c; + if (yi_c < y && yi_c>=0){ + y_sq = y_c * y_c; + for (int x_c=-rx; x_c<=rx; x_c++){ + xi_c = xi + x_c; + if (xi_c < x && xi_c>=0){ + x_sq = x_c * x_c; + d_c = sqrt(x_sq + y_sq + z_sq); //calculate the distance between the voter and the current counter + cos_diff = (gx_v_n * x_c + gy_v_n * y_c + gz_v_n * z_c)/(d_c); // calculate the cosine of angle between the voter and the current counter + if ( ( (x_sq/rx_sq + y_sq/ry_sq + z_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){ + idx_c = (zi_c * y + yi_c) * x + xi_c; //calculate the 1D index for the current counter + atomicAdd (&gpu_vote[idx_c] , mag_v); + } + } + } + } + } + } + } + } + + template + void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ + + + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(sqrt (max_threads),sqrt (max_threads)); + dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); + //unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T); + //call the kernel to do the voting + vote3 <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); + + } + + + template + void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * z * sizeof(T); + + + //allocate space on the GPU for the Vote Image + T* gpu_vote; + cudaMalloc(&gpu_vote, bytes); + + //allocate space on the GPU for the input Gradient image + T* gpu_grad; + cudaMalloc(&gpu_grad, bytes*3); + + + //copy the Gradient data to the GPU + cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); + + + //call the GPU version of the vote calculation function + gpu_vote3(gpu_vote, gpu_grad, cos_phi, r, x , y, z); + + //copy the Vote Data back to the CPU + cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ; + + //free allocated memory + cudaFree(gpu_vote); + cudaFree(gpu_grad); + + } + + +#endif \ No newline at end of file diff --git a/cpp/vote3_atomic_aabb.cuh b/cpp/vote3_atomic_aabb.cuh new file mode 100644 index 0000000..ab0b802 --- /dev/null +++ b/cpp/vote3_atomic_aabb.cuh @@ -0,0 +1,133 @@ +#ifndef STIM_CUDA_VOTE3_ATOMIC_AABB_H +#define STIM_CUDA_VOTE3_ATOMIC_AABB_H + +#include +#include +#include +#include +#include "cpyToshare.cuh" +#define M_PI 3.14159 +#include +#include +#include +#include +#include + + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area + template + __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ + + int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread + + int grid_y = y / blockDim.y; //find the grid size along y + int blockidx_y = blockIdx.y % grid_y; + int yi = blockidx_y * blockDim.y + threadIdx.y; + int zi = blockIdx.y / grid_y; + + if(xi>=x || yi>=y || zi>=z) return; + + int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter + + + float rx_sq = rx * rx; // compute the square for rmax + float ry_sq = ry * ry; + float rz_sq = rz * rz; + float dist, cos_diff; + int idx_c; + + //float rmax = sqrt(rx_sq + ry_sq + rz_sq); + stim::vec3 g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]); // form a vec3 variable for the gradient vector + stim::vec3 g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector + int n =4; //set the number of points to find the boundaries of the conical voting area + float xc = rx * cos(g_sph[1]) * sin(g_sph[2]); //calculate the center point of the surface of the voting area for the voter + float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ; + float zc = rz * cos(g_sph[2]) ; + float r = sqrt(xc*xc + yc*yc + zc*zc); + xc+=xi; + yc+=yi; + zc+=zi; + stim::vec3 center(xc,yc,zc); + + float d = 2 * r * tan(acos(cos_phi) ); //find the diameter of the conical voting area + stim::vec3 norm = g.norm(); //compute the normalize gradient vector + float step = 360.0/(float) n; + stim::circle cir(center, d, norm); + stim::aabb3 bb(xi,yi,zi); + bb.insert(xc,yc,zc); + for(float j = 0; j <360.0; j += step){ + stim::vec3 out = cir.p(j); + bb.insert(out[0], out[1], out[2]); + } + + bb.trim_low(0,0,0); + bb.trim_high(x-1, y-1, z-1); + int bx,by,bz; + int dx, dy, dz; + float dx_sq, dy_sq, dz_sq; + for (bz=bb.low[2]; bz<=bb.high[2]; bz++){ + dz = bz - zi; //compute the distance bw the voter and the current counter along z axis + dz_sq = dz * dz; + for (by=bb.low[1]; by<=bb.high[1]; by++){ + dy = by - yi; //compute the distance bw the voter and the current counter along y axis + dy_sq = dy * dy; + for (bx=bb.low[0]; bx<=bb.high[0]; bx++){ + dx = bx - xi; //compute the distance bw the voter and the current counter along x axis + dx_sq = dx * dx; + + dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter + cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz)/dist; // calculate the cosine of angle between the voter and the current counter + if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){ //check if the current counter located in the voting area of the voter + idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter + atomicAdd (&gpu_vote[idx_c] , g_sph[0]); + } + } + } + } + } + + template + void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ + + + unsigned int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(sqrt (max_threads),sqrt (max_threads)); + dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); + vote3 <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); //call the kernel to do the voting + + } + + + template + void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ + + //calculate the number of bytes in the array + unsigned int bytes = x * y * z * sizeof(T); + + + //allocate space on the GPU for the Vote Image + T* gpu_vote; + cudaMalloc(&gpu_vote, bytes); + + //allocate space on the GPU for the input Gradient image + T* gpu_grad; + cudaMalloc(&gpu_grad, bytes*3); + + + //copy the Gradient data to the GPU + cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); + + + //call the GPU version of the vote calculation function + gpu_vote3(gpu_vote, gpu_grad, cos_phi, r, x , y, z); + + //copy the Vote Data back to the CPU + cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ; + + //free allocated memory + cudaFree(gpu_vote); + cudaFree(gpu_grad); + + } + + +#endif \ No newline at end of file -- libgit2 0.21.4