From 89604e92d1faa3934a762e9cf42e61f5644c78bd Mon Sep 17 00:00:00 2001 From: Laila Saadatifard Date: Tue, 2 Feb 2016 15:14:52 -0600 Subject: [PATCH] ivote3 run on the shared memory --- Matlab_3D/0-gt.vol | Bin 0 -> 8388608 bytes Matlab_3D/ivote3.m | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------ Matlab_3D/ivote3_new.m | 341 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Matlab_3D/validation.m | 9 +++++---- cpp/cpyToshare.cuh | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ cpp/float_to_half.cuh | 73 ------------------------------------------------------------------------- cpp/half_to_float.cuh | 75 --------------------------------------------------------------------------- cpp/main.cpp | 15 +++++++-------- cpp/set_rmax.cuh | 67 ------------------------------------------------------------------- cpp/update_dir3.cuh | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------- cpp/vote3.cuh | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------- 11 files changed, 328 insertions(+), 694 deletions(-) create mode 100644 Matlab_3D/0-gt.vol delete mode 100644 Matlab_3D/ivote3_new.m create mode 100644 cpp/cpyToshare.cuh delete mode 100644 cpp/float_to_half.cuh delete mode 100644 cpp/half_to_float.cuh delete mode 100644 cpp/set_rmax.cuh diff --git a/Matlab_3D/0-gt.vol b/Matlab_3D/0-gt.vol new file mode 100644 index 0000000..a323ed5 Binary files /dev/null and b/Matlab_3D/0-gt.vol differ diff --git a/Matlab_3D/ivote3.m b/Matlab_3D/ivote3.m index 83bde12..bbdecf6 100644 --- a/Matlab_3D/ivote3.m +++ b/Matlab_3D/ivote3.m @@ -1,16 +1,17 @@ -clc; + clear; disp('***************** NEW RUN *********************'); total = tic; % ******* Initialize voting parameters ************************************** -rmax = [16 16 8]; %maximum radius of the cell -ang_deg = 20.1; %half the angular range of the voting area +rmax = [10 10 10]; %maximum radius of the cell +rmin = [1 1 1]; +ang_deg = 25.1; %half the angular range of the voting area ang = ang_deg * pi / 180; -iter = 5; %number of voting iterations -t0 = 1.0; %threshold color -sigma = [3, 3, 1.5]; +iter = 8 ; %number of voting iterations +t0 = 1; +sigma = [5, 5, 5]; % t = 0.1; -d_ang= ang / (iter); +d_ang= ang / (iter+2); % ******** Testing parameters ****************************************** % p = [50, 50, 150]; % ps = [400, 400, 200]; @@ -22,15 +23,14 @@ d_ang= ang / (iter); % X = S(1); % Y = S(2); % Z = S(3); -filename = 'nissl-float-128.128.128.vol'; +filename = '128-128-128/nissl-float-128.128.128.vol'; %'nissl-float-128.128.128.vol'; X = 128; Y = 128; Z = 128; -fid = fopen(filename); +fidi = fopen(filename); % load the VOL data into a 2D matrix -I = fread(fid,[X Y*Z], 'single'); -fclose(fid); -%% +I = fread(fidi,[X Y*Z], 'single'); +fclose(fidi); %change this to a 3D matrix I = (reshape(I, [X, Y, Z])); % invert the intensity @@ -38,26 +38,21 @@ I = (255 - I); %perform a gaussian blur Iblur = gauss_blur3d(I, sigma); - -% %crop out a small subregion of I and Iblur -% Iblur = Iblur(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1); -% I = I(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1); -% % compute the gradient [Igrad_y, Igrad_x, Igrad_z] = gradient(Iblur); %calculate the gradient magnitude Imag = sqrt(Igrad_x .^ 2 + Igrad_y .^ 2 + Igrad_z .^2); Isize = size(I); -I = single(I); -Iblur = single(Iblur); -%h = reshape(Imag, [X*Y*Z, 1]); -%hist(h, 100); +% h = reshape(Imag, [X*Y*Z, 1]); +% hist(h, 100); %set a threshold for the gradient magnitude It = Imag > t0; - +fidt = fopen('128-128-128/It.vol', 'w'); +fwrite(fidt, It, 'single'); +fclose(fidt); %Set the boundaries of the threshold image to zero It(1:rmax(1), :, :) = 0; It(X - rmax(1):X, :,:) = 0; @@ -65,13 +60,12 @@ It(:, 1:rmax(2), :) = 0; It(:, Y - rmax(2):Y,:) = 0; It(:, :, 1:rmax(3)) = 0; It(:,:, Z - rmax(3):Z) = 0; -%% + %get the indices of all of the nonzero values in the threshold image % (voter positions) [Itx,Ity,Itz] = ind2sub(size(It),find(It)); Vi =(find(It)); nV = nnz(It); -% % create a meshgrid describing coordinates relative to the voter position rangex = -rmax(1):rmax(1); %create an array of values between -rmax and rmax rangey = -rmax(2):rmax(2); @@ -80,8 +74,9 @@ rangez = -rmax(3):rmax(3); m_mag = (sqrt(mx.^2 + my.^2 + mz.^2)); %create a template describing the distance from the center of a small cube % create a mask for the voting area -M_dist = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1; %mask for the voting area distance (all values < rmax from the center) -%% +M_dist1 = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1 ; %mask for the voting area distance (all values < rmax from the center) +M_dist2 = (mx.^2/rmin(1)^2 + my.^2/rmin(2)^2 + mz.^2/rmin(3)^2) >= 1 ; +M_dist = M_dist1 .* M_dist2; % calculate the direction vector between a pixel and voter LV_x = mx./m_mag; LV_y = my./m_mag; @@ -89,23 +84,24 @@ LV_z = mz./m_mag; %number of pixels in the voting area of each voter (initialize to zero) validPixels = (zeros(nV,1)); -%% %indices of pixels in the voting area of each voter % indices reference the 3D image g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang)); -%% vote +% vote tic; +mask = zeros(Isize); +mask1 = zeros(Isize); + %for each iteration (in iterative voting) -for itr = 1 : iter+1 +for itr = 1 :iter %initialize the vote image to zero Ivote = zeros(Isize); - + %for each voter (nonzero pixels in the threshold image It) for v = 1: nV - %get the cartesian coordinates of the voter v in the main image I vx = Itx(v); vy = Ity(v); @@ -135,7 +131,7 @@ for itr = 1 : iter+1 M_angle = cos_diff >= cos(ang); %combine the two masks to mask out the voting angle - M = M_angle .* M_dist; + M = M_angle.* M_dist; % get the coordinates of each pixel in the final voter mask M pi = find(M); @@ -156,33 +152,20 @@ for itr = 1 : iter+1 Ivote( global_pi ) = Ivote( global_pi ) + vmag; - - end - fid = fopen(sprintf('128-128-128/vote%d',itr), 'w'); - if itr ==1 - fwrite(fid, Ivote, 'single'); - - elseif itr ==2 - fwrite(fid, Ivote, 'single'); - - elseif itr ==3 - fwrite(fid, Ivote, 'single'); - - elseif itr ==4 - fwrite(fid, Ivote, 'single'); - - elseif itr == 5 - fwrite(fid, Ivote, 'single'); - elseif itr == 6 - fwrite(fid, Ivote, 'single'); end + fid = fopen(sprintf('128-128-128/nissl-vote%d',itr), 'w'); + fwrite(fid, single(Ivote), '*single'); fclose(fid); + t_v1 = toc; disp(['voting done. time =',num2str(t_v1)]); - + % update the voting direction - if ang>0 + if ang>=d_ang tic; + Igrad_x = zeros(Isize); + Igrad_y = zeros(Isize); + Igrad_z = zeros(Isize); for v = 1: nV % coordinates of the current voter vx = Itx(v); @@ -199,24 +182,32 @@ for itr = 1 : iter+1 [g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx)); %compute the vector from the voter position to this position - Igrad_x(vx, vy, vz) = g_px - vx; + + Igrad_x(vx, vy, vz) = g_px - vx ; Igrad_y(vx, vy, vz) = g_py - vy; Igrad_z(vx, vy, vz) = g_pz - vz; - end - + tdir1 = toc; display (['updating dir done. time = ', num2str(tdir1)]); ang = ang - d_ang; end + end +hv = reshape(Ivote, [X*Y*Z, 1]); +hist(hv, 250); +%% +t = 300; +conn = [5 5 5]; +Icenter = local_max(Ivote, conn, t); +fidc = fopen(sprintf('std3.2-r10.10-v8/out%d-t%d.vol',t,t0), 'w'); +fwrite(fidc, single(Icenter), '*single'); +fclose(fidc); +nnz(Icenter) +% [cxx1, cyy1, czz1] = ind2sub(size(Icenter),find(Icenter)); -% %% -% t = 350; -% conn = [5 5 3]; -% Icenter = local_max(Ivote, conn, t); % % center = Ivote1; % % center(center t0; -fidt = fopen('128-128-128/It.vol', 'w'); -fwrite(fidt, It, 'single'); -fclose(fidt); -%Set the boundaries of the threshold image to zero -It(1:rmax(1), :, :) = 0; -It(X - rmax(1):X, :,:) = 0; -It(:, 1:rmax(2), :) = 0; -It(:, Y - rmax(2):Y,:) = 0; -It(:, :, 1:rmax(3)) = 0; -It(:,:, Z - rmax(3):Z) = 0; - -%get the indices of all of the nonzero values in the threshold image -% (voter positions) -[Itx,Ity,Itz] = ind2sub(size(It),find(It)); -Vi =(find(It)); -nV = nnz(It); -% create a meshgrid describing coordinates relative to the voter position -rangex = -rmax(1):rmax(1); %create an array of values between -rmax and rmax -rangey = -rmax(2):rmax(2); -rangez = -rmax(3):rmax(3); -[mx, my, mz] = meshgrid(rangex, rangey, rangez); %create a template describing local pixel position in a small cube -m_mag = (sqrt(mx.^2 + my.^2 + mz.^2)); %create a template describing the distance from the center of a small cube - -% create a mask for the voting area -M_dist1 = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1 ; %mask for the voting area distance (all values < rmax from the center) -M_dist2 = (mx.^2/rmin(1)^2 + my.^2/rmin(2)^2 + mz.^2/rmin(3)^2) >= 1 ; -M_dist = M_dist1 .* M_dist2; -% calculate the direction vector between a pixel and voter -LV_x = mx./m_mag; -LV_y = my./m_mag; -LV_z = mz./m_mag; - -%number of pixels in the voting area of each voter (initialize to zero) -validPixels = (zeros(nV,1)); -%indices of pixels in the voting area of each voter -% indices reference the 3D image -g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang)); - - -% vote -tic; -mask = zeros(Isize); -mask1 = zeros(Isize); - -%for each iteration (in iterative voting) -for itr = 1 :iter - - %initialize the vote image to zero - Ivote = zeros(Isize); - - %for each voter (nonzero pixels in the threshold image It) - for v = 1: nV - %get the cartesian coordinates of the voter v in the main image I - vx = Itx(v); - vy = Ity(v); - vz = Itz(v); - vi = Vi(v); - - %retreive the gradient magnitude at the voter position - vmag = Imag(vi); - - %retrieve the gradient - gx = Igrad_x(vi); - gy = Igrad_y(vi); - gz = Igrad_z(vi); - - %calculate the gradient magnitude - dmag = sqrt (gx^2 + gy^2 + gz^2); - - %calculate the normalized gradient direction - dx = gx / dmag; - dy = gy / dmag; - dz = gz / dmag; - - %calculate the angle between the voter direction and the pixel direction - cos_diff = LV_x .* dx + LV_y .* dy + LV_z .* dz; - - %create an angular mask for the voting area - M_angle = cos_diff >= cos(ang); - - %combine the two masks to mask out the voting angle - M = M_angle.* M_dist; - - % get the coordinates of each pixel in the final voter mask M - pi = find(M); - - %calculate the number of pixels in the voting region - npts = nnz(M); - validPixels(v) = npts; - - %convert every index in the voting area from a local 3D index to a global 3D index (into the original image I) - global_px = vx + mx(pi); - global_py = vy + my(pi); - global_pz = vz + mz(pi); - - %convert the global 3D index of each point into a global 1D index - global_pi = sub2ind(Isize, global_px, global_py, global_pz); - - g_v_prime (v, 1:npts) = global_pi; - - - Ivote( global_pi ) = Ivote( global_pi ) + vmag; -% if itr ==3 -% if mod(v, 5000)==0 -% mask(global_pi)= mask(global_pi) + 0.1; -% mask (vi) = mask(vi) + 0.5; -% end -% end -% if itr ==6 -% if mod(v, 5000)==0 -% mask1(global_pi)= mask1(global_pi) + 0.1; -% mask1 (vi) = mask1(vi) + 0.5; -% end -% end -% if itr==1 -% for ix = -12:12 -% for iy = -12:12 -% for iz = -12:12 -% mask(vx+ix, vy+iy, vz+iz) = M(ix+13,iy+13,iz+13)+ mask(vx+ix, vy+iy, vz+iz); -% end -% end -% end -% end -% if itr==2 -% for ix = -12:12 -% for iy = -12:12 -% for iz = -12:12 -% mask1(vx+ix, vy+iy, vz+iz) = M(ix+13,iy+13,iz+13)+ mask1(vx+ix, vy+iy, vz+iz); -% end -% end -% end -% end - - - end -% fid = fopen(sprintf('128-128-128/nissl-vote%d',itr), 'w'); -% fwrite(fid, single(Ivote), '*single'); - if itr ==1 - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w'); - fwrite(fid, single(Ivote), '*single'); - fclose(fid); - end - if itr ==8 - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w'); - fwrite(fid, single(Ivote), '*single'); - fclose(fid); - Ivote8 = (Ivote); - end - if itr ==9 - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w'); - fwrite(fid, single(Ivote), '*single'); - fclose(fid); - end - if itr ==10 - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w'); - fwrite(fid, single(Ivote), '*single'); - fclose(fid); - end - -% Ivote1 = single(Ivote); -% fwrite(fid, Ivote1, '*single'); -% -% -% elseif itr ==2 -% Ivote2 = single(Ivote); -% fwrite(fid, Ivote2, '*single'); -% -% -% elseif itr ==3 -% Ivote3 = single(Ivote); -% fwrite(fid, Ivote3, '*single'); -% -% -% elseif itr ==4 -% Ivote4 = single(Ivote); -% fwrite(fid, Ivote4, '*single'); -% -% -% elseif itr == 5 -% Ivote5 = single(Ivote); -% fwrite(fid, Ivote5, '*single'); -% -% -% elseif itr == 6 -% fwrite(fid, single(Ivote), '*single'); -% elseif itr == 7 -% fwrite(fid, single(Ivote), '*single'); -% elseif itr == 8 -% fwrite(fid, single(Ivote), '*single'); -% elseif itr == 9 -% fwrite(fid, single(Ivote), '*single'); -% end -% fclose(fid); - t_v1 = toc; - disp(['voting done. time =',num2str(t_v1)]); - - % update the voting direction - if ang>=d_ang - tic; - Igrad_x = zeros(Isize); - Igrad_y = zeros(Isize); - Igrad_z = zeros(Isize); - for v = 1: nV - % coordinates of the current voter - vx = Itx(v); - vy = Ity(v); - vz = Itz(v); - - %get the local value of the voting image - local_Ivote = Ivote(g_v_prime(v,1:validPixels(v))); - - %find the index of the maximum value - [~, local_max_idx] = max(local_Ivote); - - %convert this into a global subscript - [g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx)); - - %compute the vector from the voter position to this position - - Igrad_x(vx, vy, vz) = g_px - vx ; - Igrad_y(vx, vy, vz) = g_py - vy; - Igrad_z(vx, vy, vz) = g_pz - vz; -% if itr ==3 -% if mod(v, 5000)==0 -% mask(g_px, g_py, g_pz)= mask(g_px, g_py, g_pz) + 1; -% end -% end -% if itr ==6 -% if mod(v, 5000)==0 -% mask1(g_px, g_py, g_pz)= mask1(g_px, g_py, g_pz) + 1; -% end -% end - end - - - tdir1 = toc; - display (['updating dir done. time = ', num2str(tdir1)]); - ang = ang - d_ang; - end - - end - -hv = reshape(Ivote, [X*Y*Z, 1]); -hist(hv, 250); -%% -t = 4300; -conn = [5 5 5]; -Icenter = local_max(Ivote, conn, t); -fidc = fopen(sprintf('std3.2-r10.10-v8/out%d-t%d.vol',t,t0), 'w'); -fwrite(fidc, single(Icenter), '*single'); -fclose(fidc); -nnz(Icenter) -% [cxx1, cyy1, czz1] = ind2sub(size(Icenter),find(Icenter)); - -% % center = Ivote1; -% % center(center + __device__ void cpyG2S1D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y, + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){ + + //calculate the total number of threads available + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; + + //calculate the current 1D thread ID + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; + + //calculate the number of iteration require for the copy + unsigned int I = X/tThreads + 1; + + //the specified start position in global memory is (x, y, z) + unsigned int gstart = z*I_x*I_y + y*I_x + x; + + for (unsigned int i = 0; i < I; i++){ + + //each iteration will copy tThreads elements, so the starting index in shared memory + //for each iteration will be i * tThreads (iteration # times the number of elements transferred per iteration) + unsigned int sIdx = i * tThreads + ti; + if (sIdx>= X*Y) return; + + //each iteration will copy tThreads elements from the global index + unsigned int gIdx = gstart + sIdx; + + //copy global to share + dest[sIdx] = src[gIdx]; + + } + } + //this function copy one channel data from global to shared memory in two dimensions with size of X*Y bytes. + template + __device__ void cpyG2S2D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y, + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){ + //calculate the total number of threads available + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; + + //calculate the current 1D thread ID + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; + + //calculate the number of iteration require for the copy + unsigned int I = X*Y/tThreads + 1; + + unsigned int gz1 = z*I_x*I_y ; + + for (unsigned int i = 0; i < I; i++){ + + unsigned int sIdx = i * tThreads + ti; + if (sIdx>= X*Y) return; + + unsigned int sy = sIdx/X; + unsigned int sx = sIdx - (sy * X); + + unsigned int gx = x + sx; + unsigned int gy = y + sy; + if (gx + __device__ void cpyG2S1D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y, + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){ + + //calculate the total number of threads available + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; + + //calculate the current 1D thread ID + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; + + //calculate the number of iteration require for the copy + unsigned int I = X/tThreads + 1; + + //the specified start position in global memory is (x, y, z) + unsigned int gstart = z*I_x*I_y + y*I_x + x; + + for (unsigned int i = 0; i < I; i++){ + + //each iteration will copy tThreads elements, so the starting index in shared memory + //for each iteration will be i * tThreads (iteration # times the number of elements transferred per iteration) + unsigned int sIdx = i * tThreads + ti; + if (sIdx>= X*Y) return; + unsigned int gIdx = gstart*3 + sIdx; + //copy global to share + dest[sIdx] = src[gIdx]; + + } + } + //this function copy three channels data from global to shared memory in two dimensions with size of X*Y bytes. + template + __device__ void cpyG2S2D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y, + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){ + //calculate the total number of threads available + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; + + //calculate the current 1D thread ID + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; + + //calculate the number of iteration require for the copy + unsigned int I = X*Y/tThreads + 1; + + unsigned int gz1 = z*I_x*I_y ; + + for (unsigned int i = 0; i < I; i++){ + + unsigned int sIdx = i * tThreads + ti; + if (sIdx>= X*Y) return; + unsigned int sy = sIdx/X; + unsigned int sx = sIdx - (sy * X); + + unsigned int gx = x + sx/3; + unsigned int gy = y + sy; + if (gx + __device__ void mag_share2D(T* grad, unsigned int bs, unsigned int X, unsigned int Y, dim3 threadIdx, dim3 blockDim){ + + //calculate the total number of threads available + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; + //calculate the current 1D thread ID + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; + //calculate the number of iteration require for the copy + unsigned int I = X*Y/tThreads + 1; + for (unsigned int i = 0; i < I; i++){ + + unsigned int sIdx = i * tThreads + ti; + if (sIdx>= X*Y) return; + float gx = grad[sIdx*3]; + float gy = grad[sIdx*3 + 1]; + float gz = grad[sIdx*3 + 2]; + float mag = sqrt(gx*gx + gy*gy + gz*gz); + grad[bs + sIdx] = mag; + + } + } +#endif \ No newline at end of file diff --git a/cpp/float_to_half.cuh b/cpp/float_to_half.cuh deleted file mode 100644 index ed00b23..0000000 --- a/cpp/float_to_half.cuh +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef STIM_CUDA_FLOAT_TO_HALF_H -#define STIM_CUDA_FLOAT_TO_HALF_H - -#include -#include -#include -#include -#include -#include -#include - __global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){ - - - //calculate x,y,z coordinates for this thread - int xi = blockIdx.x * blockDim.x + threadIdx.x; - //find the grid size along y - int grid_y = y / blockDim.y; - int blockidx_y = blockIdx.y % grid_y; - int yi = blockidx_y * blockDim.y + threadIdx.y; - int zi = blockIdx.y / grid_y; - int i = zi * x * y + yi * x + xi; - - if(xi >= x|| yi >= y || zi>= z) return; - - - gpu_half[i] = __float2half(gpu_float[i]); - - - } - - - void gpu_f2h(half* gpu_half, float* gpu_float, unsigned int x, unsigned int y, unsigned int z){ - - - int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(sqrt (max_threads),sqrt (max_threads)); - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); - - //call the GPU kernel to determine the gradient - cuda_f2h <<< blocks, threads >>>(gpu_half, gpu_float, x, y, z); - - } - - - - void cpu_f2h(half* h_out, float* f_in, unsigned int x, unsigned int y, unsigned int z){ - - //calculate the number of pixels in the array - unsigned int pix = x* y* z; - - //allocate memory on the GPU for the input float precision. - float* gpu_float; - cudaMalloc(&gpu_float, pix * sizeof(float)); - cudaMemcpy(gpu_float, f_in, pix * sizeof(float), cudaMemcpyHostToDevice); - - //allocate memory on the GPU for the output half precision - half* gpu_half; - cudaMalloc(&gpu_half, pix * sizeof(half)); - - //call the GPU version of this function - gpu_f2h(gpu_half, gpu_float, x, y, z); - - //copy the array back to the CPU - cudaMemcpy(h_out, gpu_half, pix * sizeof(half), cudaMemcpyDeviceToHost); - - //free allocated memory - cudaFree(gpu_float); - cudaFree(gpu_half); - - } - - -#endif \ No newline at end of file diff --git a/cpp/half_to_float.cuh b/cpp/half_to_float.cuh deleted file mode 100644 index 93e6040..0000000 --- a/cpp/half_to_float.cuh +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef STIM_CUDA_HALF_TO_FLOAT_H -#define STIM_CUDA_HALF_TO_FLOAT_H - -#include -#include -#include -#include -#include -#include "cuda_fp16.h" - - __global__ void cuda_h2f(float* gpu_float, half* gpu_half, int x, int y, int z){ - - - //calculate x,y,z coordinates for this thread - int xi = blockIdx.x * blockDim.x + threadIdx.x; - //find the grid size along y - int grid_y = y / blockDim.y; - int blockidx_y = blockIdx.y % grid_y; - int yi = blockidx_y * blockDim.y + threadIdx.y; - int zi = blockIdx.y / grid_y; - int i = zi * x * y + yi * x + xi; - - if(xi >= x|| yi >= y || zi>= z) return; - - - gpu_float[i] = __half2float(gpu_half[i]); - - } - - - void gpu_h2f(float* gpu_float, half* gpu_half, unsigned int x, unsigned int y, unsigned int z){ - - - int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(sqrt (max_threads),sqrt (max_threads)); - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); - - //call the GPU kernel to determine the gradient - cuda_h2f <<< blocks, threads >>>(gpu_float, gpu_half, x, y, z); - - } - - - - void cpu_f2h(float* f_out, half* h_in, unsigned int x, unsigned int y, unsigned int z){ - - //calculate the number of pixels in the array - unsigned int pix = x* y* z; - - //allocate memory on the GPU for the input half precision - half* gpu_half; - cudaMalloc(&gpu_half, pix * sizeof(half)); - cudaMemcpy(gpu_half, h_in, pix * sizeof(half), cudaMemcpyHostToDevice); - - //allocate memory on the GPU for the output float precision. - float* gpu_float; - cudaMalloc(&gpu_float, pix * sizeof(float)); - - - - //call the GPU version of this function - gpu_h2f(gpu_float, gpu_half, x, y, z); - - cudaMemcpy(f_out, gpu_float, pix * sizeof(float), cudaMemcpyDeviceToHost); - - - - //free allocated memory - cudaFree(gpu_float); - cudaFree(gpu_half); - - } - - -#endif \ No newline at end of file diff --git a/cpp/main.cpp b/cpp/main.cpp index 4438624..5d5e11e 100644 --- a/cpp/main.cpp +++ b/cpp/main.cpp @@ -37,10 +37,9 @@ int main(int argc, char** argv){ printf("current device ID: %d\n", i); printf("device name: %s\n", prop.name); printf("total global mem: %lu\n", prop.totalGlobalMem); + printf("shared memory per block: %lu\n", prop.sharedMemPerBlock); } - - - + //output advertisement std::cout< -#include -#include -#include -#include - -template - __global__ void cuda_set_rmax(T* gpu_r, int rx, int ry, int rz, int x, int y, int z){ - - //calculate x,y,z coordinates for this thread - int xi = blockIdx.x * blockDim.x + threadIdx.x; - //find the grid size along y - int grid_y = y / blockDim.y; - int blockidx_y = blockIdx.y % grid_y; - int yi = blockidx_y * blockDim.y + threadIdx.y; - int zi = blockIdx.y / grid_y; - int i = zi * x * y + yi * x + xi; - - if(xi>=x || yi>=y || zi>=z) return; - - gpu_r[i*3+0] = rx; - gpu_r[i*3+1] = ry; - gpu_r[i*3+2] = rz; - - } - -template - void gpu_set_rmax(T* gpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ - - - unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(sqrt (max_threads),sqrt (max_threads)); - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); - - //call the kernel to do the voting - cuda_set_rmax <<< blocks, threads >>>(gpu_r, r[0], r[1], r[2], x , y, z); - - } -template - void cpu_set_rmax(T* cpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ - - //calculate the number of bytes in the array - unsigned int bytes = x * y * z * sizeof(T); - - //allocate space on the GPU for the rmax - T* gpu_r; - cudaMalloc(&gpu_vote, bytes*3); - - cudaMemcpy(gpu_r, cpu_r, bytes*3, cudaMemcpyHostToDevice); - - - //call the GPU version of the vote calculation function - gpu_set_rmax(gpu_r, r, x , y, z); - - //copy the Vote Data back to the CPU - cudaMemcpy(cpu_r, gpu_r, bytes*3, cudaMemcpyDeviceToHost) ; - - //free allocated memory - cudaFree(gpu_r); - - } - - -#endif \ No newline at end of file diff --git a/cpp/update_dir3.cuh b/cpp/update_dir3.cuh index ba216b5..5a2fc95 100644 --- a/cpp/update_dir3.cuh +++ b/cpp/update_dir3.cuh @@ -6,11 +6,12 @@ #include #include #include +#include "cpyToshare.cuh" // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. template __global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ - + extern __shared__ float s_vote[]; //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; //find the grid size along y @@ -18,10 +19,21 @@ int blockidx_y = blockIdx.y % grid_y; int yi = blockidx_y * blockDim.y + threadIdx.y; int zi = blockIdx.y / grid_y; + //compute the global 1D index for this pixel int i = zi * x * y + yi * x + xi; if(xi >= x|| yi >= y || zi>= z) return; - + // find the starting points for this block along the x and y directions + int bxi = blockIdx.x * blockDim.x; + int byi = blockidx_y * blockDim.y; + //find the starting points and the size of the window, which will be copied to the 2D-shared memory + int bxs = bxi - rx; + int bys = byi - ry; + int xwidth = 2 * rx + blockDim.x; + int ywidth = 2 * ry + blockDim.y; + //compute the coordinations of this pixel in the 2D-shared memory. + int sx_rx = threadIdx.x + rx; + int sy_ry = threadIdx.y + ry; //find the gradient values along the x, y ,z axis, and the gradient magnitude for the voter float g_v_x = gpu_grad[i * 3 + 0]; float g_v_y = gpu_grad[i * 3 + 1]; @@ -42,39 +54,48 @@ int rz_sq = rz * rz; for (int z_p = -rz; z_p<=rz; z_p++){ - - for(int y_p = -ry; y_p <= ry; y_p++){ - - for(int x_p = -rx; x_p <= rx; x_p++){ - - //calculate the x, y ,z indices for the current pixel - int xi_p = (xi + x_p) ; + int zi_p = zi + z_p; + if ((zi_p) >=0 && (zi_p) < z){ + //call the function to copy one slide of vote date to the 2D-shared memory. + __syncthreads(); + cpyG2S2D(s_vote, gpu_vote, bxs, bys, zi + z_p, xwidth, ywidth, threadIdx, blockDim, x, y); + __syncthreads(); + float z_sq = z_p * z_p; + float d_z_sq = (z_sq)/rz_sq; + for(int y_p = -ry; y_p <= ry; y_p++){ + + float y_sq = y_p * y_p; + float yz_sq = y_sq + z_sq; int yi_p = (yi + y_p) ; - int zi_p = (zi + z_p) ; - if (zi_p >=0 && zi_p < z && yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){ + float d_yz_sq = (y_sq)/ry_sq + d_z_sq; + unsigned int s_y1d = (sy_ry + y_p) * xwidth; + for(int x_p = -rx; x_p <= rx; x_p++){ - //calculate the distance between the pixel and the current voter. - float x_sq = x_p * x_p; - float y_sq = y_p * y_p; - float z_sq = z_p * z_p; - float d_pv = sqrt(x_sq + y_sq + z_sq); + //check if the current pixel is inside of the data-set. + int xi_p = (xi + x_p) ; + if (yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){ - // calculate the angle between the pixel and the current voter. - float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v); + //calculate the distance between the pixel and the current voter. + float x_sq = x_p * x_p; + float d_pv = sqrt(x_sq + yz_sq); - if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ + // calculate the angle between the pixel and the current voter. + float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v); - //calculate the 1D index for the current pixel - unsigned int id_p = (zi_p) * x * y + (yi_p) * x + (xi_p); - l_vote = gpu_vote[id_p]; - - // compare the vote value of this pixel with the max value to find the maxima and its index. - if (l_vote>max) { + if ((((x_sq)/rx_sq + d_yz_sq )<= 1) && (cos_diff >= cos_phi)){ - max = l_vote; - id_x = x_p; - id_y = y_p; - id_z = z_p; + //calculate the 1D index for the current pixel in the 2D-shared memory + unsigned int id_s = s_y1d + (sx_rx + x_p); + l_vote = s_vote[id_s]; + + // compare the vote value of this pixel with the max value to find the maxima and its index. + if (l_vote>max) { + + max = l_vote; + id_x = x_p; + id_y = y_p; + id_z = z_p; + } } } } @@ -115,13 +136,13 @@ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(sqrt (max_threads),sqrt (max_threads)); dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); - + unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*sizeof(T); // allocate space on the GPU for the updated vote direction T* gpu_dir; cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3); //call the kernel to calculate the new voting direction - update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z); + update_dir3 <<< blocks, threads, shared_bytes >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z); //call the kernel to update the gradient direction diff --git a/cpp/vote3.cuh b/cpp/vote3.cuh index b148278..2e6b9ce 100644 --- a/cpp/vote3.cuh +++ b/cpp/vote3.cuh @@ -6,12 +6,13 @@ #include #include #include - +#include "cpyToshare.cuh" // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area template __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ + extern __shared__ float s[]; //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; //find the grid size along y @@ -23,6 +24,17 @@ if(xi>=x || yi>=y || zi>=z) return; + //find the starting points and the size of the window, which will be copied to the 2D-shared memory + int bxs = blockIdx.x * blockDim.x - rx; + int bys = blockidx_y * blockDim.y - ry; + int xwidth = 2 * rx + blockDim.x; + int ywidth = 2 * ry + blockDim.y; + //calculate the starting point of shared memory for storing the magnitude. + unsigned int b_s = 3 * xwidth * ywidth; + //compute the coordinations of this pixel in the 2D-shared memory. + int sx_rx = threadIdx.x + rx; + int sy_ry = threadIdx.y + ry; + // define a local variable to sum the votes from the voters float sum = 0; @@ -31,43 +43,58 @@ int rz_sq = rz * rz; for (int z_v = -rz; z_v<=rz; z_v++){ - - for(int y_v = -ry; y_v <= ry; y_v++){ - - for(int x_v = -rx; x_v <= rx; x_v++){ - - //calculate the x, y ,z indices for the current voter - int xi_v = (xi + x_v) ; + int zi_v = zi + z_v; + if ((zi_v) >=0 && (zi_v) (s, gpu_grad, bxs, bys, zi + z_v, 3*xwidth, ywidth, threadIdx, blockDim, x, y); + __syncthreads(); + mag_share2D(s, b_s, xwidth, ywidth, threadIdx, blockDim); + __syncthreads(); + float z_sq = z_v * z_v; + float d_z_sq = z_sq/rz_sq; + + for(int y_v = -ry; y_v <= ry; y_v++){ int yi_v = (yi + y_v) ; - int zi_v = (zi + z_v) ; - if (zi_v >=0 && zi_v < z && yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){ - - //calculate the 1D index for the current voter - unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v); + //compute the position of the current voter in the shared memory along the y axis. + unsigned int sIdx_y1d = (sy_ry + y_v)* xwidth; + + float y_sq = y_v * y_v; + float yz_sq = z_sq + y_sq; + float d_yz_sq = y_sq/ry_sq + d_z_sq; + for(int x_v = -rx; x_v <= rx; x_v++){ + + //check if the current voter is inside of the data-set + int xi_v = (xi + x_v) ; + if (yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){ + + //compute the position of the current voter in the 2D-shared memory along the x axis. + unsigned int sIdx_x = (sx_rx + x_v); + //find the 1D index of this voter in the 2D-shared memory. + unsigned int s_Idx = (sIdx_y1d + sIdx_x); + unsigned int s_Idx3 = s_Idx * 3; + + //save the gradient values for the current voter to the local variables and compute the gradient magnitude. + float g_v_x = s[s_Idx3]; + float g_v_y = s[s_Idx3 + 1]; + float g_v_z = s[s_Idx3 + 2]; + float mag_v = s[b_s + s_Idx]; //sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); + + //calculate the distance between the pixel and the current voter. + float x_sq = x_v * x_v; + float d_pv = sqrt(x_sq + yz_sq); - //find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter - - float g_v_x = gpu_grad[id_v * 3 + 0]; - float g_v_y = gpu_grad[id_v * 3 + 1]; - float g_v_z = gpu_grad[id_v * 3 + 2]; - float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); - - //calculate the distance between the pixel and the current voter. - float x_sq = x_v * x_v; - float y_sq = y_v * y_v; - float z_sq = z_v * z_v; - float d_pv = sqrt(x_sq + y_sq + z_sq); - - // calculate the angle between the pixel and the current voter. - float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v); + // calculate the angle between the pixel and the current voter. + float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v); - // check if the current voter is located in the voting area of this pixel. - if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ + // check if the current voter is located in the voting area of this pixel. + if ((((x_sq)/rx_sq + d_yz_sq)<= 1) && (cos_diff >= cos_phi)){ - sum += mag_v; + sum += mag_v; + } } - } - } + } + } } } @@ -81,9 +108,9 @@ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(sqrt (max_threads),sqrt (max_threads)); dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); - + unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T); //call the kernel to do the voting - vote3 <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); + vote3 <<< blocks, threads, shared_bytes >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); } -- libgit2 0.21.4