ivote3 run on the shared memory

Laila Saadatifard
1 parent 3b5043cd
Showing 11 changed files with 328 additions and 694 deletions Show diff stats
Matlab_3D/0-gt.vol
Matlab_3D/ivote3.m
Matlab_3D/ivote3_new.m
Matlab_3D/validation.m
cpp/cpyToshare.cuh
cpp/float_to_half.cuh
cpp/half_to_float.cuh
cpp/main.cpp
cpp/set_rmax.cuh
cpp/update_dir3.cuh
cpp/vote3.cuh
-clc;
+
 clear;
 disp('***************** NEW RUN *********************');
 total = tic;
 % ******* Initialize voting parameters **************************************
-rmax = [16 16 8];		%maximum radius of the cell
-ang_deg = 20.1;		%half the angular range of the voting area
+rmax = [10 10 10];		%maximum radius of the cell
+rmin = [1 1 1];	
+ang_deg = 25.1;		%half the angular range of the voting area
 ang = ang_deg * pi / 180;
-iter = 5;	%number of voting iterations
-t0 = 1.0; %threshold color
-sigma = [3, 3, 1.5];
+iter = 8 ;	%number of voting iterations
+t0 = 1;
+sigma = [5, 5, 5];
 % t = 0.1;
-d_ang= ang / (iter);
+d_ang= ang / (iter+2);
 % ******** Testing parameters ******************************************
 % p = [50, 50, 150];
 % ps = [400, 400, 200];
@@ -22,15 +23,14 @@ d_ang= ang / (iter);
 % X = S(1);
 % Y = S(2);
 % Z = S(3);
-filename = 'nissl-float-128.128.128.vol';
+filename = '128-128-128/nissl-float-128.128.128.vol'; %'nissl-float-128.128.128.vol';
 X = 128;
 Y = 128;
 Z = 128;
-fid = fopen(filename);
+fidi = fopen(filename);
 % load the VOL data into a 2D matrix
-I = fread(fid,[X Y*Z], 'single');
-fclose(fid);
-%%
+I = fread(fidi,[X Y*Z], 'single');
+fclose(fidi);
 %change this to a 3D matrix
 I = (reshape(I, [X, Y, Z]));
 % invert the intensity
@@ -38,26 +38,21 @@ I = (255 - I);
 %perform a gaussian blur
 Iblur = gauss_blur3d(I, sigma);
-
-% %crop out a small subregion of I and Iblur
-% Iblur = Iblur(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1);
-% I = I(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1);
-% 
 % compute the gradient
 [Igrad_y, Igrad_x, Igrad_z] = gradient(Iblur);
 %calculate the gradient magnitude
 Imag = sqrt(Igrad_x .^ 2 + Igrad_y .^ 2 + Igrad_z .^2);
 Isize = size(I);
-I = single(I);
-Iblur = single(Iblur);
-%h  = reshape(Imag, [X*Y*Z, 1]);
-%hist(h, 100);
+% h  = reshape(Imag, [X*Y*Z, 1]);
+% hist(h, 100);
 %set a threshold for the gradient magnitude
 It = Imag > t0;
-
+fidt = fopen('128-128-128/It.vol', 'w');
+fwrite(fidt, It, 'single');
+fclose(fidt);
 %Set the boundaries of the threshold image to zero
 It(1:rmax(1), :, :) = 0;
 It(X - rmax(1):X, :,:) = 0;
@@ -65,13 +60,12 @@ It(:, 1:rmax(2), :) = 0;
 It(:, Y - rmax(2):Y,:) = 0;
 It(:, :, 1:rmax(3)) = 0;
 It(:,:, Z - rmax(3):Z) = 0;
-%%
+
 %get the indices of all of the nonzero values in the threshold image
 %   (voter positions)
 [Itx,Ity,Itz] = ind2sub(size(It),find(It));
 Vi =(find(It));
 nV = nnz(It);
-%
 % create a meshgrid describing coordinates relative to the voter position
 rangex = -rmax(1):rmax(1);                                 %create an array of values between -rmax and rmax
 rangey = -rmax(2):rmax(2);    
@@ -80,8 +74,9 @@ rangez = -rmax(3):rmax(3);
 m_mag = (sqrt(mx.^2 + my.^2 + mz.^2));                %create a template describing the distance from the center of a small cube
 % create a mask for the voting area
-M_dist =  (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2)   <= 1;                             %mask for the voting area distance (all values < rmax from the center)
-%% 
+M_dist1 = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2)   <= 1 ;                             %mask for the voting area distance (all values < rmax from the center)
+M_dist2 = (mx.^2/rmin(1)^2 + my.^2/rmin(2)^2 + mz.^2/rmin(3)^2)   >= 1 ;      
+M_dist = M_dist1 .* M_dist2;
 % calculate the direction vector between a pixel and voter
 LV_x = mx./m_mag;
 LV_y = my./m_mag;
@@ -89,23 +84,24 @@ LV_z = mz./m_mag;
 %number of pixels in the voting area of each voter (initialize to zero)
 validPixels = (zeros(nV,1)); 
-%%
 %indices of pixels in the voting area of each voter
 %   indices reference the 3D image
 g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang));
-%% vote
+% vote
 tic;
+mask = zeros(Isize);
+mask1 = zeros(Isize);
+
 %for each iteration (in iterative voting)
-for itr = 1 : iter+1
+for itr = 1 :iter
 	%initialize the vote image to zero
 	Ivote = zeros(Isize);
-
+	
 	%for each voter (nonzero pixels in the threshold image It)
 	for v = 1: nV
-
 		%get the cartesian coordinates of the voter v in the main image I
 		vx = Itx(v);
 		vy = Ity(v);
@@ -135,7 +131,7 @@ for itr = 1 : iter+1
 		M_angle = cos_diff >= cos(ang);
 		%combine the two masks to mask out the voting angle
-		M = M_angle .* M_dist;		
+		M = M_angle.* M_dist;		
 		% get the coordinates of each pixel in the final voter mask M
 		pi = find(M);
@@ -156,33 +152,20 @@ for itr = 1 : iter+1
 		Ivote( global_pi ) = Ivote( global_pi ) + vmag;
-
-	end
-	fid = fopen(sprintf('128-128-128/vote%d',itr), 'w');
-	if itr ==1
-		fwrite(fid, Ivote, 'single');
-	
-	elseif itr ==2
-		fwrite(fid, Ivote, 'single');
-	
-	elseif itr ==3
-		fwrite(fid, Ivote, 'single');
-	
-	elseif itr ==4
-		fwrite(fid, Ivote, 'single');
-		
-	elseif itr == 5
-		fwrite(fid, Ivote, 'single');
-	elseif itr == 6
-		fwrite(fid, Ivote, 'single');
 	end
+ 	fid = fopen(sprintf('128-128-128/nissl-vote%d',itr), 'w');
+	fwrite(fid, single(Ivote), '*single');
 	fclose(fid);
+ 	
 	t_v1 = toc;   
 	disp(['voting done.  time =',num2str(t_v1)]);
-
+	
 	% update the voting direction
-	if ang>0
+	if ang>=d_ang
 		tic;
+		Igrad_x = zeros(Isize);
+		Igrad_y = zeros(Isize);
+		Igrad_z = zeros(Isize);
 		for v = 1: nV
 			% coordinates of the current voter
 			vx = Itx(v);
@@ -199,24 +182,32 @@ for itr = 1 : iter+1
 			[g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx));
 			%compute the vector from the voter position to this position
-			Igrad_x(vx, vy, vz) = g_px - vx;
+			
+			Igrad_x(vx, vy, vz) = g_px - vx ;
 			Igrad_y(vx, vy, vz) = g_py - vy;
 			Igrad_z(vx, vy, vz) = g_pz - vz;
-
 		end
-	
+		
 	tdir1 = toc;
 	display (['updating dir done.  time = ', num2str(tdir1)]);
 	ang = ang - d_ang;
 	end
+	
  end
+hv  = reshape(Ivote, [X*Y*Z, 1]);
+hist(hv, 250);
+%%
+t = 300;
+conn = [5 5 5];
+Icenter = local_max(Ivote, conn, t);
+fidc = fopen(sprintf('std3.2-r10.10-v8/out%d-t%d.vol',t,t0), 'w');
+fwrite(fidc, single(Icenter), '*single');
+fclose(fidc);
+nnz(Icenter)
+% [cxx1, cyy1, czz1] = ind2sub(size(Icenter),find(Icenter));
-% %%
-% t = 350;
-% conn = [5 5 3];
-% Icenter = local_max(Ivote, conn, t);
 % % center = Ivote1;
 % % center(center<t) = 0;
 % % center = imregionalmax(center);
-
-clear;
-disp('***************** NEW RUN *********************');
-total = tic;
-% ******* Initialize voting parameters **************************************
-rmax = [10 10 10];		%maximum radius of the cell
-rmin = [1 1 1];	
-ang_deg = 25.1;		%half the angular range of the voting area
-ang = ang_deg * pi / 180;
-iter = 8 ;	%number of voting iterations
-t0 = 1;
-sigma = [3, 3, 2];
-% t = 0.1;
-d_ang= ang / (iter+2);
-% ******** Testing parameters ******************************************
-% p = [50, 50, 150];
-% ps = [400, 400, 200];
-% % ps = [100, 50, 40];
-% % I = syn_Img(rmax , ps);
-% volfile = 'nissl-rat.vol';
-% fid = fopen(volfile); % open the file that include the image
-% S = fread(fid, 3, 'int32');
-% X = S(1);
-% Y = S(2);
-% Z = S(3);
-filename = '128-128-128/nissl-float-128.128.128.vol'; %'nissl-float-128.128.128.vol';
-X = 128;
-Y = 128;
-Z = 128;
-fidi = fopen(filename);
-% load the VOL data into a 2D matrix
-I = fread(fidi,[X Y*Z], 'single');
-fclose(fidi);
-%change this to a 3D matrix
-I = (reshape(I, [X, Y, Z]));
-% invert the intensity
-I = (255 - I);
-
-%perform a gaussian blur
-Iblur = gauss_blur3d(I, sigma);
-% Iblur = I;
-% %crop out a small subregion of I and Iblur
-% Iblur = Iblur(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1);
-% I = I(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1);
-% compute the gradient
-[Igrad_y, Igrad_x, Igrad_z] = gradient(Iblur);
-
-%calculate the gradient magnitude
-Imag = sqrt(Igrad_x .^ 2 + Igrad_y .^ 2 + Igrad_z .^2);
-Isize = size(I);
-
-% h  = reshape(Imag, [X*Y*Z, 1]);
-% hist(h, 100);
-
-%set a threshold for the gradient magnitude
-It = Imag > t0;
-fidt = fopen('128-128-128/It.vol', 'w');
-fwrite(fidt, It, 'single');
-fclose(fidt);
-%Set the boundaries of the threshold image to zero
-It(1:rmax(1), :, :) = 0;
-It(X - rmax(1):X, :,:) = 0;
-It(:, 1:rmax(2), :) = 0;
-It(:, Y - rmax(2):Y,:) = 0;
-It(:, :, 1:rmax(3)) = 0;
-It(:,:, Z - rmax(3):Z) = 0;
-
-%get the indices of all of the nonzero values in the threshold image
-%   (voter positions)
-[Itx,Ity,Itz] = ind2sub(size(It),find(It));
-Vi =(find(It));
-nV = nnz(It);
-% create a meshgrid describing coordinates relative to the voter position
-rangex = -rmax(1):rmax(1);                                 %create an array of values between -rmax and rmax
-rangey = -rmax(2):rmax(2);    
-rangez = -rmax(3):rmax(3);    
-[mx, my, mz] = meshgrid(rangex, rangey, rangez);       %create a template describing local pixel position in a small cube
-m_mag = (sqrt(mx.^2 + my.^2 + mz.^2));                %create a template describing the distance from the center of a small cube
-
-% create a mask for the voting area
-M_dist1 = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2)   <= 1 ;                             %mask for the voting area distance (all values < rmax from the center)
-M_dist2 = (mx.^2/rmin(1)^2 + my.^2/rmin(2)^2 + mz.^2/rmin(3)^2)   >= 1 ;      
-M_dist = M_dist1 .* M_dist2;
-% calculate the direction vector between a pixel and voter
-LV_x = mx./m_mag;
-LV_y = my./m_mag;
-LV_z = mz./m_mag;
-
-%number of pixels in the voting area of each voter (initialize to zero)
-validPixels = (zeros(nV,1)); 
-%indices of pixels in the voting area of each voter
-%   indices reference the 3D image
-g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang));
-
-
-% vote
-tic;
-mask = zeros(Isize);
-mask1 = zeros(Isize);
-
-%for each iteration (in iterative voting)
-for itr = 1 :iter
-
-	%initialize the vote image to zero
-	Ivote = zeros(Isize);
-	
-	%for each voter (nonzero pixels in the threshold image It)
-	for v = 1: nV
-		%get the cartesian coordinates of the voter v in the main image I
-		vx = Itx(v);
-		vy = Ity(v);
-		vz = Itz(v);
-		vi = Vi(v);
-		
-		%retreive the gradient magnitude at the voter position
-		vmag = Imag(vi);
-		
-		%retrieve the gradient
-		gx = Igrad_x(vi);
-		gy = Igrad_y(vi);
-		gz = Igrad_z(vi);
-		
-		%calculate the gradient magnitude
-		dmag = sqrt (gx^2 + gy^2 + gz^2);
-		
-		%calculate the normalized gradient direction
-		dx = gx / dmag;
-		dy = gy / dmag;
-		dz = gz / dmag;
-		
-		%calculate the angle between the voter direction and the pixel direction
-		cos_diff = LV_x .* dx + LV_y .* dy + LV_z .* dz;
-		
-		%create an angular mask for the voting area
-		M_angle = cos_diff >= cos(ang);
-
-		%combine the two masks to mask out the voting angle
-		M = M_angle.* M_dist;		
-
-		% get the coordinates of each pixel in the final voter mask M
-		pi = find(M);
-		
-		%calculate the number of pixels in the voting region
-		npts =  nnz(M);
-		validPixels(v) = npts; 
-
-		%convert every index in the voting area from a local 3D index to a global 3D index (into the original image I)
-		global_px = vx + mx(pi);
-		global_py = vy + my(pi);
-		global_pz = vz + mz(pi);
-		
-		%convert the global 3D index of each point into a global 1D index
-		global_pi = sub2ind(Isize, global_px, global_py, global_pz);
-
-		g_v_prime (v, 1:npts) = global_pi;
-
-
-		Ivote( global_pi ) = Ivote( global_pi ) + vmag;
-% 		if itr ==3
-% 			if mod(v, 5000)==0
-% 				mask(global_pi)= mask(global_pi) + 0.1;
-% 				mask (vi) = mask(vi) + 0.5;
-% 			end
-% 		end
-% 		if itr ==6
-% 			if mod(v, 5000)==0
-% 				mask1(global_pi)= mask1(global_pi) + 0.1;
-% 				mask1 (vi) = mask1(vi) + 0.5;
-% 			end
-% 		end
-% 		if itr==1
-% 			for ix = -12:12
-% 				for iy = -12:12
-% 					for iz = -12:12
-% 						mask(vx+ix, vy+iy, vz+iz) = M(ix+13,iy+13,iz+13)+ mask(vx+ix, vy+iy, vz+iz);
-% 					end
-% 				end
-% 			end
-% 		end
-% 		if itr==2
-% 			for ix = -12:12
-% 				for iy = -12:12
-% 					for iz = -12:12
-% 						mask1(vx+ix, vy+iy, vz+iz) = M(ix+13,iy+13,iz+13)+ mask1(vx+ix, vy+iy, vz+iz);
-% 					end
-% 				end
-% 			end
-% 		end
-	
-
-	end
-%  	fid = fopen(sprintf('128-128-128/nissl-vote%d',itr), 'w');
-% 	fwrite(fid, single(Ivote), '*single');
- 	if itr ==1
-		fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w');
-		fwrite(fid, single(Ivote), '*single');
-		fclose(fid);
-	end
-	if itr ==8
-		fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w');
-		fwrite(fid, single(Ivote), '*single');
-		fclose(fid);
-		Ivote8 = (Ivote);
-	end
-	if itr ==9
-		fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w');
-		fwrite(fid, single(Ivote), '*single');
-		fclose(fid);
-	end
-	if itr ==10
-		fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w');
-		fwrite(fid, single(Ivote), '*single');
-		fclose(fid);
-	end
-	
-% 		Ivote1 = single(Ivote);
-% 		fwrite(fid, Ivote1, '*single');
-% 		
-% 	
-% 	elseif itr ==2
-% 		Ivote2 = single(Ivote);
-% 		fwrite(fid, Ivote2, '*single');
-% 		
-% 	
-% 	elseif itr ==3
-% 		Ivote3 = single(Ivote);
-% 		fwrite(fid, Ivote3, '*single');
-% 		
-% 	
-% 	elseif itr ==4
-% 		Ivote4 = single(Ivote);
-% 		fwrite(fid, Ivote4, '*single');
-% 		
-% 		
-% 	elseif itr == 5
-% 		Ivote5 = single(Ivote);
-% 		fwrite(fid, Ivote5, '*single');
-% 		
-% 		
-% 	elseif itr == 6
-% 		fwrite(fid, single(Ivote), '*single');
-% 	elseif itr == 7
-% 		fwrite(fid, single(Ivote), '*single');
-% 	elseif itr == 8
-% 		fwrite(fid, single(Ivote), '*single');
-% 	elseif itr == 9
-% 		fwrite(fid, single(Ivote), '*single');
-% 	end
-% 	fclose(fid);
-	t_v1 = toc;   
-	disp(['voting done.  time =',num2str(t_v1)]);
-	
-	% update the voting direction
-	if ang>=d_ang
-		tic;
-		Igrad_x = zeros(Isize);
-		Igrad_y = zeros(Isize);
-		Igrad_z = zeros(Isize);
-		for v = 1: nV
-			% coordinates of the current voter
-			vx = Itx(v);
-			vy = Ity(v);
-			vz = Itz(v);
-
-			%get the local value of the voting image
-			local_Ivote = Ivote(g_v_prime(v,1:validPixels(v)));
-
-			%find the index of the maximum value
-			[~, local_max_idx] = max(local_Ivote);
-
-			%convert this into a global subscript
-			[g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx));
-
-			%compute the vector from the voter position to this position
-			
-			Igrad_x(vx, vy, vz) = g_px - vx ;
-			Igrad_y(vx, vy, vz) = g_py - vy;
-			Igrad_z(vx, vy, vz) = g_pz - vz;
-% 			if itr ==3
-% 				if mod(v, 5000)==0
-% 					mask(g_px, g_py, g_pz)= mask(g_px, g_py, g_pz) + 1;
-% 				end
-% 		end
-% 		if itr ==6
-% 			if mod(v, 5000)==0
-% 				mask1(g_px, g_py, g_pz)= mask1(g_px, g_py, g_pz) + 1;
-% 			end
-% 		end
-		end
-		
-
-	tdir1 = toc;
-	display (['updating dir done.  time = ', num2str(tdir1)]);
-	ang = ang - d_ang;
-	end
-	
- end
- 
-hv  = reshape(Ivote, [X*Y*Z, 1]);
-hist(hv, 250);
-%%
-t = 4300;
-conn = [5 5 5];
-Icenter = local_max(Ivote, conn, t);
-fidc = fopen(sprintf('std3.2-r10.10-v8/out%d-t%d.vol',t,t0), 'w');
-fwrite(fidc, single(Icenter), '*single');
-fclose(fidc);
-nnz(Icenter)
-% [cxx1, cyy1, czz1] = ind2sub(size(Icenter),find(Icenter));
-
-% % center = Ivote1;
-% % center(center<t) = 0;
-% % center = imregionalmax(center);
-% % cn = nnz(center);
-% % [cx, cy, cz] = ind2sub(size(center), find(center));
-% % Icenter = zeros(size(center));
-% % for cc =1:cn
-% % 	Icenter(cx(cc), cy(cc), cz(cc)) = 255;
-% % end
-% 
-% % fid_Ic = fopen('image_center2-300.vol', 'w');
-% % fwrite(fid_Ic, Icenter);
-% % fclose(fid_Ic);
-% cn = nnz(Icenter);
-% [cx, cy, cz] = ind2sub(size(Icenter), find(Icenter));
-% Ic2d = zeros(size(Icenter,1), size(Icenter,2));
-% for cc =1:cn
-% 	Ic2d(cx(cc), cy(cc)) = 1;
-% end
-% I2d = max(I, [], 3);
-% % figure(1),imagesc(I2d); colormap(gray);
-% % figure(2),imagesc(Ic2d); colormap(gray);
-% %
-% out1(:,:,1) = mat2gray(I2d);
-% out1(:,:,2) = mat2gray(Ic2d);
-% out1(:,:,3) = mat2gray(I2d);
-% figure(1), imagesc((out1));
-%%% % imwrite(mat2gray(c2d), 'vote.bmp');
-%%
-% figure(1); imagesc(squeeze(I(:,:,ceil(size(I,3)/2)))), colormap(gray);
-%  figure(33); imagesc(squeeze(Ivote3(:,:,ceil(size(Ivote,3)/2)))), colormap(gray);
+
 clear all;
 disp('***************** NEW RUN *********************');
 X = 128;
@@ -7,15 +8,15 @@ D = 10;
 t0=1;
 r1=10;
 r2=10;
-t=2000;
+t=2100;
 itr=8;
 vote=10;
 std = [5 5];
-gt_filename = '128-128-128/0-gt.vol';
+gt_filename = '0-gt.vol';
 % out_filename = sprintf('128-128-128/0-nissl-std%d.%d-t0%d-r%d.%d-t%d-out%d.%d.vol',std(1), std(2),t0,r1,r2,t,itr,vote);
-out_filename = sprintf('D:/build/ivote3-bld/std5.5-r10.10-v8-phi15/out%d.vol',t);
+out_filename = sprintf('D:/build/ivote3-bld/shared2D-v8/out%d.vol',t);
 % txt_filename = sprintf('128-128-128/0-validation-nissl-std%d.%d-r%d.%d-t%d-out%d.%d-D%d.txt',std(1), std(2),r1,r2,t,itr,vote,D);
-txt_filename = sprintf('D:/build/ivote3-bld/std5.5-r10.10-v8-phi15/t%d-D%d.txt',t,D);
+txt_filename = sprintf('D:/build/ivote3-bld/shared2D-v8/t%d-D%d.txt',t,D);
 spec = sprintf('Nissl-std%d.%d-r%d.%d-t%d-out%d.%d',std(1), std(2),r1,r2,t,itr,vote);
 fid0 = fopen(gt_filename);
 gt = fread(fid0,[X Y*Z], 'single');
+#ifndef STIM_CUDA_cpyToshare_H
+#define STIM_CUDA_cpyToshare_H
+
+		//this function copy one channel data from global to shared memory in one dimension with size of X bytes.
+		template<typename T>
+		__device__ void cpyG2S1D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
+											  dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
+
+			//calculate the total number of threads available
+			unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
+			
+			//calculate the current 1D thread ID
+			unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
+
+			//calculate the number of iteration require for the copy
+			unsigned int I = X/tThreads + 1;
+
+			//the specified start position in global memory is (x, y, z)
+			unsigned int gstart = z*I_x*I_y + y*I_x + x;
+		
+			for (unsigned int i = 0; i < I; i++){
+		
+				//each iteration will copy tThreads elements, so the starting index in shared memory
+				//for each iteration will be i * tThreads (iteration # times the number of elements transferred per iteration)
+				unsigned int sIdx = i * tThreads + ti;
+				if (sIdx>= X*Y) return;
+				
+				//each iteration will copy tThreads elements from the global index
+				unsigned int gIdx = gstart + sIdx;
+
+				//copy global to share
+				dest[sIdx] = src[gIdx];
+				
+			}
+		}
+		//this function copy one channel data from global to shared memory in two dimensions with size of X*Y bytes.
+		template<typename T>
+		__device__ void cpyG2S2D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
+											  dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
+			//calculate the total number of threads available
+			unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
+			
+			//calculate the current 1D thread ID
+			unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
+
+			//calculate the number of iteration require for the copy
+			unsigned int I = X*Y/tThreads + 1;
+
+			unsigned int gz1 = z*I_x*I_y ;
+		
+			for (unsigned int i = 0; i < I; i++){
+				
+					unsigned int sIdx = i * tThreads + ti;
+					if (sIdx>= X*Y) return;
+
+					unsigned int sy = sIdx/X;
+					unsigned int sx = sIdx - (sy * X);
+					
+					unsigned int gx = x + sx;
+					unsigned int gy = y + sy;
+					if (gx<I_x && gy<I_y){
+						unsigned int gIdx = gz1 + gy * I_x + gx;
+						//copy global to share
+						dest[sIdx] = src[gIdx];
+					}
+							
+			}
+		}
+		//this function copy three channels data from global to shared memory in one dimension with size of X bytes.
+		template<typename T>
+		__device__ void cpyG2S1D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
+											  dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
+
+			//calculate the total number of threads available
+			unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
+			
+			//calculate the current 1D thread ID
+			unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
+
+			//calculate the number of iteration require for the copy
+			unsigned int I = X/tThreads + 1;
+
+			//the specified start position in global memory is (x, y, z)
+			unsigned int gstart = z*I_x*I_y + y*I_x + x;
+		
+			for (unsigned int i = 0; i < I; i++){
+		
+				//each iteration will copy tThreads elements, so the starting index in shared memory
+				//for each iteration will be i * tThreads (iteration # times the number of elements transferred per iteration)
+				unsigned int sIdx = i * tThreads + ti;
+				if (sIdx>= X*Y) return;				
+				unsigned int gIdx = gstart*3 + sIdx; 
+				//copy global to share
+				dest[sIdx] = src[gIdx];
+				
+			}
+		}
+		//this function copy three channels data from global to shared memory in two dimensions with size of X*Y bytes.
+		template<typename T>
+		__device__ void cpyG2S2D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
+											  dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
+			//calculate the total number of threads available
+			unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
+			
+			//calculate the current 1D thread ID
+			unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
+
+			//calculate the number of iteration require for the copy
+			unsigned int I = X*Y/tThreads + 1;
+			
+			unsigned int gz1 = z*I_x*I_y ;
+		
+			for (unsigned int i = 0; i < I; i++){
+				
+					unsigned int sIdx = i * tThreads + ti;
+					if (sIdx>= X*Y) return;
+					unsigned int sy = sIdx/X;
+					unsigned int sx = sIdx - (sy * X);
+					
+					unsigned int gx = x + sx/3;
+					unsigned int gy = y + sy;
+					if (gx<I_x && gy<I_y){
+						unsigned int gIdx = (gz1 + gy * I_x + gx)*3 + (sx%3);
+						//copy global to share
+						dest[sIdx] = src[gIdx];			
+					}
+			}
+		}
+		// this function compute the gradient magnitude saved in the shared memory and stores the magnitude result in the rest of shared memory.
+		template<typename T>
+		__device__ void mag_share2D(T* grad, unsigned int bs, unsigned int X, unsigned int Y, dim3 threadIdx, dim3 blockDim){
+			
+			//calculate the total number of threads available
+			unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
+			//calculate the current 1D thread ID
+			unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
+			//calculate the number of iteration require for the copy
+			unsigned int I = X*Y/tThreads + 1;
+			for (unsigned int i = 0; i < I; i++){
+				
+				unsigned int sIdx = i * tThreads + ti;
+				if (sIdx>= X*Y) return;
+				float gx = grad[sIdx*3];
+				float gy = grad[sIdx*3 + 1];
+				float gz = grad[sIdx*3 + 2];
+				float mag = sqrt(gx*gx + gy*gy + gz*gz);
+				grad[bs + sIdx] = mag;
+					
+			}
+		}
+#endif
 \ No newline at end of file
-#ifndef STIM_CUDA_FLOAT_TO_HALF_H
-#define STIM_CUDA_FLOAT_TO_HALF_H
-
-#include <iostream>
-#include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include <stim/cuda/cudatools/error.h>
-#include <cuda_fp16.h>
-#include <stdio.h>
-		__global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){
-
-			
-			//calculate x,y,z coordinates for this thread
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			//find the grid size along y
-			int grid_y = y / blockDim.y;
-			int blockidx_y = blockIdx.y % grid_y;
-			int yi = blockidx_y * blockDim.y + threadIdx.y;
-			int zi = blockIdx.y / grid_y;
-			int i = zi * x * y + yi * x + xi;
-
-			if(xi >= x|| yi >= y || zi>= z) return;
-
-							
-			gpu_half[i] = __float2half(gpu_float[i]);
-			
-			
-		}
-
-
-		void gpu_f2h(half* gpu_half, float* gpu_float, unsigned int x, unsigned int y, unsigned int z){
-
-			
-			int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(sqrt (max_threads),sqrt (max_threads));
-			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
-
-			//call the GPU kernel to determine the gradient
-			cuda_f2h <<< blocks, threads >>>(gpu_half, gpu_float, x, y, z);
-
-		}
-
-
-		
-		void cpu_f2h(half* h_out, float* f_in, unsigned int x, unsigned int y, unsigned int z){
-
-			//calculate the number of pixels in the array
-			unsigned int pix = x* y* z;
-
-			//allocate memory on the GPU for the input float precision.
-			float* gpu_float;
-			cudaMalloc(&gpu_float, pix * sizeof(float));
-			cudaMemcpy(gpu_float, f_in, pix * sizeof(float), cudaMemcpyHostToDevice);
-			
-			//allocate memory on the GPU for the output half precision
-			half* gpu_half;
-			cudaMalloc(&gpu_half, pix * sizeof(half));
-
-			//call the GPU version of this function
-			gpu_f2h(gpu_half, gpu_float, x, y, z);
-
-			//copy the array back to the CPU
-			cudaMemcpy(h_out, gpu_half, pix * sizeof(half), cudaMemcpyDeviceToHost);
-
-			//free allocated memory
-			cudaFree(gpu_float);
-			cudaFree(gpu_half);
-
-		}
-
-
-#endif
 \ No newline at end of file
-#ifndef STIM_CUDA_HALF_TO_FLOAT_H
-#define STIM_CUDA_HALF_TO_FLOAT_H
-
-#include <iostream>
-#include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include <stim/cuda/cudatools/error.h>
-#include "cuda_fp16.h"
-
-		__global__ void cuda_h2f(float* gpu_float, half* gpu_half, int x, int y, int z){
-
-			
-			//calculate x,y,z coordinates for this thread
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			//find the grid size along y
-			int grid_y = y / blockDim.y;
-			int blockidx_y = blockIdx.y % grid_y;
-			int yi = blockidx_y * blockDim.y + threadIdx.y;
-			int zi = blockIdx.y / grid_y;
-			int i = zi * x * y + yi * x + xi;
-
-			if(xi >= x|| yi >= y || zi>= z) return;
-
-							
-			gpu_float[i] = __half2float(gpu_half[i]);
-							
-		}
-
-
-		void gpu_h2f(float* gpu_float, half* gpu_half, unsigned int x, unsigned int y, unsigned int z){
-
-			
-			int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(sqrt (max_threads),sqrt (max_threads));
-			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
-
-			//call the GPU kernel to determine the gradient
-			cuda_h2f <<< blocks, threads >>>(gpu_float, gpu_half, x, y, z);
-
-		}
-
-
-		
-		void cpu_f2h(float* f_out, half* h_in, unsigned int x, unsigned int y, unsigned int z){
-
-			//calculate the number of pixels in the array
-			unsigned int pix = x* y* z;
-
-			//allocate memory on the GPU for the input half precision
-			half* gpu_half;
-			cudaMalloc(&gpu_half, pix * sizeof(half));			
-			cudaMemcpy(gpu_half, h_in, pix * sizeof(half), cudaMemcpyHostToDevice);
-
-			//allocate memory on the GPU for the output float precision.
-			float* gpu_float;
-			cudaMalloc(&gpu_float, pix * sizeof(float));
-			
-			
-
-			//call the GPU version of this function
-			gpu_h2f(gpu_float, gpu_half, x, y, z);
-
-			cudaMemcpy(f_out, gpu_float, pix * sizeof(float), cudaMemcpyDeviceToHost);
-
-			
-
-			//free allocated memory
-			cudaFree(gpu_float);
-			cudaFree(gpu_half);
-
-		}
-
-
-#endif
 \ No newline at end of file
@@ -37,10 +37,9 @@ int main(int argc, char** argv){
 		printf("current device ID: %d\n", i);
 		printf("device name: %s\n", prop.name);
 		printf("total global mem: %lu\n", prop.totalGlobalMem);
+		printf("shared memory per block: %lu\n", prop.sharedMemPerBlock);
 	}
-	
-	
-	
+		
 	//output advertisement
 	std::cout<<std::endl<<std::endl;
 	std::cout<<"========================================================================="<<std::endl;
@@ -124,18 +123,18 @@ int main(int argc, char** argv){
 		invert_data(cpuI, x, y, z);
 	//write a new file from the cpuI.
-	std::ofstream original("std5.5-r10.10-v8/inv-128.vol", std::ofstream::out | std::ofstream::binary);
+	std::ofstream original("shared2D-v8/inv-128.vol", std::ofstream::out | std::ofstream::binary);
 	original.write((char*)cpuI, bytes);
 	original.close();
 	//allocate space on the cpu for the output result
-	float* cpu_out = (float*) malloc(bytes*3);
+	float* cpu_out = (float*) malloc(bytes);
 	// call the ivote function
 	ivote3(cpu_out, cpuI, sigma, anisotropy, phi, d_phi, r, iter, t, conn, x, y, z);
 	//write the blurred file from the cpuI.
-	std::ofstream fblur("vote-check/vote8.vol", std::ofstream::out | std::ofstream::binary);
+	std::ofstream fblur("shared2D-v8/vote8.vol", std::ofstream::out | std::ofstream::binary);
 	fblur.write((char*)cpuI, bytes);
 	fblur.close();
 	/*
@@ -146,13 +145,13 @@ int main(int argc, char** argv){
 	fgx.close();
 	*/
 	//write the output file.
-	std::ofstream fo("std5.5-r10.10-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary);
+	std::ofstream fo("shared2D-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary);
 	fo.write((char*)cpu_out, bytes);
 	fo.close();
 	// creat a file for saving the list centers
-		std::ofstream list("std5.5-r10.10-v8/center.txt");
+		std::ofstream list("shared2D-v8/center.txt");
 		if (list.is_open()){
 				for (int ix=0; ix<x; ix++){
-#ifndef STIM_CUDA_SET_RMAX_H
-#define STIM_CUDA_SET_RMAX_H
-
-#include <iostream>
-#include <cuda.h>
-#include <stim/cuda/cudatools.h>
-#include <stim/cuda/sharedmem.cuh>
-#include <stim/cuda/cudatools/error.h>
-
-template<typename T>
-	__global__ void cuda_set_rmax(T* gpu_r, int rx, int ry, int rz, int x, int y, int z){
-
-		//calculate x,y,z coordinates for this thread
-		int xi = blockIdx.x * blockDim.x + threadIdx.x;
-		//find the grid size along y
-		int grid_y = y / blockDim.y;
-		int blockidx_y = blockIdx.y % grid_y;
-		int yi = blockidx_y * blockDim.y + threadIdx.y;
-		int zi = blockIdx.y / grid_y;
-		int i = zi * x * y + yi * x + xi;			
-			
-		if(xi>=x || yi>=y || zi>=z) return;
-
-		gpu_r[i*3+0] = rx;
-		gpu_r[i*3+1] = ry;
-		gpu_r[i*3+2] = rz;
-			
-	}
-
-template<typename T>
-		void gpu_set_rmax(T* gpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
-
-			
-			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(sqrt (max_threads),sqrt (max_threads));
-			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
-						
-			//call the kernel to do the voting
-			cuda_set_rmax <T> <<< blocks, threads >>>(gpu_r, r[0], r[1], r[2], x , y, z);
-
-		}
-template<typename T>
-		void cpu_set_rmax(T* cpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
-
-			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * z * sizeof(T);
-
-			//allocate space on the GPU for the rmax
-			T* gpu_r;
-			cudaMalloc(&gpu_vote, bytes*3);		
-
-			cudaMemcpy(gpu_r, cpu_r, bytes*3, cudaMemcpyHostToDevice);
-			
-					
-			//call the GPU version of the vote calculation function
-			gpu_set_rmax<T>(gpu_r, r, x , y, z);
-							
-			//copy the Vote Data back to the CPU
-			cudaMemcpy(cpu_r, gpu_r, bytes*3, cudaMemcpyDeviceToHost) ;
-
-			//free allocated memory
-			cudaFree(gpu_r);
-					
-		}
-
-
-#endif
 \ No newline at end of file
@@ -6,11 +6,12 @@
 #include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
 #include <cuda_fp16.h>
+#include "cpyToshare.cuh"
 		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
 		template<typename T>
 		__global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz,  int x,  int y, int z){
-
+			extern __shared__ float s_vote[];
 			//calculate x,y,z coordinates for this thread
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
 			//find the grid size along y
@@ -18,10 +19,21 @@
 			int blockidx_y = blockIdx.y % grid_y;
 			int yi = blockidx_y * blockDim.y + threadIdx.y;
 			int zi = blockIdx.y / grid_y;
+			//compute the global 1D index for this pixel
 			int i = zi * x * y + yi * x + xi;
 			if(xi >= x|| yi >= y || zi>= z) return;
-
+			// find the starting points for this block along the x and y directions
+			int bxi = blockIdx.x * blockDim.x;
+			int byi = blockidx_y * blockDim.y;
+			//find the starting points and the size of the window, which will be copied to the 2D-shared memory
+			int bxs = bxi - rx;
+			int bys = byi - ry;
+			int xwidth = 2 * rx + blockDim.x;
+			int ywidth = 2 * ry + blockDim.y;
+			//compute the coordinations of this pixel in the 2D-shared memory.
+			int sx_rx = threadIdx.x + rx;
+			int sy_ry = threadIdx.y + ry;
 			//find the gradient values along the x, y ,z axis, and the gradient magnitude for the voter
 			float g_v_x = gpu_grad[i * 3 + 0];
 			float g_v_y = gpu_grad[i * 3 + 1];
@@ -42,39 +54,48 @@
 			int rz_sq = rz * rz;
 			for (int z_p = -rz; z_p<=rz; z_p++){
-							
-				for(int y_p = -ry; y_p <= ry; y_p++){
-
-					for(int x_p = -rx; x_p <= rx; x_p++){
-
-						//calculate the x, y ,z indices for the current pixel
-						int xi_p = (xi + x_p) ;
+				int zi_p = zi + z_p;
+				if ((zi_p) >=0 && (zi_p) < z){
+					//call the function to copy one slide of vote date to the 2D-shared memory.
+					__syncthreads();
+					cpyG2S2D<float>(s_vote, gpu_vote, bxs, bys, zi + z_p, xwidth, ywidth, threadIdx, blockDim, x, y);
+					__syncthreads();								
+					float z_sq = z_p * z_p;		
+					float d_z_sq = (z_sq)/rz_sq;
+					for(int y_p = -ry; y_p <= ry; y_p++){
+
+						float y_sq = y_p * y_p;
+						float yz_sq = y_sq + z_sq;
 						int yi_p = (yi + y_p) ;
-						int zi_p = (zi + z_p) ; 
-						if (zi_p >=0 && zi_p < z && yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){
+						float d_yz_sq = (y_sq)/ry_sq + d_z_sq;
+						unsigned int s_y1d = (sy_ry + y_p) * xwidth;
+						for(int x_p = -rx; x_p <= rx; x_p++){
-							//calculate the distance between the pixel and the current voter.
-							float x_sq = x_p * x_p;
-							float y_sq = y_p * y_p;
-							float z_sq = z_p * z_p;
-							float d_pv = sqrt(x_sq + y_sq + z_sq);
+							//check if the current pixel is inside of the data-set.
+							int xi_p = (xi + x_p) ;
+							if (yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){
-							// calculate the angle between the pixel and the current voter.
-							float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v);
+								//calculate the distance between the pixel and the current voter.
+								float x_sq = x_p * x_p;															
+								float d_pv = sqrt(x_sq + yz_sq);
-							if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){
+								// calculate the angle between the pixel and the current voter.
+								float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v);
-								//calculate the 1D index for the current pixel
-								unsigned int id_p = (zi_p) * x * y + (yi_p) * x + (xi_p);
-								l_vote = gpu_vote[id_p];
-						
-							// compare the vote value of this pixel with the max value to find the maxima and its index.
-								if  (l_vote>max) {
+								if ((((x_sq)/rx_sq + d_yz_sq )<= 1) && (cos_diff >= cos_phi)){
-									max = l_vote;
-									id_x = x_p;
-									id_y = y_p;
-									id_z = z_p;
+									//calculate the 1D index for the current pixel in the 2D-shared memory
+									unsigned int id_s = s_y1d + (sx_rx + x_p);
+									l_vote = s_vote[id_s];
+						
+								// compare the vote value of this pixel with the max value to find the maxima and its index.
+									if  (l_vote>max) {
+
+										max = l_vote;
+										id_x = x_p;
+										id_y = y_p;
+										id_z = z_p;
+									}
 								}
 							}
 						}
@@ -115,13 +136,13 @@
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(sqrt (max_threads),sqrt (max_threads));
 			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
-			
+			unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*sizeof(T);			
 			// allocate space on the GPU for the updated vote direction
 			T* gpu_dir;
 			cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3);	
 			//call the kernel to calculate the new voting direction
-			update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z);
+			update_dir3 <<< blocks, threads, shared_bytes >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z);
 			//call the kernel to update the gradient direction
@@ -6,12 +6,13 @@
 #include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
 #include <stim/cuda/cudatools/error.h>
-
+#include "cpyToshare.cuh"
 		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
 		template<typename T>
 		__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
+			extern __shared__ float s[];
 			//calculate x,y,z coordinates for this thread
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
 			//find the grid size along y
@@ -23,6 +24,17 @@
 			if(xi>=x || yi>=y || zi>=z) return;
+			//find the starting points and the size of the window, which will be copied to the 2D-shared memory
+			int bxs = blockIdx.x * blockDim.x - rx;
+			int bys = blockidx_y * blockDim.y - ry;
+			int xwidth = 2 * rx + blockDim.x;
+			int ywidth = 2 * ry + blockDim.y;
+			//calculate the starting point of shared memory for storing the magnitude.
+			unsigned int b_s = 3 * xwidth * ywidth;
+			//compute the coordinations of this pixel in the 2D-shared memory.
+			int sx_rx = threadIdx.x + rx;
+			int sy_ry = threadIdx.y + ry;
+
 			// define a local variable to sum the votes from the voters
 			float sum = 0;
@@ -31,43 +43,58 @@
 			int rz_sq = rz * rz;
 			for (int z_v = -rz; z_v<=rz; z_v++){
-							
-				for(int y_v = -ry; y_v <= ry; y_v++){
-
-					for(int x_v = -rx; x_v <= rx; x_v++){
-
-						//calculate the x, y ,z indices for the current voter
-						int xi_v = (xi + x_v) ;
+				int zi_v = zi + z_v;
+				if ((zi_v) >=0 && (zi_v) <z){
+					//call the function to copy one slide of the gradient from global to the 2D-shared memory.					
+					__syncthreads();
+					cpyG2S2D3ch<float>(s, gpu_grad, bxs, bys, zi + z_v, 3*xwidth, ywidth, threadIdx, blockDim, x, y);
+					__syncthreads();
+					mag_share2D<float>(s, b_s, xwidth, ywidth, threadIdx, blockDim);
+					__syncthreads();
+					float z_sq = z_v * z_v;	
+					float d_z_sq = z_sq/rz_sq;
+
+					for(int y_v = -ry; y_v <= ry; y_v++){
 						int yi_v = (yi + y_v) ;
-						int zi_v = (zi + z_v) ; 
-						if (zi_v >=0 && zi_v < z && yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){
-														
-							//calculate the 1D index for the current voter
-							unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v);
+						//compute the position of the current voter in the shared memory along the y axis.
+						unsigned int sIdx_y1d = (sy_ry + y_v)* xwidth;
+
+						float y_sq = y_v * y_v;
+						float yz_sq = z_sq + y_sq;
+						float d_yz_sq = y_sq/ry_sq + d_z_sq;
+						for(int x_v = -rx; x_v <= rx; x_v++){
+
+							//check if the current voter is inside of the data-set
+							int xi_v = (xi + x_v) ;
+							if (yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){
+
+								//compute the position of the current voter in the 2D-shared memory along the x axis.
+								unsigned int sIdx_x = (sx_rx + x_v);
+								//find the 1D index of this voter in the 2D-shared memory.
+								unsigned int s_Idx = (sIdx_y1d  + sIdx_x);
+								unsigned int s_Idx3 = s_Idx * 3;
+								
+								//save the gradient values for the current voter to the local variables and compute the gradient magnitude.					
+								float g_v_x = s[s_Idx3];
+								float g_v_y = s[s_Idx3 + 1];
+								float g_v_z = s[s_Idx3 + 2];																						
+								float mag_v = s[b_s + s_Idx]; //sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);										
+						
+								//calculate the distance between the pixel and the current voter.
+								float x_sq = x_v * x_v;									
+								float d_pv = sqrt(x_sq + yz_sq);
-							//find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter
-							
-							float g_v_x = gpu_grad[id_v * 3 + 0];
-							float g_v_y = gpu_grad[id_v * 3 + 1];
-							float g_v_z = gpu_grad[id_v * 3 + 2];
-							float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
-							
-							//calculate the distance between the pixel and the current voter.
-							float x_sq = x_v * x_v;
-							float y_sq = y_v * y_v;
-							float z_sq = z_v * z_v;
-							float d_pv = sqrt(x_sq + y_sq + z_sq);
-
-							// calculate the angle between the pixel and the current voter.
-							float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v);
+								// calculate the angle between the pixel and the current voter.
+								float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v);
-							// check if the current voter is located in the voting area of this pixel.
-							if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){
+								// check if the current voter is located in the voting area of this pixel.
+								if ((((x_sq)/rx_sq + d_yz_sq)<= 1) && (cos_diff >= cos_phi)){
-								sum += mag_v;	
+									sum += mag_v;	
+								}
 							}
-						}
-					}	
+						}	
+					}
 				}
 			}
@@ -81,9 +108,9 @@
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(sqrt (max_threads),sqrt (max_threads));
 			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
-						
+			unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T);			
 			//call the kernel to do the voting
-			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
+			vote3 <T> <<< blocks, threads, shared_bytes >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
 		}