Commit 89604e92d1faa3934a762e9cf42e61f5644c78bd

Authored by Laila Saadatifard
1 parent 3b5043cd

ivote3 run on the shared memory

Matlab_3D/0-gt.vol 0 → 100644
No preview for this file type
Matlab_3D/ivote3.m
1 -clc; 1 +
2 clear; 2 clear;
3 disp('***************** NEW RUN *********************'); 3 disp('***************** NEW RUN *********************');
4 total = tic; 4 total = tic;
5 % ******* Initialize voting parameters ************************************** 5 % ******* Initialize voting parameters **************************************
6 -rmax = [16 16 8]; %maximum radius of the cell  
7 -ang_deg = 20.1; %half the angular range of the voting area 6 +rmax = [10 10 10]; %maximum radius of the cell
  7 +rmin = [1 1 1];
  8 +ang_deg = 25.1; %half the angular range of the voting area
8 ang = ang_deg * pi / 180; 9 ang = ang_deg * pi / 180;
9 -iter = 5; %number of voting iterations  
10 -t0 = 1.0; %threshold color  
11 -sigma = [3, 3, 1.5]; 10 +iter = 8 ; %number of voting iterations
  11 +t0 = 1;
  12 +sigma = [5, 5, 5];
12 % t = 0.1; 13 % t = 0.1;
13 -d_ang= ang / (iter); 14 +d_ang= ang / (iter+2);
14 % ******** Testing parameters ****************************************** 15 % ******** Testing parameters ******************************************
15 % p = [50, 50, 150]; 16 % p = [50, 50, 150];
16 % ps = [400, 400, 200]; 17 % ps = [400, 400, 200];
@@ -22,15 +23,14 @@ d_ang= ang / (iter); @@ -22,15 +23,14 @@ d_ang= ang / (iter);
22 % X = S(1); 23 % X = S(1);
23 % Y = S(2); 24 % Y = S(2);
24 % Z = S(3); 25 % Z = S(3);
25 -filename = 'nissl-float-128.128.128.vol'; 26 +filename = '128-128-128/nissl-float-128.128.128.vol'; %'nissl-float-128.128.128.vol';
26 X = 128; 27 X = 128;
27 Y = 128; 28 Y = 128;
28 Z = 128; 29 Z = 128;
29 -fid = fopen(filename); 30 +fidi = fopen(filename);
30 % load the VOL data into a 2D matrix 31 % load the VOL data into a 2D matrix
31 -I = fread(fid,[X Y*Z], 'single');  
32 -fclose(fid);  
33 -%% 32 +I = fread(fidi,[X Y*Z], 'single');
  33 +fclose(fidi);
34 %change this to a 3D matrix 34 %change this to a 3D matrix
35 I = (reshape(I, [X, Y, Z])); 35 I = (reshape(I, [X, Y, Z]));
36 % invert the intensity 36 % invert the intensity
@@ -38,26 +38,21 @@ I = (255 - I); @@ -38,26 +38,21 @@ I = (255 - I);
38 38
39 %perform a gaussian blur 39 %perform a gaussian blur
40 Iblur = gauss_blur3d(I, sigma); 40 Iblur = gauss_blur3d(I, sigma);
41 -  
42 -% %crop out a small subregion of I and Iblur  
43 -% Iblur = Iblur(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1);  
44 -% I = I(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1);  
45 -%  
46 % compute the gradient 41 % compute the gradient
47 [Igrad_y, Igrad_x, Igrad_z] = gradient(Iblur); 42 [Igrad_y, Igrad_x, Igrad_z] = gradient(Iblur);
48 43
49 %calculate the gradient magnitude 44 %calculate the gradient magnitude
50 Imag = sqrt(Igrad_x .^ 2 + Igrad_y .^ 2 + Igrad_z .^2); 45 Imag = sqrt(Igrad_x .^ 2 + Igrad_y .^ 2 + Igrad_z .^2);
51 Isize = size(I); 46 Isize = size(I);
52 -I = single(I);  
53 -Iblur = single(Iblur);  
54 47
55 -%h = reshape(Imag, [X*Y*Z, 1]);  
56 -%hist(h, 100); 48 +% h = reshape(Imag, [X*Y*Z, 1]);
  49 +% hist(h, 100);
57 50
58 %set a threshold for the gradient magnitude 51 %set a threshold for the gradient magnitude
59 It = Imag > t0; 52 It = Imag > t0;
60 - 53 +fidt = fopen('128-128-128/It.vol', 'w');
  54 +fwrite(fidt, It, 'single');
  55 +fclose(fidt);
61 %Set the boundaries of the threshold image to zero 56 %Set the boundaries of the threshold image to zero
62 It(1:rmax(1), :, :) = 0; 57 It(1:rmax(1), :, :) = 0;
63 It(X - rmax(1):X, :,:) = 0; 58 It(X - rmax(1):X, :,:) = 0;
@@ -65,13 +60,12 @@ It(:, 1:rmax(2), :) = 0; @@ -65,13 +60,12 @@ It(:, 1:rmax(2), :) = 0;
65 It(:, Y - rmax(2):Y,:) = 0; 60 It(:, Y - rmax(2):Y,:) = 0;
66 It(:, :, 1:rmax(3)) = 0; 61 It(:, :, 1:rmax(3)) = 0;
67 It(:,:, Z - rmax(3):Z) = 0; 62 It(:,:, Z - rmax(3):Z) = 0;
68 -%% 63 +
69 %get the indices of all of the nonzero values in the threshold image 64 %get the indices of all of the nonzero values in the threshold image
70 % (voter positions) 65 % (voter positions)
71 [Itx,Ity,Itz] = ind2sub(size(It),find(It)); 66 [Itx,Ity,Itz] = ind2sub(size(It),find(It));
72 Vi =(find(It)); 67 Vi =(find(It));
73 nV = nnz(It); 68 nV = nnz(It);
74 -%  
75 % create a meshgrid describing coordinates relative to the voter position 69 % create a meshgrid describing coordinates relative to the voter position
76 rangex = -rmax(1):rmax(1); %create an array of values between -rmax and rmax 70 rangex = -rmax(1):rmax(1); %create an array of values between -rmax and rmax
77 rangey = -rmax(2):rmax(2); 71 rangey = -rmax(2):rmax(2);
@@ -80,8 +74,9 @@ rangez = -rmax(3):rmax(3); @@ -80,8 +74,9 @@ rangez = -rmax(3):rmax(3);
80 m_mag = (sqrt(mx.^2 + my.^2 + mz.^2)); %create a template describing the distance from the center of a small cube 74 m_mag = (sqrt(mx.^2 + my.^2 + mz.^2)); %create a template describing the distance from the center of a small cube
81 75
82 % create a mask for the voting area 76 % create a mask for the voting area
83 -M_dist = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1; %mask for the voting area distance (all values < rmax from the center)  
84 -%% 77 +M_dist1 = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1 ; %mask for the voting area distance (all values < rmax from the center)
  78 +M_dist2 = (mx.^2/rmin(1)^2 + my.^2/rmin(2)^2 + mz.^2/rmin(3)^2) >= 1 ;
  79 +M_dist = M_dist1 .* M_dist2;
85 % calculate the direction vector between a pixel and voter 80 % calculate the direction vector between a pixel and voter
86 LV_x = mx./m_mag; 81 LV_x = mx./m_mag;
87 LV_y = my./m_mag; 82 LV_y = my./m_mag;
@@ -89,23 +84,24 @@ LV_z = mz./m_mag; @@ -89,23 +84,24 @@ LV_z = mz./m_mag;
89 84
90 %number of pixels in the voting area of each voter (initialize to zero) 85 %number of pixels in the voting area of each voter (initialize to zero)
91 validPixels = (zeros(nV,1)); 86 validPixels = (zeros(nV,1));
92 -%%  
93 %indices of pixels in the voting area of each voter 87 %indices of pixels in the voting area of each voter
94 % indices reference the 3D image 88 % indices reference the 3D image
95 g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang)); 89 g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang));
96 90
97 91
98 -%% vote 92 +% vote
99 tic; 93 tic;
  94 +mask = zeros(Isize);
  95 +mask1 = zeros(Isize);
  96 +
100 %for each iteration (in iterative voting) 97 %for each iteration (in iterative voting)
101 -for itr = 1 : iter+1 98 +for itr = 1 :iter
102 99
103 %initialize the vote image to zero 100 %initialize the vote image to zero
104 Ivote = zeros(Isize); 101 Ivote = zeros(Isize);
105 - 102 +
106 %for each voter (nonzero pixels in the threshold image It) 103 %for each voter (nonzero pixels in the threshold image It)
107 for v = 1: nV 104 for v = 1: nV
108 -  
109 %get the cartesian coordinates of the voter v in the main image I 105 %get the cartesian coordinates of the voter v in the main image I
110 vx = Itx(v); 106 vx = Itx(v);
111 vy = Ity(v); 107 vy = Ity(v);
@@ -135,7 +131,7 @@ for itr = 1 : iter+1 @@ -135,7 +131,7 @@ for itr = 1 : iter+1
135 M_angle = cos_diff >= cos(ang); 131 M_angle = cos_diff >= cos(ang);
136 132
137 %combine the two masks to mask out the voting angle 133 %combine the two masks to mask out the voting angle
138 - M = M_angle .* M_dist; 134 + M = M_angle.* M_dist;
139 135
140 % get the coordinates of each pixel in the final voter mask M 136 % get the coordinates of each pixel in the final voter mask M
141 pi = find(M); 137 pi = find(M);
@@ -156,33 +152,20 @@ for itr = 1 : iter+1 @@ -156,33 +152,20 @@ for itr = 1 : iter+1
156 152
157 153
158 Ivote( global_pi ) = Ivote( global_pi ) + vmag; 154 Ivote( global_pi ) = Ivote( global_pi ) + vmag;
159 -  
160 - end  
161 - fid = fopen(sprintf('128-128-128/vote%d',itr), 'w');  
162 - if itr ==1  
163 - fwrite(fid, Ivote, 'single');  
164 -  
165 - elseif itr ==2  
166 - fwrite(fid, Ivote, 'single');  
167 -  
168 - elseif itr ==3  
169 - fwrite(fid, Ivote, 'single');  
170 -  
171 - elseif itr ==4  
172 - fwrite(fid, Ivote, 'single');  
173 -  
174 - elseif itr == 5  
175 - fwrite(fid, Ivote, 'single');  
176 - elseif itr == 6  
177 - fwrite(fid, Ivote, 'single');  
178 end 155 end
  156 + fid = fopen(sprintf('128-128-128/nissl-vote%d',itr), 'w');
  157 + fwrite(fid, single(Ivote), '*single');
179 fclose(fid); 158 fclose(fid);
  159 +
180 t_v1 = toc; 160 t_v1 = toc;
181 disp(['voting done. time =',num2str(t_v1)]); 161 disp(['voting done. time =',num2str(t_v1)]);
182 - 162 +
183 % update the voting direction 163 % update the voting direction
184 - if ang>0 164 + if ang>=d_ang
185 tic; 165 tic;
  166 + Igrad_x = zeros(Isize);
  167 + Igrad_y = zeros(Isize);
  168 + Igrad_z = zeros(Isize);
186 for v = 1: nV 169 for v = 1: nV
187 % coordinates of the current voter 170 % coordinates of the current voter
188 vx = Itx(v); 171 vx = Itx(v);
@@ -199,24 +182,32 @@ for itr = 1 : iter+1 @@ -199,24 +182,32 @@ for itr = 1 : iter+1
199 [g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx)); 182 [g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx));
200 183
201 %compute the vector from the voter position to this position 184 %compute the vector from the voter position to this position
202 - Igrad_x(vx, vy, vz) = g_px - vx; 185 +
  186 + Igrad_x(vx, vy, vz) = g_px - vx ;
203 Igrad_y(vx, vy, vz) = g_py - vy; 187 Igrad_y(vx, vy, vz) = g_py - vy;
204 Igrad_z(vx, vy, vz) = g_pz - vz; 188 Igrad_z(vx, vy, vz) = g_pz - vz;
205 -  
206 end 189 end
207 - 190 +
208 191
209 tdir1 = toc; 192 tdir1 = toc;
210 display (['updating dir done. time = ', num2str(tdir1)]); 193 display (['updating dir done. time = ', num2str(tdir1)]);
211 ang = ang - d_ang; 194 ang = ang - d_ang;
212 end 195 end
  196 +
213 end 197 end
214 198
  199 +hv = reshape(Ivote, [X*Y*Z, 1]);
  200 +hist(hv, 250);
  201 +%%
  202 +t = 300;
  203 +conn = [5 5 5];
  204 +Icenter = local_max(Ivote, conn, t);
  205 +fidc = fopen(sprintf('std3.2-r10.10-v8/out%d-t%d.vol',t,t0), 'w');
  206 +fwrite(fidc, single(Icenter), '*single');
  207 +fclose(fidc);
  208 +nnz(Icenter)
  209 +% [cxx1, cyy1, czz1] = ind2sub(size(Icenter),find(Icenter));
215 210
216 -% %%  
217 -% t = 350;  
218 -% conn = [5 5 3];  
219 -% Icenter = local_max(Ivote, conn, t);  
220 % % center = Ivote1; 211 % % center = Ivote1;
221 % % center(center<t) = 0; 212 % % center(center<t) = 0;
222 % % center = imregionalmax(center); 213 % % center = imregionalmax(center);
Matlab_3D/ivote3_new.m deleted
1 -  
2 -clear;  
3 -disp('***************** NEW RUN *********************');  
4 -total = tic;  
5 -% ******* Initialize voting parameters **************************************  
6 -rmax = [10 10 10]; %maximum radius of the cell  
7 -rmin = [1 1 1];  
8 -ang_deg = 25.1; %half the angular range of the voting area  
9 -ang = ang_deg * pi / 180;  
10 -iter = 8 ; %number of voting iterations  
11 -t0 = 1;  
12 -sigma = [3, 3, 2];  
13 -% t = 0.1;  
14 -d_ang= ang / (iter+2);  
15 -% ******** Testing parameters ******************************************  
16 -% p = [50, 50, 150];  
17 -% ps = [400, 400, 200];  
18 -% % ps = [100, 50, 40];  
19 -% % I = syn_Img(rmax , ps);  
20 -% volfile = 'nissl-rat.vol';  
21 -% fid = fopen(volfile); % open the file that include the image  
22 -% S = fread(fid, 3, 'int32');  
23 -% X = S(1);  
24 -% Y = S(2);  
25 -% Z = S(3);  
26 -filename = '128-128-128/nissl-float-128.128.128.vol'; %'nissl-float-128.128.128.vol';  
27 -X = 128;  
28 -Y = 128;  
29 -Z = 128;  
30 -fidi = fopen(filename);  
31 -% load the VOL data into a 2D matrix  
32 -I = fread(fidi,[X Y*Z], 'single');  
33 -fclose(fidi);  
34 -%change this to a 3D matrix  
35 -I = (reshape(I, [X, Y, Z]));  
36 -% invert the intensity  
37 -I = (255 - I);  
38 -  
39 -%perform a gaussian blur  
40 -Iblur = gauss_blur3d(I, sigma);  
41 -% Iblur = I;  
42 -% %crop out a small subregion of I and Iblur  
43 -% Iblur = Iblur(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1);  
44 -% I = I(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1);  
45 -% compute the gradient  
46 -[Igrad_y, Igrad_x, Igrad_z] = gradient(Iblur);  
47 -  
48 -%calculate the gradient magnitude  
49 -Imag = sqrt(Igrad_x .^ 2 + Igrad_y .^ 2 + Igrad_z .^2);  
50 -Isize = size(I);  
51 -  
52 -% h = reshape(Imag, [X*Y*Z, 1]);  
53 -% hist(h, 100);  
54 -  
55 -%set a threshold for the gradient magnitude  
56 -It = Imag > t0;  
57 -fidt = fopen('128-128-128/It.vol', 'w');  
58 -fwrite(fidt, It, 'single');  
59 -fclose(fidt);  
60 -%Set the boundaries of the threshold image to zero  
61 -It(1:rmax(1), :, :) = 0;  
62 -It(X - rmax(1):X, :,:) = 0;  
63 -It(:, 1:rmax(2), :) = 0;  
64 -It(:, Y - rmax(2):Y,:) = 0;  
65 -It(:, :, 1:rmax(3)) = 0;  
66 -It(:,:, Z - rmax(3):Z) = 0;  
67 -  
68 -%get the indices of all of the nonzero values in the threshold image  
69 -% (voter positions)  
70 -[Itx,Ity,Itz] = ind2sub(size(It),find(It));  
71 -Vi =(find(It));  
72 -nV = nnz(It);  
73 -% create a meshgrid describing coordinates relative to the voter position  
74 -rangex = -rmax(1):rmax(1); %create an array of values between -rmax and rmax  
75 -rangey = -rmax(2):rmax(2);  
76 -rangez = -rmax(3):rmax(3);  
77 -[mx, my, mz] = meshgrid(rangex, rangey, rangez); %create a template describing local pixel position in a small cube  
78 -m_mag = (sqrt(mx.^2 + my.^2 + mz.^2)); %create a template describing the distance from the center of a small cube  
79 -  
80 -% create a mask for the voting area  
81 -M_dist1 = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1 ; %mask for the voting area distance (all values < rmax from the center)  
82 -M_dist2 = (mx.^2/rmin(1)^2 + my.^2/rmin(2)^2 + mz.^2/rmin(3)^2) >= 1 ;  
83 -M_dist = M_dist1 .* M_dist2;  
84 -% calculate the direction vector between a pixel and voter  
85 -LV_x = mx./m_mag;  
86 -LV_y = my./m_mag;  
87 -LV_z = mz./m_mag;  
88 -  
89 -%number of pixels in the voting area of each voter (initialize to zero)  
90 -validPixels = (zeros(nV,1));  
91 -%indices of pixels in the voting area of each voter  
92 -% indices reference the 3D image  
93 -g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang));  
94 -  
95 -  
96 -% vote  
97 -tic;  
98 -mask = zeros(Isize);  
99 -mask1 = zeros(Isize);  
100 -  
101 -%for each iteration (in iterative voting)  
102 -for itr = 1 :iter  
103 -  
104 - %initialize the vote image to zero  
105 - Ivote = zeros(Isize);  
106 -  
107 - %for each voter (nonzero pixels in the threshold image It)  
108 - for v = 1: nV  
109 - %get the cartesian coordinates of the voter v in the main image I  
110 - vx = Itx(v);  
111 - vy = Ity(v);  
112 - vz = Itz(v);  
113 - vi = Vi(v);  
114 -  
115 - %retreive the gradient magnitude at the voter position  
116 - vmag = Imag(vi);  
117 -  
118 - %retrieve the gradient  
119 - gx = Igrad_x(vi);  
120 - gy = Igrad_y(vi);  
121 - gz = Igrad_z(vi);  
122 -  
123 - %calculate the gradient magnitude  
124 - dmag = sqrt (gx^2 + gy^2 + gz^2);  
125 -  
126 - %calculate the normalized gradient direction  
127 - dx = gx / dmag;  
128 - dy = gy / dmag;  
129 - dz = gz / dmag;  
130 -  
131 - %calculate the angle between the voter direction and the pixel direction  
132 - cos_diff = LV_x .* dx + LV_y .* dy + LV_z .* dz;  
133 -  
134 - %create an angular mask for the voting area  
135 - M_angle = cos_diff >= cos(ang);  
136 -  
137 - %combine the two masks to mask out the voting angle  
138 - M = M_angle.* M_dist;  
139 -  
140 - % get the coordinates of each pixel in the final voter mask M  
141 - pi = find(M);  
142 -  
143 - %calculate the number of pixels in the voting region  
144 - npts = nnz(M);  
145 - validPixels(v) = npts;  
146 -  
147 - %convert every index in the voting area from a local 3D index to a global 3D index (into the original image I)  
148 - global_px = vx + mx(pi);  
149 - global_py = vy + my(pi);  
150 - global_pz = vz + mz(pi);  
151 -  
152 - %convert the global 3D index of each point into a global 1D index  
153 - global_pi = sub2ind(Isize, global_px, global_py, global_pz);  
154 -  
155 - g_v_prime (v, 1:npts) = global_pi;  
156 -  
157 -  
158 - Ivote( global_pi ) = Ivote( global_pi ) + vmag;  
159 -% if itr ==3  
160 -% if mod(v, 5000)==0  
161 -% mask(global_pi)= mask(global_pi) + 0.1;  
162 -% mask (vi) = mask(vi) + 0.5;  
163 -% end  
164 -% end  
165 -% if itr ==6  
166 -% if mod(v, 5000)==0  
167 -% mask1(global_pi)= mask1(global_pi) + 0.1;  
168 -% mask1 (vi) = mask1(vi) + 0.5;  
169 -% end  
170 -% end  
171 -% if itr==1  
172 -% for ix = -12:12  
173 -% for iy = -12:12  
174 -% for iz = -12:12  
175 -% mask(vx+ix, vy+iy, vz+iz) = M(ix+13,iy+13,iz+13)+ mask(vx+ix, vy+iy, vz+iz);  
176 -% end  
177 -% end  
178 -% end  
179 -% end  
180 -% if itr==2  
181 -% for ix = -12:12  
182 -% for iy = -12:12  
183 -% for iz = -12:12  
184 -% mask1(vx+ix, vy+iy, vz+iz) = M(ix+13,iy+13,iz+13)+ mask1(vx+ix, vy+iy, vz+iz);  
185 -% end  
186 -% end  
187 -% end  
188 -% end  
189 -  
190 -  
191 - end  
192 -% fid = fopen(sprintf('128-128-128/nissl-vote%d',itr), 'w');  
193 -% fwrite(fid, single(Ivote), '*single');  
194 - if itr ==1  
195 - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w');  
196 - fwrite(fid, single(Ivote), '*single');  
197 - fclose(fid);  
198 - end  
199 - if itr ==8  
200 - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w');  
201 - fwrite(fid, single(Ivote), '*single');  
202 - fclose(fid);  
203 - Ivote8 = (Ivote);  
204 - end  
205 - if itr ==9  
206 - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w');  
207 - fwrite(fid, single(Ivote), '*single');  
208 - fclose(fid);  
209 - end  
210 - if itr ==10  
211 - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w');  
212 - fwrite(fid, single(Ivote), '*single');  
213 - fclose(fid);  
214 - end  
215 -  
216 -% Ivote1 = single(Ivote);  
217 -% fwrite(fid, Ivote1, '*single');  
218 -%  
219 -%  
220 -% elseif itr ==2  
221 -% Ivote2 = single(Ivote);  
222 -% fwrite(fid, Ivote2, '*single');  
223 -%  
224 -%  
225 -% elseif itr ==3  
226 -% Ivote3 = single(Ivote);  
227 -% fwrite(fid, Ivote3, '*single');  
228 -%  
229 -%  
230 -% elseif itr ==4  
231 -% Ivote4 = single(Ivote);  
232 -% fwrite(fid, Ivote4, '*single');  
233 -%  
234 -%  
235 -% elseif itr == 5  
236 -% Ivote5 = single(Ivote);  
237 -% fwrite(fid, Ivote5, '*single');  
238 -%  
239 -%  
240 -% elseif itr == 6  
241 -% fwrite(fid, single(Ivote), '*single');  
242 -% elseif itr == 7  
243 -% fwrite(fid, single(Ivote), '*single');  
244 -% elseif itr == 8  
245 -% fwrite(fid, single(Ivote), '*single');  
246 -% elseif itr == 9  
247 -% fwrite(fid, single(Ivote), '*single');  
248 -% end  
249 -% fclose(fid);  
250 - t_v1 = toc;  
251 - disp(['voting done. time =',num2str(t_v1)]);  
252 -  
253 - % update the voting direction  
254 - if ang>=d_ang  
255 - tic;  
256 - Igrad_x = zeros(Isize);  
257 - Igrad_y = zeros(Isize);  
258 - Igrad_z = zeros(Isize);  
259 - for v = 1: nV  
260 - % coordinates of the current voter  
261 - vx = Itx(v);  
262 - vy = Ity(v);  
263 - vz = Itz(v);  
264 -  
265 - %get the local value of the voting image  
266 - local_Ivote = Ivote(g_v_prime(v,1:validPixels(v)));  
267 -  
268 - %find the index of the maximum value  
269 - [~, local_max_idx] = max(local_Ivote);  
270 -  
271 - %convert this into a global subscript  
272 - [g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx));  
273 -  
274 - %compute the vector from the voter position to this position  
275 -  
276 - Igrad_x(vx, vy, vz) = g_px - vx ;  
277 - Igrad_y(vx, vy, vz) = g_py - vy;  
278 - Igrad_z(vx, vy, vz) = g_pz - vz;  
279 -% if itr ==3  
280 -% if mod(v, 5000)==0  
281 -% mask(g_px, g_py, g_pz)= mask(g_px, g_py, g_pz) + 1;  
282 -% end  
283 -% end  
284 -% if itr ==6  
285 -% if mod(v, 5000)==0  
286 -% mask1(g_px, g_py, g_pz)= mask1(g_px, g_py, g_pz) + 1;  
287 -% end  
288 -% end  
289 - end  
290 -  
291 -  
292 - tdir1 = toc;  
293 - display (['updating dir done. time = ', num2str(tdir1)]);  
294 - ang = ang - d_ang;  
295 - end  
296 -  
297 - end  
298 -  
299 -hv = reshape(Ivote, [X*Y*Z, 1]);  
300 -hist(hv, 250);  
301 -%%  
302 -t = 4300;  
303 -conn = [5 5 5];  
304 -Icenter = local_max(Ivote, conn, t);  
305 -fidc = fopen(sprintf('std3.2-r10.10-v8/out%d-t%d.vol',t,t0), 'w');  
306 -fwrite(fidc, single(Icenter), '*single');  
307 -fclose(fidc);  
308 -nnz(Icenter)  
309 -% [cxx1, cyy1, czz1] = ind2sub(size(Icenter),find(Icenter));  
310 -  
311 -% % center = Ivote1;  
312 -% % center(center<t) = 0;  
313 -% % center = imregionalmax(center);  
314 -% % cn = nnz(center);  
315 -% % [cx, cy, cz] = ind2sub(size(center), find(center));  
316 -% % Icenter = zeros(size(center));  
317 -% % for cc =1:cn  
318 -% % Icenter(cx(cc), cy(cc), cz(cc)) = 255;  
319 -% % end  
320 -%  
321 -% % fid_Ic = fopen('image_center2-300.vol', 'w');  
322 -% % fwrite(fid_Ic, Icenter);  
323 -% % fclose(fid_Ic);  
324 -% cn = nnz(Icenter);  
325 -% [cx, cy, cz] = ind2sub(size(Icenter), find(Icenter));  
326 -% Ic2d = zeros(size(Icenter,1), size(Icenter,2));  
327 -% for cc =1:cn  
328 -% Ic2d(cx(cc), cy(cc)) = 1;  
329 -% end  
330 -% I2d = max(I, [], 3);  
331 -% % figure(1),imagesc(I2d); colormap(gray);  
332 -% % figure(2),imagesc(Ic2d); colormap(gray);  
333 -% %  
334 -% out1(:,:,1) = mat2gray(I2d);  
335 -% out1(:,:,2) = mat2gray(Ic2d);  
336 -% out1(:,:,3) = mat2gray(I2d);  
337 -% figure(1), imagesc((out1));  
338 -%%% % imwrite(mat2gray(c2d), 'vote.bmp');  
339 -%%  
340 -% figure(1); imagesc(squeeze(I(:,:,ceil(size(I,3)/2)))), colormap(gray);  
341 -% figure(33); imagesc(squeeze(Ivote3(:,:,ceil(size(Ivote,3)/2)))), colormap(gray);  
Matlab_3D/validation.m
  1 +
1 clear all; 2 clear all;
2 disp('***************** NEW RUN *********************'); 3 disp('***************** NEW RUN *********************');
3 X = 128; 4 X = 128;
@@ -7,15 +8,15 @@ D = 10; @@ -7,15 +8,15 @@ D = 10;
7 t0=1; 8 t0=1;
8 r1=10; 9 r1=10;
9 r2=10; 10 r2=10;
10 -t=2000; 11 +t=2100;
11 itr=8; 12 itr=8;
12 vote=10; 13 vote=10;
13 std = [5 5]; 14 std = [5 5];
14 -gt_filename = '128-128-128/0-gt.vol'; 15 +gt_filename = '0-gt.vol';
15 % out_filename = sprintf('128-128-128/0-nissl-std%d.%d-t0%d-r%d.%d-t%d-out%d.%d.vol',std(1), std(2),t0,r1,r2,t,itr,vote); 16 % out_filename = sprintf('128-128-128/0-nissl-std%d.%d-t0%d-r%d.%d-t%d-out%d.%d.vol',std(1), std(2),t0,r1,r2,t,itr,vote);
16 -out_filename = sprintf('D:/build/ivote3-bld/std5.5-r10.10-v8-phi15/out%d.vol',t); 17 +out_filename = sprintf('D:/build/ivote3-bld/shared2D-v8/out%d.vol',t);
17 % txt_filename = sprintf('128-128-128/0-validation-nissl-std%d.%d-r%d.%d-t%d-out%d.%d-D%d.txt',std(1), std(2),r1,r2,t,itr,vote,D); 18 % txt_filename = sprintf('128-128-128/0-validation-nissl-std%d.%d-r%d.%d-t%d-out%d.%d-D%d.txt',std(1), std(2),r1,r2,t,itr,vote,D);
18 -txt_filename = sprintf('D:/build/ivote3-bld/std5.5-r10.10-v8-phi15/t%d-D%d.txt',t,D); 19 +txt_filename = sprintf('D:/build/ivote3-bld/shared2D-v8/t%d-D%d.txt',t,D);
19 spec = sprintf('Nissl-std%d.%d-r%d.%d-t%d-out%d.%d',std(1), std(2),r1,r2,t,itr,vote); 20 spec = sprintf('Nissl-std%d.%d-r%d.%d-t%d-out%d.%d',std(1), std(2),r1,r2,t,itr,vote);
20 fid0 = fopen(gt_filename); 21 fid0 = fopen(gt_filename);
21 gt = fread(fid0,[X Y*Z], 'single'); 22 gt = fread(fid0,[X Y*Z], 'single');
cpp/cpyToshare.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_cpyToshare_H
  2 +#define STIM_CUDA_cpyToshare_H
  3 +
  4 + //this function copies one channel of data from global to shared memory in one dimension, with a size of X elements (of type T).
  5 + template<typename T>
  6 + __device__ void cpyG2S1D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
  7 + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
  8 +
  9 + //calculate the total number of threads available
  10 + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
  11 +
  12 + //calculate the current 1D thread ID
  13 + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
  14 +
  15 + //calculate the number of iterations required for the copy
  16 + unsigned int I = X/tThreads + 1;
  17 +
  18 + //the specified start position in global memory is (x, y, z)
  19 + unsigned int gstart = z*I_x*I_y + y*I_x + x;
  20 +
  21 + for (unsigned int i = 0; i < I; i++){
  22 +
  23 + //each iteration will copy tThreads elements, so the starting index in shared memory
  24 + //for each iteration will be i * tThreads (iteration # times the number of elements transferred per iteration)
  25 + unsigned int sIdx = i * tThreads + ti;
  26 + if (sIdx>= X*Y) return;
  27 +
  28 + //each iteration will copy tThreads elements from the global index
  29 + unsigned int gIdx = gstart + sIdx;
  30 +
  31 + //copy from global to shared memory
  32 + dest[sIdx] = src[gIdx];
  33 +
  34 + }
  35 + }
  36 + //this function copies one channel of data from global to shared memory in two dimensions, with a size of X*Y elements (of type T).
  37 + template<typename T>
  38 + __device__ void cpyG2S2D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
  39 + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
  40 + //calculate the total number of threads available
  41 + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
  42 +
  43 + //calculate the current 1D thread ID
  44 + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
  45 +
  46 + //calculate the number of iterations required for the copy
  47 + unsigned int I = X*Y/tThreads + 1;
  48 +
  49 + unsigned int gz1 = z*I_x*I_y ;
  50 +
  51 + for (unsigned int i = 0; i < I; i++){
  52 +
  53 + unsigned int sIdx = i * tThreads + ti;
  54 + if (sIdx>= X*Y) return;
  55 +
  56 + unsigned int sy = sIdx/X;
  57 + unsigned int sx = sIdx - (sy * X);
  58 +
  59 + unsigned int gx = x + sx;
  60 + unsigned int gy = y + sy;
  61 + if (gx<I_x && gy<I_y){
  62 + unsigned int gIdx = gz1 + gy * I_x + gx;
  63 + //copy from global to shared memory
  64 + dest[sIdx] = src[gIdx];
  65 + }
  66 +
  67 + }
  68 + }
  69 + //this function copies three-channel data from global to shared memory in one dimension, with a size of X elements (of type T).
  70 + template<typename T>
  71 + __device__ void cpyG2S1D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
  72 + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
  73 +
  74 + //calculate the total number of threads available
  75 + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
  76 +
  77 + //calculate the current 1D thread ID
  78 + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
  79 +
  80 + //calculate the number of iterations required for the copy
  81 + unsigned int I = X/tThreads + 1;
  82 +
  83 + //the specified start position in global memory is (x, y, z)
  84 + unsigned int gstart = z*I_x*I_y + y*I_x + x;
  85 +
  86 + for (unsigned int i = 0; i < I; i++){
  87 +
  88 + //each iteration will copy tThreads elements, so the starting index in shared memory
  89 + //for each iteration will be i * tThreads (iteration # times the number of elements transferred per iteration)
  90 + unsigned int sIdx = i * tThreads + ti;
  91 + if (sIdx>= X*Y) return;
  92 + unsigned int gIdx = gstart*3 + sIdx;
  93 + //copy from global to shared memory
  94 + dest[sIdx] = src[gIdx];
  95 +
  96 + }
  97 + }
  98 + //this function copies three-channel data from global to shared memory in two dimensions, with a size of X*Y elements (of type T).
  99 + template<typename T>
  100 + __device__ void cpyG2S2D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
  101 + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
  102 + //calculate the total number of threads available
  103 + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
  104 +
  105 + //calculate the current 1D thread ID
  106 + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
  107 +
  108 + //calculate the number of iterations required for the copy
  109 + unsigned int I = X*Y/tThreads + 1;
  110 +
  111 + unsigned int gz1 = z*I_x*I_y ;
  112 +
  113 + for (unsigned int i = 0; i < I; i++){
  114 +
  115 + unsigned int sIdx = i * tThreads + ti;
  116 + if (sIdx>= X*Y) return;
  117 + unsigned int sy = sIdx/X;
  118 + unsigned int sx = sIdx - (sy * X);
  119 +
  120 + unsigned int gx = x + sx/3;
  121 + unsigned int gy = y + sy;
  122 + if (gx<I_x && gy<I_y){
  123 + unsigned int gIdx = (gz1 + gy * I_x + gx)*3 + (sx%3);
  124 + //copy from global to shared memory
  125 + dest[sIdx] = src[gIdx];
  126 + }
  127 + }
  128 + }
  129 + // this function computes the magnitude of the three-component gradient saved in the shared memory and stores the magnitude result in the same array, starting at offset bs.
  130 + template<typename T>
  131 + __device__ void mag_share2D(T* grad, unsigned int bs, unsigned int X, unsigned int Y, dim3 threadIdx, dim3 blockDim){
  132 +
  133 + //calculate the total number of threads available
  134 + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
  135 + //calculate the current 1D thread ID
  136 + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
  137 + //calculate the number of iterations required to cover all X*Y elements
  138 + unsigned int I = X*Y/tThreads + 1;
  139 + for (unsigned int i = 0; i < I; i++){
  140 +
  141 + unsigned int sIdx = i * tThreads + ti;
  142 + if (sIdx>= X*Y) return;
  143 + float gx = grad[sIdx*3];
  144 + float gy = grad[sIdx*3 + 1];
  145 + float gz = grad[sIdx*3 + 2];
  146 + float mag = sqrt(gx*gx + gy*gy + gz*gz);
  147 + grad[bs + sIdx] = mag;
  148 +
  149 + }
  150 + }
  151 +#endif
0 \ No newline at end of file 152 \ No newline at end of file
cpp/float_to_half.cuh deleted
1 -#ifndef STIM_CUDA_FLOAT_TO_HALF_H  
2 -#define STIM_CUDA_FLOAT_TO_HALF_H  
3 -  
4 -#include <iostream>  
5 -#include <cuda.h>  
6 -#include <stim/cuda/cudatools.h>  
7 -#include <stim/cuda/sharedmem.cuh>  
8 -#include <stim/cuda/cudatools/error.h>  
9 -#include <cuda_fp16.h>  
10 -#include <stdio.h>  
11 - __global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){  
12 -  
13 -  
14 - //calculate x,y,z coordinates for this thread  
15 - int xi = blockIdx.x * blockDim.x + threadIdx.x;  
16 - //find the grid size along y  
17 - int grid_y = y / blockDim.y;  
18 - int blockidx_y = blockIdx.y % grid_y;  
19 - int yi = blockidx_y * blockDim.y + threadIdx.y;  
20 - int zi = blockIdx.y / grid_y;  
21 - int i = zi * x * y + yi * x + xi;  
22 -  
23 - if(xi >= x|| yi >= y || zi>= z) return;  
24 -  
25 -  
26 - gpu_half[i] = __float2half(gpu_float[i]);  
27 -  
28 -  
29 - }  
30 -  
31 -  
32 - void gpu_f2h(half* gpu_half, float* gpu_float, unsigned int x, unsigned int y, unsigned int z){  
33 -  
34 -  
35 - int max_threads = stim::maxThreadsPerBlock();  
36 - dim3 threads(sqrt (max_threads),sqrt (max_threads));  
37 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);  
38 -  
39 - //call the GPU kernel to determine the gradient  
40 - cuda_f2h <<< blocks, threads >>>(gpu_half, gpu_float, x, y, z);  
41 -  
42 - }  
43 -  
44 -  
45 -  
46 - void cpu_f2h(half* h_out, float* f_in, unsigned int x, unsigned int y, unsigned int z){  
47 -  
48 - //calculate the number of pixels in the array  
49 - unsigned int pix = x* y* z;  
50 -  
51 - //allocate memory on the GPU for the input float precision.  
52 - float* gpu_float;  
53 - cudaMalloc(&gpu_float, pix * sizeof(float));  
54 - cudaMemcpy(gpu_float, f_in, pix * sizeof(float), cudaMemcpyHostToDevice);  
55 -  
56 - //allocate memory on the GPU for the output half precision  
57 - half* gpu_half;  
58 - cudaMalloc(&gpu_half, pix * sizeof(half));  
59 -  
60 - //call the GPU version of this function  
61 - gpu_f2h(gpu_half, gpu_float, x, y, z);  
62 -  
63 - //copy the array back to the CPU  
64 - cudaMemcpy(h_out, gpu_half, pix * sizeof(half), cudaMemcpyDeviceToHost);  
65 -  
66 - //free allocated memory  
67 - cudaFree(gpu_float);  
68 - cudaFree(gpu_half);  
69 -  
70 - }  
71 -  
72 -  
73 -#endif  
74 \ No newline at end of file 0 \ No newline at end of file
cpp/half_to_float.cuh deleted
1 -#ifndef STIM_CUDA_HALF_TO_FLOAT_H  
2 -#define STIM_CUDA_HALF_TO_FLOAT_H  
3 -  
4 -#include <iostream>  
5 -#include <cuda.h>  
6 -#include <stim/cuda/cudatools.h>  
7 -#include <stim/cuda/sharedmem.cuh>  
8 -#include <stim/cuda/cudatools/error.h>  
9 -#include "cuda_fp16.h"  
10 -  
11 - __global__ void cuda_h2f(float* gpu_float, half* gpu_half, int x, int y, int z){  
12 -  
13 -  
14 - //calculate x,y,z coordinates for this thread  
15 - int xi = blockIdx.x * blockDim.x + threadIdx.x;  
16 - //find the grid size along y  
17 - int grid_y = y / blockDim.y;  
18 - int blockidx_y = blockIdx.y % grid_y;  
19 - int yi = blockidx_y * blockDim.y + threadIdx.y;  
20 - int zi = blockIdx.y / grid_y;  
21 - int i = zi * x * y + yi * x + xi;  
22 -  
23 - if(xi >= x|| yi >= y || zi>= z) return;  
24 -  
25 -  
26 - gpu_float[i] = __half2float(gpu_half[i]);  
27 -  
28 - }  
29 -  
30 -  
31 - void gpu_h2f(float* gpu_float, half* gpu_half, unsigned int x, unsigned int y, unsigned int z){  
32 -  
33 -  
34 - int max_threads = stim::maxThreadsPerBlock();  
35 - dim3 threads(sqrt (max_threads),sqrt (max_threads));  
36 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);  
37 -  
38 - //call the GPU kernel to determine the gradient  
39 - cuda_h2f <<< blocks, threads >>>(gpu_float, gpu_half, x, y, z);  
40 -  
41 - }  
42 -  
43 -  
44 -  
45 - void cpu_f2h(float* f_out, half* h_in, unsigned int x, unsigned int y, unsigned int z){  
46 -  
47 - //calculate the number of pixels in the array  
48 - unsigned int pix = x* y* z;  
49 -  
50 - //allocate memory on the GPU for the input half precision  
51 - half* gpu_half;  
52 - cudaMalloc(&gpu_half, pix * sizeof(half));  
53 - cudaMemcpy(gpu_half, h_in, pix * sizeof(half), cudaMemcpyHostToDevice);  
54 -  
55 - //allocate memory on the GPU for the output float precision.  
56 - float* gpu_float;  
57 - cudaMalloc(&gpu_float, pix * sizeof(float));  
58 -  
59 -  
60 -  
61 - //call the GPU version of this function  
62 - gpu_h2f(gpu_float, gpu_half, x, y, z);  
63 -  
64 - cudaMemcpy(f_out, gpu_float, pix * sizeof(float), cudaMemcpyDeviceToHost);  
65 -  
66 -  
67 -  
68 - //free allocated memory  
69 - cudaFree(gpu_float);  
70 - cudaFree(gpu_half);  
71 -  
72 - }  
73 -  
74 -  
75 -#endif  
76 \ No newline at end of file 0 \ No newline at end of file
@@ -37,10 +37,9 @@ int main(int argc, char** argv){ @@ -37,10 +37,9 @@ int main(int argc, char** argv){
37 printf("current device ID: %d\n", i); 37 printf("current device ID: %d\n", i);
38 printf("device name: %s\n", prop.name); 38 printf("device name: %s\n", prop.name);
39 printf("total global mem: %lu\n", prop.totalGlobalMem); 39 printf("total global mem: %lu\n", prop.totalGlobalMem);
  40 + printf("shared memory per block: %lu\n", prop.sharedMemPerBlock);
40 } 41 }
41 -  
42 -  
43 - 42 +
44 //output advertisement 43 //output advertisement
45 std::cout<<std::endl<<std::endl; 44 std::cout<<std::endl<<std::endl;
46 std::cout<<"========================================================================="<<std::endl; 45 std::cout<<"========================================================================="<<std::endl;
@@ -124,18 +123,18 @@ int main(int argc, char** argv){ @@ -124,18 +123,18 @@ int main(int argc, char** argv){
124 invert_data(cpuI, x, y, z); 123 invert_data(cpuI, x, y, z);
125 124
126 //write a new file from the cpuI. 125 //write a new file from the cpuI.
127 - std::ofstream original("std5.5-r10.10-v8/inv-128.vol", std::ofstream::out | std::ofstream::binary); 126 + std::ofstream original("shared2D-v8/inv-128.vol", std::ofstream::out | std::ofstream::binary);
128 original.write((char*)cpuI, bytes); 127 original.write((char*)cpuI, bytes);
129 original.close(); 128 original.close();
130 129
131 //allocate space on the cpu for the output result 130 //allocate space on the cpu for the output result
132 - float* cpu_out = (float*) malloc(bytes*3); 131 + float* cpu_out = (float*) malloc(bytes);
133 132
134 // call the ivote function 133 // call the ivote function
135 ivote3(cpu_out, cpuI, sigma, anisotropy, phi, d_phi, r, iter, t, conn, x, y, z); 134 ivote3(cpu_out, cpuI, sigma, anisotropy, phi, d_phi, r, iter, t, conn, x, y, z);
136 135
137 //write the blurred file from the cpuI. 136 //write the blurred file from the cpuI.
138 - std::ofstream fblur("vote-check/vote8.vol", std::ofstream::out | std::ofstream::binary); 137 + std::ofstream fblur("shared2D-v8/vote8.vol", std::ofstream::out | std::ofstream::binary);
139 fblur.write((char*)cpuI, bytes); 138 fblur.write((char*)cpuI, bytes);
140 fblur.close(); 139 fblur.close();
141 /* 140 /*
@@ -146,13 +145,13 @@ int main(int argc, char** argv){ @@ -146,13 +145,13 @@ int main(int argc, char** argv){
146 fgx.close(); 145 fgx.close();
147 */ 146 */
148 //write the output file. 147 //write the output file.
149 - std::ofstream fo("std5.5-r10.10-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary); 148 + std::ofstream fo("shared2D-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary);
150 fo.write((char*)cpu_out, bytes); 149 fo.write((char*)cpu_out, bytes);
151 fo.close(); 150 fo.close();
152 151
153 // creat a file for saving the list centers 152 // creat a file for saving the list centers
154 153
155 - std::ofstream list("std5.5-r10.10-v8/center.txt"); 154 + std::ofstream list("shared2D-v8/center.txt");
156 if (list.is_open()){ 155 if (list.is_open()){
157 156
158 for (int ix=0; ix<x; ix++){ 157 for (int ix=0; ix<x; ix++){
cpp/set_rmax.cuh deleted
1 -#ifndef STIM_CUDA_SET_RMAX_H  
2 -#define STIM_CUDA_SET_RMAX_H  
3 -  
4 -#include <iostream>  
5 -#include <cuda.h>  
6 -#include <stim/cuda/cudatools.h>  
7 -#include <stim/cuda/sharedmem.cuh>  
8 -#include <stim/cuda/cudatools/error.h>  
9 -  
10 -template<typename T>  
11 - __global__ void cuda_set_rmax(T* gpu_r, int rx, int ry, int rz, int x, int y, int z){  
12 -  
13 - //calculate x,y,z coordinates for this thread  
14 - int xi = blockIdx.x * blockDim.x + threadIdx.x;  
15 - //find the grid size along y  
16 - int grid_y = y / blockDim.y;  
17 - int blockidx_y = blockIdx.y % grid_y;  
18 - int yi = blockidx_y * blockDim.y + threadIdx.y;  
19 - int zi = blockIdx.y / grid_y;  
20 - int i = zi * x * y + yi * x + xi;  
21 -  
22 - if(xi>=x || yi>=y || zi>=z) return;  
23 -  
24 - gpu_r[i*3+0] = rx;  
25 - gpu_r[i*3+1] = ry;  
26 - gpu_r[i*3+2] = rz;  
27 -  
28 - }  
29 -  
30 -template<typename T>  
31 - void gpu_set_rmax(T* gpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){  
32 -  
33 -  
34 - unsigned int max_threads = stim::maxThreadsPerBlock();  
35 - dim3 threads(sqrt (max_threads),sqrt (max_threads));  
36 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);  
37 -  
38 - //call the kernel to do the voting  
39 - cuda_set_rmax <T> <<< blocks, threads >>>(gpu_r, r[0], r[1], r[2], x , y, z);  
40 -  
41 - }  
42 -template<typename T>  
43 - void cpu_set_rmax(T* cpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){  
44 -  
45 - //calculate the number of bytes in the array  
46 - unsigned int bytes = x * y * z * sizeof(T);  
47 -  
48 - //allocate space on the GPU for the rmax  
49 - T* gpu_r;  
50 - cudaMalloc(&gpu_vote, bytes*3);  
51 -  
52 - cudaMemcpy(gpu_r, cpu_r, bytes*3, cudaMemcpyHostToDevice);  
53 -  
54 -  
55 - //call the GPU version of the vote calculation function  
56 - gpu_set_rmax<T>(gpu_r, r, x , y, z);  
57 -  
58 - //copy the Vote Data back to the CPU  
59 - cudaMemcpy(cpu_r, gpu_r, bytes*3, cudaMemcpyDeviceToHost) ;  
60 -  
61 - //free allocated memory  
62 - cudaFree(gpu_r);  
63 -  
64 - }  
65 -  
66 -  
67 -#endif  
68 \ No newline at end of file 0 \ No newline at end of file
cpp/update_dir3.cuh
@@ -6,11 +6,12 @@ @@ -6,11 +6,12 @@
6 #include <stim/cuda/cudatools.h> 6 #include <stim/cuda/cudatools.h>
7 #include <stim/cuda/sharedmem.cuh> 7 #include <stim/cuda/sharedmem.cuh>
8 #include <cuda_fp16.h> 8 #include <cuda_fp16.h>
  9 +#include "cpyToshare.cuh"
9 10
10 // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. 11 // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
11 template<typename T> 12 template<typename T>
12 __global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ 13 __global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
13 - 14 + extern __shared__ float s_vote[];
14 //calculate x,y,z coordinates for this thread 15 //calculate x,y,z coordinates for this thread
15 int xi = blockIdx.x * blockDim.x + threadIdx.x; 16 int xi = blockIdx.x * blockDim.x + threadIdx.x;
16 //find the grid size along y 17 //find the grid size along y
@@ -18,10 +19,21 @@ @@ -18,10 +19,21 @@
18 int blockidx_y = blockIdx.y % grid_y; 19 int blockidx_y = blockIdx.y % grid_y;
19 int yi = blockidx_y * blockDim.y + threadIdx.y; 20 int yi = blockidx_y * blockDim.y + threadIdx.y;
20 int zi = blockIdx.y / grid_y; 21 int zi = blockIdx.y / grid_y;
  22 + //compute the global 1D index for this pixel
21 int i = zi * x * y + yi * x + xi; 23 int i = zi * x * y + yi * x + xi;
22 24
23 if(xi >= x|| yi >= y || zi>= z) return; 25 if(xi >= x|| yi >= y || zi>= z) return;
24 - 26 + // find the starting points for this block along the x and y directions
  27 + int bxi = blockIdx.x * blockDim.x;
  28 + int byi = blockidx_y * blockDim.y;
  29 + //find the starting points and the size of the window, which will be copied to the 2D-shared memory
  30 + int bxs = bxi - rx;
  31 + int bys = byi - ry;
  32 + int xwidth = 2 * rx + blockDim.x;
  33 + int ywidth = 2 * ry + blockDim.y;
  34 + //compute the coordinations of this pixel in the 2D-shared memory.
  35 + int sx_rx = threadIdx.x + rx;
  36 + int sy_ry = threadIdx.y + ry;
25 //find the gradient values along the x, y ,z axis, and the gradient magnitude for the voter 37 //find the gradient values along the x, y ,z axis, and the gradient magnitude for the voter
26 float g_v_x = gpu_grad[i * 3 + 0]; 38 float g_v_x = gpu_grad[i * 3 + 0];
27 float g_v_y = gpu_grad[i * 3 + 1]; 39 float g_v_y = gpu_grad[i * 3 + 1];
@@ -42,39 +54,48 @@ @@ -42,39 +54,48 @@
42 int rz_sq = rz * rz; 54 int rz_sq = rz * rz;
43 55
44 for (int z_p = -rz; z_p<=rz; z_p++){ 56 for (int z_p = -rz; z_p<=rz; z_p++){
45 -  
46 - for(int y_p = -ry; y_p <= ry; y_p++){  
47 -  
48 - for(int x_p = -rx; x_p <= rx; x_p++){  
49 -  
50 - //calculate the x, y ,z indices for the current pixel  
51 - int xi_p = (xi + x_p) ; 57 + int zi_p = zi + z_p;
  58 + if ((zi_p) >=0 && (zi_p) < z){
  59 + //call the function to copy one slide of vote date to the 2D-shared memory.
  60 + __syncthreads();
  61 + cpyG2S2D<float>(s_vote, gpu_vote, bxs, bys, zi + z_p, xwidth, ywidth, threadIdx, blockDim, x, y);
  62 + __syncthreads();
  63 + float z_sq = z_p * z_p;
  64 + float d_z_sq = (z_sq)/rz_sq;
  65 + for(int y_p = -ry; y_p <= ry; y_p++){
  66 +
  67 + float y_sq = y_p * y_p;
  68 + float yz_sq = y_sq + z_sq;
52 int yi_p = (yi + y_p) ; 69 int yi_p = (yi + y_p) ;
53 - int zi_p = (zi + z_p) ;  
54 - if (zi_p >=0 && zi_p < z && yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){ 70 + float d_yz_sq = (y_sq)/ry_sq + d_z_sq;
  71 + unsigned int s_y1d = (sy_ry + y_p) * xwidth;
  72 + for(int x_p = -rx; x_p <= rx; x_p++){
55 73
56 - //calculate the distance between the pixel and the current voter.  
57 - float x_sq = x_p * x_p;  
58 - float y_sq = y_p * y_p;  
59 - float z_sq = z_p * z_p;  
60 - float d_pv = sqrt(x_sq + y_sq + z_sq); 74 + //check if the current pixel is inside of the data-set.
  75 + int xi_p = (xi + x_p) ;
  76 + if (yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){
61 77
62 - // calculate the angle between the pixel and the current voter.  
63 - float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v); 78 + //calculate the distance between the pixel and the current voter.
  79 + float x_sq = x_p * x_p;
  80 + float d_pv = sqrt(x_sq + yz_sq);
64 81
65 - if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ 82 + // calculate the angle between the pixel and the current voter.
  83 + float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v);
66 84
67 - //calculate the 1D index for the current pixel  
68 - unsigned int id_p = (zi_p) * x * y + (yi_p) * x + (xi_p);  
69 - l_vote = gpu_vote[id_p];  
70 -  
71 - // compare the vote value of this pixel with the max value to find the maxima and its index.  
72 - if (l_vote>max) { 85 + if ((((x_sq)/rx_sq + d_yz_sq )<= 1) && (cos_diff >= cos_phi)){
73 86
74 - max = l_vote;  
75 - id_x = x_p;  
76 - id_y = y_p;  
77 - id_z = z_p; 87 + //calculate the 1D index for the current pixel in the 2D-shared memory
  88 + unsigned int id_s = s_y1d + (sx_rx + x_p);
  89 + l_vote = s_vote[id_s];
  90 +
  91 + // compare the vote value of this pixel with the max value to find the maxima and its index.
  92 + if (l_vote>max) {
  93 +
  94 + max = l_vote;
  95 + id_x = x_p;
  96 + id_y = y_p;
  97 + id_z = z_p;
  98 + }
78 } 99 }
79 } 100 }
80 } 101 }
@@ -115,13 +136,13 @@ @@ -115,13 +136,13 @@
115 unsigned int max_threads = stim::maxThreadsPerBlock(); 136 unsigned int max_threads = stim::maxThreadsPerBlock();
116 dim3 threads(sqrt (max_threads),sqrt (max_threads)); 137 dim3 threads(sqrt (max_threads),sqrt (max_threads));
117 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); 138 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
118 - 139 + unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*sizeof(T);
119 // allocate space on the GPU for the updated vote direction 140 // allocate space on the GPU for the updated vote direction
120 T* gpu_dir; 141 T* gpu_dir;
121 cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3); 142 cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3);
122 143
123 //call the kernel to calculate the new voting direction 144 //call the kernel to calculate the new voting direction
124 - update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z); 145 + update_dir3 <<< blocks, threads, shared_bytes >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z);
125 146
126 147
127 //call the kernel to update the gradient direction 148 //call the kernel to update the gradient direction
@@ -6,12 +6,13 @@ @@ -6,12 +6,13 @@
6 #include <stim/cuda/cudatools.h> 6 #include <stim/cuda/cudatools.h>
7 #include <stim/cuda/sharedmem.cuh> 7 #include <stim/cuda/sharedmem.cuh>
8 #include <stim/cuda/cudatools/error.h> 8 #include <stim/cuda/cudatools/error.h>
9 - 9 +#include "cpyToshare.cuh"
10 10
11 // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area 11 // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
12 template<typename T> 12 template<typename T>
13 __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ 13 __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
14 14
  15 + extern __shared__ float s[];
15 //calculate x,y,z coordinates for this thread 16 //calculate x,y,z coordinates for this thread
16 int xi = blockIdx.x * blockDim.x + threadIdx.x; 17 int xi = blockIdx.x * blockDim.x + threadIdx.x;
17 //find the grid size along y 18 //find the grid size along y
@@ -23,6 +24,17 @@ @@ -23,6 +24,17 @@
23 24
24 if(xi>=x || yi>=y || zi>=z) return; 25 if(xi>=x || yi>=y || zi>=z) return;
25 26
  27 + //find the starting points and the size of the window, which will be copied to the 2D-shared memory
  28 + int bxs = blockIdx.x * blockDim.x - rx;
  29 + int bys = blockidx_y * blockDim.y - ry;
  30 + int xwidth = 2 * rx + blockDim.x;
  31 + int ywidth = 2 * ry + blockDim.y;
  32 + //calculate the starting point of shared memory for storing the magnitude.
  33 + unsigned int b_s = 3 * xwidth * ywidth;
  34 + //compute the coordinations of this pixel in the 2D-shared memory.
  35 + int sx_rx = threadIdx.x + rx;
  36 + int sy_ry = threadIdx.y + ry;
  37 +
26 // define a local variable to sum the votes from the voters 38 // define a local variable to sum the votes from the voters
27 float sum = 0; 39 float sum = 0;
28 40
@@ -31,43 +43,58 @@ @@ -31,43 +43,58 @@
31 int rz_sq = rz * rz; 43 int rz_sq = rz * rz;
32 44
33 for (int z_v = -rz; z_v<=rz; z_v++){ 45 for (int z_v = -rz; z_v<=rz; z_v++){
34 -  
35 - for(int y_v = -ry; y_v <= ry; y_v++){  
36 -  
37 - for(int x_v = -rx; x_v <= rx; x_v++){  
38 -  
39 - //calculate the x, y ,z indices for the current voter  
40 - int xi_v = (xi + x_v) ; 46 + int zi_v = zi + z_v;
  47 + if ((zi_v) >=0 && (zi_v) <z){
  48 + //call the function to copy one slide of the gradient from global to the 2D-shared memory.
  49 + __syncthreads();
  50 + cpyG2S2D3ch<float>(s, gpu_grad, bxs, bys, zi + z_v, 3*xwidth, ywidth, threadIdx, blockDim, x, y);
  51 + __syncthreads();
  52 + mag_share2D<float>(s, b_s, xwidth, ywidth, threadIdx, blockDim);
  53 + __syncthreads();
  54 + float z_sq = z_v * z_v;
  55 + float d_z_sq = z_sq/rz_sq;
  56 +
  57 + for(int y_v = -ry; y_v <= ry; y_v++){
41 int yi_v = (yi + y_v) ; 58 int yi_v = (yi + y_v) ;
42 - int zi_v = (zi + z_v) ;  
43 - if (zi_v >=0 && zi_v < z && yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){  
44 -  
45 - //calculate the 1D index for the current voter  
46 - unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v); 59 + //compute the position of the current voter in the shared memory along the y axis.
  60 + unsigned int sIdx_y1d = (sy_ry + y_v)* xwidth;
  61 +
  62 + float y_sq = y_v * y_v;
  63 + float yz_sq = z_sq + y_sq;
  64 + float d_yz_sq = y_sq/ry_sq + d_z_sq;
  65 + for(int x_v = -rx; x_v <= rx; x_v++){
  66 +
  67 + //check if the current voter is inside of the data-set
  68 + int xi_v = (xi + x_v) ;
  69 + if (yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){
  70 +
  71 + //compute the position of the current voter in the 2D-shared memory along the x axis.
  72 + unsigned int sIdx_x = (sx_rx + x_v);
  73 + //find the 1D index of this voter in the 2D-shared memory.
  74 + unsigned int s_Idx = (sIdx_y1d + sIdx_x);
  75 + unsigned int s_Idx3 = s_Idx * 3;
  76 +
  77 + //save the gradient values for the current voter to the local variables and compute the gradient magnitude.
  78 + float g_v_x = s[s_Idx3];
  79 + float g_v_y = s[s_Idx3 + 1];
  80 + float g_v_z = s[s_Idx3 + 2];
  81 + float mag_v = s[b_s + s_Idx]; //sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
  82 +
  83 + //calculate the distance between the pixel and the current voter.
  84 + float x_sq = x_v * x_v;
  85 + float d_pv = sqrt(x_sq + yz_sq);
47 86
48 - //find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter  
49 -  
50 - float g_v_x = gpu_grad[id_v * 3 + 0];  
51 - float g_v_y = gpu_grad[id_v * 3 + 1];  
52 - float g_v_z = gpu_grad[id_v * 3 + 2];  
53 - float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);  
54 -  
55 - //calculate the distance between the pixel and the current voter.  
56 - float x_sq = x_v * x_v;  
57 - float y_sq = y_v * y_v;  
58 - float z_sq = z_v * z_v;  
59 - float d_pv = sqrt(x_sq + y_sq + z_sq);  
60 -  
61 - // calculate the angle between the pixel and the current voter.  
62 - float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v); 87 + // calculate the angle between the pixel and the current voter.
  88 + float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v);
63 89
64 - // check if the current voter is located in the voting area of this pixel.  
65 - if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ 90 + // check if the current voter is located in the voting area of this pixel.
  91 + if ((((x_sq)/rx_sq + d_yz_sq)<= 1) && (cos_diff >= cos_phi)){
66 92
67 - sum += mag_v; 93 + sum += mag_v;
  94 + }
68 } 95 }
69 - }  
70 - } 96 + }
  97 + }
71 } 98 }
72 } 99 }
73 100
@@ -81,9 +108,9 @@ @@ -81,9 +108,9 @@
81 unsigned int max_threads = stim::maxThreadsPerBlock(); 108 unsigned int max_threads = stim::maxThreadsPerBlock();
82 dim3 threads(sqrt (max_threads),sqrt (max_threads)); 109 dim3 threads(sqrt (max_threads),sqrt (max_threads));
83 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); 110 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
84 - 111 + unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T);
85 //call the kernel to do the voting 112 //call the kernel to do the voting
86 - vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); 113 + vote3 <T> <<< blocks, threads, shared_bytes >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
87 114
88 } 115 }
89 116