Commit 89604e92d1faa3934a762e9cf42e61f5644c78bd

Authored by Laila Saadatifard
1 parent 3b5043cd

ivote3 run on the shared memory

Matlab_3D/0-gt.vol 0 → 100644
No preview for this file type
Matlab_3D/ivote3.m
1   -clc;
  1 +
2 2 clear;
3 3 disp('***************** NEW RUN *********************');
4 4 total = tic;
5 5 % ******* Initialize voting parameters **************************************
6   -rmax = [16 16 8]; %maximum radius of the cell
7   -ang_deg = 20.1; %half the angular range of the voting area
  6 +rmax = [10 10 10]; %maximum radius of the cell
  7 +rmin = [1 1 1];
  8 +ang_deg = 25.1; %half the angular range of the voting area
8 9 ang = ang_deg * pi / 180;
9   -iter = 5; %number of voting iterations
10   -t0 = 1.0; %threshold color
11   -sigma = [3, 3, 1.5];
  10 +iter = 8 ; %number of voting iterations
  11 +t0 = 1;
  12 +sigma = [5, 5, 5];
12 13 % t = 0.1;
13   -d_ang= ang / (iter);
  14 +d_ang= ang / (iter+2);
14 15 % ******** Testing parameters ******************************************
15 16 % p = [50, 50, 150];
16 17 % ps = [400, 400, 200];
... ... @@ -22,15 +23,14 @@ d_ang= ang / (iter);
22 23 % X = S(1);
23 24 % Y = S(2);
24 25 % Z = S(3);
25   -filename = 'nissl-float-128.128.128.vol';
  26 +filename = '128-128-128/nissl-float-128.128.128.vol'; %'nissl-float-128.128.128.vol';
26 27 X = 128;
27 28 Y = 128;
28 29 Z = 128;
29   -fid = fopen(filename);
  30 +fidi = fopen(filename);
30 31 % load the VOL data into a 2D matrix
31   -I = fread(fid,[X Y*Z], 'single');
32   -fclose(fid);
33   -%%
  32 +I = fread(fidi,[X Y*Z], 'single');
  33 +fclose(fidi);
34 34 %change this to a 3D matrix
35 35 I = (reshape(I, [X, Y, Z]));
36 36 % invert the intensity
... ... @@ -38,26 +38,21 @@ I = (255 - I);
38 38  
39 39 %perform a gaussian blur
40 40 Iblur = gauss_blur3d(I, sigma);
41   -
42   -% %crop out a small subregion of I and Iblur
43   -% Iblur = Iblur(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1);
44   -% I = I(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1);
45   -%
46 41 % compute the gradient
47 42 [Igrad_y, Igrad_x, Igrad_z] = gradient(Iblur);
48 43  
49 44 %calculate the gradient magnitude
50 45 Imag = sqrt(Igrad_x .^ 2 + Igrad_y .^ 2 + Igrad_z .^2);
51 46 Isize = size(I);
52   -I = single(I);
53   -Iblur = single(Iblur);
54 47  
55   -%h = reshape(Imag, [X*Y*Z, 1]);
56   -%hist(h, 100);
  48 +% h = reshape(Imag, [X*Y*Z, 1]);
  49 +% hist(h, 100);
57 50  
58 51 %set a threshold for the gradient magnitude
59 52 It = Imag > t0;
60   -
  53 +fidt = fopen('128-128-128/It.vol', 'w');
  54 +fwrite(fidt, It, 'single');
  55 +fclose(fidt);
61 56 %Set the boundaries of the threshold image to zero
62 57 It(1:rmax(1), :, :) = 0;
63 58 It(X - rmax(1):X, :,:) = 0;
... ... @@ -65,13 +60,12 @@ It(:, 1:rmax(2), :) = 0;
65 60 It(:, Y - rmax(2):Y,:) = 0;
66 61 It(:, :, 1:rmax(3)) = 0;
67 62 It(:,:, Z - rmax(3):Z) = 0;
68   -%%
  63 +
69 64 %get the indices of all of the nonzero values in the threshold image
70 65 % (voter positions)
71 66 [Itx,Ity,Itz] = ind2sub(size(It),find(It));
72 67 Vi =(find(It));
73 68 nV = nnz(It);
74   -%
75 69 % create a meshgrid describing coordinates relative to the voter position
76 70 rangex = -rmax(1):rmax(1); %create an array of values between -rmax and rmax
77 71 rangey = -rmax(2):rmax(2);
... ... @@ -80,8 +74,9 @@ rangez = -rmax(3):rmax(3);
80 74 m_mag = (sqrt(mx.^2 + my.^2 + mz.^2)); %create a template describing the distance from the center of a small cube
81 75  
82 76 % create a mask for the voting area
83   -M_dist = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1; %mask for the voting area distance (all values < rmax from the center)
84   -%%
  77 +M_dist1 = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1 ; %mask for the voting area distance (all values < rmax from the center)
  78 +M_dist2 = (mx.^2/rmin(1)^2 + my.^2/rmin(2)^2 + mz.^2/rmin(3)^2) >= 1 ;
  79 +M_dist = M_dist1 .* M_dist2;
85 80 % calculate the direction vector between a pixel and voter
86 81 LV_x = mx./m_mag;
87 82 LV_y = my./m_mag;
... ... @@ -89,23 +84,24 @@ LV_z = mz./m_mag;
89 84  
90 85 %number of pixels in the voting area of each voter (initialize to zero)
91 86 validPixels = (zeros(nV,1));
92   -%%
93 87 %indices of pixels in the voting area of each voter
94 88 % indices reference the 3D image
95 89 g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang));
96 90  
97 91  
98   -%% vote
  92 +% vote
99 93 tic;
  94 +mask = zeros(Isize);
  95 +mask1 = zeros(Isize);
  96 +
100 97 %for each iteration (in iterative voting)
101   -for itr = 1 : iter+1
  98 +for itr = 1 :iter
102 99  
103 100 %initialize the vote image to zero
104 101 Ivote = zeros(Isize);
105   -
  102 +
106 103 %for each voter (nonzero pixels in the threshold image It)
107 104 for v = 1: nV
108   -
109 105 %get the cartesian coordinates of the voter v in the main image I
110 106 vx = Itx(v);
111 107 vy = Ity(v);
... ... @@ -135,7 +131,7 @@ for itr = 1 : iter+1
135 131 M_angle = cos_diff >= cos(ang);
136 132  
137 133 %combine the two masks to mask out the voting angle
138   - M = M_angle .* M_dist;
  134 + M = M_angle.* M_dist;
139 135  
140 136 % get the coordinates of each pixel in the final voter mask M
141 137 pi = find(M);
... ... @@ -156,33 +152,20 @@ for itr = 1 : iter+1
156 152  
157 153  
158 154 Ivote( global_pi ) = Ivote( global_pi ) + vmag;
159   -
160   - end
161   - fid = fopen(sprintf('128-128-128/vote%d',itr), 'w');
162   - if itr ==1
163   - fwrite(fid, Ivote, 'single');
164   -
165   - elseif itr ==2
166   - fwrite(fid, Ivote, 'single');
167   -
168   - elseif itr ==3
169   - fwrite(fid, Ivote, 'single');
170   -
171   - elseif itr ==4
172   - fwrite(fid, Ivote, 'single');
173   -
174   - elseif itr == 5
175   - fwrite(fid, Ivote, 'single');
176   - elseif itr == 6
177   - fwrite(fid, Ivote, 'single');
178 155 end
  156 + fid = fopen(sprintf('128-128-128/nissl-vote%d',itr), 'w');
  157 + fwrite(fid, single(Ivote), '*single');
179 158 fclose(fid);
  159 +
180 160 t_v1 = toc;
181 161 disp(['voting done. time =',num2str(t_v1)]);
182   -
  162 +
183 163 % update the voting direction
184   - if ang>0
  164 + if ang>=d_ang
185 165 tic;
  166 + Igrad_x = zeros(Isize);
  167 + Igrad_y = zeros(Isize);
  168 + Igrad_z = zeros(Isize);
186 169 for v = 1: nV
187 170 % coordinates of the current voter
188 171 vx = Itx(v);
... ... @@ -199,24 +182,32 @@ for itr = 1 : iter+1
199 182 [g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx));
200 183  
201 184 %compute the vector from the voter position to this position
202   - Igrad_x(vx, vy, vz) = g_px - vx;
  185 +
  186 + Igrad_x(vx, vy, vz) = g_px - vx ;
203 187 Igrad_y(vx, vy, vz) = g_py - vy;
204 188 Igrad_z(vx, vy, vz) = g_pz - vz;
205   -
206 189 end
207   -
  190 +
208 191  
209 192 tdir1 = toc;
210 193 display (['updating dir done. time = ', num2str(tdir1)]);
211 194 ang = ang - d_ang;
212 195 end
  196 +
213 197 end
214 198  
  199 +hv = reshape(Ivote, [X*Y*Z, 1]);
  200 +hist(hv, 250);
  201 +%%
  202 +t = 300;
  203 +conn = [5 5 5];
  204 +Icenter = local_max(Ivote, conn, t);
  205 +fidc = fopen(sprintf('std3.2-r10.10-v8/out%d-t%d.vol',t,t0), 'w');
  206 +fwrite(fidc, single(Icenter), '*single');
  207 +fclose(fidc);
  208 +nnz(Icenter)
  209 +% [cxx1, cyy1, czz1] = ind2sub(size(Icenter),find(Icenter));
215 210  
216   -% %%
217   -% t = 350;
218   -% conn = [5 5 3];
219   -% Icenter = local_max(Ivote, conn, t);
220 211 % % center = Ivote1;
221 212 % % center(center<t) = 0;
222 213 % % center = imregionalmax(center);
... ...
Matlab_3D/ivote3_new.m deleted
1   -
2   -clear;
3   -disp('***************** NEW RUN *********************');
4   -total = tic;
5   -% ******* Initialize voting parameters **************************************
6   -rmax = [10 10 10]; %maximum radius of the cell
7   -rmin = [1 1 1];
8   -ang_deg = 25.1; %half the angular range of the voting area
9   -ang = ang_deg * pi / 180;
10   -iter = 8 ; %number of voting iterations
11   -t0 = 1;
12   -sigma = [3, 3, 2];
13   -% t = 0.1;
14   -d_ang= ang / (iter+2);
15   -% ******** Testing parameters ******************************************
16   -% p = [50, 50, 150];
17   -% ps = [400, 400, 200];
18   -% % ps = [100, 50, 40];
19   -% % I = syn_Img(rmax , ps);
20   -% volfile = 'nissl-rat.vol';
21   -% fid = fopen(volfile); % open the file that include the image
22   -% S = fread(fid, 3, 'int32');
23   -% X = S(1);
24   -% Y = S(2);
25   -% Z = S(3);
26   -filename = '128-128-128/nissl-float-128.128.128.vol'; %'nissl-float-128.128.128.vol';
27   -X = 128;
28   -Y = 128;
29   -Z = 128;
30   -fidi = fopen(filename);
31   -% load the VOL data into a 2D matrix
32   -I = fread(fidi,[X Y*Z], 'single');
33   -fclose(fidi);
34   -%change this to a 3D matrix
35   -I = (reshape(I, [X, Y, Z]));
36   -% invert the intensity
37   -I = (255 - I);
38   -
39   -%perform a gaussian blur
40   -Iblur = gauss_blur3d(I, sigma);
41   -% Iblur = I;
42   -% %crop out a small subregion of I and Iblur
43   -% Iblur = Iblur(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1);
44   -% I = I(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1);
45   -% compute the gradient
46   -[Igrad_y, Igrad_x, Igrad_z] = gradient(Iblur);
47   -
48   -%calculate the gradient magnitude
49   -Imag = sqrt(Igrad_x .^ 2 + Igrad_y .^ 2 + Igrad_z .^2);
50   -Isize = size(I);
51   -
52   -% h = reshape(Imag, [X*Y*Z, 1]);
53   -% hist(h, 100);
54   -
55   -%set a threshold for the gradient magnitude
56   -It = Imag > t0;
57   -fidt = fopen('128-128-128/It.vol', 'w');
58   -fwrite(fidt, It, 'single');
59   -fclose(fidt);
60   -%Set the boundaries of the threshold image to zero
61   -It(1:rmax(1), :, :) = 0;
62   -It(X - rmax(1):X, :,:) = 0;
63   -It(:, 1:rmax(2), :) = 0;
64   -It(:, Y - rmax(2):Y,:) = 0;
65   -It(:, :, 1:rmax(3)) = 0;
66   -It(:,:, Z - rmax(3):Z) = 0;
67   -
68   -%get the indices of all of the nonzero values in the threshold image
69   -% (voter positions)
70   -[Itx,Ity,Itz] = ind2sub(size(It),find(It));
71   -Vi =(find(It));
72   -nV = nnz(It);
73   -% create a meshgrid describing coordinates relative to the voter position
74   -rangex = -rmax(1):rmax(1); %create an array of values between -rmax and rmax
75   -rangey = -rmax(2):rmax(2);
76   -rangez = -rmax(3):rmax(3);
77   -[mx, my, mz] = meshgrid(rangex, rangey, rangez); %create a template describing local pixel position in a small cube
78   -m_mag = (sqrt(mx.^2 + my.^2 + mz.^2)); %create a template describing the distance from the center of a small cube
79   -
80   -% create a mask for the voting area
81   -M_dist1 = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1 ; %mask for the voting area distance (all values < rmax from the center)
82   -M_dist2 = (mx.^2/rmin(1)^2 + my.^2/rmin(2)^2 + mz.^2/rmin(3)^2) >= 1 ;
83   -M_dist = M_dist1 .* M_dist2;
84   -% calculate the direction vector between a pixel and voter
85   -LV_x = mx./m_mag;
86   -LV_y = my./m_mag;
87   -LV_z = mz./m_mag;
88   -
89   -%number of pixels in the voting area of each voter (initialize to zero)
90   -validPixels = (zeros(nV,1));
91   -%indices of pixels in the voting area of each voter
92   -% indices reference the 3D image
93   -g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang));
94   -
95   -
96   -% vote
97   -tic;
98   -mask = zeros(Isize);
99   -mask1 = zeros(Isize);
100   -
101   -%for each iteration (in iterative voting)
102   -for itr = 1 :iter
103   -
104   - %initialize the vote image to zero
105   - Ivote = zeros(Isize);
106   -
107   - %for each voter (nonzero pixels in the threshold image It)
108   - for v = 1: nV
109   - %get the cartesian coordinates of the voter v in the main image I
110   - vx = Itx(v);
111   - vy = Ity(v);
112   - vz = Itz(v);
113   - vi = Vi(v);
114   -
115   - %retreive the gradient magnitude at the voter position
116   - vmag = Imag(vi);
117   -
118   - %retrieve the gradient
119   - gx = Igrad_x(vi);
120   - gy = Igrad_y(vi);
121   - gz = Igrad_z(vi);
122   -
123   - %calculate the gradient magnitude
124   - dmag = sqrt (gx^2 + gy^2 + gz^2);
125   -
126   - %calculate the normalized gradient direction
127   - dx = gx / dmag;
128   - dy = gy / dmag;
129   - dz = gz / dmag;
130   -
131   - %calculate the angle between the voter direction and the pixel direction
132   - cos_diff = LV_x .* dx + LV_y .* dy + LV_z .* dz;
133   -
134   - %create an angular mask for the voting area
135   - M_angle = cos_diff >= cos(ang);
136   -
137   - %combine the two masks to mask out the voting angle
138   - M = M_angle.* M_dist;
139   -
140   - % get the coordinates of each pixel in the final voter mask M
141   - pi = find(M);
142   -
143   - %calculate the number of pixels in the voting region
144   - npts = nnz(M);
145   - validPixels(v) = npts;
146   -
147   - %convert every index in the voting area from a local 3D index to a global 3D index (into the original image I)
148   - global_px = vx + mx(pi);
149   - global_py = vy + my(pi);
150   - global_pz = vz + mz(pi);
151   -
152   - %convert the global 3D index of each point into a global 1D index
153   - global_pi = sub2ind(Isize, global_px, global_py, global_pz);
154   -
155   - g_v_prime (v, 1:npts) = global_pi;
156   -
157   -
158   - Ivote( global_pi ) = Ivote( global_pi ) + vmag;
159   -% if itr ==3
160   -% if mod(v, 5000)==0
161   -% mask(global_pi)= mask(global_pi) + 0.1;
162   -% mask (vi) = mask(vi) + 0.5;
163   -% end
164   -% end
165   -% if itr ==6
166   -% if mod(v, 5000)==0
167   -% mask1(global_pi)= mask1(global_pi) + 0.1;
168   -% mask1 (vi) = mask1(vi) + 0.5;
169   -% end
170   -% end
171   -% if itr==1
172   -% for ix = -12:12
173   -% for iy = -12:12
174   -% for iz = -12:12
175   -% mask(vx+ix, vy+iy, vz+iz) = M(ix+13,iy+13,iz+13)+ mask(vx+ix, vy+iy, vz+iz);
176   -% end
177   -% end
178   -% end
179   -% end
180   -% if itr==2
181   -% for ix = -12:12
182   -% for iy = -12:12
183   -% for iz = -12:12
184   -% mask1(vx+ix, vy+iy, vz+iz) = M(ix+13,iy+13,iz+13)+ mask1(vx+ix, vy+iy, vz+iz);
185   -% end
186   -% end
187   -% end
188   -% end
189   -
190   -
191   - end
192   -% fid = fopen(sprintf('128-128-128/nissl-vote%d',itr), 'w');
193   -% fwrite(fid, single(Ivote), '*single');
194   - if itr ==1
195   - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w');
196   - fwrite(fid, single(Ivote), '*single');
197   - fclose(fid);
198   - end
199   - if itr ==8
200   - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w');
201   - fwrite(fid, single(Ivote), '*single');
202   - fclose(fid);
203   - Ivote8 = (Ivote);
204   - end
205   - if itr ==9
206   - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w');
207   - fwrite(fid, single(Ivote), '*single');
208   - fclose(fid);
209   - end
210   - if itr ==10
211   - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w');
212   - fwrite(fid, single(Ivote), '*single');
213   - fclose(fid);
214   - end
215   -
216   -% Ivote1 = single(Ivote);
217   -% fwrite(fid, Ivote1, '*single');
218   -%
219   -%
220   -% elseif itr ==2
221   -% Ivote2 = single(Ivote);
222   -% fwrite(fid, Ivote2, '*single');
223   -%
224   -%
225   -% elseif itr ==3
226   -% Ivote3 = single(Ivote);
227   -% fwrite(fid, Ivote3, '*single');
228   -%
229   -%
230   -% elseif itr ==4
231   -% Ivote4 = single(Ivote);
232   -% fwrite(fid, Ivote4, '*single');
233   -%
234   -%
235   -% elseif itr == 5
236   -% Ivote5 = single(Ivote);
237   -% fwrite(fid, Ivote5, '*single');
238   -%
239   -%
240   -% elseif itr == 6
241   -% fwrite(fid, single(Ivote), '*single');
242   -% elseif itr == 7
243   -% fwrite(fid, single(Ivote), '*single');
244   -% elseif itr == 8
245   -% fwrite(fid, single(Ivote), '*single');
246   -% elseif itr == 9
247   -% fwrite(fid, single(Ivote), '*single');
248   -% end
249   -% fclose(fid);
250   - t_v1 = toc;
251   - disp(['voting done. time =',num2str(t_v1)]);
252   -
253   - % update the voting direction
254   - if ang>=d_ang
255   - tic;
256   - Igrad_x = zeros(Isize);
257   - Igrad_y = zeros(Isize);
258   - Igrad_z = zeros(Isize);
259   - for v = 1: nV
260   - % coordinates of the current voter
261   - vx = Itx(v);
262   - vy = Ity(v);
263   - vz = Itz(v);
264   -
265   - %get the local value of the voting image
266   - local_Ivote = Ivote(g_v_prime(v,1:validPixels(v)));
267   -
268   - %find the index of the maximum value
269   - [~, local_max_idx] = max(local_Ivote);
270   -
271   - %convert this into a global subscript
272   - [g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx));
273   -
274   - %compute the vector from the voter position to this position
275   -
276   - Igrad_x(vx, vy, vz) = g_px - vx ;
277   - Igrad_y(vx, vy, vz) = g_py - vy;
278   - Igrad_z(vx, vy, vz) = g_pz - vz;
279   -% if itr ==3
280   -% if mod(v, 5000)==0
281   -% mask(g_px, g_py, g_pz)= mask(g_px, g_py, g_pz) + 1;
282   -% end
283   -% end
284   -% if itr ==6
285   -% if mod(v, 5000)==0
286   -% mask1(g_px, g_py, g_pz)= mask1(g_px, g_py, g_pz) + 1;
287   -% end
288   -% end
289   - end
290   -
291   -
292   - tdir1 = toc;
293   - display (['updating dir done. time = ', num2str(tdir1)]);
294   - ang = ang - d_ang;
295   - end
296   -
297   - end
298   -
299   -hv = reshape(Ivote, [X*Y*Z, 1]);
300   -hist(hv, 250);
301   -%%
302   -t = 4300;
303   -conn = [5 5 5];
304   -Icenter = local_max(Ivote, conn, t);
305   -fidc = fopen(sprintf('std3.2-r10.10-v8/out%d-t%d.vol',t,t0), 'w');
306   -fwrite(fidc, single(Icenter), '*single');
307   -fclose(fidc);
308   -nnz(Icenter)
309   -% [cxx1, cyy1, czz1] = ind2sub(size(Icenter),find(Icenter));
310   -
311   -% % center = Ivote1;
312   -% % center(center<t) = 0;
313   -% % center = imregionalmax(center);
314   -% % cn = nnz(center);
315   -% % [cx, cy, cz] = ind2sub(size(center), find(center));
316   -% % Icenter = zeros(size(center));
317   -% % for cc =1:cn
318   -% % Icenter(cx(cc), cy(cc), cz(cc)) = 255;
319   -% % end
320   -%
321   -% % fid_Ic = fopen('image_center2-300.vol', 'w');
322   -% % fwrite(fid_Ic, Icenter);
323   -% % fclose(fid_Ic);
324   -% cn = nnz(Icenter);
325   -% [cx, cy, cz] = ind2sub(size(Icenter), find(Icenter));
326   -% Ic2d = zeros(size(Icenter,1), size(Icenter,2));
327   -% for cc =1:cn
328   -% Ic2d(cx(cc), cy(cc)) = 1;
329   -% end
330   -% I2d = max(I, [], 3);
331   -% % figure(1),imagesc(I2d); colormap(gray);
332   -% % figure(2),imagesc(Ic2d); colormap(gray);
333   -% %
334   -% out1(:,:,1) = mat2gray(I2d);
335   -% out1(:,:,2) = mat2gray(Ic2d);
336   -% out1(:,:,3) = mat2gray(I2d);
337   -% figure(1), imagesc((out1));
338   -%%% % imwrite(mat2gray(c2d), 'vote.bmp');
339   -%%
340   -% figure(1); imagesc(squeeze(I(:,:,ceil(size(I,3)/2)))), colormap(gray);
341   -% figure(33); imagesc(squeeze(Ivote3(:,:,ceil(size(Ivote,3)/2)))), colormap(gray);
Matlab_3D/validation.m
  1 +
1 2 clear all;
2 3 disp('***************** NEW RUN *********************');
3 4 X = 128;
... ... @@ -7,15 +8,15 @@ D = 10;
7 8 t0=1;
8 9 r1=10;
9 10 r2=10;
10   -t=2000;
  11 +t=2100;
11 12 itr=8;
12 13 vote=10;
13 14 std = [5 5];
14   -gt_filename = '128-128-128/0-gt.vol';
  15 +gt_filename = '0-gt.vol';
15 16 % out_filename = sprintf('128-128-128/0-nissl-std%d.%d-t0%d-r%d.%d-t%d-out%d.%d.vol',std(1), std(2),t0,r1,r2,t,itr,vote);
16   -out_filename = sprintf('D:/build/ivote3-bld/std5.5-r10.10-v8-phi15/out%d.vol',t);
  17 +out_filename = sprintf('D:/build/ivote3-bld/shared2D-v8/out%d.vol',t);
17 18 % txt_filename = sprintf('128-128-128/0-validation-nissl-std%d.%d-r%d.%d-t%d-out%d.%d-D%d.txt',std(1), std(2),r1,r2,t,itr,vote,D);
18   -txt_filename = sprintf('D:/build/ivote3-bld/std5.5-r10.10-v8-phi15/t%d-D%d.txt',t,D);
  19 +txt_filename = sprintf('D:/build/ivote3-bld/shared2D-v8/t%d-D%d.txt',t,D);
19 20 spec = sprintf('Nissl-std%d.%d-r%d.%d-t%d-out%d.%d',std(1), std(2),r1,r2,t,itr,vote);
20 21 fid0 = fopen(gt_filename);
21 22 gt = fread(fid0,[X Y*Z], 'single');
... ...
cpp/cpyToshare.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_cpyToshare_H
  2 +#define STIM_CUDA_cpyToshare_H
  3 +
  4 + //this function copy one channel data from global to shared memory in one dimension with size of X bytes.
  5 + template<typename T>
  6 + __device__ void cpyG2S1D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
  7 + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
  8 +
  9 + //calculate the total number of threads available
  10 + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
  11 +
  12 + //calculate the current 1D thread ID
  13 + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
  14 +
  15 + //calculate the number of iteration require for the copy
  16 + unsigned int I = X/tThreads + 1;
  17 +
  18 + //the specified start position in global memory is (x, y, z)
  19 + unsigned int gstart = z*I_x*I_y + y*I_x + x;
  20 +
  21 + for (unsigned int i = 0; i < I; i++){
  22 +
  23 + //each iteration will copy tThreads elements, so the starting index in shared memory
  24 + //for each iteration will be i * tThreads (iteration # times the number of elements transferred per iteration)
  25 + unsigned int sIdx = i * tThreads + ti;
  26 + if (sIdx>= X*Y) return;
  27 +
  28 + //each iteration will copy tThreads elements from the global index
  29 + unsigned int gIdx = gstart + sIdx;
  30 +
  31 + //copy global to share
  32 + dest[sIdx] = src[gIdx];
  33 +
  34 + }
  35 + }
  36 + //this function copy one channel data from global to shared memory in two dimensions with size of X*Y bytes.
  37 + template<typename T>
  38 + __device__ void cpyG2S2D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
  39 + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
  40 + //calculate the total number of threads available
  41 + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
  42 +
  43 + //calculate the current 1D thread ID
  44 + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
  45 +
  46 + //calculate the number of iteration require for the copy
  47 + unsigned int I = X*Y/tThreads + 1;
  48 +
  49 + unsigned int gz1 = z*I_x*I_y ;
  50 +
  51 + for (unsigned int i = 0; i < I; i++){
  52 +
  53 + unsigned int sIdx = i * tThreads + ti;
  54 + if (sIdx>= X*Y) return;
  55 +
  56 + unsigned int sy = sIdx/X;
  57 + unsigned int sx = sIdx - (sy * X);
  58 +
  59 + unsigned int gx = x + sx;
  60 + unsigned int gy = y + sy;
  61 + if (gx<I_x && gy<I_y){
  62 + unsigned int gIdx = gz1 + gy * I_x + gx;
  63 + //copy global to share
  64 + dest[sIdx] = src[gIdx];
  65 + }
  66 +
  67 + }
  68 + }
  69 + //this function copy three channels data from global to shared memory in one dimension with size of X bytes.
  70 + template<typename T>
  71 + __device__ void cpyG2S1D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
  72 + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
  73 +
  74 + //calculate the total number of threads available
  75 + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
  76 +
  77 + //calculate the current 1D thread ID
  78 + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
  79 +
  80 + //calculate the number of iteration require for the copy
  81 + unsigned int I = X/tThreads + 1;
  82 +
  83 + //the specified start position in global memory is (x, y, z)
  84 + unsigned int gstart = z*I_x*I_y + y*I_x + x;
  85 +
  86 + for (unsigned int i = 0; i < I; i++){
  87 +
  88 + //each iteration will copy tThreads elements, so the starting index in shared memory
  89 + //for each iteration will be i * tThreads (iteration # times the number of elements transferred per iteration)
  90 + unsigned int sIdx = i * tThreads + ti;
  91 + if (sIdx>= X*Y) return;
  92 + unsigned int gIdx = gstart*3 + sIdx;
  93 + //copy global to share
  94 + dest[sIdx] = src[gIdx];
  95 +
  96 + }
  97 + }
  98 + //this function copy three channels data from global to shared memory in two dimensions with size of X*Y bytes.
  99 + template<typename T>
  100 + __device__ void cpyG2S2D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
  101 + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
  102 + //calculate the total number of threads available
  103 + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
  104 +
  105 + //calculate the current 1D thread ID
  106 + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
  107 +
  108 + //calculate the number of iteration require for the copy
  109 + unsigned int I = X*Y/tThreads + 1;
  110 +
  111 + unsigned int gz1 = z*I_x*I_y ;
  112 +
  113 + for (unsigned int i = 0; i < I; i++){
  114 +
  115 + unsigned int sIdx = i * tThreads + ti;
  116 + if (sIdx>= X*Y) return;
  117 + unsigned int sy = sIdx/X;
  118 + unsigned int sx = sIdx - (sy * X);
  119 +
  120 + unsigned int gx = x + sx/3;
  121 + unsigned int gy = y + sy;
  122 + if (gx<I_x && gy<I_y){
  123 + unsigned int gIdx = (gz1 + gy * I_x + gx)*3 + (sx%3);
  124 + //copy global to share
  125 + dest[sIdx] = src[gIdx];
  126 + }
  127 + }
  128 + }
  129 + // this function compute the gradient magnitude saved in the shared memory and stores the magnitude result in the rest of shared memory.
  130 + template<typename T>
  131 + __device__ void mag_share2D(T* grad, unsigned int bs, unsigned int X, unsigned int Y, dim3 threadIdx, dim3 blockDim){
  132 +
  133 + //calculate the total number of threads available
  134 + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
  135 + //calculate the current 1D thread ID
  136 + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
  137 + //calculate the number of iteration require for the copy
  138 + unsigned int I = X*Y/tThreads + 1;
  139 + for (unsigned int i = 0; i < I; i++){
  140 +
  141 + unsigned int sIdx = i * tThreads + ti;
  142 + if (sIdx>= X*Y) return;
  143 + float gx = grad[sIdx*3];
  144 + float gy = grad[sIdx*3 + 1];
  145 + float gz = grad[sIdx*3 + 2];
  146 + float mag = sqrt(gx*gx + gy*gy + gz*gz);
  147 + grad[bs + sIdx] = mag;
  148 +
  149 + }
  150 + }
  151 +#endif
0 152 \ No newline at end of file
... ...
cpp/float_to_half.cuh deleted
1   -#ifndef STIM_CUDA_FLOAT_TO_HALF_H
2   -#define STIM_CUDA_FLOAT_TO_HALF_H
3   -
4   -#include <iostream>
5   -#include <cuda.h>
6   -#include <stim/cuda/cudatools.h>
7   -#include <stim/cuda/sharedmem.cuh>
8   -#include <stim/cuda/cudatools/error.h>
9   -#include <cuda_fp16.h>
10   -#include <stdio.h>
11   - __global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){
12   -
13   -
14   - //calculate x,y,z coordinates for this thread
15   - int xi = blockIdx.x * blockDim.x + threadIdx.x;
16   - //find the grid size along y
17   - int grid_y = y / blockDim.y;
18   - int blockidx_y = blockIdx.y % grid_y;
19   - int yi = blockidx_y * blockDim.y + threadIdx.y;
20   - int zi = blockIdx.y / grid_y;
21   - int i = zi * x * y + yi * x + xi;
22   -
23   - if(xi >= x|| yi >= y || zi>= z) return;
24   -
25   -
26   - gpu_half[i] = __float2half(gpu_float[i]);
27   -
28   -
29   - }
30   -
31   -
32   - void gpu_f2h(half* gpu_half, float* gpu_float, unsigned int x, unsigned int y, unsigned int z){
33   -
34   -
35   - int max_threads = stim::maxThreadsPerBlock();
36   - dim3 threads(sqrt (max_threads),sqrt (max_threads));
37   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
38   -
39   - //call the GPU kernel to determine the gradient
40   - cuda_f2h <<< blocks, threads >>>(gpu_half, gpu_float, x, y, z);
41   -
42   - }
43   -
44   -
45   -
46   - void cpu_f2h(half* h_out, float* f_in, unsigned int x, unsigned int y, unsigned int z){
47   -
48   - //calculate the number of pixels in the array
49   - unsigned int pix = x* y* z;
50   -
51   - //allocate memory on the GPU for the input float precision.
52   - float* gpu_float;
53   - cudaMalloc(&gpu_float, pix * sizeof(float));
54   - cudaMemcpy(gpu_float, f_in, pix * sizeof(float), cudaMemcpyHostToDevice);
55   -
56   - //allocate memory on the GPU for the output half precision
57   - half* gpu_half;
58   - cudaMalloc(&gpu_half, pix * sizeof(half));
59   -
60   - //call the GPU version of this function
61   - gpu_f2h(gpu_half, gpu_float, x, y, z);
62   -
63   - //copy the array back to the CPU
64   - cudaMemcpy(h_out, gpu_half, pix * sizeof(half), cudaMemcpyDeviceToHost);
65   -
66   - //free allocated memory
67   - cudaFree(gpu_float);
68   - cudaFree(gpu_half);
69   -
70   - }
71   -
72   -
73   -#endif
74 0 \ No newline at end of file
cpp/half_to_float.cuh deleted
1   -#ifndef STIM_CUDA_HALF_TO_FLOAT_H
2   -#define STIM_CUDA_HALF_TO_FLOAT_H
3   -
4   -#include <iostream>
5   -#include <cuda.h>
6   -#include <stim/cuda/cudatools.h>
7   -#include <stim/cuda/sharedmem.cuh>
8   -#include <stim/cuda/cudatools/error.h>
9   -#include "cuda_fp16.h"
10   -
11   - __global__ void cuda_h2f(float* gpu_float, half* gpu_half, int x, int y, int z){
12   -
13   -
14   - //calculate x,y,z coordinates for this thread
15   - int xi = blockIdx.x * blockDim.x + threadIdx.x;
16   - //find the grid size along y
17   - int grid_y = y / blockDim.y;
18   - int blockidx_y = blockIdx.y % grid_y;
19   - int yi = blockidx_y * blockDim.y + threadIdx.y;
20   - int zi = blockIdx.y / grid_y;
21   - int i = zi * x * y + yi * x + xi;
22   -
23   - if(xi >= x|| yi >= y || zi>= z) return;
24   -
25   -
26   - gpu_float[i] = __half2float(gpu_half[i]);
27   -
28   - }
29   -
30   -
31   - void gpu_h2f(float* gpu_float, half* gpu_half, unsigned int x, unsigned int y, unsigned int z){
32   -
33   -
34   - int max_threads = stim::maxThreadsPerBlock();
35   - dim3 threads(sqrt (max_threads),sqrt (max_threads));
36   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
37   -
38   - //call the GPU kernel to determine the gradient
39   - cuda_h2f <<< blocks, threads >>>(gpu_float, gpu_half, x, y, z);
40   -
41   - }
42   -
43   -
44   -
45   - void cpu_f2h(float* f_out, half* h_in, unsigned int x, unsigned int y, unsigned int z){
46   -
47   - //calculate the number of pixels in the array
48   - unsigned int pix = x* y* z;
49   -
50   - //allocate memory on the GPU for the input half precision
51   - half* gpu_half;
52   - cudaMalloc(&gpu_half, pix * sizeof(half));
53   - cudaMemcpy(gpu_half, h_in, pix * sizeof(half), cudaMemcpyHostToDevice);
54   -
55   - //allocate memory on the GPU for the output float precision.
56   - float* gpu_float;
57   - cudaMalloc(&gpu_float, pix * sizeof(float));
58   -
59   -
60   -
61   - //call the GPU version of this function
62   - gpu_h2f(gpu_float, gpu_half, x, y, z);
63   -
64   - cudaMemcpy(f_out, gpu_float, pix * sizeof(float), cudaMemcpyDeviceToHost);
65   -
66   -
67   -
68   - //free allocated memory
69   - cudaFree(gpu_float);
70   - cudaFree(gpu_half);
71   -
72   - }
73   -
74   -
75   -#endif
76 0 \ No newline at end of file
cpp/main.cpp
... ... @@ -37,10 +37,9 @@ int main(int argc, char** argv){
37 37 printf("current device ID: %d\n", i);
38 38 printf("device name: %s\n", prop.name);
39 39 printf("total global mem: %lu\n", prop.totalGlobalMem);
  40 + printf("shared memory per block: %lu\n", prop.sharedMemPerBlock);
40 41 }
41   -
42   -
43   -
  42 +
44 43 //output advertisement
45 44 std::cout<<std::endl<<std::endl;
46 45 std::cout<<"========================================================================="<<std::endl;
... ... @@ -124,18 +123,18 @@ int main(int argc, char** argv){
124 123 invert_data(cpuI, x, y, z);
125 124  
126 125 //write a new file from the cpuI.
127   - std::ofstream original("std5.5-r10.10-v8/inv-128.vol", std::ofstream::out | std::ofstream::binary);
  126 + std::ofstream original("shared2D-v8/inv-128.vol", std::ofstream::out | std::ofstream::binary);
128 127 original.write((char*)cpuI, bytes);
129 128 original.close();
130 129  
131 130 //allocate space on the cpu for the output result
132   - float* cpu_out = (float*) malloc(bytes*3);
  131 + float* cpu_out = (float*) malloc(bytes);
133 132  
134 133 // call the ivote function
135 134 ivote3(cpu_out, cpuI, sigma, anisotropy, phi, d_phi, r, iter, t, conn, x, y, z);
136 135  
137 136 //write the blurred file from the cpuI.
138   - std::ofstream fblur("vote-check/vote8.vol", std::ofstream::out | std::ofstream::binary);
  137 + std::ofstream fblur("shared2D-v8/vote8.vol", std::ofstream::out | std::ofstream::binary);
139 138 fblur.write((char*)cpuI, bytes);
140 139 fblur.close();
141 140 /*
... ... @@ -146,13 +145,13 @@ int main(int argc, char** argv){
146 145 fgx.close();
147 146 */
148 147 //write the output file.
149   - std::ofstream fo("std5.5-r10.10-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary);
  148 + std::ofstream fo("shared2D-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary);
150 149 fo.write((char*)cpu_out, bytes);
151 150 fo.close();
152 151  
153 152 // creat a file for saving the list centers
154 153  
155   - std::ofstream list("std5.5-r10.10-v8/center.txt");
  154 + std::ofstream list("shared2D-v8/center.txt");
156 155 if (list.is_open()){
157 156  
158 157 for (int ix=0; ix<x; ix++){
... ...
cpp/set_rmax.cuh deleted
1   -#ifndef STIM_CUDA_SET_RMAX_H
2   -#define STIM_CUDA_SET_RMAX_H
3   -
4   -#include <iostream>
5   -#include <cuda.h>
6   -#include <stim/cuda/cudatools.h>
7   -#include <stim/cuda/sharedmem.cuh>
8   -#include <stim/cuda/cudatools/error.h>
9   -
10   -template<typename T>
11   - __global__ void cuda_set_rmax(T* gpu_r, int rx, int ry, int rz, int x, int y, int z){
12   -
13   - //calculate x,y,z coordinates for this thread
14   - int xi = blockIdx.x * blockDim.x + threadIdx.x;
15   - //find the grid size along y
16   - int grid_y = y / blockDim.y;
17   - int blockidx_y = blockIdx.y % grid_y;
18   - int yi = blockidx_y * blockDim.y + threadIdx.y;
19   - int zi = blockIdx.y / grid_y;
20   - int i = zi * x * y + yi * x + xi;
21   -
22   - if(xi>=x || yi>=y || zi>=z) return;
23   -
24   - gpu_r[i*3+0] = rx;
25   - gpu_r[i*3+1] = ry;
26   - gpu_r[i*3+2] = rz;
27   -
28   - }
29   -
30   -template<typename T>
31   - void gpu_set_rmax(T* gpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
32   -
33   -
34   - unsigned int max_threads = stim::maxThreadsPerBlock();
35   - dim3 threads(sqrt (max_threads),sqrt (max_threads));
36   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
37   -
38   - //call the kernel to do the voting
39   - cuda_set_rmax <T> <<< blocks, threads >>>(gpu_r, r[0], r[1], r[2], x , y, z);
40   -
41   - }
42   -template<typename T>
43   - void cpu_set_rmax(T* cpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
44   -
45   - //calculate the number of bytes in the array
46   - unsigned int bytes = x * y * z * sizeof(T);
47   -
48   - //allocate space on the GPU for the rmax
49   - T* gpu_r;
50   - cudaMalloc(&gpu_vote, bytes*3);
51   -
52   - cudaMemcpy(gpu_r, cpu_r, bytes*3, cudaMemcpyHostToDevice);
53   -
54   -
55   - //call the GPU version of the vote calculation function
56   - gpu_set_rmax<T>(gpu_r, r, x , y, z);
57   -
58   - //copy the Vote Data back to the CPU
59   - cudaMemcpy(cpu_r, gpu_r, bytes*3, cudaMemcpyDeviceToHost) ;
60   -
61   - //free allocated memory
62   - cudaFree(gpu_r);
63   -
64   - }
65   -
66   -
67   -#endif
68 0 \ No newline at end of file
cpp/update_dir3.cuh
... ... @@ -6,11 +6,12 @@
6 6 #include <stim/cuda/cudatools.h>
7 7 #include <stim/cuda/sharedmem.cuh>
8 8 #include <cuda_fp16.h>
  9 +#include "cpyToshare.cuh"
9 10  
10 11 // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
11 12 template<typename T>
12 13 __global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
13   -
  14 + extern __shared__ float s_vote[];
14 15 //calculate x,y,z coordinates for this thread
15 16 int xi = blockIdx.x * blockDim.x + threadIdx.x;
16 17 //find the grid size along y
... ... @@ -18,10 +19,21 @@
18 19 int blockidx_y = blockIdx.y % grid_y;
19 20 int yi = blockidx_y * blockDim.y + threadIdx.y;
20 21 int zi = blockIdx.y / grid_y;
  22 + //compute the global 1D index for this pixel
21 23 int i = zi * x * y + yi * x + xi;
22 24  
23 25 if(xi >= x|| yi >= y || zi>= z) return;
24   -
  26 + // find the starting points for this block along the x and y directions
  27 + int bxi = blockIdx.x * blockDim.x;
  28 + int byi = blockidx_y * blockDim.y;
  29 + //find the starting points and the size of the window, which will be copied to the 2D-shared memory
  30 + int bxs = bxi - rx;
  31 + int bys = byi - ry;
  32 + int xwidth = 2 * rx + blockDim.x;
  33 + int ywidth = 2 * ry + blockDim.y;
  34 + //compute the coordinations of this pixel in the 2D-shared memory.
  35 + int sx_rx = threadIdx.x + rx;
  36 + int sy_ry = threadIdx.y + ry;
25 37 //find the gradient values along the x, y ,z axis, and the gradient magnitude for the voter
26 38 float g_v_x = gpu_grad[i * 3 + 0];
27 39 float g_v_y = gpu_grad[i * 3 + 1];
... ... @@ -42,39 +54,48 @@
42 54 int rz_sq = rz * rz;
43 55  
44 56 for (int z_p = -rz; z_p<=rz; z_p++){
45   -
46   - for(int y_p = -ry; y_p <= ry; y_p++){
47   -
48   - for(int x_p = -rx; x_p <= rx; x_p++){
49   -
50   - //calculate the x, y ,z indices for the current pixel
51   - int xi_p = (xi + x_p) ;
  57 + int zi_p = zi + z_p;
  58 + if ((zi_p) >=0 && (zi_p) < z){
  59 + //call the function to copy one slide of vote date to the 2D-shared memory.
  60 + __syncthreads();
  61 + cpyG2S2D<float>(s_vote, gpu_vote, bxs, bys, zi + z_p, xwidth, ywidth, threadIdx, blockDim, x, y);
  62 + __syncthreads();
  63 + float z_sq = z_p * z_p;
  64 + float d_z_sq = (z_sq)/rz_sq;
  65 + for(int y_p = -ry; y_p <= ry; y_p++){
  66 +
  67 + float y_sq = y_p * y_p;
  68 + float yz_sq = y_sq + z_sq;
52 69 int yi_p = (yi + y_p) ;
53   - int zi_p = (zi + z_p) ;
54   - if (zi_p >=0 && zi_p < z && yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){
  70 + float d_yz_sq = (y_sq)/ry_sq + d_z_sq;
  71 + unsigned int s_y1d = (sy_ry + y_p) * xwidth;
  72 + for(int x_p = -rx; x_p <= rx; x_p++){
55 73  
56   - //calculate the distance between the pixel and the current voter.
57   - float x_sq = x_p * x_p;
58   - float y_sq = y_p * y_p;
59   - float z_sq = z_p * z_p;
60   - float d_pv = sqrt(x_sq + y_sq + z_sq);
  74 + //check if the current pixel is inside of the data-set.
  75 + int xi_p = (xi + x_p) ;
  76 + if (yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){
61 77  
62   - // calculate the angle between the pixel and the current voter.
63   - float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v);
  78 + //calculate the distance between the pixel and the current voter.
  79 + float x_sq = x_p * x_p;
  80 + float d_pv = sqrt(x_sq + yz_sq);
64 81  
65   - if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){
  82 + // calculate the angle between the pixel and the current voter.
  83 + float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v);
66 84  
67   - //calculate the 1D index for the current pixel
68   - unsigned int id_p = (zi_p) * x * y + (yi_p) * x + (xi_p);
69   - l_vote = gpu_vote[id_p];
70   -
71   - // compare the vote value of this pixel with the max value to find the maxima and its index.
72   - if (l_vote>max) {
  85 + if ((((x_sq)/rx_sq + d_yz_sq )<= 1) && (cos_diff >= cos_phi)){
73 86  
74   - max = l_vote;
75   - id_x = x_p;
76   - id_y = y_p;
77   - id_z = z_p;
  87 + //calculate the 1D index for the current pixel in the 2D-shared memory
  88 + unsigned int id_s = s_y1d + (sx_rx + x_p);
  89 + l_vote = s_vote[id_s];
  90 +
  91 + // compare the vote value of this pixel with the max value to find the maxima and its index.
  92 + if (l_vote>max) {
  93 +
  94 + max = l_vote;
  95 + id_x = x_p;
  96 + id_y = y_p;
  97 + id_z = z_p;
  98 + }
78 99 }
79 100 }
80 101 }
... ... @@ -115,13 +136,13 @@
115 136 unsigned int max_threads = stim::maxThreadsPerBlock();
116 137 dim3 threads(sqrt (max_threads),sqrt (max_threads));
117 138 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
118   -
  139 + unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*sizeof(T);
119 140 // allocate space on the GPU for the updated vote direction
120 141 T* gpu_dir;
121 142 cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3);
122 143  
123 144 //call the kernel to calculate the new voting direction
124   - update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z);
  145 + update_dir3 <<< blocks, threads, shared_bytes >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z);
125 146  
126 147  
127 148 //call the kernel to update the gradient direction
... ...
cpp/vote3.cuh
... ... @@ -6,12 +6,13 @@
6 6 #include <stim/cuda/cudatools.h>
7 7 #include <stim/cuda/sharedmem.cuh>
8 8 #include <stim/cuda/cudatools/error.h>
9   -
  9 +#include "cpyToshare.cuh"
10 10  
11 11 // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
12 12 template<typename T>
13 13 __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
14 14  
  15 + extern __shared__ float s[];
15 16 //calculate x,y,z coordinates for this thread
16 17 int xi = blockIdx.x * blockDim.x + threadIdx.x;
17 18 //find the grid size along y
... ... @@ -23,6 +24,17 @@
23 24  
24 25 if(xi>=x || yi>=y || zi>=z) return;
25 26  
  27 + //find the starting points and the size of the window, which will be copied to the 2D-shared memory
  28 + int bxs = blockIdx.x * blockDim.x - rx;
  29 + int bys = blockidx_y * blockDim.y - ry;
  30 + int xwidth = 2 * rx + blockDim.x;
  31 + int ywidth = 2 * ry + blockDim.y;
  32 + //calculate the starting point of shared memory for storing the magnitude.
  33 + unsigned int b_s = 3 * xwidth * ywidth;
  34 + //compute the coordinations of this pixel in the 2D-shared memory.
  35 + int sx_rx = threadIdx.x + rx;
  36 + int sy_ry = threadIdx.y + ry;
  37 +
26 38 // define a local variable to sum the votes from the voters
27 39 float sum = 0;
28 40  
... ... @@ -31,43 +43,58 @@
31 43 int rz_sq = rz * rz;
32 44  
33 45 for (int z_v = -rz; z_v<=rz; z_v++){
34   -
35   - for(int y_v = -ry; y_v <= ry; y_v++){
36   -
37   - for(int x_v = -rx; x_v <= rx; x_v++){
38   -
39   - //calculate the x, y ,z indices for the current voter
40   - int xi_v = (xi + x_v) ;
  46 + int zi_v = zi + z_v;
  47 + if ((zi_v) >=0 && (zi_v) <z){
  48 + //call the function to copy one slide of the gradient from global to the 2D-shared memory.
  49 + __syncthreads();
  50 + cpyG2S2D3ch<float>(s, gpu_grad, bxs, bys, zi + z_v, 3*xwidth, ywidth, threadIdx, blockDim, x, y);
  51 + __syncthreads();
  52 + mag_share2D<float>(s, b_s, xwidth, ywidth, threadIdx, blockDim);
  53 + __syncthreads();
  54 + float z_sq = z_v * z_v;
  55 + float d_z_sq = z_sq/rz_sq;
  56 +
  57 + for(int y_v = -ry; y_v <= ry; y_v++){
41 58 int yi_v = (yi + y_v) ;
42   - int zi_v = (zi + z_v) ;
43   - if (zi_v >=0 && zi_v < z && yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){
44   -
45   - //calculate the 1D index for the current voter
46   - unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v);
  59 + //compute the position of the current voter in the shared memory along the y axis.
  60 + unsigned int sIdx_y1d = (sy_ry + y_v)* xwidth;
  61 +
  62 + float y_sq = y_v * y_v;
  63 + float yz_sq = z_sq + y_sq;
  64 + float d_yz_sq = y_sq/ry_sq + d_z_sq;
  65 + for(int x_v = -rx; x_v <= rx; x_v++){
  66 +
  67 + //check if the current voter is inside of the data-set
  68 + int xi_v = (xi + x_v) ;
  69 + if (yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){
  70 +
  71 + //compute the position of the current voter in the 2D-shared memory along the x axis.
  72 + unsigned int sIdx_x = (sx_rx + x_v);
  73 + //find the 1D index of this voter in the 2D-shared memory.
  74 + unsigned int s_Idx = (sIdx_y1d + sIdx_x);
  75 + unsigned int s_Idx3 = s_Idx * 3;
  76 +
  77 + //save the gradient values for the current voter to the local variables and compute the gradient magnitude.
  78 + float g_v_x = s[s_Idx3];
  79 + float g_v_y = s[s_Idx3 + 1];
  80 + float g_v_z = s[s_Idx3 + 2];
  81 + float mag_v = s[b_s + s_Idx]; //sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
  82 +
  83 + //calculate the distance between the pixel and the current voter.
  84 + float x_sq = x_v * x_v;
  85 + float d_pv = sqrt(x_sq + yz_sq);
47 86  
48   - //find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter
49   -
50   - float g_v_x = gpu_grad[id_v * 3 + 0];
51   - float g_v_y = gpu_grad[id_v * 3 + 1];
52   - float g_v_z = gpu_grad[id_v * 3 + 2];
53   - float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
54   -
55   - //calculate the distance between the pixel and the current voter.
56   - float x_sq = x_v * x_v;
57   - float y_sq = y_v * y_v;
58   - float z_sq = z_v * z_v;
59   - float d_pv = sqrt(x_sq + y_sq + z_sq);
60   -
61   - // calculate the angle between the pixel and the current voter.
62   - float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v);
  87 + // calculate the angle between the pixel and the current voter.
  88 + float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v);
63 89  
64   - // check if the current voter is located in the voting area of this pixel.
65   - if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){
  90 + // check if the current voter is located in the voting area of this pixel.
  91 + if ((((x_sq)/rx_sq + d_yz_sq)<= 1) && (cos_diff >= cos_phi)){
66 92  
67   - sum += mag_v;
  93 + sum += mag_v;
  94 + }
68 95 }
69   - }
70   - }
  96 + }
  97 + }
71 98 }
72 99 }
73 100  
... ... @@ -81,9 +108,9 @@
81 108 unsigned int max_threads = stim::maxThreadsPerBlock();
82 109 dim3 threads(sqrt (max_threads),sqrt (max_threads));
83 110 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
84   -
  111 + unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T);
85 112 //call the kernel to do the voting
86   - vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
  113 + vote3 <T> <<< blocks, threads, shared_bytes >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
87 114  
88 115 }
89 116  
... ...