Commit 89604e92d1faa3934a762e9cf42e61f5644c78bd
1 parent
3b5043cd
ivote3 run on the shared memory
Showing
11 changed files
with
328 additions
and
694 deletions
Show diff stats
No preview for this file type
Matlab_3D/ivote3.m
1 | -clc; | 1 | + |
2 | clear; | 2 | clear; |
3 | disp('***************** NEW RUN *********************'); | 3 | disp('***************** NEW RUN *********************'); |
4 | total = tic; | 4 | total = tic; |
5 | % ******* Initialize voting parameters ************************************** | 5 | % ******* Initialize voting parameters ************************************** |
6 | -rmax = [16 16 8]; %maximum radius of the cell | ||
7 | -ang_deg = 20.1; %half the angular range of the voting area | 6 | +rmax = [10 10 10]; %maximum radius of the cell |
7 | +rmin = [1 1 1]; | ||
8 | +ang_deg = 25.1; %half the angular range of the voting area | ||
8 | ang = ang_deg * pi / 180; | 9 | ang = ang_deg * pi / 180; |
9 | -iter = 5; %number of voting iterations | ||
10 | -t0 = 1.0; %threshold color | ||
11 | -sigma = [3, 3, 1.5]; | 10 | +iter = 8 ; %number of voting iterations |
11 | +t0 = 1; | ||
12 | +sigma = [5, 5, 5]; | ||
12 | % t = 0.1; | 13 | % t = 0.1; |
13 | -d_ang= ang / (iter); | 14 | +d_ang= ang / (iter+2); |
14 | % ******** Testing parameters ****************************************** | 15 | % ******** Testing parameters ****************************************** |
15 | % p = [50, 50, 150]; | 16 | % p = [50, 50, 150]; |
16 | % ps = [400, 400, 200]; | 17 | % ps = [400, 400, 200]; |
@@ -22,15 +23,14 @@ d_ang= ang / (iter); | @@ -22,15 +23,14 @@ d_ang= ang / (iter); | ||
22 | % X = S(1); | 23 | % X = S(1); |
23 | % Y = S(2); | 24 | % Y = S(2); |
24 | % Z = S(3); | 25 | % Z = S(3); |
25 | -filename = 'nissl-float-128.128.128.vol'; | 26 | +filename = '128-128-128/nissl-float-128.128.128.vol'; %'nissl-float-128.128.128.vol'; |
26 | X = 128; | 27 | X = 128; |
27 | Y = 128; | 28 | Y = 128; |
28 | Z = 128; | 29 | Z = 128; |
29 | -fid = fopen(filename); | 30 | +fidi = fopen(filename); |
30 | % load the VOL data into a 2D matrix | 31 | % load the VOL data into a 2D matrix |
31 | -I = fread(fid,[X Y*Z], 'single'); | ||
32 | -fclose(fid); | ||
33 | -%% | 32 | +I = fread(fidi,[X Y*Z], 'single'); |
33 | +fclose(fidi); | ||
34 | %change this to a 3D matrix | 34 | %change this to a 3D matrix |
35 | I = (reshape(I, [X, Y, Z])); | 35 | I = (reshape(I, [X, Y, Z])); |
36 | % invert the intensity | 36 | % invert the intensity |
@@ -38,26 +38,21 @@ I = (255 - I); | @@ -38,26 +38,21 @@ I = (255 - I); | ||
38 | 38 | ||
39 | %perform a gaussian blur | 39 | %perform a gaussian blur |
40 | Iblur = gauss_blur3d(I, sigma); | 40 | Iblur = gauss_blur3d(I, sigma); |
41 | - | ||
42 | -% %crop out a small subregion of I and Iblur | ||
43 | -% Iblur = Iblur(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1); | ||
44 | -% I = I(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1); | ||
45 | -% | ||
46 | % compute the gradient | 41 | % compute the gradient |
47 | [Igrad_y, Igrad_x, Igrad_z] = gradient(Iblur); | 42 | [Igrad_y, Igrad_x, Igrad_z] = gradient(Iblur); |
48 | 43 | ||
49 | %calculate the gradient magnitude | 44 | %calculate the gradient magnitude |
50 | Imag = sqrt(Igrad_x .^ 2 + Igrad_y .^ 2 + Igrad_z .^2); | 45 | Imag = sqrt(Igrad_x .^ 2 + Igrad_y .^ 2 + Igrad_z .^2); |
51 | Isize = size(I); | 46 | Isize = size(I); |
52 | -I = single(I); | ||
53 | -Iblur = single(Iblur); | ||
54 | 47 | ||
55 | -%h = reshape(Imag, [X*Y*Z, 1]); | ||
56 | -%hist(h, 100); | 48 | +% h = reshape(Imag, [X*Y*Z, 1]); |
49 | +% hist(h, 100); | ||
57 | 50 | ||
58 | %set a threshold for the gradient magnitude | 51 | %set a threshold for the gradient magnitude |
59 | It = Imag > t0; | 52 | It = Imag > t0; |
60 | - | 53 | +fidt = fopen('128-128-128/It.vol', 'w'); |
54 | +fwrite(fidt, It, 'single'); | ||
55 | +fclose(fidt); | ||
61 | %Set the boundaries of the threshold image to zero | 56 | %Set the boundaries of the threshold image to zero |
62 | It(1:rmax(1), :, :) = 0; | 57 | It(1:rmax(1), :, :) = 0; |
63 | It(X - rmax(1):X, :,:) = 0; | 58 | It(X - rmax(1):X, :,:) = 0; |
@@ -65,13 +60,12 @@ It(:, 1:rmax(2), :) = 0; | @@ -65,13 +60,12 @@ It(:, 1:rmax(2), :) = 0; | ||
65 | It(:, Y - rmax(2):Y,:) = 0; | 60 | It(:, Y - rmax(2):Y,:) = 0; |
66 | It(:, :, 1:rmax(3)) = 0; | 61 | It(:, :, 1:rmax(3)) = 0; |
67 | It(:,:, Z - rmax(3):Z) = 0; | 62 | It(:,:, Z - rmax(3):Z) = 0; |
68 | -%% | 63 | + |
69 | %get the indices of all of the nonzero values in the threshold image | 64 | %get the indices of all of the nonzero values in the threshold image |
70 | % (voter positions) | 65 | % (voter positions) |
71 | [Itx,Ity,Itz] = ind2sub(size(It),find(It)); | 66 | [Itx,Ity,Itz] = ind2sub(size(It),find(It)); |
72 | Vi =(find(It)); | 67 | Vi =(find(It)); |
73 | nV = nnz(It); | 68 | nV = nnz(It); |
74 | -% | ||
75 | % create a meshgrid describing coordinates relative to the voter position | 69 | % create a meshgrid describing coordinates relative to the voter position |
76 | rangex = -rmax(1):rmax(1); %create an array of values between -rmax and rmax | 70 | rangex = -rmax(1):rmax(1); %create an array of values between -rmax and rmax |
77 | rangey = -rmax(2):rmax(2); | 71 | rangey = -rmax(2):rmax(2); |
@@ -80,8 +74,9 @@ rangez = -rmax(3):rmax(3); | @@ -80,8 +74,9 @@ rangez = -rmax(3):rmax(3); | ||
80 | m_mag = (sqrt(mx.^2 + my.^2 + mz.^2)); %create a template describing the distance from the center of a small cube | 74 | m_mag = (sqrt(mx.^2 + my.^2 + mz.^2)); %create a template describing the distance from the center of a small cube |
81 | 75 | ||
82 | % create a mask for the voting area | 76 | % create a mask for the voting area |
83 | -M_dist = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1; %mask for the voting area distance (all values < rmax from the center) | ||
84 | -%% | 77 | +M_dist1 = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1 ; %mask for the voting area distance (all values < rmax from the center) |
78 | +M_dist2 = (mx.^2/rmin(1)^2 + my.^2/rmin(2)^2 + mz.^2/rmin(3)^2) >= 1 ; | ||
79 | +M_dist = M_dist1 .* M_dist2; | ||
85 | % calculate the direction vector between a pixel and voter | 80 | % calculate the direction vector between a pixel and voter |
86 | LV_x = mx./m_mag; | 81 | LV_x = mx./m_mag; |
87 | LV_y = my./m_mag; | 82 | LV_y = my./m_mag; |
@@ -89,23 +84,24 @@ LV_z = mz./m_mag; | @@ -89,23 +84,24 @@ LV_z = mz./m_mag; | ||
89 | 84 | ||
90 | %number of pixels in the voting area of each voter (initialize to zero) | 85 | %number of pixels in the voting area of each voter (initialize to zero) |
91 | validPixels = (zeros(nV,1)); | 86 | validPixels = (zeros(nV,1)); |
92 | -%% | ||
93 | %indices of pixels in the voting area of each voter | 87 | %indices of pixels in the voting area of each voter |
94 | % indices reference the 3D image | 88 | % indices reference the 3D image |
95 | g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang)); | 89 | g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang)); |
96 | 90 | ||
97 | 91 | ||
98 | -%% vote | 92 | +% vote |
99 | tic; | 93 | tic; |
94 | +mask = zeros(Isize); | ||
95 | +mask1 = zeros(Isize); | ||
96 | + | ||
100 | %for each iteration (in iterative voting) | 97 | %for each iteration (in iterative voting) |
101 | -for itr = 1 : iter+1 | 98 | +for itr = 1 :iter |
102 | 99 | ||
103 | %initialize the vote image to zero | 100 | %initialize the vote image to zero |
104 | Ivote = zeros(Isize); | 101 | Ivote = zeros(Isize); |
105 | - | 102 | + |
106 | %for each voter (nonzero pixels in the threshold image It) | 103 | %for each voter (nonzero pixels in the threshold image It) |
107 | for v = 1: nV | 104 | for v = 1: nV |
108 | - | ||
109 | %get the cartesian coordinates of the voter v in the main image I | 105 | %get the cartesian coordinates of the voter v in the main image I |
110 | vx = Itx(v); | 106 | vx = Itx(v); |
111 | vy = Ity(v); | 107 | vy = Ity(v); |
@@ -135,7 +131,7 @@ for itr = 1 : iter+1 | @@ -135,7 +131,7 @@ for itr = 1 : iter+1 | ||
135 | M_angle = cos_diff >= cos(ang); | 131 | M_angle = cos_diff >= cos(ang); |
136 | 132 | ||
137 | %combine the two masks to mask out the voting angle | 133 | %combine the two masks to mask out the voting angle |
138 | - M = M_angle .* M_dist; | 134 | + M = M_angle.* M_dist; |
139 | 135 | ||
140 | % get the coordinates of each pixel in the final voter mask M | 136 | % get the coordinates of each pixel in the final voter mask M |
141 | pi = find(M); | 137 | pi = find(M); |
@@ -156,33 +152,20 @@ for itr = 1 : iter+1 | @@ -156,33 +152,20 @@ for itr = 1 : iter+1 | ||
156 | 152 | ||
157 | 153 | ||
158 | Ivote( global_pi ) = Ivote( global_pi ) + vmag; | 154 | Ivote( global_pi ) = Ivote( global_pi ) + vmag; |
159 | - | ||
160 | - end | ||
161 | - fid = fopen(sprintf('128-128-128/vote%d',itr), 'w'); | ||
162 | - if itr ==1 | ||
163 | - fwrite(fid, Ivote, 'single'); | ||
164 | - | ||
165 | - elseif itr ==2 | ||
166 | - fwrite(fid, Ivote, 'single'); | ||
167 | - | ||
168 | - elseif itr ==3 | ||
169 | - fwrite(fid, Ivote, 'single'); | ||
170 | - | ||
171 | - elseif itr ==4 | ||
172 | - fwrite(fid, Ivote, 'single'); | ||
173 | - | ||
174 | - elseif itr == 5 | ||
175 | - fwrite(fid, Ivote, 'single'); | ||
176 | - elseif itr == 6 | ||
177 | - fwrite(fid, Ivote, 'single'); | ||
178 | end | 155 | end |
156 | + fid = fopen(sprintf('128-128-128/nissl-vote%d',itr), 'w'); | ||
157 | + fwrite(fid, single(Ivote), '*single'); | ||
179 | fclose(fid); | 158 | fclose(fid); |
159 | + | ||
180 | t_v1 = toc; | 160 | t_v1 = toc; |
181 | disp(['voting done. time =',num2str(t_v1)]); | 161 | disp(['voting done. time =',num2str(t_v1)]); |
182 | - | 162 | + |
183 | % update the voting direction | 163 | % update the voting direction |
184 | - if ang>0 | 164 | + if ang>=d_ang |
185 | tic; | 165 | tic; |
166 | + Igrad_x = zeros(Isize); | ||
167 | + Igrad_y = zeros(Isize); | ||
168 | + Igrad_z = zeros(Isize); | ||
186 | for v = 1: nV | 169 | for v = 1: nV |
187 | % coordinates of the current voter | 170 | % coordinates of the current voter |
188 | vx = Itx(v); | 171 | vx = Itx(v); |
@@ -199,24 +182,32 @@ for itr = 1 : iter+1 | @@ -199,24 +182,32 @@ for itr = 1 : iter+1 | ||
199 | [g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx)); | 182 | [g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx)); |
200 | 183 | ||
201 | %compute the vector from the voter position to this position | 184 | %compute the vector from the voter position to this position |
202 | - Igrad_x(vx, vy, vz) = g_px - vx; | 185 | + |
186 | + Igrad_x(vx, vy, vz) = g_px - vx ; | ||
203 | Igrad_y(vx, vy, vz) = g_py - vy; | 187 | Igrad_y(vx, vy, vz) = g_py - vy; |
204 | Igrad_z(vx, vy, vz) = g_pz - vz; | 188 | Igrad_z(vx, vy, vz) = g_pz - vz; |
205 | - | ||
206 | end | 189 | end |
207 | - | 190 | + |
208 | 191 | ||
209 | tdir1 = toc; | 192 | tdir1 = toc; |
210 | display (['updating dir done. time = ', num2str(tdir1)]); | 193 | display (['updating dir done. time = ', num2str(tdir1)]); |
211 | ang = ang - d_ang; | 194 | ang = ang - d_ang; |
212 | end | 195 | end |
196 | + | ||
213 | end | 197 | end |
214 | 198 | ||
199 | +hv = reshape(Ivote, [X*Y*Z, 1]); | ||
200 | +hist(hv, 250); | ||
201 | +%% | ||
202 | +t = 300; | ||
203 | +conn = [5 5 5]; | ||
204 | +Icenter = local_max(Ivote, conn, t); | ||
205 | +fidc = fopen(sprintf('std3.2-r10.10-v8/out%d-t%d.vol',t,t0), 'w'); | ||
206 | +fwrite(fidc, single(Icenter), '*single'); | ||
207 | +fclose(fidc); | ||
208 | +nnz(Icenter) | ||
209 | +% [cxx1, cyy1, czz1] = ind2sub(size(Icenter),find(Icenter)); | ||
215 | 210 | ||
216 | -% %% | ||
217 | -% t = 350; | ||
218 | -% conn = [5 5 3]; | ||
219 | -% Icenter = local_max(Ivote, conn, t); | ||
220 | % % center = Ivote1; | 211 | % % center = Ivote1; |
221 | % % center(center<t) = 0; | 212 | % % center(center<t) = 0; |
222 | % % center = imregionalmax(center); | 213 | % % center = imregionalmax(center); |
Matlab_3D/ivote3_new.m deleted
1 | - | ||
2 | -clear; | ||
3 | -disp('***************** NEW RUN *********************'); | ||
4 | -total = tic; | ||
5 | -% ******* Initialize voting parameters ************************************** | ||
6 | -rmax = [10 10 10]; %maximum radius of the cell | ||
7 | -rmin = [1 1 1]; | ||
8 | -ang_deg = 25.1; %half the angular range of the voting area | ||
9 | -ang = ang_deg * pi / 180; | ||
10 | -iter = 8 ; %number of voting iterations | ||
11 | -t0 = 1; | ||
12 | -sigma = [3, 3, 2]; | ||
13 | -% t = 0.1; | ||
14 | -d_ang= ang / (iter+2); | ||
15 | -% ******** Testing parameters ****************************************** | ||
16 | -% p = [50, 50, 150]; | ||
17 | -% ps = [400, 400, 200]; | ||
18 | -% % ps = [100, 50, 40]; | ||
19 | -% % I = syn_Img(rmax , ps); | ||
20 | -% volfile = 'nissl-rat.vol'; | ||
21 | -% fid = fopen(volfile); % open the file that include the image | ||
22 | -% S = fread(fid, 3, 'int32'); | ||
23 | -% X = S(1); | ||
24 | -% Y = S(2); | ||
25 | -% Z = S(3); | ||
26 | -filename = '128-128-128/nissl-float-128.128.128.vol'; %'nissl-float-128.128.128.vol'; | ||
27 | -X = 128; | ||
28 | -Y = 128; | ||
29 | -Z = 128; | ||
30 | -fidi = fopen(filename); | ||
31 | -% load the VOL data into a 2D matrix | ||
32 | -I = fread(fidi,[X Y*Z], 'single'); | ||
33 | -fclose(fidi); | ||
34 | -%change this to a 3D matrix | ||
35 | -I = (reshape(I, [X, Y, Z])); | ||
36 | -% invert the intensity | ||
37 | -I = (255 - I); | ||
38 | - | ||
39 | -%perform a gaussian blur | ||
40 | -Iblur = gauss_blur3d(I, sigma); | ||
41 | -% Iblur = I; | ||
42 | -% %crop out a small subregion of I and Iblur | ||
43 | -% Iblur = Iblur(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1); | ||
44 | -% I = I(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1); | ||
45 | -% compute the gradient | ||
46 | -[Igrad_y, Igrad_x, Igrad_z] = gradient(Iblur); | ||
47 | - | ||
48 | -%calculate the gradient magnitude | ||
49 | -Imag = sqrt(Igrad_x .^ 2 + Igrad_y .^ 2 + Igrad_z .^2); | ||
50 | -Isize = size(I); | ||
51 | - | ||
52 | -% h = reshape(Imag, [X*Y*Z, 1]); | ||
53 | -% hist(h, 100); | ||
54 | - | ||
55 | -%set a threshold for the gradient magnitude | ||
56 | -It = Imag > t0; | ||
57 | -fidt = fopen('128-128-128/It.vol', 'w'); | ||
58 | -fwrite(fidt, It, 'single'); | ||
59 | -fclose(fidt); | ||
60 | -%Set the boundaries of the threshold image to zero | ||
61 | -It(1:rmax(1), :, :) = 0; | ||
62 | -It(X - rmax(1):X, :,:) = 0; | ||
63 | -It(:, 1:rmax(2), :) = 0; | ||
64 | -It(:, Y - rmax(2):Y,:) = 0; | ||
65 | -It(:, :, 1:rmax(3)) = 0; | ||
66 | -It(:,:, Z - rmax(3):Z) = 0; | ||
67 | - | ||
68 | -%get the indices of all of the nonzero values in the threshold image | ||
69 | -% (voter positions) | ||
70 | -[Itx,Ity,Itz] = ind2sub(size(It),find(It)); | ||
71 | -Vi =(find(It)); | ||
72 | -nV = nnz(It); | ||
73 | -% create a meshgrid describing coordinates relative to the voter position | ||
74 | -rangex = -rmax(1):rmax(1); %create an array of values between -rmax and rmax | ||
75 | -rangey = -rmax(2):rmax(2); | ||
76 | -rangez = -rmax(3):rmax(3); | ||
77 | -[mx, my, mz] = meshgrid(rangex, rangey, rangez); %create a template describing local pixel position in a small cube | ||
78 | -m_mag = (sqrt(mx.^2 + my.^2 + mz.^2)); %create a template describing the distance from the center of a small cube | ||
79 | - | ||
80 | -% create a mask for the voting area | ||
81 | -M_dist1 = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1 ; %mask for the voting area distance (all values < rmax from the center) | ||
82 | -M_dist2 = (mx.^2/rmin(1)^2 + my.^2/rmin(2)^2 + mz.^2/rmin(3)^2) >= 1 ; | ||
83 | -M_dist = M_dist1 .* M_dist2; | ||
84 | -% calculate the direction vector between a pixel and voter | ||
85 | -LV_x = mx./m_mag; | ||
86 | -LV_y = my./m_mag; | ||
87 | -LV_z = mz./m_mag; | ||
88 | - | ||
89 | -%number of pixels in the voting area of each voter (initialize to zero) | ||
90 | -validPixels = (zeros(nV,1)); | ||
91 | -%indices of pixels in the voting area of each voter | ||
92 | -% indices reference the 3D image | ||
93 | -g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang)); | ||
94 | - | ||
95 | - | ||
96 | -% vote | ||
97 | -tic; | ||
98 | -mask = zeros(Isize); | ||
99 | -mask1 = zeros(Isize); | ||
100 | - | ||
101 | -%for each iteration (in iterative voting) | ||
102 | -for itr = 1 :iter | ||
103 | - | ||
104 | - %initialize the vote image to zero | ||
105 | - Ivote = zeros(Isize); | ||
106 | - | ||
107 | - %for each voter (nonzero pixels in the threshold image It) | ||
108 | - for v = 1: nV | ||
109 | - %get the cartesian coordinates of the voter v in the main image I | ||
110 | - vx = Itx(v); | ||
111 | - vy = Ity(v); | ||
112 | - vz = Itz(v); | ||
113 | - vi = Vi(v); | ||
114 | - | ||
115 | - %retrieve the gradient magnitude at the voter position | ||
116 | - vmag = Imag(vi); | ||
117 | - | ||
118 | - %retrieve the gradient | ||
119 | - gx = Igrad_x(vi); | ||
120 | - gy = Igrad_y(vi); | ||
121 | - gz = Igrad_z(vi); | ||
122 | - | ||
123 | - %calculate the gradient magnitude | ||
124 | - dmag = sqrt (gx^2 + gy^2 + gz^2); | ||
125 | - | ||
126 | - %calculate the normalized gradient direction | ||
127 | - dx = gx / dmag; | ||
128 | - dy = gy / dmag; | ||
129 | - dz = gz / dmag; | ||
130 | - | ||
131 | - %calculate the angle between the voter direction and the pixel direction | ||
132 | - cos_diff = LV_x .* dx + LV_y .* dy + LV_z .* dz; | ||
133 | - | ||
134 | - %create an angular mask for the voting area | ||
135 | - M_angle = cos_diff >= cos(ang); | ||
136 | - | ||
137 | - %combine the two masks to mask out the voting angle | ||
138 | - M = M_angle.* M_dist; | ||
139 | - | ||
140 | - % get the coordinates of each pixel in the final voter mask M | ||
141 | - pi = find(M); | ||
142 | - | ||
143 | - %calculate the number of pixels in the voting region | ||
144 | - npts = nnz(M); | ||
145 | - validPixels(v) = npts; | ||
146 | - | ||
147 | - %convert every index in the voting area from a local 3D index to a global 3D index (into the original image I) | ||
148 | - global_px = vx + mx(pi); | ||
149 | - global_py = vy + my(pi); | ||
150 | - global_pz = vz + mz(pi); | ||
151 | - | ||
152 | - %convert the global 3D index of each point into a global 1D index | ||
153 | - global_pi = sub2ind(Isize, global_px, global_py, global_pz); | ||
154 | - | ||
155 | - g_v_prime (v, 1:npts) = global_pi; | ||
156 | - | ||
157 | - | ||
158 | - Ivote( global_pi ) = Ivote( global_pi ) + vmag; | ||
159 | -% if itr ==3 | ||
160 | -% if mod(v, 5000)==0 | ||
161 | -% mask(global_pi)= mask(global_pi) + 0.1; | ||
162 | -% mask (vi) = mask(vi) + 0.5; | ||
163 | -% end | ||
164 | -% end | ||
165 | -% if itr ==6 | ||
166 | -% if mod(v, 5000)==0 | ||
167 | -% mask1(global_pi)= mask1(global_pi) + 0.1; | ||
168 | -% mask1 (vi) = mask1(vi) + 0.5; | ||
169 | -% end | ||
170 | -% end | ||
171 | -% if itr==1 | ||
172 | -% for ix = -12:12 | ||
173 | -% for iy = -12:12 | ||
174 | -% for iz = -12:12 | ||
175 | -% mask(vx+ix, vy+iy, vz+iz) = M(ix+13,iy+13,iz+13)+ mask(vx+ix, vy+iy, vz+iz); | ||
176 | -% end | ||
177 | -% end | ||
178 | -% end | ||
179 | -% end | ||
180 | -% if itr==2 | ||
181 | -% for ix = -12:12 | ||
182 | -% for iy = -12:12 | ||
183 | -% for iz = -12:12 | ||
184 | -% mask1(vx+ix, vy+iy, vz+iz) = M(ix+13,iy+13,iz+13)+ mask1(vx+ix, vy+iy, vz+iz); | ||
185 | -% end | ||
186 | -% end | ||
187 | -% end | ||
188 | -% end | ||
189 | - | ||
190 | - | ||
191 | - end | ||
192 | -% fid = fopen(sprintf('128-128-128/nissl-vote%d',itr), 'w'); | ||
193 | -% fwrite(fid, single(Ivote), '*single'); | ||
194 | - if itr ==1 | ||
195 | - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w'); | ||
196 | - fwrite(fid, single(Ivote), '*single'); | ||
197 | - fclose(fid); | ||
198 | - end | ||
199 | - if itr ==8 | ||
200 | - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w'); | ||
201 | - fwrite(fid, single(Ivote), '*single'); | ||
202 | - fclose(fid); | ||
203 | - Ivote8 = (Ivote); | ||
204 | - end | ||
205 | - if itr ==9 | ||
206 | - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w'); | ||
207 | - fwrite(fid, single(Ivote), '*single'); | ||
208 | - fclose(fid); | ||
209 | - end | ||
210 | - if itr ==10 | ||
211 | - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w'); | ||
212 | - fwrite(fid, single(Ivote), '*single'); | ||
213 | - fclose(fid); | ||
214 | - end | ||
215 | - | ||
216 | -% Ivote1 = single(Ivote); | ||
217 | -% fwrite(fid, Ivote1, '*single'); | ||
218 | -% | ||
219 | -% | ||
220 | -% elseif itr ==2 | ||
221 | -% Ivote2 = single(Ivote); | ||
222 | -% fwrite(fid, Ivote2, '*single'); | ||
223 | -% | ||
224 | -% | ||
225 | -% elseif itr ==3 | ||
226 | -% Ivote3 = single(Ivote); | ||
227 | -% fwrite(fid, Ivote3, '*single'); | ||
228 | -% | ||
229 | -% | ||
230 | -% elseif itr ==4 | ||
231 | -% Ivote4 = single(Ivote); | ||
232 | -% fwrite(fid, Ivote4, '*single'); | ||
233 | -% | ||
234 | -% | ||
235 | -% elseif itr == 5 | ||
236 | -% Ivote5 = single(Ivote); | ||
237 | -% fwrite(fid, Ivote5, '*single'); | ||
238 | -% | ||
239 | -% | ||
240 | -% elseif itr == 6 | ||
241 | -% fwrite(fid, single(Ivote), '*single'); | ||
242 | -% elseif itr == 7 | ||
243 | -% fwrite(fid, single(Ivote), '*single'); | ||
244 | -% elseif itr == 8 | ||
245 | -% fwrite(fid, single(Ivote), '*single'); | ||
246 | -% elseif itr == 9 | ||
247 | -% fwrite(fid, single(Ivote), '*single'); | ||
248 | -% end | ||
249 | -% fclose(fid); | ||
250 | - t_v1 = toc; | ||
251 | - disp(['voting done. time =',num2str(t_v1)]); | ||
252 | - | ||
253 | - % update the voting direction | ||
254 | - if ang>=d_ang | ||
255 | - tic; | ||
256 | - Igrad_x = zeros(Isize); | ||
257 | - Igrad_y = zeros(Isize); | ||
258 | - Igrad_z = zeros(Isize); | ||
259 | - for v = 1: nV | ||
260 | - % coordinates of the current voter | ||
261 | - vx = Itx(v); | ||
262 | - vy = Ity(v); | ||
263 | - vz = Itz(v); | ||
264 | - | ||
265 | - %get the local value of the voting image | ||
266 | - local_Ivote = Ivote(g_v_prime(v,1:validPixels(v))); | ||
267 | - | ||
268 | - %find the index of the maximum value | ||
269 | - [~, local_max_idx] = max(local_Ivote); | ||
270 | - | ||
271 | - %convert this into a global subscript | ||
272 | - [g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx)); | ||
273 | - | ||
274 | - %compute the vector from the voter position to this position | ||
275 | - | ||
276 | - Igrad_x(vx, vy, vz) = g_px - vx ; | ||
277 | - Igrad_y(vx, vy, vz) = g_py - vy; | ||
278 | - Igrad_z(vx, vy, vz) = g_pz - vz; | ||
279 | -% if itr ==3 | ||
280 | -% if mod(v, 5000)==0 | ||
281 | -% mask(g_px, g_py, g_pz)= mask(g_px, g_py, g_pz) + 1; | ||
282 | -% end | ||
283 | -% end | ||
284 | -% if itr ==6 | ||
285 | -% if mod(v, 5000)==0 | ||
286 | -% mask1(g_px, g_py, g_pz)= mask1(g_px, g_py, g_pz) + 1; | ||
287 | -% end | ||
288 | -% end | ||
289 | - end | ||
290 | - | ||
291 | - | ||
292 | - tdir1 = toc; | ||
293 | - display (['updating dir done. time = ', num2str(tdir1)]); | ||
294 | - ang = ang - d_ang; | ||
295 | - end | ||
296 | - | ||
297 | - end | ||
298 | - | ||
299 | -hv = reshape(Ivote, [X*Y*Z, 1]); | ||
300 | -hist(hv, 250); | ||
301 | -%% | ||
302 | -t = 4300; | ||
303 | -conn = [5 5 5]; | ||
304 | -Icenter = local_max(Ivote, conn, t); | ||
305 | -fidc = fopen(sprintf('std3.2-r10.10-v8/out%d-t%d.vol',t,t0), 'w'); | ||
306 | -fwrite(fidc, single(Icenter), '*single'); | ||
307 | -fclose(fidc); | ||
308 | -nnz(Icenter) | ||
309 | -% [cxx1, cyy1, czz1] = ind2sub(size(Icenter),find(Icenter)); | ||
310 | - | ||
311 | -% % center = Ivote1; | ||
312 | -% % center(center<t) = 0; | ||
313 | -% % center = imregionalmax(center); | ||
314 | -% % cn = nnz(center); | ||
315 | -% % [cx, cy, cz] = ind2sub(size(center), find(center)); | ||
316 | -% % Icenter = zeros(size(center)); | ||
317 | -% % for cc =1:cn | ||
318 | -% % Icenter(cx(cc), cy(cc), cz(cc)) = 255; | ||
319 | -% % end | ||
320 | -% | ||
321 | -% % fid_Ic = fopen('image_center2-300.vol', 'w'); | ||
322 | -% % fwrite(fid_Ic, Icenter); | ||
323 | -% % fclose(fid_Ic); | ||
324 | -% cn = nnz(Icenter); | ||
325 | -% [cx, cy, cz] = ind2sub(size(Icenter), find(Icenter)); | ||
326 | -% Ic2d = zeros(size(Icenter,1), size(Icenter,2)); | ||
327 | -% for cc =1:cn | ||
328 | -% Ic2d(cx(cc), cy(cc)) = 1; | ||
329 | -% end | ||
330 | -% I2d = max(I, [], 3); | ||
331 | -% % figure(1),imagesc(I2d); colormap(gray); | ||
332 | -% % figure(2),imagesc(Ic2d); colormap(gray); | ||
333 | -% % | ||
334 | -% out1(:,:,1) = mat2gray(I2d); | ||
335 | -% out1(:,:,2) = mat2gray(Ic2d); | ||
336 | -% out1(:,:,3) = mat2gray(I2d); | ||
337 | -% figure(1), imagesc((out1)); | ||
338 | -%%% % imwrite(mat2gray(c2d), 'vote.bmp'); | ||
339 | -%% | ||
340 | -% figure(1); imagesc(squeeze(I(:,:,ceil(size(I,3)/2)))), colormap(gray); | ||
341 | -% figure(33); imagesc(squeeze(Ivote3(:,:,ceil(size(Ivote,3)/2)))), colormap(gray); |
Matlab_3D/validation.m
1 | + | ||
1 | clear all; | 2 | clear all; |
2 | disp('***************** NEW RUN *********************'); | 3 | disp('***************** NEW RUN *********************'); |
3 | X = 128; | 4 | X = 128; |
@@ -7,15 +8,15 @@ D = 10; | @@ -7,15 +8,15 @@ D = 10; | ||
7 | t0=1; | 8 | t0=1; |
8 | r1=10; | 9 | r1=10; |
9 | r2=10; | 10 | r2=10; |
10 | -t=2000; | 11 | +t=2100; |
11 | itr=8; | 12 | itr=8; |
12 | vote=10; | 13 | vote=10; |
13 | std = [5 5]; | 14 | std = [5 5]; |
14 | -gt_filename = '128-128-128/0-gt.vol'; | 15 | +gt_filename = '0-gt.vol'; |
15 | % out_filename = sprintf('128-128-128/0-nissl-std%d.%d-t0%d-r%d.%d-t%d-out%d.%d.vol',std(1), std(2),t0,r1,r2,t,itr,vote); | 16 | % out_filename = sprintf('128-128-128/0-nissl-std%d.%d-t0%d-r%d.%d-t%d-out%d.%d.vol',std(1), std(2),t0,r1,r2,t,itr,vote); |
16 | -out_filename = sprintf('D:/build/ivote3-bld/std5.5-r10.10-v8-phi15/out%d.vol',t); | 17 | +out_filename = sprintf('D:/build/ivote3-bld/shared2D-v8/out%d.vol',t); |
17 | % txt_filename = sprintf('128-128-128/0-validation-nissl-std%d.%d-r%d.%d-t%d-out%d.%d-D%d.txt',std(1), std(2),r1,r2,t,itr,vote,D); | 18 | % txt_filename = sprintf('128-128-128/0-validation-nissl-std%d.%d-r%d.%d-t%d-out%d.%d-D%d.txt',std(1), std(2),r1,r2,t,itr,vote,D); |
18 | -txt_filename = sprintf('D:/build/ivote3-bld/std5.5-r10.10-v8-phi15/t%d-D%d.txt',t,D); | 19 | +txt_filename = sprintf('D:/build/ivote3-bld/shared2D-v8/t%d-D%d.txt',t,D); |
19 | spec = sprintf('Nissl-std%d.%d-r%d.%d-t%d-out%d.%d',std(1), std(2),r1,r2,t,itr,vote); | 20 | spec = sprintf('Nissl-std%d.%d-r%d.%d-t%d-out%d.%d',std(1), std(2),r1,r2,t,itr,vote); |
20 | fid0 = fopen(gt_filename); | 21 | fid0 = fopen(gt_filename); |
21 | gt = fread(fid0,[X Y*Z], 'single'); | 22 | gt = fread(fid0,[X Y*Z], 'single'); |
1 | +#ifndef STIM_CUDA_cpyToshare_H | ||
2 | +#define STIM_CUDA_cpyToshare_H | ||
3 | + | ||
4 | + //this function copy one channel data from global to shared memory in one dimension with size of X bytes. | ||
5 | + template<typename T> | ||
6 | + __device__ void cpyG2S1D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y, | ||
7 | + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){ | ||
8 | + | ||
9 | + //calculate the total number of threads available | ||
10 | + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; | ||
11 | + | ||
12 | + //calculate the current 1D thread ID | ||
13 | + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; | ||
14 | + | ||
15 | + //calculate the number of iteration require for the copy | ||
16 | + unsigned int I = X/tThreads + 1; | ||
17 | + | ||
18 | + //the specified start position in global memory is (x, y, z) | ||
19 | + unsigned int gstart = z*I_x*I_y + y*I_x + x; | ||
20 | + | ||
21 | + for (unsigned int i = 0; i < I; i++){ | ||
22 | + | ||
23 | + //each iteration will copy tThreads elements, so the starting index in shared memory | ||
24 | + //for each iteration will be i * tThreads (iteration # times the number of elements transferred per iteration) | ||
25 | + unsigned int sIdx = i * tThreads + ti; | ||
26 | + if (sIdx>= X*Y) return; | ||
27 | + | ||
28 | + //each iteration will copy tThreads elements from the global index | ||
29 | + unsigned int gIdx = gstart + sIdx; | ||
30 | + | ||
31 | + //copy global to share | ||
32 | + dest[sIdx] = src[gIdx]; | ||
33 | + | ||
34 | + } | ||
35 | + } | ||
36 | + //this function copy one channel data from global to shared memory in two dimensions with size of X*Y bytes. | ||
37 | + template<typename T> | ||
38 | + __device__ void cpyG2S2D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y, | ||
39 | + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){ | ||
40 | + //calculate the total number of threads available | ||
41 | + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; | ||
42 | + | ||
43 | + //calculate the current 1D thread ID | ||
44 | + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; | ||
45 | + | ||
46 | + //calculate the number of iteration require for the copy | ||
47 | + unsigned int I = X*Y/tThreads + 1; | ||
48 | + | ||
49 | + unsigned int gz1 = z*I_x*I_y ; | ||
50 | + | ||
51 | + for (unsigned int i = 0; i < I; i++){ | ||
52 | + | ||
53 | + unsigned int sIdx = i * tThreads + ti; | ||
54 | + if (sIdx>= X*Y) return; | ||
55 | + | ||
56 | + unsigned int sy = sIdx/X; | ||
57 | + unsigned int sx = sIdx - (sy * X); | ||
58 | + | ||
59 | + unsigned int gx = x + sx; | ||
60 | + unsigned int gy = y + sy; | ||
61 | + if (gx<I_x && gy<I_y){ | ||
62 | + unsigned int gIdx = gz1 + gy * I_x + gx; | ||
63 | + //copy global to share | ||
64 | + dest[sIdx] = src[gIdx]; | ||
65 | + } | ||
66 | + | ||
67 | + } | ||
68 | + } | ||
69 | + //this function copy three channels data from global to shared memory in one dimension with size of X bytes. | ||
70 | + template<typename T> | ||
71 | + __device__ void cpyG2S1D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y, | ||
72 | + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){ | ||
73 | + | ||
74 | + //calculate the total number of threads available | ||
75 | + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; | ||
76 | + | ||
77 | + //calculate the current 1D thread ID | ||
78 | + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; | ||
79 | + | ||
80 | + //calculate the number of iteration require for the copy | ||
81 | + unsigned int I = X/tThreads + 1; | ||
82 | + | ||
83 | + //the specified start position in global memory is (x, y, z) | ||
84 | + unsigned int gstart = z*I_x*I_y + y*I_x + x; | ||
85 | + | ||
86 | + for (unsigned int i = 0; i < I; i++){ | ||
87 | + | ||
88 | + //each iteration will copy tThreads elements, so the starting index in shared memory | ||
89 | + //for each iteration will be i * tThreads (iteration # times the number of elements transferred per iteration) | ||
90 | + unsigned int sIdx = i * tThreads + ti; | ||
91 | + if (sIdx>= X*Y) return; | ||
92 | + unsigned int gIdx = gstart*3 + sIdx; | ||
93 | + //copy global to share | ||
94 | + dest[sIdx] = src[gIdx]; | ||
95 | + | ||
96 | + } | ||
97 | + } | ||
98 | + //this function copy three channels data from global to shared memory in two dimensions with size of X*Y bytes. | ||
99 | + template<typename T> | ||
100 | + __device__ void cpyG2S2D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y, | ||
101 | + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){ | ||
102 | + //calculate the total number of threads available | ||
103 | + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; | ||
104 | + | ||
105 | + //calculate the current 1D thread ID | ||
106 | + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; | ||
107 | + | ||
108 | + //calculate the number of iteration require for the copy | ||
109 | + unsigned int I = X*Y/tThreads + 1; | ||
110 | + | ||
111 | + unsigned int gz1 = z*I_x*I_y ; | ||
112 | + | ||
113 | + for (unsigned int i = 0; i < I; i++){ | ||
114 | + | ||
115 | + unsigned int sIdx = i * tThreads + ti; | ||
116 | + if (sIdx>= X*Y) return; | ||
117 | + unsigned int sy = sIdx/X; | ||
118 | + unsigned int sx = sIdx - (sy * X); | ||
119 | + | ||
120 | + unsigned int gx = x + sx/3; | ||
121 | + unsigned int gy = y + sy; | ||
122 | + if (gx<I_x && gy<I_y){ | ||
123 | + unsigned int gIdx = (gz1 + gy * I_x + gx)*3 + (sx%3); | ||
124 | + //copy global to share | ||
125 | + dest[sIdx] = src[gIdx]; | ||
126 | + } | ||
127 | + } | ||
128 | + } | ||
129 | + // this function compute the gradient magnitude saved in the shared memory and stores the magnitude result in the rest of shared memory. | ||
130 | + template<typename T> | ||
131 | + __device__ void mag_share2D(T* grad, unsigned int bs, unsigned int X, unsigned int Y, dim3 threadIdx, dim3 blockDim){ | ||
132 | + | ||
133 | + //calculate the total number of threads available | ||
134 | + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; | ||
135 | + //calculate the current 1D thread ID | ||
136 | + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; | ||
137 | + //calculate the number of iteration require for the copy | ||
138 | + unsigned int I = X*Y/tThreads + 1; | ||
139 | + for (unsigned int i = 0; i < I; i++){ | ||
140 | + | ||
141 | + unsigned int sIdx = i * tThreads + ti; | ||
142 | + if (sIdx>= X*Y) return; | ||
143 | + float gx = grad[sIdx*3]; | ||
144 | + float gy = grad[sIdx*3 + 1]; | ||
145 | + float gz = grad[sIdx*3 + 2]; | ||
146 | + float mag = sqrt(gx*gx + gy*gy + gz*gz); | ||
147 | + grad[bs + sIdx] = mag; | ||
148 | + | ||
149 | + } | ||
150 | + } | ||
151 | +#endif | ||
0 | \ No newline at end of file | 152 | \ No newline at end of file |
cpp/float_to_half.cuh deleted
1 | -#ifndef STIM_CUDA_FLOAT_TO_HALF_H | ||
2 | -#define STIM_CUDA_FLOAT_TO_HALF_H | ||
3 | - | ||
4 | -#include <iostream> | ||
5 | -#include <cuda.h> | ||
6 | -#include <stim/cuda/cudatools.h> | ||
7 | -#include <stim/cuda/sharedmem.cuh> | ||
8 | -#include <stim/cuda/cudatools/error.h> | ||
9 | -#include <cuda_fp16.h> | ||
10 | -#include <stdio.h> | ||
11 | - __global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){ | ||
12 | - | ||
13 | - | ||
14 | - //calculate x,y,z coordinates for this thread | ||
15 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
16 | - //find the grid size along y | ||
17 | - int grid_y = y / blockDim.y; | ||
18 | - int blockidx_y = blockIdx.y % grid_y; | ||
19 | - int yi = blockidx_y * blockDim.y + threadIdx.y; | ||
20 | - int zi = blockIdx.y / grid_y; | ||
21 | - int i = zi * x * y + yi * x + xi; | ||
22 | - | ||
23 | - if(xi >= x|| yi >= y || zi>= z) return; | ||
24 | - | ||
25 | - | ||
26 | - gpu_half[i] = __float2half(gpu_float[i]); | ||
27 | - | ||
28 | - | ||
29 | - } | ||
30 | - | ||
31 | - | ||
32 | - void gpu_f2h(half* gpu_half, float* gpu_float, unsigned int x, unsigned int y, unsigned int z){ | ||
33 | - | ||
34 | - | ||
35 | - int max_threads = stim::maxThreadsPerBlock(); | ||
36 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | ||
37 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | ||
38 | - | ||
39 | - //call the GPU kernel to determine the gradient | ||
40 | - cuda_f2h <<< blocks, threads >>>(gpu_half, gpu_float, x, y, z); | ||
41 | - | ||
42 | - } | ||
43 | - | ||
44 | - | ||
45 | - | ||
46 | - void cpu_f2h(half* h_out, float* f_in, unsigned int x, unsigned int y, unsigned int z){ | ||
47 | - | ||
48 | - //calculate the number of pixels in the array | ||
49 | - unsigned int pix = x* y* z; | ||
50 | - | ||
51 | - //allocate memory on the GPU for the input float precision. | ||
52 | - float* gpu_float; | ||
53 | - cudaMalloc(&gpu_float, pix * sizeof(float)); | ||
54 | - cudaMemcpy(gpu_float, f_in, pix * sizeof(float), cudaMemcpyHostToDevice); | ||
55 | - | ||
56 | - //allocate memory on the GPU for the output half precision | ||
57 | - half* gpu_half; | ||
58 | - cudaMalloc(&gpu_half, pix * sizeof(half)); | ||
59 | - | ||
60 | - //call the GPU version of this function | ||
61 | - gpu_f2h(gpu_half, gpu_float, x, y, z); | ||
62 | - | ||
63 | - //copy the array back to the CPU | ||
64 | - cudaMemcpy(h_out, gpu_half, pix * sizeof(half), cudaMemcpyDeviceToHost); | ||
65 | - | ||
66 | - //free allocated memory | ||
67 | - cudaFree(gpu_float); | ||
68 | - cudaFree(gpu_half); | ||
69 | - | ||
70 | - } | ||
71 | - | ||
72 | - | ||
73 | -#endif | ||
74 | \ No newline at end of file | 0 | \ No newline at end of file |
cpp/half_to_float.cuh deleted
1 | -#ifndef STIM_CUDA_HALF_TO_FLOAT_H | ||
2 | -#define STIM_CUDA_HALF_TO_FLOAT_H | ||
3 | - | ||
4 | -#include <iostream> | ||
5 | -#include <cuda.h> | ||
6 | -#include <stim/cuda/cudatools.h> | ||
7 | -#include <stim/cuda/sharedmem.cuh> | ||
8 | -#include <stim/cuda/cudatools/error.h> | ||
9 | -#include "cuda_fp16.h" | ||
10 | - | ||
11 | - __global__ void cuda_h2f(float* gpu_float, half* gpu_half, int x, int y, int z){ | ||
12 | - | ||
13 | - | ||
14 | - //calculate x,y,z coordinates for this thread | ||
15 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
16 | - //find the grid size along y | ||
17 | - int grid_y = y / blockDim.y; | ||
18 | - int blockidx_y = blockIdx.y % grid_y; | ||
19 | - int yi = blockidx_y * blockDim.y + threadIdx.y; | ||
20 | - int zi = blockIdx.y / grid_y; | ||
21 | - int i = zi * x * y + yi * x + xi; | ||
22 | - | ||
23 | - if(xi >= x|| yi >= y || zi>= z) return; | ||
24 | - | ||
25 | - | ||
26 | - gpu_float[i] = __half2float(gpu_half[i]); | ||
27 | - | ||
28 | - } | ||
29 | - | ||
30 | - | ||
31 | - void gpu_h2f(float* gpu_float, half* gpu_half, unsigned int x, unsigned int y, unsigned int z){ | ||
32 | - | ||
33 | - | ||
34 | - int max_threads = stim::maxThreadsPerBlock(); | ||
35 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | ||
36 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | ||
37 | - | ||
38 | - //call the GPU kernel to determine the gradient | ||
39 | - cuda_h2f <<< blocks, threads >>>(gpu_float, gpu_half, x, y, z); | ||
40 | - | ||
41 | - } | ||
42 | - | ||
43 | - | ||
44 | - | ||
45 | - void cpu_f2h(float* f_out, half* h_in, unsigned int x, unsigned int y, unsigned int z){ | ||
46 | - | ||
47 | - //calculate the number of pixels in the array | ||
48 | - unsigned int pix = x* y* z; | ||
49 | - | ||
50 | - //allocate memory on the GPU for the input half precision | ||
51 | - half* gpu_half; | ||
52 | - cudaMalloc(&gpu_half, pix * sizeof(half)); | ||
53 | - cudaMemcpy(gpu_half, h_in, pix * sizeof(half), cudaMemcpyHostToDevice); | ||
54 | - | ||
55 | - //allocate memory on the GPU for the output float precision. | ||
56 | - float* gpu_float; | ||
57 | - cudaMalloc(&gpu_float, pix * sizeof(float)); | ||
58 | - | ||
59 | - | ||
60 | - | ||
61 | - //call the GPU version of this function | ||
62 | - gpu_h2f(gpu_float, gpu_half, x, y, z); | ||
63 | - | ||
64 | - cudaMemcpy(f_out, gpu_float, pix * sizeof(float), cudaMemcpyDeviceToHost); | ||
65 | - | ||
66 | - | ||
67 | - | ||
68 | - //free allocated memory | ||
69 | - cudaFree(gpu_float); | ||
70 | - cudaFree(gpu_half); | ||
71 | - | ||
72 | - } | ||
73 | - | ||
74 | - | ||
75 | -#endif | ||
76 | \ No newline at end of file | 0 | \ No newline at end of file |
cpp/main.cpp
@@ -37,10 +37,9 @@ int main(int argc, char** argv){ | @@ -37,10 +37,9 @@ int main(int argc, char** argv){ | ||
37 | printf("current device ID: %d\n", i); | 37 | printf("current device ID: %d\n", i); |
38 | printf("device name: %s\n", prop.name); | 38 | printf("device name: %s\n", prop.name); |
39 | printf("total global mem: %lu\n", prop.totalGlobalMem); | 39 | printf("total global mem: %lu\n", prop.totalGlobalMem); |
40 | + printf("shared memory per block: %lu\n", prop.sharedMemPerBlock); | ||
40 | } | 41 | } |
41 | - | ||
42 | - | ||
43 | - | 42 | + |
44 | //output advertisement | 43 | //output advertisement |
45 | std::cout<<std::endl<<std::endl; | 44 | std::cout<<std::endl<<std::endl; |
46 | std::cout<<"========================================================================="<<std::endl; | 45 | std::cout<<"========================================================================="<<std::endl; |
@@ -124,18 +123,18 @@ int main(int argc, char** argv){ | @@ -124,18 +123,18 @@ int main(int argc, char** argv){ | ||
124 | invert_data(cpuI, x, y, z); | 123 | invert_data(cpuI, x, y, z); |
125 | 124 | ||
126 | //write a new file from the cpuI. | 125 | //write a new file from the cpuI. |
127 | - std::ofstream original("std5.5-r10.10-v8/inv-128.vol", std::ofstream::out | std::ofstream::binary); | 126 | + std::ofstream original("shared2D-v8/inv-128.vol", std::ofstream::out | std::ofstream::binary); |
128 | original.write((char*)cpuI, bytes); | 127 | original.write((char*)cpuI, bytes); |
129 | original.close(); | 128 | original.close(); |
130 | 129 | ||
131 | //allocate space on the cpu for the output result | 130 | //allocate space on the cpu for the output result |
132 | - float* cpu_out = (float*) malloc(bytes*3); | 131 | + float* cpu_out = (float*) malloc(bytes); |
133 | 132 | ||
134 | // call the ivote function | 133 | // call the ivote function |
135 | ivote3(cpu_out, cpuI, sigma, anisotropy, phi, d_phi, r, iter, t, conn, x, y, z); | 134 | ivote3(cpu_out, cpuI, sigma, anisotropy, phi, d_phi, r, iter, t, conn, x, y, z); |
136 | 135 | ||
137 | //write the blurred file from the cpuI. | 136 | //write the blurred file from the cpuI. |
138 | - std::ofstream fblur("vote-check/vote8.vol", std::ofstream::out | std::ofstream::binary); | 137 | + std::ofstream fblur("shared2D-v8/vote8.vol", std::ofstream::out | std::ofstream::binary); |
139 | fblur.write((char*)cpuI, bytes); | 138 | fblur.write((char*)cpuI, bytes); |
140 | fblur.close(); | 139 | fblur.close(); |
141 | /* | 140 | /* |
@@ -146,13 +145,13 @@ int main(int argc, char** argv){ | @@ -146,13 +145,13 @@ int main(int argc, char** argv){ | ||
146 | fgx.close(); | 145 | fgx.close(); |
147 | */ | 146 | */ |
148 | //write the output file. | 147 | //write the output file. |
149 | - std::ofstream fo("std5.5-r10.10-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary); | 148 | + std::ofstream fo("shared2D-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary); |
150 | fo.write((char*)cpu_out, bytes); | 149 | fo.write((char*)cpu_out, bytes); |
151 | fo.close(); | 150 | fo.close(); |
152 | 151 | ||
153 | // creat a file for saving the list centers | 152 | // creat a file for saving the list centers |
154 | 153 | ||
155 | - std::ofstream list("std5.5-r10.10-v8/center.txt"); | 154 | + std::ofstream list("shared2D-v8/center.txt"); |
156 | if (list.is_open()){ | 155 | if (list.is_open()){ |
157 | 156 | ||
158 | for (int ix=0; ix<x; ix++){ | 157 | for (int ix=0; ix<x; ix++){ |
cpp/set_rmax.cuh deleted
1 | -#ifndef STIM_CUDA_SET_RMAX_H | ||
2 | -#define STIM_CUDA_SET_RMAX_H | ||
3 | - | ||
4 | -#include <iostream> | ||
5 | -#include <cuda.h> | ||
6 | -#include <stim/cuda/cudatools.h> | ||
7 | -#include <stim/cuda/sharedmem.cuh> | ||
8 | -#include <stim/cuda/cudatools/error.h> | ||
9 | - | ||
10 | -template<typename T> | ||
11 | - __global__ void cuda_set_rmax(T* gpu_r, int rx, int ry, int rz, int x, int y, int z){ | ||
12 | - | ||
13 | - //calculate x,y,z coordinates for this thread | ||
14 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
15 | - //find the grid size along y | ||
16 | - int grid_y = y / blockDim.y; | ||
17 | - int blockidx_y = blockIdx.y % grid_y; | ||
18 | - int yi = blockidx_y * blockDim.y + threadIdx.y; | ||
19 | - int zi = blockIdx.y / grid_y; | ||
20 | - int i = zi * x * y + yi * x + xi; | ||
21 | - | ||
22 | - if(xi>=x || yi>=y || zi>=z) return; | ||
23 | - | ||
24 | - gpu_r[i*3+0] = rx; | ||
25 | - gpu_r[i*3+1] = ry; | ||
26 | - gpu_r[i*3+2] = rz; | ||
27 | - | ||
28 | - } | ||
29 | - | ||
30 | -template<typename T> | ||
31 | - void gpu_set_rmax(T* gpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ | ||
32 | - | ||
33 | - | ||
34 | - unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
35 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | ||
36 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | ||
37 | - | ||
38 | - //call the kernel to do the voting | ||
39 | - cuda_set_rmax <T> <<< blocks, threads >>>(gpu_r, r[0], r[1], r[2], x , y, z); | ||
40 | - | ||
41 | - } | ||
42 | -template<typename T> | ||
43 | - void cpu_set_rmax(T* cpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ | ||
44 | - | ||
45 | - //calculate the number of bytes in the array | ||
46 | - unsigned int bytes = x * y * z * sizeof(T); | ||
47 | - | ||
48 | - //allocate space on the GPU for the rmax | ||
49 | - T* gpu_r; | ||
50 | - cudaMalloc(&gpu_vote, bytes*3); | ||
51 | - | ||
52 | - cudaMemcpy(gpu_r, cpu_r, bytes*3, cudaMemcpyHostToDevice); | ||
53 | - | ||
54 | - | ||
55 | - //call the GPU version of the vote calculation function | ||
56 | - gpu_set_rmax<T>(gpu_r, r, x , y, z); | ||
57 | - | ||
58 | - //copy the Vote Data back to the CPU | ||
59 | - cudaMemcpy(cpu_r, gpu_r, bytes*3, cudaMemcpyDeviceToHost) ; | ||
60 | - | ||
61 | - //free allocated memory | ||
62 | - cudaFree(gpu_r); | ||
63 | - | ||
64 | - } | ||
65 | - | ||
66 | - | ||
67 | -#endif | ||
68 | \ No newline at end of file | 0 | \ No newline at end of file |
cpp/update_dir3.cuh
@@ -6,11 +6,12 @@ | @@ -6,11 +6,12 @@ | ||
6 | #include <stim/cuda/cudatools.h> | 6 | #include <stim/cuda/cudatools.h> |
7 | #include <stim/cuda/sharedmem.cuh> | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | #include <cuda_fp16.h> | 8 | #include <cuda_fp16.h> |
9 | +#include "cpyToshare.cuh" | ||
9 | 10 | ||
10 | // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. | 11 | // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. |
11 | template<typename T> | 12 | template<typename T> |
12 | __global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ | 13 | __global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ |
13 | - | 14 | + extern __shared__ float s_vote[]; |
14 | //calculate x,y,z coordinates for this thread | 15 | //calculate x,y,z coordinates for this thread |
15 | int xi = blockIdx.x * blockDim.x + threadIdx.x; | 16 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
16 | //find the grid size along y | 17 | //find the grid size along y |
@@ -18,10 +19,21 @@ | @@ -18,10 +19,21 @@ | ||
18 | int blockidx_y = blockIdx.y % grid_y; | 19 | int blockidx_y = blockIdx.y % grid_y; |
19 | int yi = blockidx_y * blockDim.y + threadIdx.y; | 20 | int yi = blockidx_y * blockDim.y + threadIdx.y; |
20 | int zi = blockIdx.y / grid_y; | 21 | int zi = blockIdx.y / grid_y; |
22 | + //compute the global 1D index for this pixel | ||
21 | int i = zi * x * y + yi * x + xi; | 23 | int i = zi * x * y + yi * x + xi; |
22 | 24 | ||
23 | if(xi >= x|| yi >= y || zi>= z) return; | 25 | if(xi >= x|| yi >= y || zi>= z) return; |
24 | - | 26 | + // find the starting points for this block along the x and y directions |
27 | + int bxi = blockIdx.x * blockDim.x; | ||
28 | + int byi = blockidx_y * blockDim.y; | ||
29 | + //find the starting points and the size of the window, which will be copied to the 2D-shared memory | ||
30 | + int bxs = bxi - rx; | ||
31 | + int bys = byi - ry; | ||
32 | + int xwidth = 2 * rx + blockDim.x; | ||
33 | + int ywidth = 2 * ry + blockDim.y; | ||
34 | + //compute the coordinations of this pixel in the 2D-shared memory. | ||
35 | + int sx_rx = threadIdx.x + rx; | ||
36 | + int sy_ry = threadIdx.y + ry; | ||
25 | //find the gradient values along the x, y ,z axis, and the gradient magnitude for the voter | 37 | //find the gradient values along the x, y ,z axis, and the gradient magnitude for the voter |
26 | float g_v_x = gpu_grad[i * 3 + 0]; | 38 | float g_v_x = gpu_grad[i * 3 + 0]; |
27 | float g_v_y = gpu_grad[i * 3 + 1]; | 39 | float g_v_y = gpu_grad[i * 3 + 1]; |
@@ -42,39 +54,48 @@ | @@ -42,39 +54,48 @@ | ||
42 | int rz_sq = rz * rz; | 54 | int rz_sq = rz * rz; |
43 | 55 | ||
44 | for (int z_p = -rz; z_p<=rz; z_p++){ | 56 | for (int z_p = -rz; z_p<=rz; z_p++){ |
45 | - | ||
46 | - for(int y_p = -ry; y_p <= ry; y_p++){ | ||
47 | - | ||
48 | - for(int x_p = -rx; x_p <= rx; x_p++){ | ||
49 | - | ||
50 | - //calculate the x, y ,z indices for the current pixel | ||
51 | - int xi_p = (xi + x_p) ; | 57 | + int zi_p = zi + z_p; |
58 | + if ((zi_p) >=0 && (zi_p) < z){ | ||
59 | + //call the function to copy one slide of vote date to the 2D-shared memory. | ||
60 | + __syncthreads(); | ||
61 | + cpyG2S2D<float>(s_vote, gpu_vote, bxs, bys, zi + z_p, xwidth, ywidth, threadIdx, blockDim, x, y); | ||
62 | + __syncthreads(); | ||
63 | + float z_sq = z_p * z_p; | ||
64 | + float d_z_sq = (z_sq)/rz_sq; | ||
65 | + for(int y_p = -ry; y_p <= ry; y_p++){ | ||
66 | + | ||
67 | + float y_sq = y_p * y_p; | ||
68 | + float yz_sq = y_sq + z_sq; | ||
52 | int yi_p = (yi + y_p) ; | 69 | int yi_p = (yi + y_p) ; |
53 | - int zi_p = (zi + z_p) ; | ||
54 | - if (zi_p >=0 && zi_p < z && yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){ | 70 | + float d_yz_sq = (y_sq)/ry_sq + d_z_sq; |
71 | + unsigned int s_y1d = (sy_ry + y_p) * xwidth; | ||
72 | + for(int x_p = -rx; x_p <= rx; x_p++){ | ||
55 | 73 | ||
56 | - //calculate the distance between the pixel and the current voter. | ||
57 | - float x_sq = x_p * x_p; | ||
58 | - float y_sq = y_p * y_p; | ||
59 | - float z_sq = z_p * z_p; | ||
60 | - float d_pv = sqrt(x_sq + y_sq + z_sq); | 74 | + //check if the current pixel is inside of the data-set. |
75 | + int xi_p = (xi + x_p) ; | ||
76 | + if (yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){ | ||
61 | 77 | ||
62 | - // calculate the angle between the pixel and the current voter. | ||
63 | - float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v); | 78 | + //calculate the distance between the pixel and the current voter. |
79 | + float x_sq = x_p * x_p; | ||
80 | + float d_pv = sqrt(x_sq + yz_sq); | ||
64 | 81 | ||
65 | - if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ | 82 | + // calculate the angle between the pixel and the current voter. |
83 | + float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v); | ||
66 | 84 | ||
67 | - //calculate the 1D index for the current pixel | ||
68 | - unsigned int id_p = (zi_p) * x * y + (yi_p) * x + (xi_p); | ||
69 | - l_vote = gpu_vote[id_p]; | ||
70 | - | ||
71 | - // compare the vote value of this pixel with the max value to find the maxima and its index. | ||
72 | - if (l_vote>max) { | 85 | + if ((((x_sq)/rx_sq + d_yz_sq )<= 1) && (cos_diff >= cos_phi)){ |
73 | 86 | ||
74 | - max = l_vote; | ||
75 | - id_x = x_p; | ||
76 | - id_y = y_p; | ||
77 | - id_z = z_p; | 87 | + //calculate the 1D index for the current pixel in the 2D-shared memory |
88 | + unsigned int id_s = s_y1d + (sx_rx + x_p); | ||
89 | + l_vote = s_vote[id_s]; | ||
90 | + | ||
91 | + // compare the vote value of this pixel with the max value to find the maxima and its index. | ||
92 | + if (l_vote>max) { | ||
93 | + | ||
94 | + max = l_vote; | ||
95 | + id_x = x_p; | ||
96 | + id_y = y_p; | ||
97 | + id_z = z_p; | ||
98 | + } | ||
78 | } | 99 | } |
79 | } | 100 | } |
80 | } | 101 | } |
@@ -115,13 +136,13 @@ | @@ -115,13 +136,13 @@ | ||
115 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 136 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
116 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); | 137 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); |
117 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | 138 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); |
118 | - | 139 | + unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*sizeof(T); |
119 | // allocate space on the GPU for the updated vote direction | 140 | // allocate space on the GPU for the updated vote direction |
120 | T* gpu_dir; | 141 | T* gpu_dir; |
121 | cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3); | 142 | cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3); |
122 | 143 | ||
123 | //call the kernel to calculate the new voting direction | 144 | //call the kernel to calculate the new voting direction |
124 | - update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z); | 145 | + update_dir3 <<< blocks, threads, shared_bytes >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z); |
125 | 146 | ||
126 | 147 | ||
127 | //call the kernel to update the gradient direction | 148 | //call the kernel to update the gradient direction |
cpp/vote3.cuh
@@ -6,12 +6,13 @@ | @@ -6,12 +6,13 @@ | ||
6 | #include <stim/cuda/cudatools.h> | 6 | #include <stim/cuda/cudatools.h> |
7 | #include <stim/cuda/sharedmem.cuh> | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | #include <stim/cuda/cudatools/error.h> | 8 | #include <stim/cuda/cudatools/error.h> |
9 | - | 9 | +#include "cpyToshare.cuh" |
10 | 10 | ||
11 | // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | 11 | // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area |
12 | template<typename T> | 12 | template<typename T> |
13 | __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ | 13 | __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ |
14 | 14 | ||
15 | + extern __shared__ float s[]; | ||
15 | //calculate x,y,z coordinates for this thread | 16 | //calculate x,y,z coordinates for this thread |
16 | int xi = blockIdx.x * blockDim.x + threadIdx.x; | 17 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
17 | //find the grid size along y | 18 | //find the grid size along y |
@@ -23,6 +24,17 @@ | @@ -23,6 +24,17 @@ | ||
23 | 24 | ||
24 | if(xi>=x || yi>=y || zi>=z) return; | 25 | if(xi>=x || yi>=y || zi>=z) return; |
25 | 26 | ||
27 | + //find the starting points and the size of the window, which will be copied to the 2D-shared memory | ||
28 | + int bxs = blockIdx.x * blockDim.x - rx; | ||
29 | + int bys = blockidx_y * blockDim.y - ry; | ||
30 | + int xwidth = 2 * rx + blockDim.x; | ||
31 | + int ywidth = 2 * ry + blockDim.y; | ||
32 | + //calculate the starting point of shared memory for storing the magnitude. | ||
33 | + unsigned int b_s = 3 * xwidth * ywidth; | ||
34 | + //compute the coordinations of this pixel in the 2D-shared memory. | ||
35 | + int sx_rx = threadIdx.x + rx; | ||
36 | + int sy_ry = threadIdx.y + ry; | ||
37 | + | ||
26 | // define a local variable to sum the votes from the voters | 38 | // define a local variable to sum the votes from the voters |
27 | float sum = 0; | 39 | float sum = 0; |
28 | 40 | ||
@@ -31,43 +43,58 @@ | @@ -31,43 +43,58 @@ | ||
31 | int rz_sq = rz * rz; | 43 | int rz_sq = rz * rz; |
32 | 44 | ||
33 | for (int z_v = -rz; z_v<=rz; z_v++){ | 45 | for (int z_v = -rz; z_v<=rz; z_v++){ |
34 | - | ||
35 | - for(int y_v = -ry; y_v <= ry; y_v++){ | ||
36 | - | ||
37 | - for(int x_v = -rx; x_v <= rx; x_v++){ | ||
38 | - | ||
39 | - //calculate the x, y ,z indices for the current voter | ||
40 | - int xi_v = (xi + x_v) ; | 46 | + int zi_v = zi + z_v; |
47 | + if ((zi_v) >=0 && (zi_v) <z){ | ||
48 | + //call the function to copy one slice of the gradient from global to the 2D-shared memory. | |
49 | + __syncthreads(); | ||
50 | + cpyG2S2D3ch<float>(s, gpu_grad, bxs, bys, zi + z_v, 3*xwidth, ywidth, threadIdx, blockDim, x, y); | ||
51 | + __syncthreads(); | ||
52 | + mag_share2D<float>(s, b_s, xwidth, ywidth, threadIdx, blockDim); | ||
53 | + __syncthreads(); | ||
54 | + float z_sq = z_v * z_v; | ||
55 | + float d_z_sq = z_sq/rz_sq; | ||
56 | + | ||
57 | + for(int y_v = -ry; y_v <= ry; y_v++){ | ||
41 | int yi_v = (yi + y_v) ; | 58 | int yi_v = (yi + y_v) ; |
42 | - int zi_v = (zi + z_v) ; | ||
43 | - if (zi_v >=0 && zi_v < z && yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){ | ||
44 | - | ||
45 | - //calculate the 1D index for the current voter | ||
46 | - unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v); | 59 | + //compute the position of the current voter in the shared memory along the y axis. |
60 | + unsigned int sIdx_y1d = (sy_ry + y_v)* xwidth; | ||
61 | + | ||
62 | + float y_sq = y_v * y_v; | ||
63 | + float yz_sq = z_sq + y_sq; | ||
64 | + float d_yz_sq = y_sq/ry_sq + d_z_sq; | ||
65 | + for(int x_v = -rx; x_v <= rx; x_v++){ | ||
66 | + | ||
67 | + //check if the current voter is inside of the data-set | ||
68 | + int xi_v = (xi + x_v) ; | ||
69 | + if (yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){ | ||
70 | + | ||
71 | + //compute the position of the current voter in the 2D-shared memory along the x axis. | ||
72 | + unsigned int sIdx_x = (sx_rx + x_v); | ||
73 | + //find the 1D index of this voter in the 2D-shared memory. | ||
74 | + unsigned int s_Idx = (sIdx_y1d + sIdx_x); | ||
75 | + unsigned int s_Idx3 = s_Idx * 3; | ||
76 | + | ||
77 | + //save the gradient values for the current voter to the local variables and compute the gradient magnitude. | ||
78 | + float g_v_x = s[s_Idx3]; | ||
79 | + float g_v_y = s[s_Idx3 + 1]; | ||
80 | + float g_v_z = s[s_Idx3 + 2]; | ||
81 | + float mag_v = s[b_s + s_Idx]; //sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); | ||
82 | + | ||
83 | + //calculate the distance between the pixel and the current voter. | ||
84 | + float x_sq = x_v * x_v; | ||
85 | + float d_pv = sqrt(x_sq + yz_sq); | ||
47 | 86 | ||
48 | - //find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter | ||
49 | - | ||
50 | - float g_v_x = gpu_grad[id_v * 3 + 0]; | ||
51 | - float g_v_y = gpu_grad[id_v * 3 + 1]; | ||
52 | - float g_v_z = gpu_grad[id_v * 3 + 2]; | ||
53 | - float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); | ||
54 | - | ||
55 | - //calculate the distance between the pixel and the current voter. | ||
56 | - float x_sq = x_v * x_v; | ||
57 | - float y_sq = y_v * y_v; | ||
58 | - float z_sq = z_v * z_v; | ||
59 | - float d_pv = sqrt(x_sq + y_sq + z_sq); | ||
60 | - | ||
61 | - // calculate the angle between the pixel and the current voter. | ||
62 | - float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v); | 87 | + // calculate the angle between the pixel and the current voter. |
88 | + float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v); | ||
63 | 89 | ||
64 | - // check if the current voter is located in the voting area of this pixel. | ||
65 | - if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ | 90 | + // check if the current voter is located in the voting area of this pixel. |
91 | + if ((((x_sq)/rx_sq + d_yz_sq)<= 1) && (cos_diff >= cos_phi)){ | ||
66 | 92 | ||
67 | - sum += mag_v; | 93 | + sum += mag_v; |
94 | + } | ||
68 | } | 95 | } |
69 | - } | ||
70 | - } | 96 | + } |
97 | + } | ||
71 | } | 98 | } |
72 | } | 99 | } |
73 | 100 | ||
@@ -81,9 +108,9 @@ | @@ -81,9 +108,9 @@ | ||
81 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 108 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
82 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); | 109 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); |
83 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | 110 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); |
84 | - | 111 | + unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T); |
85 | //call the kernel to do the voting | 112 | //call the kernel to do the voting |
86 | - vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); | 113 | + vote3 <T> <<< blocks, threads, shared_bytes >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); |
87 | 114 | ||
88 | } | 115 | } |
89 | 116 |