Commit 89604e92d1faa3934a762e9cf42e61f5644c78bd
1 parent
3b5043cd
ivote3 run on the shared memory
Showing
11 changed files
with
328 additions
and
694 deletions
Show diff stats
No preview for this file type
Matlab_3D/ivote3.m
1 | -clc; | |
1 | + | |
2 | 2 | clear; |
3 | 3 | disp('***************** NEW RUN *********************'); |
4 | 4 | total = tic; |
5 | 5 | % ******* Initialize voting parameters ************************************** |
6 | -rmax = [16 16 8]; %maximum radius of the cell | |
7 | -ang_deg = 20.1; %half the angular range of the voting area | |
6 | +rmax = [10 10 10]; %maximum radius of the cell | |
7 | +rmin = [1 1 1]; | |
8 | +ang_deg = 25.1; %half the angular range of the voting area | |
8 | 9 | ang = ang_deg * pi / 180; |
9 | -iter = 5; %number of voting iterations | |
10 | -t0 = 1.0; %threshold color | |
11 | -sigma = [3, 3, 1.5]; | |
10 | +iter = 8 ; %number of voting iterations | |
11 | +t0 = 1; | |
12 | +sigma = [5, 5, 5]; | |
12 | 13 | % t = 0.1; |
13 | -d_ang= ang / (iter); | |
14 | +d_ang= ang / (iter+2); | |
14 | 15 | % ******** Testing parameters ****************************************** |
15 | 16 | % p = [50, 50, 150]; |
16 | 17 | % ps = [400, 400, 200]; |
... | ... | @@ -22,15 +23,14 @@ d_ang= ang / (iter); |
22 | 23 | % X = S(1); |
23 | 24 | % Y = S(2); |
24 | 25 | % Z = S(3); |
25 | -filename = 'nissl-float-128.128.128.vol'; | |
26 | +filename = '128-128-128/nissl-float-128.128.128.vol'; %'nissl-float-128.128.128.vol'; | |
26 | 27 | X = 128; |
27 | 28 | Y = 128; |
28 | 29 | Z = 128; |
29 | -fid = fopen(filename); | |
30 | +fidi = fopen(filename); | |
30 | 31 | % load the VOL data into a 2D matrix |
31 | -I = fread(fid,[X Y*Z], 'single'); | |
32 | -fclose(fid); | |
33 | -%% | |
32 | +I = fread(fidi,[X Y*Z], 'single'); | |
33 | +fclose(fidi); | |
34 | 34 | %change this to a 3D matrix |
35 | 35 | I = (reshape(I, [X, Y, Z])); |
36 | 36 | % invert the intensity |
... | ... | @@ -38,26 +38,21 @@ I = (255 - I); |
38 | 38 | |
39 | 39 | %perform a gaussian blur |
40 | 40 | Iblur = gauss_blur3d(I, sigma); |
41 | - | |
42 | -% %crop out a small subregion of I and Iblur | |
43 | -% Iblur = Iblur(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1); | |
44 | -% I = I(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1); | |
45 | -% | |
46 | 41 | % compute the gradient |
47 | 42 | [Igrad_y, Igrad_x, Igrad_z] = gradient(Iblur); |
48 | 43 | |
49 | 44 | %calculate the gradient magnitude |
50 | 45 | Imag = sqrt(Igrad_x .^ 2 + Igrad_y .^ 2 + Igrad_z .^2); |
51 | 46 | Isize = size(I); |
52 | -I = single(I); | |
53 | -Iblur = single(Iblur); | |
54 | 47 | |
55 | -%h = reshape(Imag, [X*Y*Z, 1]); | |
56 | -%hist(h, 100); | |
48 | +% h = reshape(Imag, [X*Y*Z, 1]); | |
49 | +% hist(h, 100); | |
57 | 50 | |
58 | 51 | %set a threshold for the gradient magnitude |
59 | 52 | It = Imag > t0; |
60 | - | |
53 | +fidt = fopen('128-128-128/It.vol', 'w'); | |
54 | +fwrite(fidt, It, 'single'); | |
55 | +fclose(fidt); | |
61 | 56 | %Set the boundaries of the threshold image to zero |
62 | 57 | It(1:rmax(1), :, :) = 0; |
63 | 58 | It(X - rmax(1):X, :,:) = 0; |
... | ... | @@ -65,13 +60,12 @@ It(:, 1:rmax(2), :) = 0; |
65 | 60 | It(:, Y - rmax(2):Y,:) = 0; |
66 | 61 | It(:, :, 1:rmax(3)) = 0; |
67 | 62 | It(:,:, Z - rmax(3):Z) = 0; |
68 | -%% | |
63 | + | |
69 | 64 | %get the indices of all of the nonzero values in the threshold image |
70 | 65 | % (voter positions) |
71 | 66 | [Itx,Ity,Itz] = ind2sub(size(It),find(It)); |
72 | 67 | Vi =(find(It)); |
73 | 68 | nV = nnz(It); |
74 | -% | |
75 | 69 | % create a meshgrid describing coordinates relative to the voter position |
76 | 70 | rangex = -rmax(1):rmax(1); %create an array of values between -rmax and rmax |
77 | 71 | rangey = -rmax(2):rmax(2); |
... | ... | @@ -80,8 +74,9 @@ rangez = -rmax(3):rmax(3); |
80 | 74 | m_mag = (sqrt(mx.^2 + my.^2 + mz.^2)); %create a template describing the distance from the center of a small cube |
81 | 75 | |
82 | 76 | % create a mask for the voting area |
83 | -M_dist = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1; %mask for the voting area distance (all values < rmax from the center) | |
84 | -%% | |
77 | +M_dist1 = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1 ; %mask for the voting area distance (all values < rmax from the center) | |
78 | +M_dist2 = (mx.^2/rmin(1)^2 + my.^2/rmin(2)^2 + mz.^2/rmin(3)^2) >= 1 ; | |
79 | +M_dist = M_dist1 .* M_dist2; | |
85 | 80 | % calculate the direction vector between a pixel and voter |
86 | 81 | LV_x = mx./m_mag; |
87 | 82 | LV_y = my./m_mag; |
... | ... | @@ -89,23 +84,24 @@ LV_z = mz./m_mag; |
89 | 84 | |
90 | 85 | %number of pixels in the voting area of each voter (initialize to zero) |
91 | 86 | validPixels = (zeros(nV,1)); |
92 | -%% | |
93 | 87 | %indices of pixels in the voting area of each voter |
94 | 88 | % indices reference the 3D image |
95 | 89 | g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang)); |
96 | 90 | |
97 | 91 | |
98 | -%% vote | |
92 | +% vote | |
99 | 93 | tic; |
94 | +mask = zeros(Isize); | |
95 | +mask1 = zeros(Isize); | |
96 | + | |
100 | 97 | %for each iteration (in iterative voting) |
101 | -for itr = 1 : iter+1 | |
98 | +for itr = 1 :iter | |
102 | 99 | |
103 | 100 | %initialize the vote image to zero |
104 | 101 | Ivote = zeros(Isize); |
105 | - | |
102 | + | |
106 | 103 | %for each voter (nonzero pixels in the threshold image It) |
107 | 104 | for v = 1: nV |
108 | - | |
109 | 105 | %get the cartesian coordinates of the voter v in the main image I |
110 | 106 | vx = Itx(v); |
111 | 107 | vy = Ity(v); |
... | ... | @@ -135,7 +131,7 @@ for itr = 1 : iter+1 |
135 | 131 | M_angle = cos_diff >= cos(ang); |
136 | 132 | |
137 | 133 | %combine the two masks to mask out the voting angle |
138 | - M = M_angle .* M_dist; | |
134 | + M = M_angle.* M_dist; | |
139 | 135 | |
140 | 136 | % get the coordinates of each pixel in the final voter mask M |
141 | 137 | pi = find(M); |
... | ... | @@ -156,33 +152,20 @@ for itr = 1 : iter+1 |
156 | 152 | |
157 | 153 | |
158 | 154 | Ivote( global_pi ) = Ivote( global_pi ) + vmag; |
159 | - | |
160 | - end | |
161 | - fid = fopen(sprintf('128-128-128/vote%d',itr), 'w'); | |
162 | - if itr ==1 | |
163 | - fwrite(fid, Ivote, 'single'); | |
164 | - | |
165 | - elseif itr ==2 | |
166 | - fwrite(fid, Ivote, 'single'); | |
167 | - | |
168 | - elseif itr ==3 | |
169 | - fwrite(fid, Ivote, 'single'); | |
170 | - | |
171 | - elseif itr ==4 | |
172 | - fwrite(fid, Ivote, 'single'); | |
173 | - | |
174 | - elseif itr == 5 | |
175 | - fwrite(fid, Ivote, 'single'); | |
176 | - elseif itr == 6 | |
177 | - fwrite(fid, Ivote, 'single'); | |
178 | 155 | end |
156 | + fid = fopen(sprintf('128-128-128/nissl-vote%d',itr), 'w'); | |
157 | + fwrite(fid, single(Ivote), '*single'); | |
179 | 158 | fclose(fid); |
159 | + | |
180 | 160 | t_v1 = toc; |
181 | 161 | disp(['voting done. time =',num2str(t_v1)]); |
182 | - | |
162 | + | |
183 | 163 | % update the voting direction |
184 | - if ang>0 | |
164 | + if ang>=d_ang | |
185 | 165 | tic; |
166 | + Igrad_x = zeros(Isize); | |
167 | + Igrad_y = zeros(Isize); | |
168 | + Igrad_z = zeros(Isize); | |
186 | 169 | for v = 1: nV |
187 | 170 | % coordinates of the current voter |
188 | 171 | vx = Itx(v); |
... | ... | @@ -199,24 +182,32 @@ for itr = 1 : iter+1 |
199 | 182 | [g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx)); |
200 | 183 | |
201 | 184 | %compute the vector from the voter position to this position |
202 | - Igrad_x(vx, vy, vz) = g_px - vx; | |
185 | + | |
186 | + Igrad_x(vx, vy, vz) = g_px - vx ; | |
203 | 187 | Igrad_y(vx, vy, vz) = g_py - vy; |
204 | 188 | Igrad_z(vx, vy, vz) = g_pz - vz; |
205 | - | |
206 | 189 | end |
207 | - | |
190 | + | |
208 | 191 | |
209 | 192 | tdir1 = toc; |
210 | 193 | display (['updating dir done. time = ', num2str(tdir1)]); |
211 | 194 | ang = ang - d_ang; |
212 | 195 | end |
196 | + | |
213 | 197 | end |
214 | 198 | |
199 | +hv = reshape(Ivote, [X*Y*Z, 1]); | |
200 | +hist(hv, 250); | |
201 | +%% | |
202 | +t = 300; | |
203 | +conn = [5 5 5]; | |
204 | +Icenter = local_max(Ivote, conn, t); | |
205 | +fidc = fopen(sprintf('std3.2-r10.10-v8/out%d-t%d.vol',t,t0), 'w'); | |
206 | +fwrite(fidc, single(Icenter), '*single'); | |
207 | +fclose(fidc); | |
208 | +nnz(Icenter) | |
209 | +% [cxx1, cyy1, czz1] = ind2sub(size(Icenter),find(Icenter)); | |
215 | 210 | |
216 | -% %% | |
217 | -% t = 350; | |
218 | -% conn = [5 5 3]; | |
219 | -% Icenter = local_max(Ivote, conn, t); | |
220 | 211 | % % center = Ivote1; |
221 | 212 | % % center(center<t) = 0; |
222 | 213 | % % center = imregionalmax(center); | ... | ... |
Matlab_3D/ivote3_new.m deleted
1 | - | |
2 | -clear; | |
3 | -disp('***************** NEW RUN *********************'); | |
4 | -total = tic; | |
5 | -% ******* Initialize voting parameters ************************************** | |
6 | -rmax = [10 10 10]; %maximum radius of the cell | |
7 | -rmin = [1 1 1]; | |
8 | -ang_deg = 25.1; %half the angular range of the voting area | |
9 | -ang = ang_deg * pi / 180; | |
10 | -iter = 8 ; %number of voting iterations | |
11 | -t0 = 1; | |
12 | -sigma = [3, 3, 2]; | |
13 | -% t = 0.1; | |
14 | -d_ang= ang / (iter+2); | |
15 | -% ******** Testing parameters ****************************************** | |
16 | -% p = [50, 50, 150]; | |
17 | -% ps = [400, 400, 200]; | |
18 | -% % ps = [100, 50, 40]; | |
19 | -% % I = syn_Img(rmax , ps); | |
20 | -% volfile = 'nissl-rat.vol'; | |
21 | -% fid = fopen(volfile); % open the file that include the image | |
22 | -% S = fread(fid, 3, 'int32'); | |
23 | -% X = S(1); | |
24 | -% Y = S(2); | |
25 | -% Z = S(3); | |
26 | -filename = '128-128-128/nissl-float-128.128.128.vol'; %'nissl-float-128.128.128.vol'; | |
27 | -X = 128; | |
28 | -Y = 128; | |
29 | -Z = 128; | |
30 | -fidi = fopen(filename); | |
31 | -% load the VOL data into a 2D matrix | |
32 | -I = fread(fidi,[X Y*Z], 'single'); | |
33 | -fclose(fidi); | |
34 | -%change this to a 3D matrix | |
35 | -I = (reshape(I, [X, Y, Z])); | |
36 | -% invert the intensity | |
37 | -I = (255 - I); | |
38 | - | |
39 | -%perform a gaussian blur | |
40 | -Iblur = gauss_blur3d(I, sigma); | |
41 | -% Iblur = I; | |
42 | -% %crop out a small subregion of I and Iblur | |
43 | -% Iblur = Iblur(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1); | |
44 | -% I = I(p(1):p(1)+ps(1)-1, p(2):p(2)+ps(2)-1, p(3):p(3)+ps(3)-1); | |
45 | -% compute the gradient | |
46 | -[Igrad_y, Igrad_x, Igrad_z] = gradient(Iblur); | |
47 | - | |
48 | -%calculate the gradient magnitude | |
49 | -Imag = sqrt(Igrad_x .^ 2 + Igrad_y .^ 2 + Igrad_z .^2); | |
50 | -Isize = size(I); | |
51 | - | |
52 | -% h = reshape(Imag, [X*Y*Z, 1]); | |
53 | -% hist(h, 100); | |
54 | - | |
55 | -%set a threshold for the gradient magnitude | |
56 | -It = Imag > t0; | |
57 | -fidt = fopen('128-128-128/It.vol', 'w'); | |
58 | -fwrite(fidt, It, 'single'); | |
59 | -fclose(fidt); | |
60 | -%Set the boundaries of the threshold image to zero | |
61 | -It(1:rmax(1), :, :) = 0; | |
62 | -It(X - rmax(1):X, :,:) = 0; | |
63 | -It(:, 1:rmax(2), :) = 0; | |
64 | -It(:, Y - rmax(2):Y,:) = 0; | |
65 | -It(:, :, 1:rmax(3)) = 0; | |
66 | -It(:,:, Z - rmax(3):Z) = 0; | |
67 | - | |
68 | -%get the indices of all of the nonzero values in the threshold image | |
69 | -% (voter positions) | |
70 | -[Itx,Ity,Itz] = ind2sub(size(It),find(It)); | |
71 | -Vi =(find(It)); | |
72 | -nV = nnz(It); | |
73 | -% create a meshgrid describing coordinates relative to the voter position | |
74 | -rangex = -rmax(1):rmax(1); %create an array of values between -rmax and rmax | |
75 | -rangey = -rmax(2):rmax(2); | |
76 | -rangez = -rmax(3):rmax(3); | |
77 | -[mx, my, mz] = meshgrid(rangex, rangey, rangez); %create a template describing local pixel position in a small cube | |
78 | -m_mag = (sqrt(mx.^2 + my.^2 + mz.^2)); %create a template describing the distance from the center of a small cube | |
79 | - | |
80 | -% create a mask for the voting area | |
81 | -M_dist1 = (mx.^2/rmax(1)^2 + my.^2/rmax(2)^2 + mz.^2/rmax(3)^2) <= 1 ; %mask for the voting area distance (all values < rmax from the center) | |
82 | -M_dist2 = (mx.^2/rmin(1)^2 + my.^2/rmin(2)^2 + mz.^2/rmin(3)^2) >= 1 ; | |
83 | -M_dist = M_dist1 .* M_dist2; | |
84 | -% calculate the direction vector between a pixel and voter | |
85 | -LV_x = mx./m_mag; | |
86 | -LV_y = my./m_mag; | |
87 | -LV_z = mz./m_mag; | |
88 | - | |
89 | -%number of pixels in the voting area of each voter (initialize to zero) | |
90 | -validPixels = (zeros(nV,1)); | |
91 | -%indices of pixels in the voting area of each voter | |
92 | -% indices reference the 3D image | |
93 | -g_v_prime = zeros(nV, ceil(rmax(1)*rmax(2)*rmax(3)*ang)); | |
94 | - | |
95 | - | |
96 | -% vote | |
97 | -tic; | |
98 | -mask = zeros(Isize); | |
99 | -mask1 = zeros(Isize); | |
100 | - | |
101 | -%for each iteration (in iterative voting) | |
102 | -for itr = 1 :iter | |
103 | - | |
104 | - %initialize the vote image to zero | |
105 | - Ivote = zeros(Isize); | |
106 | - | |
107 | - %for each voter (nonzero pixels in the threshold image It) | |
108 | - for v = 1: nV | |
109 | - %get the cartesian coordinates of the voter v in the main image I | |
110 | - vx = Itx(v); | |
111 | - vy = Ity(v); | |
112 | - vz = Itz(v); | |
113 | - vi = Vi(v); | |
114 | - | |
115 | - %retreive the gradient magnitude at the voter position | |
116 | - vmag = Imag(vi); | |
117 | - | |
118 | - %retrieve the gradient | |
119 | - gx = Igrad_x(vi); | |
120 | - gy = Igrad_y(vi); | |
121 | - gz = Igrad_z(vi); | |
122 | - | |
123 | - %calculate the gradient magnitude | |
124 | - dmag = sqrt (gx^2 + gy^2 + gz^2); | |
125 | - | |
126 | - %calculate the normalized gradient direction | |
127 | - dx = gx / dmag; | |
128 | - dy = gy / dmag; | |
129 | - dz = gz / dmag; | |
130 | - | |
131 | - %calculate the angle between the voter direction and the pixel direction | |
132 | - cos_diff = LV_x .* dx + LV_y .* dy + LV_z .* dz; | |
133 | - | |
134 | - %create an angular mask for the voting area | |
135 | - M_angle = cos_diff >= cos(ang); | |
136 | - | |
137 | - %combine the two masks to mask out the voting angle | |
138 | - M = M_angle.* M_dist; | |
139 | - | |
140 | - % get the coordinates of each pixel in the final voter mask M | |
141 | - pi = find(M); | |
142 | - | |
143 | - %calculate the number of pixels in the voting region | |
144 | - npts = nnz(M); | |
145 | - validPixels(v) = npts; | |
146 | - | |
147 | - %convert every index in the voting area from a local 3D index to a global 3D index (into the original image I) | |
148 | - global_px = vx + mx(pi); | |
149 | - global_py = vy + my(pi); | |
150 | - global_pz = vz + mz(pi); | |
151 | - | |
152 | - %convert the global 3D index of each point into a global 1D index | |
153 | - global_pi = sub2ind(Isize, global_px, global_py, global_pz); | |
154 | - | |
155 | - g_v_prime (v, 1:npts) = global_pi; | |
156 | - | |
157 | - | |
158 | - Ivote( global_pi ) = Ivote( global_pi ) + vmag; | |
159 | -% if itr ==3 | |
160 | -% if mod(v, 5000)==0 | |
161 | -% mask(global_pi)= mask(global_pi) + 0.1; | |
162 | -% mask (vi) = mask(vi) + 0.5; | |
163 | -% end | |
164 | -% end | |
165 | -% if itr ==6 | |
166 | -% if mod(v, 5000)==0 | |
167 | -% mask1(global_pi)= mask1(global_pi) + 0.1; | |
168 | -% mask1 (vi) = mask1(vi) + 0.5; | |
169 | -% end | |
170 | -% end | |
171 | -% if itr==1 | |
172 | -% for ix = -12:12 | |
173 | -% for iy = -12:12 | |
174 | -% for iz = -12:12 | |
175 | -% mask(vx+ix, vy+iy, vz+iz) = M(ix+13,iy+13,iz+13)+ mask(vx+ix, vy+iy, vz+iz); | |
176 | -% end | |
177 | -% end | |
178 | -% end | |
179 | -% end | |
180 | -% if itr==2 | |
181 | -% for ix = -12:12 | |
182 | -% for iy = -12:12 | |
183 | -% for iz = -12:12 | |
184 | -% mask1(vx+ix, vy+iy, vz+iz) = M(ix+13,iy+13,iz+13)+ mask1(vx+ix, vy+iy, vz+iz); | |
185 | -% end | |
186 | -% end | |
187 | -% end | |
188 | -% end | |
189 | - | |
190 | - | |
191 | - end | |
192 | -% fid = fopen(sprintf('128-128-128/nissl-vote%d',itr), 'w'); | |
193 | -% fwrite(fid, single(Ivote), '*single'); | |
194 | - if itr ==1 | |
195 | - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w'); | |
196 | - fwrite(fid, single(Ivote), '*single'); | |
197 | - fclose(fid); | |
198 | - end | |
199 | - if itr ==8 | |
200 | - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w'); | |
201 | - fwrite(fid, single(Ivote), '*single'); | |
202 | - fclose(fid); | |
203 | - Ivote8 = (Ivote); | |
204 | - end | |
205 | - if itr ==9 | |
206 | - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w'); | |
207 | - fwrite(fid, single(Ivote), '*single'); | |
208 | - fclose(fid); | |
209 | - end | |
210 | - if itr ==10 | |
211 | - fid = fopen(sprintf('128-128-128/00-nissl-vote%d.vol',itr), 'w'); | |
212 | - fwrite(fid, single(Ivote), '*single'); | |
213 | - fclose(fid); | |
214 | - end | |
215 | - | |
216 | -% Ivote1 = single(Ivote); | |
217 | -% fwrite(fid, Ivote1, '*single'); | |
218 | -% | |
219 | -% | |
220 | -% elseif itr ==2 | |
221 | -% Ivote2 = single(Ivote); | |
222 | -% fwrite(fid, Ivote2, '*single'); | |
223 | -% | |
224 | -% | |
225 | -% elseif itr ==3 | |
226 | -% Ivote3 = single(Ivote); | |
227 | -% fwrite(fid, Ivote3, '*single'); | |
228 | -% | |
229 | -% | |
230 | -% elseif itr ==4 | |
231 | -% Ivote4 = single(Ivote); | |
232 | -% fwrite(fid, Ivote4, '*single'); | |
233 | -% | |
234 | -% | |
235 | -% elseif itr == 5 | |
236 | -% Ivote5 = single(Ivote); | |
237 | -% fwrite(fid, Ivote5, '*single'); | |
238 | -% | |
239 | -% | |
240 | -% elseif itr == 6 | |
241 | -% fwrite(fid, single(Ivote), '*single'); | |
242 | -% elseif itr == 7 | |
243 | -% fwrite(fid, single(Ivote), '*single'); | |
244 | -% elseif itr == 8 | |
245 | -% fwrite(fid, single(Ivote), '*single'); | |
246 | -% elseif itr == 9 | |
247 | -% fwrite(fid, single(Ivote), '*single'); | |
248 | -% end | |
249 | -% fclose(fid); | |
250 | - t_v1 = toc; | |
251 | - disp(['voting done. time =',num2str(t_v1)]); | |
252 | - | |
253 | - % update the voting direction | |
254 | - if ang>=d_ang | |
255 | - tic; | |
256 | - Igrad_x = zeros(Isize); | |
257 | - Igrad_y = zeros(Isize); | |
258 | - Igrad_z = zeros(Isize); | |
259 | - for v = 1: nV | |
260 | - % coordinates of the current voter | |
261 | - vx = Itx(v); | |
262 | - vy = Ity(v); | |
263 | - vz = Itz(v); | |
264 | - | |
265 | - %get the local value of the voting image | |
266 | - local_Ivote = Ivote(g_v_prime(v,1:validPixels(v))); | |
267 | - | |
268 | - %find the index of the maximum value | |
269 | - [~, local_max_idx] = max(local_Ivote); | |
270 | - | |
271 | - %convert this into a global subscript | |
272 | - [g_px, g_py, g_pz] = ind2sub(size(Ivote), g_v_prime(v,local_max_idx)); | |
273 | - | |
274 | - %compute the vector from the voter position to this position | |
275 | - | |
276 | - Igrad_x(vx, vy, vz) = g_px - vx ; | |
277 | - Igrad_y(vx, vy, vz) = g_py - vy; | |
278 | - Igrad_z(vx, vy, vz) = g_pz - vz; | |
279 | -% if itr ==3 | |
280 | -% if mod(v, 5000)==0 | |
281 | -% mask(g_px, g_py, g_pz)= mask(g_px, g_py, g_pz) + 1; | |
282 | -% end | |
283 | -% end | |
284 | -% if itr ==6 | |
285 | -% if mod(v, 5000)==0 | |
286 | -% mask1(g_px, g_py, g_pz)= mask1(g_px, g_py, g_pz) + 1; | |
287 | -% end | |
288 | -% end | |
289 | - end | |
290 | - | |
291 | - | |
292 | - tdir1 = toc; | |
293 | - display (['updating dir done. time = ', num2str(tdir1)]); | |
294 | - ang = ang - d_ang; | |
295 | - end | |
296 | - | |
297 | - end | |
298 | - | |
299 | -hv = reshape(Ivote, [X*Y*Z, 1]); | |
300 | -hist(hv, 250); | |
301 | -%% | |
302 | -t = 4300; | |
303 | -conn = [5 5 5]; | |
304 | -Icenter = local_max(Ivote, conn, t); | |
305 | -fidc = fopen(sprintf('std3.2-r10.10-v8/out%d-t%d.vol',t,t0), 'w'); | |
306 | -fwrite(fidc, single(Icenter), '*single'); | |
307 | -fclose(fidc); | |
308 | -nnz(Icenter) | |
309 | -% [cxx1, cyy1, czz1] = ind2sub(size(Icenter),find(Icenter)); | |
310 | - | |
311 | -% % center = Ivote1; | |
312 | -% % center(center<t) = 0; | |
313 | -% % center = imregionalmax(center); | |
314 | -% % cn = nnz(center); | |
315 | -% % [cx, cy, cz] = ind2sub(size(center), find(center)); | |
316 | -% % Icenter = zeros(size(center)); | |
317 | -% % for cc =1:cn | |
318 | -% % Icenter(cx(cc), cy(cc), cz(cc)) = 255; | |
319 | -% % end | |
320 | -% | |
321 | -% % fid_Ic = fopen('image_center2-300.vol', 'w'); | |
322 | -% % fwrite(fid_Ic, Icenter); | |
323 | -% % fclose(fid_Ic); | |
324 | -% cn = nnz(Icenter); | |
325 | -% [cx, cy, cz] = ind2sub(size(Icenter), find(Icenter)); | |
326 | -% Ic2d = zeros(size(Icenter,1), size(Icenter,2)); | |
327 | -% for cc =1:cn | |
328 | -% Ic2d(cx(cc), cy(cc)) = 1; | |
329 | -% end | |
330 | -% I2d = max(I, [], 3); | |
331 | -% % figure(1),imagesc(I2d); colormap(gray); | |
332 | -% % figure(2),imagesc(Ic2d); colormap(gray); | |
333 | -% % | |
334 | -% out1(:,:,1) = mat2gray(I2d); | |
335 | -% out1(:,:,2) = mat2gray(Ic2d); | |
336 | -% out1(:,:,3) = mat2gray(I2d); | |
337 | -% figure(1), imagesc((out1)); | |
338 | -%%% % imwrite(mat2gray(c2d), 'vote.bmp'); | |
339 | -%% | |
340 | -% figure(1); imagesc(squeeze(I(:,:,ceil(size(I,3)/2)))), colormap(gray); | |
341 | -% figure(33); imagesc(squeeze(Ivote3(:,:,ceil(size(Ivote,3)/2)))), colormap(gray); |
Matlab_3D/validation.m
1 | + | |
1 | 2 | clear all; |
2 | 3 | disp('***************** NEW RUN *********************'); |
3 | 4 | X = 128; |
... | ... | @@ -7,15 +8,15 @@ D = 10; |
7 | 8 | t0=1; |
8 | 9 | r1=10; |
9 | 10 | r2=10; |
10 | -t=2000; | |
11 | +t=2100; | |
11 | 12 | itr=8; |
12 | 13 | vote=10; |
13 | 14 | std = [5 5]; |
14 | -gt_filename = '128-128-128/0-gt.vol'; | |
15 | +gt_filename = '0-gt.vol'; | |
15 | 16 | % out_filename = sprintf('128-128-128/0-nissl-std%d.%d-t0%d-r%d.%d-t%d-out%d.%d.vol',std(1), std(2),t0,r1,r2,t,itr,vote); |
16 | -out_filename = sprintf('D:/build/ivote3-bld/std5.5-r10.10-v8-phi15/out%d.vol',t); | |
17 | +out_filename = sprintf('D:/build/ivote3-bld/shared2D-v8/out%d.vol',t); | |
17 | 18 | % txt_filename = sprintf('128-128-128/0-validation-nissl-std%d.%d-r%d.%d-t%d-out%d.%d-D%d.txt',std(1), std(2),r1,r2,t,itr,vote,D); |
18 | -txt_filename = sprintf('D:/build/ivote3-bld/std5.5-r10.10-v8-phi15/t%d-D%d.txt',t,D); | |
19 | +txt_filename = sprintf('D:/build/ivote3-bld/shared2D-v8/t%d-D%d.txt',t,D); | |
19 | 20 | spec = sprintf('Nissl-std%d.%d-r%d.%d-t%d-out%d.%d',std(1), std(2),r1,r2,t,itr,vote); |
20 | 21 | fid0 = fopen(gt_filename); |
21 | 22 | gt = fread(fid0,[X Y*Z], 'single'); | ... | ... |
1 | +#ifndef STIM_CUDA_cpyToshare_H | |
2 | +#define STIM_CUDA_cpyToshare_H | |
3 | + | |
4 | + //this function copy one channel data from global to shared memory in one dimension with size of X bytes. | |
5 | + template<typename T> | |
6 | + __device__ void cpyG2S1D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y, | |
7 | + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){ | |
8 | + | |
9 | + //calculate the total number of threads available | |
10 | + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; | |
11 | + | |
12 | + //calculate the current 1D thread ID | |
13 | + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; | |
14 | + | |
15 | + //calculate the number of iteration require for the copy | |
16 | + unsigned int I = X/tThreads + 1; | |
17 | + | |
18 | + //the specified start position in global memory is (x, y, z) | |
19 | + unsigned int gstart = z*I_x*I_y + y*I_x + x; | |
20 | + | |
21 | + for (unsigned int i = 0; i < I; i++){ | |
22 | + | |
23 | + //each iteration will copy tThreads elements, so the starting index in shared memory | |
24 | + //for each iteration will be i * tThreads (iteration # times the number of elements transferred per iteration) | |
25 | + unsigned int sIdx = i * tThreads + ti; | |
26 | + if (sIdx>= X*Y) return; | |
27 | + | |
28 | + //each iteration will copy tThreads elements from the global index | |
29 | + unsigned int gIdx = gstart + sIdx; | |
30 | + | |
31 | + //copy global to share | |
32 | + dest[sIdx] = src[gIdx]; | |
33 | + | |
34 | + } | |
35 | + } | |
36 | + //this function copy one channel data from global to shared memory in two dimensions with size of X*Y bytes. | |
37 | + template<typename T> | |
38 | + __device__ void cpyG2S2D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y, | |
39 | + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){ | |
40 | + //calculate the total number of threads available | |
41 | + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; | |
42 | + | |
43 | + //calculate the current 1D thread ID | |
44 | + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; | |
45 | + | |
46 | + //calculate the number of iteration require for the copy | |
47 | + unsigned int I = X*Y/tThreads + 1; | |
48 | + | |
49 | + unsigned int gz1 = z*I_x*I_y ; | |
50 | + | |
51 | + for (unsigned int i = 0; i < I; i++){ | |
52 | + | |
53 | + unsigned int sIdx = i * tThreads + ti; | |
54 | + if (sIdx>= X*Y) return; | |
55 | + | |
56 | + unsigned int sy = sIdx/X; | |
57 | + unsigned int sx = sIdx - (sy * X); | |
58 | + | |
59 | + unsigned int gx = x + sx; | |
60 | + unsigned int gy = y + sy; | |
61 | + if (gx<I_x && gy<I_y){ | |
62 | + unsigned int gIdx = gz1 + gy * I_x + gx; | |
63 | + //copy global to share | |
64 | + dest[sIdx] = src[gIdx]; | |
65 | + } | |
66 | + | |
67 | + } | |
68 | + } | |
69 | + //this function copy three channels data from global to shared memory in one dimension with size of X bytes. | |
70 | + template<typename T> | |
71 | + __device__ void cpyG2S1D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y, | |
72 | + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){ | |
73 | + | |
74 | + //calculate the total number of threads available | |
75 | + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; | |
76 | + | |
77 | + //calculate the current 1D thread ID | |
78 | + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; | |
79 | + | |
80 | + //calculate the number of iteration require for the copy | |
81 | + unsigned int I = X/tThreads + 1; | |
82 | + | |
83 | + //the specified start position in global memory is (x, y, z) | |
84 | + unsigned int gstart = z*I_x*I_y + y*I_x + x; | |
85 | + | |
86 | + for (unsigned int i = 0; i < I; i++){ | |
87 | + | |
88 | + //each iteration will copy tThreads elements, so the starting index in shared memory | |
89 | + //for each iteration will be i * tThreads (iteration # times the number of elements transferred per iteration) | |
90 | + unsigned int sIdx = i * tThreads + ti; | |
91 | + if (sIdx>= X*Y) return; | |
92 | + unsigned int gIdx = gstart*3 + sIdx; | |
93 | + //copy global to share | |
94 | + dest[sIdx] = src[gIdx]; | |
95 | + | |
96 | + } | |
97 | + } | |
98 | + //this function copy three channels data from global to shared memory in two dimensions with size of X*Y bytes. | |
99 | + template<typename T> | |
100 | + __device__ void cpyG2S2D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y, | |
101 | + dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){ | |
102 | + //calculate the total number of threads available | |
103 | + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; | |
104 | + | |
105 | + //calculate the current 1D thread ID | |
106 | + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; | |
107 | + | |
108 | + //calculate the number of iteration require for the copy | |
109 | + unsigned int I = X*Y/tThreads + 1; | |
110 | + | |
111 | + unsigned int gz1 = z*I_x*I_y ; | |
112 | + | |
113 | + for (unsigned int i = 0; i < I; i++){ | |
114 | + | |
115 | + unsigned int sIdx = i * tThreads + ti; | |
116 | + if (sIdx>= X*Y) return; | |
117 | + unsigned int sy = sIdx/X; | |
118 | + unsigned int sx = sIdx - (sy * X); | |
119 | + | |
120 | + unsigned int gx = x + sx/3; | |
121 | + unsigned int gy = y + sy; | |
122 | + if (gx<I_x && gy<I_y){ | |
123 | + unsigned int gIdx = (gz1 + gy * I_x + gx)*3 + (sx%3); | |
124 | + //copy global to share | |
125 | + dest[sIdx] = src[gIdx]; | |
126 | + } | |
127 | + } | |
128 | + } | |
129 | + // this function compute the gradient magnitude saved in the shared memory and stores the magnitude result in the rest of shared memory. | |
130 | + template<typename T> | |
131 | + __device__ void mag_share2D(T* grad, unsigned int bs, unsigned int X, unsigned int Y, dim3 threadIdx, dim3 blockDim){ | |
132 | + | |
133 | + //calculate the total number of threads available | |
134 | + unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z; | |
135 | + //calculate the current 1D thread ID | |
136 | + unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x; | |
137 | + //calculate the number of iteration require for the copy | |
138 | + unsigned int I = X*Y/tThreads + 1; | |
139 | + for (unsigned int i = 0; i < I; i++){ | |
140 | + | |
141 | + unsigned int sIdx = i * tThreads + ti; | |
142 | + if (sIdx>= X*Y) return; | |
143 | + float gx = grad[sIdx*3]; | |
144 | + float gy = grad[sIdx*3 + 1]; | |
145 | + float gz = grad[sIdx*3 + 2]; | |
146 | + float mag = sqrt(gx*gx + gy*gy + gz*gz); | |
147 | + grad[bs + sIdx] = mag; | |
148 | + | |
149 | + } | |
150 | + } | |
151 | +#endif | |
0 | 152 | \ No newline at end of file | ... | ... |
cpp/float_to_half.cuh deleted
1 | -#ifndef STIM_CUDA_FLOAT_TO_HALF_H | |
2 | -#define STIM_CUDA_FLOAT_TO_HALF_H | |
3 | - | |
4 | -#include <iostream> | |
5 | -#include <cuda.h> | |
6 | -#include <stim/cuda/cudatools.h> | |
7 | -#include <stim/cuda/sharedmem.cuh> | |
8 | -#include <stim/cuda/cudatools/error.h> | |
9 | -#include <cuda_fp16.h> | |
10 | -#include <stdio.h> | |
11 | - __global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){ | |
12 | - | |
13 | - | |
14 | - //calculate x,y,z coordinates for this thread | |
15 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
16 | - //find the grid size along y | |
17 | - int grid_y = y / blockDim.y; | |
18 | - int blockidx_y = blockIdx.y % grid_y; | |
19 | - int yi = blockidx_y * blockDim.y + threadIdx.y; | |
20 | - int zi = blockIdx.y / grid_y; | |
21 | - int i = zi * x * y + yi * x + xi; | |
22 | - | |
23 | - if(xi >= x|| yi >= y || zi>= z) return; | |
24 | - | |
25 | - | |
26 | - gpu_half[i] = __float2half(gpu_float[i]); | |
27 | - | |
28 | - | |
29 | - } | |
30 | - | |
31 | - | |
32 | - void gpu_f2h(half* gpu_half, float* gpu_float, unsigned int x, unsigned int y, unsigned int z){ | |
33 | - | |
34 | - | |
35 | - int max_threads = stim::maxThreadsPerBlock(); | |
36 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | |
37 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | |
38 | - | |
39 | - //call the GPU kernel to determine the gradient | |
40 | - cuda_f2h <<< blocks, threads >>>(gpu_half, gpu_float, x, y, z); | |
41 | - | |
42 | - } | |
43 | - | |
44 | - | |
45 | - | |
46 | - void cpu_f2h(half* h_out, float* f_in, unsigned int x, unsigned int y, unsigned int z){ | |
47 | - | |
48 | - //calculate the number of pixels in the array | |
49 | - unsigned int pix = x* y* z; | |
50 | - | |
51 | - //allocate memory on the GPU for the input float precision. | |
52 | - float* gpu_float; | |
53 | - cudaMalloc(&gpu_float, pix * sizeof(float)); | |
54 | - cudaMemcpy(gpu_float, f_in, pix * sizeof(float), cudaMemcpyHostToDevice); | |
55 | - | |
56 | - //allocate memory on the GPU for the output half precision | |
57 | - half* gpu_half; | |
58 | - cudaMalloc(&gpu_half, pix * sizeof(half)); | |
59 | - | |
60 | - //call the GPU version of this function | |
61 | - gpu_f2h(gpu_half, gpu_float, x, y, z); | |
62 | - | |
63 | - //copy the array back to the CPU | |
64 | - cudaMemcpy(h_out, gpu_half, pix * sizeof(half), cudaMemcpyDeviceToHost); | |
65 | - | |
66 | - //free allocated memory | |
67 | - cudaFree(gpu_float); | |
68 | - cudaFree(gpu_half); | |
69 | - | |
70 | - } | |
71 | - | |
72 | - | |
73 | -#endif | |
74 | 0 | \ No newline at end of file |
cpp/half_to_float.cuh deleted
1 | -#ifndef STIM_CUDA_HALF_TO_FLOAT_H | |
2 | -#define STIM_CUDA_HALF_TO_FLOAT_H | |
3 | - | |
4 | -#include <iostream> | |
5 | -#include <cuda.h> | |
6 | -#include <stim/cuda/cudatools.h> | |
7 | -#include <stim/cuda/sharedmem.cuh> | |
8 | -#include <stim/cuda/cudatools/error.h> | |
9 | -#include "cuda_fp16.h" | |
10 | - | |
11 | - __global__ void cuda_h2f(float* gpu_float, half* gpu_half, int x, int y, int z){ | |
12 | - | |
13 | - | |
14 | - //calculate x,y,z coordinates for this thread | |
15 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
16 | - //find the grid size along y | |
17 | - int grid_y = y / blockDim.y; | |
18 | - int blockidx_y = blockIdx.y % grid_y; | |
19 | - int yi = blockidx_y * blockDim.y + threadIdx.y; | |
20 | - int zi = blockIdx.y / grid_y; | |
21 | - int i = zi * x * y + yi * x + xi; | |
22 | - | |
23 | - if(xi >= x|| yi >= y || zi>= z) return; | |
24 | - | |
25 | - | |
26 | - gpu_float[i] = __half2float(gpu_half[i]); | |
27 | - | |
28 | - } | |
29 | - | |
30 | - | |
31 | - void gpu_h2f(float* gpu_float, half* gpu_half, unsigned int x, unsigned int y, unsigned int z){ | |
32 | - | |
33 | - | |
34 | - int max_threads = stim::maxThreadsPerBlock(); | |
35 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | |
36 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | |
37 | - | |
38 | - //call the GPU kernel to determine the gradient | |
39 | - cuda_h2f <<< blocks, threads >>>(gpu_float, gpu_half, x, y, z); | |
40 | - | |
41 | - } | |
42 | - | |
43 | - | |
44 | - | |
45 | - void cpu_f2h(float* f_out, half* h_in, unsigned int x, unsigned int y, unsigned int z){ | |
46 | - | |
47 | - //calculate the number of pixels in the array | |
48 | - unsigned int pix = x* y* z; | |
49 | - | |
50 | - //allocate memory on the GPU for the input half precision | |
51 | - half* gpu_half; | |
52 | - cudaMalloc(&gpu_half, pix * sizeof(half)); | |
53 | - cudaMemcpy(gpu_half, h_in, pix * sizeof(half), cudaMemcpyHostToDevice); | |
54 | - | |
55 | - //allocate memory on the GPU for the output float precision. | |
56 | - float* gpu_float; | |
57 | - cudaMalloc(&gpu_float, pix * sizeof(float)); | |
58 | - | |
59 | - | |
60 | - | |
61 | - //call the GPU version of this function | |
62 | - gpu_h2f(gpu_float, gpu_half, x, y, z); | |
63 | - | |
64 | - cudaMemcpy(f_out, gpu_float, pix * sizeof(float), cudaMemcpyDeviceToHost); | |
65 | - | |
66 | - | |
67 | - | |
68 | - //free allocated memory | |
69 | - cudaFree(gpu_float); | |
70 | - cudaFree(gpu_half); | |
71 | - | |
72 | - } | |
73 | - | |
74 | - | |
75 | -#endif | |
76 | 0 | \ No newline at end of file |
cpp/main.cpp
... | ... | @@ -37,10 +37,9 @@ int main(int argc, char** argv){ |
37 | 37 | printf("current device ID: %d\n", i); |
38 | 38 | printf("device name: %s\n", prop.name); |
39 | 39 | printf("total global mem: %lu\n", prop.totalGlobalMem); |
40 | + printf("shared memory per block: %lu\n", prop.sharedMemPerBlock); | |
40 | 41 | } |
41 | - | |
42 | - | |
43 | - | |
42 | + | |
44 | 43 | //output advertisement |
45 | 44 | std::cout<<std::endl<<std::endl; |
46 | 45 | std::cout<<"========================================================================="<<std::endl; |
... | ... | @@ -124,18 +123,18 @@ int main(int argc, char** argv){ |
124 | 123 | invert_data(cpuI, x, y, z); |
125 | 124 | |
126 | 125 | //write a new file from the cpuI. |
127 | - std::ofstream original("std5.5-r10.10-v8/inv-128.vol", std::ofstream::out | std::ofstream::binary); | |
126 | + std::ofstream original("shared2D-v8/inv-128.vol", std::ofstream::out | std::ofstream::binary); | |
128 | 127 | original.write((char*)cpuI, bytes); |
129 | 128 | original.close(); |
130 | 129 | |
131 | 130 | //allocate space on the cpu for the output result |
132 | - float* cpu_out = (float*) malloc(bytes*3); | |
131 | + float* cpu_out = (float*) malloc(bytes); | |
133 | 132 | |
134 | 133 | // call the ivote function |
135 | 134 | ivote3(cpu_out, cpuI, sigma, anisotropy, phi, d_phi, r, iter, t, conn, x, y, z); |
136 | 135 | |
137 | 136 | //write the blurred file from the cpuI. |
138 | - std::ofstream fblur("vote-check/vote8.vol", std::ofstream::out | std::ofstream::binary); | |
137 | + std::ofstream fblur("shared2D-v8/vote8.vol", std::ofstream::out | std::ofstream::binary); | |
139 | 138 | fblur.write((char*)cpuI, bytes); |
140 | 139 | fblur.close(); |
141 | 140 | /* |
... | ... | @@ -146,13 +145,13 @@ int main(int argc, char** argv){ |
146 | 145 | fgx.close(); |
147 | 146 | */ |
148 | 147 | //write the output file. |
149 | - std::ofstream fo("std5.5-r10.10-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary); | |
148 | + std::ofstream fo("shared2D-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary); | |
150 | 149 | fo.write((char*)cpu_out, bytes); |
151 | 150 | fo.close(); |
152 | 151 | |
153 | 152 | // creat a file for saving the list centers |
154 | 153 | |
155 | - std::ofstream list("std5.5-r10.10-v8/center.txt"); | |
154 | + std::ofstream list("shared2D-v8/center.txt"); | |
156 | 155 | if (list.is_open()){ |
157 | 156 | |
158 | 157 | for (int ix=0; ix<x; ix++){ | ... | ... |
cpp/set_rmax.cuh deleted
1 | -#ifndef STIM_CUDA_SET_RMAX_H | |
2 | -#define STIM_CUDA_SET_RMAX_H | |
3 | - | |
4 | -#include <iostream> | |
5 | -#include <cuda.h> | |
6 | -#include <stim/cuda/cudatools.h> | |
7 | -#include <stim/cuda/sharedmem.cuh> | |
8 | -#include <stim/cuda/cudatools/error.h> | |
9 | - | |
10 | -template<typename T> | |
11 | - __global__ void cuda_set_rmax(T* gpu_r, int rx, int ry, int rz, int x, int y, int z){ | |
12 | - | |
13 | - //calculate x,y,z coordinates for this thread | |
14 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
15 | - //find the grid size along y | |
16 | - int grid_y = y / blockDim.y; | |
17 | - int blockidx_y = blockIdx.y % grid_y; | |
18 | - int yi = blockidx_y * blockDim.y + threadIdx.y; | |
19 | - int zi = blockIdx.y / grid_y; | |
20 | - int i = zi * x * y + yi * x + xi; | |
21 | - | |
22 | - if(xi>=x || yi>=y || zi>=z) return; | |
23 | - | |
24 | - gpu_r[i*3+0] = rx; | |
25 | - gpu_r[i*3+1] = ry; | |
26 | - gpu_r[i*3+2] = rz; | |
27 | - | |
28 | - } | |
29 | - | |
30 | -template<typename T> | |
31 | - void gpu_set_rmax(T* gpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ | |
32 | - | |
33 | - | |
34 | - unsigned int max_threads = stim::maxThreadsPerBlock(); | |
35 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | |
36 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | |
37 | - | |
38 | - //call the kernel to do the voting | |
39 | - cuda_set_rmax <T> <<< blocks, threads >>>(gpu_r, r[0], r[1], r[2], x , y, z); | |
40 | - | |
41 | - } | |
42 | -template<typename T> | |
43 | - void cpu_set_rmax(T* cpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ | |
44 | - | |
45 | - //calculate the number of bytes in the array | |
46 | - unsigned int bytes = x * y * z * sizeof(T); | |
47 | - | |
48 | - //allocate space on the GPU for the rmax | |
49 | - T* gpu_r; | |
50 | - cudaMalloc(&gpu_vote, bytes*3); | |
51 | - | |
52 | - cudaMemcpy(gpu_r, cpu_r, bytes*3, cudaMemcpyHostToDevice); | |
53 | - | |
54 | - | |
55 | - //call the GPU version of the vote calculation function | |
56 | - gpu_set_rmax<T>(gpu_r, r, x , y, z); | |
57 | - | |
58 | - //copy the Vote Data back to the CPU | |
59 | - cudaMemcpy(cpu_r, gpu_r, bytes*3, cudaMemcpyDeviceToHost) ; | |
60 | - | |
61 | - //free allocated memory | |
62 | - cudaFree(gpu_r); | |
63 | - | |
64 | - } | |
65 | - | |
66 | - | |
67 | -#endif | |
68 | 0 | \ No newline at end of file |
cpp/update_dir3.cuh
... | ... | @@ -6,11 +6,12 @@ |
6 | 6 | #include <stim/cuda/cudatools.h> |
7 | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | 8 | #include <cuda_fp16.h> |
9 | +#include "cpyToshare.cuh" | |
9 | 10 | |
10 | 11 | // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. |
11 | 12 | template<typename T> |
12 | 13 | __global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ |
13 | - | |
14 | + extern __shared__ float s_vote[]; | |
14 | 15 | //calculate x,y,z coordinates for this thread |
15 | 16 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
16 | 17 | //find the grid size along y |
... | ... | @@ -18,10 +19,21 @@ |
18 | 19 | int blockidx_y = blockIdx.y % grid_y; |
19 | 20 | int yi = blockidx_y * blockDim.y + threadIdx.y; |
20 | 21 | int zi = blockIdx.y / grid_y; |
22 | + //compute the global 1D index for this pixel | |
21 | 23 | int i = zi * x * y + yi * x + xi; |
22 | 24 | |
23 | 25 | if(xi >= x|| yi >= y || zi>= z) return; |
24 | - | |
26 | + // find the starting points for this block along the x and y directions | |
27 | + int bxi = blockIdx.x * blockDim.x; | |
28 | + int byi = blockidx_y * blockDim.y; | |
29 | + //find the starting points and the size of the window, which will be copied to the 2D-shared memory | |
30 | + int bxs = bxi - rx; | |
31 | + int bys = byi - ry; | |
32 | + int xwidth = 2 * rx + blockDim.x; | |
33 | + int ywidth = 2 * ry + blockDim.y; | |
34 | + //compute the coordinations of this pixel in the 2D-shared memory. | |
35 | + int sx_rx = threadIdx.x + rx; | |
36 | + int sy_ry = threadIdx.y + ry; | |
25 | 37 | //find the gradient values along the x, y ,z axis, and the gradient magnitude for the voter |
26 | 38 | float g_v_x = gpu_grad[i * 3 + 0]; |
27 | 39 | float g_v_y = gpu_grad[i * 3 + 1]; |
... | ... | @@ -42,39 +54,48 @@ |
42 | 54 | int rz_sq = rz * rz; |
43 | 55 | |
44 | 56 | for (int z_p = -rz; z_p<=rz; z_p++){ |
45 | - | |
46 | - for(int y_p = -ry; y_p <= ry; y_p++){ | |
47 | - | |
48 | - for(int x_p = -rx; x_p <= rx; x_p++){ | |
49 | - | |
50 | - //calculate the x, y ,z indices for the current pixel | |
51 | - int xi_p = (xi + x_p) ; | |
57 | + int zi_p = zi + z_p; | |
58 | + if ((zi_p) >=0 && (zi_p) < z){ | |
59 | + //call the function to copy one slide of vote date to the 2D-shared memory. | |
60 | + __syncthreads(); | |
61 | + cpyG2S2D<float>(s_vote, gpu_vote, bxs, bys, zi + z_p, xwidth, ywidth, threadIdx, blockDim, x, y); | |
62 | + __syncthreads(); | |
63 | + float z_sq = z_p * z_p; | |
64 | + float d_z_sq = (z_sq)/rz_sq; | |
65 | + for(int y_p = -ry; y_p <= ry; y_p++){ | |
66 | + | |
67 | + float y_sq = y_p * y_p; | |
68 | + float yz_sq = y_sq + z_sq; | |
52 | 69 | int yi_p = (yi + y_p) ; |
53 | - int zi_p = (zi + z_p) ; | |
54 | - if (zi_p >=0 && zi_p < z && yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){ | |
70 | + float d_yz_sq = (y_sq)/ry_sq + d_z_sq; | |
71 | + unsigned int s_y1d = (sy_ry + y_p) * xwidth; | |
72 | + for(int x_p = -rx; x_p <= rx; x_p++){ | |
55 | 73 | |
56 | - //calculate the distance between the pixel and the current voter. | |
57 | - float x_sq = x_p * x_p; | |
58 | - float y_sq = y_p * y_p; | |
59 | - float z_sq = z_p * z_p; | |
60 | - float d_pv = sqrt(x_sq + y_sq + z_sq); | |
74 | + //check if the current pixel is inside of the data-set. | |
75 | + int xi_p = (xi + x_p) ; | |
76 | + if (yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){ | |
61 | 77 | |
62 | - // calculate the angle between the pixel and the current voter. | |
63 | - float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v); | |
78 | + //calculate the distance between the pixel and the current voter. | |
79 | + float x_sq = x_p * x_p; | |
80 | + float d_pv = sqrt(x_sq + yz_sq); | |
64 | 81 | |
65 | - if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ | |
82 | + // calculate the angle between the pixel and the current voter. | |
83 | + float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v); | |
66 | 84 | |
67 | - //calculate the 1D index for the current pixel | |
68 | - unsigned int id_p = (zi_p) * x * y + (yi_p) * x + (xi_p); | |
69 | - l_vote = gpu_vote[id_p]; | |
70 | - | |
71 | - // compare the vote value of this pixel with the max value to find the maxima and its index. | |
72 | - if (l_vote>max) { | |
85 | + if ((((x_sq)/rx_sq + d_yz_sq )<= 1) && (cos_diff >= cos_phi)){ | |
73 | 86 | |
74 | - max = l_vote; | |
75 | - id_x = x_p; | |
76 | - id_y = y_p; | |
77 | - id_z = z_p; | |
87 | + //calculate the 1D index for the current pixel in the 2D-shared memory | |
88 | + unsigned int id_s = s_y1d + (sx_rx + x_p); | |
89 | + l_vote = s_vote[id_s]; | |
90 | + | |
91 | + // compare the vote value of this pixel with the max value to find the maxima and its index. | |
92 | + if (l_vote>max) { | |
93 | + | |
94 | + max = l_vote; | |
95 | + id_x = x_p; | |
96 | + id_y = y_p; | |
97 | + id_z = z_p; | |
98 | + } | |
78 | 99 | } |
79 | 100 | } |
80 | 101 | } |
... | ... | @@ -115,13 +136,13 @@ |
115 | 136 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
116 | 137 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); |
117 | 138 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); |
118 | - | |
139 | + unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*sizeof(T); | |
119 | 140 | // allocate space on the GPU for the updated vote direction |
120 | 141 | T* gpu_dir; |
121 | 142 | cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3); |
122 | 143 | |
123 | 144 | //call the kernel to calculate the new voting direction |
124 | - update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z); | |
145 | + update_dir3 <<< blocks, threads, shared_bytes >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z); | |
125 | 146 | |
126 | 147 | |
127 | 148 | //call the kernel to update the gradient direction | ... | ... |
cpp/vote3.cuh
... | ... | @@ -6,12 +6,13 @@ |
6 | 6 | #include <stim/cuda/cudatools.h> |
7 | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | 8 | #include <stim/cuda/cudatools/error.h> |
9 | - | |
9 | +#include "cpyToshare.cuh" | |
10 | 10 | |
11 | 11 | // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area |
12 | 12 | template<typename T> |
13 | 13 | __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ |
14 | 14 | |
15 | + extern __shared__ float s[]; | |
15 | 16 | //calculate x,y,z coordinates for this thread |
16 | 17 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
17 | 18 | //find the grid size along y |
... | ... | @@ -23,6 +24,17 @@ |
23 | 24 | |
24 | 25 | if(xi>=x || yi>=y || zi>=z) return; |
25 | 26 | |
27 | + //find the starting points and the size of the window, which will be copied to the 2D-shared memory | |
28 | + int bxs = blockIdx.x * blockDim.x - rx; | |
29 | + int bys = blockidx_y * blockDim.y - ry; | |
30 | + int xwidth = 2 * rx + blockDim.x; | |
31 | + int ywidth = 2 * ry + blockDim.y; | |
32 | + //calculate the starting point of shared memory for storing the magnitude. | |
33 | + unsigned int b_s = 3 * xwidth * ywidth; | |
34 | + //compute the coordinations of this pixel in the 2D-shared memory. | |
35 | + int sx_rx = threadIdx.x + rx; | |
36 | + int sy_ry = threadIdx.y + ry; | |
37 | + | |
26 | 38 | // define a local variable to sum the votes from the voters |
27 | 39 | float sum = 0; |
28 | 40 | |
... | ... | @@ -31,43 +43,58 @@ |
31 | 43 | int rz_sq = rz * rz; |
32 | 44 | |
33 | 45 | for (int z_v = -rz; z_v<=rz; z_v++){ |
34 | - | |
35 | - for(int y_v = -ry; y_v <= ry; y_v++){ | |
36 | - | |
37 | - for(int x_v = -rx; x_v <= rx; x_v++){ | |
38 | - | |
39 | - //calculate the x, y ,z indices for the current voter | |
40 | - int xi_v = (xi + x_v) ; | |
46 | + int zi_v = zi + z_v; | |
47 | + if ((zi_v) >=0 && (zi_v) <z){ | |
48 | + //call the function to copy one slide of the gradient from global to the 2D-shared memory. | |
49 | + __syncthreads(); | |
50 | + cpyG2S2D3ch<float>(s, gpu_grad, bxs, bys, zi + z_v, 3*xwidth, ywidth, threadIdx, blockDim, x, y); | |
51 | + __syncthreads(); | |
52 | + mag_share2D<float>(s, b_s, xwidth, ywidth, threadIdx, blockDim); | |
53 | + __syncthreads(); | |
54 | + float z_sq = z_v * z_v; | |
55 | + float d_z_sq = z_sq/rz_sq; | |
56 | + | |
57 | + for(int y_v = -ry; y_v <= ry; y_v++){ | |
41 | 58 | int yi_v = (yi + y_v) ; |
42 | - int zi_v = (zi + z_v) ; | |
43 | - if (zi_v >=0 && zi_v < z && yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){ | |
44 | - | |
45 | - //calculate the 1D index for the current voter | |
46 | - unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v); | |
59 | + //compute the position of the current voter in the shared memory along the y axis. | |
60 | + unsigned int sIdx_y1d = (sy_ry + y_v)* xwidth; | |
61 | + | |
62 | + float y_sq = y_v * y_v; | |
63 | + float yz_sq = z_sq + y_sq; | |
64 | + float d_yz_sq = y_sq/ry_sq + d_z_sq; | |
65 | + for(int x_v = -rx; x_v <= rx; x_v++){ | |
66 | + | |
67 | + //check if the current voter is inside of the data-set | |
68 | + int xi_v = (xi + x_v) ; | |
69 | + if (yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){ | |
70 | + | |
71 | + //compute the position of the current voter in the 2D-shared memory along the x axis. | |
72 | + unsigned int sIdx_x = (sx_rx + x_v); | |
73 | + //find the 1D index of this voter in the 2D-shared memory. | |
74 | + unsigned int s_Idx = (sIdx_y1d + sIdx_x); | |
75 | + unsigned int s_Idx3 = s_Idx * 3; | |
76 | + | |
77 | + //save the gradient values for the current voter to the local variables and compute the gradient magnitude. | |
78 | + float g_v_x = s[s_Idx3]; | |
79 | + float g_v_y = s[s_Idx3 + 1]; | |
80 | + float g_v_z = s[s_Idx3 + 2]; | |
81 | + float mag_v = s[b_s + s_Idx]; //sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); | |
82 | + | |
83 | + //calculate the distance between the pixel and the current voter. | |
84 | + float x_sq = x_v * x_v; | |
85 | + float d_pv = sqrt(x_sq + yz_sq); | |
47 | 86 | |
48 | - //find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter | |
49 | - | |
50 | - float g_v_x = gpu_grad[id_v * 3 + 0]; | |
51 | - float g_v_y = gpu_grad[id_v * 3 + 1]; | |
52 | - float g_v_z = gpu_grad[id_v * 3 + 2]; | |
53 | - float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); | |
54 | - | |
55 | - //calculate the distance between the pixel and the current voter. | |
56 | - float x_sq = x_v * x_v; | |
57 | - float y_sq = y_v * y_v; | |
58 | - float z_sq = z_v * z_v; | |
59 | - float d_pv = sqrt(x_sq + y_sq + z_sq); | |
60 | - | |
61 | - // calculate the angle between the pixel and the current voter. | |
62 | - float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v); | |
87 | + // calculate the angle between the pixel and the current voter. | |
88 | + float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v); | |
63 | 89 | |
64 | - // check if the current voter is located in the voting area of this pixel. | |
65 | - if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ | |
90 | + // check if the current voter is located in the voting area of this pixel. | |
91 | + if ((((x_sq)/rx_sq + d_yz_sq)<= 1) && (cos_diff >= cos_phi)){ | |
66 | 92 | |
67 | - sum += mag_v; | |
93 | + sum += mag_v; | |
94 | + } | |
68 | 95 | } |
69 | - } | |
70 | - } | |
96 | + } | |
97 | + } | |
71 | 98 | } |
72 | 99 | } |
73 | 100 | |
... | ... | @@ -81,9 +108,9 @@ |
81 | 108 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
82 | 109 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); |
83 | 110 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); |
84 | - | |
111 | + unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T); | |
85 | 112 | //call the kernel to do the voting |
86 | - vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); | |
113 | + vote3 <T> <<< blocks, threads, shared_bytes >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); | |
87 | 114 | |
88 | 115 | } |
89 | 116 | ... | ... |