Commit 310a169858bcdd5efb908f5548e3fd086cd76488
1 parent
2558ee86
update the ivote3 project
Showing
7 changed files
with
257 additions
and
243 deletions
Show diff stats
Matlab_3D/local_max.m
... | ... | @@ -26,8 +26,8 @@ for li = 1: nl |
26 | 26 | ind_m = sub2ind(size(Ivote), cx, cy, cz); |
27 | 27 | lm_Ivote = A(ind_m); |
28 | 28 | [~, lm_indx] = max(lm_Ivote(:)); |
29 | - if ind_f(li) == ind_m(lm_indx); | |
30 | - Icenter(ind_f(li)) = 255; | |
29 | + if ind_f(li) == ind_m(lm_indx) | |
30 | + Icenter(ind_f(li)) = A(ind_f(li)); %255 | |
31 | 31 | end |
32 | 32 | end |
33 | 33 | ... | ... |
cpp/cudafunc.cu
... | ... | @@ -5,71 +5,65 @@ |
5 | 5 | #include "vote3_atomic_aabb.cuh" |
6 | 6 | #include "update_dir3_aabb.cuh" |
7 | 7 | #include "local_max3.cuh" |
8 | - | |
8 | +#include <fstream> | |
9 | 9 | |
10 | 10 | void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], |
11 | - int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ | |
11 | + int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){ | |
12 | 12 | |
13 | 13 | |
14 | - cudaSetDevice(0); | |
15 | - | |
16 | - unsigned int bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data | |
14 | + size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data | |
17 | 15 | |
18 | 16 | float* gpuI0; //assign memory on gpu for the input data |
19 | 17 | cudaMalloc(&gpuI0, bytes); |
20 | 18 | cudaMemcpy(gpuI0, img, bytes, cudaMemcpyHostToDevice); //copy the image data to the GPU. |
21 | 19 | |
22 | - | |
23 | 20 | gpu_gaussian_blur3<float>(gpuI0, sigma, x, y, z); //call the blurring function from the gpu. |
24 | 21 | cudaDeviceSynchronize(); |
25 | 22 | |
26 | 23 | float* gpu_grad; //assign memory on the gpu for the gradient along the X, y, z. |
27 | 24 | cudaMalloc(&gpu_grad, bytes*3); |
28 | 25 | |
29 | - gpu_gradient3<float>(gpu_grad, gpuI0, 1, x, y, z); //call the gradient function from the gpu. | |
26 | + gpu_gradient3<float>(gpu_grad, gpuI0, x, y, z); //call the gradient function from the gpu. | |
30 | 27 | cudaFree(gpuI0); |
31 | 28 | |
32 | 29 | float* gpu_vote; |
33 | 30 | cudaMalloc(&gpu_vote, bytes); |
34 | 31 | |
35 | 32 | float cos_phi = cos(phi); |
36 | - | |
33 | + | |
37 | 34 | //call the vote function. |
38 | 35 | for (int i = 0; i < iter; i++){ |
39 | 36 | |
40 | 37 | cudaMemset(gpu_vote, 0, bytes); |
41 | 38 | gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z); |
42 | 39 | cudaDeviceSynchronize(); |
43 | - | |
44 | - //if (phi >= d_phi){ | |
40 | + if (i == 0) { | |
41 | + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); | |
42 | + std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary); | |
43 | + fvote.write((char*)img, bytes); | |
44 | + fvote.close(); | |
45 | + } | |
46 | + if (i == 1) { | |
47 | + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); | |
48 | + std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary); | |
49 | + fvote.write((char*)img, bytes); | |
50 | + fvote.close(); | |
51 | + } | |
45 | 52 | gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z); |
46 | - cudaDeviceSynchronize(); | |
47 | - phi = phi - d_phi; | |
48 | - cos_phi = cos(phi); | |
49 | - //} | |
50 | - | |
53 | + cudaDeviceSynchronize(); | |
54 | + phi = phi - d_phi; | |
55 | + cos_phi = cos(phi); | |
51 | 56 | } |
52 | 57 | |
53 | 58 | cudaFree(gpu_grad); |
54 | 59 | cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); |
55 | 60 | |
56 | - //allocate space on the gpu for the final detected cells. | |
57 | - //float* gpu_output; | |
58 | - //cudaMalloc(&gpu_output, bytes); | |
59 | - | |
60 | - ////call the local max function | |
61 | - //gpu_local_max3<float>(gpu_output, gpu_vote, t, conn, x, y, z); | |
62 | - | |
63 | - ////copy the final result to the cpu. | |
64 | - //cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost); | |
65 | - // | |
66 | - // | |
67 | 61 | cudaFree(gpu_vote); |
68 | 62 | //cudaFree(gpu_output); |
69 | 63 | |
70 | 64 | } |
71 | 65 | |
72 | -void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ | |
66 | +void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){ | |
73 | 67 | unsigned int bytes = x * y * z * sizeof(float); |
74 | 68 | |
75 | 69 | cudaSetDevice(0); |
... | ... | @@ -89,5 +83,4 @@ void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, u |
89 | 83 | |
90 | 84 | cudaFree(gpuV); |
91 | 85 | cudaFree(gpuOut); |
92 | -} | |
93 | - | |
86 | +} | |
94 | 87 | \ No newline at end of file | ... | ... |
cpp/gaussian_blur3.cuh
... | ... | @@ -8,6 +8,7 @@ |
8 | 8 | |
9 | 9 | #define pi 3.14159 |
10 | 10 | |
11 | + | |
11 | 12 | |
12 | 13 | template<typename T> |
13 | 14 | __global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){ |
... | ... | @@ -22,7 +23,7 @@ |
22 | 23 | int i = zi * x * y + yi * x + xi; |
23 | 24 | |
24 | 25 | // calculate the kernel size |
25 | - int k_size = sigma * 4; | |
26 | + T k_size = sigma * 4; | |
26 | 27 | |
27 | 28 | //if the current pixel is outside of the image |
28 | 29 | if(xi >= x || yi >= y || zi>=z) |
... | ... | @@ -33,14 +34,21 @@ |
33 | 34 | T G; |
34 | 35 | T sum = 0; //running weighted sum across the kernel |
35 | 36 | out[i] = 0; |
36 | - T sigma_sq = 2 * sigma * sigma; | |
37 | - T a = 1.0 / (sigma * sqrt(2 * pi)); | |
37 | + T sigma_sq = 2.0 * sigma * sigma; | |
38 | + T a = 1.0 / (sigma * sqrt(2.0 * pi)); | |
39 | + | |
40 | + //handle the boundary in x direction | |
41 | + int kl = -(int)k_size; | |
42 | + //if (xi < k_size) kl = -xi; | |
43 | + int kh = (int)k_size; | |
44 | + //if (xi >= x - (int)k_size) kh = x - xi - 1; | |
45 | + | |
38 | 46 | |
39 | 47 | //for each element of the kernel |
40 | - for(int ki = -k_size; ki <= k_size; ki++){ | |
48 | + for(int ki = kl; ki <= kh; ki++){ | |
41 | 49 | |
42 | 50 | //calculate the gaussian value |
43 | - G = a * exp(-(ki*ki) / (sigma_sq)); | |
51 | + G = a * exp(-(T)(ki*ki) / (sigma_sq)); | |
44 | 52 | //calculate the global coordinates for this point in the kernel |
45 | 53 | gx = (xi + ki) % x; |
46 | 54 | gi = zi * x * y + yi * x + gx; |
... | ... | @@ -65,7 +73,7 @@ |
65 | 73 | int i = zi * x * y + yi * x + xi; |
66 | 74 | |
67 | 75 | // calculate the kernel size |
68 | - int k_size = sigma * 4; | |
76 | + T k_size = sigma * 4; | |
69 | 77 | |
70 | 78 | //if the current pixel is outside of the image |
71 | 79 | if(xi >= x || yi >= y || zi>=z) |
... | ... | @@ -76,14 +84,22 @@ |
76 | 84 | T G; |
77 | 85 | T sum = 0; //running weighted sum across the kernel |
78 | 86 | out[i] = 0; |
79 | - T sigma_sq = 2 * sigma * sigma; | |
80 | - T a = 1.0 / (sigma * sqrt(2 * pi)); | |
87 | + T sigma_sq = 2.0 * sigma * sigma; | |
88 | + T a = 1.0 / (sigma * sqrt(2.0 * pi)); | |
89 | + | |
90 | + //handle the boundary in y direction | |
91 | + int kl = -(int)k_size; | |
92 | + //if (yi < k_size) kl = -yi; | |
93 | + int kh = (int)k_size; | |
94 | + //if (yi >= y - (int)k_size) kh = y - yi - 1; | |
95 | + | |
96 | + | |
81 | 97 | |
82 | 98 | //for each element of the kernel |
83 | - for(int ki = -k_size; ki <= k_size; ki++){ | |
99 | + for(int ki = kl; ki <= kh; ki++){ | |
84 | 100 | |
85 | 101 | //calculate the gaussian value |
86 | - G = a * exp(-(ki*ki) / sigma_sq); | |
102 | + G = a * exp(-(T)(ki*ki) / sigma_sq); | |
87 | 103 | //calculate the global coordinates for this point in the kernel |
88 | 104 | gy = (yi + ki ) % y; |
89 | 105 | gi = zi * x * y + gy * x + xi; |
... | ... | @@ -108,7 +124,7 @@ |
108 | 124 | int i = zi * x * y + yi * x + xi; |
109 | 125 | |
110 | 126 | // calculate the kernel size |
111 | - int k_size = sigma * 4; | |
127 | + T k_size = sigma * 4; | |
112 | 128 | |
113 | 129 | //if the current pixel is outside of the image |
114 | 130 | if(xi >= x || yi >= y || zi>=z) |
... | ... | @@ -119,14 +135,20 @@ |
119 | 135 | T G; |
120 | 136 | T sum = 0; //running weighted sum across the kernel |
121 | 137 | out[i] = 0; |
122 | - T sigma_sq = 2 * sigma * sigma; | |
123 | - T a = 1.0 / (sigma * sqrt(2 * pi)); | |
138 | + T sigma_sq = 2.0 * sigma * sigma; | |
139 | + T a = 1.0 / (sigma * sqrt(2.0 * pi)); | |
140 | + | |
141 | + //handle the boundary in z direction | |
142 | + int kl = -(int)k_size; | |
143 | + //if (zi < k_size) kl = -zi; | |
144 | + int kh = (int)k_size; | |
145 | + //if (zi >= z - (int)k_size) kh = z - zi - 1; | |
124 | 146 | |
125 | 147 | //for each element of the kernel |
126 | - for(int ki = -k_size; ki <= k_size; ki++){ | |
148 | + for(int ki = kl; ki <= kh; ki++){ | |
127 | 149 | |
128 | 150 | //calculate the gaussian value |
129 | - G = a * exp(-(ki*ki) / sigma_sq); | |
151 | + G = a * exp(-(T)(ki*ki) / sigma_sq); | |
130 | 152 | //calculate the global coordinates for this point in the kernel |
131 | 153 | gz = (zi + ki) % z; |
132 | 154 | gi = gz * x * y + yi * x + xi; |
... | ... | @@ -138,13 +160,13 @@ |
138 | 160 | } |
139 | 161 | |
140 | 162 | template<typename T> |
141 | - void gpu_gaussian_blur3(T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){ | |
163 | + void gpu_gaussian_blur3(T* image, T sigma[], size_t x, size_t y, size_t z){ | |
142 | 164 | |
143 | 165 | //get the number of pixels in the image |
144 | - unsigned int pixels = x * y * z; | |
145 | - unsigned int bytes = sizeof(T) * pixels; | |
166 | + size_t pixels = x * y * z; | |
167 | + size_t bytes = sizeof(T) * pixels; | |
146 | 168 | |
147 | - int max_threads = stim::maxThreadsPerBlock(); | |
169 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
148 | 170 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); |
149 | 171 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); |
150 | 172 | ... | ... |
cpp/gradient3.cuh
... | ... | @@ -7,7 +7,7 @@ |
7 | 7 | #include <stim/cuda/cudatools/error.h> |
8 | 8 | |
9 | 9 | template<typename T> |
10 | -__global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){ | |
10 | +__global__ void gradient3(T* out, T* in, int x, int y, int z){ | |
11 | 11 | |
12 | 12 | //calculate x,y,z coordinates for this thread |
13 | 13 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
... | ... | @@ -55,30 +55,28 @@ __global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){ |
55 | 55 | if(zi > 0 && zi < z-1) |
56 | 56 | out[i * 3 + 2] = (in[i_zp] - in[i_zn]) / 2; |
57 | 57 | |
58 | - out[i * 3 + 2] *= 1/anisotropy; | |
59 | - | |
60 | 58 | } |
61 | 59 | |
62 | 60 | template<typename T> |
63 | 61 | |
64 | -void gpu_gradient3(T* gpuGrad, T* gpuI, float anisotropy, unsigned int x, unsigned int y, unsigned int z){ | |
62 | +void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){ | |
65 | 63 | |
66 | 64 | |
67 | - int max_threads = stim::maxThreadsPerBlock(); | |
65 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
68 | 66 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); |
69 | 67 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); |
70 | 68 | |
71 | 69 | //call the GPU kernel to determine the gradient |
72 | - gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, anisotropy, x, y, z); | |
70 | + gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z); | |
73 | 71 | |
74 | 72 | } |
75 | 73 | |
76 | 74 | template<typename T> |
77 | -void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int y, unsigned int z){ | |
75 | +void cpu_gradient3(T* out, T* in, size_t x, size_t y, size_t z){ | |
78 | 76 | |
79 | 77 | //get the number of pixels in the image |
80 | - unsigned int pixels = x * y * z; | |
81 | - unsigned int bytes = pixels * sizeof(T); | |
78 | + size_t pixels = x * y * z; | |
79 | + size_t bytes = pixels * sizeof(T); | |
82 | 80 | |
83 | 81 | //allocate space on the GPU for the input image |
84 | 82 | T* gpuIn; |
... | ... | @@ -92,7 +90,7 @@ void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int |
92 | 90 | cudaMalloc(&gpuOut, bytes * 3); //the output image will have two channels (x, y) |
93 | 91 | |
94 | 92 | //call the GPU version of this function |
95 | - gpu_gradient3(gpuOut, gpuIn, anisotropy, x, y, z); | |
93 | + gpu_gradient3(gpuOut, gpuIn, x, y, z); | |
96 | 94 | |
97 | 95 | //copy the results to the CPU |
98 | 96 | cudaMemcpy(out, gpuOut, bytes * 3, cudaMemcpyDeviceToHost); | ... | ... |
cpp/main.cpp
... | ... | @@ -19,21 +19,22 @@ float phi; |
19 | 19 | size_t x, y, z; |
20 | 20 | |
21 | 21 | |
22 | -void ivote3(float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], | |
23 | - unsigned int x, unsigned int y, unsigned int z); | |
24 | -void lmax(float* center, float* vote, float t1, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z); | |
25 | - | |
26 | -void invert_data(float* cpuI, unsigned int x, unsigned int y, unsigned int z){ | |
27 | - for(int ix = 0; ix < x; ix++){ | |
28 | - for (int iy = 0; iy < y; iy++){ | |
29 | - for (int iz = 0; iz < z; iz++){ | |
30 | - int idx = iz * x * y + iy * x + ix; | |
31 | - cpuI[idx] = 255 - cpuI[idx]; | |
32 | - } | |
22 | +void ivote3(float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], | |
23 | + size_t x, size_t y, size_t z); | |
24 | +void lmax(float* center, float* vote, float t1, unsigned int conn[], size_t x, size_t y, size_t z); | |
25 | + | |
26 | +void invert_data(float* cpuI, size_t x, size_t y, size_t z) { | |
27 | + size_t idx; | |
28 | + for (size_t ix = 0; ix < x; ix++) { | |
29 | + for (size_t iy = 0; iy < y; iy++) { | |
30 | + for (size_t iz = 0; iz < z; iz++) { | |
31 | + idx = iz * x * y + iy * x + ix; | |
32 | + cpuI[idx] = 255 - cpuI[idx]; | |
33 | 33 | } |
34 | 34 | } |
35 | 35 | } |
36 | - | |
36 | +} | |
37 | + | |
37 | 38 | |
38 | 39 | |
39 | 40 | void advertise() { |
... | ... | @@ -57,12 +58,12 @@ void init_args(int argc, char* argv[]) { |
57 | 58 | args.add("x", "size of the dataset along X axis", "positive value"); |
58 | 59 | args.add("y", "size of the dataset along Y axis", "positive value"); |
59 | 60 | args.add("z", "size of the dataset along Z axis", "positive value"); |
60 | - args.add("t", "threshold value for the final result", "positive valu"); | |
61 | + args.add("t", "threshold value for the final result", "0", "positive valu"); | |
61 | 62 | args.add("invert", "to invert the input data set", "string"); |
62 | 63 | args.add("rmax", "maximum possible radius of the cells in the input image", "10", "[positive value]"); |
63 | 64 | args.add("phi", "starting angle for the vote region (in degrees)", "25.0", "0 <= phi < 180"); |
64 | 65 | args.add("iter", "number of iterations for voting", "8", "i > 0"); |
65 | - args.add("sigma", "the gaussian blur standard deviation", "5", "s >=0 (s = 0, no blurring)"); | |
66 | + args.add("sigma", "the gaussian blur standard deviation", "3", "s >=0 (s = 0, no blurring)"); | |
66 | 67 | args.add("conn", "the number of connected neighbors for calculating the local maxima", "5", "[positive value]"); |
67 | 68 | //parse the command line arguments. |
68 | 69 | args.parse(argc, argv); |
... | ... | @@ -95,94 +96,93 @@ void init_args(int argc, char* argv[]) { |
95 | 96 | t = args["t"].as_float(); |
96 | 97 | sigma = args["sigma"].as_float(); |
97 | 98 | phi = (float)args["phi"].as_float() * (float)stim::PI / 180; |
98 | - | |
99 | + | |
99 | 100 | } |
100 | -int main(int argc, char** argv){ | |
101 | +int main(int argc, char** argv) { | |
101 | 102 | |
102 | 103 | |
103 | 104 | cudaDeviceProp prop; |
104 | 105 | int count; |
105 | 106 | cudaGetDeviceCount(&count); |
106 | - for (int i=0; i<count; i++){ | |
107 | + for (int i = 0; i<count; i++) { | |
107 | 108 | cudaGetDeviceProperties(&prop, i); |
108 | 109 | printf("current device ID: %d\n", i); |
109 | 110 | printf("device name: %s\n", prop.name); |
110 | 111 | printf("total global mem: %lu\n", prop.totalGlobalMem); |
111 | 112 | printf("shared memory per block: %lu\n", prop.sharedMemPerBlock); |
112 | 113 | } |
113 | - | |
114 | + | |
114 | 115 | init_args(argc, argv); |
115 | 116 | |
116 | - unsigned int r[3] = { rmax , rmax, rmax}; | |
117 | - | |
118 | - float sigma3[3] = { sigma, sigma, sigma}; | |
119 | - unsigned int conn[3] = { nlmax, nlmax, nlmax}; | |
120 | - float d_phi = phi/(iter+2); | |
121 | - | |
122 | - size_t bytes = x*y*z*sizeof(float); | |
117 | + unsigned int r[3] = { rmax , rmax, rmax }; | |
118 | + | |
119 | + float sigma3[3] = { sigma, sigma, sigma }; | |
120 | + unsigned int conn[3] = { nlmax, nlmax, nlmax }; | |
121 | + float d_phi = phi / (iter); | |
122 | + | |
123 | + size_t bytes = x*y*z * sizeof(float); | |
123 | 124 | |
124 | 125 | //allocate space on the cpu for the input data |
125 | - float* cpuI = (float*) malloc(bytes); | |
126 | + float* cpuI = (float*)malloc(bytes); | |
126 | 127 | |
127 | 128 | //load the input file into the cpuI |
128 | 129 | std::ifstream nissl(args.arg(0), std::ios::in | std::ios::binary); |
129 | 130 | nissl.read((char*)cpuI, bytes); |
130 | 131 | nissl.close(); |
131 | - if(args["invert"].is_set()) | |
132 | + if (args["invert"].is_set()) | |
132 | 133 | invert_data(cpuI, x, y, z); |
133 | - | |
134 | + | |
134 | 135 | //write a new file from the cpuI. |
135 | 136 | std::ofstream original("0-inv-128.vol", std::ofstream::out | std::ofstream::binary); |
136 | 137 | original.write((char*)cpuI, bytes); |
137 | 138 | original.close(); |
138 | - | |
139 | - | |
139 | + | |
140 | + | |
140 | 141 | ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function |
141 | - | |
142 | - std::ofstream fvote("0-vote8.vol", std::ofstream::out | std::ofstream::binary); | |
142 | + | |
143 | + std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary); | |
143 | 144 | fvote.write((char*)cpuI, bytes); |
144 | 145 | fvote.close(); |
145 | - | |
146 | + | |
146 | 147 | //allocate space on the cpu for the output result |
147 | 148 | float* cpu_out = (float*)malloc(bytes * 3); |
148 | 149 | |
149 | 150 | //write the output file. |
150 | 151 | //for (int t0=0; t0<=5000; t0+=100){ |
151 | 152 | // float t1 = t0; |
152 | - int t0 = t; | |
153 | - lmax(cpu_out, cpuI, t, conn, x, y, z); | |
154 | - //std::ofstream fo("shared2D-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary); | |
155 | - std::ofstream fo( args.arg(1), std::ofstream::out | std::ofstream::binary); | |
156 | - fo.write((char*)cpu_out, bytes); | |
157 | - fo.close(); | |
153 | + int t0 = t; | |
154 | + lmax(cpu_out, cpuI, t, conn, x, y, z); | |
158 | 155 | |
156 | + std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary); | |
157 | + fo.write((char*)cpu_out, bytes); | |
158 | + fo.close(); | |
159 | + | |
159 | 160 | // creat a file for saving the list centers |
160 | - | |
161 | - std::ofstream list(args.arg(2)); | |
162 | - // set the number of detected cells to zero. | |
163 | - int nod = 0; | |
164 | - if (list.is_open()){ | |
165 | - | |
166 | - for (int iz=0; iz<z; iz++){ | |
167 | - for (int iy=0; iy<y; iy++){ | |
168 | - for (int ix=0; ix<x; ix++){ | |
169 | - | |
170 | - int idx = iz * x * y + iy * x + ix; | |
171 | - if (cpu_out[idx]>0){ | |
172 | - nod++; | |
173 | - list << ix << " " << iy << " "<< iz << " " << cpu_out[idx] << '\n' ; | |
174 | - | |
175 | - } | |
176 | - } | |
161 | + | |
162 | + std::ofstream list(args.arg(2)); | |
163 | + // set the number of detected cells to zero. | |
164 | + int nod = 0; | |
165 | + if (list.is_open()) { | |
166 | + | |
167 | + for (int iz = 0; iz<z; iz++) { | |
168 | + for (int iy = 0; iy<y; iy++) { | |
169 | + for (int ix = 0; ix<x; ix++) { | |
170 | + | |
171 | + int idx = iz * x * y + iy * x + ix; | |
172 | + if (cpu_out[idx]>0) { | |
173 | + nod++; | |
174 | + list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n'; | |
175 | + | |
177 | 176 | } |
178 | 177 | } |
179 | - | |
180 | - list.close(); | |
178 | + } | |
181 | 179 | } |
182 | 180 | |
181 | + list.close(); | |
182 | + } | |
183 | + | |
183 | 184 | |
184 | 185 | //} |
185 | - cudaDeviceReset(); | |
186 | - | |
187 | -} | |
186 | + cudaDeviceReset(); | |
188 | 187 | |
188 | +} | |
189 | 189 | \ No newline at end of file | ... | ... |
cpp/update_dir3_aabb.cuh
... | ... | @@ -46,7 +46,7 @@ |
46 | 46 | float step = 360.0/n; |
47 | 47 | stim::circle<float> cir(center, d, norm); |
48 | 48 | stim::aabb3<int> bb(xi,yi,zi); |
49 | - bb.insert(xc,yc,zc); | |
49 | + bb.insert((int) xc, (int)yc, (int)zc); | |
50 | 50 | for(float j = 0; j <360.0; j += step){ |
51 | 51 | stim::vec3<float> out = cir.p(j); |
52 | 52 | bb.insert(out[0], out[1], out[2]); |
... | ... | @@ -67,7 +67,7 @@ |
67 | 67 | float id_x = g[0]; // define local variables for the x, y, and z coordinations point to the vote direction |
68 | 68 | float id_y = g[1]; |
69 | 69 | float id_z = g[2]; |
70 | - | |
70 | + | |
71 | 71 | for (bz=bb.low[2]; bz<=bb.high[2]; bz++){ |
72 | 72 | dz = bz - zi; //compute the distance bw the voter and the current counter along z axis |
73 | 73 | dz_sq = dz * dz; |
... | ... | @@ -85,9 +85,9 @@ |
85 | 85 | l_vote = gpu_vote[idx_c]; |
86 | 86 | if (l_vote>max) { |
87 | 87 | max = l_vote; |
88 | - id_x = dx; | |
89 | - id_y = dy; | |
90 | - id_z = dz; | |
88 | + id_x = (float)dx; | |
89 | + id_y = (float)dy; | |
90 | + id_z = (float)dz; | |
91 | 91 | } |
92 | 92 | } |
93 | 93 | } |
... | ... | @@ -103,25 +103,26 @@ |
103 | 103 | |
104 | 104 | // this kernel updates the gradient direction by the calculated voting direction. |
105 | 105 | template<typename T> |
106 | - __global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){ | |
106 | + __global__ void update_grad3(T* gpu_grad, T* gpu_dir, size_t x, size_t y, size_t z){ | |
107 | 107 | |
108 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread | |
108 | + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread | |
109 | + | |
110 | + size_t grid_y = y / blockDim.y; //find the grid size along y | |
111 | + size_t blockidx_y = blockIdx.y % grid_y; | |
112 | + size_t yi = blockidx_y * blockDim.y + threadIdx.y; | |
113 | + size_t zi = blockIdx.y / grid_y; | |
109 | 114 | |
110 | - int grid_y = y / blockDim.y; //find the grid size along y | |
111 | - int blockidx_y = blockIdx.y % grid_y; | |
112 | - int yi = blockidx_y * blockDim.y + threadIdx.y; | |
113 | - int zi = blockIdx.y / grid_y; | |
114 | - int i = zi * x * y + yi * x + xi; | |
115 | - | |
116 | 115 | if(xi >= x || yi >= y || zi >= z) return; |
117 | 116 | |
117 | + size_t i = zi * x * y + yi * x + xi; | |
118 | + | |
118 | 119 | gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0]; //update the gradient image with the new direction direction |
119 | 120 | gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1]; |
120 | 121 | gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2]; |
121 | 122 | } |
122 | 123 | |
123 | 124 | template<typename T> |
124 | - void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ | |
125 | + void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){ | |
125 | 126 | |
126 | 127 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
127 | 128 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); | ... | ... |
cpp/vote3_atomic_aabb.cuh
... | ... | @@ -13,121 +13,121 @@ |
13 | 13 | #include <stim/math/vector.h> |
14 | 14 | #include <stim/visualization/aabb3.h> |
15 | 15 | |
16 | - // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | |
17 | - template<typename T> | |
18 | - __global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ | |
19 | - | |
20 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread | |
21 | - | |
22 | - int grid_y = y / blockDim.y; //find the grid size along y | |
23 | - int blockidx_y = blockIdx.y % grid_y; | |
24 | - int yi = blockidx_y * blockDim.y + threadIdx.y; | |
25 | - int zi = blockIdx.y / grid_y; | |
26 | - | |
27 | - if(xi>=x || yi>=y || zi>=z) return; | |
28 | - | |
29 | - int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter | |
30 | - | |
31 | - | |
32 | - float rx_sq = rx * rx; // compute the square for rmax | |
33 | - float ry_sq = ry * ry; | |
34 | - float rz_sq = rz * rz; | |
35 | - | |
36 | - stim::vec3<float> g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]); // form a vec3 variable for the gradient vector | |
37 | - stim::vec3<float> g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector | |
38 | - float n =8; //set the number of points to find the boundaries of the conical voting area | |
39 | - float xc = rx * cos(g_sph[1]) * sin(g_sph[2]); //calculate the center point of the surface of the voting area for the voter | |
40 | - float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ; | |
41 | - float zc = rz * cos(g_sph[2]) ; | |
42 | - float r = sqrt(xc*xc + yc*yc + zc*zc); | |
43 | - xc+=xi; | |
44 | - yc+=yi; | |
45 | - zc+=zi; | |
46 | - stim::vec3<float> center(xc,yc,zc); | |
47 | - | |
48 | - float d = 2 * r * tan(phi); //find the diameter of the conical voting area | |
49 | - stim::vec3<float> norm = g.norm(); //compute the normalize gradient vector | |
50 | - float step = 360.0/n; | |
51 | - stim::circle<float> cir(center, d, norm); | |
52 | - stim::aabb3<int> bb(xi,yi,zi); | |
53 | - bb.insert((int)xc, (int)yc, (int)zc); | |
54 | - for(float j = 0; j <360.0; j += step){ | |
55 | - stim::vec3<float> out = cir.p(j); | |
56 | - bb.insert(out[0], out[1], out[2]); | |
16 | +// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | |
17 | +template<typename T> | |
18 | +__global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z) { | |
19 | + | |
20 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread | |
21 | + | |
22 | + int grid_y = y / blockDim.y; //find the grid size along y | |
23 | + int blockidx_y = blockIdx.y % grid_y; | |
24 | + int yi = blockidx_y * blockDim.y + threadIdx.y; | |
25 | + int zi = blockIdx.y / grid_y; | |
26 | + | |
27 | + if (xi >= x || yi >= y || zi >= z) return; | |
28 | + | |
29 | + int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter | |
30 | + | |
31 | + | |
32 | + float rx_sq = rx * rx; // compute the square for rmax | |
33 | + float ry_sq = ry * ry; | |
34 | + float rz_sq = rz * rz; | |
35 | + | |
36 | + stim::vec3<float> g(gpu_grad[3 * i], gpu_grad[3 * i + 1], gpu_grad[3 * i + 2]); // form a vec3 variable for the gradient vector | |
37 | + stim::vec3<float> g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector | |
38 | + float n = 8; //set the number of points to find the boundaries of the conical voting area | |
39 | + float xc = rx * cos(g_sph[1]) * sin(g_sph[2]); //calculate the center point of the surface of the voting area for the voter | |
40 | + float yc = ry * sin(g_sph[1]) * sin(g_sph[2]); | |
41 | + float zc = rz * cos(g_sph[2]); | |
42 | + float r = sqrt(xc*xc + yc*yc + zc*zc); | |
43 | + xc += xi; | |
44 | + yc += yi; | |
45 | + zc += zi; | |
46 | + stim::vec3<float> center(xc, yc, zc); | |
47 | + | |
48 | + float d = 2 * r * tan(phi); //find the diameter of the conical voting area | |
49 | + stim::vec3<float> norm = g.norm(); //compute the normalize gradient vector | |
50 | + float step = 360.0 / n; | |
51 | + stim::circle<float> cir(center, d, norm); | |
52 | + stim::aabb3<int> bb(xi, yi, zi); | |
53 | + bb.insert((int)xc, (int)yc, (int)zc); | |
54 | + for (float j = 0; j <360.0; j += step) { | |
55 | + stim::vec3<float> out = cir.p(j); | |
56 | + bb.insert(out[0], out[1], out[2]); | |
57 | + } | |
58 | + bb.trim_low(xi - rx, yi - ry, zi - rz); | |
59 | + bb.trim_low(0, 0, 0); | |
60 | + bb.trim_high(xi + rx, yi + ry, zi + rz); | |
61 | + bb.trim_high(x - 1, y - 1, z - 1); | |
62 | + int bx, by, bz; | |
63 | + int dx, dy, dz; | |
64 | + float dx_sq, dy_sq, dz_sq; | |
65 | + float dist, cos_diff; | |
66 | + int idx_c; | |
67 | + for (bz = bb.low[2]; bz <= bb.high[2]; bz++) { | |
68 | + dz = bz - zi; //compute the distance bw the voter and the current counter along z axis | |
69 | + dz_sq = dz * dz; | |
70 | + for (by = bb.low[1]; by <= bb.high[1]; by++) { | |
71 | + dy = by - yi; //compute the distance bw the voter and the current counter along y axis | |
72 | + dy_sq = dy * dy; | |
73 | + for (bx = bb.low[0]; bx <= bb.high[0]; bx++) { | |
74 | + dx = bx - xi; //compute the distance bw the voter and the current counter along x axis | |
75 | + dx_sq = dx * dx; | |
76 | + | |
77 | + dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter | |
78 | + cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz) / dist; // calculate the cosine of angle between the voter and the current counter | |
79 | + if (((dx_sq / rx_sq + dy_sq / ry_sq + dz_sq / rz_sq) <= 1) && (cos_diff >= cos_phi)) { //check if the current counter located in the voting area of the voter | |
80 | + idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter | |
81 | + atomicAdd(&gpu_vote[idx_c], g_sph[0]); | |
82 | + } | |
57 | 83 | } |
58 | - bb.trim_low(xi-rx, yi-ry, zi-rz); | |
59 | - bb.trim_low(0,0,0); | |
60 | - bb.trim_high(xi+rx, yi+ry, zi+rz); | |
61 | - bb.trim_high(x-1, y-1, z-1); | |
62 | - int bx,by,bz; | |
63 | - int dx, dy, dz; | |
64 | - float dx_sq, dy_sq, dz_sq; | |
65 | - float dist, cos_diff; | |
66 | - int idx_c; | |
67 | - for (bz=bb.low[2]; bz<=bb.high[2]; bz++){ | |
68 | - dz = bz - zi; //compute the distance bw the voter and the current counter along z axis | |
69 | - dz_sq = dz * dz; | |
70 | - for (by=bb.low[1]; by<=bb.high[1]; by++){ | |
71 | - dy = by - yi; //compute the distance bw the voter and the current counter along y axis | |
72 | - dy_sq = dy * dy; | |
73 | - for (bx=bb.low[0]; bx<=bb.high[0]; bx++){ | |
74 | - dx = bx - xi; //compute the distance bw the voter and the current counter along x axis | |
75 | - dx_sq = dx * dx; | |
76 | - | |
77 | - dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter | |
78 | - cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz)/dist; // calculate the cosine of angle between the voter and the current counter | |
79 | - if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){ //check if the current counter located in the voting area of the voter | |
80 | - idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter | |
81 | - atomicAdd (&gpu_vote[idx_c] , g_sph[0]); | |
82 | - } | |
83 | - } | |
84 | - } | |
85 | - } | |
86 | 84 | } |
85 | + } | |
86 | +} | |
87 | 87 | |
88 | - template<typename T> | |
89 | - void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ | |
88 | +template<typename T> | |
89 | +void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z) { | |
90 | 90 | |
91 | - | |
92 | - unsigned int max_threads = stim::maxThreadsPerBlock(); | |
93 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | |
94 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | |
95 | - vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x , y, z); //call the kernel to do the voting | |
96 | 91 | |
97 | - } | |
92 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
93 | + dim3 threads(sqrt(max_threads), sqrt(max_threads)); | |
94 | + dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | |
95 | + vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z); //call the kernel to do the voting | |
98 | 96 | |
97 | +} | |
99 | 98 | |
100 | - template<typename T> | |
101 | - void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ | |
102 | - | |
103 | - //calculate the number of bytes in the array | |
104 | - unsigned int bytes = x * y * z * sizeof(T); | |
105 | - | |
106 | - | |
107 | - //allocate space on the GPU for the Vote Image | |
108 | - T* gpu_vote; | |
109 | - cudaMalloc(&gpu_vote, bytes); | |
110 | - | |
111 | - //allocate space on the GPU for the input Gradient image | |
112 | - T* gpu_grad; | |
113 | - cudaMalloc(&gpu_grad, bytes*3); | |
114 | - | |
115 | - | |
116 | - //copy the Gradient data to the GPU | |
117 | - cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); | |
118 | - | |
119 | - | |
120 | - //call the GPU version of the vote calculation function | |
121 | - gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z); | |
122 | - | |
123 | - //copy the Vote Data back to the CPU | |
124 | - cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ; | |
125 | - | |
126 | - //free allocated memory | |
127 | - cudaFree(gpu_vote); | |
128 | - cudaFree(gpu_grad); | |
129 | - | |
130 | - } | |
99 | + | |
100 | +template<typename T> | |
101 | +void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z) { | |
102 | + | |
103 | + //calculate the number of bytes in the array | |
104 | + unsigned int bytes = x * y * z * sizeof(T); | |
105 | + | |
106 | + | |
107 | + //allocate space on the GPU for the Vote Image | |
108 | + T* gpu_vote; | |
109 | + cudaMalloc(&gpu_vote, bytes); | |
110 | + | |
111 | + //allocate space on the GPU for the input Gradient image | |
112 | + T* gpu_grad; | |
113 | + cudaMalloc(&gpu_grad, bytes * 3); | |
114 | + | |
115 | + | |
116 | + //copy the Gradient data to the GPU | |
117 | + cudaMemcpy(gpu_grad, cpu_grad, bytes * 3, cudaMemcpyHostToDevice); | |
118 | + | |
119 | + | |
120 | + //call the GPU version of the vote calculation function | |
121 | + gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x, y, z); | |
122 | + | |
123 | + //copy the Vote Data back to the CPU | |
124 | + cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost); | |
125 | + | |
126 | + //free allocated memory | |
127 | + cudaFree(gpu_vote); | |
128 | + cudaFree(gpu_grad); | |
129 | + | |
130 | +} | |
131 | 131 | |
132 | 132 | |
133 | 133 | #endif |
134 | 134 | \ No newline at end of file | ... | ... |