Commit 310a169858bcdd5efb908f5548e3fd086cd76488
1 parent
2558ee86
update the ivote3 project
Showing
7 changed files
with
257 additions
and
243 deletions
Show diff stats
Matlab_3D/local_max.m
@@ -26,8 +26,8 @@ for li = 1: nl | @@ -26,8 +26,8 @@ for li = 1: nl | ||
26 | ind_m = sub2ind(size(Ivote), cx, cy, cz); | 26 | ind_m = sub2ind(size(Ivote), cx, cy, cz); |
27 | lm_Ivote = A(ind_m); | 27 | lm_Ivote = A(ind_m); |
28 | [~, lm_indx] = max(lm_Ivote(:)); | 28 | [~, lm_indx] = max(lm_Ivote(:)); |
29 | - if ind_f(li) == ind_m(lm_indx); | ||
30 | - Icenter(ind_f(li)) = 255; | 29 | + if ind_f(li) == ind_m(lm_indx) |
30 | + Icenter(ind_f(li)) = A(ind_f(li)); %255 | ||
31 | end | 31 | end |
32 | end | 32 | end |
33 | 33 |
cpp/cudafunc.cu
@@ -5,71 +5,65 @@ | @@ -5,71 +5,65 @@ | ||
5 | #include "vote3_atomic_aabb.cuh" | 5 | #include "vote3_atomic_aabb.cuh" |
6 | #include "update_dir3_aabb.cuh" | 6 | #include "update_dir3_aabb.cuh" |
7 | #include "local_max3.cuh" | 7 | #include "local_max3.cuh" |
8 | - | 8 | +#include <fstream> |
9 | 9 | ||
10 | void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], | 10 | void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], |
11 | - int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ | 11 | + int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){ |
12 | 12 | ||
13 | 13 | ||
14 | - cudaSetDevice(0); | ||
15 | - | ||
16 | - unsigned int bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data | 14 | + size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data |
17 | 15 | ||
18 | float* gpuI0; //assign memory on gpu for the input data | 16 | float* gpuI0; //assign memory on gpu for the input data |
19 | cudaMalloc(&gpuI0, bytes); | 17 | cudaMalloc(&gpuI0, bytes); |
20 | cudaMemcpy(gpuI0, img, bytes, cudaMemcpyHostToDevice); //copy the image data to the GPU. | 18 | cudaMemcpy(gpuI0, img, bytes, cudaMemcpyHostToDevice); //copy the image data to the GPU. |
21 | 19 | ||
22 | - | ||
23 | gpu_gaussian_blur3<float>(gpuI0, sigma, x, y, z); //call the blurring function from the gpu. | 20 | gpu_gaussian_blur3<float>(gpuI0, sigma, x, y, z); //call the blurring function from the gpu. |
24 | cudaDeviceSynchronize(); | 21 | cudaDeviceSynchronize(); |
25 | 22 | ||
26 | float* gpu_grad; //assign memory on the gpu for the gradient along the X, y, z. | 23 | float* gpu_grad; //assign memory on the gpu for the gradient along the X, y, z. |
27 | cudaMalloc(&gpu_grad, bytes*3); | 24 | cudaMalloc(&gpu_grad, bytes*3); |
28 | 25 | ||
29 | - gpu_gradient3<float>(gpu_grad, gpuI0, 1, x, y, z); //call the gradient function from the gpu. | 26 | + gpu_gradient3<float>(gpu_grad, gpuI0, x, y, z); //call the gradient function from the gpu. |
30 | cudaFree(gpuI0); | 27 | cudaFree(gpuI0); |
31 | 28 | ||
32 | float* gpu_vote; | 29 | float* gpu_vote; |
33 | cudaMalloc(&gpu_vote, bytes); | 30 | cudaMalloc(&gpu_vote, bytes); |
34 | 31 | ||
35 | float cos_phi = cos(phi); | 32 | float cos_phi = cos(phi); |
36 | - | 33 | + |
37 | //call the vote function. | 34 | //call the vote function. |
38 | for (int i = 0; i < iter; i++){ | 35 | for (int i = 0; i < iter; i++){ |
39 | 36 | ||
40 | cudaMemset(gpu_vote, 0, bytes); | 37 | cudaMemset(gpu_vote, 0, bytes); |
41 | gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z); | 38 | gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z); |
42 | cudaDeviceSynchronize(); | 39 | cudaDeviceSynchronize(); |
43 | - | ||
44 | - //if (phi >= d_phi){ | 40 | + if (i == 0) { |
41 | + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); | ||
42 | + std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary); | ||
43 | + fvote.write((char*)img, bytes); | ||
44 | + fvote.close(); | ||
45 | + } | ||
46 | + if (i == 1) { | ||
47 | + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); | ||
48 | + std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary); | ||
49 | + fvote.write((char*)img, bytes); | ||
50 | + fvote.close(); | ||
51 | + } | ||
45 | gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z); | 52 | gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z); |
46 | - cudaDeviceSynchronize(); | ||
47 | - phi = phi - d_phi; | ||
48 | - cos_phi = cos(phi); | ||
49 | - //} | ||
50 | - | 53 | + cudaDeviceSynchronize(); |
54 | + phi = phi - d_phi; | ||
55 | + cos_phi = cos(phi); | ||
51 | } | 56 | } |
52 | 57 | ||
53 | cudaFree(gpu_grad); | 58 | cudaFree(gpu_grad); |
54 | cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); | 59 | cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); |
55 | 60 | ||
56 | - //allocate space on the gpu for the final detected cells. | ||
57 | - //float* gpu_output; | ||
58 | - //cudaMalloc(&gpu_output, bytes); | ||
59 | - | ||
60 | - ////call the local max function | ||
61 | - //gpu_local_max3<float>(gpu_output, gpu_vote, t, conn, x, y, z); | ||
62 | - | ||
63 | - ////copy the final result to the cpu. | ||
64 | - //cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost); | ||
65 | - // | ||
66 | - // | ||
67 | cudaFree(gpu_vote); | 61 | cudaFree(gpu_vote); |
68 | //cudaFree(gpu_output); | 62 | //cudaFree(gpu_output); |
69 | 63 | ||
70 | } | 64 | } |
71 | 65 | ||
72 | -void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ | 66 | +void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){ |
73 | unsigned int bytes = x * y * z * sizeof(float); | 67 | unsigned int bytes = x * y * z * sizeof(float); |
74 | 68 | ||
75 | cudaSetDevice(0); | 69 | cudaSetDevice(0); |
@@ -89,5 +83,4 @@ void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, u | @@ -89,5 +83,4 @@ void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, u | ||
89 | 83 | ||
90 | cudaFree(gpuV); | 84 | cudaFree(gpuV); |
91 | cudaFree(gpuOut); | 85 | cudaFree(gpuOut); |
92 | -} | ||
93 | - | 86 | +} |
94 | \ No newline at end of file | 87 | \ No newline at end of file |
cpp/gaussian_blur3.cuh
@@ -8,6 +8,7 @@ | @@ -8,6 +8,7 @@ | ||
8 | 8 | ||
9 | #define pi 3.14159 | 9 | #define pi 3.14159 |
10 | 10 | ||
11 | + | ||
11 | 12 | ||
12 | template<typename T> | 13 | template<typename T> |
13 | __global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){ | 14 | __global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){ |
@@ -22,7 +23,7 @@ | @@ -22,7 +23,7 @@ | ||
22 | int i = zi * x * y + yi * x + xi; | 23 | int i = zi * x * y + yi * x + xi; |
23 | 24 | ||
24 | // calculate the kernel size | 25 | // calculate the kernel size |
25 | - int k_size = sigma * 4; | 26 | + T k_size = sigma * 4; |
26 | 27 | ||
27 | //if the current pixel is outside of the image | 28 | //if the current pixel is outside of the image |
28 | if(xi >= x || yi >= y || zi>=z) | 29 | if(xi >= x || yi >= y || zi>=z) |
@@ -33,14 +34,21 @@ | @@ -33,14 +34,21 @@ | ||
33 | T G; | 34 | T G; |
34 | T sum = 0; //running weighted sum across the kernel | 35 | T sum = 0; //running weighted sum across the kernel |
35 | out[i] = 0; | 36 | out[i] = 0; |
36 | - T sigma_sq = 2 * sigma * sigma; | ||
37 | - T a = 1.0 / (sigma * sqrt(2 * pi)); | 37 | + T sigma_sq = 2.0 * sigma * sigma; |
38 | + T a = 1.0 / (sigma * sqrt(2.0 * pi)); | ||
39 | + | ||
40 | + //handle the boundary in x direction | ||
41 | + int kl = -(int)k_size; | ||
42 | + //if (xi < k_size) kl = -xi; | ||
43 | + int kh = (int)k_size; | ||
44 | + //if (xi >= x - (int)k_size) kh = x - xi - 1; | ||
45 | + | ||
38 | 46 | ||
39 | //for each element of the kernel | 47 | //for each element of the kernel |
40 | - for(int ki = -k_size; ki <= k_size; ki++){ | 48 | + for(int ki = kl; ki <= kh; ki++){ |
41 | 49 | ||
42 | //calculate the gaussian value | 50 | //calculate the gaussian value |
43 | - G = a * exp(-(ki*ki) / (sigma_sq)); | 51 | + G = a * exp(-(T)(ki*ki) / (sigma_sq)); |
44 | //calculate the global coordinates for this point in the kernel | 52 | //calculate the global coordinates for this point in the kernel |
45 | gx = (xi + ki) % x; | 53 | gx = (xi + ki) % x; |
46 | gi = zi * x * y + yi * x + gx; | 54 | gi = zi * x * y + yi * x + gx; |
@@ -65,7 +73,7 @@ | @@ -65,7 +73,7 @@ | ||
65 | int i = zi * x * y + yi * x + xi; | 73 | int i = zi * x * y + yi * x + xi; |
66 | 74 | ||
67 | // calculate the kernel size | 75 | // calculate the kernel size |
68 | - int k_size = sigma * 4; | 76 | + T k_size = sigma * 4; |
69 | 77 | ||
70 | //if the current pixel is outside of the image | 78 | //if the current pixel is outside of the image |
71 | if(xi >= x || yi >= y || zi>=z) | 79 | if(xi >= x || yi >= y || zi>=z) |
@@ -76,14 +84,22 @@ | @@ -76,14 +84,22 @@ | ||
76 | T G; | 84 | T G; |
77 | T sum = 0; //running weighted sum across the kernel | 85 | T sum = 0; //running weighted sum across the kernel |
78 | out[i] = 0; | 86 | out[i] = 0; |
79 | - T sigma_sq = 2 * sigma * sigma; | ||
80 | - T a = 1.0 / (sigma * sqrt(2 * pi)); | 87 | + T sigma_sq = 2.0 * sigma * sigma; |
88 | + T a = 1.0 / (sigma * sqrt(2.0 * pi)); | ||
89 | + | ||
90 | + //handle the boundary in y direction | ||
91 | + int kl = -(int)k_size; | ||
92 | + //if (yi < k_size) kl = -yi; | ||
93 | + int kh = (int)k_size; | ||
94 | + //if (yi >= y - (int)k_size) kh = y - yi - 1; | ||
95 | + | ||
96 | + | ||
81 | 97 | ||
82 | //for each element of the kernel | 98 | //for each element of the kernel |
83 | - for(int ki = -k_size; ki <= k_size; ki++){ | 99 | + for(int ki = kl; ki <= kh; ki++){ |
84 | 100 | ||
85 | //calculate the gaussian value | 101 | //calculate the gaussian value |
86 | - G = a * exp(-(ki*ki) / sigma_sq); | 102 | + G = a * exp(-(T)(ki*ki) / sigma_sq); |
87 | //calculate the global coordinates for this point in the kernel | 103 | //calculate the global coordinates for this point in the kernel |
88 | gy = (yi + ki ) % y; | 104 | gy = (yi + ki ) % y; |
89 | gi = zi * x * y + gy * x + xi; | 105 | gi = zi * x * y + gy * x + xi; |
@@ -108,7 +124,7 @@ | @@ -108,7 +124,7 @@ | ||
108 | int i = zi * x * y + yi * x + xi; | 124 | int i = zi * x * y + yi * x + xi; |
109 | 125 | ||
110 | // calculate the kernel size | 126 | // calculate the kernel size |
111 | - int k_size = sigma * 4; | 127 | + T k_size = sigma * 4; |
112 | 128 | ||
113 | //if the current pixel is outside of the image | 129 | //if the current pixel is outside of the image |
114 | if(xi >= x || yi >= y || zi>=z) | 130 | if(xi >= x || yi >= y || zi>=z) |
@@ -119,14 +135,20 @@ | @@ -119,14 +135,20 @@ | ||
119 | T G; | 135 | T G; |
120 | T sum = 0; //running weighted sum across the kernel | 136 | T sum = 0; //running weighted sum across the kernel |
121 | out[i] = 0; | 137 | out[i] = 0; |
122 | - T sigma_sq = 2 * sigma * sigma; | ||
123 | - T a = 1.0 / (sigma * sqrt(2 * pi)); | 138 | + T sigma_sq = 2.0 * sigma * sigma; |
139 | + T a = 1.0 / (sigma * sqrt(2.0 * pi)); | ||
140 | + | ||
141 | + //handle the boundary in z direction | ||
142 | + int kl = -(int)k_size; | ||
143 | + //if (zi < k_size) kl = -zi; | ||
144 | + int kh = (int)k_size; | ||
145 | + //if (zi >= z - (int)k_size) kh = z - zi - 1; | ||
124 | 146 | ||
125 | //for each element of the kernel | 147 | //for each element of the kernel |
126 | - for(int ki = -k_size; ki <= k_size; ki++){ | 148 | + for(int ki = kl; ki <= kh; ki++){ |
127 | 149 | ||
128 | //calculate the gaussian value | 150 | //calculate the gaussian value |
129 | - G = a * exp(-(ki*ki) / sigma_sq); | 151 | + G = a * exp(-(T)(ki*ki) / sigma_sq); |
130 | //calculate the global coordinates for this point in the kernel | 152 | //calculate the global coordinates for this point in the kernel |
131 | gz = (zi + ki) % z; | 153 | gz = (zi + ki) % z; |
132 | gi = gz * x * y + yi * x + xi; | 154 | gi = gz * x * y + yi * x + xi; |
@@ -138,13 +160,13 @@ | @@ -138,13 +160,13 @@ | ||
138 | } | 160 | } |
139 | 161 | ||
140 | template<typename T> | 162 | template<typename T> |
141 | - void gpu_gaussian_blur3(T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){ | 163 | + void gpu_gaussian_blur3(T* image, T sigma[], size_t x, size_t y, size_t z){ |
142 | 164 | ||
143 | //get the number of pixels in the image | 165 | //get the number of pixels in the image |
144 | - unsigned int pixels = x * y * z; | ||
145 | - unsigned int bytes = sizeof(T) * pixels; | 166 | + size_t pixels = x * y * z; |
167 | + size_t bytes = sizeof(T) * pixels; | ||
146 | 168 | ||
147 | - int max_threads = stim::maxThreadsPerBlock(); | 169 | + unsigned int max_threads = stim::maxThreadsPerBlock(); |
148 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); | 170 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); |
149 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | 171 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); |
150 | 172 |
cpp/gradient3.cuh
@@ -7,7 +7,7 @@ | @@ -7,7 +7,7 @@ | ||
7 | #include <stim/cuda/cudatools/error.h> | 7 | #include <stim/cuda/cudatools/error.h> |
8 | 8 | ||
9 | template<typename T> | 9 | template<typename T> |
10 | -__global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){ | 10 | +__global__ void gradient3(T* out, T* in, int x, int y, int z){ |
11 | 11 | ||
12 | //calculate x,y,z coordinates for this thread | 12 | //calculate x,y,z coordinates for this thread |
13 | int xi = blockIdx.x * blockDim.x + threadIdx.x; | 13 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
@@ -55,30 +55,28 @@ __global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){ | @@ -55,30 +55,28 @@ __global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){ | ||
55 | if(zi > 0 && zi < z-1) | 55 | if(zi > 0 && zi < z-1) |
56 | out[i * 3 + 2] = (in[i_zp] - in[i_zn]) / 2; | 56 | out[i * 3 + 2] = (in[i_zp] - in[i_zn]) / 2; |
57 | 57 | ||
58 | - out[i * 3 + 2] *= 1/anisotropy; | ||
59 | - | ||
60 | } | 58 | } |
61 | 59 | ||
62 | template<typename T> | 60 | template<typename T> |
63 | 61 | ||
64 | -void gpu_gradient3(T* gpuGrad, T* gpuI, float anisotropy, unsigned int x, unsigned int y, unsigned int z){ | 62 | +void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){ |
65 | 63 | ||
66 | 64 | ||
67 | - int max_threads = stim::maxThreadsPerBlock(); | 65 | + unsigned int max_threads = stim::maxThreadsPerBlock(); |
68 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); | 66 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); |
69 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | 67 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); |
70 | 68 | ||
71 | //call the GPU kernel to determine the gradient | 69 | //call the GPU kernel to determine the gradient |
72 | - gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, anisotropy, x, y, z); | 70 | + gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z); |
73 | 71 | ||
74 | } | 72 | } |
75 | 73 | ||
76 | template<typename T> | 74 | template<typename T> |
77 | -void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int y, unsigned int z){ | 75 | +void cpu_gradient3(T* out, T* in, size_t x, size_t y, size_t z){ |
78 | 76 | ||
79 | //get the number of pixels in the image | 77 | //get the number of pixels in the image |
80 | - unsigned int pixels = x * y * z; | ||
81 | - unsigned int bytes = pixels * sizeof(T); | 78 | + size_t pixels = x * y * z; |
79 | + size_t bytes = pixels * sizeof(T); | ||
82 | 80 | ||
83 | //allocate space on the GPU for the input image | 81 | //allocate space on the GPU for the input image |
84 | T* gpuIn; | 82 | T* gpuIn; |
@@ -92,7 +90,7 @@ void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int | @@ -92,7 +90,7 @@ void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int | ||
92 | cudaMalloc(&gpuOut, bytes * 3); //the output image will have two channels (x, y) | 90 | cudaMalloc(&gpuOut, bytes * 3); //the output image will have two channels (x, y) |
93 | 91 | ||
94 | //call the GPU version of this function | 92 | //call the GPU version of this function |
95 | - gpu_gradient3(gpuOut, gpuIn, anisotropy, x, y, z); | 93 | + gpu_gradient3(gpuOut, gpuIn, x, y, z); |
96 | 94 | ||
97 | //copy the results to the CPU | 95 | //copy the results to the CPU |
98 | cudaMemcpy(out, gpuOut, bytes * 3, cudaMemcpyDeviceToHost); | 96 | cudaMemcpy(out, gpuOut, bytes * 3, cudaMemcpyDeviceToHost); |
cpp/main.cpp
@@ -19,21 +19,22 @@ float phi; | @@ -19,21 +19,22 @@ float phi; | ||
19 | size_t x, y, z; | 19 | size_t x, y, z; |
20 | 20 | ||
21 | 21 | ||
22 | -void ivote3(float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], | ||
23 | - unsigned int x, unsigned int y, unsigned int z); | ||
24 | -void lmax(float* center, float* vote, float t1, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z); | ||
25 | - | ||
26 | -void invert_data(float* cpuI, unsigned int x, unsigned int y, unsigned int z){ | ||
27 | - for(int ix = 0; ix < x; ix++){ | ||
28 | - for (int iy = 0; iy < y; iy++){ | ||
29 | - for (int iz = 0; iz < z; iz++){ | ||
30 | - int idx = iz * x * y + iy * x + ix; | ||
31 | - cpuI[idx] = 255 - cpuI[idx]; | ||
32 | - } | 22 | +void ivote3(float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], |
23 | + size_t x, size_t y, size_t z); | ||
24 | +void lmax(float* center, float* vote, float t1, unsigned int conn[], size_t x, size_t y, size_t z); | ||
25 | + | ||
26 | +void invert_data(float* cpuI, size_t x, size_t y, size_t z) { | ||
27 | + size_t idx; | ||
28 | + for (size_t ix = 0; ix < x; ix++) { | ||
29 | + for (size_t iy = 0; iy < y; iy++) { | ||
30 | + for (size_t iz = 0; iz < z; iz++) { | ||
31 | + idx = iz * x * y + iy * x + ix; | ||
32 | + cpuI[idx] = 255 - cpuI[idx]; | ||
33 | } | 33 | } |
34 | } | 34 | } |
35 | } | 35 | } |
36 | - | 36 | +} |
37 | + | ||
37 | 38 | ||
38 | 39 | ||
39 | void advertise() { | 40 | void advertise() { |
@@ -57,12 +58,12 @@ void init_args(int argc, char* argv[]) { | @@ -57,12 +58,12 @@ void init_args(int argc, char* argv[]) { | ||
57 | args.add("x", "size of the dataset along X axis", "positive value"); | 58 | args.add("x", "size of the dataset along X axis", "positive value"); |
58 | args.add("y", "size of the dataset along Y axis", "positive value"); | 59 | args.add("y", "size of the dataset along Y axis", "positive value"); |
59 | args.add("z", "size of the dataset along Z axis", "positive value"); | 60 | args.add("z", "size of the dataset along Z axis", "positive value"); |
60 | - args.add("t", "threshold value for the final result", "positive valu"); | 61 | + args.add("t", "threshold value for the final result", "0", "positive valu"); |
61 | args.add("invert", "to invert the input data set", "string"); | 62 | args.add("invert", "to invert the input data set", "string"); |
62 | args.add("rmax", "maximum possible radius of the cells in the input image", "10", "[positive value]"); | 63 | args.add("rmax", "maximum possible radius of the cells in the input image", "10", "[positive value]"); |
63 | args.add("phi", "starting angle for the vote region (in degrees)", "25.0", "0 <= phi < 180"); | 64 | args.add("phi", "starting angle for the vote region (in degrees)", "25.0", "0 <= phi < 180"); |
64 | args.add("iter", "number of iterations for voting", "8", "i > 0"); | 65 | args.add("iter", "number of iterations for voting", "8", "i > 0"); |
65 | - args.add("sigma", "the gaussian blur standard deviation", "5", "s >=0 (s = 0, no blurring)"); | 66 | + args.add("sigma", "the gaussian blur standard deviation", "3", "s >=0 (s = 0, no blurring)"); |
66 | args.add("conn", "the number of connected neighbors for calculating the local maxima", "5", "[positive value]"); | 67 | args.add("conn", "the number of connected neighbors for calculating the local maxima", "5", "[positive value]"); |
67 | //parse the command line arguments. | 68 | //parse the command line arguments. |
68 | args.parse(argc, argv); | 69 | args.parse(argc, argv); |
@@ -95,94 +96,93 @@ void init_args(int argc, char* argv[]) { | @@ -95,94 +96,93 @@ void init_args(int argc, char* argv[]) { | ||
95 | t = args["t"].as_float(); | 96 | t = args["t"].as_float(); |
96 | sigma = args["sigma"].as_float(); | 97 | sigma = args["sigma"].as_float(); |
97 | phi = (float)args["phi"].as_float() * (float)stim::PI / 180; | 98 | phi = (float)args["phi"].as_float() * (float)stim::PI / 180; |
98 | - | 99 | + |
99 | } | 100 | } |
100 | -int main(int argc, char** argv){ | 101 | +int main(int argc, char** argv) { |
101 | 102 | ||
102 | 103 | ||
103 | cudaDeviceProp prop; | 104 | cudaDeviceProp prop; |
104 | int count; | 105 | int count; |
105 | cudaGetDeviceCount(&count); | 106 | cudaGetDeviceCount(&count); |
106 | - for (int i=0; i<count; i++){ | 107 | + for (int i = 0; i<count; i++) { |
107 | cudaGetDeviceProperties(&prop, i); | 108 | cudaGetDeviceProperties(&prop, i); |
108 | printf("current device ID: %d\n", i); | 109 | printf("current device ID: %d\n", i); |
109 | printf("device name: %s\n", prop.name); | 110 | printf("device name: %s\n", prop.name); |
110 | printf("total global mem: %lu\n", prop.totalGlobalMem); | 111 | printf("total global mem: %lu\n", prop.totalGlobalMem); |
111 | printf("shared memory per block: %lu\n", prop.sharedMemPerBlock); | 112 | printf("shared memory per block: %lu\n", prop.sharedMemPerBlock); |
112 | } | 113 | } |
113 | - | 114 | + |
114 | init_args(argc, argv); | 115 | init_args(argc, argv); |
115 | 116 | ||
116 | - unsigned int r[3] = { rmax , rmax, rmax}; | ||
117 | - | ||
118 | - float sigma3[3] = { sigma, sigma, sigma}; | ||
119 | - unsigned int conn[3] = { nlmax, nlmax, nlmax}; | ||
120 | - float d_phi = phi/(iter+2); | ||
121 | - | ||
122 | - size_t bytes = x*y*z*sizeof(float); | 117 | + unsigned int r[3] = { rmax , rmax, rmax }; |
118 | + | ||
119 | + float sigma3[3] = { sigma, sigma, sigma }; | ||
120 | + unsigned int conn[3] = { nlmax, nlmax, nlmax }; | ||
121 | + float d_phi = phi / (iter); | ||
122 | + | ||
123 | + size_t bytes = x*y*z * sizeof(float); | ||
123 | 124 | ||
124 | //allocate space on the cpu for the input data | 125 | //allocate space on the cpu for the input data |
125 | - float* cpuI = (float*) malloc(bytes); | 126 | + float* cpuI = (float*)malloc(bytes); |
126 | 127 | ||
127 | //load the input file into the cpuI | 128 | //load the input file into the cpuI |
128 | std::ifstream nissl(args.arg(0), std::ios::in | std::ios::binary); | 129 | std::ifstream nissl(args.arg(0), std::ios::in | std::ios::binary); |
129 | nissl.read((char*)cpuI, bytes); | 130 | nissl.read((char*)cpuI, bytes); |
130 | nissl.close(); | 131 | nissl.close(); |
131 | - if(args["invert"].is_set()) | 132 | + if (args["invert"].is_set()) |
132 | invert_data(cpuI, x, y, z); | 133 | invert_data(cpuI, x, y, z); |
133 | - | 134 | + |
134 | //write a new file from the cpuI. | 135 | //write a new file from the cpuI. |
135 | std::ofstream original("0-inv-128.vol", std::ofstream::out | std::ofstream::binary); | 136 | std::ofstream original("0-inv-128.vol", std::ofstream::out | std::ofstream::binary); |
136 | original.write((char*)cpuI, bytes); | 137 | original.write((char*)cpuI, bytes); |
137 | original.close(); | 138 | original.close(); |
138 | - | ||
139 | - | 139 | + |
140 | + | ||
140 | ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function | 141 | ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function |
141 | - | ||
142 | - std::ofstream fvote("0-vote8.vol", std::ofstream::out | std::ofstream::binary); | 142 | + |
143 | + std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary); | ||
143 | fvote.write((char*)cpuI, bytes); | 144 | fvote.write((char*)cpuI, bytes); |
144 | fvote.close(); | 145 | fvote.close(); |
145 | - | 146 | + |
146 | //allocate space on the cpu for the output result | 147 | //allocate space on the cpu for the output result |
147 | float* cpu_out = (float*)malloc(bytes * 3); | 148 | float* cpu_out = (float*)malloc(bytes * 3); |
148 | 149 | ||
149 | //write the output file. | 150 | //write the output file. |
150 | //for (int t0=0; t0<=5000; t0+=100){ | 151 | //for (int t0=0; t0<=5000; t0+=100){ |
151 | // float t1 = t0; | 152 | // float t1 = t0; |
152 | - int t0 = t; | ||
153 | - lmax(cpu_out, cpuI, t, conn, x, y, z); | ||
154 | - //std::ofstream fo("shared2D-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary); | ||
155 | - std::ofstream fo( args.arg(1), std::ofstream::out | std::ofstream::binary); | ||
156 | - fo.write((char*)cpu_out, bytes); | ||
157 | - fo.close(); | 153 | + int t0 = t; |
154 | + lmax(cpu_out, cpuI, t, conn, x, y, z); | ||
158 | 155 | ||
156 | + std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary); | ||
157 | + fo.write((char*)cpu_out, bytes); | ||
158 | + fo.close(); | ||
159 | + | ||
159 | // creat a file for saving the list centers | 160 | // creat a file for saving the list centers |
160 | - | ||
161 | - std::ofstream list(args.arg(2)); | ||
162 | - // set the number of detected cells to zero. | ||
163 | - int nod = 0; | ||
164 | - if (list.is_open()){ | ||
165 | - | ||
166 | - for (int iz=0; iz<z; iz++){ | ||
167 | - for (int iy=0; iy<y; iy++){ | ||
168 | - for (int ix=0; ix<x; ix++){ | ||
169 | - | ||
170 | - int idx = iz * x * y + iy * x + ix; | ||
171 | - if (cpu_out[idx]>0){ | ||
172 | - nod++; | ||
173 | - list << ix << " " << iy << " "<< iz << " " << cpu_out[idx] << '\n' ; | ||
174 | - | ||
175 | - } | ||
176 | - } | 161 | + |
162 | + std::ofstream list(args.arg(2)); | ||
163 | + // set the number of detected cells to zero. | ||
164 | + int nod = 0; | ||
165 | + if (list.is_open()) { | ||
166 | + | ||
167 | + for (int iz = 0; iz<z; iz++) { | ||
168 | + for (int iy = 0; iy<y; iy++) { | ||
169 | + for (int ix = 0; ix<x; ix++) { | ||
170 | + | ||
171 | + int idx = iz * x * y + iy * x + ix; | ||
172 | + if (cpu_out[idx]>0) { | ||
173 | + nod++; | ||
174 | + list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n'; | ||
175 | + | ||
177 | } | 176 | } |
178 | } | 177 | } |
179 | - | ||
180 | - list.close(); | 178 | + } |
181 | } | 179 | } |
182 | 180 | ||
181 | + list.close(); | ||
182 | + } | ||
183 | + | ||
183 | 184 | ||
184 | //} | 185 | //} |
185 | - cudaDeviceReset(); | ||
186 | - | ||
187 | -} | 186 | + cudaDeviceReset(); |
188 | 187 | ||
188 | +} | ||
189 | \ No newline at end of file | 189 | \ No newline at end of file |
cpp/update_dir3_aabb.cuh
@@ -46,7 +46,7 @@ | @@ -46,7 +46,7 @@ | ||
46 | float step = 360.0/n; | 46 | float step = 360.0/n; |
47 | stim::circle<float> cir(center, d, norm); | 47 | stim::circle<float> cir(center, d, norm); |
48 | stim::aabb3<int> bb(xi,yi,zi); | 48 | stim::aabb3<int> bb(xi,yi,zi); |
49 | - bb.insert(xc,yc,zc); | 49 | + bb.insert((int) xc, (int)yc, (int)zc); |
50 | for(float j = 0; j <360.0; j += step){ | 50 | for(float j = 0; j <360.0; j += step){ |
51 | stim::vec3<float> out = cir.p(j); | 51 | stim::vec3<float> out = cir.p(j); |
52 | bb.insert(out[0], out[1], out[2]); | 52 | bb.insert(out[0], out[1], out[2]); |
@@ -67,7 +67,7 @@ | @@ -67,7 +67,7 @@ | ||
67 | float id_x = g[0]; // define local variables for the x, y, and z coordinations point to the vote direction | 67 | float id_x = g[0]; // define local variables for the x, y, and z coordinations point to the vote direction |
68 | float id_y = g[1]; | 68 | float id_y = g[1]; |
69 | float id_z = g[2]; | 69 | float id_z = g[2]; |
70 | - | 70 | + |
71 | for (bz=bb.low[2]; bz<=bb.high[2]; bz++){ | 71 | for (bz=bb.low[2]; bz<=bb.high[2]; bz++){ |
72 | dz = bz - zi; //compute the distance bw the voter and the current counter along z axis | 72 | dz = bz - zi; //compute the distance bw the voter and the current counter along z axis |
73 | dz_sq = dz * dz; | 73 | dz_sq = dz * dz; |
@@ -85,9 +85,9 @@ | @@ -85,9 +85,9 @@ | ||
85 | l_vote = gpu_vote[idx_c]; | 85 | l_vote = gpu_vote[idx_c]; |
86 | if (l_vote>max) { | 86 | if (l_vote>max) { |
87 | max = l_vote; | 87 | max = l_vote; |
88 | - id_x = dx; | ||
89 | - id_y = dy; | ||
90 | - id_z = dz; | 88 | + id_x = (float)dx; |
89 | + id_y = (float)dy; | ||
90 | + id_z = (float)dz; | ||
91 | } | 91 | } |
92 | } | 92 | } |
93 | } | 93 | } |
@@ -103,25 +103,26 @@ | @@ -103,25 +103,26 @@ | ||
103 | 103 | ||
104 | // this kernel updates the gradient direction by the calculated voting direction. | 104 | // this kernel updates the gradient direction by the calculated voting direction. |
105 | template<typename T> | 105 | template<typename T> |
106 | - __global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){ | 106 | + __global__ void update_grad3(T* gpu_grad, T* gpu_dir, size_t x, size_t y, size_t z){ |
107 | 107 | ||
108 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread | 108 | + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread |
109 | + | ||
110 | + size_t grid_y = y / blockDim.y; //find the grid size along y | ||
111 | + size_t blockidx_y = blockIdx.y % grid_y; | ||
112 | + size_t yi = blockidx_y * blockDim.y + threadIdx.y; | ||
113 | + size_t zi = blockIdx.y / grid_y; | ||
109 | 114 | ||
110 | - int grid_y = y / blockDim.y; //find the grid size along y | ||
111 | - int blockidx_y = blockIdx.y % grid_y; | ||
112 | - int yi = blockidx_y * blockDim.y + threadIdx.y; | ||
113 | - int zi = blockIdx.y / grid_y; | ||
114 | - int i = zi * x * y + yi * x + xi; | ||
115 | - | ||
116 | if(xi >= x || yi >= y || zi >= z) return; | 115 | if(xi >= x || yi >= y || zi >= z) return; |
117 | 116 | ||
117 | + size_t i = zi * x * y + yi * x + xi; | ||
118 | + | ||
118 | gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0]; //update the gradient image with the new direction direction | 119 | gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0]; //update the gradient image with the new direction direction |
119 | gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1]; | 120 | gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1]; |
120 | gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2]; | 121 | gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2]; |
121 | } | 122 | } |
122 | 123 | ||
123 | template<typename T> | 124 | template<typename T> |
124 | - void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ | 125 | + void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){ |
125 | 126 | ||
126 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 127 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
127 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); | 128 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); |
cpp/vote3_atomic_aabb.cuh
@@ -13,121 +13,121 @@ | @@ -13,121 +13,121 @@ | ||
13 | #include <stim/math/vector.h> | 13 | #include <stim/math/vector.h> |
14 | #include <stim/visualization/aabb3.h> | 14 | #include <stim/visualization/aabb3.h> |
15 | 15 | ||
16 | - // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | ||
17 | - template<typename T> | ||
18 | - __global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ | ||
19 | - | ||
20 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread | ||
21 | - | ||
22 | - int grid_y = y / blockDim.y; //find the grid size along y | ||
23 | - int blockidx_y = blockIdx.y % grid_y; | ||
24 | - int yi = blockidx_y * blockDim.y + threadIdx.y; | ||
25 | - int zi = blockIdx.y / grid_y; | ||
26 | - | ||
27 | - if(xi>=x || yi>=y || zi>=z) return; | ||
28 | - | ||
29 | - int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter | ||
30 | - | ||
31 | - | ||
32 | - float rx_sq = rx * rx; // compute the square for rmax | ||
33 | - float ry_sq = ry * ry; | ||
34 | - float rz_sq = rz * rz; | ||
35 | - | ||
36 | - stim::vec3<float> g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]); // form a vec3 variable for the gradient vector | ||
37 | - stim::vec3<float> g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector | ||
38 | - float n =8; //set the number of points to find the boundaries of the conical voting area | ||
39 | - float xc = rx * cos(g_sph[1]) * sin(g_sph[2]); //calculate the center point of the surface of the voting area for the voter | ||
40 | - float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ; | ||
41 | - float zc = rz * cos(g_sph[2]) ; | ||
42 | - float r = sqrt(xc*xc + yc*yc + zc*zc); | ||
43 | - xc+=xi; | ||
44 | - yc+=yi; | ||
45 | - zc+=zi; | ||
46 | - stim::vec3<float> center(xc,yc,zc); | ||
47 | - | ||
48 | - float d = 2 * r * tan(phi); //find the diameter of the conical voting area | ||
49 | - stim::vec3<float> norm = g.norm(); //compute the normalize gradient vector | ||
50 | - float step = 360.0/n; | ||
51 | - stim::circle<float> cir(center, d, norm); | ||
52 | - stim::aabb3<int> bb(xi,yi,zi); | ||
53 | - bb.insert((int)xc, (int)yc, (int)zc); | ||
54 | - for(float j = 0; j <360.0; j += step){ | ||
55 | - stim::vec3<float> out = cir.p(j); | ||
56 | - bb.insert(out[0], out[1], out[2]); | 16 | +// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area |
17 | +template<typename T> | ||
18 | +__global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z) { | ||
19 | + | ||
20 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread | ||
21 | + | ||
22 | + int grid_y = y / blockDim.y; //find the grid size along y | ||
23 | + int blockidx_y = blockIdx.y % grid_y; | ||
24 | + int yi = blockidx_y * blockDim.y + threadIdx.y; | ||
25 | + int zi = blockIdx.y / grid_y; | ||
26 | + | ||
27 | + if (xi >= x || yi >= y || zi >= z) return; | ||
28 | + | ||
29 | + int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter | ||
30 | + | ||
31 | + | ||
32 | + float rx_sq = rx * rx; // compute the square for rmax | ||
33 | + float ry_sq = ry * ry; | ||
34 | + float rz_sq = rz * rz; | ||
35 | + | ||
36 | + stim::vec3<float> g(gpu_grad[3 * i], gpu_grad[3 * i + 1], gpu_grad[3 * i + 2]); // form a vec3 variable for the gradient vector | ||
37 | + stim::vec3<float> g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector | ||
38 | + float n = 8; //set the number of points to find the boundaries of the conical voting area | ||
39 | + float xc = rx * cos(g_sph[1]) * sin(g_sph[2]); //calculate the center point of the surface of the voting area for the voter | ||
40 | + float yc = ry * sin(g_sph[1]) * sin(g_sph[2]); | ||
41 | + float zc = rz * cos(g_sph[2]); | ||
42 | + float r = sqrt(xc*xc + yc*yc + zc*zc); | ||
43 | + xc += xi; | ||
44 | + yc += yi; | ||
45 | + zc += zi; | ||
46 | + stim::vec3<float> center(xc, yc, zc); | ||
47 | + | ||
48 | + float d = 2 * r * tan(phi); //find the diameter of the conical voting area | ||
49 | + stim::vec3<float> norm = g.norm(); //compute the normalize gradient vector | ||
50 | + float step = 360.0 / n; | ||
51 | + stim::circle<float> cir(center, d, norm); | ||
52 | + stim::aabb3<int> bb(xi, yi, zi); | ||
53 | + bb.insert((int)xc, (int)yc, (int)zc); | ||
54 | + for (float j = 0; j <360.0; j += step) { | ||
55 | + stim::vec3<float> out = cir.p(j); | ||
56 | + bb.insert(out[0], out[1], out[2]); | ||
57 | + } | ||
58 | + bb.trim_low(xi - rx, yi - ry, zi - rz); | ||
59 | + bb.trim_low(0, 0, 0); | ||
60 | + bb.trim_high(xi + rx, yi + ry, zi + rz); | ||
61 | + bb.trim_high(x - 1, y - 1, z - 1); | ||
62 | + int bx, by, bz; | ||
63 | + int dx, dy, dz; | ||
64 | + float dx_sq, dy_sq, dz_sq; | ||
65 | + float dist, cos_diff; | ||
66 | + int idx_c; | ||
67 | + for (bz = bb.low[2]; bz <= bb.high[2]; bz++) { | ||
68 | + dz = bz - zi; //compute the distance bw the voter and the current counter along z axis | ||
69 | + dz_sq = dz * dz; | ||
70 | + for (by = bb.low[1]; by <= bb.high[1]; by++) { | ||
71 | + dy = by - yi; //compute the distance bw the voter and the current counter along y axis | ||
72 | + dy_sq = dy * dy; | ||
73 | + for (bx = bb.low[0]; bx <= bb.high[0]; bx++) { | ||
74 | + dx = bx - xi; //compute the distance bw the voter and the current counter along x axis | ||
75 | + dx_sq = dx * dx; | ||
76 | + | ||
77 | + dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter | ||
78 | + cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz) / dist; // calculate the cosine of angle between the voter and the current counter | ||
79 | + if (((dx_sq / rx_sq + dy_sq / ry_sq + dz_sq / rz_sq) <= 1) && (cos_diff >= cos_phi)) { //check if the current counter located in the voting area of the voter | ||
80 | + idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter | ||
81 | + atomicAdd(&gpu_vote[idx_c], g_sph[0]); | ||
82 | + } | ||
57 | } | 83 | } |
58 | - bb.trim_low(xi-rx, yi-ry, zi-rz); | ||
59 | - bb.trim_low(0,0,0); | ||
60 | - bb.trim_high(xi+rx, yi+ry, zi+rz); | ||
61 | - bb.trim_high(x-1, y-1, z-1); | ||
62 | - int bx,by,bz; | ||
63 | - int dx, dy, dz; | ||
64 | - float dx_sq, dy_sq, dz_sq; | ||
65 | - float dist, cos_diff; | ||
66 | - int idx_c; | ||
67 | - for (bz=bb.low[2]; bz<=bb.high[2]; bz++){ | ||
68 | - dz = bz - zi; //compute the distance bw the voter and the current counter along z axis | ||
69 | - dz_sq = dz * dz; | ||
70 | - for (by=bb.low[1]; by<=bb.high[1]; by++){ | ||
71 | - dy = by - yi; //compute the distance bw the voter and the current counter along y axis | ||
72 | - dy_sq = dy * dy; | ||
73 | - for (bx=bb.low[0]; bx<=bb.high[0]; bx++){ | ||
74 | - dx = bx - xi; //compute the distance bw the voter and the current counter along x axis | ||
75 | - dx_sq = dx * dx; | ||
76 | - | ||
77 | - dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter | ||
78 | - cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz)/dist; // calculate the cosine of angle between the voter and the current counter | ||
79 | - if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){ //check if the current counter located in the voting area of the voter | ||
80 | - idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter | ||
81 | - atomicAdd (&gpu_vote[idx_c] , g_sph[0]); | ||
82 | - } | ||
83 | - } | ||
84 | - } | ||
85 | - } | ||
86 | } | 84 | } |
85 | + } | ||
86 | +} | ||
87 | 87 | ||
88 | - template<typename T> | ||
89 | - void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ | 88 | +template<typename T> |
89 | +void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z) { | ||
90 | 90 | ||
91 | - | ||
92 | - unsigned int max_threads = stim::maxThreadsPerBlock(); | ||
93 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | ||
94 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | ||
95 | - vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x , y, z); //call the kernel to do the voting | ||
96 | 91 | ||
97 | - } | 92 | + unsigned int max_threads = stim::maxThreadsPerBlock(); |
93 | + dim3 threads(sqrt(max_threads), sqrt(max_threads)); | ||
94 | + dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | ||
95 | + vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z); //call the kernel to do the voting | ||
98 | 96 | ||
97 | +} | ||
99 | 98 | ||
100 | - template<typename T> | ||
101 | - void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ | ||
102 | - | ||
103 | - //calculate the number of bytes in the array | ||
104 | - unsigned int bytes = x * y * z * sizeof(T); | ||
105 | - | ||
106 | - | ||
107 | - //allocate space on the GPU for the Vote Image | ||
108 | - T* gpu_vote; | ||
109 | - cudaMalloc(&gpu_vote, bytes); | ||
110 | - | ||
111 | - //allocate space on the GPU for the input Gradient image | ||
112 | - T* gpu_grad; | ||
113 | - cudaMalloc(&gpu_grad, bytes*3); | ||
114 | - | ||
115 | - | ||
116 | - //copy the Gradient data to the GPU | ||
117 | - cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); | ||
118 | - | ||
119 | - | ||
120 | - //call the GPU version of the vote calculation function | ||
121 | - gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z); | ||
122 | - | ||
123 | - //copy the Vote Data back to the CPU | ||
124 | - cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ; | ||
125 | - | ||
126 | - //free allocated memory | ||
127 | - cudaFree(gpu_vote); | ||
128 | - cudaFree(gpu_grad); | ||
129 | - | ||
130 | - } | 99 | + |
100 | +template<typename T> | ||
101 | +void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z) { | ||
102 | + | ||
103 | + //calculate the number of bytes in the array | ||
104 | + unsigned int bytes = x * y * z * sizeof(T); | ||
105 | + | ||
106 | + | ||
107 | + //allocate space on the GPU for the Vote Image | ||
108 | + T* gpu_vote; | ||
109 | + cudaMalloc(&gpu_vote, bytes); | ||
110 | + | ||
111 | + //allocate space on the GPU for the input Gradient image | ||
112 | + T* gpu_grad; | ||
113 | + cudaMalloc(&gpu_grad, bytes * 3); | ||
114 | + | ||
115 | + | ||
116 | + //copy the Gradient data to the GPU | ||
117 | + cudaMemcpy(gpu_grad, cpu_grad, bytes * 3, cudaMemcpyHostToDevice); | ||
118 | + | ||
119 | + | ||
120 | + //call the GPU version of the vote calculation function | ||
121 | + gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x, y, z); | ||
122 | + | ||
123 | + //copy the Vote Data back to the CPU | ||
124 | + cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost); | ||
125 | + | ||
126 | + //free allocated memory | ||
127 | + cudaFree(gpu_vote); | ||
128 | + cudaFree(gpu_grad); | ||
129 | + | ||
130 | +} | ||
131 | 131 | ||
132 | 132 | ||
133 | #endif | 133 | #endif |
134 | \ No newline at end of file | 134 | \ No newline at end of file |