Commit 07e31b3406b864f75e632888e366669300ab92a6
1 parent
310a1698
fix the bugs in ivote3
Showing
7 changed files
with
36 additions
and
45 deletions
Show diff stats
cpp/cudafunc.cu
@@ -6,11 +6,11 @@ | @@ -6,11 +6,11 @@ | ||
6 | #include "update_dir3_aabb.cuh" | 6 | #include "update_dir3_aabb.cuh" |
7 | #include "local_max3.cuh" | 7 | #include "local_max3.cuh" |
8 | #include <fstream> | 8 | #include <fstream> |
9 | - | 9 | +#include <sstream> |
10 | void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], | 10 | void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], |
11 | int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){ | 11 | int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){ |
12 | 12 | ||
13 | - | 13 | + cudaSetDevice(0); |
14 | size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data | 14 | size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data |
15 | 15 | ||
16 | float* gpuI0; //assign memory on gpu for the input data | 16 | float* gpuI0; //assign memory on gpu for the input data |
@@ -37,18 +37,13 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], | @@ -37,18 +37,13 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], | ||
37 | cudaMemset(gpu_vote, 0, bytes); | 37 | cudaMemset(gpu_vote, 0, bytes); |
38 | gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z); | 38 | gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z); |
39 | cudaDeviceSynchronize(); | 39 | cudaDeviceSynchronize(); |
40 | - if (i == 0) { | ||
41 | - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); | ||
42 | - std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary); | ||
43 | - fvote.write((char*)img, bytes); | ||
44 | - fvote.close(); | ||
45 | - } | ||
46 | - if (i == 1) { | ||
47 | - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); | ||
48 | - std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary); | ||
49 | - fvote.write((char*)img, bytes); | ||
50 | - fvote.close(); | ||
51 | - } | 40 | + |
41 | + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); | ||
42 | + std::string filename = "0-vote"; | ||
43 | + std::ofstream fvote(filename + std::to_string(i+1) + "_aabb.vol", std::ofstream::out | std::ofstream::binary); | ||
44 | + fvote.write((char*)img, bytes); | ||
45 | + fvote.close(); | ||
46 | + | ||
52 | gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z); | 47 | gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z); |
53 | cudaDeviceSynchronize(); | 48 | cudaDeviceSynchronize(); |
54 | phi = phi - d_phi; | 49 | phi = phi - d_phi; |
@@ -64,7 +59,7 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], | @@ -64,7 +59,7 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], | ||
64 | } | 59 | } |
65 | 60 | ||
66 | void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){ | 61 | void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){ |
67 | - unsigned int bytes = x * y * z * sizeof(float); | 62 | + size_t bytes = x * y * z * sizeof(float); |
68 | 63 | ||
69 | cudaSetDevice(0); | 64 | cudaSetDevice(0); |
70 | 65 |
cpp/gaussian_blur3.cuh
@@ -167,8 +167,8 @@ | @@ -167,8 +167,8 @@ | ||
167 | size_t bytes = sizeof(T) * pixels; | 167 | size_t bytes = sizeof(T) * pixels; |
168 | 168 | ||
169 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 169 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
170 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | ||
171 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | 170 | + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads)); |
171 | + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); | ||
172 | 172 | ||
173 | //allocate temporary space on the GPU | 173 | //allocate temporary space on the GPU |
174 | T* gpuIb_x; | 174 | T* gpuIb_x; |
@@ -179,13 +179,13 @@ | @@ -179,13 +179,13 @@ | ||
179 | cudaMalloc(&gpuIb_y, bytes); | 179 | cudaMalloc(&gpuIb_y, bytes); |
180 | 180 | ||
181 | // blur the original image along the x direction | 181 | // blur the original image along the x direction |
182 | - blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z); | 182 | + blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], (int)x, (int)y, (int)z); |
183 | 183 | ||
184 | // blur the x-blurred image along the y direction | 184 | // blur the x-blurred image along the y direction |
185 | - blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z); | 185 | + blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], (int)x, (int)y, (int)z); |
186 | 186 | ||
187 | // blur the xy-blurred image along the z direction | 187 | // blur the xy-blurred image along the z direction |
188 | - blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z); | 188 | + blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], (int)x, (int)y, (int)z); |
189 | 189 | ||
190 | //cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice); | 190 | //cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice); |
191 | 191 |
cpp/gradient3.cuh
@@ -63,11 +63,11 @@ void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){ | @@ -63,11 +63,11 @@ void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){ | ||
63 | 63 | ||
64 | 64 | ||
65 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 65 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
66 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | ||
67 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | 66 | + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads)); |
67 | + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); | ||
68 | 68 | ||
69 | //call the GPU kernel to determine the gradient | 69 | //call the GPU kernel to determine the gradient |
70 | - gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z); | 70 | + gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, (int)x, (int)y, (int)z); |
71 | 71 | ||
72 | } | 72 | } |
73 | 73 |
cpp/local_max3.cuh
@@ -49,15 +49,15 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn | @@ -49,15 +49,15 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn | ||
49 | } | 49 | } |
50 | 50 | ||
51 | template<typename T> | 51 | template<typename T> |
52 | -void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ | 52 | +void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], size_t x, size_t y, size_t z){ |
53 | 53 | ||
54 | //find the max number of threads per block. | 54 | //find the max number of threads per block. |
55 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 55 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
56 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | ||
57 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | 56 | + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads)); |
57 | + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); | ||
58 | 58 | ||
59 | //call the kernel to find the local max | 59 | //call the kernel to find the local max |
60 | - cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, conn[0], conn[1], conn[2], x, y, z); | 60 | + cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, (int)conn[0], (int)conn[1], (int)conn[2], (int)x, (int)y, (int)z); |
61 | 61 | ||
62 | 62 | ||
63 | 63 |
cpp/main.cpp
@@ -93,8 +93,8 @@ void init_args(int argc, char* argv[]) { | @@ -93,8 +93,8 @@ void init_args(int argc, char* argv[]) { | ||
93 | iter = args["iter"].as_int(); | 93 | iter = args["iter"].as_int(); |
94 | rmax = (unsigned int)args["rmax"].as_int(); | 94 | rmax = (unsigned int)args["rmax"].as_int(); |
95 | nlmax = (unsigned int)args["conn"].as_int(); | 95 | nlmax = (unsigned int)args["conn"].as_int(); |
96 | - t = args["t"].as_float(); | ||
97 | - sigma = args["sigma"].as_float(); | 96 | + t = (float)args["t"].as_float(); |
97 | + sigma = (float)args["sigma"].as_float(); | ||
98 | phi = (float)args["phi"].as_float() * (float)stim::PI / 180; | 98 | phi = (float)args["phi"].as_float() * (float)stim::PI / 180; |
99 | 99 | ||
100 | } | 100 | } |
@@ -108,8 +108,8 @@ int main(int argc, char** argv) { | @@ -108,8 +108,8 @@ int main(int argc, char** argv) { | ||
108 | cudaGetDeviceProperties(&prop, i); | 108 | cudaGetDeviceProperties(&prop, i); |
109 | printf("current device ID: %d\n", i); | 109 | printf("current device ID: %d\n", i); |
110 | printf("device name: %s\n", prop.name); | 110 | printf("device name: %s\n", prop.name); |
111 | - printf("total global mem: %lu\n", prop.totalGlobalMem); | ||
112 | - printf("shared memory per block: %lu\n", prop.sharedMemPerBlock); | 111 | + printf("total global mem: %zu\n", prop.totalGlobalMem); |
112 | + printf("shared memory per block: %zu\n", prop.sharedMemPerBlock); | ||
113 | } | 113 | } |
114 | 114 | ||
115 | init_args(argc, argv); | 115 | init_args(argc, argv); |
@@ -139,18 +139,16 @@ int main(int argc, char** argv) { | @@ -139,18 +139,16 @@ int main(int argc, char** argv) { | ||
139 | 139 | ||
140 | 140 | ||
141 | ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function | 141 | ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function |
142 | - | 142 | + /* |
143 | std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary); | 143 | std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary); |
144 | fvote.write((char*)cpuI, bytes); | 144 | fvote.write((char*)cpuI, bytes); |
145 | fvote.close(); | 145 | fvote.close(); |
146 | + */ | ||
146 | 147 | ||
147 | //allocate space on the cpu for the output result | 148 | //allocate space on the cpu for the output result |
148 | float* cpu_out = (float*)malloc(bytes * 3); | 149 | float* cpu_out = (float*)malloc(bytes * 3); |
149 | 150 | ||
150 | - //write the output file. | ||
151 | - //for (int t0=0; t0<=5000; t0+=100){ | ||
152 | - // float t1 = t0; | ||
153 | - int t0 = t; | 151 | + |
154 | lmax(cpu_out, cpuI, t, conn, x, y, z); | 152 | lmax(cpu_out, cpuI, t, conn, x, y, z); |
155 | 153 | ||
156 | std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary); | 154 | std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary); |
@@ -168,7 +166,7 @@ int main(int argc, char** argv) { | @@ -168,7 +166,7 @@ int main(int argc, char** argv) { | ||
168 | for (int iy = 0; iy<y; iy++) { | 166 | for (int iy = 0; iy<y; iy++) { |
169 | for (int ix = 0; ix<x; ix++) { | 167 | for (int ix = 0; ix<x; ix++) { |
170 | 168 | ||
171 | - int idx = iz * x * y + iy * x + ix; | 169 | + size_t idx = iz * x * y + iy * x + ix; |
172 | if (cpu_out[idx]>0) { | 170 | if (cpu_out[idx]>0) { |
173 | nod++; | 171 | nod++; |
174 | list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n'; | 172 | list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n'; |
@@ -181,8 +179,6 @@ int main(int argc, char** argv) { | @@ -181,8 +179,6 @@ int main(int argc, char** argv) { | ||
181 | list.close(); | 179 | list.close(); |
182 | } | 180 | } |
183 | 181 | ||
184 | - | ||
185 | - //} | ||
186 | cudaDeviceReset(); | 182 | cudaDeviceReset(); |
187 | 183 | ||
188 | } | 184 | } |
189 | \ No newline at end of file | 185 | \ No newline at end of file |
cpp/update_dir3_aabb.cuh
@@ -125,14 +125,14 @@ | @@ -125,14 +125,14 @@ | ||
125 | void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){ | 125 | void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){ |
126 | 126 | ||
127 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 127 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
128 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | ||
129 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | 128 | + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads)); |
129 | + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); | ||
130 | 130 | ||
131 | T* gpu_dir; // allocate space on the GPU for the updated vote direction | 131 | T* gpu_dir; // allocate space on the GPU for the updated vote direction |
132 | cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3); | 132 | cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3); |
133 | 133 | ||
134 | //call the kernel to calculate the new voting direction | 134 | //call the kernel to calculate the new voting direction |
135 | - update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, r[0], r[1], r[2], x , y, z); | 135 | + update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x , (int)y, (int)z); |
136 | 136 | ||
137 | //call the kernel to update the gradient direction | 137 | //call the kernel to update the gradient direction |
138 | update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z); | 138 | update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z); |
cpp/vote3_atomic_aabb.cuh
@@ -90,9 +90,9 @@ void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], siz | @@ -90,9 +90,9 @@ void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], siz | ||
90 | 90 | ||
91 | 91 | ||
92 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 92 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
93 | - dim3 threads(sqrt(max_threads), sqrt(max_threads)); | ||
94 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | ||
95 | - vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z); //call the kernel to do the voting | 93 | + dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads)); |
94 | + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); | ||
95 | + vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x, (int)y, (int)z); //call the kernel to do the voting | ||
96 | 96 | ||
97 | } | 97 | } |
98 | 98 |