Commit 07e31b3406b864f75e632888e366669300ab92a6
1 parent
310a1698
fix the bugs in ivote3
Showing
7 changed files
with
36 additions
and
45 deletions
Show diff stats
cpp/cudafunc.cu
... | ... | @@ -6,11 +6,11 @@ |
6 | 6 | #include "update_dir3_aabb.cuh" |
7 | 7 | #include "local_max3.cuh" |
8 | 8 | #include <fstream> |
9 | - | |
9 | +#include <sstream> | |
10 | 10 | void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], |
11 | 11 | int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){ |
12 | 12 | |
13 | - | |
13 | + cudaSetDevice(0); | |
14 | 14 | size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data |
15 | 15 | |
16 | 16 | float* gpuI0; //assign memory on gpu for the input data |
... | ... | @@ -37,18 +37,13 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], |
37 | 37 | cudaMemset(gpu_vote, 0, bytes); |
38 | 38 | gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z); |
39 | 39 | cudaDeviceSynchronize(); |
40 | - if (i == 0) { | |
41 | - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); | |
42 | - std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary); | |
43 | - fvote.write((char*)img, bytes); | |
44 | - fvote.close(); | |
45 | - } | |
46 | - if (i == 1) { | |
47 | - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); | |
48 | - std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary); | |
49 | - fvote.write((char*)img, bytes); | |
50 | - fvote.close(); | |
51 | - } | |
40 | + | |
41 | + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); | |
42 | + std::string filename = "0-vote"; | |
43 | + std::ofstream fvote(filename + std::to_string(i+1) + "_aabb.vol", std::ofstream::out | std::ofstream::binary); | |
44 | + fvote.write((char*)img, bytes); | |
45 | + fvote.close(); | |
46 | + | |
52 | 47 | gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z); |
53 | 48 | cudaDeviceSynchronize(); |
54 | 49 | phi = phi - d_phi; |
... | ... | @@ -64,7 +59,7 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], |
64 | 59 | } |
65 | 60 | |
66 | 61 | void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){ |
67 | - unsigned int bytes = x * y * z * sizeof(float); | |
62 | + size_t bytes = x * y * z * sizeof(float); | |
68 | 63 | |
69 | 64 | cudaSetDevice(0); |
70 | 65 | ... | ... |
cpp/gaussian_blur3.cuh
... | ... | @@ -167,8 +167,8 @@ |
167 | 167 | size_t bytes = sizeof(T) * pixels; |
168 | 168 | |
169 | 169 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
170 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | |
171 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | |
170 | + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads)); | |
171 | + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); | |
172 | 172 | |
173 | 173 | //allocate temporary space on the GPU |
174 | 174 | T* gpuIb_x; |
... | ... | @@ -179,13 +179,13 @@ |
179 | 179 | cudaMalloc(&gpuIb_y, bytes); |
180 | 180 | |
181 | 181 | // blur the original image along the x direction |
182 | - blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z); | |
182 | + blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], (int)x, (int)y, (int)z); | |
183 | 183 | |
184 | 184 | // blur the x-blurred image along the y direction |
185 | - blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z); | |
185 | + blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], (int)x, (int)y, (int)z); | |
186 | 186 | |
187 | 187 | // blur the xy-blurred image along the z direction |
188 | - blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z); | |
188 | + blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], (int)x, (int)y, (int)z); | |
189 | 189 | |
190 | 190 | //cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice); |
191 | 191 | ... | ... |
cpp/gradient3.cuh
... | ... | @@ -63,11 +63,11 @@ void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){ |
63 | 63 | |
64 | 64 | |
65 | 65 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
66 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | |
67 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | |
66 | + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads)); | |
67 | + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); | |
68 | 68 | |
69 | 69 | //call the GPU kernel to determine the gradient |
70 | - gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z); | |
70 | + gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, (int)x, (int)y, (int)z); | |
71 | 71 | |
72 | 72 | } |
73 | 73 | ... | ... |
cpp/local_max3.cuh
... | ... | @@ -49,15 +49,15 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn |
49 | 49 | } |
50 | 50 | |
51 | 51 | template<typename T> |
52 | -void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ | |
52 | +void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], size_t x, size_t y, size_t z){ | |
53 | 53 | |
54 | 54 | //find the max number of threads per block. |
55 | 55 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
56 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | |
57 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | |
56 | + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads)); | |
57 | + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); | |
58 | 58 | |
59 | 59 | //call the kernel to find the local max |
60 | - cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, conn[0], conn[1], conn[2], x, y, z); | |
60 | + cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, (int)conn[0], (int)conn[1], (int)conn[2], (int)x, (int)y, (int)z); | |
61 | 61 | |
62 | 62 | |
63 | 63 | ... | ... |
cpp/main.cpp
... | ... | @@ -93,8 +93,8 @@ void init_args(int argc, char* argv[]) { |
93 | 93 | iter = args["iter"].as_int(); |
94 | 94 | rmax = (unsigned int)args["rmax"].as_int(); |
95 | 95 | nlmax = (unsigned int)args["conn"].as_int(); |
96 | - t = args["t"].as_float(); | |
97 | - sigma = args["sigma"].as_float(); | |
96 | + t = (float)args["t"].as_float(); | |
97 | + sigma = (float)args["sigma"].as_float(); | |
98 | 98 | phi = (float)args["phi"].as_float() * (float)stim::PI / 180; |
99 | 99 | |
100 | 100 | } |
... | ... | @@ -108,8 +108,8 @@ int main(int argc, char** argv) { |
108 | 108 | cudaGetDeviceProperties(&prop, i); |
109 | 109 | printf("current device ID: %d\n", i); |
110 | 110 | printf("device name: %s\n", prop.name); |
111 | - printf("total global mem: %lu\n", prop.totalGlobalMem); | |
112 | - printf("shared memory per block: %lu\n", prop.sharedMemPerBlock); | |
111 | + printf("total global mem: %zu\n", prop.totalGlobalMem); | |
112 | + printf("shared memory per block: %zu\n", prop.sharedMemPerBlock); | |
113 | 113 | } |
114 | 114 | |
115 | 115 | init_args(argc, argv); |
... | ... | @@ -139,18 +139,16 @@ int main(int argc, char** argv) { |
139 | 139 | |
140 | 140 | |
141 | 141 | ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function |
142 | - | |
142 | + /* | |
143 | 143 | std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary); |
144 | 144 | fvote.write((char*)cpuI, bytes); |
145 | 145 | fvote.close(); |
146 | + */ | |
146 | 147 | |
147 | 148 | //allocate space on the cpu for the output result |
148 | 149 | float* cpu_out = (float*)malloc(bytes * 3); |
149 | 150 | |
150 | - //write the output file. | |
151 | - //for (int t0=0; t0<=5000; t0+=100){ | |
152 | - // float t1 = t0; | |
153 | - int t0 = t; | |
151 | + | |
154 | 152 | lmax(cpu_out, cpuI, t, conn, x, y, z); |
155 | 153 | |
156 | 154 | std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary); |
... | ... | @@ -168,7 +166,7 @@ int main(int argc, char** argv) { |
168 | 166 | for (int iy = 0; iy<y; iy++) { |
169 | 167 | for (int ix = 0; ix<x; ix++) { |
170 | 168 | |
171 | - int idx = iz * x * y + iy * x + ix; | |
169 | + size_t idx = iz * x * y + iy * x + ix; | |
172 | 170 | if (cpu_out[idx]>0) { |
173 | 171 | nod++; |
174 | 172 | list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n'; |
... | ... | @@ -181,8 +179,6 @@ int main(int argc, char** argv) { |
181 | 179 | list.close(); |
182 | 180 | } |
183 | 181 | |
184 | - | |
185 | - //} | |
186 | 182 | cudaDeviceReset(); |
187 | 183 | |
188 | 184 | } |
189 | 185 | \ No newline at end of file | ... | ... |
cpp/update_dir3_aabb.cuh
... | ... | @@ -125,14 +125,14 @@ |
125 | 125 | void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){ |
126 | 126 | |
127 | 127 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
128 | - dim3 threads(sqrt (max_threads),sqrt (max_threads)); | |
129 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | |
128 | + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads)); | |
129 | + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); | |
130 | 130 | |
131 | 131 | T* gpu_dir; // allocate space on the GPU for the updated vote direction |
132 | 132 | cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3); |
133 | 133 | |
134 | 134 | //call the kernel to calculate the new voting direction |
135 | - update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, r[0], r[1], r[2], x , y, z); | |
135 | + update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x , (int)y, (int)z); | |
136 | 136 | |
137 | 137 | //call the kernel to update the gradient direction |
138 | 138 | update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z); | ... | ... |
cpp/vote3_atomic_aabb.cuh
... | ... | @@ -90,9 +90,9 @@ void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], siz |
90 | 90 | |
91 | 91 | |
92 | 92 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
93 | - dim3 threads(sqrt(max_threads), sqrt(max_threads)); | |
94 | - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | |
95 | - vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z); //call the kernel to do the voting | |
93 | + dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads)); | |
94 | + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); | |
95 | + vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x, (int)y, (int)z); //call the kernel to do the voting | |
96 | 96 | |
97 | 97 | } |
98 | 98 | ... | ... |