Commit 07e31b3406b864f75e632888e366669300ab92a6

Authored by Laila Saadatifard
1 parent 310a1698

fix the bugs in ivote3

@@ -6,11 +6,11 @@ @@ -6,11 +6,11 @@
6 #include "update_dir3_aabb.cuh" 6 #include "update_dir3_aabb.cuh"
7 #include "local_max3.cuh" 7 #include "local_max3.cuh"
8 #include <fstream> 8 #include <fstream>
9 - 9 +#include <sstream>
10 void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], 10 void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
11 int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){ 11 int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){
12 12
13 - 13 + cudaSetDevice(0);
14 size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data 14 size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data
15 15
16 float* gpuI0; //assign memory on gpu for the input data 16 float* gpuI0; //assign memory on gpu for the input data
@@ -37,18 +37,13 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], @@ -37,18 +37,13 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
37 cudaMemset(gpu_vote, 0, bytes); 37 cudaMemset(gpu_vote, 0, bytes);
38 gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z); 38 gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z);
39 cudaDeviceSynchronize(); 39 cudaDeviceSynchronize();
40 - if (i == 0) {  
41 - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);  
42 - std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary);  
43 - fvote.write((char*)img, bytes);  
44 - fvote.close();  
45 - }  
46 - if (i == 1) {  
47 - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);  
48 - std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary);  
49 - fvote.write((char*)img, bytes);  
50 - fvote.close();  
51 - } 40 +
  41 + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
  42 + std::string filename = "0-vote";
  43 + std::ofstream fvote(filename + std::to_string(i+1) + "_aabb.vol", std::ofstream::out | std::ofstream::binary);
  44 + fvote.write((char*)img, bytes);
  45 + fvote.close();
  46 +
52 gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z); 47 gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z);
53 cudaDeviceSynchronize(); 48 cudaDeviceSynchronize();
54 phi = phi - d_phi; 49 phi = phi - d_phi;
@@ -64,7 +59,7 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], @@ -64,7 +59,7 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
64 } 59 }
65 60
66 void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){ 61 void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){
67 - unsigned int bytes = x * y * z * sizeof(float); 62 + size_t bytes = x * y * z * sizeof(float);
68 63
69 cudaSetDevice(0); 64 cudaSetDevice(0);
70 65
cpp/gaussian_blur3.cuh
@@ -167,8 +167,8 @@ @@ -167,8 +167,8 @@
167 size_t bytes = sizeof(T) * pixels; 167 size_t bytes = sizeof(T) * pixels;
168 168
169 unsigned int max_threads = stim::maxThreadsPerBlock(); 169 unsigned int max_threads = stim::maxThreadsPerBlock();
170 - dim3 threads(sqrt (max_threads),sqrt (max_threads));  
171 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); 170 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  171 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
172 172
173 //allocate temporary space on the GPU 173 //allocate temporary space on the GPU
174 T* gpuIb_x; 174 T* gpuIb_x;
@@ -179,13 +179,13 @@ @@ -179,13 +179,13 @@
179 cudaMalloc(&gpuIb_y, bytes); 179 cudaMalloc(&gpuIb_y, bytes);
180 180
181 // blur the original image along the x direction 181 // blur the original image along the x direction
182 - blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z); 182 + blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], (int)x, (int)y, (int)z);
183 183
184 // blur the x-blurred image along the y direction 184 // blur the x-blurred image along the y direction
185 - blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z); 185 + blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], (int)x, (int)y, (int)z);
186 186
187 // blur the xy-blurred image along the z direction 187 // blur the xy-blurred image along the z direction
188 - blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z); 188 + blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], (int)x, (int)y, (int)z);
189 189
190 //cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice); 190 //cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice);
191 191
@@ -63,11 +63,11 @@ void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){ @@ -63,11 +63,11 @@ void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){
63 63
64 64
65 unsigned int max_threads = stim::maxThreadsPerBlock(); 65 unsigned int max_threads = stim::maxThreadsPerBlock();
66 - dim3 threads(sqrt (max_threads),sqrt (max_threads));  
67 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); 66 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  67 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
68 68
69 //call the GPU kernel to determine the gradient 69 //call the GPU kernel to determine the gradient
70 - gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z); 70 + gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, (int)x, (int)y, (int)z);
71 71
72 } 72 }
73 73
cpp/local_max3.cuh
@@ -49,15 +49,15 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn @@ -49,15 +49,15 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn
49 } 49 }
50 50
51 template<typename T> 51 template<typename T>
52 -void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ 52 +void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], size_t x, size_t y, size_t z){
53 53
54 //find the max number of threads per block. 54 //find the max number of threads per block.
55 unsigned int max_threads = stim::maxThreadsPerBlock(); 55 unsigned int max_threads = stim::maxThreadsPerBlock();
56 - dim3 threads(sqrt (max_threads),sqrt (max_threads));  
57 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); 56 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  57 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
58 58
59 //call the kernel to find the local max 59 //call the kernel to find the local max
60 - cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, conn[0], conn[1], conn[2], x, y, z); 60 + cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, (int)conn[0], (int)conn[1], (int)conn[2], (int)x, (int)y, (int)z);
61 61
62 62
63 63
@@ -93,8 +93,8 @@ void init_args(int argc, char* argv[]) { @@ -93,8 +93,8 @@ void init_args(int argc, char* argv[]) {
93 iter = args["iter"].as_int(); 93 iter = args["iter"].as_int();
94 rmax = (unsigned int)args["rmax"].as_int(); 94 rmax = (unsigned int)args["rmax"].as_int();
95 nlmax = (unsigned int)args["conn"].as_int(); 95 nlmax = (unsigned int)args["conn"].as_int();
96 - t = args["t"].as_float();  
97 - sigma = args["sigma"].as_float(); 96 + t = (float)args["t"].as_float();
  97 + sigma = (float)args["sigma"].as_float();
98 phi = (float)args["phi"].as_float() * (float)stim::PI / 180; 98 phi = (float)args["phi"].as_float() * (float)stim::PI / 180;
99 99
100 } 100 }
@@ -108,8 +108,8 @@ int main(int argc, char** argv) { @@ -108,8 +108,8 @@ int main(int argc, char** argv) {
108 cudaGetDeviceProperties(&prop, i); 108 cudaGetDeviceProperties(&prop, i);
109 printf("current device ID: %d\n", i); 109 printf("current device ID: %d\n", i);
110 printf("device name: %s\n", prop.name); 110 printf("device name: %s\n", prop.name);
111 - printf("total global mem: %lu\n", prop.totalGlobalMem);  
112 - printf("shared memory per block: %lu\n", prop.sharedMemPerBlock); 111 + printf("total global mem: %zu\n", prop.totalGlobalMem);
  112 + printf("shared memory per block: %zu\n", prop.sharedMemPerBlock);
113 } 113 }
114 114
115 init_args(argc, argv); 115 init_args(argc, argv);
@@ -139,18 +139,16 @@ int main(int argc, char** argv) { @@ -139,18 +139,16 @@ int main(int argc, char** argv) {
139 139
140 140
141 ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function 141 ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function
142 - 142 + /*
143 std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary); 143 std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary);
144 fvote.write((char*)cpuI, bytes); 144 fvote.write((char*)cpuI, bytes);
145 fvote.close(); 145 fvote.close();
  146 + */
146 147
147 //allocate space on the cpu for the output result 148 //allocate space on the cpu for the output result
148 float* cpu_out = (float*)malloc(bytes * 3); 149 float* cpu_out = (float*)malloc(bytes * 3);
149 150
150 - //write the output file.  
151 - //for (int t0=0; t0<=5000; t0+=100){  
152 - // float t1 = t0;  
153 - int t0 = t; 151 +
154 lmax(cpu_out, cpuI, t, conn, x, y, z); 152 lmax(cpu_out, cpuI, t, conn, x, y, z);
155 153
156 std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary); 154 std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary);
@@ -168,7 +166,7 @@ int main(int argc, char** argv) { @@ -168,7 +166,7 @@ int main(int argc, char** argv) {
168 for (int iy = 0; iy<y; iy++) { 166 for (int iy = 0; iy<y; iy++) {
169 for (int ix = 0; ix<x; ix++) { 167 for (int ix = 0; ix<x; ix++) {
170 168
171 - int idx = iz * x * y + iy * x + ix; 169 + size_t idx = iz * x * y + iy * x + ix;
172 if (cpu_out[idx]>0) { 170 if (cpu_out[idx]>0) {
173 nod++; 171 nod++;
174 list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n'; 172 list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n';
@@ -181,8 +179,6 @@ int main(int argc, char** argv) { @@ -181,8 +179,6 @@ int main(int argc, char** argv) {
181 list.close(); 179 list.close();
182 } 180 }
183 181
184 -  
185 - //}  
186 cudaDeviceReset(); 182 cudaDeviceReset();
187 183
188 } 184 }
189 \ No newline at end of file 185 \ No newline at end of file
cpp/update_dir3_aabb.cuh
@@ -125,14 +125,14 @@ @@ -125,14 +125,14 @@
125 void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){ 125 void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){
126 126
127 unsigned int max_threads = stim::maxThreadsPerBlock(); 127 unsigned int max_threads = stim::maxThreadsPerBlock();
128 - dim3 threads(sqrt (max_threads),sqrt (max_threads));  
129 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); 128 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  129 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
130 130
131 T* gpu_dir; // allocate space on the GPU for the updated vote direction 131 T* gpu_dir; // allocate space on the GPU for the updated vote direction
132 cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3); 132 cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3);
133 133
134 //call the kernel to calculate the new voting direction 134 //call the kernel to calculate the new voting direction
135 - update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, r[0], r[1], r[2], x , y, z); 135 + update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x , (int)y, (int)z);
136 136
137 //call the kernel to update the gradient direction 137 //call the kernel to update the gradient direction
138 update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z); 138 update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z);
cpp/vote3_atomic_aabb.cuh
@@ -90,9 +90,9 @@ void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], siz @@ -90,9 +90,9 @@ void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], siz
90 90
91 91
92 unsigned int max_threads = stim::maxThreadsPerBlock(); 92 unsigned int max_threads = stim::maxThreadsPerBlock();
93 - dim3 threads(sqrt(max_threads), sqrt(max_threads));  
94 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);  
95 - vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z); //call the kernel to do the voting 93 + dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads));
  94 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
  95 + vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x, (int)y, (int)z); //call the kernel to do the voting
96 96
97 } 97 }
98 98