Commit 07e31b3406b864f75e632888e366669300ab92a6

Authored by Laila Saadatifard
1 parent 310a1698

fix the bugs in ivote3

cpp/cudafunc.cu
... ... @@ -6,11 +6,11 @@
6 6 #include "update_dir3_aabb.cuh"
7 7 #include "local_max3.cuh"
8 8 #include <fstream>
9   -
  9 +#include <sstream>
10 10 void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
11 11 int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){
12 12  
13   -
  13 + cudaSetDevice(0);
14 14 size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data
15 15  
16 16 float* gpuI0; //assign memory on gpu for the input data
... ... @@ -37,18 +37,13 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
37 37 cudaMemset(gpu_vote, 0, bytes);
38 38 gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z);
39 39 cudaDeviceSynchronize();
40   - if (i == 0) {
41   - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
42   - std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary);
43   - fvote.write((char*)img, bytes);
44   - fvote.close();
45   - }
46   - if (i == 1) {
47   - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
48   - std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary);
49   - fvote.write((char*)img, bytes);
50   - fvote.close();
51   - }
  40 +
  41 + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
  42 + std::string filename = "0-vote";
  43 + std::ofstream fvote(filename + std::to_string(i+1) + "_aabb.vol", std::ofstream::out | std::ofstream::binary);
  44 + fvote.write((char*)img, bytes);
  45 + fvote.close();
  46 +
52 47 gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z);
53 48 cudaDeviceSynchronize();
54 49 phi = phi - d_phi;
... ... @@ -64,7 +59,7 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
64 59 }
65 60  
66 61 void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){
67   - unsigned int bytes = x * y * z * sizeof(float);
  62 + size_t bytes = x * y * z * sizeof(float);
68 63  
69 64 cudaSetDevice(0);
70 65  
... ...
cpp/gaussian_blur3.cuh
... ... @@ -167,8 +167,8 @@
167 167 size_t bytes = sizeof(T) * pixels;
168 168  
169 169 unsigned int max_threads = stim::maxThreadsPerBlock();
170   - dim3 threads(sqrt (max_threads),sqrt (max_threads));
171   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  170 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  171 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
172 172  
173 173 //allocate temporary space on the GPU
174 174 T* gpuIb_x;
... ... @@ -179,13 +179,13 @@
179 179 cudaMalloc(&gpuIb_y, bytes);
180 180  
181 181 // blur the original image along the x direction
182   - blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z);
  182 + blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], (int)x, (int)y, (int)z);
183 183  
184 184 // blur the x-blurred image along the y direction
185   - blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z);
  185 + blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], (int)x, (int)y, (int)z);
186 186  
187 187 // blur the xy-blurred image along the z direction
188   - blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z);
  188 + blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], (int)x, (int)y, (int)z);
189 189  
190 190 //cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice);
191 191  
... ...
cpp/gradient3.cuh
... ... @@ -63,11 +63,11 @@ void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){
63 63  
64 64  
65 65 unsigned int max_threads = stim::maxThreadsPerBlock();
66   - dim3 threads(sqrt (max_threads),sqrt (max_threads));
67   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  66 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  67 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
68 68  
69 69 //call the GPU kernel to determine the gradient
70   - gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z);
  70 + gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, (int)x, (int)y, (int)z);
71 71  
72 72 }
73 73  
... ...
cpp/local_max3.cuh
... ... @@ -49,15 +49,15 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn
49 49 }
50 50  
51 51 template<typename T>
52   -void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
  52 +void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], size_t x, size_t y, size_t z){
53 53  
54 54 //find the max number of threads per block.
55 55 unsigned int max_threads = stim::maxThreadsPerBlock();
56   - dim3 threads(sqrt (max_threads),sqrt (max_threads));
57   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  56 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  57 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
58 58  
59 59 //call the kernel to find the local max
60   - cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, conn[0], conn[1], conn[2], x, y, z);
  60 + cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, (int)conn[0], (int)conn[1], (int)conn[2], (int)x, (int)y, (int)z);
61 61  
62 62  
63 63  
... ...
cpp/main.cpp
... ... @@ -93,8 +93,8 @@ void init_args(int argc, char* argv[]) {
93 93 iter = args["iter"].as_int();
94 94 rmax = (unsigned int)args["rmax"].as_int();
95 95 nlmax = (unsigned int)args["conn"].as_int();
96   - t = args["t"].as_float();
97   - sigma = args["sigma"].as_float();
  96 + t = (float)args["t"].as_float();
  97 + sigma = (float)args["sigma"].as_float();
98 98 phi = (float)args["phi"].as_float() * (float)stim::PI / 180;
99 99  
100 100 }
... ... @@ -108,8 +108,8 @@ int main(int argc, char** argv) {
108 108 cudaGetDeviceProperties(&prop, i);
109 109 printf("current device ID: %d\n", i);
110 110 printf("device name: %s\n", prop.name);
111   - printf("total global mem: %lu\n", prop.totalGlobalMem);
112   - printf("shared memory per block: %lu\n", prop.sharedMemPerBlock);
  111 + printf("total global mem: %zu\n", prop.totalGlobalMem);
  112 + printf("shared memory per block: %zu\n", prop.sharedMemPerBlock);
113 113 }
114 114  
115 115 init_args(argc, argv);
... ... @@ -139,18 +139,16 @@ int main(int argc, char** argv) {
139 139  
140 140  
141 141 ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function
142   -
  142 + /*
143 143 std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary);
144 144 fvote.write((char*)cpuI, bytes);
145 145 fvote.close();
  146 + */
146 147  
147 148 //allocate space on the cpu for the output result
148 149 float* cpu_out = (float*)malloc(bytes * 3);
149 150  
150   - //write the output file.
151   - //for (int t0=0; t0<=5000; t0+=100){
152   - // float t1 = t0;
153   - int t0 = t;
  151 +
154 152 lmax(cpu_out, cpuI, t, conn, x, y, z);
155 153  
156 154 std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary);
... ... @@ -168,7 +166,7 @@ int main(int argc, char** argv) {
168 166 for (int iy = 0; iy<y; iy++) {
169 167 for (int ix = 0; ix<x; ix++) {
170 168  
171   - int idx = iz * x * y + iy * x + ix;
  169 + size_t idx = iz * x * y + iy * x + ix;
172 170 if (cpu_out[idx]>0) {
173 171 nod++;
174 172 list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n';
... ... @@ -181,8 +179,6 @@ int main(int argc, char** argv) {
181 179 list.close();
182 180 }
183 181  
184   -
185   - //}
186 182 cudaDeviceReset();
187 183  
188 184 }
189 185 \ No newline at end of file
... ...
cpp/update_dir3_aabb.cuh
... ... @@ -125,14 +125,14 @@
125 125 void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){
126 126  
127 127 unsigned int max_threads = stim::maxThreadsPerBlock();
128   - dim3 threads(sqrt (max_threads),sqrt (max_threads));
129   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  128 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  129 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
130 130  
131 131 T* gpu_dir; // allocate space on the GPU for the updated vote direction
132 132 cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3);
133 133  
134 134 //call the kernel to calculate the new voting direction
135   - update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, r[0], r[1], r[2], x , y, z);
  135 + update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x , (int)y, (int)z);
136 136  
137 137 //call the kernel to update the gradient direction
138 138 update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z);
... ...
cpp/vote3_atomic_aabb.cuh
... ... @@ -90,9 +90,9 @@ void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], siz
90 90  
91 91  
92 92 unsigned int max_threads = stim::maxThreadsPerBlock();
93   - dim3 threads(sqrt(max_threads), sqrt(max_threads));
94   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
95   - vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z); //call the kernel to do the voting
  93 + dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads));
  94 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
  95 + vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x, (int)y, (int)z); //call the kernel to do the voting
96 96  
97 97 }
98 98  
... ...