Compare View

switch
from
...
to
 
Commits (2)
@@ -6,11 +6,11 @@ @@ -6,11 +6,11 @@
6 #include "update_dir3_aabb.cuh" 6 #include "update_dir3_aabb.cuh"
7 #include "local_max3.cuh" 7 #include "local_max3.cuh"
8 #include <fstream> 8 #include <fstream>
9 - 9 +#include <sstream>
10 void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], 10 void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
11 int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){ 11 int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){
12 12
13 - 13 + cudaSetDevice(0);
14 size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data 14 size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data
15 15
16 float* gpuI0; //assign memory on gpu for the input data 16 float* gpuI0; //assign memory on gpu for the input data
@@ -37,18 +37,13 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], @@ -37,18 +37,13 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
37 cudaMemset(gpu_vote, 0, bytes); 37 cudaMemset(gpu_vote, 0, bytes);
38 gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z); 38 gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z);
39 cudaDeviceSynchronize(); 39 cudaDeviceSynchronize();
40 - if (i == 0) {  
41 - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);  
42 - std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary);  
43 - fvote.write((char*)img, bytes);  
44 - fvote.close();  
45 - }  
46 - if (i == 1) {  
47 - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);  
48 - std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary);  
49 - fvote.write((char*)img, bytes);  
50 - fvote.close();  
51 - } 40 +
  41 + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
  42 + std::string filename = "0-vote";
  43 + std::ofstream fvote(filename + std::to_string(i+1) + "_aabb.vol", std::ofstream::out | std::ofstream::binary);
  44 + fvote.write((char*)img, bytes);
  45 + fvote.close();
  46 +
52 gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z); 47 gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z);
53 cudaDeviceSynchronize(); 48 cudaDeviceSynchronize();
54 phi = phi - d_phi; 49 phi = phi - d_phi;
@@ -64,7 +59,7 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], @@ -64,7 +59,7 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
64 } 59 }
65 60
66 void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){ 61 void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){
67 - unsigned int bytes = x * y * z * sizeof(float); 62 + size_t bytes = x * y * z * sizeof(float);
68 63
69 cudaSetDevice(0); 64 cudaSetDevice(0);
70 65
cpp/gaussian_blur3.cuh
@@ -167,8 +167,8 @@ @@ -167,8 +167,8 @@
167 size_t bytes = sizeof(T) * pixels; 167 size_t bytes = sizeof(T) * pixels;
168 168
169 unsigned int max_threads = stim::maxThreadsPerBlock(); 169 unsigned int max_threads = stim::maxThreadsPerBlock();
170 - dim3 threads(sqrt (max_threads),sqrt (max_threads));  
171 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); 170 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  171 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
172 172
173 //allocate temporary space on the GPU 173 //allocate temporary space on the GPU
174 T* gpuIb_x; 174 T* gpuIb_x;
@@ -179,13 +179,13 @@ @@ -179,13 +179,13 @@
179 cudaMalloc(&gpuIb_y, bytes); 179 cudaMalloc(&gpuIb_y, bytes);
180 180
181 // blur the original image along the x direction 181 // blur the original image along the x direction
182 - blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z); 182 + blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], (int)x, (int)y, (int)z);
183 183
184 // blur the x-blurred image along the y direction 184 // blur the x-blurred image along the y direction
185 - blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z); 185 + blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], (int)x, (int)y, (int)z);
186 186
187 // blur the xy-blurred image along the z direction 187 // blur the xy-blurred image along the z direction
188 - blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z); 188 + blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], (int)x, (int)y, (int)z);
189 189
190 //cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice); 190 //cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice);
191 191
@@ -63,11 +63,11 @@ void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){ @@ -63,11 +63,11 @@ void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){
63 63
64 64
65 unsigned int max_threads = stim::maxThreadsPerBlock(); 65 unsigned int max_threads = stim::maxThreadsPerBlock();
66 - dim3 threads(sqrt (max_threads),sqrt (max_threads));  
67 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); 66 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  67 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
68 68
69 //call the GPU kernel to determine the gradient 69 //call the GPU kernel to determine the gradient
70 - gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z); 70 + gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, (int)x, (int)y, (int)z);
71 71
72 } 72 }
73 73
cpp/local_max3.cuh
@@ -49,15 +49,15 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn @@ -49,15 +49,15 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn
49 } 49 }
50 50
51 template<typename T> 51 template<typename T>
52 -void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ 52 +void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], size_t x, size_t y, size_t z){
53 53
54 //find the max number of threads per block. 54 //find the max number of threads per block.
55 unsigned int max_threads = stim::maxThreadsPerBlock(); 55 unsigned int max_threads = stim::maxThreadsPerBlock();
56 - dim3 threads(sqrt (max_threads),sqrt (max_threads));  
57 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); 56 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  57 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
58 58
59 //call the kernel to find the local max 59 //call the kernel to find the local max
60 - cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, conn[0], conn[1], conn[2], x, y, z); 60 + cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, (int)conn[0], (int)conn[1], (int)conn[2], (int)x, (int)y, (int)z);
61 61
62 62
63 63
@@ -95,8 +95,8 @@ void init_args(int argc, char* argv[]) { @@ -95,8 +95,8 @@ void init_args(int argc, char* argv[]) {
95 iter = args["iter"].as_int(); 95 iter = args["iter"].as_int();
96 rmax = (unsigned int)args["rmax"].as_int(); 96 rmax = (unsigned int)args["rmax"].as_int();
97 nlmax = (unsigned int)args["conn"].as_int(); 97 nlmax = (unsigned int)args["conn"].as_int();
98 - t = args["t"].as_float();  
99 - sigma = args["sigma"].as_float(); 98 + t = (float)args["t"].as_float();
  99 + sigma = (float)args["sigma"].as_float();
100 phi = (float)args["phi"].as_float() * (float)stim::PI / 180; 100 phi = (float)args["phi"].as_float() * (float)stim::PI / 180;
101 101
102 } 102 }
@@ -110,8 +110,8 @@ int main(int argc, char** argv) { @@ -110,8 +110,8 @@ int main(int argc, char** argv) {
110 cudaGetDeviceProperties(&prop, i); 110 cudaGetDeviceProperties(&prop, i);
111 printf("current device ID: %d\n", i); 111 printf("current device ID: %d\n", i);
112 printf("device name: %s\n", prop.name); 112 printf("device name: %s\n", prop.name);
113 - printf("total global mem: %lu\n", prop.totalGlobalMem);  
114 - printf("shared memory per block: %lu\n", prop.sharedMemPerBlock); 113 + printf("total global mem: %zu\n", prop.totalGlobalMem);
  114 + printf("shared memory per block: %zu\n", prop.sharedMemPerBlock);
115 } 115 }
116 116
117 init_args(argc, argv); 117 init_args(argc, argv);
@@ -141,18 +141,16 @@ int main(int argc, char** argv) { @@ -141,18 +141,16 @@ int main(int argc, char** argv) {
141 141
142 142
143 ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function 143 ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function
144 - 144 + /*
145 std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary); 145 std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary);
146 fvote.write((char*)cpuI, bytes); 146 fvote.write((char*)cpuI, bytes);
147 fvote.close(); 147 fvote.close();
  148 + */
148 149
149 //allocate space on the cpu for the output result 150 //allocate space on the cpu for the output result
150 float* cpu_out = (float*)malloc(bytes * 3); 151 float* cpu_out = (float*)malloc(bytes * 3);
151 152
152 - //write the output file.  
153 - //for (int t0=0; t0<=5000; t0+=100){  
154 - // float t1 = t0;  
155 - int t0 = t; 153 +
156 lmax(cpu_out, cpuI, t, conn, x, y, z); 154 lmax(cpu_out, cpuI, t, conn, x, y, z);
157 155
158 std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary); 156 std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary);
@@ -170,7 +168,7 @@ int main(int argc, char** argv) { @@ -170,7 +168,7 @@ int main(int argc, char** argv) {
170 for (int iy = 0; iy<y; iy++) { 168 for (int iy = 0; iy<y; iy++) {
171 for (int ix = 0; ix<x; ix++) { 169 for (int ix = 0; ix<x; ix++) {
172 170
173 - int idx = iz * x * y + iy * x + ix; 171 + size_t idx = iz * x * y + iy * x + ix;
174 if (cpu_out[idx]>0) { 172 if (cpu_out[idx]>0) {
175 nod++; 173 nod++;
176 list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n'; 174 list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n';
@@ -183,8 +181,6 @@ int main(int argc, char** argv) { @@ -183,8 +181,6 @@ int main(int argc, char** argv) {
183 list.close(); 181 list.close();
184 } 182 }
185 183
186 -  
187 - //}  
188 cudaDeviceReset(); 184 cudaDeviceReset();
189 185
190 } 186 }
191 \ No newline at end of file 187 \ No newline at end of file
cpp/update_dir3_aabb.cuh
@@ -125,14 +125,14 @@ @@ -125,14 +125,14 @@
125 void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){ 125 void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){
126 126
127 unsigned int max_threads = stim::maxThreadsPerBlock(); 127 unsigned int max_threads = stim::maxThreadsPerBlock();
128 - dim3 threads(sqrt (max_threads),sqrt (max_threads));  
129 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); 128 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  129 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
130 130
131 T* gpu_dir; // allocate space on the GPU for the updated vote direction 131 T* gpu_dir; // allocate space on the GPU for the updated vote direction
132 cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3); 132 cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3);
133 133
134 //call the kernel to calculate the new voting direction 134 //call the kernel to calculate the new voting direction
135 - update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, r[0], r[1], r[2], x , y, z); 135 + update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x , (int)y, (int)z);
136 136
137 //call the kernel to update the gradient direction 137 //call the kernel to update the gradient direction
138 update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z); 138 update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z);
cpp/vote3_atomic_aabb.cuh
@@ -90,9 +90,9 @@ void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], siz @@ -90,9 +90,9 @@ void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], siz
90 90
91 91
92 unsigned int max_threads = stim::maxThreadsPerBlock(); 92 unsigned int max_threads = stim::maxThreadsPerBlock();
93 - dim3 threads(sqrt(max_threads), sqrt(max_threads));  
94 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);  
95 - vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z); //call the kernel to do the voting 93 + dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads));
  94 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
  95 + vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x, (int)y, (int)z); //call the kernel to do the voting
96 96
97 } 97 }
98 98