Compare View

switch
from
...
to
 
Commits (2)
cpp/cudafunc.cu
... ... @@ -6,11 +6,11 @@
6 6 #include "update_dir3_aabb.cuh"
7 7 #include "local_max3.cuh"
8 8 #include <fstream>
9   -
  9 +#include <sstream>
10 10 void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
11 11 int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){
12 12  
13   -
  13 + cudaSetDevice(0);
14 14 size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data
15 15  
16 16 float* gpuI0; //assign memory on gpu for the input data
... ... @@ -37,18 +37,13 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
37 37 cudaMemset(gpu_vote, 0, bytes);
38 38 gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z);
39 39 cudaDeviceSynchronize();
40   - if (i == 0) {
41   - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
42   - std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary);
43   - fvote.write((char*)img, bytes);
44   - fvote.close();
45   - }
46   - if (i == 1) {
47   - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
48   - std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary);
49   - fvote.write((char*)img, bytes);
50   - fvote.close();
51   - }
  40 +
  41 + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
  42 + std::string filename = "0-vote";
  43 + std::ofstream fvote(filename + std::to_string(i+1) + "_aabb.vol", std::ofstream::out | std::ofstream::binary);
  44 + fvote.write((char*)img, bytes);
  45 + fvote.close();
  46 +
52 47 gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z);
53 48 cudaDeviceSynchronize();
54 49 phi = phi - d_phi;
... ... @@ -64,7 +59,7 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
64 59 }
65 60  
66 61 void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){
67   - unsigned int bytes = x * y * z * sizeof(float);
  62 + size_t bytes = x * y * z * sizeof(float);
68 63  
69 64 cudaSetDevice(0);
70 65  
... ...
cpp/gaussian_blur3.cuh
... ... @@ -167,8 +167,8 @@
167 167 size_t bytes = sizeof(T) * pixels;
168 168  
169 169 unsigned int max_threads = stim::maxThreadsPerBlock();
170   - dim3 threads(sqrt (max_threads),sqrt (max_threads));
171   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  170 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  171 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
172 172  
173 173 //allocate temporary space on the GPU
174 174 T* gpuIb_x;
... ... @@ -179,13 +179,13 @@
179 179 cudaMalloc(&gpuIb_y, bytes);
180 180  
181 181 // blur the original image along the x direction
182   - blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z);
  182 + blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], (int)x, (int)y, (int)z);
183 183  
184 184 // blur the x-blurred image along the y direction
185   - blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z);
  185 + blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], (int)x, (int)y, (int)z);
186 186  
187 187 // blur the xy-blurred image along the z direction
188   - blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z);
  188 + blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], (int)x, (int)y, (int)z);
189 189  
190 190 //cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice);
191 191  
... ...
cpp/gradient3.cuh
... ... @@ -63,11 +63,11 @@ void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){
63 63  
64 64  
65 65 unsigned int max_threads = stim::maxThreadsPerBlock();
66   - dim3 threads(sqrt (max_threads),sqrt (max_threads));
67   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  66 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  67 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
68 68  
69 69 //call the GPU kernel to determine the gradient
70   - gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z);
  70 + gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, (int)x, (int)y, (int)z);
71 71  
72 72 }
73 73  
... ...
cpp/local_max3.cuh
... ... @@ -49,15 +49,15 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn
49 49 }
50 50  
51 51 template<typename T>
52   -void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
  52 +void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], size_t x, size_t y, size_t z){
53 53  
54 54 //find the max number of threads per block.
55 55 unsigned int max_threads = stim::maxThreadsPerBlock();
56   - dim3 threads(sqrt (max_threads),sqrt (max_threads));
57   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  56 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  57 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
58 58  
59 59 //call the kernel to find the local max
60   - cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, conn[0], conn[1], conn[2], x, y, z);
  60 + cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, (int)conn[0], (int)conn[1], (int)conn[2], (int)x, (int)y, (int)z);
61 61  
62 62  
63 63  
... ...
cpp/main.cpp
... ... @@ -95,8 +95,8 @@ void init_args(int argc, char* argv[]) {
95 95 iter = args["iter"].as_int();
96 96 rmax = (unsigned int)args["rmax"].as_int();
97 97 nlmax = (unsigned int)args["conn"].as_int();
98   - t = args["t"].as_float();
99   - sigma = args["sigma"].as_float();
  98 + t = (float)args["t"].as_float();
  99 + sigma = (float)args["sigma"].as_float();
100 100 phi = (float)args["phi"].as_float() * (float)stim::PI / 180;
101 101  
102 102 }
... ... @@ -110,8 +110,8 @@ int main(int argc, char** argv) {
110 110 cudaGetDeviceProperties(&prop, i);
111 111 printf("current device ID: %d\n", i);
112 112 printf("device name: %s\n", prop.name);
113   - printf("total global mem: %lu\n", prop.totalGlobalMem);
114   - printf("shared memory per block: %lu\n", prop.sharedMemPerBlock);
  113 + printf("total global mem: %zu\n", prop.totalGlobalMem);
  114 + printf("shared memory per block: %zu\n", prop.sharedMemPerBlock);
115 115 }
116 116  
117 117 init_args(argc, argv);
... ... @@ -141,18 +141,16 @@ int main(int argc, char** argv) {
141 141  
142 142  
143 143 ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function
144   -
  144 + /*
145 145 std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary);
146 146 fvote.write((char*)cpuI, bytes);
147 147 fvote.close();
  148 + */
148 149  
149 150 //allocate space on the cpu for the output result
150 151 float* cpu_out = (float*)malloc(bytes * 3);
151 152  
152   - //write the output file.
153   - //for (int t0=0; t0<=5000; t0+=100){
154   - // float t1 = t0;
155   - int t0 = t;
  153 +
156 154 lmax(cpu_out, cpuI, t, conn, x, y, z);
157 155  
158 156 std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary);
... ... @@ -170,7 +168,7 @@ int main(int argc, char** argv) {
170 168 for (int iy = 0; iy<y; iy++) {
171 169 for (int ix = 0; ix<x; ix++) {
172 170  
173   - int idx = iz * x * y + iy * x + ix;
  171 + size_t idx = iz * x * y + iy * x + ix;
174 172 if (cpu_out[idx]>0) {
175 173 nod++;
176 174 list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n';
... ... @@ -183,8 +181,6 @@ int main(int argc, char** argv) {
183 181 list.close();
184 182 }
185 183  
186   -
187   - //}
188 184 cudaDeviceReset();
189 185  
190 186 }
191 187 \ No newline at end of file
... ...
cpp/update_dir3_aabb.cuh
... ... @@ -125,14 +125,14 @@
125 125 void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){
126 126  
127 127 unsigned int max_threads = stim::maxThreadsPerBlock();
128   - dim3 threads(sqrt (max_threads),sqrt (max_threads));
129   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  128 + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads));
  129 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
130 130  
131 131 T* gpu_dir; // allocate space on the GPU for the updated vote direction
132 132 cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3);
133 133  
134 134 //call the kernel to calculate the new voting direction
135   - update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, r[0], r[1], r[2], x , y, z);
  135 + update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x , (int)y, (int)z);
136 136  
137 137 //call the kernel to update the gradient direction
138 138 update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z);
... ...
cpp/vote3_atomic_aabb.cuh
... ... @@ -90,9 +90,9 @@ void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], siz
90 90  
91 91  
92 92 unsigned int max_threads = stim::maxThreadsPerBlock();
93   - dim3 threads(sqrt(max_threads), sqrt(max_threads));
94   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
95   - vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z); //call the kernel to do the voting
  93 + dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads));
  94 + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z);
  95 + vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x, (int)y, (int)z); //call the kernel to do the voting
96 96  
97 97 }
98 98  
... ...