diff --git a/cpp/cudafunc.cu b/cpp/cudafunc.cu index db74079..0d7d4a4 100644 --- a/cpp/cudafunc.cu +++ b/cpp/cudafunc.cu @@ -6,11 +6,11 @@ #include "update_dir3_aabb.cuh" #include "local_max3.cuh" #include - +#include void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){ - + cudaSetDevice(0); size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data float* gpuI0; //assign memory on gpu for the input data @@ -37,18 +37,13 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], cudaMemset(gpu_vote, 0, bytes); gpu_vote3(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z); cudaDeviceSynchronize(); - if (i == 0) { - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); - std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary); - fvote.write((char*)img, bytes); - fvote.close(); - } - if (i == 1) { - cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); - std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary); - fvote.write((char*)img, bytes); - fvote.close(); - } + + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); + std::string filename = "0-vote"; + std::ofstream fvote(filename + std::to_string(i+1) + "_aabb.vol", std::ofstream::out | std::ofstream::binary); + fvote.write((char*)img, bytes); + fvote.close(); + gpu_update_dir3(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z); cudaDeviceSynchronize(); phi = phi - d_phi; @@ -64,7 +59,7 @@ void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], } void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){ - unsigned int bytes = x * y * z * sizeof(float); + size_t bytes = x * y * z * sizeof(float); cudaSetDevice(0); diff --git a/cpp/gaussian_blur3.cuh b/cpp/gaussian_blur3.cuh index f668a50..38eb25c 100644 --- a/cpp/gaussian_blur3.cuh +++ b/cpp/gaussian_blur3.cuh @@ -167,8 +167,8 @@ size_t bytes = sizeof(T) * pixels; unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(sqrt (max_threads),sqrt (max_threads)); - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads)); + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); //allocate temporary space on the GPU T* gpuIb_x; @@ -179,13 +179,13 @@ cudaMalloc(&gpuIb_y, bytes); // blur the original image along the x direction - blur_x <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z); + blur_x <<< blocks, threads >>>(gpuIb_x, image, sigma[0], (int)x, (int)y, (int)z); // blur the x-blurred image along the y direction - blur_y <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z); + blur_y <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], (int)x, (int)y, (int)z); // blur the xy-blurred image along the z direction - blur_z <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z); + blur_z <<< blocks, threads >>>(image, gpuIb_y, sigma[2], (int)x, (int)y, (int)z); //cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice); diff --git a/cpp/gradient3.cuh b/cpp/gradient3.cuh index 92f0ccf..b515257 100644 --- a/cpp/gradient3.cuh +++ b/cpp/gradient3.cuh @@ -63,11 +63,11 @@ void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){ unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(sqrt (max_threads),sqrt (max_threads)); - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads)); + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); //call the GPU kernel to determine the gradient - gradient3 <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z); + gradient3 <<< blocks, threads >>>(gpuGrad, gpuI, (int)x, (int)y, (int)z); } diff --git a/cpp/local_max3.cuh b/cpp/local_max3.cuh index a13bd56..677f3cb 100644 --- a/cpp/local_max3.cuh +++ b/cpp/local_max3.cuh @@ -49,15 +49,15 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn } template -void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ +void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], size_t x, size_t y, size_t z){ //find the max number of threads per block. unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(sqrt (max_threads),sqrt (max_threads)); - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads)); + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); //call the kernel to find the local max - cuda_local_max3<<>>(gpu_output, gpu_vote, conn[0], conn[1], conn[2], x, y, z); + cuda_local_max3<<>>(gpu_output, gpu_vote, (int)conn[0], (int)conn[1], (int)conn[2], (int)x, (int)y, (int)z); diff --git a/cpp/main.cpp b/cpp/main.cpp index 1f0d7a2..72acb2d 100644 --- a/cpp/main.cpp +++ b/cpp/main.cpp @@ -93,8 +93,8 @@ void init_args(int argc, char* argv[]) { iter = args["iter"].as_int(); rmax = (unsigned int)args["rmax"].as_int(); nlmax = (unsigned int)args["conn"].as_int(); - t = args["t"].as_float(); - sigma = args["sigma"].as_float(); + t = (float)args["t"].as_float(); + sigma = (float)args["sigma"].as_float(); phi = (float)args["phi"].as_float() * (float)stim::PI / 180; } @@ -108,8 +108,8 @@ int main(int argc, char** argv) { cudaGetDeviceProperties(&prop, i); printf("current device ID: %d\n", i); printf("device name: %s\n", prop.name); - printf("total global mem: %lu\n", prop.totalGlobalMem); - printf("shared memory per block: %lu\n", prop.sharedMemPerBlock); + printf("total global mem: %zu\n", prop.totalGlobalMem); + printf("shared memory per block: %zu\n", prop.sharedMemPerBlock); } init_args(argc, argv); @@ -139,18 +139,16 @@ int main(int argc, char** argv) { ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function - + /* std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary); fvote.write((char*)cpuI, bytes); fvote.close(); + */ //allocate space on the cpu for the output result float* cpu_out = (float*)malloc(bytes * 3); - //write the output file. - //for (int t0=0; t0<=5000; t0+=100){ - // float t1 = t0; - int t0 = t; + lmax(cpu_out, cpuI, t, conn, x, y, z); std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary); @@ -168,7 +166,7 @@ int main(int argc, char** argv) { for (int iy = 0; iy0) { nod++; list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n'; @@ -181,8 +179,6 @@ int main(int argc, char** argv) { list.close(); } - - //} cudaDeviceReset(); } \ No newline at end of file diff --git a/cpp/update_dir3_aabb.cuh b/cpp/update_dir3_aabb.cuh index 5491985..bf8e0a7 100644 --- a/cpp/update_dir3_aabb.cuh +++ b/cpp/update_dir3_aabb.cuh @@ -125,14 +125,14 @@ void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){ unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(sqrt (max_threads),sqrt (max_threads)); - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); + dim3 threads((unsigned int)sqrt (max_threads), (unsigned int)sqrt (max_threads)); + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); T* gpu_dir; // allocate space on the GPU for the updated vote direction cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3); //call the kernel to calculate the new voting direction - update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, r[0], r[1], r[2], x , y, z); + update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x , (int)y, (int)z); //call the kernel to update the gradient direction update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z); diff --git a/cpp/vote3_atomic_aabb.cuh b/cpp/vote3_atomic_aabb.cuh index f5f21d8..8e8556a 100644 --- a/cpp/vote3_atomic_aabb.cuh +++ b/cpp/vote3_atomic_aabb.cuh @@ -90,9 +90,9 @@ void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], siz unsigned int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(sqrt(max_threads), sqrt(max_threads)); - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); - vote3 << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z); //call the kernel to do the voting + dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads)); + dim3 blocks((unsigned int)x / threads.x + 1, ((unsigned int)y / threads.y + 1) * (unsigned int)z); + vote3 << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, (int)r[0], (int)r[1], (int)r[2], (int)x, (int)y, (int)z); //call the kernel to do the voting } -- libgit2 0.21.4