Commit 6ef1dab9e7cfa8e30d297ba698ffab696b112dc7
1 parent
5c079506
Fix one bug in the gaussian_blur3 code; this ivote3 project works well for data image sizes less than 512 by 512 by 256
Showing
5 changed files
with
18 additions
and
15 deletions
Show diff stats
cpp/cudafunc.cu
1 | +#include "cuda_fp16.h" | |
2 | +#include "float_to_half.cuh" | |
3 | +#include "half_to_float.cuh" | |
1 | 4 | #include "gaussian_blur3.cuh" |
2 | 5 | #include "gradient3.cuh" |
3 | 6 | #include "mag3.cuh" |
... | ... | @@ -9,7 +12,6 @@ |
9 | 12 | void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, unsigned int r[], |
10 | 13 | int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ |
11 | 14 | |
12 | - | |
13 | 15 | // compute the number of bytes in the input data |
14 | 16 | unsigned int bytes = x * y * z * sizeof(float); |
15 | 17 | |
... | ... | @@ -68,7 +70,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un |
68 | 70 | |
69 | 71 | cudaFree(gpu_grad); |
70 | 72 | cudaFree(gpu_mag); |
71 | - //cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost); | |
73 | + cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost); | |
72 | 74 | |
73 | 75 | //allocate space on the gpu for the final detected cells. |
74 | 76 | float* gpu_output; |
... | ... | @@ -78,7 +80,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un |
78 | 80 | gpu_local_max3<float>(gpu_output, gpu_vote, t, conn, x, y, z); |
79 | 81 | |
80 | 82 | //copy the final result to the cpu. |
81 | - cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost); | |
83 | + //cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost); | |
82 | 84 | |
83 | 85 | |
84 | 86 | cudaFree(gpu_vote); | ... | ... |
cpp/float_to_half.cuh
... | ... | @@ -6,8 +6,8 @@ |
6 | 6 | #include <stim/cuda/cudatools.h> |
7 | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | 8 | #include <stim/cuda/cudatools/error.h> |
9 | -#include "cuda_fp16.h" | |
10 | - | |
9 | +#include <cuda_fp16.h> | |
10 | +#include <stdio.h> | |
11 | 11 | __global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){ |
12 | 12 | |
13 | 13 | |
... | ... | @@ -24,7 +24,8 @@ |
24 | 24 | |
25 | 25 | |
26 | 26 | gpu_half[i] = __float2half(gpu_float[i]); |
27 | - | |
27 | + | |
28 | + | |
28 | 29 | } |
29 | 30 | |
30 | 31 | ... | ... |
cpp/gaussian_blur3.cuh
... | ... | @@ -12,7 +12,7 @@ |
12 | 12 | |
13 | 13 | |
14 | 14 | template<typename T> |
15 | - __global__ void blur_x(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){ | |
15 | + __global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){ | |
16 | 16 | |
17 | 17 | //calculate x,y,z coordinates for this thread |
18 | 18 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
... | ... | @@ -55,7 +55,7 @@ |
55 | 55 | |
56 | 56 | |
57 | 57 | template<typename T> |
58 | - __global__ void blur_y(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){ | |
58 | + __global__ void blur_y(T* out, T* in, T sigma, int x, int y, int z){ | |
59 | 59 | |
60 | 60 | //calculate x,y,z coordinates for this thread |
61 | 61 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
... | ... | @@ -98,7 +98,7 @@ |
98 | 98 | } |
99 | 99 | |
100 | 100 | template<typename T> |
101 | - __global__ void blur_z(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){ | |
101 | + __global__ void blur_z(T* out, T* in, T sigma, int x, int y, int z){ | |
102 | 102 | |
103 | 103 | //calculate x,y,z coordinates for this thread |
104 | 104 | int xi = blockIdx.x * blockDim.x + threadIdx.x; | ... | ... |
cpp/main.cpp
... | ... | @@ -104,7 +104,7 @@ int main(int argc, char** argv){ |
104 | 104 | invert_data(cpuI, x, y, z); |
105 | 105 | |
106 | 106 | //write a new file from the cpuI. |
107 | - std::ofstream original("output/0-original_invert.vol", std::ofstream::out | std::ofstream::binary); | |
107 | + std::ofstream original("output/original_invert--512.vol", std::ofstream::out | std::ofstream::binary); | |
108 | 108 | original.write((char*)cpuI, bytes); |
109 | 109 | original.close(); |
110 | 110 | |
... | ... | @@ -115,7 +115,7 @@ int main(int argc, char** argv){ |
115 | 115 | ivote3(cpu_out, cpuI, sigma, phi, d_phi, r, iter, t, conn, x, y, z); |
116 | 116 | |
117 | 117 | //write the blurred file from the cpuI. |
118 | - std::ofstream fblur("output/test0.vol", std::ofstream::out | std::ofstream::binary); | |
118 | + std::ofstream fblur("output/v1--512.vol", std::ofstream::out | std::ofstream::binary); | |
119 | 119 | fblur.write((char*)cpuI, bytes); |
120 | 120 | fblur.close(); |
121 | 121 | ... | ... |
cpp/update_dir3.cuh
... | ... | @@ -5,7 +5,7 @@ |
5 | 5 | # include <cuda.h> |
6 | 6 | #include <stim/cuda/cudatools.h> |
7 | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | - | |
8 | +#include <cuda_fp16.h> | |
9 | 9 | |
10 | 10 | // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. |
11 | 11 | template<typename T> |
... | ... | @@ -33,9 +33,9 @@ |
33 | 33 | float max = 0; |
34 | 34 | float l_vote = 0; |
35 | 35 | // define local variables for the x, y, and z coordinates where the maximum occurred |
36 | - int id_x = g_v_x; | |
37 | - int id_y = g_v_y; | |
38 | - int id_z = g_v_z; | |
36 | + float id_x = g_v_x; | |
37 | + float id_y = g_v_y; | |
38 | + float id_z = g_v_z; | |
39 | 39 | |
40 | 40 | int rx_sq = rx * rx; |
41 | 41 | int ry_sq = ry * ry; | ... | ... |