Commit 6ef1dab9e7cfa8e30d297ba698ffab696b112dc7
1 parent
5c079506
Fix one bug in the gaussian_blur3 code; this ivote3 project works well for data image sizes less than 512 by 512 by 256.
Showing
5 changed files
with
18 additions
and
15 deletions
Show diff stats
cpp/cudafunc.cu
1 | +#include "cuda_fp16.h" | ||
2 | +#include "float_to_half.cuh" | ||
3 | +#include "half_to_float.cuh" | ||
1 | #include "gaussian_blur3.cuh" | 4 | #include "gaussian_blur3.cuh" |
2 | #include "gradient3.cuh" | 5 | #include "gradient3.cuh" |
3 | #include "mag3.cuh" | 6 | #include "mag3.cuh" |
@@ -9,7 +12,6 @@ | @@ -9,7 +12,6 @@ | ||
9 | void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, unsigned int r[], | 12 | void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, unsigned int r[], |
10 | int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ | 13 | int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ |
11 | 14 | ||
12 | - | ||
13 | // compute the number of bytes in the input data | 15 | // compute the number of bytes in the input data |
14 | unsigned int bytes = x * y * z * sizeof(float); | 16 | unsigned int bytes = x * y * z * sizeof(float); |
15 | 17 | ||
@@ -68,7 +70,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un | @@ -68,7 +70,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un | ||
68 | 70 | ||
69 | cudaFree(gpu_grad); | 71 | cudaFree(gpu_grad); |
70 | cudaFree(gpu_mag); | 72 | cudaFree(gpu_mag); |
71 | - //cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost); | 73 | + cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost); |
72 | 74 | ||
73 | //allocate space on the gpu for the final detected cells. | 75 | //allocate space on the gpu for the final detected cells. |
74 | float* gpu_output; | 76 | float* gpu_output; |
@@ -78,7 +80,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un | @@ -78,7 +80,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un | ||
78 | gpu_local_max3<float>(gpu_output, gpu_vote, t, conn, x, y, z); | 80 | gpu_local_max3<float>(gpu_output, gpu_vote, t, conn, x, y, z); |
79 | 81 | ||
80 | //copy the final result to the cpu. | 82 | //copy the final result to the cpu. |
81 | - cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost); | 83 | + //cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost); |
82 | 84 | ||
83 | 85 | ||
84 | cudaFree(gpu_vote); | 86 | cudaFree(gpu_vote); |
cpp/float_to_half.cuh
@@ -6,8 +6,8 @@ | @@ -6,8 +6,8 @@ | ||
6 | #include <stim/cuda/cudatools.h> | 6 | #include <stim/cuda/cudatools.h> |
7 | #include <stim/cuda/sharedmem.cuh> | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | #include <stim/cuda/cudatools/error.h> | 8 | #include <stim/cuda/cudatools/error.h> |
9 | -#include "cuda_fp16.h" | ||
10 | - | 9 | +#include <cuda_fp16.h> |
10 | +#include <stdio.h> | ||
11 | __global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){ | 11 | __global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){ |
12 | 12 | ||
13 | 13 | ||
@@ -24,7 +24,8 @@ | @@ -24,7 +24,8 @@ | ||
24 | 24 | ||
25 | 25 | ||
26 | gpu_half[i] = __float2half(gpu_float[i]); | 26 | gpu_half[i] = __float2half(gpu_float[i]); |
27 | - | 27 | + |
28 | + | ||
28 | } | 29 | } |
29 | 30 | ||
30 | 31 |
cpp/gaussian_blur3.cuh
@@ -12,7 +12,7 @@ | @@ -12,7 +12,7 @@ | ||
12 | 12 | ||
13 | 13 | ||
14 | template<typename T> | 14 | template<typename T> |
15 | - __global__ void blur_x(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){ | 15 | + __global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){ |
16 | 16 | ||
17 | //calculate x,y,z coordinates for this thread | 17 | //calculate x,y,z coordinates for this thread |
18 | int xi = blockIdx.x * blockDim.x + threadIdx.x; | 18 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
@@ -55,7 +55,7 @@ | @@ -55,7 +55,7 @@ | ||
55 | 55 | ||
56 | 56 | ||
57 | template<typename T> | 57 | template<typename T> |
58 | - __global__ void blur_y(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){ | 58 | + __global__ void blur_y(T* out, T* in, T sigma, int x, int y, int z){ |
59 | 59 | ||
60 | //calculate x,y,z coordinates for this thread | 60 | //calculate x,y,z coordinates for this thread |
61 | int xi = blockIdx.x * blockDim.x + threadIdx.x; | 61 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
@@ -98,7 +98,7 @@ | @@ -98,7 +98,7 @@ | ||
98 | } | 98 | } |
99 | 99 | ||
100 | template<typename T> | 100 | template<typename T> |
101 | - __global__ void blur_z(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){ | 101 | + __global__ void blur_z(T* out, T* in, T sigma, int x, int y, int z){ |
102 | 102 | ||
103 | //calculate x,y,z coordinates for this thread | 103 | //calculate x,y,z coordinates for this thread |
104 | int xi = blockIdx.x * blockDim.x + threadIdx.x; | 104 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
cpp/main.cpp
@@ -104,7 +104,7 @@ int main(int argc, char** argv){ | @@ -104,7 +104,7 @@ int main(int argc, char** argv){ | ||
104 | invert_data(cpuI, x, y, z); | 104 | invert_data(cpuI, x, y, z); |
105 | 105 | ||
106 | //write a new file from the cpuI. | 106 | //write a new file from the cpuI. |
107 | - std::ofstream original("output/0-original_invert.vol", std::ofstream::out | std::ofstream::binary); | 107 | + std::ofstream original("output/original_invert--512.vol", std::ofstream::out | std::ofstream::binary); |
108 | original.write((char*)cpuI, bytes); | 108 | original.write((char*)cpuI, bytes); |
109 | original.close(); | 109 | original.close(); |
110 | 110 | ||
@@ -115,7 +115,7 @@ int main(int argc, char** argv){ | @@ -115,7 +115,7 @@ int main(int argc, char** argv){ | ||
115 | ivote3(cpu_out, cpuI, sigma, phi, d_phi, r, iter, t, conn, x, y, z); | 115 | ivote3(cpu_out, cpuI, sigma, phi, d_phi, r, iter, t, conn, x, y, z); |
116 | 116 | ||
117 | //write the blurred file from the cpuI. | 117 | //write the blurred file from the cpuI. |
118 | - std::ofstream fblur("output/test0.vol", std::ofstream::out | std::ofstream::binary); | 118 | + std::ofstream fblur("output/v1--512.vol", std::ofstream::out | std::ofstream::binary); |
119 | fblur.write((char*)cpuI, bytes); | 119 | fblur.write((char*)cpuI, bytes); |
120 | fblur.close(); | 120 | fblur.close(); |
121 | 121 |
cpp/update_dir3.cuh
@@ -5,7 +5,7 @@ | @@ -5,7 +5,7 @@ | ||
5 | # include <cuda.h> | 5 | # include <cuda.h> |
6 | #include <stim/cuda/cudatools.h> | 6 | #include <stim/cuda/cudatools.h> |
7 | #include <stim/cuda/sharedmem.cuh> | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | - | 8 | +#include <cuda_fp16.h> |
9 | 9 | ||
10 | // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. | 10 | // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. |
11 | template<typename T> | 11 | template<typename T> |
@@ -33,9 +33,9 @@ | @@ -33,9 +33,9 @@ | ||
33 | float max = 0; | 33 | float max = 0; |
34 | float l_vote = 0; | 34 | float l_vote = 0; |
35 | // define local variables for the x, y, and z coordinates where the maximum happened | 35 | // define local variables for the x, y, and z coordinates where the maximum happened |
36 | - int id_x = g_v_x; | ||
37 | - int id_y = g_v_y; | ||
38 | - int id_z = g_v_z; | 36 | + float id_x = g_v_x; |
37 | + float id_y = g_v_y; | ||
38 | + float id_z = g_v_z; | ||
39 | 39 | ||
40 | int rx_sq = rx * rx; | 40 | int rx_sq = rx * rx; |
41 | int ry_sq = ry * ry; | 41 | int ry_sq = ry * ry; |