Commit 6ef1dab9e7cfa8e30d297ba698ffab696b112dc7

Authored by Laila Saadatifard
1 parent 5c079506

Fix one bug in the gaussian_blur3 code; this ivote3 project works well for data image sizes less than 512 by 512 by 256.
  1 +#include "cuda_fp16.h"
  2 +#include "float_to_half.cuh"
  3 +#include "half_to_float.cuh"
1 #include "gaussian_blur3.cuh" 4 #include "gaussian_blur3.cuh"
2 #include "gradient3.cuh" 5 #include "gradient3.cuh"
3 #include "mag3.cuh" 6 #include "mag3.cuh"
@@ -9,7 +12,6 @@ @@ -9,7 +12,6 @@
9 void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, unsigned int r[], 12 void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, unsigned int r[],
10 int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ 13 int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
11 14
12 -  
13 // compute the number of bytes in the input data 15 // compute the number of bytes in the input data
14 unsigned int bytes = x * y * z * sizeof(float); 16 unsigned int bytes = x * y * z * sizeof(float);
15 17
@@ -68,7 +70,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un @@ -68,7 +70,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un
68 70
69 cudaFree(gpu_grad); 71 cudaFree(gpu_grad);
70 cudaFree(gpu_mag); 72 cudaFree(gpu_mag);
71 - //cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost); 73 + cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost);
72 74
73 //allocate space on the gpu for the final detected cells. 75 //allocate space on the gpu for the final detected cells.
74 float* gpu_output; 76 float* gpu_output;
@@ -78,7 +80,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un @@ -78,7 +80,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un
78 gpu_local_max3<float>(gpu_output, gpu_vote, t, conn, x, y, z); 80 gpu_local_max3<float>(gpu_output, gpu_vote, t, conn, x, y, z);
79 81
80 //copy the final result to the cpu. 82 //copy the final result to the cpu.
81 - cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost); 83 + //cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost);
82 84
83 85
84 cudaFree(gpu_vote); 86 cudaFree(gpu_vote);
cpp/float_to_half.cuh
@@ -6,8 +6,8 @@ @@ -6,8 +6,8 @@
6 #include <stim/cuda/cudatools.h> 6 #include <stim/cuda/cudatools.h>
7 #include <stim/cuda/sharedmem.cuh> 7 #include <stim/cuda/sharedmem.cuh>
8 #include <stim/cuda/cudatools/error.h> 8 #include <stim/cuda/cudatools/error.h>
9 -#include "cuda_fp16.h"  
10 - 9 +#include <cuda_fp16.h>
  10 +#include <stdio.h>
11 __global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){ 11 __global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){
12 12
13 13
@@ -24,7 +24,8 @@ @@ -24,7 +24,8 @@
24 24
25 25
26 gpu_half[i] = __float2half(gpu_float[i]); 26 gpu_half[i] = __float2half(gpu_float[i]);
27 - 27 +
  28 +
28 } 29 }
29 30
30 31
cpp/gaussian_blur3.cuh
@@ -12,7 +12,7 @@ @@ -12,7 +12,7 @@
12 12
13 13
14 template<typename T> 14 template<typename T>
15 - __global__ void blur_x(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){ 15 + __global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){
16 16
17 //calculate x,y,z coordinates for this thread 17 //calculate x,y,z coordinates for this thread
18 int xi = blockIdx.x * blockDim.x + threadIdx.x; 18 int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -55,7 +55,7 @@ @@ -55,7 +55,7 @@
55 55
56 56
57 template<typename T> 57 template<typename T>
58 - __global__ void blur_y(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){ 58 + __global__ void blur_y(T* out, T* in, T sigma, int x, int y, int z){
59 59
60 //calculate x,y,z coordinates for this thread 60 //calculate x,y,z coordinates for this thread
61 int xi = blockIdx.x * blockDim.x + threadIdx.x; 61 int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -98,7 +98,7 @@ @@ -98,7 +98,7 @@
98 } 98 }
99 99
100 template<typename T> 100 template<typename T>
101 - __global__ void blur_z(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){ 101 + __global__ void blur_z(T* out, T* in, T sigma, int x, int y, int z){
102 102
103 //calculate x,y,z coordinates for this thread 103 //calculate x,y,z coordinates for this thread
104 int xi = blockIdx.x * blockDim.x + threadIdx.x; 104 int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -104,7 +104,7 @@ int main(int argc, char** argv){ @@ -104,7 +104,7 @@ int main(int argc, char** argv){
104 invert_data(cpuI, x, y, z); 104 invert_data(cpuI, x, y, z);
105 105
106 //write a new file from the cpuI. 106 //write a new file from the cpuI.
107 - std::ofstream original("output/0-original_invert.vol", std::ofstream::out | std::ofstream::binary); 107 + std::ofstream original("output/original_invert--512.vol", std::ofstream::out | std::ofstream::binary);
108 original.write((char*)cpuI, bytes); 108 original.write((char*)cpuI, bytes);
109 original.close(); 109 original.close();
110 110
@@ -115,7 +115,7 @@ int main(int argc, char** argv){ @@ -115,7 +115,7 @@ int main(int argc, char** argv){
115 ivote3(cpu_out, cpuI, sigma, phi, d_phi, r, iter, t, conn, x, y, z); 115 ivote3(cpu_out, cpuI, sigma, phi, d_phi, r, iter, t, conn, x, y, z);
116 116
117 //write the blurred file from the cpuI. 117 //write the blurred file from the cpuI.
118 - std::ofstream fblur("output/test0.vol", std::ofstream::out | std::ofstream::binary); 118 + std::ofstream fblur("output/v1--512.vol", std::ofstream::out | std::ofstream::binary);
119 fblur.write((char*)cpuI, bytes); 119 fblur.write((char*)cpuI, bytes);
120 fblur.close(); 120 fblur.close();
121 121
cpp/update_dir3.cuh
@@ -5,7 +5,7 @@ @@ -5,7 +5,7 @@
5 # include <cuda.h> 5 # include <cuda.h>
6 #include <stim/cuda/cudatools.h> 6 #include <stim/cuda/cudatools.h>
7 #include <stim/cuda/sharedmem.cuh> 7 #include <stim/cuda/sharedmem.cuh>
8 - 8 +#include <cuda_fp16.h>
9 9
10 // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. 10 // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
11 template<typename T> 11 template<typename T>
@@ -33,9 +33,9 @@ @@ -33,9 +33,9 @@
33 float max = 0; 33 float max = 0;
34 float l_vote = 0; 34 float l_vote = 0;
35 // define local variables for the x, y, and z coordinations where the maximum happened 35 // define local variables for the x, y, and z coordinations where the maximum happened
36 - int id_x = g_v_x;  
37 - int id_y = g_v_y;  
38 - int id_z = g_v_z; 36 + float id_x = g_v_x;
  37 + float id_y = g_v_y;
  38 + float id_z = g_v_z;
39 39
40 int rx_sq = rx * rx; 40 int rx_sq = rx * rx;
41 int ry_sq = ry * ry; 41 int ry_sq = ry * ry;