From 6ef1dab9e7cfa8e30d297ba698ffab696b112dc7 Mon Sep 17 00:00:00 2001
From: Laila Saadatifard <saadatileila@gmail.com>
Date: Tue, 8 Dec 2015 12:53:12 -0600
Subject: [PATCH] Fix a bug in the gaussian_blur3 code; this ivote3 project works well for image data smaller than 512 by 512 by 256

---
 cpp/cudafunc.cu        | 8 +++++---
 cpp/float_to_half.cuh  | 7 ++++---
 cpp/gaussian_blur3.cuh | 6 +++---
 cpp/main.cpp           | 4 ++--
 cpp/update_dir3.cuh    | 8 ++++----
 5 files changed, 18 insertions(+), 15 deletions(-)
diff --git a/cpp/cudafunc.cu b/cpp/cudafunc.cu
index 83fc0d5..c34ade6 100644
--- a/cpp/cudafunc.cu
+++ b/cpp/cudafunc.cu
@@ -1,3 +1,6 @@
+#include "cuda_fp16.h"
+#include "float_to_half.cuh"
+#include "half_to_float.cuh"
 #include "gaussian_blur3.cuh"
 #include "gradient3.cuh"
 #include "mag3.cuh"
@@ -9,7 +12,6 @@
 void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, unsigned int r[],
 			int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
 
-
 	// compute the number of bytes in the input data
 	unsigned int bytes = x * y * z * sizeof(float);
 
@@ -68,7 +70,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un
 
 	cudaFree(gpu_grad);
 	cudaFree(gpu_mag);
-	//cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost);
+	cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost);
 
 	//allocate space on the gpu for the final detected cells.
 	float* gpu_output;
@@ -78,7 +80,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un
 	gpu_local_max3<float>(gpu_output, gpu_vote, t, conn, x, y, z);
 
 	//copy the final result to the cpu.
-	cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost);
+	//cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost);
 		
 	
 	cudaFree(gpu_vote);
diff --git a/cpp/float_to_half.cuh b/cpp/float_to_half.cuh
index ed85ef0..ed00b23 100644
--- a/cpp/float_to_half.cuh
+++ b/cpp/float_to_half.cuh
@@ -6,8 +6,8 @@
 #include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
 #include <stim/cuda/cudatools/error.h>
-#include "cuda_fp16.h"
-
+#include <cuda_fp16.h>
+#include <stdio.h>
 		__global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){
 
 			
@@ -24,7 +24,8 @@
 
 							
 			gpu_half[i] = __float2half(gpu_float[i]);
-							
+			
+			
 		}
 
 
diff --git a/cpp/gaussian_blur3.cuh b/cpp/gaussian_blur3.cuh
index f42e8d4..142ac6f 100644
--- a/cpp/gaussian_blur3.cuh
+++ b/cpp/gaussian_blur3.cuh
@@ -12,7 +12,7 @@
 
 				
 		template<typename T>
-		__global__ void blur_x(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){
+		__global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){
 
 			//calculate x,y,z coordinates for this thread
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -55,7 +55,7 @@
 
 	
 		template<typename T>
-		__global__ void blur_y(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){
+		__global__ void blur_y(T* out, T* in, T sigma, int x, int y, int z){
 			
 			//calculate x,y,z coordinates for this thread
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -98,7 +98,7 @@
 		}
 		
 		template<typename T>
-		__global__ void blur_z(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){
+		__global__ void blur_z(T* out, T* in, T sigma, int x, int y, int z){
 			
 			//calculate x,y,z coordinates for this thread
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/cpp/main.cpp b/cpp/main.cpp
index 65c74e1..cb012cc 100644
--- a/cpp/main.cpp
+++ b/cpp/main.cpp
@@ -104,7 +104,7 @@ int main(int argc, char** argv){
 	invert_data(cpuI, x, y, z);
 	
 	//write a new file from the cpuI.
-	std::ofstream original("output/0-original_invert.vol", std::ofstream::out | std::ofstream::binary);
+	std::ofstream original("output/original_invert--512.vol", std::ofstream::out | std::ofstream::binary);
 	original.write((char*)cpuI, bytes);
 	original.close();
 	
@@ -115,7 +115,7 @@ int main(int argc, char** argv){
 	ivote3(cpu_out, cpuI, sigma, phi, d_phi, r, iter, t, conn, x, y, z);
 	
 	//write the blurred file from the cpuI.
-	std::ofstream fblur("output/test0.vol", std::ofstream::out | std::ofstream::binary);
+	std::ofstream fblur("output/v1--512.vol", std::ofstream::out | std::ofstream::binary);
 	fblur.write((char*)cpuI, bytes);
 	fblur.close();
 
diff --git a/cpp/update_dir3.cuh b/cpp/update_dir3.cuh
index 47f1cc0..fc9213d 100644
--- a/cpp/update_dir3.cuh
+++ b/cpp/update_dir3.cuh
@@ -5,7 +5,7 @@
 # include <cuda.h>
 #include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
-
+#include <cuda_fp16.h>
 
 		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
 		template<typename T>
@@ -33,9 +33,9 @@
 			float max = 0;
 			float l_vote = 0;
 			// define local variables for the x, y, and z coordinations where the maximum happened
-			int id_x = g_v_x;
-			int id_y = g_v_y;
-			int id_z = g_v_z;
+			float id_x = g_v_x;
+			float id_y = g_v_y;
+			float id_z = g_v_z;
 
 			int rx_sq = rx * rx;
 			int ry_sq = ry * ry;
--
libgit2 0.21.4