upload the ivote code that works well on the small size of image ( not working f…

…or 512 cube) and float precision

upload the ivote code that works well on the small size of image ( not working f…
…or 512 cube) and float precision
Laila Saadatifard
1 parent c9aba18a
Showing 5 changed files with 242 additions and 164 deletions Show diff stats
cpp/CMakeLists.txt
cpp/cudafunc.cu
cpp/gaussian_blur3.cuh
cpp/gradient3.cuh
cpp/main.cpp
@@ -46,4 +46,5 @@ target_link_libraries(ivote3
 		  )
  
 #copy an image test case
-configure_file(data/nissl/nissl-rat.vol nissl-rat.vol COPYONLY)
+configure_file(nissl-raw-data/nissl-float-256.256.256.vol nissl-float-256.256.256.vol COPYONLY)
+configure_file(nissl-raw-data/nissl-float-128.128.128.vol nissl-float-128.128.128.vol COPYONLY)
 #include "gaussian_blur3.cuh"
 #include "gradient3.cuh"
+#include "mag3.cuh"
+#include "vote3.cuh"
+#include "update_dir3.cuh"
+#include "local_max3.cuh"
  
  
-void ivote3(float* out, float* img, float sigma, unsigned int x, unsigned int y, unsigned int z){
-	//cpu_gaussian_blur_conv3<float>(Ib, img, rhs, x, y, z);
-	
+void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, unsigned int r[],
+			int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
+
+
+	// compute the number of bytes in the input data
 	unsigned int bytes = x * y * z * sizeof(float);
+
+	//assign memory on gpu for the input data.
 	float* gpuI0;
-	cudaMalloc(&gpuI0, bytes);			
-	//copy the image data to the GPU
+	cudaMalloc(&gpuI0, bytes);	
+
+	//copy the image data to the GPU.
 	cudaMemcpy(gpuI0, img, bytes, cudaMemcpyHostToDevice);
  
+	//call the blurring function from the gpu.
 	gpu_gaussian_blur3<float>(gpuI0, sigma, x, y, z);
  
+	cudaDeviceSynchronize();
+		
+	//copy the blur data back to the cpu
+	//cudaMemcpy(img, gpuI0, bytes, cudaMemcpyDeviceToHost);
+	
+	//assign memory on the gpu for the gradient along the X, y, z.
 	float* gpu_grad;
 	cudaMalloc(&gpu_grad, bytes*3);
  
-	gpu_gradient3(gpu_grad, gpuI0, x, y, z);
+	//call the gradient function from the gpu.
+	gpu_gradient3<float>(gpu_grad, gpuI0, x, y, z);
+	cudaFree(gpuI0);
+	
+	//assign memory on the gpu for the gradient magnitude
+	float* gpu_mag;
+	cudaMalloc(&gpu_mag, bytes);
+	
+	//call the magnitude function
+	gpu_mag3<float>(gpu_mag, gpu_grad, x, y, z);
+	//cudaMemcpy(img, gpu_mag, bytes, cudaMemcpyDeviceToHost);
+	//assign memory on the gpu for the vote.
+	float* gpu_vote;
+	cudaMalloc(&gpu_vote, bytes);
  
-	//copy the image data back to the cpu
-	cudaMemcpy(img, gpuI0, x*y*z*sizeof(float), cudaMemcpyDeviceToHost);
-	cudaMemcpy(out, gpu_grad, bytes*3, cudaMemcpyDeviceToHost);
+	float cos_phi = cos(phi);
+
+	//call the vote function.
+	for (int i = 0; i < iter; i++){
+
+		gpu_vote3<float>(gpu_vote, gpu_grad, gpu_mag, cos_phi, r, x, y, z);
+		cudaDeviceSynchronize();
+		if (i==0)
+			cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
+		
+		if (phi >= d_phi){	
+			gpu_update_dir3<float>(gpu_grad, gpu_vote, cos_phi, r, x, y, z);
+			cudaDeviceSynchronize();
+			phi = phi - d_phi;
+			cos_phi = cos(phi);
+		}
+	
+	}
  
-	cudaFree(gpuI0);
 	cudaFree(gpu_grad);
+	cudaFree(gpu_mag);
+	//cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost);
+
+	//allocate space on the gpu for the final detected cells.
+	float* gpu_output;
+	cudaMalloc(&gpu_output, bytes);
+
+	//call the local max function
+	gpu_local_max3<float>(gpu_output, gpu_vote, t, conn, x, y, z);
+
+	//copy the final result to the cpu.
+	cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost);
+		
+	
+	cudaFree(gpu_vote);
+	cudaFree(gpu_output);
+	
 }
 \ No newline at end of file
@@ -6,6 +6,7 @@
 #include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
 #include <stim/cuda/cudatools/error.h>
+#include "cuda_fp16.h"
  
 #define pi	3.14159
  
@@ -13,14 +14,16 @@
 		template<typename T>
 		__global__ void blur_x(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){
  
-			
 			//calculate x,y,z coordinates for this thread
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-			int zi = blockIdx.z * blockDim.z + threadIdx.z;
+			//find the grid size along y
+			int grid_y = y / blockDim.y;
+			int blockidx_y = blockIdx.y % grid_y;
+			int yi = blockidx_y * blockDim.y + threadIdx.y;
+			int zi = blockIdx.y / grid_y;
 			int i = zi * x * y + yi * x + xi;
  
-			// calculate the kernel size
+			// calculate the kernel size			
 			int k_size = sigma * 4;
  
 			//if the current pixel is outside of the image
@@ -28,37 +31,26 @@
 				return;
  
  
-			int gx, gy, gz, gi;
+			int gx, gi;
 			T G;
 			T sum = 0;		//running weighted sum across the kernel
 			out[i] = 0;
-				
+			T sigma_sq = 2 * sigma * sigma;	
+			T a = 1.0 / (sigma * sqrt(2 * pi));
  
 			//for each element of the kernel
 			for(int ki = -k_size; ki <= k_size; ki++){
  
 				//calculate the gaussian value
-				G = (1.0 / (sigma * sqrt(2 * pi))) * exp(-(ki*ki) / (2*sigma*sigma));
-		
-
-					
+				G = a * exp(-(ki*ki) / (sigma_sq));
 				//calculate the global coordinates for this point in the kernel
-				gx = xi + ki;
-				gy = yi;
-				gz = zi;
-
-				//make sure we are inside the bounds of the image
-				if(gx >= 0 && gx < x ){
-					gi = gz * x * y + gy * x + gx;
-					//perform the integration
-					sum += G * in[gi];
-				}
-		
+				gx = (xi + ki) % x;					
+				gi = zi * x * y + yi * x + gx;
+				sum += G * in[gi];				
 			}
  
 			//output the result to global memory
 			out[i] = sum;
-			
 		}
  
  
@@ -67,11 +59,14 @@
  
 			//calculate x,y,z coordinates for this thread
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-			int zi = blockIdx.z * blockDim.z + threadIdx.z;
+			//find the grid size along y
+			int grid_y = y / blockDim.y;
+			int blockidx_y = blockIdx.y % grid_y;
+			int yi = blockidx_y * blockDim.y + threadIdx.y;
+			int zi = blockIdx.y / grid_y;
 			int i = zi * x * y + yi * x + xi;
-			
-			// calculate the kernel size
+
+			// calculate the kernel size			
 			int k_size = sigma * 4;
  
 			//if the current pixel is outside of the image
@@ -79,34 +74,25 @@
 				return;
  
  
-			int gx, gy, gz, gi;
+			int gy, gi;
 			T G;
 			T sum = 0;		//running weighted sum across the kernel
 			out[i] = 0;
-				
+			T sigma_sq = 2 * sigma * sigma;	
+			T a = 1.0 / (sigma * sqrt(2 * pi));	
  
 			//for each element of the kernel
 			for(int ki = -k_size; ki <= k_size; ki++){
  
 				//calculate the gaussian value
-				G = 1.0 / (sigma * sqrt(2 * pi)) * exp(-(ki*ki) / (2*sigma*sigma));
-		
-
-					
+				G = a * exp(-(ki*ki) / sigma_sq);
 				//calculate the global coordinates for this point in the kernel
-				gx = xi;
-				gy = yi + ki;
-				gz = zi;
-
-				//make sure we are inside the bounds of the image
-				if(gx >= 0 && gy >= 0 && gz >= 0 && gx < x && gy < y && gz < z){
-					gi = gz * x * y + gy * x + gx;
-					//perform the integration
-					sum += G * in[gi];
-				}
-		
+				gy = (yi + ki ) % y;				
+				gi = zi * x * y + gy * x + xi;
+				sum += G * in[gi];						
 			}
  
+			
 			//output the result to global memory
 			out[i] = sum;
 		}
@@ -116,11 +102,14 @@
  
 			//calculate x,y,z coordinates for this thread
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
-			int zi = blockIdx.z * blockDim.z + threadIdx.z;
+			//find the grid size along y
+			int grid_y = y / blockDim.y;
+			int blockidx_y = blockIdx.y % grid_y;
+			int yi = blockidx_y * blockDim.y + threadIdx.y;
+			int zi = blockIdx.y / grid_y;
 			int i = zi * x * y + yi * x + xi;
-			
-			// calculate the kernel size
+
+			// calculate the kernel size			
 			int k_size = sigma * 4;
  
 			//if the current pixel is outside of the image
@@ -128,32 +117,22 @@
 				return;
  
  
-			int gx, gy, gz, gi;
+			int gz, gi;
 			T G;
 			T sum = 0;		//running weighted sum across the kernel
 			out[i] = 0;
-				
+			T sigma_sq = 2 * sigma * sigma;	
+			T a = 1.0 / (sigma * sqrt(2 * pi));	
  
 			//for each element of the kernel
 			for(int ki = -k_size; ki <= k_size; ki++){
  
 				//calculate the gaussian value
-				G = 1.0 / (sigma * sqrt(2 * pi)) * exp(-(ki*ki) / (2*sigma*sigma));
-		
-
-					
+				G = a * exp(-(ki*ki) / sigma_sq);
 				//calculate the global coordinates for this point in the kernel
-				gx = xi;
-				gy = yi;
-				gz = zi + ki;
-
-				//make sure we are inside the bounds of the image
-				if(gx >= 0 && gy >= 0 && gz >= 0 && gx < x && gy < y && gz < z){
-					gi = gz * x * y + gy * x + gx;
-					//perform the integration
-					sum += G * in[gi];
-				}
-		
+				gz = (zi + ki) % z;				
+				gi = gz * x * y + yi * x + xi;					
+				sum += G * in[gi];
 			}
  
 			//output the result to global memory
@@ -161,15 +140,15 @@
 		}
  
 		template<typename T>
-		void gpu_gaussian_blur3(T* image, T sigma, unsigned int x, unsigned int y, unsigned int z){
+		void gpu_gaussian_blur3(T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){
  
 			//get the number of pixels in the image
 			unsigned int pixels = x * y * z;
 			unsigned int bytes = sizeof(T) * pixels;
  
 			int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(512, 1, 1);
-			dim3 blocks(x / threads.x + 1, y, z);
+			dim3 threads(sqrt (max_threads),sqrt (max_threads));
+			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  
 			//allocate temporary space on the GPU
 			T* gpuIb_x;
@@ -179,26 +158,25 @@
 			T* gpuIb_y;
 			cudaMalloc(&gpuIb_y, bytes);
  
-			// blur the original image in the x direction
-			blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma, x, y, z);
+			// blur the original image along the x direction
+			blur_x<T> <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z);
  
-			// blur the original image in the y direction
-			blur_y<float> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma, x, y, z);
+			// blur the x-blurred image along the y direction
+			blur_y<T> <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z);
  
-			// blur the original image in the z direction
-			blur_z<float> <<< blocks, threads >>>(image, gpuIb_y, sigma, x, y, z);
+			// blur the xy-blurred image along the z direction
+			blur_z<T> <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z);
  
 			//cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice);
  
 			cudaFree(gpuIb_x);
 			cudaFree(gpuIb_y);
-			
 		}
  
  
 		/// Applies a Gaussian blur to a 2D image stored on the CPU
 		template<typename T>
-		void cpu_gaussian_blur3(T* blur, T* image, T sigma, unsigned int x, unsigned int y, unsigned int z){
+		void cpu_gaussian_blur3(T* blur, T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){
  
  
 			//get the number of pixels in the image
@@ -7,23 +7,30 @@
 #include <stim/cuda/cudatools/error.h>
  
 template<typename T>
-__global__ void gradient3(T* out, T* in, unsigned int x, unsigned int y, unsigned int z){
+__global__ void gradient3(T* out, T* in, int x, int y, int z){
  
 	//calculate x,y,z coordinates for this thread
 	int xi = blockIdx.x * blockDim.x + threadIdx.x;
-	int yi = blockIdx.y * blockDim.y + threadIdx.y;
-	int zi = blockIdx.z * blockDim.z + threadIdx.z;
+	//find the grid size along y
+	int grid_y = y / blockDim.y;
+	int blockidx_y = blockIdx.y % grid_y;
+	int yi = blockidx_y * blockDim.y + threadIdx.y;
+	int zi = blockIdx.y / grid_y;
 	int i = zi * x * y + yi * x + xi;
  
 	//return if the pixel is outside of the image
-	if(xi >= x || yi >= y || zi>=z)
-		return;
+	if(xi >= x || yi >= y || zi>=z) return;
  
 	//calculate indices for the forward difference
 	int i_xp = zi * x * y + yi * x + (xi + 1);
 	int i_yp = zi * x * y + (yi + 1) * x + xi;
 	int i_zp = (zi + 1) * x * y + yi * x + xi;
  
+	//calculate indices for the backward difference
+	int i_xn = zi * x * y + yi * x + (xi - 1);
+	int i_yn = zi * x * y + (yi - 1) * x + xi;
+	int i_zn = (zi - 1) * x * y + yi * x + xi;
+
 	//use forward differences if a coordinate is zero
 	if(xi == 0)
 		out[i * 3 + 0] = in[i_xp] - in[i];
@@ -32,11 +39,6 @@ __global__ void gradient3(T* out, T* in, unsigned int x, unsigned int y, unsigne
 	if (zi==0)
 		out[i * 3 + 2] = in[i_zp] - in[i];
  
-	//calculate indices for the backward difference
-	int i_xn = zi * x * y + yi * x + (xi - 1);
-	int i_yn = zi * x * y + (yi - 1) * x + xi;
-	int i_zn = (zi - 1) * x * y + yi * x + xi;
-
 	//use backward differences if the coordinate is at the maximum edge
 	if(xi == x-1)
 		out[i * 3 + 0] = in[i] - in[i_xn];
@@ -48,11 +50,10 @@ __global__ void gradient3(T* out, T* in, unsigned int x, unsigned int y, unsigne
 	//otherwise use central differences
 	if(xi > 0 && xi < x-1)
 		out[i * 3 + 0] = (in[i_xp] - in[i_xn]) / 2;
-
 	if(yi > 0 && yi < y-1)
 		out[i * 3 + 1] = (in[i_yp] - in[i_yn]) / 2;
 	if(zi > 0 && zi < z-1)
-		out[i * 3 + 1] = (in[i_zp] - in[i_zn]) / 2;
+		out[i * 3 + 2] = (in[i_zp] - in[i_zn]) / 2;
  
 }
  
@@ -60,12 +61,10 @@ template&lt;typename T&gt;
  
 void gpu_gradient3(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y, unsigned int z){
  
-	//get the number of pixels in the image
-	unsigned int pixels = x * y * z;
-	
+			
 	int max_threads = stim::maxThreadsPerBlock();
-	dim3 threads(512, 1, 1);
-	dim3 blocks(x / threads.x + 1, y, z);
+	dim3 threads(sqrt (max_threads),sqrt (max_threads));
+	dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  
 	//call the GPU kernel to determine the gradient
 	gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z);
 #include <iostream>
 #include <fstream>
-//#include<stdio.h>
 #include <cuda_runtime.h>
 #include <stim/math/vector.h>
 #include <stim/parser/arguments.h>
@@ -8,85 +7,126 @@
 #include <stim/grids/image_stack.h>
 #include <stim/grids/grid.h>
 #include <stim/visualization/colormap.h>
-
+#include <stim/image/image.h>
 #define pi	3.14159
  
  
-void ivote3(float* out, float* img, float std, unsigned int x, unsigned int y, unsigned int z);
-
-int main(){
-
-	float sigma =3;
-
-	std::string filename = "nissl-float-512.512.512.vol";
-	unsigned int x = 512;
-	unsigned int y = 512;
-	unsigned int z = 512;
+void ivote3(float* center, float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], 
+			unsigned int x, unsigned int y, unsigned int z);
+
+void invert_data(float* cpuI, unsigned int x, unsigned int y, unsigned int z){
+		for(int ix = 0; ix < x; ix++){
+			for (int iy = 0; iy < y; iy++){
+				for (int iz = 0; iz < z; iz++){
+					int idx = iz * x * y + iy * x + ix;
+					cpuI[idx] = 255 - cpuI[idx];
+				}
+			}
+		}
+	}
+
+int main(int argc, char** argv){
+
+	//output advertisement
+	std::cout<<std::endl<<std::endl;
+	std::cout<<"========================================================================="<<std::endl;
+	std::cout<<"Thank you for using the ivote3 segmentation tool!"<<std::endl;
+	std::cout<<"Scalable Tissue Imaging and Modeling (STIM) Lab, University of Houston"<<std::endl;
+	std::cout<<"Developers: Laila Saadatifard and David Mayerich"<<std::endl;
+	std::cout<<"Source: https://git.stim.ee.uh.edu/segmentation/ivote3"<<std::endl;
+	std::cout<<"========================================================================="<<std::endl<<std::endl;
+
+	stim::arglist args;
+
+#ifdef _WIN32
+	args.set_ansi(false);
+#endif
+
+	//add arduments
+	args.add("help", "prints this help");
+	args.add("x", "size of the dataset along X axis", "positive value");
+	args.add("y", "size of the dataset along Y axis", "positive value");
+	args.add("z", "size of the dataset along Z axis", "positive value");
+	args.add("t", "threshold value for the final result", "positive valu");
+	//parse the command line arguments.
+	args.parse(argc, argv);
+
+	//display the help text if requested
+	if(args["help"].is_set()){				
+		std::cout<<std::endl<<"usage: ivote input_image output_list --option [A B C ...]"<<std::endl;
+		std::cout<<std::endl<<std::endl
+				  << "examples: ivote blue.bmp list.txt "<<std::endl;
+		
+		std::cout<<std::endl<<std::endl;
+		std::cout<<args.str()<<std::endl;
+		exit(1);
+	}
+
+	//if the input and output files aren't specified, throw an error and exit
+	if(args.nargs() < 2){
+		std::cout<<"ERROR: two files must be specified for segmentation, enter ivote --help for options."<<std::endl<<std::endl;
+		exit(1);
+	}
+
+	//get the input image file
+	stim::filename Ifilename(args.arg(0));
+
+	//get the output file name
+	stim::filename OutName(args.arg(1));
+
+	//set the x, y, z.
+	int x = args["x"].as_int();
+	int y = args["y"].as_int();
+	int z = args["z"].as_int();
+
+	//set the threshold.
+	float t = args["t"].as_float();
+
+	unsigned int r[3] = { 9, 9, 5};
+	float sigma[3] = { 3, 3, 1.5};
+	unsigned int conn[3] = { 5, 5, 3};
+	float phi_deg = 20.1;
+	float phi = phi_deg * pi /180;
+	int iter = 2;
+	float d_phi = phi/(iter -1);
+	
+	std::string filename = Ifilename.str();
 	unsigned int bytes = x*y*z*sizeof(float);
  
+	//allocate space on the cpu for the input data
 	float* cpuI = (float*) malloc(bytes);
  
+	//load the input file into the cpuI
 	std::ifstream nissl(filename, std::ios::in | std::ios::binary);
 	nissl.read((char*)cpuI, bytes);
 	nissl.close();
  
-
-	std::ofstream original("original.vol", std::ofstream::out | std::ofstream::binary);
+	invert_data(cpuI, x, y, z);
+	
+	//write a new file from the cpuI.
+	std::ofstream original("output/0-original_invert.vol", std::ofstream::out | std::ofstream::binary);
 	original.write((char*)cpuI, bytes);
 	original.close();
-
-	float* cpu_grad = (float*) malloc(bytes*3);
-
-	ivote3(cpu_grad, cpuI, sigma, x, y, z);
-
-	std::ofstream blur("blur.vol", std::ofstream::out | std::ofstream::binary);
-	blur.write((char*)cpuI, bytes);
-	blur.close();
-
-
  
-
-	// load the imagestack from the given directory
-	//std::string file_mask = "D:\\source\\ivote3\\data\\nissl\\*.png";
-	/*stim::image_stack<float> I;
-	unsigned int x =512;
-	unsigned int y =512;
-	unsigned int z =512;
-	unsigned int bytes = x*y*z* sizeof(float);
+	//allocate space on the cpu for the output result
+	float* cpu_out = (float*) malloc(bytes);
  
-	I.read("nissl-float-512.512.512.vol", 512, 512, 512, 1, 0);
-	//float test = I.get(128, 128, 128);
-	//std::cout<< test<<std::endl;
-	float* ptr0 = I.data();
-
-	//stim::cpu2image<float>(I.data(), "after_loading.bmp", 512, 512);
-		
-	float* cpuI = (float*) malloc(bytes);	
+	// call the ivote function
+	ivote3(cpu_out, cpuI, sigma, phi, d_phi, r, iter, t, conn, x, y, z);
  
-	memcpy (cpuI, ptr0, bytes);
-	 
-
-	std::ofstream infile ("original-2.vol");
-	infile.write((char*)cpuI, bytes);
-	infile.close();*/
-	//stim::cpu2image(cpuI, "00-cpuI.bmp", 512, 1024);
+	//write the blurred file from the cpuI.
+	std::ofstream fblur("output/test0.vol", std::ofstream::out | std::ofstream::binary);
+	fblur.write((char*)cpuI, bytes);
+	fblur.close();
+
+	//write the output file.
+	std::ofstream fo("output/" + OutName.str(), std::ofstream::out | std::ofstream::binary);
+	fo.write((char*)cpu_out, bytes);
+	fo.close();
  
  
-	//std::fstream file2;
-	//file2.write((char*)cpuI, 512*512*512*sizeof(unsigned char));
-	//I.write("data\\volume1.raw");
-	//test = ptr0[128*128*128];
-	//std::cout<< test<<std::endl;
  
-	//memset(I.data(), 0, 512*512*512);
-	//I.write("data\\volume.raw");
-	//I.save_images("data\\????.bmp");
+		cudaDeviceReset();       
  
-
-
-
-	
-
-		cudaDeviceReset();           
 }