compute the local max without thresholding

Laila Saadatifard
1 parent c986ebb7
Showing 4 changed files with 133 additions and 32 deletions Show diff stats
cpp/CMakeLists.txt
cpp/local_max3.cuh
cpp/local_max3_threshold.cuh
cpp/main.cpp
@@ -58,3 +58,5 @@ target_link_libraries(ivote3
 configure_file(nissl-raw-data/nissl-float-256.256.256.vol nissl-float-256.256.256.vol COPYONLY)
 configure_file(nissl-raw-data/nissl-float-128.128.128.vol nissl-float-128.128.128.vol COPYONLY)
 configure_file(nissl-raw-data/nissl-float-64.64.64.vol nissl-float-64.64.64.vol COPYONLY)
+
+configure_file(nissl-raw-data/nissl-float-128.128.128.vol ${CMAKE_CURRENT_BINARY_DIR}/nissl-float-128.128.128_at.vol @ONLY)
 \ No newline at end of file
@@ -6,7 +6,7 @@
 #include <stim/cuda/cudatools.h>
 template<typename T>
-__global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, T t, int conn_x, int conn_y, int conn_z, int x, int y, int z){
+__global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn_y, int conn_z, int x, int y, int z){
 	//calculate x,y,z coordinates for this thread
 	int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -25,9 +25,6 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, T t, int conn_x, int
 	//initialize the max value to the vote value for this pixle.
 	T lv_i = gpu_vote[i];
-	//check the vote value is greater than threshold
-	if (lv_i < t) return;
-
 	for (int xl = xi - conn_x; xl <= xi + conn_x; xl++){
 		for (int yl = yi - conn_y; yl <= yi + conn_y; yl++){
@@ -38,12 +35,17 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, T t, int conn_x, int
 					int i_l = zl * x * y + yl * x + xl;
 					if (gpu_vote[i_l] > lv_i) return;
+					if (gpu_vote[i_l] == lv_i){
+							if( i_l > i){
+								return;
+						}
+					}
 				}
 			}
 		}
 	}
-	gpu_center[i] = 1;
+	gpu_center[i] = lv_i;
 }
 template<typename T>
@@ -55,7 +57,7 @@ void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], unsign
 	dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
 	//call the kernel to find the local max
-	cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, t, conn[0], conn[1], conn[2], x, y, z);
+	cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, conn[0], conn[1], conn[2], x, y, z);
+#ifndef STIM_CUDA_LOCAL_MAX3_THRESHOLD_H
+#define STIM_CUDA_LOCAL_MAX3_THRESHOLD_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+
+template<typename T>
+__global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, T t, int conn_x, int conn_y, int conn_z, int x, int y, int z){
+
+	//calculate x,y,z coordinates for this thread
+	int xi = blockIdx.x * blockDim.x + threadIdx.x;
+	//find the grid size along y
+	int grid_y = y / blockDim.y;
+	int blockidx_y = blockIdx.y % grid_y;
+	int yi = blockidx_y * blockDim.y + threadIdx.y;
+	int zi = blockIdx.y / grid_y;
+	int i = zi * x * y + yi * x + xi;
+	
+	if (xi>=x || yi>=y || zi>=z) return;
+
+	//initilize the center value for this pixel to zero
+	gpu_center[i] = 0;
+
+	//initialize the max value to the vote value for this pixle.
+	T lv_i = gpu_vote[i];
+	
+	//check the vote value is greater than threshold
+	if (lv_i < t) return;
+
+	for (int xl = xi - conn_x; xl <= xi + conn_x; xl++){
+
+		for (int yl = yi - conn_y; yl <= yi + conn_y; yl++){
+
+			for (int zl = zi - conn_z ; zl <= zi + conn_z; zl++){
+
+				if (xl>=0 && yl>=0 && zl>=0 && xl<x && yl<y && zl<z){
+					
+					int i_l = zl * x * y + yl * x + xl;
+					if (gpu_vote[i_l] > lv_i) return;
+					if (gpu_vote[i_l] == lv_i){
+						if( i_l > i){
+							return;
+						}
+					}
+				}
+			}
+		}
+	}
+
+	gpu_center[i] = 1;
+}
+
+template<typename T>
+void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
+
+	//find the max number of threads per block.
+	unsigned int max_threads = stim::maxThreadsPerBlock();
+	dim3 threads(sqrt (max_threads),sqrt (max_threads));
+	dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
+
+	//call the kernel to find the local max
+	cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, t, conn[0], conn[1], conn[2], x, y, z);
+
+
+
+}
+
+template<typename T>
+void cpu_local_max3(T* cpu_output, T* cpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
+	
+	//calculate the number of bytes in the data set.
+	unsigned int bytes = x * y * z * sizeof(T);
+
+	//allocate space on the gpu for the vote data and the output.
+	T* gpu_vote;
+	cudaMalloc(&gpu_vote, bytes);
+
+	T* gpu_output;
+	cudaMalloc(&gpu_output, bytes);
+
+	//copy the vote data to the gpu.
+	cudaMemcpy(gpu_vote, cpu_vote, bytes, cudaMemcpyHostToDevice);
+
+	//call the gpu version of local max function.
+	gpu_local_max3<T>(gpu_output, gpu_vote, t, conn, x ,y, z);
+
+	//copy the detected result to the cpu.
+	cudaMemcpy(cpu_output, gpu_output, bytes, cudaMemcpyDeviceToHost);
+
+	cudaFree(gpu_vote);
+	cudaFree(gpu_output);
+}
+
+#endif
 \ No newline at end of file
@@ -215,7 +215,7 @@ int main(int argc, char** argv){
 	unsigned int r[3] = { 12, rmax, rmax};
 	float std = 5;
 	float sigma[3] = { std, std, std};
-	unsigned int nlmax = 1;
+	unsigned int nlmax = 5;
 	unsigned int conn[3] = { nlmax, nlmax, nlmax};
 	float phi_deg = 25.0;
 	float phi = phi_deg * pi /180;
@@ -269,31 +269,33 @@ int main(int argc, char** argv){
 	// creat a file for saving the list centers
-		//std::ofstream list("shared2D-v8/" + OutName.str()+std::to_string(t0)+".obj");
-		//// set the number of detected cells to zero.
-		//int nod = 0;
-		//if (list.is_open()){
-
-		//		for (int iz=0; iz<z; iz++){
-		//			for (int iy=0; iy<y; iy++){
-		//				for (int ix=0; ix<x; ix++){
-
-		//					int idx = iz * x * y + iy * x + ix;
-		//					if (cpu_out[idx]==1){
-		//						nod++;
-		//						list << "v" << "\t" << ix << "\t" << iy << "\t"<< iz << '\n' ;
-		//			
-		//					}
-		//				}
-		//			}
-		//		}
-		//		list << "p" << "\t";
-		//		for (unsigned int i_nod =1 ; i_nod <=nod; i_nod++){
-		//			list << i_nod << "\t";
-		//		}
-
-		//list.close();
-		//}
+		std::ofstream list(OutName.str()+std::to_string(t0)+".obj");
+		// set the number of detected cells to zero.
+		int nod = 0;
+		if (list.is_open()){
+
+				for (int iz=0; iz<z; iz++){
+					for (int iy=0; iy<y; iy++){
+						for (int ix=0; ix<x; ix++){
+
+							int idx = iz * x * y + iy * x + ix;
+							if (cpu_out[idx]>0){
+								nod++;
+								list << "v" << "\t" << ix << "\t" << iy << "\t"<< iz << "\t" << cpu_out[idx] << '\n' ;
+					
+							}
+						}
+					}
+				}
+				list << "p" << "\t";
+				for (unsigned int i_nod =1 ; i_nod <=nod; i_nod++){
+					list << i_nod << "\t";
+				}
+
+		list.close();
+		}
+
+
 	//}
 		cudaDeviceReset();