Merge branch 'jiabing' into 'master'

Jiabing See merge request !39

Merge branch 'jiabing' into 'master'
Jiabing See merge request !39
David Mayerich
2 parents db331d8a 0954c632
Showing 6 changed files with 222 additions and 11 deletions Show diff stats
python/network.pyc
python/network_dpy.py
stim/iVote/ivote2.cuh
stim/iVote/ivote2/local_max.cuh
stim/image/image.h
stim/math/filters/median2.cuh
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Jan 19 2018
+
+@author: Jiabing
+"""
+
+import struct
+import numpy as np
+import scipy as sp
+import networkx as nx
+import matplotlib.pyplot as plt
+import math
+
+
+
+alkjasfdljkhfsadlkjhfsad
+class Point:
+       def __init__(self, x, y, z, radius):
+           self.x = x
+           self.y = y
+           self.z = z
+           self.r = radius
+                      
+            
+class Fiber:
+       def __init__(self, fiber_idx,point_idx):     
+           self.fidx = fiber_idx
+           self.pidx = point_idx
+           
+           
+
+class Node:
+        def __init__(self, point_idx, fidx):
+            self.pidx= point_idx
+            self.fidx = fidx
+            
+            
+#class NWT:
+    
+#class network(point, Fiber, Node):
@@ -152,13 +152,17 @@ namespace stim {
  
  
 	template<typename T>
-	void cpu_ivote2(T* cpuI, unsigned int rmax, size_t x, size_t y, bool invert = false, T t = 0, int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8, bool debug = false) {
+	void cpu_ivote2(T* cpuI, unsigned int rmax, size_t x, size_t y, float &gpu_time, bool invert = false, T t = 0, int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8, bool debug = false) {
 		size_t bytes = x*y * sizeof(T);
 		T* gpuI;						//allocate space on the gpu to save the input image
+
+		gpuTimer_start();
 		HANDLE_ERROR(cudaMalloc(&gpuI, bytes));
 		HANDLE_ERROR(cudaMemcpy(gpuI, cpuI, bytes, cudaMemcpyHostToDevice));		//copy the image to the gpu
 		stim::gpu_ivote2<T>(gpuI, rmax, x, y, invert, t, iter, phi, conn, debug);				//call the gpu version of the ivote
 		HANDLE_ERROR(cudaMemcpy(cpuI, gpuI, bytes, cudaMemcpyDeviceToHost));		//copy the output to the cpu
+
+		gpu_time = gpuTimer_end();
 	}
 }
 #endif
 \ No newline at end of file
@@ -10,7 +10,7 @@ namespace stim{
  
 		// this kernel calculates the local maximum for finding the cell centers
 		template<typename T>
-		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x, int y){
+		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x,  int y){
  
 			// calculate the 2D coordinates for this current thread.
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -20,16 +20,16 @@ namespace stim{
 				return;
  
 			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
+		    int i = yi * x + xi;
  
 			gpuCenters[i] = 0;		//initialize the value at this location to zero
  
 			T val = gpuVote[i];
  
-			for(int xl = xi - conn; xl < xi + conn; xl++){
-				for(int yl = yi - conn; yl < yi + conn; yl++){
+			for(unsigned int xl = xi - conn; xl < xi + conn; xl++){
+				for(unsigned int yl = yi - conn; yl < yi + conn; yl++){
 					if(xl >= 0 && xl < x && yl >= 0 && yl < y){
-						int il = yl * x + xl;
+						unsigned int il = yl * x + xl;
 						if(gpuVote[il] > val){							
 							return;
 							}
@@ -47,7 +47,7 @@ namespace stim{
 		}
  
 		template<typename T>
-		void gpu_local_max(T* gpuCenters, T* gpuVote,  unsigned int conn, size_t x, size_t y){
+		void gpu_local_max(T* gpuCenters, T* gpuVote,  unsigned int conn, unsigned int x, unsigned int y){
  
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			/*dim3 threads(max_threads, 1);
@@ -275,10 +275,10 @@ public:
 		int channels = cvImage.channels();
 		allocate(cols, rows, channels);			//allocate space for the image
 		size_t img_bytes = bytes();
-		unsigned char* cv_ptr = (unsigned char*)cvImage.data;
-		//if (C() == 1)														//if this is a single-color image, just copy the data
-		//	type_copy<T, T>(cv_ptr, img, size());
-		memcpy(img, cv_ptr, bytes());
+		unsigned char* cv_ptr = (unsigned char*)cvImage.data;		
+		if (C() == 1)														//if this is a single-color image, just copy the data
+			type_copy<unsigned char, T>(cv_ptr, img, size());
+		//memcpy(img, cv_ptr, bytes());
 		if(C() == 3)														//if this is a 3-color image, OpenCV uses BGR interleaving
 			from_opencv(cv_ptr, X(), Y());
 #else
+#ifndef STIM_CUDA_MEDIAN2_H
+#define STIM_CUDA_MEDIAN2_H
+
+#include <iostream>
+#include <cuda.h>
+#include <cmath>
+#include <algorithm>
+
+#ifdef USING_CUDA
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include <stim/cuda/cudatools.h>
+#endif
+
+
+//#include <thrust/sort.h>
+//#include <thrust/execution_policy.h>
+//#include <thrust/binary_search.h>
+//#include <thrust/device_ptr.h>
+
+template <typename T> 
+__device__ void cuswap ( T& a, T& b ){
+	T c(a);
+	a=b;
+	b=c;
+}
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){
+
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;			
+
+			size_t Xout = X - K + 1;
+			size_t Yout = Y - K + 1;
+			int i = yi * Xout + xi;
+
+			//return if (i,j) is outside the matrix
+			if(xi >= Xout || yi >= Yout)   return;		
+
+			//scan the image, copy data from input to 2D window
+			size_t kxi, kyi;
+			for(kyi = 0; kyi < K; kyi++){
+				for(kxi = 0; kxi < K; kxi++){
+					kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi];				      
+				}
+			}					
+					
+			//calculate kernel radius 4
+            int r = (K*K)/2;
+			
+			//sort the smallest half pixel values inside the window, calculate the middle one
+			size_t Ki = i * K * K;
+		
+			//sort the smallest half pixel values inside the window, calculate the middle one
+            for (int  p = 0; p < r+1; p++){                             
+			    for(int kk = p + 1; kk < K*K; kk++){
+                       if (kernel[Ki + kk] < kernel[Ki + p]){
+						   cuswap<T>(kernel[Ki + kk], kernel[Ki + p]);
+					   }
+                  }
+             }
+            
+			//copy the middle pixel value inside the window to output
+            out[i] = kernel[Ki + r]; 
+				     
+        }
+		 
+		
+		template<typename T>
+		void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){
+               
+			//get the maximum number of threads per block for the CUDA device
+            int threads_total = stim::maxThreadsPerBlock();
+
+			//set threads in each block
+			dim3 threads(sqrt(threads_total), sqrt(threads_total));
+
+			//calculate the number of blocks
+			dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1);
+
+			//call the kernel to perform median filter function
+		    cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K);
+		
+		}
+
+		template<typename T>
+		void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){
+
+		#ifndef USING_CUDA
+			
+			//output image width and height
+			 size_t X_out = X + K -1;
+			 size_t Y_out = Y + K -1;
+
+			 float* window = (float*)malloc(K * k *sizeof(float));
+
+			 for(int i = 0; i< Y; i++){
+ 		       for(int j =0; j< X; j++){
+
+				//Pick up window elements		
+				int k = 0; 			
+				for (int m=0; m< kernel_width; m++){
+					for(int n = 0; n < kernel_width; n++){
+						window[k] = in_image.at<float>(m+i, n+j);
+						k++;					
+					 }
+				}
+
+				//calculate kernel radius 4
+				r_ker = K * K/2;
+			    //Order elements (only half of them)
+				for(int p = 0; p<r_ker+1; p++){			
+				  //Find position of minimum element			       
+			       for (int l = p + 1; l < size_kernel; l++){
+     
+					 if(window[l] < window[p]){
+					     float t  = window[p];
+					    window[p] = window[l];
+					    window[l] = t;          }		
+		     	}
+					}
+					
+			  //Get result - the middle element	
+				cpu_out[i * X_out + j] =  window[r_ker];	
+			   }
+			 }
+		#else
+			//calculate input and out image pixels & calculate kernel size
+			size_t N_in = X * Y;									//number of pixels in the input image
+			size_t N_out = (X - K + 1) * (Y - K + 1);								//number of pixels in the output image
+            //size_t kernel_size = kernel_width*kernel_width;				//total number of pixels in the kernel
+            
+			//allocate memory on the GPU for the array
+			T* gpu_in;																	//allocate device memory for the input image
+			HANDLE_ERROR( cudaMalloc( &gpu_in, N_in * sizeof(T) ) );
+			T* gpu_kernel; 															//allocate device memory for the kernel
+			HANDLE_ERROR( cudaMalloc( &gpu_kernel, K * K * N_out * sizeof(T) ) );
+			T* gpu_out;																	//allocate device memory for the output image
+			HANDLE_ERROR( cudaMalloc( &gpu_out, N_out * sizeof(T) ) );
+
+			//copy the array to the GPU
+			HANDLE_ERROR( cudaMemcpy( gpu_in, cpu_in, N_in * sizeof(T), cudaMemcpyHostToDevice) );
+			
+			//call the GPU version of this function
+			gpu_median2<T>(gpu_in, gpu_out, gpu_kernel, X, Y, K);
+
+			//copy the array back to the CPU
+			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) );
+
+			//free allocated memory
+			cudaFree(gpu_in);
+			cudaFree(gpu_kernel);
+			cudaFree(gpu_out);
+
+		}
+		
+	}
+	#endif
+}
+
+
+#endif