From 0d84034253937b68843ec545a0050dd6abaccf5c Mon Sep 17 00:00:00 2001
From: David Mayerich <mayerich@uh.edu>
Date: Mon, 16 Dec 2019 13:50:34 -0600
Subject: [PATCH] public release commit

---
 CMakeLists.txt        | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 FindGLEW.cmake        | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 FindGLUT.cmake        | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 FindLAPACKE.cmake     |  43 +++++++++++++++++++++++++++++++++++++++++++
 FindSTIM.cmake        |  22 ++++++++++++++++++++++
 src/basic_functions.h |  85 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/enviload.h        | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/ga_gpu.cu         | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/ga_gpu.h          | 649 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.cpp          | 556 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/timer.h           |  37 +++++++++++++++++++++++++++++++++++++
 11 files changed, 2164 insertions(+), 0 deletions(-)
 create mode 100644 CMakeLists.txt
 create mode 100644 FindGLEW.cmake
 create mode 100644 FindGLUT.cmake
 create mode 100644 FindLAPACKE.cmake
 create mode 100644 FindSTIM.cmake
 create mode 100644 src/basic_functions.h
 create mode 100644 src/enviload.h
 create mode 100644 src/ga_gpu.cu
 create mode 100644 src/ga_gpu.h
 create mode 100644 src/main.cpp
 create mode 100644 src/timer.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..966967a
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,145 @@
+#Specify the version being used aswell as the language
+cmake_minimum_required(VERSION 2.8)
+
+#Name your project here
+project(ga-gpu)
+
+#set the module directory
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}")
+
+#default to release mode
+if(NOT CMAKE_BUILD_TYPE) 
+	set(CMAKE_BUILD_TYPE Release)
+endif(NOT CMAKE_BUILD_TYPE)
+
+#build the executable in the binary directory on MS Visual Studio
+if ( MSVC )
+	SET( CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG "${OUTPUT_DIRECTORY}")
+	SET( CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE "${OUTPUT_DIRECTORY}")
+endif ( MSVC )
+#MAYBE REMOVE-----------------
+#set C++11 flags if using GCC
+if( CMAKE_COMPILER_IS_GNUCC )
+#	SET( CMAKE_CXX_FLAGS "-std=c++11")
+	set(CMAKE_CXX_FLAGS "-std=c++11 -D_FORCE_INLINES")
+#	SET( CUDA_NVCC_FLAGS "-std=c++11")
+endif( CMAKE_COMPILER_IS_GNUCC )
+
+SET( CUDA_NVCC_FLAGS "--gpu-architecture=compute_50 --gpu-code=sm_50,compute_50")
+#-----------------------------
+
+
+
+#find packages-----------------------------------
+#find OpenCV
+find_package(OpenCV REQUIRED)
+add_definitions(-DUSING_OPENCV)
+
+#find the pthreads package
+find_package(Threads)
+
+#find the X11 package
+find_package(X11)
+
+#find the STIM library
+find_package(STIM)
+
+#find CUDA, mostly for LA stuff using cuBLAS
+find_package(CUDA REQUIRED)
+
+#find Boost for Unix-based file lists
+if( CMAKE_COMPILER_IS_GNUCC )
+	find_package(Boost COMPONENTS filesystem system)
+	if(Boost_FOUND)
+		include_directories(${Boost_INCLUDE_DIR})
+	else()
+		message(FATAL_ERROR "HSIproc requires Boost::filesystem and Boost::system when using GCC")
+	endif()
+endif()
+
+#find FANN
+#find_package(FANN REQUIRED)
+
+#find the GLUT library for visualization
+#find_package(OpenGL REQUIRED)
+#find_package(GLUT REQUIRED)
+#if(WIN32)
+#	find_package(GLEW REQUIRED)
+#	include_directories(${GLEW_INCLUDE_DIR})
+#endif(WIN32)
+
+#find LAPACK and supporting link_libraries
+find_package(LAPACKE REQUIRED)
+
+#include include directories
+include_directories(${CUDA_INCLUDE_DIRS}
+					${OpenCV_INCLUDE_DIRS}
+					${LAPACKE_INCLUDE_DIR}
+					${STIM_INCLUDE_DIRS}
+					${OpenGL_INCLUDE_DIRS}
+#					${GLUT_INCLUDE_DIR}
+					${FANN_INCLUDE_DIRS}
+					"${CMAKE_SOURCE_DIR}/src"
+)
+
+#Assign a variable for all of the header files in this project
+include_directories("${CMAKE_SOURCE_DIR}/src")
+#file(GLOB GACPU_H "${CMAKE_SOURCE_DIR}/src/gacpu/*.h")
+file(GLOB GAGPU_H "${CMAKE_SOURCE_DIR}/src/*.h")
+#file(GLOB GA_H "${CMAKE_SOURCE_DIR}/src/*.h")
+
+#Assign source files to the appropriate variables to easily associate them with executables
+#file(GLOB GA_CPU_SRC "${CMAKE_SOURCE_DIR}/src/gacpu/*.cpp")
+file(GLOB GA_GPU_SRC "${CMAKE_SOURCE_DIR}/src/*.c*")
+
+
+#create an executable file
+cuda_add_executable(ga-gpu
+		${GAGPU_H}
+#		${GA_H}
+		${GA_GPU_SRC}
+)
+target_link_libraries(ga-gpu ${CUDA_LIBRARIES}
+						 ${CUDA_CUBLAS_LIBRARIES}
+						 ${CUDA_CUFFT_LIBRARIES}
+						 ${LAPACKE_LIBRARIES}
+						 ${LAPACK_LIBRARIES}
+						 ${BLAS_LIBRARIES}
+						 ${CMAKE_THREAD_LIBS_INIT}
+						 ${X11_LIBRARIES}
+						 ${OpenCV_LIBS}
+)
+
+
+#create the PROC executable----------------------------------------------
+
+#create an executable file
+#add_executable(hsiga
+#		${GACPU_H}
+#		${GA_H}
+#		${GA_CPU_SRC}
+#)
+#target_link_libraries(hsiga ${LAPACKE_LIBRARIES}
+#						 ${LAPACK_LIBRARIES}
+#						 ${BLAS_LIBRARIES}
+#						 ${CMAKE_THREAD_LIBS_INIT}
+#						 ${X11_LIBRARIES}
+#						 ${OpenCV_LIBS}
+#)
+
+
+
+#if Boost is found, set an environment variable to use with preprocessor directives
+if(Boost_FILESYSTEM_FOUND)
+#	if(BUILD_GACPU)	
+#		target_link_libraries(hsiga ${Boost_FILESYSTEM_LIBRARIES}
+#					      ${Boost_SYSTEM_LIBRARY}
+#		)
+		#message(${Boost_FILESYSTEM_LIBRARIES})
+#	endif(BUILD_GACPU)
+#	if(BUILD_GAGPU)
+	target_link_libraries(ga-gpu ${Boost_FILESYSTEM_LIBRARIES}
+					 ${Boost_SYSTEM_LIBRARY}
+	)
+#	endif(BUILD_GAGPU)	
+endif(Boost_FILESYSTEM_FOUND)
diff --git a/FindGLEW.cmake b/FindGLEW.cmake
new file mode 100644
index 0000000..0b72457
--- /dev/null
+++ b/FindGLEW.cmake
@@ -0,0 +1,126 @@
+# Copyright (c) 2012-2016 DreamWorks Animation LLC
+#
+# All rights reserved. This software is distributed under the
+# Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+#
+# Redistributions of source code must retain the above copyright
+# and license notice and the following restrictions and disclaimer.
+#
+# *     Neither the name of DreamWorks Animation nor the names of
+# its contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+# LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+#
+
+#-*-cmake-*-
+# - Find GLEW
+#
+# Author : Nicholas Yue yue.nicholas@gmail.com
+#
+# This auxiliary CMake file helps in find the GLEW headers and libraries
+#
+# GLEW_FOUND            set if Glew is found.
+# GLEW_INCLUDE_DIR      GLEW's include directory
+# GLEW_glew_LIBRARY        GLEW libraries
+# GLEW_glewmx_LIBRARY      GLEWmx libraries (Mulitple Rendering Context)
+
+FIND_PACKAGE ( PackageHandleStandardArgs )
+
+FIND_PATH( GLEW_LOCATION include/GL/glew.h
+  "$ENV{GLEW_ROOT}"
+  NO_DEFAULT_PATH
+  NO_SYSTEM_ENVIRONMENT_PATH
+  )
+
+FIND_PACKAGE_HANDLE_STANDARD_ARGS ( GLEW
+  REQUIRED_VARS GLEW_LOCATION
+  )
+
+IF ( GLEW_LOCATION )
+
+  SET( GLEW_INCLUDE_DIR "${GLEW_LOCATION}/include" CACHE STRING "GLEW include path")
+
+  SET ( ORIGINAL_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  IF (GLEW_USE_STATIC_LIBS)
+	IF (APPLE)
+      SET(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
+      FIND_LIBRARY ( GLEW_LIBRARY_PATH GLEW PATHS ${GLEW_LOCATION}/lib
+		NO_DEFAULT_PATH
+		NO_SYSTEM_ENVIRONMENT_PATH
+		)
+      FIND_LIBRARY ( GLEWmx_LIBRARY_PATH GLEWmx PATHS ${GLEW_LOCATION}/lib
+		NO_DEFAULT_PATH
+		NO_SYSTEM_ENVIRONMENT_PATH
+		)
+	  # MESSAGE ( "APPLE STATIC" )
+	  # MESSAGE ( "GLEW_LIBRARY_PATH = " ${GLEW_LIBRARY_PATH} )
+	ELSEIF (WIN32)
+      # Link library
+      SET(CMAKE_FIND_LIBRARY_SUFFIXES ".lib")
+      FIND_LIBRARY ( GLEW_LIBRARY_PATH GLEW32S PATHS ${GLEW_LOCATION}/lib )
+      FIND_LIBRARY ( GLEWmx_LIBRARY_PATH GLEW32MXS PATHS ${GLEW_LOCATION}/lib )
+	ELSE (APPLE)
+      SET(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
+      FIND_LIBRARY ( GLEW_LIBRARY_PATH GLEW PATHS ${GLEW_LOCATION}/lib
+		NO_DEFAULT_PATH
+		NO_SYSTEM_ENVIRONMENT_PATH
+		)
+      FIND_LIBRARY ( GLEWmx_LIBRARY_PATH GLEWmx PATHS ${GLEW_LOCATION}/lib
+		NO_DEFAULT_PATH
+		NO_SYSTEM_ENVIRONMENT_PATH
+		)
+	  # MESSAGE ( "LINUX STATIC" )
+	  # MESSAGE ( "GLEW_LIBRARY_PATH = " ${GLEW_LIBRARY_PATH} )
+	ENDIF (APPLE)
+  ELSE ()
+	IF (APPLE)
+      SET(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib")
+      FIND_LIBRARY ( GLEW_LIBRARY_PATH GLEW PATHS ${GLEW_LOCATION}/lib )
+      FIND_LIBRARY ( GLEWmx_LIBRARY_PATH GLEWmx PATHS ${GLEW_LOCATION}/lib )
+	ELSEIF (WIN32)
+      # Link library
+      SET(CMAKE_FIND_LIBRARY_SUFFIXES ".lib")
+      FIND_LIBRARY ( GLEW_LIBRARY_PATH GLEW32 PATHS ${GLEW_LOCATION}/lib )
+      FIND_LIBRARY ( GLEWmx_LIBRARY_PATH GLEW32mx PATHS ${GLEW_LOCATION}/lib )
+      # Load library
+      SET(CMAKE_FIND_LIBRARY_SUFFIXES ".dll")
+      FIND_LIBRARY ( GLEW_DLL_PATH GLEW32 PATHS ${GLEW_LOCATION}/bin
+		NO_DEFAULT_PATH
+		NO_SYSTEM_ENVIRONMENT_PATH
+		)
+      FIND_LIBRARY ( GLEWmx_DLL_PATH GLEW32mx PATHS ${GLEW_LOCATION}/bin
+		NO_DEFAULT_PATH
+		NO_SYSTEM_ENVIRONMENT_PATH
+		)
+	ELSE (APPLE)
+	  # Unices
+      FIND_LIBRARY ( GLEW_LIBRARY_PATH GLEW PATHS ${GLEW_LOCATION}/lib
+		NO_DEFAULT_PATH
+		NO_SYSTEM_ENVIRONMENT_PATH
+		)
+      FIND_LIBRARY ( GLEWmx_LIBRARY_PATH GLEWmx PATHS ${GLEW_LOCATION}/lib
+		NO_DEFAULT_PATH
+		NO_SYSTEM_ENVIRONMENT_PATH
+		)
+	ENDIF (APPLE)
+  ENDIF ()
+  # MUST reset
+  SET(CMAKE_FIND_LIBRARY_SUFFIXES ${ORIGINAL_CMAKE_FIND_LIBRARY_SUFFIXES})
+
+  SET( GLEW_GLEW_LIBRARY ${GLEW_LIBRARY_PATH} CACHE STRING "GLEW library")
+  SET( GLEW_GLEWmx_LIBRARY ${GLEWmx_LIBRARY_PATH} CACHE STRING "GLEWmx library")
+
+ENDIF ()
diff --git a/FindGLUT.cmake b/FindGLUT.cmake
new file mode 100644
index 0000000..25b2cbb
--- /dev/null
+++ b/FindGLUT.cmake
@@ -0,0 +1,186 @@
+#.rst:
+# FindGLUT
+# --------
+#
+# try to find glut library and include files.
+#
+# IMPORTED Targets
+# ^^^^^^^^^^^^^^^^
+#
+# This module defines the :prop_tgt:`IMPORTED` targets:
+#
+# ``GLUT::GLUT``
+#  Defined if the system has GLUT.
+#
+# Result Variables
+# ^^^^^^^^^^^^^^^^
+#
+# This module sets the following variables:
+#
+# ::
+#
+#   GLUT_INCLUDE_DIR, where to find GL/glut.h, etc.
+#   GLUT_LIBRARIES, the libraries to link against
+#   GLUT_FOUND, If false, do not try to use GLUT.
+#
+# Also defined, but not for general use are:
+#
+# ::
+#
+#   GLUT_glut_LIBRARY = the full path to the glut library.
+#   GLUT_Xmu_LIBRARY  = the full path to the Xmu library.
+#   GLUT_Xi_LIBRARY   = the full path to the Xi Library.
+
+#=============================================================================
+# Copyright 2001-2009 Kitware, Inc.
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+if (WIN32)
+  find_path( GLUT_INCLUDE_DIR NAMES GL/glut.h
+  PATHS  $ENV{GLUT_ROOT_PATH}/include )
+
+  if( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+    find_library( GLUT_glut_LIBRARY NAMES freeglut
+      PATHS
+      $ENV{GLUT_ROOT_PATH}/lib/x64
+
+      NO_DEFAULT_PATH
+    )
+  else( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+    find_library( GLUT_glut_LIBRARY NAMES glut glut32 freeglut
+      PATHS
+      ${OPENGL_LIBRARY_DIR}
+      $ENV{GLUT_ROOT_PATH}/lib
+    )
+  endif( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+
+else ()
+
+  if (APPLE)
+    find_path(GLUT_INCLUDE_DIR glut.h ${OPENGL_LIBRARY_DIR})
+    find_library(GLUT_glut_LIBRARY GLUT DOC "GLUT library for OSX")
+    find_library(GLUT_cocoa_LIBRARY Cocoa DOC "Cocoa framework for OSX")
+
+    if(GLUT_cocoa_LIBRARY AND NOT TARGET GLUT::Cocoa)
+      add_library(GLUT::Cocoa UNKNOWN IMPORTED)
+      # Cocoa should always be a Framework, but we check to make sure.
+      if(GLUT_cocoa_LIBRARY MATCHES "/([^/]+)\\.framework$")
+        set_target_properties(GLUT::Cocoa PROPERTIES
+          IMPORTED_LOCATION "${GLUT_cocoa_LIBRARY}/${CMAKE_MATCH_1}")
+      else()
+        set_target_properties(GLUT::Cocoa PROPERTIES
+          IMPORTED_LOCATION "${GLUT_cocoa_LIBRARY}")
+      endif()
+    endif()
+  else ()
+
+    if (BEOS)
+
+      set(_GLUT_INC_DIR /boot/develop/headers/os/opengl)
+      set(_GLUT_glut_LIB_DIR /boot/develop/lib/x86)
+
+    else()
+
+      find_library( GLUT_Xi_LIBRARY Xi
+        /usr/openwin/lib
+        )
+
+      find_library( GLUT_Xmu_LIBRARY Xmu
+        /usr/openwin/lib
+        )
+
+      if(GLUT_Xi_LIBRARY AND NOT TARGET GLUT::Xi)
+        add_library(GLUT::Xi UNKNOWN IMPORTED)
+        set_target_properties(GLUT::Xi PROPERTIES
+          IMPORTED_LOCATION "${GLUT_Xi_LIBRARY}")
+      endif()
+
+      if(GLUT_Xmu_LIBRARY AND NOT TARGET GLUT::Xmu)
+        add_library(GLUT::Xmu UNKNOWN IMPORTED)
+        set_target_properties(GLUT::Xmu PROPERTIES
+          IMPORTED_LOCATION "${GLUT_Xmu_LIBRARY}")
+      endif()
+
+    endif ()
+
+    find_path( GLUT_INCLUDE_DIR GL/glut.h
+      /usr/include/GL
+      /usr/openwin/share/include
+      /usr/openwin/include
+      /opt/graphics/OpenGL/include
+      /opt/graphics/OpenGL/contrib/libglut
+      ${_GLUT_INC_DIR}
+      )
+
+    find_library( GLUT_glut_LIBRARY glut
+      /usr/openwin/lib
+      ${_GLUT_glut_LIB_DIR}
+      )
+
+    unset(_GLUT_INC_DIR)
+    unset(_GLUT_glut_LIB_DIR)
+
+  endif ()
+
+endif ()
+
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(GLUT REQUIRED_VARS GLUT_glut_LIBRARY GLUT_INCLUDE_DIR)
+
+if (GLUT_FOUND)
+  # Is -lXi and -lXmu required on all platforms that have it?
+  # If not, we need some way to figure out what platform we are on.
+  set( GLUT_LIBRARIES
+    ${GLUT_glut_LIBRARY}
+    ${GLUT_Xmu_LIBRARY}
+    ${GLUT_Xi_LIBRARY}
+    ${GLUT_cocoa_LIBRARY}
+    )
+
+  if(NOT TARGET GLUT::GLUT)
+    add_library(GLUT::GLUT UNKNOWN IMPORTED)
+    set_target_properties(GLUT::GLUT PROPERTIES
+      INTERFACE_INCLUDE_DIRECTORIES "${GLUT_INCLUDE_DIR}")
+    if(GLUT_glut_LIBRARY MATCHES "/([^/]+)\\.framework$")
+      set_target_properties(GLUT::GLUT PROPERTIES
+        IMPORTED_LOCATION "${GLUT_glut_LIBRARY}/${CMAKE_MATCH_1}")
+    else()
+      set_target_properties(GLUT::GLUT PROPERTIES
+        IMPORTED_LOCATION "${GLUT_glut_LIBRARY}")
+    endif()
+
+    if(TARGET GLUT::Xmu)
+      set_property(TARGET GLUT::GLUT APPEND
+        PROPERTY INTERFACE_LINK_LIBRARIES GLUT::Xmu)
+    endif()
+
+    if(TARGET GLUT::Xi)
+      set_property(TARGET GLUT::GLUT APPEND
+        PROPERTY INTERFACE_LINK_LIBRARIES GLUT::Xi)
+    endif()
+
+    if(TARGET GLUT::Cocoa)
+      set_property(TARGET GLUT::GLUT APPEND
+        PROPERTY INTERFACE_LINK_LIBRARIES GLUT::Cocoa)
+    endif()
+  endif()
+
+  #The following deprecated settings are for backwards compatibility with CMake1.4
+  set (GLUT_LIBRARY ${GLUT_LIBRARIES})
+  set (GLUT_INCLUDE_PATH ${GLUT_INCLUDE_DIR})
+endif()
+
+mark_as_advanced(
+  GLUT_INCLUDE_DIR
+  GLUT_glut_LIBRARY
+  GLUT_Xmu_LIBRARY
+  GLUT_Xi_LIBRARY
+  )
diff --git a/FindLAPACKE.cmake b/FindLAPACKE.cmake
new file mode 100644
index 0000000..a94f1c2
--- /dev/null
+++ b/FindLAPACKE.cmake
@@ -0,0 +1,43 @@
+# - Try to find LAPACKE
+#
+# Once done this will define
+#  LAPACKE_FOUND - System has LAPACKE
+#  LAPACKE_INCLUDE_DIRS - The LAPACKE include directories
+#  LAPACKE_LIBRARIES - The libraries needed to use LAPACKE
+#  LAPACKE_DEFINITIONS - Compiler switches required for using LAPACKE
+#
+# Usually, LAPACKE requires LAPACK and the BLAS.  This module does
+# not enforce anything about that.
+
+find_path(LAPACKE_INCLUDE_DIR
+          NAMES lapacke.h
+          PATHS $ENV{LAPACK_PATH} ${INCLUDE_INSTALL_DIR}
+          PATHS ENV INCLUDE)
+
+find_library(LAPACKE_LIBRARY liblapacke lapacke
+             PATHS $ENV{LAPACK_PATH} ${LIB_INSTALL_DIR}
+             PATHS ENV LIBRARY_PATH
+             PATHS ENV LD_LIBRARY_PATH)
+
+if(MSVC)
+	find_library(LAPACK_LIBRARY liblapack lapack
+             PATHS $ENV{LAPACK_PATH} ${LIB_INSTALL_DIR}
+             PATHS ENV LIBRARY_PATH
+             PATHS ENV LD_LIBRARY_PATH)
+
+	find_library(BLAS_LIBRARY libblas blas
+             PATHS $ENV{LAPACK_PATH} ${LIB_INSTALL_DIR}
+             PATHS ENV LIBRARY_PATH
+             PATHS ENV LD_LIBRARY_PATH)
+	
+else()
+	find_library(LAPACK REQUIRED)
+	find_library(BLAS REQUIRED)
+endif()
+set(LAPACKE_LIBRARIES ${LAPACKE_LIBRARY} ${LAPACK_LIBRARY} ${BLAS_LIBRARY})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(LAPACKE DEFAULT_MSG
+                                  LAPACKE_INCLUDE_DIR 
+                                  LAPACKE_LIBRARIES)
+mark_as_advanced(LAPACKE_INCLUDE_DIR LAPACKE_LIBRARIES)
diff --git a/FindSTIM.cmake b/FindSTIM.cmake
new file mode 100644
index 0000000..a83481a
--- /dev/null
+++ b/FindSTIM.cmake
@@ -0,0 +1,22 @@
+# finds the STIM library (downloads it if it isn't present)
+# set STIMLIB_PATH to the directory containing the stim subdirectory (the stim repository)
+
+include(FindPackageHandleStandardArgs)
+
+set(STIM_INCLUDE_DIR $ENV{STIMLIB_PATH})
+
+find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR)
+
+if(STIM_FOUND)
+    set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIR})
+elseif(STIM_FOUND)
+	#if the STIM library isn't found, download it
+	#file(REMOVE_RECURSE ${CMAKE_BINARY_DIR}/stimlib)	#remove the stimlib directory if it exists
+	#set(STIM_GIT "https://git.stim.ee.uh.edu/codebase/stimlib.git")
+	#execute_process(COMMAND git clone --depth 1 ${STIM_GIT} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+	#set(STIM_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/stimlib" CACHE TYPE PATH)
+	message("STIM library not found. Set the STIMLIB_PATH environment variable to the STIMLIB location.")
+	message("STIMLIB can be found here: https://git.stim.ee.uh.edu/codebase/stimlib")
+endif(STIM_FOUND)
+
+find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR)
diff --git a/src/basic_functions.h b/src/basic_functions.h
new file mode 100644
index 0000000..6d51168
--- /dev/null
+++ b/src/basic_functions.h
@@ -0,0 +1,85 @@
+#include <stdio.h>
+
+
+size_t* sortIndx(float* input, size_t size){
+	//sort indices of score in ascending order (fitness value)
+	size_t *idx; 
+	idx = (size_t*) malloc (size * sizeof (size_t));
+	for (size_t i = 0; i < size; i++)
+		idx[i] = i;    
+		 	
+	for (size_t i=0; i<size; i++){
+		for (size_t j=i+1; j<size; j++){
+			if (input[idx[i]] < input[idx[j]]){
+				std::swap (idx[i], idx[j]);    //float check : it was like this b(&idx[i], &idx[j]) but gave me error
+			}
+		}
+	}				 
+	return idx; //use as sortSIdx in selection
+}
+
+	
+template<typename T>
+void mtxMul(T* M3, T* M1, T* M2, size_t r1, size_t c1, size_t r2, size_t c2){
+   //compute output matrix M3 of size row1 X column2 and data is column major		 
+	for(size_t i = 0 ; i <r1; i++){								
+		for(size_t j = 0; j< c2; j++){
+			T temp = 0;
+			for(size_t k = 0; k < c1 ; k++){					//column1 = row2 for matrix multiplication
+				temp+= M1[i * c1  + k] * M2[k * c2 + j];			//compute an element of output matrix
+			}       
+			M3[i * c1 + j] = temp;								//copy an element to output matrix
+		}
+	}
+}
+
+template<typename T>
+void mtxMultranspose(T* M3, T* M1, T* M2, size_t r1, size_t c1, size_t r2, size_t c2){
+   //compute output matrix M3 of size row1 X column2 and data is column major
+	for(size_t i = 0 ; i <r1; i++){								
+		for(size_t j = 0; j< r2; j++){		
+			T temp = 0;
+			for(size_t k = 0; k < c1 ; k++){					//column1 = row2 for matrix multiplication
+				temp+= M1[i * c1 + k] * M2[j * c2 + k];			//compute an element of output matrix
+			}    
+			M3[i * r1 + j] = temp;								//copy an element to output matrix		
+		}	
+	}		
+}
+
+		//display within class scatter
+template<typename T>
+void displayS(T* sw, size_t f){
+
+	for(size_t g = 0; g<1; g++){
+		std::cout<<std::endl;
+		for(size_t j = 0; j < f; j++){										//total number of features in a gnome
+				for(size_t k = 0; k < f; k++){						    //total number of features in a gnome	
+					std::cout<<sw[g*f*f + j*f + k]<<"  ";
+		}
+	    std::cout<<std::endl;
+		}
+	}
+ 	std::cout<<std::endl;
+}
+
+//sort eigenvalues from lapacke results
+size_t* sortEigenVectorIndx(float* eigenvalue, size_t N){
+		//sort indices of score in ascending order (fitness value)
+		size_t *idx = (size_t*) malloc (N * sizeof (size_t));
+		for (size_t i = 0; i < N; i++)
+			idx[i] = i;    
+		 	
+		for (size_t i=0; i<N; i++){
+			for (size_t j=i+1; j<N; j++){
+				if (eigenvalue[idx[i]] > eigenvalue[idx[j]]){
+					std::swap (idx[i], idx[j]);    //float check : it was like this b(&idx[i], &idx[j]) but gave me error
+				}
+			}
+		}
+
+		std::cout<<"best                                                                                                                                                                                                                                                                      eigenvalue index: "<<eigenvalue[idx[0]]<<std::endl;
+ 
+		return idx; //use as sortSIdx in selection
+
+}
diff --git a/src/enviload.h b/src/enviload.h
new file mode 100644
index 0000000..36a8306
--- /dev/null
+++ b/src/enviload.h
@@ -0,0 +1,119 @@
+#include <iostream>
+#include <fstream>
+#include <thread>
+#include <random>
+#include <vector>
+//#include <algorithm>
+
+#define NOMINMAX
+
+//stim libraries
+#include <stim/envi/envi.h>
+#include <stim/image/image.h>
+#include <stim/parser/arguments.h>
+#include <stim/ui/progressbar.h>
+#include <stim/parser/filename.h>
+//#include <stim/visualization/colormap.h>
+#include <stim/parser/table.h>
+
+std::vector< stim::image<unsigned char> > C;		//2D array used to access each mask C[m][p], where m = mask# and p = pixel#
+//loads spectral features into a feature matrix based on a set of class images (or masks)
+float* load_features(size_t nC, size_t tP, size_t B, stim::envi E, std::vector< unsigned int > nP){
+	float progress = 0;									//initialize the progress bar variable
+	unsigned long long bytes_fmat = sizeof(float) * tP * B;							//calculate the number of bytes in the feature matrix
+	std::cout<<"totalnumber of samples "<<tP<<std::endl;
+	std::cout<<"Allocating space for the feature matrix: "<<tP<<" x "<<B<<" = "<<(float)bytes_fmat/(float)1048576<<"MB"<<std::endl;
+	float* F = (float*) malloc(bytes_fmat);								//allocate space for the sifted matrix
+	std::cout<<"Loading Training Data ("<<nC<<" classes)"<<std::endl;
+	//load all of the training spectra into an array
+	unsigned long long F_idx = 0;												//initialize the matrix index to 0
+	//unsigned long long R_idx = 0;
+	for(unsigned c = 0; c < nC; c++){											//for each class image
+		std::cout<<"\tSifting class "<<c+1<<" = "<<nP[c]<<" pixels..."<<std::endl;
+	//	std::thread t1 = std::thread(progress_thread_envi, &E);					//start the progress bar thread		
+		E.sift((void*)&F[F_idx], C[c].data(), true);							//sift that class into the matrix at the proper location
+		F_idx += nP[c] * B;
+		progress = (float)(c+1) / (float)nC * 100;
+	//	t1.join();
+	}
+	
+		return F;
+}
+
+/// Load responses for a Random Forest Classifier
+unsigned int*  ga_load_responses(size_t tP, size_t nC, std::vector< unsigned int > nP){
+	unsigned int* T = (unsigned int*)malloc(tP*sizeof(unsigned int));						//generate an OpenCV vector of responses
+	size_t R_idx = 0;														//index into the response array
+	for(size_t c = 0; c < nC; c++){										//for each class image
+		for(unsigned long long l = 0; l < nP[c]; l++){						//assign a response for all pixels of class c loaded in the training matrix
+			T[R_idx + l] = (unsigned int)c+1;
+		}
+		R_idx += nP[c];														//increment the response vector index
+	}
+	return T;
+}
+
+
+//loads the necessary data for training a random forest classifier
+std::vector< unsigned int > ga_load_class_images(int argc, stim::arglist args, size_t* nC, size_t* tP){
+	if(args["classes"].nargs() < 2){									//if fewer than two classes are specified, there's a problem
+		std::cout<<"ERROR: training requires at least two class masks"<<std::endl;
+		exit(1);
+	}
+	std::vector< unsigned int > nP;
+	size_t num_images = args["classes"].nargs();						//count the number of class images
+	//size_t num_images = args["rf"].nargs();						//count the number of class images
+	//std::vector<std::string> filenames(num_images);				//initialize an array of file names to store the names of the images
+	std::string filename;										//allocate space to store the filename for an image
+	for(size_t c = 0; c < num_images; c++){						//for each image
+		filename = args["classes"].as_string(c);;						//get the class image file name			
+		stim::image<unsigned char> image(filename);				//load the image
+		//push_training_image(image.channel(0), nC, tP, nP);					//push channel zero (all class images are assumed to be single channel)
+		C.push_back(image.channel(0));
+		unsigned int npixels = (unsigned int)image.channel(0).nnz();
+		nP.push_back(npixels);									//push the number of pixels onto the pixel array
+		*tP += npixels;											//add to the running total of pixels
+		*nC = *nC + 1;
+	}
+
+	return nP;
+}
+
+void display_PixelfeatureNclass(float* F, unsigned int* T, size_t B, size_t Idx){
+	//display code for debug, displaying Idx th pixel from feature matrix F with all features B 
+	std::cout<<"class of pixel["<<Idx<<"]" <<"is: "<<T[Idx]<<std::endl;
+	std::cout<<"feature["<<Idx<<"] is: "<<std::endl;
+	for (size_t i = 0; i< B; i++)
+		std::cout<<" "<<F[Idx * B + i];		
+}
+
+
+void display_args(int argc, stim::arglist args){
+	std::cout<<"number of arguments "<<argc<<std::endl;
+	std::cout<<"arg 0  "<<args.arg(0)<<std::endl;
+	std::cout<<"arg 1  "<<args.arg(1)<<std::endl;
+}
+
+void display_dataSize(size_t X, size_t Y, size_t B){
+	std::cout<<"number of samples "<<X*Y<<std::endl;
+	std::cout<<"number of bands "<<B<<std::endl;
+
+}
+
+void display_phe(float* phe, unsigned int* P, size_t p,size_t f, size_t i, size_t j){
+	//display code for debug, displaying jth pixel from new feature matrix which is created for gnome i 
+	std::cout<<"phe["<<i<<"]["<<j<<"]"<<std::endl;	
+	for(unsigned int n = 0; n < f; n++){
+		std::cout<<P[i * f + n]; //spectral feature indices from gnome i of current population
+		std::cout<<"    "<<phe[i* (p * f) +j * f + n]<<std::endl; //display 100th pixel value corresponding to feature indices in the gnome
+		
+	}
+}
+
+
+void display_gnome(unsigned int* P,size_t f,size_t gIdx){
+	//display code for debug, displaying gnome gIdx of current population, gnome is subset of feature indices
+	for (size_t i = 0; i< f; i++)
+		std::cout<<" "<<P[gIdx * f + i];	
+}
+
diff --git a/src/ga_gpu.cu b/src/ga_gpu.cu
new file mode 100644
index 0000000..dca8a46
--- /dev/null
+++ b/src/ga_gpu.cu
@@ -0,0 +1,196 @@
+#ifndef GA_GPU_CU
+#define GA_GPU_CU
+
+//#include <cuda.h>
+//#include "cuda_runtime.h"
+//#include <cuda_runtime_api.h>
+//#include "device_launch_parameters.h"
+#include <stim/cuda/cudatools/error.h>
+
+#include "timer.h"
+//#include <stdio.h>
+//#include <stdlib.h>
+#include <iostream>
+#include <fstream>
+
+extern Timer timer;
+
+
+__global__ void kernel_computeSb(float* gpuSb, unsigned int* gpuP, float* gpuM, float* gpuCM, size_t ub, size_t f, size_t p, size_t nC, unsigned int* gpu_nPxInCls){
+
+	size_t i = blockIdx.x * blockDim.x + threadIdx.x;			//gnomeindex in population matrix
+	size_t j = blockIdx.y * blockDim.y + threadIdx.y;			//index of feature index from gnome
+	size_t gnomeIndx = blockIdx.z * blockDim.z + threadIdx.z;			//if we use 3d grid then it is needed
+	
+   
+	if(gnomeIndx >= p || i >= f || j >= f) return;								//handling segmentation fault
+ 	
+	//form a sb matrix from vector sbVec, multiply each element in matrix with num of pixels in the current class
+	//and add it to previous value of between class scatter matrix sb 
+	float tempsbval;
+	size_t n1;
+	size_t n2;
+	size_t classIndx;										//class index in class mean matrix
+		
+	for(size_t c = 0; c < nC; c++){	
+		tempsbval = 0;
+		classIndx = c * ub;
+		n1 = gpuP[gnomeIndx * f + i];						//actual feature index in original feature matrix 
+		n2 = gpuP[gnomeIndx * f + j];						//actual feature index in original feature matrix 
+		tempsbval = ((gpuCM[classIndx  + n1] - gpuM[n1]) *(gpuCM[classIndx  + n2] - gpuM[n2])) * (float)gpu_nPxInCls[c] ;	
+		gpuSb[gnomeIndx * f * f + j * f + i] += tempsbval;
+	}	
+}
+
+
+//Compute within class scatter sw (p x f x f) of all gnome features phe(tP x f)
+__global__ void kernel_computeSw(float* gpuSw, unsigned int* gpuP, float* gpuCM, float* gpuF, unsigned int* gpuT, size_t ub, size_t f, size_t p, size_t nC, size_t tP){
+	size_t i = blockIdx.x * blockDim.x + threadIdx.x;			//gnomeindex in population matrix
+	size_t j = blockIdx.y * blockDim.y + threadIdx.y;			//index of feature index from gnome
+	size_t gnomeIndx = blockIdx.z * blockDim.z + threadIdx.z;	//total number of individuals
+		
+	if(gnomeIndx >= p || i >= f || j >= f) return;				//handling segmentation fault
+	float tempswval;
+
+	size_t n1 = gpuP[gnomeIndx * f + i];						//actual feature index in original feature matrix 
+	size_t n2 = gpuP[gnomeIndx * f + j];						//actual feature index in original feature matrix
+	tempswval = 0;
+	for(size_t c = 0; c < nC; c++){	
+		tempswval = 0;
+		for(size_t k = 0; k < tP; k++){
+			if(gpuT[k] == (c+1) ){  
+				tempswval += ((gpuF[ k * ub + n1] - gpuCM[c * ub  + n1]) * (gpuF[k * ub + n2] - gpuCM[c * ub  + n2]));	
+			}
+		}
+		gpuSw[gnomeIndx * f * f + j * f + i] += tempswval;
+	}		
+}
+	
+
+
+
+	//=============================gpu intialization=============================================
+	/// Initialize all GPU pointers used in the GA-GPU algorithm
+	/// @param gpuP is a pointer to GPU memory location, will point to memory space allocated for the population
+	/// @param p is the population size
+	/// @param f is the number of desired features
+	/// @param gpuCM is a pointer to a GPU memory location, will point to the class mean
+	/// @param cpuM is a pointer to the class mean on the CPU
+	/// @param gpu_nPxInCls is a pointer to a GPU memory location storing the number of pixels in each class
+	/// @param gpu_nPxInCls is a CPU array storing the number of pixels in each class
+	/// @param gpuSb is a GPU memory pointer to the between-class scatter matrices
+	/// @param gpuSw is a GPU memory pointer to the within-class scatter matrices
+	/// @param gpuF is the destination for the GPU feature matrix
+	/// @param cpuF is the complete feature matrix on the CPU
+
+	void gpuIntialization(unsigned int** gpuP, size_t p, size_t f,													//variables required for the population allocation
+						  float** gpuCM, float* cpuCM, size_t nC, unsigned int ub,
+						  float** gpuM, float* cpuM,  unsigned int** gpu_nPxInCls,
+						  float** gpuSb, float** gpuSw,
+						  float** gpuF, float* cpuF,
+						  unsigned int** gpuT, unsigned int* cpuT, size_t tP, unsigned int* cpu_nPxInCls){
+        
+		HANDLE_ERROR(cudaMalloc(gpuP, p * f * sizeof(unsigned int)));										//allocate space for the population on the GPU
+
+		HANDLE_ERROR(cudaMalloc(gpuCM, nC * ub * sizeof(float)));											//allocate space for the class mean and copy it to the GPU
+		HANDLE_ERROR(cudaMemcpy(*gpuCM, cpuCM, nC * ub * sizeof(float), cudaMemcpyHostToDevice));
+
+
+		HANDLE_ERROR(cudaMalloc(gpuM, ub * sizeof(float)));												//allocate space for the mean of the feature matrix
+		HANDLE_ERROR(cudaMemcpy(*gpuM, cpuM, ub * sizeof(float), cudaMemcpyHostToDevice));
+
+		HANDLE_ERROR(cudaMalloc(gpu_nPxInCls, nC * sizeof(unsigned int)));											//number of pixels in each class
+		HANDLE_ERROR(cudaMemcpy(*gpu_nPxInCls, cpu_nPxInCls, nC * sizeof(unsigned int), cudaMemcpyHostToDevice));
+		
+		
+		HANDLE_ERROR(cudaMalloc(gpuSb, p * f * f * sizeof(float)));										//allocate memory for sb which is calculated for eery class separately and added together in different kernel
+		HANDLE_ERROR(cudaMalloc(gpuSw,  p * f * f * sizeof(float)));
+		
+		HANDLE_ERROR(cudaMalloc(gpuF, tP * ub * sizeof(float)));
+		HANDLE_ERROR(cudaMemcpy(*gpuF, cpuF, tP * ub * sizeof(float), cudaMemcpyHostToDevice));
+		
+		HANDLE_ERROR(cudaMalloc(gpuT, tP * sizeof(unsigned int)));
+		HANDLE_ERROR(cudaMemcpy(*gpuT, cpuT, tP* sizeof(unsigned int), cudaMemcpyHostToDevice));
+  
+	}	
+
+	//computation on GPU
+	/// Initialize all GPU pointers used in the GA-GPU algorithm
+	/// @param gpuP is a pointer to GPU memory location, will point to memory space allocated for the population
+	/// @param p is the population size
+	/// @param f is the number of desired features
+	/// @param gpuSb is a GPU memory pointer to the between-class scatter matrices
+	/// @param cpuSb is the between-class scatter matrix on the GPU (this function will copy the GPU result there)
+	/// @param gpuSw is a GPU memory pointer to the within-class scatter matrices
+	/// @param cpuSw is the within-class scatter matrix on the GPU (this function will copy the GPU result there)
+
+	/// @param gpuCM is a pointer to a GPU memory location, will point to the class mean
+	/// @param cpuM is a pointer to the class mean on the CPU
+	/// @param gpu_nPxInCls is a pointer to a GPU memory location storing the number of pixels in each class
+	/// @param gpu_nPxInCls is a CPU array storing the number of pixels in each class	
+	
+	/// @param gpuF is the destination for the GPU feature matrix
+	/// @param cpuF is the complete feature matrix on the CPU
+	void gpucomputeSbSw(unsigned int* gpuP, unsigned int* cpuP, size_t p, size_t f,
+						 float* gpuSb, float* cpuSb,
+						 float* gpuSw, float* cpuSw,
+						 float* gpuF, unsigned int* gpuT,float* gpuM, float* gpuCM, 
+						 size_t nC, size_t tP, cudaDeviceProp props, size_t gen, size_t gnrtn, size_t ub, unsigned int* gpu_nPxInCls, std::ofstream& profilefile){
+		    
+    timer.start();
+		HANDLE_ERROR(cudaMemcpy(gpuP, cpuP, p * f * sizeof(unsigned int), cudaMemcpyHostToDevice));
+		HANDLE_ERROR(cudaMemset(gpuSb, 0, p * f * f * sizeof(float)));
+				
+		//grid configuration of GPU
+		size_t threads = (size_t)sqrt(props.maxThreadsPerBlock);
+		if(threads > f) threads = f;
+		size_t numberofblocksfor_f = (size_t)ceil((float)f/ threads);
+		dim3 blockdim((int)threads, (int)threads, 1);
+		dim3 griddim((int)numberofblocksfor_f, (int)numberofblocksfor_f, (int)p);									//X dimension blocks will cover all gnomes of the population and each block will have as many gnomes as it can feet 
+		//sharedbytes calculation
+		size_t sharedBytes = p * f * sizeof(unsigned int);												//copy population to shared memory
+		if(props.sharedMemPerBlock < sharedBytes) sharedBytes = props.sharedMemPerBlock;
+		
+		//launch kernel to compute sb matrix
+		kernel_computeSb<<<griddim, blockdim, sharedBytes>>>(gpuSb, gpuP, gpuM, gpuCM, ub, f, p, nC, gpu_nPxInCls);
+    cudaDeviceSynchronize();
+   
+		HANDLE_ERROR(cudaMemcpy(cpuSb, gpuSb, p * f * f * sizeof(float), cudaMemcpyDeviceToHost));		//copy between class scatter from gpu to cpu
+		const auto elapsedg1 = timer.time_elapsed();
+		if(gen > gnrtn -2){
+			std::cout << "Sb gpu time "<<std::chrono::duration_cast<std::chrono::microseconds>(elapsedg1).count() << "us" << std::endl;
+			profilefile << "Sb gpu time "<<std::chrono::duration_cast<std::chrono::microseconds>(elapsedg1).count() << "us" << std::endl;
+		}
+
+		timer.start();
+		//Compute within class scatter
+		HANDLE_ERROR(cudaMemset(gpuSw, 0, p * f * f * sizeof(float)));
+
+		//launch kernel to compute sb matrix
+		kernel_computeSw<<<griddim, blockdim>>>(gpuSw, gpuP, gpuCM, gpuF, gpuT, ub, f, p, nC, tP);
+    cudaDeviceSynchronize();
+		//copy between class scatter from gpu to cpu
+		HANDLE_ERROR(cudaMemcpy(cpuSw, gpuSw, p * f * f * sizeof(float), cudaMemcpyDeviceToHost));
+		const auto elapsedg2 = timer.time_elapsed();
+		if(gen > gnrtn - 2){
+			std::cout << "Sw gpu time "<<std::chrono::duration_cast<std::chrono::microseconds>(elapsedg2).count() << "us" << std::endl;
+			profilefile<< "Sw gpu time "<<std::chrono::duration_cast<std::chrono::microseconds>(elapsedg2).count() << "us" << std::endl;
+		}
+
+	}
+ 
+ //free all gpu pointers
+	void gpuDestroy(unsigned int* gpuP, float* gpuCM, float* gpuM, unsigned int* gpu_nPxInCls, float* gpuSb, float* gpuSw, float* gpuF, unsigned int* gpuT){
+ 
+		HANDLE_ERROR(cudaFree(gpuP));
+		HANDLE_ERROR(cudaFree(gpuCM));
+		HANDLE_ERROR(cudaFree(gpuM));
+		HANDLE_ERROR(cudaFree(gpu_nPxInCls));
+		HANDLE_ERROR(cudaFree(gpuSb));
+    HANDLE_ERROR(cudaFree(gpuSw));
+		HANDLE_ERROR(cudaFree(gpuF));
+    HANDLE_ERROR(cudaFree(gpuT));
+	}
+
+#endif
+
diff --git a/src/ga_gpu.h b/src/ga_gpu.h
new file mode 100644
index 0000000..e05bd6e
--- /dev/null
+++ b/src/ga_gpu.h
@@ -0,0 +1,649 @@
+#ifndef GA_GPU_H
+#define GA_GPU_H
+
+#include <iostream>
+#include <thread>
+#include <complex>
+#include <cv.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+
+#include "timer.h"
+
+#include "basic_functions.h"
+//LAPACKE support for Visual Studio
+
+#ifndef LAPACK_COMPLEX_CUSTOM
+#define LAPACK_COMPLEX_CUSTOM
+#define lapack_complex_float std::complex<float>
+#define lapack_complex_double std::complex<double>
+#include "lapacke.h"
+#endif
+
+
+#define LAPACK_ROW_MAJOR               101
+#define LAPACK_COL_MAJOR               102
+
+//CUDA functions
+void gpuIntialization(unsigned int** gpuP, size_t p, size_t f,													//variables required for the population allocation
+	float** gpuCM, float* cpuCM, size_t nC, unsigned int ub,
+	float** gpuM, float* cpuM, unsigned int** gpu_nPxInCls,
+	float** gpuSb, float** gpuSw,
+	float** gpuF, float* cpuF,
+	unsigned int** gpuT, unsigned int* cpuT, size_t tP, unsigned int* cpu_nPxInCls);
+void gpucomputeSbSw(unsigned int* gpuP, unsigned int* cpuP, size_t p, size_t f,
+	float* gpuSb, float* cpuSb,
+	float* gpuSw, float* cpuSw,
+	float* gpuF, unsigned int* T, float* gpuM, float* gpuCM,
+	size_t nC, size_t tP, cudaDeviceProp props, size_t gen, size_t gnrtn, size_t ub, unsigned int* gpu_nPxInCls, std::ofstream& profilefile);
+void gpuDestroy(unsigned int* gpuP, float* gpuCM, float* gpuM, unsigned int* gpu_nPxInCls, float* gpuSb, float* gpuSw, float* gpuF, unsigned int* gpuT);
+
+struct _fcomplex { float re, im; };
+typedef struct _fcomplex fcomplex;
+
+Timer timer;
+
+class ga_gpu {
+
+public:
+	float* F;					//pointer to the raw data in host memory
+	unsigned int* T;			//pointer to the class labels in host memory
+	size_t gnrtn;				//total number of generations
+	size_t p;					//population size
+	size_t f;					// number of features to be selected
+
+	unsigned int* P;			//pointer to population of current generation genotype matrix (p x f)
+	float* S;					//pointer to score(fitness value) of each gnome from current population matric P
+	unsigned int* i_guess;		//initial guess of features if mentioined in args add to initial population
+	unsigned int ub;			//upper bound for gnome value (maximum feature index from raw feature matrix F)
+	unsigned int lb;			//lower bound for gnome value (minimum feature index from raw feature matrix F = 0)
+	float uniformRate;
+	float mutationRate;
+	size_t tournamentSize;  	//number of potential gnomes to select parent for crossover
+	bool elitism;        	//if true then passes best gnome to next generation
+
+							//declare gpu pointers
+	float* gpuF;				//Feature matrix	
+	unsigned int* gpuT;			//target responses of entire feature matrix	
+	unsigned int* gpuP;		//population matrix
+	unsigned int* gpu_nPxInCls;
+	float* gpuCM;				//class mean of entire feature matrix
+	float* gpuM;				//total mean of entire feature matrix
+	float* gpuSb;				//between class scatter for all individuals of current population
+	float* gpuSw;				//within class scatter for all individuals of current population
+
+								//constructor
+	ga_gpu() {}
+
+	//==============================generate initial population
+
+		void initialize_population(std::vector<unsigned int> i_guess, bool debug) {
+			if (debug) {
+				std::cout << std::endl;
+				std::cout << "initial populatyion is: " << std::endl;
+			}
+			
+			lb = 0;
+			P = (unsigned int*)calloc(p * f, sizeof(unsigned int));					//allcate memory for genetic population(indices of features from F), p number of gnomes of size f
+			S = (float*)calloc(p, sizeof(float));							//allcate memory for scores(fitness value) of each gnome from P 
+
+			srand(1);
+			//add intial guess to the population if specified by user as a output of other algorithm or by default just random guess
+			std::memcpy(P, i_guess.data(), f * sizeof(unsigned int));
+
+			//generate random initial population
+			for (size_t i1 = 1; i1 < p; i1++) {
+				for (size_t i2 = 0; i2 < f; i2++) {
+					P[i1 * f + i2] = rand() % ub + lb;						//select element of gnome as random feature index within lower bound(0) and upper bound(B) 
+					if (debug)	std::cout << P[i1 * f + i2] << "\t";
+				}
+				if (debug) std::cout << std::endl;
+			}
+		}
+
+		//===================generation of new population==========================================
+
+		size_t evolvePopulation(unsigned int* newPop, float* M, bool debug) {
+
+			//gget index of best gnome in the current population
+			size_t bestG_Indx = gIdxbestGnome();
+			//-------------(reproduction)-------
+			if (elitism) {
+				saveGnomeIdx(0, bestG_Indx, newPop);							//keep best gnome from previous generation to new generation
+			}
+			// ------------Crossover population---------------
+			int elitismOffset;
+			if (elitism) {
+				elitismOffset = 1;
+			}
+			else {
+				elitismOffset = 0;
+			}
+
+			//Do crossover for rest of population size
+			for (int i = elitismOffset; i <p; i++) {
+				//	std::cout<<"crossover of gnome "<<i<<std::endl;
+				std::vector<unsigned int>gnome1;
+				gnome1.reserve(f);
+				gnome1 = tournamentSelection(5);								//select first parent for crossover from tournament selection of 5 gnomes
+																				//	displaygnome(gnome1);
+				std::vector<unsigned int>gnome2;
+				gnome2.reserve(f);
+				gnome2 = tournamentSelection(5);								//select first parent for crossover from tournament selection of 5 gnomes
+																				//	displaygnome(gnome2);
+				std::vector<unsigned int>gnome;
+				gnome.reserve(f);
+				gnome = crossover(gnome1, gnome2, M);								//Do crossover of above parent gnomes to produce new gnome
+																					//	displaygnome(gnome);
+				saveGnome(i, gnome, newPop);									//save crosseover result to new population
+			}
+
+			//--------------Mutate population------------
+			// introduce some mutation in new population
+			for (int i = elitismOffset; i <p; i++) {
+				//std::cout<<"mutation of gnome"<<std::endl;
+				std::vector<unsigned int>gnome;
+				gnome.reserve(f);
+
+				for (size_t n = 0; n < f; n++)
+					gnome.push_back(newPop[i*f + n]);
+				//std::cout<<"\n starting address "<<(&newPop[0] + i*f)<<"\t end address is "<<(&newPop[0] + i*f + f-1) <<std::endl;
+				//std::copy((&newPop[0] + i*f), (&newPop[0] + i*f +f-1), gnome.begin());
+				//	displaygnome(gnome);
+				mutate(gnome);
+				//	displaygnome(gnome);
+				saveGnome(i, gnome, newPop);								//save new gnome to new population at position i
+			}
+			return bestG_Indx;
+		}
+
+		//============================== functions for population evolution ===========================================================================
+		std::vector<unsigned int> tournamentSelection(size_t tSize) {
+			// Create a tournament population
+			unsigned int* tournamentP = (unsigned int*)malloc(tSize * f * sizeof(unsigned int));
+			std::vector<float>tournamentS;
+
+			// For each place in the tournament get a random individual
+			for (size_t i = 0; i < tSize; i++) {
+				size_t rndmIdx = rand() % p + lb;
+				tournamentS.push_back(S[rndmIdx]);
+				//for (size_t n = 0; n <f; n++) 
+				//tournamentP[i * f + n] = (getGnome(rndmIdx)).at(n);
+				std::vector<unsigned int> temp_g(getGnome(rndmIdx));
+				std::copy(temp_g.begin(), temp_g.end(), tournamentP + i*f);
+			}
+			// Get the fittest
+			std::vector<unsigned int>fittestgnome;
+			fittestgnome.reserve(f);
+
+			//select index of best gnome from fitness score
+			size_t bestSIdx = 0;
+			for (size_t i = 0; i < tSize; i++) {
+				if (tournamentS[i] < tournamentS[bestSIdx])
+					bestSIdx = i;    //float check : it was like this b(&idx[i], &idx[j]) but gave me error
+			}
+
+			for (size_t n = 0; n < f; n++)
+				fittestgnome.push_back(tournamentP[bestSIdx * f + n]);
+			return fittestgnome;
+		} //end of tournament selection 
+
+
+		std::vector<unsigned int>  crossover(std::vector<unsigned int>  gnome1, std::vector<unsigned int>  gnome2, float* M) {
+			std::vector<unsigned int> gnome;
+			for (size_t i = 0; i < f; i++) {
+				// Crossover
+				float r = static_cast <float> (rand()) / static_cast <float> (RAND_MAX);
+				if (r <= uniformRate) {
+					gnome.push_back(gnome1.at(i));
+				}
+				else {
+					gnome.push_back(gnome2.at(i));
+				}
+			}
+
+			//check new gnome for all zero bands and duplicated values
+			std::vector<unsigned int> gnomeunique;
+			int flag = 0;
+			std::sort(gnome.begin(), gnome.end()); // 1 1 2 2 3 3 3 4 4 5 5 6 7 
+			std::unique_copy(gnome.begin(), gnome.end(), std::back_inserter(gnomeunique));
+			/*	if(gnomeunique.size()< gnome.size()){
+			flag = 1;
+			std::cout<<"gnome:["<<g<<"] "<<"\t duplications are "<< (gnome.size() - gnomeunique.size())<<std::endl;
+			}*/
+			unsigned int featureband, featureband1, featureband2;
+			if (gnomeunique.size() < f) {
+				for (size_t k = gnomeunique.size(); k < f; k++) {
+					featureband = rand() % ub + lb;
+					for (size_t i = 0; i < f; i++) {
+						featureband1 = gnome1.at(i);
+						featureband2 = gnome2.at(i);
+						for (size_t j = 0; j < gnomeunique.size(); j++) {
+							if (gnomeunique.at(j) != featureband1) {
+								featureband = featureband1;
+							}
+							else if (gnomeunique.at(j) != featureband2) {
+								featureband = featureband2;
+							}
+							else if (gnomeunique.at(j) == featureband) {
+								featureband = rand() % ub + lb;
+								while (M[featureband] == 0) {
+									featureband = rand() % ub + lb;
+								}
+							}
+						}
+					}
+					gnomeunique.push_back(featureband);
+				}
+			}
+			//if(flag ==1){
+			//	std::cout<<"\n original gnome "<<g<<" are "<<std::endl;
+			//	for(int k = 0; k < gnome.size(); k++)
+			//		std::cout<<gnome[k]<<"\t";
+			//	std::cout<<"\n unique results in cpp for gnome "<<g<<" are "<<std::endl;
+			//	for(int k = 0; k < gnomeunique.size(); k++)
+			//		std::cout<<gnomeunique[k]<<"\t";
+			//}
+
+			return gnomeunique;
+		}
+
+		void mutate(std::vector<unsigned int> gnome) {
+			for (size_t i = 0; i < f; i++) {
+				float LO = (float)0.01;
+				float HI = 1;
+				float r3 = LO + static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / (HI - LO)));
+				//if random value is less than mutationRate then mutate this gnome
+				if (r3 <= mutationRate) {
+					gnome.at(i) = (rand() % ub + lb);
+					gnome.push_back(rand() % ub + lb);
+				}
+			}
+		}
+
+		///returns gnome of given index
+		std::vector<unsigned int> getGnome(size_t idx) {
+			std::vector<unsigned int> gnome;
+			gnome.reserve(f);
+			//pulling gnome idx from population P
+			for (size_t n = 0; n < f; n++)
+				gnome.push_back(P[idx * f + n]);
+			//memcpy(&gnome[0], P+idx*f, f*sizeof(size_t));
+			return gnome;
+		}
+
+		//save gnome of index gIdx from previous population at position i in the new population
+		void saveGnomeIdx(size_t i, size_t gIdx, unsigned int* newPop) {
+			for (size_t n = 0; n < f; n++)
+				newPop[i * f + n] = P[gIdx * f + n];
+		}
+
+		void saveGnome(size_t idx, std::vector<unsigned int>gnome, unsigned int* newPop) {
+			std::copy(gnome.begin(), gnome.end(), newPop + idx*f);
+		}
+
+		size_t gIdxbestGnome() {
+			//std::cout<<"best gnome indes is: "<<sortSIndx()[0];
+			return sortSIndx()[0];
+		}
+
+		void displaygnome(std::vector<unsigned int> gnome) {
+			std::cout << "\t gnome: ";
+			for (int i = 0; i<gnome.size(); ++i)
+				std::cout << gnome[i] << ' ';
+			std::cout << std::endl;
+		}
+
+		//---------------------post processing of score-------------------------------------
+		void Snorm() { //normalize gnome scores 
+			double s;
+			for (size_t i = 0; i < p; i++) {
+				s += S[i];			//sum of all gnome score in population
+			}
+			//std::cout<<"mean Score is: "<<(double) s/p;
+			for (size_t i = 0; i <p; i++)
+				S[i] = S[i] / s;
+		}
+
+		size_t* sortSIndx() {     //sort gnome index according to gnome scores
+			//sort indices of score in ascending order (fitness value)
+			size_t *idx = (size_t*)malloc(p * sizeof(size_t));	//array to hold sorted gnome index
+			for (size_t i = 0; i < p; i++) {					//initialize index array from 1 to p(population size) in an ascending order
+				idx[i] = i;
+			}
+
+			for (size_t i = 0; i<p; i++) {				//sort gnome indices according to score values using bubble sort 
+				for (size_t j = i + 1; j<p; j++) {
+					if (S[idx[i]] > S[idx[j]]) {
+						std::swap(idx[i], idx[j]);		//float check : it was like this b(&idx[i], &idx[j]) but gave me error
+					}
+				}
+			}
+
+			//display best gnome
+            //std::cout << "best fitness value: " << S[idx[0]] << std::endl;
+			/*if (S[idx[0]] < 0) {				
+				std::cout << "best gnome is " << std::endl;
+				for (size_t i = 0; i < f; i++)
+					std::cout << P[f * idx[0] + i] << ", ";
+				std::cout << std::endl;
+			}*/
+
+			return idx; //use as sortSIdx in selection
+		}
+
+
+		//size_t* sortIndx(float* input, size_t size) {
+		//	//sort indices of score in ascending order (fitness value)
+		//	size_t *idx;
+		//	idx = (size_t*)malloc(size * sizeof(size_t));
+		//	for (size_t i = 0; i < size; i++)
+		//		idx[i] = i;
+
+		//	for (size_t i = 0; i<size; i++) {
+		//		for (size_t j = i + 1; j<size; j++) {
+		//			if (input[idx[i]] < input[idx[j]]) {
+		//				std::swap(idx[i], idx[j]);				   //float check : it was like this b(&idx[i], &idx[j]) but gave me error
+		//			}
+		//		}
+		//	}
+		//	return idx; //use as sortSIdx in selection
+
+		//}
+
+		void generateNewP(unsigned int* newPop) {
+			//std::memcpy(P, 0 , p * f *sizeof(unsigned int));			   //copy sb of gnome 'g' into bufferarray tempg_s	
+			std::memcpy(P, newPop, p * f * sizeof(unsigned int));		   //copy sb of gnome 'g' into bufferarray tempg_s	
+		}
+
+		//============================== functions for fitness function  ===========================================================================
+		//compute total mean M (1 X B) of all features (tP X B)
+		void ttlMean(float* M, size_t tP, size_t B) {
+			//std::cout<<"total number of pixels are "<<tP<<std::endl;
+			for (int k = 0; k < tP; k++) {								  //total number of pixel in feature matrix
+				for (size_t n = 0; n < B; n++) {							  // index of feature in ith gnome
+					M[n] += F[k * B + n];
+				}
+			}
+			for (size_t n = 0; n < B; n++)								 //take an avarage of above summation 
+				M[n] = M[n] / (float)tP;
+		}
+
+		void dispalymean(float* M) {				//display mean
+			std::cout << std::endl;
+			std::cout << "Total mean of gnome 1 features are is " << std::endl;
+
+			for (size_t i = 0; i < 1; i++) {
+				for (size_t j = 0; j < f; j++) {
+					size_t index = P[i*f + j];
+					std::cout << "feature index " << index << "\t total mean" << M[index] << std::endl;
+				}
+			}
+			std::cout << std::endl;
+		}
+
+		//Compute class means cM (p x nC x f) of all gnome features phe(tP x f)
+		void classMean(float* cM, size_t tP, size_t nC, size_t B, std::vector<unsigned int> nPxInCls) {
+			for (size_t c = 0; c < nC; c++) {									//index of class feature matrix responses
+				float* tempcM = (float*)calloc(B, sizeof(float));			//tempcM holds classmean vector for current gnome 'i', class 'c'
+				for (size_t k = 0; k < tP; k++) {							//total number of pixel in feature matrix
+					if (T[k] == c + 1) {									//class numbers start from 1 not 0
+						for (size_t n = 0; n < B; n++) {					//total number of features in a gnome
+							tempcM[n] += F[k * B + n];					//add phe value for feature n of class 'c' in  ith gnome
+						}
+					}
+				}
+				for (size_t n = 0; n < B; n++)
+					cM[c * B + n] = tempcM[n] / (float)nPxInCls[c];			//divide by number of pixels from class 'c'
+
+			}
+
+		}
+
+		//display class mean
+		void dispalyClassmean(float* cM, size_t nC) {
+			std::cout << std::endl;
+			std::cout << "class mean of gnome 1 with total classes " << nC << " is :" << std::endl;
+			for (size_t i = 0; i < 1; i++) {
+				for (size_t c = 0; c < nC; c++) {
+					for (size_t j = 0; j < f; j++) {
+						size_t index = P[i*f + j];
+
+						std::cout << "class index: " << c << "\t feature index " << index << "\t  class mean " << cM[c * ub + index] << std::endl;
+					}
+				}
+			}
+			std::cout << std::endl;
+		}
+
+		//-----------------------------------------between and within class Scattering computation---------------------------------------------------------------
+		//computation on CPU
+		void cpu_computeSbSw(float* sb, float* sw, float* M, float* cM, size_t nC, size_t tP, std::vector<unsigned int> nPxInCls) {
+			timer.start();
+			computeSb(sb, M, cM, nC, nPxInCls);					//compute between class scatter on CPU
+			const auto elapsed = timer.time_elapsed();
+			std::cout << "Sb CPU time " << std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count() << "us" << std::endl;
+
+			timer.start();
+			computeSw(sw, cM, nC, tP);							//compute within class scatter on CPU
+			const auto elapsed1 = timer.time_elapsed();
+			std::cout << "Sw CPU time " << std::chrono::duration_cast<std::chrono::microseconds>(elapsed1).count() << "us" << std::endl;
+		}
+
+		//display between class scatter
+		void displaySb(float* sb) {
+			std::cout << "between scatter is  " << std::endl;
+			for (size_t g = 0; g<1; g++) {
+				std::cout << std::endl;
+				for (size_t j = 0; j < f; j++) {								//total number of features in a gnome
+					for (size_t k = 0; k < f; k++) {							//total number of features in a gnome	
+						std::cout << sb[g * f * f + j * f + k] << "  ";
+					}
+					std::cout << std::endl;
+				}
+			}
+			std::cout << std::endl;
+		}
+
+		//Compute between class scatter sb (p x f x f) of all gnome features phe(tP x f)
+		void computeSb(float* sb, float* M, float* cM, size_t nC, std::vector<unsigned int> nPxInCls) {
+			float tempsbval;
+			size_t n1;
+			size_t n2;
+			size_t classIndx;										//class index in class mean matrix
+			/*std::cout <<"population of computation of cpusb "<< std::endl;
+			for (size_t i2 = 0; i2 < f; i2++) {
+				std::cout << P[i2] << "\t";
+			}*/
+
+			for (size_t gnomeIndx = 0; gnomeIndx < p; gnomeIndx++) {
+				for (size_t c = 0; c < nC; c++) {
+					for (size_t i = 0; i < f; i++) {
+						for (size_t j = 0; j < f; j++) {
+							tempsbval = 0;
+							classIndx = c * ub;
+							n1 = P[gnomeIndx * f + i];						//actual feature index in original feature matrix 
+							n2 = P[gnomeIndx * f + j];						//actual feature index in original feature matrix 
+						//	std::cout << "i: " << i << " j: " <<j<< " n1: " << n1 << " n2:" << n2 << std::endl;
+							tempsbval = ((cM[classIndx + n1] - M[n1]) *(cM[classIndx + n2] - M[n2]));							
+							sb[gnomeIndx * f * f + i * f + j] += tempsbval * (float)nPxInCls[c];  // compute tempsb[j][k] element of class 'c' of gnome 'i'
+						}
+					}
+				}
+			}
+
+		}
+
+		//Compute within class scatter sw (p x f x f) of all gnome features phe(tP x f)
+		void computeSw(float* sw, float* cM, size_t nC, size_t tP) {
+			float tempswval;
+			size_t n1;
+			size_t n2;
+			size_t cMclass;										//class index in class mean matrix
+			size_t Pg;
+			size_t swg;
+			size_t pheg;
+			for (size_t gnomeIndx = 0; gnomeIndx < p; gnomeIndx++) {
+				Pg = gnomeIndx * f;
+				swg = gnomeIndx * f * f;
+				pheg = gnomeIndx * tP * f;;
+				for (size_t c = 0; c < nC; c++) {
+					cMclass = c * ub;
+
+					for (size_t k = 0; k < tP; k++) {
+						if (T[k] == (c + 1)) {
+							for (size_t i = 0; i < f; i++) {
+								for (size_t j = 0; j < f; j++) {
+									n1 = P[Pg + i];						//actual feature index in original feature matrix 
+									n2 = P[Pg + j];						//actual feature index in original feature matrix 
+
+									tempswval = 0;
+									tempswval = ((F[k * ub + n1] - cM[cMclass + n1]) * (F[k * ub + n2] - cM[cMclass + n2]));
+									//tempswval = ((phe[gnomeIndx * tP * f + k * f + i] - cM[c * ub + P[gnomeIndx * f + i]]) * (phe[gnomeIndx * tP *f + k * f + j] - cM[c * ub + P[gnomeIndx * f + j]]));	
+									sw[gnomeIndx * f * f + i * f + j] += tempswval;
+								}
+							}
+						}
+					}
+				}
+
+			}
+		}
+		//checking bands with all zeros and replacing duplicated bands in gnome but this function is only for initial population
+		//void zerobandcheck(float* M, bool initial) {
+		//	for (size_t g = 0; g < p; g++) {										// for each gnome			
+		//		for (size_t i = 0; i < f; i++) {									//check each band (feature) index in that gnome
+		//			while (M[P[g * f + i]] == 0) {									//if mean of band is zero then replace band index in population
+		//				P[g * f + i] = rand() % ub + lb;
+		//			}
+		//		}
+		//		//checking for duplicats in a gnome
+		//		std::vector<unsigned int> gnome = getGnome(g);
+		//		std::vector<unsigned int> gnomeunique;
+		//		int flag = 0;							//flag will be set if gnome has duplicated band (feature) index
+		//		std::sort(gnome.begin(), gnome.end());	// 1 1 2 2 3 3 3 4 4 5 5 6 7 
+		//		std::unique_copy(gnome.begin(), gnome.end(), std::back_inserter(gnomeunique));		//keep only unique copies of indices and remove duplicate copies
+		//		if (gnomeunique.size()< gnome.size()) {
+		//			flag = 1;							//set flag for those if there are duplicated indices
+		//												//std::cout<<"gnome:["<<g<<"] "<<"\t duplications are "<< (gnome.size() - gnomeunique.size())<<std::endl;
+		//		}
+
+		//		//adding extra random feature indices to unique copy of gnome to achive gnome size  = f
+		//		if (gnomeunique.size() < f) {
+		//			for (size_t k = gnomeunique.size(); k < f; k++) {
+		//				unsigned int rnumber = rand() % ub + lb;
+		//				//check if this randomaly generated number is already present in that gnome or not
+		//				for (size_t j = 0; j < gnomeunique.size(); j++) {
+		//					if (gnomeunique.at(j) == rnumber) {				//if new index is duplicated copy of any of previous gnome element replace it with another random number
+		//						rnumber = rand() % ub + lb;
+		//						j = 0;										//set j = 0 to start checking of duplication of feature index from the first element of gnome
+		//					}
+		//				}
+		//				gnomeunique.push_back(rnumber);						//add feature index to gnomeunique 			
+		//			}
+		//		}
+		//		std::copy(gnomeunique.begin(), gnomeunique.end(), P + g * f);
+		//	}
+		//}
+
+		//checking bands with all zeros and replacing duplicated bands in gnome
+		void zerobandcheck(float* M, bool initialPop) {
+			size_t startgnome;
+			if (initialPop) {
+				startgnome = 0;    //for initial population check all gnomes 
+			}
+			else {
+				startgnome = 1;    //for next generations start gnome check after elite children offset 
+			}
+			for (size_t g = startgnome; g < p; g++) {							// for each gnome except 		
+				
+				for (size_t i = 0; i < f; i++) {						//check each band (feature) index in that gnome
+					while (M[P[g * f + i]] == 0) {						//if mean of band is zero then replace band index in population
+						P[g * f + i] = rand() % ub + lb;
+					}
+				}
+				//checking for duplicats in a gnome
+				std::vector<unsigned int> gnome = getGnome(g);			//get current gnome g from population matrix P
+				std::vector<unsigned int> gnomeunique;					//array to store only unique band indicies in a genome
+				int flag = 0;											//flag will be set if gnome has duplicated band (feature) index
+				std::sort(gnome.begin(), gnome.end());					//sort current gnome 
+				std::unique_copy(gnome.begin(), gnome.end(), std::back_inserter(gnomeunique));		//remove duplicat copies of band indices and keep only unique in a gnome
+				if (gnomeunique.size()< gnome.size()) {
+					flag = 1;							//set flag for those if there are duplicated indices
+					//std::cout<<"gnome:["<<g<<"] "<<"\t duplications are "<< (gnome.size() - gnomeunique.size())<<std::endl;
+				}
+
+				//adding extra random feature indices to unique copy of gnome to achive gnome size  = f
+				if (gnomeunique.size() < f) {
+					for (size_t k = gnomeunique.size(); k < f; k++) {
+						unsigned int rnumber = rand() % ub + lb;
+						//check if this randomaly generated number is already present in that gnome or not
+						for (size_t j = 0; j < gnomeunique.size(); j++) {
+							if (gnomeunique.at(j) == rnumber) {				//if new index is duplicated copy of any of previous gnome element replace it with another random number
+								rnumber = rand() % ub + lb;					//generate random number between upper bound and lower bound (ub. lb)
+								j = 0;										//set j = 0 to start checking of duplication of feature index from the first element of gnome
+							}
+						}
+						gnomeunique.push_back(rnumber);						//add feature index to gnomeunique 			
+					}
+				}
+
+				//diplay loop only if gnome has duplicated indices
+				//if(flag ==1){
+				//	std::cout<<"\n original gnome "<<g<<" are "<<std::endl;
+				//	for(int k = 0; k < gnome.size(); k++)
+				//		std::cout<<gnome[k]<<"\t";
+				//	std::cout<<"\n unique results in cpp for gnome "<<g<<" are "<<std::endl;
+				//	for(int k = 0; k < gnomeunique.size(); k++)
+				//		std::cout<<gnomeunique[k]<<"\t";
+				//}
+				std::copy(gnomeunique.begin(), gnomeunique.end(), P + g * f);  //copy new gnome without any duplicate band index at current gnome location
+			}
+		}
+
+		
+
+		//gpu calling functions
+		//gpu initialization (allocating space for all array on GPU)
+		void gpuInitializationfrommain(float* cpuM, float* cpuCM, std::vector<unsigned int>cpu_nPxInCls, size_t tP, size_t nC) {
+			//	call gpuInitialization(......) with all of the necessary parameters
+			gpuIntialization(&gpuP, p, f, &gpuCM, cpuCM, nC, ub, &gpuM, cpuM, &gpu_nPxInCls, &gpuSb, &gpuSw, &gpuF, F, &gpuT, T, tP, &cpu_nPxInCls[0]);
+			
+		}
+
+		//Computation of between class scatter and within class scatter in GPU
+		void gpu_computeSbSw(float* cpuSb, float* cpuSw, size_t nC, size_t tP, cudaDeviceProp props, size_t gen, bool debug, std::ofstream& profilefile) {
+			//calling function for SW and Sb computation and passing necessary arrays for computation
+     // std::cout<<"gpu function calling"<<std::endl;
+			gpucomputeSbSw(gpuP, P, p, f, gpuSb, cpuSb, gpuSw, cpuSw, gpuF, gpuT, gpuM, gpuCM, nC, tP, props, gen, gnrtn, ub, gpu_nPxInCls, profilefile);
+
+			//display computed Sb and Sw if debug is set 
+			if (debug) {
+				std::cout << "From GA-GPU class: gpu results of Sb sn Sw" << std::endl;
+				displayS(cpuSb, f);									//display Sb				
+				displayS(cpuSw, f);									//display Sw
+				std::cout << std::endl;
+			}
+		}
+
+		//call function to free gpu pointers
+		//free all gpu pointers
+		void gpu_Destroy() {
+			gpuDestroy(gpuP, gpuCM, gpuM, gpu_nPxInCls, gpuSb, gpuSw, gpuF, gpuT);
+		}
+
+		//Write a destructor here
+		~ga_gpu() {
+
+			if (F != NULL) 		std::free(F);		//not sure about this as it is only for 2nd constructor 
+			if (T != NULL)		std::free(T);		//same as above
+			if (P != NULL)		std::free(P);		//not sure about this as it is only for 2nd constructor 
+			if (S != NULL)		std::free(S);		//same as above
+														//if(i_guess!=NULL) 	std::free(i_guess);     //same as above
+														//HANDLE_ERROR(cudaDeviceReset());
+
+		}
+	};
+
+#endif
diff --git a/src/main.cpp b/src/main.cpp
new file mode 100644
index 0000000..5c09727
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,556 @@
+#include <iostream>
+
+//stim libraries
+#include <stim/envi/envi.h>
+#include <stim/image/image.h>
+#include <stim/ui/progressbar.h>
+#include <stim/parser/filename.h>
+#include <stim/parser/table.h>
+#include <stim/parser/arguments.h>
+//input arguments
+stim::arglist args;
+#include <fstream>
+#include <thread>
+#include <random>
+#include <vector>
+#include <math.h>
+#include <limits>
+
+#define NOMINMAX
+
+
+
+//GA
+#include "ga_gpu.h"
+#include "enviload.h"
+
+
+//envi input file and associated parameters
+stim::envi E;										//ENVI binary file object
+unsigned int B;										//shortcuts storing the spatial and spectral size of the ENVI image
+//mask and class information used for training
+//std::vector< stim::image<unsigned char> > C;		//2D array used to access each mask C[m][p], where m = mask# and p = pixel#
+std::vector<unsigned int> nP;						//array holds the number of pixels in each mask: nP[m] is the number of pixels in mask m
+size_t nC = 0;										//number of classes
+size_t tP = 0;										//total number of pixels in all masks: tP = nP[0] + nP[1] + ... + nP[nC]
+float* fea;
+
+//ga_gpu class object
+ga_gpu ga;
+bool debug;
+bool binaryClass;
+int binClassOne;
+
+//creating struct to pass to thread functions as it limits number of arguments to 3
+typedef struct {
+	float* S;
+	float* Sb;
+	float* Sw;
+	float* lda;
+}gnome;
+gnome gnom;
+
+
+void gpuComputeEignS( size_t g, size_t fea){	
+	//eigen value computation will return  r = (nC-1) eigen vectors so new projected data will have dimension of r rather than f
+	//  std::thread::id this_id = std::this_thread::get_id();
+	//	std::cout<<"thread id is "<< this_id<<std::endl;  
+	size_t f = fea;
+	//std::thread::id g = std::this_thread::get_id();	
+	float* LeftEigVectors_a = (float*) malloc(f * f * sizeof(float));			
+	float* gSw_a = (float*) malloc(f * f * sizeof(float));						//copy of between class scatter		
+	std::memcpy(gSw_a, &gnom.Sw[g * f * f], f * f *sizeof(float));
+	if(debug){
+		std::cout<<"From Eigen function: Sb and Sw "<<std::endl;
+		displayS(gSw_a, f);									//display Sb				
+		displayS(&gnom.Sb[g * f * f], f);									//display Sw
+		std::cout<<std::endl;
+	}	
+	
+	std::vector<unsigned int> features = ga.getGnome(g);
+	std::vector<unsigned int> featuresunique;
+	int flag = 0;
+	std::sort(features.begin(), features.end()); // 1 1 2 2 3 3 3 4 4 5 5 6 7 
+	std::unique_copy(features.begin(), features.end(), std::back_inserter(featuresunique));
+	if(featuresunique.size()< features.size()){
+		f = featuresunique.size();
+	}
+				
+	size_t r = nC-1;                                         //LDA projected dimension (limited to number of classes - 1  by rank)
+	if(r > f){
+		r = f;
+	}
+
+	int info;
+	float* EigenvaluesI_a = (float*)malloc(f * sizeof(float));
+	float* Eigenvalues_a = (float*)malloc(f * sizeof(float));
+	int *IPIV = (int*) malloc(sizeof(int) * f);
+	//computing inverse of matrix Sw
+	memset(IPIV, 0, f * sizeof(int));
+	LAPACKE_sgetrf(LAPACK_COL_MAJOR, (int)f, (int)f, gSw_a, (int)f, IPIV);						
+	// DGETRI computes the inverse of a matrix using the LU factorization computed by DGETRF.
+	LAPACKE_sgetri(LAPACK_COL_MAJOR, (int)f, gSw_a, (int)f, IPIV);
+	
+	float* gSbSw_a  = (float*)calloc(f * f, sizeof(float));
+	//mtxMul(gSbSw_a, gSw_a, &gnom.Sb[g * f * f * sizeof(float)], f, f, f,f);
+	mtxMul(gSbSw_a, gSw_a, &gnom.Sb[g * f * f], f, f, f,f);
+	if(debug){
+		std::cout<<"From Eigen function: inverse of sw and ratio of sb and sw (Sb/Sw)";
+		displayS(gSw_a, f);			//display inverse of Sw (1/Sw)				
+		displayS(gSbSw_a, f);		//display ratio of Sb and Sw (Sb/Sw)
+	}
+   
+   //compute left eigenvectors for current gnome from ratio of between class scatter and within class scatter:  Sb/Sw 
+	info = LAPACKE_sgeev(LAPACK_COL_MAJOR, 'V', 'N', (int)f, gSbSw_a, (int)f, Eigenvalues_a, EigenvaluesI_a, LeftEigVectors_a, (int)f, 0, (int)f);
+	//sort eignevalue indices in descending order							
+	size_t* sortedindx = sortIndx(Eigenvalues_a, f);
+	//displayS(LeftEigVectors_a, f);					//display Eignevectors (Note these are -1 * matlab eigenvectors does not change fitness score results but keep in mind while projecting data on it)		
+	//sorting left eigenvectors  (building forward transformation matrix As)
+	for (size_t rowE = 0; rowE < r; rowE++){
+		for (size_t colE = 0; colE < f; colE++){
+			size_t ind1 = g * r * f + rowE * f + colE;			
+			//size_t ind1 =  rowE * f + colE;
+			size_t ind2 = sortedindx[rowE] * f + colE;				//eigenvector as row vector			
+			gnom.lda[ind1] = LeftEigVectors_a[ind2];
+		}
+	}
+		
+	if(debug){
+		std::cout<<"Eigenvalues are"<<std::endl;
+		for(size_t n = 0 ; n < f; n ++){
+			std::cout << Eigenvalues_a[n] << ", " ;
+		}
+		std::cout<< std::endl;
+		std::cout<<"From Eigen function: Eignevector"<<std::endl;
+	  				
+		std::cout<<"LDA basis is "<<std::endl;
+		std::cout << "r is " << r << std::endl;
+		for(size_t l = 0 ; l < r; l++){
+			for(size_t n = 0 ; n < f; n ++){
+				std::cout << gnom.lda[g * l * f + l * f + n] << ", " ;
+			}
+			std::cout<<std::endl;
+		}
+			
+	}
+	//Extract only r eigne vectors as a LDA projection basis
+	float* tempgSb = (float*)calloc(r * f, sizeof(float));
+	//mtxMul(tempgSb, &gnom.lda[g * r * f * sizeof(float)], &gnom.Sb[g * f * f * sizeof(float)], r, f, f,f);
+	//mtxMul(tempgSb, &lda[g * r * f ], gSb, r, f, f,f);
+	mtxMul(tempgSb, &gnom.lda[g * r * f], &gnom.Sb[g * f * f], r, f, f,f);
+	float* nSb = (float*)calloc(r * r, sizeof(float));
+	mtxMultranspose(nSb, tempgSb, &gnom.lda[g * r * f], r, f, r, f);
+	
+	float* tempgSw = (float*)calloc(r * f, sizeof(float));
+	//mtxMul(tempgSw, &gnom.lda[g * r * f * sizeof(float)], &gnom.Sw[g * f * f * sizeof(float)], r, f, f,f);
+	mtxMul(tempgSw, &gnom.lda[g * r * f], &gnom.Sw[g * f * f], r, f, f,f);
+	float* nSw = (float*)calloc(r * r, sizeof(float));
+	mtxMultranspose(nSw, tempgSw, &gnom.lda[g * r * f], r, f, r, f);
+	if(debug){
+		std::cout<<"From Eigen function: projected Sb sn Sw"<<std::endl;
+		displayS(nSb, r);									//display Sb				
+		displayS(nSw, r);									//display Sw
+		std::cout<<std::endl;
+	}
+	
+	cv::Mat newSw = cv::Mat((int)r, (int)r, CV_32FC1, nSw);		//within scatter of gnome g in the population 
+	cv::Mat newSb = cv::Mat((int)r, (int)r, CV_32FC1, nSb);		//within scatter of gnome g in the population 
+			
+	//fisher's ratio from ratio of projected sb and sw
+	float fisherRatio = cv::determinant(newSb) /cv::determinant(newSw);
+	gnom.S[g] = 1/fisherRatio;
+	if (debug) {
+		std::cout<<"Score["<<g<<"]: "<< gnom.S[g]<<std::endl;
+	
+		std::cout << "best gnoem is " << std::endl;
+		for (size_t i = 0; i < f; i++)
+			std::cout << ga.P[ga.f * g + i] << ", ";
+		std::cout << std::endl;
+	}
+//	if(!isfinite(gnom.S[g])){
+//      std::cout<<"-----------------------------------------------"<<std::endl;
+//      std::cout<<"Displaying intermediate values of gnome for which score is non finite"<<std::endl;
+//     	std::cout<<"population "<<std::endl;
+//			for(int i = 0; i < ga.f; i++){
+//				std::cout<<"\t"<<ga.P[g * ga.f + i];
+//			}
+//      std::cout<<std::endl;
+//      std::cout<<"Sb determinant is "<<cv::determinant(newSb)<<"\t Sw determinant is "<<cv::determinant(newSb)<<std::endl;
+//     	std::cout<<"fisher ratio is "<<fisherRatio<<std::endl;
+//			std::cout<<"Score["<<g<<"]: "<< gnom.S[g]<<std::endl;	
+//      std::cout<<"------------------------------------------------"<<std::endl;
+//
+//	}
+		
+	
+	if(IPIV!= NULL) std::free(IPIV);	
+	if(gSw_a!= NULL) std::free(gSw_a);
+	if(gSbSw_a!= NULL) std::free(gSbSw_a);
+	if(Eigenvalues_a!= NULL) std::free(Eigenvalues_a);
+	if(EigenvaluesI_a!= NULL) std::free(EigenvaluesI_a);
+	if(tempgSb!= NULL) std::free(tempgSb);
+	if(tempgSw!= NULL) std::free(tempgSw);
+	
+}	
+
+	
+void fitnessFunction( float* sb, float* sw, float* lda, float* M, float* cM, size_t f, cudaDeviceProp props, size_t gen, std::ofstream& profilefile){  
+		
+		size_t tP = 0;																	//total number of pixels
+		std::for_each(nP.begin(), nP.end(), [&] (size_t n) {
+			tP += n;
+		});		
+ 		size_t nC = nP.size();													//total number of classes
+		
+		//--------------Compute between class scatter 		
+	//	ga.cpu_computeSbSw(sb, sw,  M,  cM, nC,  tP, nP, gen);		
+		//
+		//if(debug){
+		//	std::cout<<"cpu results of Sb sn Sw"<<std::endl;
+		//	displayS(sb, ga.f);									//display Sb				
+		//	displayS(sw, ga.f);									//display Sw
+		// }
+		
+		//ga.callingGpuComputeSbSw(sb, sw,  nC, tP, nP, props, gen);			
+		ga.gpu_computeSbSw(sb, sw, nC, tP, props, gen, debug, profilefile);
+		//ga.cpu_computeSbSw(sb, sw, M, cM, nC, tP, nP);
+
+		if(debug){
+			std::cout<<"From fitness function: gpu results of Sb sn Sw"<<std::endl;
+			displayS(sb, ga.f);									//display Sb				
+			displayS(sw, ga.f);									//display Sw
+			std::cout<<std::endl;
+		}				
+		
+	// -----------------------   Linear discriminant Analysis   --------------------------------------
+		//timer.start();	
+		//structure is created to pass variable to thread function as it accepts only 3 arguments 
+		gnom.S = ga.S;
+		gnom.Sw = sw;
+		gnom.Sb = sb;
+		gnom.lda = lda;
+
+		//calling function without using threads
+		for (size_t i = 0; i<ga.p; i++){			
+			//calling function for eigencomputation
+			gpuComputeEignS(i, f);
+			//std::cout<<"Score["<<i<<"]: "<< ga.S[i]<<std::endl;
+		}	
+	
+		//std::vector<std::thread> threads;
+		//for (size_t g = 0; g<ga.p; g++){
+		//	//creating thread to do eigen computation 
+		//	threads.push_back(std::thread(gpuComputeEignS, g, f));	
+		//}	
+		//
+		//// loop again to join the threads
+		//for (auto& t : threads)
+		//	t.join();
+
+		const auto elapsed1 = timer.time_elapsed();
+		if(gen > ga.gnrtn - 2){
+			std::cout << "gpu_eigen time "<<std::chrono::duration_cast<std::chrono::microseconds>(elapsed1).count() << "us" << std::endl;
+      profilefile<< "gpu_eigen time "<<std::chrono::duration_cast<std::chrono::microseconds>(elapsed1).count() << "us" << std::endl;
+		}
+		//ga.S = gnom.score;
+		//size_t bestGnomeIdx = ga.sortSIndx()[0];
+
+}//end of fitness function
+
+void binaryclassifier(int classnum){
+	unsigned int* target = (unsigned int*) calloc(tP, sizeof(unsigned int));
+	memcpy(target, ga.T, tP * sizeof(unsigned int));
+	for(int i = 0 ; i < tP; i++){
+		if(target[i]==classnum){
+			ga.T[i] = 1;
+			
+		}else
+			ga.T[i] = 0;
+	}
+}
+
+
+
+void advertisement() {
+	std::cout << std::endl;
+	std::cout << "=========================================================================" << std::endl;
+	std::cout << "Thank you for using the GA-GPU features selection for spectroscopic image!" << std::endl;
+	std::cout << "=========================================================================" << std::endl << std::endl;
+//	std::cout << args.str();
+}
+
+int main(int argc, char* argv[]){
+	
+//Add the argument options and set some of the default parameters
+	args.add("help", "print this help");
+	args.section("Genetic Algorithm");
+	args.add("features", "select features selection algorithm parameters","10", "number of features to be selected");
+	args.add("classes", "image masks used to specify classes", "", "class1.bmp class2.bmp class3.bmp");
+	args.add("population", "total number of feature subsets in puplation matrix", "1000");
+	args.add("generations", "number of generationsr", "50");
+	args.add("initial_guess", "initial guess of featues", "");
+	args.add("debug", "display intermediate data for debugging");
+	args.add("binary", "Select features for binary classes", "");
+	args.add("trim", "this gives wavenumber to use in trim option of siproc which trims all bands from envi file except gagpu selected bands");
+
+	args.parse(argc,argv);                  //parse the command line arguments
+
+//Print the help text if set
+	if(args["help"].is_set()){				//display the help text if requested
+		advertisement();
+		std::cout<<std::endl<<"usage: ga-gpu input output --option [A B C ...]"<<std::endl;
+		std::cout<<std::endl<<std::endl;
+		std::cout<<args.str()<<std::endl;
+		exit(1);
+	}
+	if (args.nargs() < 2) {										//if the user doesn't provide input and output files
+		std::cout << "ERROR: GA-GPU requires an input (ENVI) file and an output (features, text) file." << std::endl;
+		return 1;
+	}
+	if (args["classes"].nargs() < 2) {							//if the user doesn't specify at least two class images
+		std::cout << "ERROR: GA-GPU requires at least two class images to be specified using the --classes option" << std::endl;
+		return 1;
+	}
+
+	std::string outfile = args.arg(1);							//outfile is text file where bnad index, LDA-basis, wavelength and if --trim option is set then trim wavelengths are set respectively 
+  std::string profile_file = "profile_" + outfile ;
+  std::ofstream profilefile(profile_file.c_str(), std::ios::out);			//open outfstream for outfile
+	
+	time_t t_start = time(NULL);								//start a timer for file reading 	
+	E.open(args.arg(0), std::string(args.arg(0)) + ".hdr");		//open header file 		
+	size_t X = E.header.samples;								//total number of pixels in X dimension
+	size_t Y = E.header.lines;									//total number of pixels in Y dimension
+	B = (unsigned int)E.header.bands;							//total number of bands (features)
+	std::vector<double> wavelengths = E.header.wavelength;		//wavelengths of each band
+
+	if(E.header.interleave != stim::envi_header::BIP){			//this code can only load bip files and hence check that in header file
+		std::cout<<"this code works for only bip files. please convert file to bip file"<<std::endl;
+		exit(1);												//if file is not bip file exit code execution
+	}	
+	
+///--------------------------Load features--------------------------------------------- 
+	nP = ga_load_class_images(argc, args, &nC, &tP);			//load supervised class images
+	ga.F = load_features( nC, tP, B, E, nP);					//generate the feature matrix
+	ga.T = ga_load_responses(tP, nC, nP);						//load the responses for RF training
+	E.close();													//close the hyperspectral file
+	time_t t_end = time(NULL);
+	std::cout<<"Total time: "<<t_end - t_start<<" s"<<std::endl;	
+	//display_PixelfeatureNclass(ga.F, ga.T , B , 1);			//Print one value from feature matrix to debug feature loading
+	//std::cout << "pixel target is " << ga.T[0] << "  " << ga.T[1] << "  " << ga.T[tP - 2] << "  " << ga.T[tP - 1]<<std::endl;
+///--------------------------Genetic algorith configurations with defult paramets and from argument values---------------------
+	ga.f = args["features"].as_int(0);					//number of features to be selected by user default value is 10
+	ga.p = args["population"].as_int(0);				//population size to be selected by user default value is 1000
+	ga.gnrtn = args["generations"].as_int(0);			//number of generations to be selected by user default value is 50
+	if(args["binary"]) {						//set this option when features are to be selected as binary clas features (class vs stroma) 
+		binClassOne = args["binary"].as_int(0);			//sel class number here, if 2 then features are selected for (class-2 vs stroma) 
+		//feture selection for class selected by user with user arguments (make it binary class data by making chosen class label as 1 and al other class labels 0 from multiclass data )
+		//to select feature for all classes in joint class data using binary class system need to write a script with loop covering all classes
+		binaryclassifier(binClassOne);
+	}  ///not fully implemented yet
+
+	ga.ub = B;						//upper bound is number of bands (i.e. size of z dimension)  Note: for this particular application and way code is written lower bound is 0 and upper bound is size of z dimension
+	ga.uniformRate = 0.5;			//uniform rate is used in crossover
+	ga.mutationRate = 0.5f;		//in percentage for mutation operation on gnome
+	ga.tournamentSize = 5;			//for crossover best parents are selected from tournament of gnomes  
+	ga.elitism = true;				// if it is true then best gnome of current generation is passed to next generation
+	//initial guess of population
+	ga.i_guess = (unsigned int*) calloc(ga.f, sizeof(unsigned int));		
+	debug = args["debug"];				
+
+//==================Generate intial population =================================================
+	std::vector<unsigned int> i_guess(ga.f);
+	for (size_t b = 0; b < ga.f; b++)     //generate default initial guess 
+		i_guess[b] = rand() % B + 0;
+
+	if (args["initial_guess"].is_set()) {//if the user specifies the --initialguess option & provides feature indices as initial guess
+		size_t nf = args["initial_guess"].nargs();				//get the number of arguments after initial_guess
+		if (nf == 1 || nf == ga.f) {								//check if file with initial guessed indices is given or direct indices are given as argument
+			if (nf == 1) {		 //if initial guessed feature indices are given in file										
+				std::ifstream in;		//open the file containing the baseline points
+				in.open(args["initial_guess"].as_string().c_str());
+				if (in.is_open()){	//if file is present and can be opened then read it 
+					unsigned int b_ind;
+					while (in >> b_ind)  //get each band index and push it into the vector
+						i_guess.push_back(b_ind);					
+				}
+				else 
+					std::cout << "cannot open file of initial_guess indices" << std::endl;
+			}
+			else if (nf == ga.f) {	 //if direct indices are given as argument
+				for (size_t b = 0; b < nf; b++)								//for each band given by the user
+					i_guess[b] = args["initial_guess"].as_int(b);			//store that band in the i_guess array
+			}
+		}		
+	}
+
+	ga.initialize_population(i_guess, debug);      //initialize first population set for first generation, user can pass manually preferred features from command line 
+	//display_gnome(0);	
+
+//------------------Calculate class means and total mean of features----------------------------
+	float* M = (float*)calloc( B , sizeof(float));			//total mean of entire feature martix for all features (bands B)
+	ga.ttlMean(M, tP, B);									//calculate total mean, ga.F is entire feature matrix, M is mean for all bands B(features)
+	if(debug) ga.dispalymean(M);							//if option --debug is used display all bands mean
+	
+	//display band index of bands with mean zero, this indicates that band has all zero values 
+	std::cout<<"Display features indices with zero mean "<<std::endl;
+	for(unsigned int i = 0; i < B; i++){
+		if(M[i]== 0)
+			std::cout<<"\t"<<i;
+	}
+	std::cout<<std::endl;
+//	std::cout << "pixel target is " << ga.T[0] << "  " << ga.T[1] << "  " << ga.T[tP - 2] << "  " << ga.T[tP - 1]<<std::endl;
+	float* cM = (float*)calloc(nC * B , sizeof(float));		//cM is nC X B matrix with each row as mean of all samples in one class for all features (bands B)
+	ga.classMean(cM, tP, nC, B, nP);						//calculate class mean, ga.F is entire feature matrix, M is mean for all bands B(features)
+	if(debug) ga.dispalyClassmean(cM, nC);
+
+//------------------------------------GPU init----------------------------------------------------
+	//checking for cuda device 
+	int count;
+	HANDLE_ERROR(cudaGetDeviceCount(&count));
+	if(count < 1){
+		std::cout<<"no cuda device is available"<<std::endl;
+		return 1;
+	}
+	cudaDeviceProp props;
+	HANDLE_ERROR(cudaGetDeviceProperties(&props, 0));
+	ga.gpuInitializationfrommain(M, cM, nP, tP, nC);
+
+//feture selection for class selected by user with user arguments (make it binary class data by making chosen class label as 1 and al other class labels 0 from multiclass data )
+//to select feature for all classes in joint class data using binary class system need to write a script with loop covering all classes
+	//if(binaryClass){									
+	//	binaryclassifier(binClassOne);
+	//}
+
+//============================= GA evolution by generations ====================================================
+	std::vector<unsigned int> bestgnome;												//holds best gnome after each generation evaluation 
+	size_t bestG_Indx;																	//This gives index of best gnome in the current population to get best gnome and its fitness value
+	unsigned int* newPop = (unsigned int*) calloc(ga.p * ga.f, sizeof(unsigned int));   //temprory storage of new population
+	double* best_S = (double*) calloc (ga.gnrtn, sizeof(double));						//stores fitness value of best gnome at each iteration
+	float* lda = (float*) calloc (ga.p * (nC-1) * ga.f, sizeof(float));					//stores LDA basis for each gnome so that we can have best gnome's LDA basis 																	
+	float* sb = (float*) calloc( ga.p * ga.f * ga.f , sizeof(float)) ;					//3d matrix for between class scatter (each 2d matrix between class scatter for one gnome) 
+	float* sw = (float*) calloc( ga.p * ga.f * ga.f , sizeof(float)) ;					//3d matrix for within class scatter (each 2d matrix within class scatter for one gnome) 
+	ga.zerobandcheck(M, true);										//checking bands with all zeros and duplicated bands in a gnome replacing them with other bands avoiding duplication and zero mean 
+	ga.zerobandcheck(M, true);										//Repeating zeroband cheack as some of these bands are not replaced in previous run and gave random results
+	time_t gpu_timestart = time(NULL);					//start a timer for total evoluation 
+
+	for (size_t gen = 0; gen < ga.gnrtn; gen++){		//for each generation find fitness value of all gnomes in population matrix and generate population for next generation
+		//std::cout<<"Generation: "<<gen<<std::endl;
+		fitnessFunction(sb, sw, lda, M , cM, ga.f, props, gen, profilefile);		//Evaluate phe(feature matrix for current population) for fitness of all gnomes in current population 
+		timer.start();												//start timer for new population generation
+		bestG_Indx = ga.evolvePopulation(newPop, M, debug);			//evolve population to generate new generation population
+		const auto pop_generation = timer.time_elapsed(); 			// end timer for new population generation
+		if(gen >ga.gnrtn -2){
+			std::cout << "population evolution time "<<std::chrono::duration_cast<std::chrono::microseconds>(pop_generation).count() << "us" << std::endl;
+  			profilefile<<"population evolution time "<<std::chrono::duration_cast<std::chrono::microseconds>(pop_generation).count() << "us" <<std::endl;
+		}
+						
+		best_S[gen] = ga.S[bestG_Indx];								//score of best gnome in current generation 
+		bestgnome = ga.getGnome(bestG_Indx);						//Best gnome of current populaation																				
+		ga.generateNewP(newPop);									//replace current population with new populaiton in the ga classs object
+		ga.zerobandcheck(M, false);										//checking bands with all zeros and duplicated bands in a gnome replacing them with other bands avoiding duplication and zero mean 
+		ga.zerobandcheck(M, false);										//Repeating zeroband cheack as some of these bands are not replaced in previous run and gave random results
+	}//end generation	  
+	
+	time_t gpu_timeend = time(NULL);					//end a timer for total evoluation 
+	std::cout<<"Total gpu time: "<<gpu_timeend - gpu_timestart<<" s"<<std::endl;
+ 	profilefile<<"Total gpu time: "<<gpu_timeend - gpu_timestart<<" s"<<std::endl;
+
+//================================ Results of GA ===============================================================
+	std::cout<<"best gnome's fitness value is "<<best_S[ga.gnrtn-1]<<std::endl;	
+	std::cout<<"best gnome is:	";
+	for(size_t i = 0; i < ga.f; i++){
+		std::cout<<" "<<(bestgnome.at(i));
+   	}
+	std::cout<<std::endl;	
+
+	//create a text file to store the LDA stats (features subset and  LDA-basis)
+	////format of CSV file is: 1st row - band index, 2nd LDA basis depending on number of classes, 3rd - wavenumber corresponding to band index and it --trim is selected then trim wavnumbersare also given
+	std::ofstream csv(outfile.c_str(), std::ios::out);			//open outfstream for outfile
+	size_t ldaindx = bestG_Indx * (nC-1) * ga.f ;				//Compute LDA basis index of best gnome
+	
+	//if(binaryClass){											//this option is for binary class feature selection from joint classes but not fully implemented
+	//	csv<<binClassOne<<std::endl;
+	//}
+
+ //fitness values of best gnome is 
+	csv<<"best gnome's fitness value is "<<best_S[ga.gnrtn-1]<<std::endl;									//output fitness value of best gnome in last generation
+	//output gnome i.e. band index of selected featurs
+	csv<<(bestgnome.at(0));									//output feature subset
+	for(size_t i = 1; i < ga.f; i++)
+		csv<<","<<(bestgnome.at(i));
+	csv<<std::endl;
+
+	//output LDA basis of size r X f, r is nC - 1 as LDA projection is rank limited by number of classes - 1
+	for (size_t i = 0; i < nC-1; i++){						
+		csv<<lda[ldaindx + i * ga.f ];
+		for (size_t j = 1; j < ga.f; j++){
+			csv<<","<<lda[ldaindx + i * ga.f +j];
+		}
+		csv << std::endl;
+	}
+	//output actual wavelenths corresponding to those band indices 
+	csv << (wavelengths[bestgnome.at(0)]);
+	for (size_t i = 1; i < ga.f; i++)
+		csv << "," << (wavelengths[bestgnome.at(i)]);
+	csv << std::endl;
+
+
+	if (args["trim"].is_set()) {
+		csv << "trim info" << std::endl;
+		std::sort(bestgnome.begin(), bestgnome.end());			//sort features index in best gnome 
+
+		std::vector<unsigned int> trimindex(ga.f);				//create a vector to store temprory trim index bounds 
+		std::vector<unsigned int> finaltrim_ind;				//create a vector to store final trim index bounds 
+		std::vector<unsigned int> trim_wv;						//create a vector to store final trim wavelength bounds 
+
+		//trim index 
+		trimindex.push_back(1);									//1st trimming band index is 1
+		for (size_t i = 0; i < ga.f; i++) {						// for each feature find its bound indexes 
+			trimindex[i * 2] = bestgnome.at(i) - 1;		
+			trimindex[i * 2 + 1] = bestgnome.at(i) + 1;			
+		}
+		trimindex.push_back(B);								    //last bound index is B 		
+
+		//organize trim index
+		int k = 0;
+		for (size_t i = 0; i < ga.f + 1; i++) {						// find valid pair of trim indices bound excluding adjacent trim indices
+			if (trimindex[2 * i] < trimindex[2 * i + 1]) {
+				finaltrim_ind.push_back(trimindex[2 * i]);			//this is left bound
+				finaltrim_ind.push_back(trimindex[2 * i + 1]);
+				k = k + 2;
+			}
+		}
+		//add duplicated trim indices as single index to final trim index
+		for (size_t i = 0; i < ga.f + 1; i++) {						//check each pair of trim indices for duplications	
+			if (trimindex[2 * i] == trimindex[2 * i + 1]) {			// (duplication caused due to adjacent features)
+				finaltrim_ind.push_back(trimindex[2 * i]);			// remove duplicated trim indices replace by one
+				k = k + 1;
+			}
+		}
+
+
+		////output actual wavelenths corresponding to those trim indices 
+		////these wavenumber are grouped in pairs, check each pair, if duplicated numbers are there in pair delete one and keep other as band to trim, if 2nd wavnumber is smaller than 1st in a pair ignore that pair
+		////e.g [1, 228, 230, 230, 232, 350,352, 351, 353, 1200] pairas [1(start)-228,230-230, 232-350, 352-351, 353-1200(end)], trimming wavenumbers are [1-228, 230, 233-350, 353-1200]
+		csv << (wavelengths[finaltrim_ind.at(0)]);
+		for (size_t i = 1; i < ga.f * + 2 ; i++)
+			csv << "," << (wavelengths[finaltrim_ind.at(i)]);
+		csv << std::endl;
+	} //end trim option
+
+
+  //free gpu pointers
+  ga.gpu_Destroy();
+
+	//free pointers
+	std::free(sb);
+	std::free(sw);
+	std::free(M);
+	std::free(cM);
+	std::free(best_S);
+	std::free(lda);
+	std::free(newPop);	
+	
+}//end main
+
+
diff --git a/src/timer.h b/src/timer.h
new file mode 100644
index 0000000..a482858
--- /dev/null
+++ b/src/timer.h
@@ -0,0 +1,37 @@
+#ifndef GA_GPU_TIMER_H
+#define GA_GPU_TIMER_H
+
+#include <chrono>
+//#include <thread>
+//#include <iostream>
+
+//using namespace std::chrono;
+
+class Timer {
+typedef std::chrono::high_resolution_clock Clock;
+ 
+Clock::time_point epoch;
+public:
+  void start(){
+	  epoch = Clock::now();
+  }
+  Clock::duration time_elapsed() const{
+	  return Clock::now() - epoch;
+  }
+};
+
+#endif
+
+
+//class Timer {
+//	std::chrono::time_point<std::chrono::high_resolution_clock> epoch;
+//
+//	public:
+//	typedef high_resolution_clock Clock;
+//	void start(){
+//		epoch = Clock::now();
+//	}
+//	std::chrono::time_point<std::chrono::high_resolution_clock> time_elapsed() const {
+//		return Clock::now() - epoch;
+//	}
+//};
\ No newline at end of file
--
libgit2 0.21.4