Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib into Graph

Pavel Govyadinov
2 parents 8823488b 9b563709
Showing 51 changed files with 5615 additions and 339 deletions Show diff stats
cmake/FindFANN.cmake
cmake/FindGLEW.cmake
cmake/FindGLUT.cmake
cmake/FindSTIM.cmake
matlab/bsq2tensorflow.m
matlab/enviLoadRaw.m
matlab/enviSaveRaw.m
matlab/readspe.m
matlab/spe2envi.m
matlab/brewermap.m → matlab/stimBrewerMap.m
matlab/stimLoadAgilent.m
matlab/stimROC.m
python/enviProcess.py
stim/biomodels/cellset.h
stim/biomodels/network.h
stim/biomodels/network_dep.h
stim/cuda/cudatools/error.h
stim/envi/agilent_binary.h
stim/envi/bil.h
stim/envi/bip.h
+#
+# Windows users: define the FANN_PATH environment variable to point
+# to the directory containing:
+#   include/fann.h
+#   lib/*fann.lib
+
+
+#  FANN_FOUND - system has fann
+#  FANN_INCLUDE_DIRS - the fann include directory
+#  FANN_LIBRARIES - Link these to use fann
+#  FANN_DEFINITIONS - Compiler switches required for using fann
+#
+
+# Locate FANN unless both result variables were already supplied by the caller
+# (or by an earlier run of this module).
+if(FANN_LIBRARIES AND FANN_INCLUDE_DIRS)
+  set(FANN_FOUND TRUE)
+else()
+  # Header search: $ENV{FANN_PATH} and ${FANN_DIR} are tried as hints next to
+  # a few conventional Unix prefixes.
+  find_path(FANN_INCLUDE_DIR
+    NAMES
+      fann.h
+    PATHS
+      $ENV{FANN_PATH}/include
+      ${FANN_DIR}/include
+      /usr/include
+      /usr/local/include
+      /opt/local/include
+      /sw/include
+  )
+
+  set( _libraries fann doublefann fixedfann floatfann )
+
+  # Every variant gets its own cache entry (FANN_LIBRARY, DOUBLEFANN_LIBRARY,
+  # FIXEDFANN_LIBRARY, FLOATFANN_LIBRARY). Only variants that were actually
+  # located are collected, so "<name>-NOTFOUND" placeholders never reach
+  # FANN_LIBRARIES or the link line.
+  set( _fann_found_libraries )
+  foreach( _lib ${_libraries} )
+    string( TOUPPER ${_lib} _name )
+
+    find_library(${_name}_LIBRARY
+      NAMES
+        ${_lib}
+      PATHS
+        $ENV{FANN_PATH}/lib
+        ${FANN_DIR}/lib
+        /usr/lib
+        /usr/local/lib
+        /opt/local/lib
+        /sw/lib
+      )
+
+    if( ${_name}_LIBRARY )
+      list( APPEND _fann_found_libraries ${${_name}_LIBRARY} )
+    endif()
+
+    # These are the real cache entries created by this module.
+    mark_as_advanced( ${_name}_LIBRARY )
+
+  endforeach()
+
+
+  set(FANN_INCLUDE_DIRS
+    ${FANN_INCLUDE_DIR}
+  )
+
+  set(FANN_LIBRARIES
+    ${FANN_LIBRARIES}
+    ${_fann_found_libraries}
+  )
+
+  # The math library is only a companion of FANN. Appending it when no FANN
+  # library was found would make the list non-empty and fake a detection.
+  if( UNIX AND FANN_LIBRARIES )
+    set( FANN_LIBRARIES ${FANN_LIBRARIES} m )
+  endif()
+
+  if(FANN_INCLUDE_DIRS AND FANN_LIBRARIES)
+     set(FANN_FOUND TRUE)
+  else()
+     set(FANN_FOUND FALSE)
+  endif()
+
+  if(FANN_FOUND)
+    if(NOT FANN_FIND_QUIETLY)
+      message(STATUS "Found FANN:")
+      message(STATUS "FANN_INCLUDE_DIRS: ${FANN_INCLUDE_DIRS}")
+      message(STATUS "FANN_LIBRARIES: ${FANN_LIBRARIES}")
+    endif()
+  else()
+    if(FANN_FIND_REQUIRED)
+      message(FATAL_ERROR "Could not find FANN")
+    endif()
+  endif()
+
+  mark_as_advanced(FANN_INCLUDE_DIR)
+  unset(_fann_found_libraries)
+endif()
+# Copyright (c) 2012-2016 DreamWorks Animation LLC
+#
+# All rights reserved. This software is distributed under the
+# Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+#
+# Redistributions of source code must retain the above copyright
+# and license notice and the following restrictions and disclaimer.
+#
+# *     Neither the name of DreamWorks Animation nor the names of
+# its contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+# LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+#
+
+#-*-cmake-*-
+# - Find GLEW
+#
+# Author : Nicholas Yue yue.nicholas@gmail.com
+#
+# This auxiliary CMake file helps in finding the GLEW headers and libraries
+#
+# GLEW_FOUND            set if Glew is found.
+# GLEW_INCLUDE_DIR      GLEW's include directory
+# GLEW_GLEW_LIBRARY        GLEW library
+# GLEW_GLEWmx_LIBRARY      GLEWmx library (Multiple Rendering Contexts)
+
+find_package(PackageHandleStandardArgs)
+
+# GLEW is only looked for beneath $ENV{GLEW_ROOT}; default and system
+# environment search paths are switched off.
+find_path(GLEW_LOCATION include/GL/glew.h
+  "$ENV{GLEW_ROOT}"
+  NO_DEFAULT_PATH
+  NO_SYSTEM_ENVIRONMENT_PATH
+  )
+
+find_package_handle_standard_args(GLEW
+  REQUIRED_VARS GLEW_LOCATION
+  )
+
+if(GLEW_LOCATION)
+
+  set(GLEW_INCLUDE_DIR "${GLEW_LOCATION}/include" CACHE STRING "GLEW include path")
+
+  # The suffix list is overridden per platform below and restored afterwards.
+  set(ORIGINAL_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+
+  if(GLEW_USE_STATIC_LIBS)
+    if(APPLE OR NOT WIN32)
+      # macOS and every non-Windows platform: static archives under lib/.
+      set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
+      find_library(GLEW_LIBRARY_PATH GLEW PATHS ${GLEW_LOCATION}/lib
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+      find_library(GLEWmx_LIBRARY_PATH GLEWmx PATHS ${GLEW_LOCATION}/lib
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+    else()
+      # Windows: static link libraries.
+      set(CMAKE_FIND_LIBRARY_SUFFIXES ".lib")
+      find_library(GLEW_LIBRARY_PATH GLEW32S PATHS ${GLEW_LOCATION}/lib)
+      find_library(GLEWmx_LIBRARY_PATH GLEW32MXS PATHS ${GLEW_LOCATION}/lib)
+    endif()
+  else()
+    if(APPLE)
+      set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib")
+      find_library(GLEW_LIBRARY_PATH GLEW PATHS ${GLEW_LOCATION}/lib)
+      find_library(GLEWmx_LIBRARY_PATH GLEWmx PATHS ${GLEW_LOCATION}/lib)
+    elseif(WIN32)
+      # Import libraries used at link time.
+      set(CMAKE_FIND_LIBRARY_SUFFIXES ".lib")
+      find_library(GLEW_LIBRARY_PATH GLEW32 PATHS ${GLEW_LOCATION}/lib)
+      find_library(GLEWmx_LIBRARY_PATH GLEW32mx PATHS ${GLEW_LOCATION}/lib)
+      # DLLs loaded at run time, searched under bin/ only.
+      set(CMAKE_FIND_LIBRARY_SUFFIXES ".dll")
+      find_library(GLEW_DLL_PATH GLEW32 PATHS ${GLEW_LOCATION}/bin
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+      find_library(GLEWmx_DLL_PATH GLEW32mx PATHS ${GLEW_LOCATION}/bin
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+    else()
+      # Other Unix systems: keep the default suffix list.
+      find_library(GLEW_LIBRARY_PATH GLEW PATHS ${GLEW_LOCATION}/lib
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+      find_library(GLEWmx_LIBRARY_PATH GLEWmx PATHS ${GLEW_LOCATION}/lib
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+    endif()
+  endif()
+
+  # Restore the suffix list for any find_library() call made after this module.
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ${ORIGINAL_CMAKE_FIND_LIBRARY_SUFFIXES})
+
+  set(GLEW_GLEW_LIBRARY ${GLEW_LIBRARY_PATH} CACHE STRING "GLEW library")
+  set(GLEW_GLEWmx_LIBRARY ${GLEWmx_LIBRARY_PATH} CACHE STRING "GLEWmx library")
+
+endif()
+#.rst:
+# FindGLUT
+# --------
+#
+# try to find glut library and include files.
+#
+# IMPORTED Targets
+# ^^^^^^^^^^^^^^^^
+#
+# This module defines the :prop_tgt:`IMPORTED` targets:
+#
+# ``GLUT::GLUT``
+#  Defined if the system has GLUT.
+#
+# Result Variables
+# ^^^^^^^^^^^^^^^^
+#
+# This module sets the following variables:
+#
+# ::
+#
+#   GLUT_INCLUDE_DIR, where to find GL/glut.h, etc.
+#   GLUT_LIBRARIES, the libraries to link against
+#   GLUT_FOUND, If false, do not try to use GLUT.
+#
+# Also defined, but not for general use are:
+#
+# ::
+#
+#   GLUT_glut_LIBRARY = the full path to the glut library.
+#   GLUT_Xmu_LIBRARY  = the full path to the Xmu library.
+#   GLUT_Xi_LIBRARY   = the full path to the Xi Library.
+
+#=============================================================================
+# Copyright 2001-2009 Kitware, Inc.
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+if(WIN32)
+  # Windows: headers and libraries are looked up relative to $ENV{GLUT_ROOT_PATH}.
+  find_path(GLUT_INCLUDE_DIR
+    NAMES GL/glut.h
+    PATHS $ENV{GLUT_ROOT_PATH}/include
+  )
+
+  if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
+    find_library(GLUT_glut_LIBRARY
+      NAMES glut glut32 freeglut
+      PATHS
+        ${OPENGL_LIBRARY_DIR}
+        $ENV{GLUT_ROOT_PATH}/lib
+    )
+  else()
+    # 64-bit build: only freeglut from the lib/x64 directory is accepted.
+    find_library(GLUT_glut_LIBRARY
+      NAMES freeglut
+      PATHS $ENV{GLUT_ROOT_PATH}/lib/x64
+      NO_DEFAULT_PATH
+    )
+  endif()
+
+elseif(APPLE)
+
+  find_path(GLUT_INCLUDE_DIR NAMES glut.h PATHS ${OPENGL_LIBRARY_DIR})
+  find_library(GLUT_glut_LIBRARY GLUT DOC "GLUT library for OSX")
+  find_library(GLUT_cocoa_LIBRARY Cocoa DOC "Cocoa framework for OSX")
+
+  if(GLUT_cocoa_LIBRARY AND NOT TARGET GLUT::Cocoa)
+    add_library(GLUT::Cocoa UNKNOWN IMPORTED)
+    # For a framework path the imported location is the binary inside the
+    # bundle; otherwise the found path is used directly.
+    if(GLUT_cocoa_LIBRARY MATCHES "/([^/]+)\\.framework$")
+      set_target_properties(GLUT::Cocoa PROPERTIES
+        IMPORTED_LOCATION "${GLUT_cocoa_LIBRARY}/${CMAKE_MATCH_1}")
+    else()
+      set_target_properties(GLUT::Cocoa PROPERTIES
+        IMPORTED_LOCATION "${GLUT_cocoa_LIBRARY}")
+    endif()
+  endif()
+
+else()
+
+  if(BEOS)
+    # BeOS only contributes extra search directories to the lookups below.
+    set(_GLUT_INC_DIR /boot/develop/headers/os/opengl)
+    set(_GLUT_glut_LIB_DIR /boot/develop/lib/x86)
+  else()
+    find_library(GLUT_Xi_LIBRARY NAMES Xi PATHS /usr/openwin/lib)
+    find_library(GLUT_Xmu_LIBRARY NAMES Xmu PATHS /usr/openwin/lib)
+
+    if(GLUT_Xi_LIBRARY AND NOT TARGET GLUT::Xi)
+      add_library(GLUT::Xi UNKNOWN IMPORTED)
+      set_target_properties(GLUT::Xi PROPERTIES
+        IMPORTED_LOCATION "${GLUT_Xi_LIBRARY}")
+    endif()
+
+    if(GLUT_Xmu_LIBRARY AND NOT TARGET GLUT::Xmu)
+      add_library(GLUT::Xmu UNKNOWN IMPORTED)
+      set_target_properties(GLUT::Xmu PROPERTIES
+        IMPORTED_LOCATION "${GLUT_Xmu_LIBRARY}")
+    endif()
+  endif()
+
+  find_path(GLUT_INCLUDE_DIR
+    NAMES GL/glut.h
+    PATHS
+      /usr/include/GL
+      /usr/openwin/share/include
+      /usr/openwin/include
+      /opt/graphics/OpenGL/include
+      /opt/graphics/OpenGL/contrib/libglut
+      ${_GLUT_INC_DIR}
+  )
+
+  find_library(GLUT_glut_LIBRARY
+    NAMES glut
+    PATHS
+      /usr/openwin/lib
+      ${_GLUT_glut_LIB_DIR}
+  )
+
+  # Drop the temporary search hints.
+  unset(_GLUT_INC_DIR)
+  unset(_GLUT_glut_LIB_DIR)
+
+endif()
+
+# find_package_handle_standard_args() is provided by this module. Include it
+# explicitly so FindGLUT works regardless of which find modules ran before it.
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(GLUT REQUIRED_VARS GLUT_glut_LIBRARY GLUT_INCLUDE_DIR)
+
+if (GLUT_FOUND)
+  # Is -lXi and -lXmu required on all platforms that have it?
+  # If not, we need some way to figure out what platform we are on.
+  # Variables that were never searched for on this platform expand to nothing.
+  set( GLUT_LIBRARIES
+    ${GLUT_glut_LIBRARY}
+    ${GLUT_Xmu_LIBRARY}
+    ${GLUT_Xi_LIBRARY}
+    ${GLUT_cocoa_LIBRARY}
+    )
+
+  # Define the GLUT::GLUT imported target once; repeated inclusion is a no-op.
+  if(NOT TARGET GLUT::GLUT)
+    add_library(GLUT::GLUT UNKNOWN IMPORTED)
+    set_target_properties(GLUT::GLUT PROPERTIES
+      INTERFACE_INCLUDE_DIRECTORIES "${GLUT_INCLUDE_DIR}")
+    # For a framework path the imported location is the binary inside the bundle.
+    if(GLUT_glut_LIBRARY MATCHES "/([^/]+)\\.framework$")
+      set_target_properties(GLUT::GLUT PROPERTIES
+        IMPORTED_LOCATION "${GLUT_glut_LIBRARY}/${CMAKE_MATCH_1}")
+    else()
+      set_target_properties(GLUT::GLUT PROPERTIES
+        IMPORTED_LOCATION "${GLUT_glut_LIBRARY}")
+    endif()
+
+    # Attach whichever companion targets were created during the search.
+    if(TARGET GLUT::Xmu)
+      set_property(TARGET GLUT::GLUT APPEND
+        PROPERTY INTERFACE_LINK_LIBRARIES GLUT::Xmu)
+    endif()
+
+    if(TARGET GLUT::Xi)
+      set_property(TARGET GLUT::GLUT APPEND
+        PROPERTY INTERFACE_LINK_LIBRARIES GLUT::Xi)
+    endif()
+
+    if(TARGET GLUT::Cocoa)
+      set_property(TARGET GLUT::GLUT APPEND
+        PROPERTY INTERFACE_LINK_LIBRARIES GLUT::Cocoa)
+    endif()
+  endif()
+
+  #The following deprecated settings are for backwards compatibility with CMake1.4
+  set (GLUT_LIBRARY ${GLUT_LIBRARIES})
+  set (GLUT_INCLUDE_PATH ${GLUT_INCLUDE_DIR})
+endif()
+
+mark_as_advanced(
+  GLUT_INCLUDE_DIR
+  GLUT_glut_LIBRARY
+  GLUT_Xmu_LIBRARY
+  GLUT_Xi_LIBRARY
+  )
-include(FindPackageHandleStandardArgs)
-
-set(STIM_INCLUDE_DIR $ENV{STIMLIB_PATH})
-
-find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR)
-
-if(STIM_FOUND)
-    set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIR})
-endif()
 \ No newline at end of file
+# Finds the STIM library headers.
+# Set the STIMLIB_PATH environment variable to the directory containing the
+# stim subdirectory (the root of the stimlib repository).
+#
+# Result variables:
+#   STIM_FOUND         - set by find_package_handle_standard_args from STIM_INCLUDE_DIR
+#   STIM_INCLUDE_DIR   - value of the STIMLIB_PATH environment variable
+#   STIM_INCLUDE_DIRS  - same path, set only when STIM_FOUND is true
+
+include(FindPackageHandleStandardArgs)
+
+set(STIM_INCLUDE_DIR "$ENV{STIMLIB_PATH}")
+
+# Print the hint before the standard handler runs: with REQUIRED the handler
+# stops configuration, so a hint placed after it would never be shown.
+if(NOT STIM_INCLUDE_DIR AND NOT STIM_FIND_QUIETLY)
+	message("STIM library not found. Set the STIMLIB_PATH environment variable to the STIMLIB location.")
+	message("STIMLIB can be found here: https://git.stim.ee.uh.edu/codebase/stimlib")
+endif()
+
+find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR)
+
+if(STIM_FOUND)
+    set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIR})
+endif()
+function T = bsq2tensorflow(I, n)
+%BSQ2TENSORFLOW  Split an image into n tiles along Y, one flattened tile per column.
+%   T = bsq2tensorflow(I, n) cuts I into n consecutive slabs along its
+%   second dimension and stores each slab, flattened, as one column of T.
+
+    nx = size(I, 1);
+    tileWidth = size(I, 2) / n;     %extent of one tile along Y
+    nb = size(I, 3);
+
+    %one column per tile, one row per tile element
+    T = zeros(nx * tileWidth * nb, n);
+    for k = 1:n
+        firstCol = (k - 1) * tileWidth + 1;
+        lastCol  = (k - 1) * tileWidth + tileWidth;
+        tile = I(:, firstCol:lastCol, :);
+        T(:, k) = tile(:);
+    end
+end
+        
+        
 \ No newline at end of file
 %loads an ENVI file without any manipulation (changing orientation)
+% enviLoadRaw(filename, headername)
 function M = enviLoadRaw(filename, headername)
     %if a header isn't provided, assume it's just the filename
 %saves an ENVI file without any manipulation, assumes (X, Y, S)
+% enviSaveRaw(M, filename, headername)
 function enviSaveRaw(M, filename, headername)
     %if a header isn't provided, assume it's just the filename
+% Read images of TIFF, SPE2.2(WinSpec) and SPE3.0(Lightfield)
+% Version: JTL Jun-9-2016
+% ----------------- READ THIS FIRST !!!!! --------------------------------
+% Change the file name to "readspe" before use
+% Example:
+%   Z = readspe(filename)
+%   Z = readspe(filename,'info')
+%   Z = readspe(filename,frame_index)
+%   Z = readspe(filename,frame_index,'info')
+% Input:
+%   filename - filename string, e.g. 'image.spe'
+%   frame_index    - frame index, start from 1
+%              If you have multiple frames, use a "for" loop
+%   'info'   - flag to show file info, i.e. dimension, number of frames, version 
+% Output:
+%   Z        - UINT16 image (convert to double if you need)
+% ------------------------------------------------------------------------
+%        Z = readspe (filename,frame_index,'info')
+function Z = readspe (filename,varargin) 
+%READSPE  Read one frame from a TIFF or SPE image file.
+%   Z = readspe(filename)                    reads the first frame
+%   Z = readspe(filename, frame_index)       reads the given frame (1-based)
+%   Z = readspe(..., 'info')                 also prints size / frame count / version
+%
+%   If the file does not exist a message is displayed and Z is returned empty.
+%   Unsupported extensions or SPE data-type codes raise an error.
+
+% the 'file' option keeps variables or functions sharing the name from matching
+if exist(filename, 'file') ~= 2
+    display('File does not exist!');
+    Z = [];     % defined (empty) result instead of an unassigned output
+    return;
+end
+
+Nfr = 1; % default read first frame
+if nargin > 1 && isnumeric(varargin{1})
+    Nfr = varargin{1};
+end
+
+% strcmp is safe for any argument type or length; '==' compares element-wise
+% and errors when the two strings differ in length
+showInfo = nargin > 1 && ischar(varargin{end}) && strcmp(varargin{end}, 'info');
+
+[~,~,ext] = fileparts(filename);
+switch upper(ext)
+    case {'.TIF', '.TIFF'}
+        file_ver = 'TIFF';
+        Z  = imread(filename);
+        [Y,X] = size(Z);
+
+    case '.SPE'
+        fid = fopen(filename);
+        if fid == -1
+            error('readspe:openFailed', 'Unable to open %s', filename);
+        end
+        I = fread(fid,Inf,'uint8');
+        fclose(fid);    % whole file is in memory; close before parsing can fail
+
+        % header fields are decoded from fixed (1-based) byte positions
+        X = double(typecast(uint8(I(43:44)),'uint16'));
+        Y = double(typecast(uint8(I(657:658)),'uint16'));
+        fr = typecast(uint8(I(1447:1450)),'int32');
+        spe_ver = typecast(uint8(I(1993:1996)),'single');
+        file_ver = ['SPE ' num2str(spe_ver)];
+        datatypeN = typecast(uint8(I(109:110)),'int16');
+        switch datatypeN
+            case 0 % 32-bit float
+                datatype = 'single'; datalength = 4; 
+            case 1 % 32-bit signed integer
+                datatype = 'int32'; datalength = 4;
+            case 2 % 16-bit signed integer
+                datatype = 'int16'; datalength = 2;
+            case 3 % 16-bit unsigned integer
+                datatype = 'uint16'; datalength = 2;
+            case 8 % 32-bit unsigned integer
+                datatype = 'uint32'; datalength = 4;
+            otherwise
+                error('readspe:unknownDataType', ...
+                      'Unsupported SPE data type code %d', datatypeN);
+        end 
+
+        % pixel data starts at byte 4101; frames are stored back to back
+        A = I(4101+X*Y*datalength*(Nfr-1):4100+X*Y*datalength*Nfr);
+        B = typecast(uint8(A),datatype); % important
+        Z = reshape(B,X,Y);
+        Z = Z';
+
+    otherwise
+        error('readspe:unknownFormat', 'Unsupported file extension "%s"', ext);
+end
+
+if showInfo
+    display(['X = ' num2str(X)]);
+    display(['Y = ' num2str(Y)]);
+    if(exist('fr','var'));display(['Number of Frames: ' num2str(fr)]);end;
+    display(['File version: ' file_ver]);
+end
 \ No newline at end of file
+function spe2envi(filemask, outfile)
+%SPE2ENVI  Assemble a set of SPE line images into one ENVI cube.
+%   spe2envi(filemask, outfile) reads every file matching filemask (sorted
+%   by the file date reported by dir), stacks them along Y and saves the
+%   result as single precision through enviSaveRaw with header [outfile '.hdr'].
+
+    %dir() returns bare file names, so remember the folder they live in
+    if exist(filemask, 'dir') == 7
+        folder = filemask;
+    else
+        folder = fileparts(filemask);
+    end
+
+    filelist = dir(filemask);
+    filelist = filelist(~[filelist.isdir]);     %ignore sub-directories such as . and ..
+
+    if isempty(filelist)
+        error('spe2envi:noFiles', 'No files match "%s"', filemask);
+    end
+
+    %get a list of date numbers
+    datenums = [filelist.datenum];
+
+    %sort the file order based on acquisition time
+    [~, id] = sort(datenums);
+
+    %get the number of files
+    Y = length(id);                 %size of the image along Y
+
+    %load the first file to determine the spectral and X-axis size
+    temp = readspe(fullfile(folder, filelist(1).name));
+    X = size(temp, 1);              %size of the image along X
+    B = size(temp, 2);              %number of bands in the image
+
+    %create the cube
+    I = zeros(X, Y, B);
+
+    %for each line
+    for y = 1:Y
+
+        %read a SPE file
+        img = readspe(fullfile(folder, filelist(id(y)).name));
+
+        %place the (X x B) image at line y of the (X x Y x B) cube
+        I(:, y, :) = permute(img, [1 3 2]);
+    end
+
+    enviSaveRaw(single(I), outfile, [outfile '.hdr']);
+
+    
+
+%Loads a standard Agilent ResPro binary file
+%   S = stimLoadAgilent(filename)
+%   Returns an X-by-Y-by-Z array read as float32 values; X, Y and Z are
+%   taken from the file header. Raises an error if the file cannot be opened.
+function S = stimLoadAgilent(filename)
+
+    fid = fopen(filename);
+    if fid == -1
+        error('stimLoadAgilent:openFailed', 'Unable to open %s', filename);
+    end
+    closer = onCleanup(@() fclose(fid));    %close the file even if a read below fails
+
+    %header: Z is stored at byte offset 9; X and Y follow 13 bytes after Z
+    fseek(fid, 9, 'bof');
+    Z = fread(fid, 1, 'uint16');
+    fseek(fid, 13, 'cof');
+    X = fread(fid, 1, 'uint16');
+    Y = fread(fid, 1, 'uint16');
+    
+    %the data section is read from byte offset 1020
+    fseek(fid, 1020, 'bof');
+    
+    S = reshape(fread(fid, [X, Y * Z], 'float32'), [X, Y, Z]);
+    
+    
 \ No newline at end of file
+function [TPR, FPR, AUC] = stimROC(C, T)
+%build an ROC curve
+%   C - class labels as an array of binary values (1 = true positive)
+%   T - threshold (score) associated with each label, same number of elements as C
+%
+%   TPR, FPR - 1-by-M true / false positive rates after each of the M
+%              observations, taken in order of descending T
+%   AUC      - area under the resulting ROC curve
+%
+%   An error is raised if C contains only positives or only negatives,
+%   because one of the two rates would be undefined.
+
+    %calculate the number of measurements (row or column vectors both work)
+    M = numel(C);
+
+    if numel(T) ~= M
+        error('ERROR: C and T must have the same number of elements');
+    end
+
+    %sort the thresholds in descending order and get the indices
+    [~, I] = sort(T, 'descend');
+    
+    %sort the class labels in the same order as the thresholds
+    Cs = C(I);
+    
+    %calculate the number of positives
+    P = nnz(C);
+
+    %calculate the number of negatives
+    N = M - P;
+    
+    %both classes must be present, otherwise TP/P or FP/N divides by zero
+    if P == M
+        error('ERROR: no negative observations');
+    end
+    if P == 0
+        error('ERROR: no positive observations');
+    end
+    
+    %allocate space for the ROC curve
+    TPR = zeros(1, M);
+    FPR = zeros(1, M);
+    
+    %initialize the true and false positive counts to zero
+    TP = 0;
+    FP = 0;
+    for i = 1:M
+        if Cs(i) == 1
+            TP = TP + 1;
+        else
+            FP = FP + 1;
+        end
+        
+        TPR(i) = TP / P;
+        FPR(i) = FP / N;
+    end
+    
+    %calculate the area under the ROC curve as a sum of rectangles
+    AUC = 0;
+    for i = 2:M
+        w = FPR(i) - FPR(i-1);
+        h = TPR(i);
+        AUC = AUC + w * h;
+    end
+        
+    
+    
+    
+    
+    
 \ No newline at end of file
+#!/usr/bin/python3
+
+#import system processes
+import subprocess, sys
+
+if len(sys.argv) > 1:
+	infile = int(sys.argv[1])
+
+basefile = infile + "-base"
+normfile = infile + "-norm"
+
+runcommand = "hsiproc " + infile + basefile + " --baseline baseline.txt"
+subprocess.call(runcommand, shell=True)
 \ No newline at end of file
@@ -117,7 +117,7 @@ public:
 	}
 	/// Return the maximum value of a field in this cell set
-	double max(std::string field){
+	double maximum(std::string field){
 		size_t idx = fields[field];						//get the field index
 		size_t ncells = cells.size();					//get the total number of cells
 		double maxval, val;								//stores the current and maximum values
@@ -130,7 +130,7 @@ public:
 	}
 	/// Return the minimum value of a field in this cell set
-	double min(std::string field){
+	double minimum(std::string field){
 		size_t idx = fields[field];						//get the field index
 		size_t ncells = cells.size();					//get the total number of cells
 		double minval, val;								//stores the current and maximum values
@@ -11,8 +11,8 @@
 #include <stim/math/vec3.h>
 #include <stim/visualization/obj.h>
 #include <stim/visualization/cylinder.h>
-#include <ANN/ANN.h>
-#include <boost/tuple/tuple.hpp>
+#include <stim/structures/kdtree.cuh>
+#include <stim/cuda/cudatools/timer.h>
 namespace stim{
@@ -35,7 +35,7 @@ class network{
 		// default constructor
 		edge() : cylinder<T>()
 		{
-			v[1] = -1; v[0] = -1;
+			v[1] = (unsigned)(-1); v[0] = (unsigned)(-1);
 		}
 		/// Constructor - creates an edge from a list of points by calling the stim::fiber constructor
@@ -57,7 +57,7 @@ class network{
 		/// Output the edge information as a string
 		std::string str(){
 			std::stringstream ss;
-			ss<<"("<<cylinder<T>::size()<<")\tl = "<<this.length()<<"\t"<<v[0]<<"----"<<v[1];
+			ss<<"("<<cylinder<T>::size()<<")\tl = "<<this->length()<<"\t"<<v[0]<<"----"<<v[1];
 			return ss.str();
 		}
@@ -125,7 +125,9 @@ public:
 		return V.size();
 	}
-	std::vector<vertex> operator*(T s){
+	//scale the network by some constant value
+	//	I don't think these work??????
+	/*std::vector<vertex> operator*(T s){
 		for (unsigned i=0; i< vertices; i ++ ){
 			V[i] = V[i] * s;
 		}
@@ -139,10 +141,9 @@ public:
 			}
 		}
 		return V;
-	}
+	}*/
 	// Returns an average of branching index in the network
-
 	double BranchingIndex(){
 		double B=0;
 		for(unsigned v=0; v < V.size(); v ++){
@@ -154,7 +155,6 @@ public:
 	}
 	// Returns number of branch points in the network
-
 	unsigned int BranchP(){
 		unsigned int B=0;
 		unsigned int c;
@@ -168,7 +168,6 @@ public:
 	}
 	// Returns number of end points (tips) in the network
-
 	unsigned int EndP(){
 		unsigned int B=0;
 		unsigned int c;
@@ -202,10 +201,11 @@ public:
 	//	return s;
 	//}
-
+	//Calculate Metrics---------------------------------------------------
 	// Returns an average of fiber/edge lengths in the network
 	double Lengths(){
-		stim::vec<T> L;double sumLength = 0;
+		stim::vec<T> L;
+		double sumLength = 0;
 		for(unsigned e = 0; e < E.size(); e++){				//for each edge in the network
 			L.push_back(E[e].length());						//append the edge length
 			sumLength = sumLength + E[e].length();
@@ -269,8 +269,10 @@ public:
 	double avg = sumFractDim / E.size();
 	return avg;
 	}
-	stim::cylinder<T> get_cylinder(unsigned f){
-		return E[f];									//return the specified edge (casting it to a fiber)
+
+	//returns a cylinder representing a given fiber (based on edge index)
+	stim::cylinder<T> get_cylinder(unsigned e){
+		return E[e];									//return the specified edge (casting it to a fiber)
 	}
 	//load a network from an OBJ file
@@ -385,11 +387,27 @@ public:
 		return n;
 	}
+	//Copy the point cloud representing the centerline for the network into an array
+	void centerline_cloud(T* dst) {
+		size_t p;										//stores the current edge point
+		size_t P;										//stores the number of points in an edge
+		size_t i = 0;									//index into the output array of points
+		for (size_t e = 0; e < E.size(); e++) {			//for each edge in the network
+			P = E[e].size();							//get the number of points in this edge
+			for (p = 0; p < P; p++) {
+				dst[i * 3 + 0] = E[e][p][0];		
+				dst[i * 3 + 1] = E[e][p][1];
+				dst[i * 3 + 2] = E[e][p][2];
+				i++;
+			}
+		}
+	}
+
 	// gaussian function
 	float gaussianFunction(float x, float std=25){ return exp(-x/(2*std*std));} // by default std = 25
-    // stim 3d vector to annpoint of 3 dimensions
-	void stim2ann(ANNpoint &a, stim::vec3<T> b){
+    // convert vec3 to array
+	void stim2array(float *a, stim::vec3<T> b){
 		a[0] = b[0];
 		a[1] = b[1];
 		a[2] = b[2];
@@ -413,57 +431,81 @@ public:
 	/// @param A is the network to compare to - the field is generated for A
 	/// @param sigma is the user-defined tolerance value - smaller values provide a stricter comparison
-	stim::network<T> compare(stim::network<T> A, float sigma){
+	stim::network<T> compare(stim::network<T> A, float sigma, int device){
-		stim::network<T> R;								//generate a network storing the result of the comparison
-		R = (*this);									//initialize the result with the current network
+		stim::network<T> R;										//generate a network storing the result of the comparison
+		R = (*this);											//initialize the result with the current network
-		//generate a KD-tree for network A
-		float metric = 0.0;                               				// initialize metric to be returned after comparing the networks
-		ANNkd_tree* kdt;                                 				// initialize a pointer to a kd tree
-		double **c;						                 	// centerline (array of double pointers) - points on kdtree must be double
-		unsigned int n_data = A.total_points();          				// set the number of points
-		c = (double**) malloc(sizeof(double*) * n_data); 				// allocate the array pointer
-		for(unsigned int i = 0; i < n_data; i++)		 			// allocate space for each point of 3 dimensions
-			c[i] = (double*) malloc(sizeof(double) * 3);
+		T *c;						                 			// centerline (array of double pointers) - points on kdtree must be double
+		size_t n_data = A.total_points();          				// set the number of points
+		c = (T*) malloc(sizeof(T) * n_data * 3);				//allocate an array to store all points in the data set				
 		unsigned t = 0;
-		for(unsigned e = 0; e < A.E.size(); e++){					//for each edge in the network
-			for(unsigned p = 0; p < A.E[e].size(); p++){				//for each point in the edge
+		for(unsigned e = 0; e < A.E.size(); e++){				//for each edge in the network
+			for(unsigned p = 0; p < A.E[e].size(); p++){		//for each point in the edge
 				for(unsigned d = 0; d < 3; d++){				//for each coordinate
-					c[t][d] = A.E[e][p][d];
+					c[t * 3 + d] = A.E[e][p][d];				//copy the point into the array c
 				}
 				t++;
 			}
 		}
+		//generate a KD-tree for network A
+		//float metric = 0.0;                               	// initialize metric to be returned after comparing the network
+		size_t MaxTreeLevels = 3;								// max tree level
+		
+#ifdef __CUDACC__
+		cudaSetDevice(device);
+		stim::cuda_kdtree<T, 3> kdt;								// initialize a pointer to a kd tree
+	
 		//compare each point in the current network to the field produced by A
-		ANNpointArray pts = (ANNpointArray)c;           				// create an array of data points of type double
-		kdt = new ANNkd_tree(pts, n_data, 3);						// build a KD tree using the annpointarray
-		double eps = 0; // error bound
-		ANNdistArray dists = new ANNdist[1];     					// near neighbor distances
-		ANNidxArray nnIdx = new ANNidx[1];						// near neighbor indices // allocate near neigh indices
+		kdt.create(c, n_data, MaxTreeLevels);				// build a KD tree
+		T *dists = new T[1];								// near neighbor distances
+		size_t *nnIdx = new size_t[1];							// near neighbor indices // allocate near neigh indices
 		stim::vec3<T> p0, p1;
-		float m1;
-		float M = 0;									//stores the total metric value
-		float L = 0;									//stores the total network length
-		ANNpoint queryPt = annAllocPt(3);
+		T m1;
+		//float M = 0;									//stores the total metric value
+		//float L = 0;									//stores the total network length
+		T* queryPt = new T[3];
 		for(unsigned e = 0; e < R.E.size(); e++){					//for each edge in A
 			R.E[e].add_mag(0);							//add a new magnitude for the metric
 			for(unsigned p = 0; p < R.E[e].size(); p++){				//for each point in the edge
 				p1 = R.E[e][p];							//get the next point in the edge
-				stim2ann(queryPt, p1);
-				kdt->annkSearch( queryPt, 1, nnIdx, dists, eps);		//find the distance between A and the current network
-				m1 = 1.0f - gaussianFunction((float)dists[0], sigma);		//calculate the metric value based on the distance
+				stim2array(queryPt, p1);
+				kdt.search(queryPt, 1, nnIdx, dists);			//find the distance between A and the current network
+
+				m1 = 1.0f - gaussianFunction((T)dists[0], sigma);		//calculate the metric value based on the distance
 				R.E[e].set_mag(m1, p, 1);					//set the error for the second point in the segment
 			}
 		}
+#else
+		stim::cpu_kdtree<T, 3> kdt;
+		kdt.create(c, n_data, MaxTreeLevels);
+		T *dists = new T[1];								// near neighbor distances
+		size_t *nnIdx = new size_t[1];						// near neighbor indices // allocate near neigh indices
+
+		stim::vec3<T> p0, p1;
+		T m1;
+		T* queryPt = new T[3];
+		for(unsigned e = 0; e < R.E.size(); e++){			//for each edge in A
+			R.E[e].add_mag(0);								//add a new magnitude for the metric
+
+			for(unsigned p = 0; p < R.E[e].size(); p++){				//for each point in the edge
+				p1 = R.E[e][p];							//get the next point in the edge
+				stim2array(queryPt, p1);
+				kdt.cpu_search(queryPt, 1, nnIdx, dists);			//find the distance between A and the current network
+
+				m1 = 1.0f - gaussianFunction((T)dists[0], sigma);		//calculate the metric value based on the distance
+				R.E[e].set_mag(m1, p, 1);					//set the error for the second point in the segment
+			}
+		}
+#endif
 		return R;		//return the resulting network
 	}
@@ -487,7 +529,7 @@ public:
 	void load_txt(std::string filename)
 	{
 		std::vector <std::string> file_contents;
-		std::ifstream file(filename);
+		std::ifstream file(filename.c_str());
 		std::string line;
 		std::vector<unsigned> id2vert;	//this list stores the vertex ID associated with each network vertex
 		//for each line in the text file, store them as strings in file_contents
@@ -538,7 +580,7 @@ public:
 			for(unsigned int d = 0; d < 3; d++){
 				ss<<p[i][d];
 			}
-			ss < "\n";
+			ss << "\n";
 		}
 		return ss.str();
 	}
@@ -552,8 +594,8 @@ public:
 	void
 	to_txt(std::string filename)
 	{
-		std::ofstream ofs(filename, std::ofstream::out | std::ofstream::app);
-		int num;
+		std::ofstream ofs(filename.c_str(), std::ofstream::out | std::ofstream::app);
+		//int num;
 		ofs << (E.size()).str() << "\n";
 		for(unsigned int i = 0; i < E.size(); i++)
 		{
@@ -566,7 +608,8 @@ public:
 		{
 			std::string str;
 			str = V[i].str();
-			removeCharsFromString(str, "[],");
+			char temp[4] = "[],";
+			removeCharsFromString(str, temp);
 			ofs << str << "\n";
 		}
 		ofs.close();
@@ -4,7 +4,7 @@
 #include <stim/math/vector.h>
 #include <stim/visualization/obj.h>
 #include <list>
-#include <ANN/ANN.h>
+//#include <ANN/ANN.h>
 namespace stim{
+#ifndef STIM_CUDA_ERROR_H
+#define STIM_CUDA_ERROR_H
+
 #include <stdio.h>
 #include <iostream>
 using namespace std;
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
 #include "cufft.h"
-
-#ifndef CUDA_HANDLE_ERROR_H
-#define CUDA_HANDLE_ERROR_H
+#include "cublas_v2.h"
 //handle error macro
-static void HandleError( cudaError_t err, const char *file,  int line ) {
+static void cuHandleError( cudaError_t err, const char *file,  int line ) {
    	if (err != cudaSuccess) {
-			//FILE* outfile = fopen("cudaErrorLog.txt", "w");
-      		//fprintf(outfile,  "%s in %s at line %d\n", cudaGetErrorString( err ),  file, line );
-			//fclose(outfile);
             printf("%s in %s at line %d\n", cudaGetErrorString( err ),  file, line );
-       		//exit( EXIT_FAILURE );
    	}
 }
-#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
+#define HANDLE_ERROR( err ) (cuHandleError( err, __FILE__, __LINE__ ))
-static void CufftError( cufftResult err )
+static void cufftHandleError( cufftResult err, const char*file, int line )
 {
     if (err != CUFFT_SUCCESS)
     {
@@ -42,7 +39,29 @@ static void CufftError( cufftResult err )
     }
 }
+#define CUFFT_HANDLE_ERROR( err ) (cufftHandleError( err, __FILE__, __LINE__ ))
+//report a cuBLAS failure: prints the symbolic status name together with the file and line that were passed in
+//	err is the status returned by a cuBLAS call; nothing is printed for CUBLAS_STATUS_SUCCESS
+//	file and line identify the call site (the CUBLAS_HANDLE_ERROR macro passes __FILE__ and __LINE__)
+static void cublasHandleError( cublasStatus_t err, const char*file, int line ){
+	if(err == CUBLAS_STATUS_SUCCESS) return;						//nothing to report
+
+	const char* name;												//symbolic name of the status code
+	switch(err){
+		case CUBLAS_STATUS_NOT_INITIALIZED:		name = "CUBLAS_STATUS_NOT_INITIALIZED";		break;
+		case CUBLAS_STATUS_ALLOC_FAILED:		name = "CUBLAS_STATUS_ALLOC_FAILED";		break;
+		case CUBLAS_STATUS_INVALID_VALUE:		name = "CUBLAS_STATUS_INVALID_VALUE";		break;
+		case CUBLAS_STATUS_ARCH_MISMATCH:		name = "CUBLAS_STATUS_ARCH_MISMATCH";		break;
+		case CUBLAS_STATUS_MAPPING_ERROR:		name = "CUBLAS_STATUS_MAPPING_ERROR";		break;
+		case CUBLAS_STATUS_EXECUTION_FAILED:	name = "CUBLAS_STATUS_EXECUTION_FAILED";	break;
+		case CUBLAS_STATUS_INTERNAL_ERROR:		name = "CUBLAS_STATUS_INTERNAL_ERROR";		break;
+		default:								name = "Unknown error";						break;
+	}
+	std::cout<<name<<" in file "<<file<<" line "<<line<<std::endl;	//the line number was previously missing from the message
+}
+#define CUBLAS_HANDLE_ERROR( err ) (cublasHandleError( err, __FILE__, __LINE__ ))
 #endif
@@ -4,13 +4,15 @@
 #include <string>
 #include <fstream>
+#include <complex>
 //CUDA
-#ifdef CUDA_FOUND
-	#include <cuda_runtime.h>
-	#include "cufft.h"
-	#include <stim/cuda/cudatools/error.h>
-#endif
+//#ifdef CUDA_FOUND
+#include <cuda_runtime.h>
+#include "cufft.h"
+#include <stim/cuda/cudatools/error.h>
+#include <stim/envi/envi_header.h>
+//#endif
 namespace stim{
@@ -19,10 +21,10 @@ class agilent_binary{
 protected:
 	std::string fname;
-	T* ptr;
-	size_t R[3];
-	static const size_t header = 1020;
-	double Z[2];
+	T* ptr;														//pointer to the image data
+	size_t R[3];												//size of the binary image in X, Y, and Z
+	static const size_t header = 1020;							//header size
+	double Z[2];												//range of z values (position or wavelength)
 public:
 	size_t size(){
@@ -42,6 +44,10 @@ public:
 		alloc();
 	}
+	/// Return the image size along dimension i, read from R[i] (the index is not range-checked)
+	size_t dim(size_t i){
+		const size_t extent = R[i];
+		return extent;
+	}
+
 	/// Create a deep copy of an agilent_binary object
 	void deep_copy(agilent_binary<T>* dst, const agilent_binary<T>* src){
 		dst->alloc(src->R[0], src->R[1], src->R[2]);			//allocate memory
@@ -136,6 +142,42 @@ public:
 		return header;
 	}
+	/// Remove the per-pixel mean taken along the band (z) axis, in place. Generally used for centering an interferogram.
+	void meancenter(){
+		const size_t bands = R[2];									//number of bands (local name avoids shadowing the Z[] member)
+		const size_t pixels = R[0] * R[1];							//number of pixels in one band image
+		for(size_t p = 0; p < pixels; p++){							//each pixel is centered independently
+			T total = 0;											//running sum of this pixel over all bands
+			for(size_t b = 0; b < bands; b++)
+				total += ptr[ b * pixels + p ];
+			T average = total / (T)bands;							//mean value of this pixel across the bands
+			for(size_t b = 0; b < bands; b++)
+				ptr[ b * pixels + p ] -= average;					//shift the pixel so that its mean becomes zero
+		}
+	}
+
+	/// Append n zero-valued bands to the end of the image; the z-dimension grows from R[2] to R[2] + n
+	void zeropad(size_t n){
+		size_t old_bytes = size() * sizeof(T);								//bytes occupied by the current image data
+		size_t padded_bands = R[2] + n;										//number of bands after padding
+		T* padded = (T*) calloc(R[0] * R[1] * padded_bands, sizeof(T));		//zero-filled buffer for the padded image
+		memcpy(padded, ptr, old_bytes);										//the existing data is copied to the start of the new buffer
+
+		free(ptr);															//release the old buffer
+		ptr = padded;														//adopt the padded buffer
+		R[2] = padded_bands;												//record the new z-dimension
+	}
+
+	/// Pad the z-dimension with zero bands up to the next power of two (no bands are added if R[2] already is one)
+	void zeropad(){
+		//integer arithmetic is used on purpose: pow(2, ceil(log(x)/log(2))) can evaluate log(x)/log(2)
+		//	slightly above an exact integer, which would pad an exact power of two to twice its size
+		size_t newZ = (R[2] == 0) ? 0 : 1;						//an empty image stays empty
+		while(newZ < R[2]) newZ <<= 1;							//smallest power of two that is >= R[2]
+		size_t n = newZ - R[2];									//calculate the number of bands to add
+		zeropad(n);												//add the padding
+	}
+
 	/// Calculate the absorbance spectrum from the transmission spectrum given a background
 	void absorbance(stim::agilent_binary<T>* background){
 		size_t N = size();											//calculate the number of values to be ratioed
@@ -147,7 +189,7 @@ public:
 			ptr[i] = -log10(ptr[i] / background->ptr[i]);
 	}
-#ifdef CUDA_FOUND
+//#ifdef CUDA_FOUND
 	/// Perform an FFT and return a binary file with bands in the specified range
 	agilent_binary<T> fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2){
 		auto total_start = std::chrono::high_resolution_clock::now();
@@ -234,7 +276,22 @@ public:
 		return result;
 	}
-#endif
+
+	/// Copy the image into bip_ptr using a band-interleaved-by-pixel (BIP) layout: bip_ptr[xy * B + b] = ptr[b * XY + xy]
+	/// @param bip_ptr is a pointer to caller-allocated memory large enough for R[0] * R[1] * R[2] values
+	/// Always returns 0
+	int bip(T* bip_ptr){
+		const size_t pixels = R[0] * R[1];						//number of pixels in one band image
+		const size_t bands = R[2];								//number of bands
+		T* dst = bip_ptr;										//walks the destination sequentially (pixel-major, band-minor)
+		for(size_t p = 0; p < pixels; p++)
+			for(size_t b = 0; b < bands; b++)
+				*dst++ = ptr[b * pixels + p];					//gather band b of pixel p from the band-sequential source
+		return 0;
+	}
+//#endif
 };
@@ -4,6 +4,7 @@
 #include "../envi/envi_header.h"
 #include "../envi/hsi.h"
 #include "../math/fd_coefficients.h"
+#include <stim/cuda/cudatools/error.h>
 #include <cstring>
 #include <utility>
 #include <deque>
@@ -118,7 +119,7 @@ public:
 			page++;
 			//if wavelength is larger than the last wavelength in header file
 			if (page == Z()) {
-				band_index(p, Z()-1);
+				band_index(p, Z()-1, PROGRESS);
 				return true;
 			}
 		}
@@ -224,10 +225,44 @@ public:
 	}
 	//given a Y ,return a XZ slice
-	bool read_plane_y(T * p, unsigned long long y){
+	bool read_plane_xz(T * p, size_t y){
 		return binary<T>::read_plane_2(p, y);
 	}
+	//given a Y, return ZX slice (transposed such that the spectrum is the leading dimension)
+	//	p must point to pre-allocated memory for X() * Z() values
+	//	returns 0 on success and 1 if the plane cannot be read
+	int read_plane_zx(T* p, size_t y){
+		T* temp = (T*) malloc(X() * Z() * sizeof(T));	//allocate space to store the temporary xz plane
+		if(!binary<T>::read_plane_2(temp, y)){				//load the plane from disk
+			free(temp);										//don't leak the temporary plane on failure
+			return 1;
+		}
+		size_t z, x;
+		for(z = 0; z < Z(); z++){
+			for(x = 0; x < X(); x++){						//visit every sample (the previous bound, x <= z, only covered a triangle)
+				p[x * Z() + z] = temp[z * X() + x];		//copy to the destination frame
+			}
+		}
+		free(temp);											//release the temporary plane
+		return 0;											//the function previously fell off the end without returning a value
+	}
+
+	//load a frame y into a pre-allocated double-precision array
+	//	f must point to memory for X() * Z() doubles
+	//	returns 0 on success and 1 if the plane cannot be read
+	int read_plane_xzd(double* f, size_t y){
+		size_t XB = X() * Z();
+		T* temp = (T*) malloc(XB * sizeof(T));			//create a temporary location to store the plane at current precision
+		if(!read_plane_xz(temp, y)){					//read the plane in its native format (the reader in this class is named read_plane_xz)
+			free(temp);									//don't leak the temporary buffer on failure
+			return 1;
+		}
+		for(size_t i = 0; i < XB; i++) f[i] = temp[i];	//convert the plane to a double
+		free(temp);										//release the temporary buffer
+		return 0;
+	}
+
+	//given a Y, return ZX slice (transposed such that the spectrum is the leading dimension), converted to double precision
+	//	p must point to pre-allocated memory for X() * Z() doubles
+	//	returns 0 on success and 1 if the plane cannot be read
+	int read_plane_zxd(double* p, size_t y){
+		T* temp = (T*) malloc(X() * Z() * sizeof(T));		//allocate space to store the temporary xz plane
+		if(!binary<T>::read_plane_2(temp, y)){				//load the plane from disk
+			free(temp);										//don't leak the temporary plane on failure
+			return 1;
+		}
+		size_t z, x;
+		for(z = 0; z < Z(); z++){
+			for(x = 0; x < X(); x++){
+				p[x * Z() + z] = (double)temp[z * X() + x];	//copy to the destination frame
+			}
+		}
+		free(temp);											//release the temporary plane (previously leaked on every call)
+		return 0;
+	}
+
 	/// Perform baseline correction given a list of baseline points and stores the result in a new BSQ file.
@@ -268,7 +303,7 @@ public:
 		for (unsigned long long k =0; k < Y(); k++)
 		{
 			//get the current y slice
-			read_plane_y(c, k);
+			read_plane_xz(c, k);
 			//initialize lownum, highnum, low, high
 			ai = w[0];
@@ -369,7 +404,7 @@ public:
 		for(unsigned long long j = 0; j < Y(); j++)
 		{
-			read_plane_y(c, j);
+			read_plane_xz(c, j);
 			for(unsigned long long i = 0; i < B; i++)
 			{
 				for(unsigned long long m = 0; m < X(); m++)
@@ -469,7 +504,7 @@ public:
 		for ( unsigned long long i = 0; i < Y(); i++)
 		{
-			read_plane_y(p, i);
+			read_plane_xz(p, i);
 			for ( unsigned long long k = 0; k < Z(); k++)
 			{
 				unsigned long long ks = k * X();
@@ -863,7 +898,7 @@ public:
 		for (unsigned long long i = 0; i < Y(); i++)			//for each value in Y() (BIP should be X)
 		{
-			read_plane_y(temp, i);							//retrieve an ZX slice, stored in temp
+			read_plane_xz(temp, i);							//retrieve an ZX slice, stored in temp
 			for ( unsigned long long j = 0; j < Z(); j++)	//for each Z() (Y)
 			{
 				for (unsigned long long k = 0; k < X(); k++) //for each band
@@ -933,7 +968,7 @@ public:
 		//for each slice along the y axis
 		for (unsigned long long y = 0; y < Y(); y++)			//Select a page by choosing Y coordinate, Y()
 		{
-			read_plane_y(slice, y);							//retrieve an ZX page, store in "slice"
+			read_plane_xz(slice, y);							//retrieve an ZX page, store in "slice"
 			//for each sample along X
 			for (unsigned long long x = 0; x < X(); x++)		//Select a pixel by choosing X coordinate in the page, X()
@@ -992,43 +1027,136 @@ public:
 	/// @param m is a pointer to pre-allocated memory of size [B * sizeof(double)] that stores the mean spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
+	/// @param std is a pointer to pre-allocated memory of size [B * sizeof(double)] that receives sqrt(E[x^2] - mean^2) for each band
-	bool avg_band(double* p, unsigned char* mask = NULL, bool PROGRESS = false){
+	bool mean_spectrum(double* m, double* std, unsigned char* mask = NULL, bool PROGRESS = false){
 		unsigned long long XZ = X() * Z();
 		unsigned long long XY = X() * Y();
 		T* temp = (T*)malloc(sizeof(T) * XZ);
-		for (unsigned long long j = 0; j < Z(); j++){
-			p[j] = 0;
-		}
+		memset(m, 0, Z() * sizeof(double));							//initialize the mean to zero
+		double* e_x2 = (double*)malloc(Z() * sizeof(double));		//allocate space for E[x^2]
+		memset(e_x2, 0, Z() * sizeof(double));						//initialize E[x^2] to zero
 		//calculate vaild number in a band
-		unsigned long long count = 0;
-		for (unsigned long long j = 0; j < XY; j++){
-			if (mask == NULL || mask[j] != 0){
-				count++;
-			}
-		}
+		size_t count = nnz(mask);							//count the number of pixels in the mask
+
+		double x;											//create a register to store the pixel value
 		for (unsigned long long k = 0; k < Y(); k++){
-			read_plane_y(temp, k);
+			read_plane_xz(temp, k);
 			unsigned long long kx = k * X();
 			for (unsigned long long i = 0; i < X(); i++){
 				if (mask == NULL || mask[kx + i] != 0){
 					for (unsigned long long j = 0; j < Z(); j++){
-						p[j] += temp[j * X() + i] / (double)count;
+						x = temp[j * X() + i];
+						m[j] += x / (double)count;
+						e_x2[j] += x*x / (double)count;
 					}
 				}
 			}
 			if(PROGRESS) progress = (double)(k+1) / Y() * 100;
 		}
+
+		for(size_t i = 0; i < Z(); i++)							//calculate the standard deviation
+			std[i] = sqrt(e_x2[i] - m[i] * m[i]);
+
+		free(e_x2);												//release the E[x^2] accumulator (previously leaked on every call)
 		free(temp);
 		return true;
 	}
+	/// Calculate the covariance matrix on the GPU with cuBLAS, processing one y-plane of the file at a time.
+	/// Note that dimensions are passed to cuBLAS as int, so B and X() are cast to int.
+	/// @param co is a pointer to pre-allocated host memory of size [B * B] that receives the covariance matrix
+	/// @param avg is a pointer to host memory of size B that stores the average spectrum
+	/// @param mask is not read by this function: every pixel contributes and the scale factor is 1 / (X * Y)
+	///		NOTE(review): confirm whether masked pixels are supposed to be excluded here
+	/// @param PROGRESS updates the progress member after each plane when true
+	/// @return 0 on success, 1 if the cuBLAS handle cannot be created
+	int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
+		cublasStatus_t stat;
+		cublasHandle_t handle;
+
+		progress = 0;														//initialize the progress to zero (0)
+
+		CUBLAS_HANDLE_ERROR(stat = cublasCreate(&handle));					//create the cuBLAS instance before allocating anything,
+		if (stat != CUBLAS_STATUS_SUCCESS) return 1;						//	so that a failure returns without leaking memory
+		std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
+
+		size_t XY = X() * Y();												//calculate the number of elements in a band image
+		size_t XB = X() * Z();												//calculate the number of elements in one frame
+		size_t B = Z();														//calculate the number of spectral elements
+
+		double* F = (double*)malloc(sizeof(double) * XB);					//allocate space for the frame that will be pulled from the file
+		double* F_dev;														//device copy of the frame
+		double* A_dev;														//device pointer that stores the covariance matrix on the GPU
+		double* avg_dev;													//device pointer that stores one copy of the average spectrum per sample
+		HANDLE_ERROR(cudaMalloc(&F_dev, XB * sizeof(double)));				//allocate space for the frame on the GPU
+		HANDLE_ERROR(cudaMalloc(&A_dev, B * B * sizeof(double)));			//allocate space on the CUDA device for the covariance matrix
+		HANDLE_ERROR(cudaMemset(A_dev, 0, B * B * sizeof(double)));			//initialize the covariance matrix to zero (0)
+		HANDLE_ERROR(cudaMalloc(&avg_dev, XB * sizeof(double)));			//allocate space on the CUDA device for the replicated average spectrum
+		for(size_t x = 0; x < X(); x++)										//make multiple copies of the average spectrum in order to build a matrix
+			HANDLE_ERROR(cudaMemcpy(&avg_dev[x * B], avg, B * sizeof(double), cudaMemcpyHostToDevice));
+
+		double ger_alpha = 1.0/(double)XY;									//scale the outer product by the inverse of the number of samples (mean outer product)
+		double axpy_alpha = -1;												//multiplication factor for the average spectrum (in order to perform a subtraction)
+		double beta = 1.0;													//multiplication factor for the frame in the subtraction
+
+		for(size_t y = 0; y < Y(); y++){									//for each line
+			read_plane_zxd(F, y);											//read a frame from the file
+			HANDLE_ERROR(cudaMemcpy(F_dev, F, XB * sizeof(double), cudaMemcpyHostToDevice));	//copy the frame to the GPU
+			CUBLAS_HANDLE_ERROR(cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, (int)B, (int)X(), &axpy_alpha, avg_dev, (int)B, &beta, F_dev, (int)B, F_dev, (int)B));//subtract the mean spectrum
+
+			for(size_t x = 0; x < X(); x++)
+				CUBLAS_HANDLE_ERROR(cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, &F_dev[x*B], 1, A_dev, (int)B));			//perform an outer product
+			if(PROGRESS) progress = (double)(y + 1) / Y() * 100;
+		}
+
+		CUBLAS_HANDLE_ERROR(cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, co, (int)B));	//copy the result from the GPU to the CPU
+
+		cudaFree(A_dev);													//clean up allocated device memory
+		cudaFree(avg_dev);
+		cudaFree(F_dev);													//the frame buffers and the handle were previously never released
+		free(F);
+		cublasDestroy(handle);
+
+		for(unsigned long long i = 0; i < B; i++){							//copy the upper triangular portion to the lower triangular portion
+			for(unsigned long long j = i+1; j < B; j++){
+				co[B * i + j] = co[B * j + i];
+			}
+		}
+
+		return 0;
+	}
+
+
 	/// Calculate the covariance matrix for all masked pixels in the image.
 	/// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
 	/// @param avg is a pointer to memory of size B that stores the average spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
-	bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
+	bool co_matrix(double* co, double* avg, unsigned char *mask, bool use_gpu = true, bool PROGRESS = false){
 		progress = 0;
+
+		if(use_gpu){
+			int dev_count;
+			HANDLE_ERROR(cudaGetDeviceCount(&dev_count));						//get the number of CUDA devices
+			std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl;		//output the number of CUDA devices
+			cudaDeviceProp prop;
+			int best_device_id = 0;													//stores the best CUDA device
+			float best_device_cc = 0.0f;												//stores the compute capability of the best device
+			std::cout<<"CUDA devices:"<<std::endl;
+			for(int d = 0; d < dev_count; d++){									//for each CUDA device
+				cudaGetDeviceProperties(&prop, d);								//get the property of the first device
+				float cc = prop.major + prop.minor / 10.0f;						//calculate the compute capability
+				std::cout<<"("<<prop.major<<"."<<prop.minor<<")      "<<prop.name<<std::endl;	//display the device information
+				if(cc > best_device_cc){
+					best_device_cc = cc;										//if this is better than the previous device, use it
+					best_device_id = d;
+				}
+			}		
+		
+			if(dev_count > 0 && prop.major != 9999){							//if the first device is not an emulator
+				std::cout<<"Using device "<<best_device_id<<std::endl;
+				HANDLE_ERROR(cudaSetDevice(best_device_id));
+				int status = co_matrix_cublas(co, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
+				if(status == 0) return true;									//if the cuBLAS function returned correctly, we're done
+			}																	//otherwise continue using the CPU
+		
+			std::cout<<"No supported CUDA devices found or cuBLAS failed. Using CPU"<<std::endl;
+		}
+
 		//memory allocation
 		unsigned long long xy = X() * Y();
 		unsigned long long B = Z();
@@ -1325,7 +1453,7 @@ public:
 		c = (T*)malloc( L );										//allocate space for the slice
 		for(unsigned long long j = 0; j < Y(); j++){				//for each line
-			read_plane_y(c, j);										//load the line into memory
+			read_plane_xz(c, j);										//load the line into memory
 			for(unsigned long long i = 0; i < B; i++){				//for each band
 				for(unsigned long long m = 0; m < X(); m++){		//for each sample
 					if( mask == NULL && mask[m + j * X()] )			//if the pixel is masked
@@ -1355,7 +1483,7 @@ public:
 		c = (T*)malloc( L );										//allocate space for the slice
 		for(unsigned long long j = 0; j < Y(); j++){				//for each line
-			read_plane_y(c, j);										//load the line into memory
+			read_plane_xz(c, j);										//load the line into memory
 			for(unsigned long long i = 0; i < B; i++){				//for each band
 				for(unsigned long long m = 0; m < X(); m++){		//for each sample
 					if( mask == NULL && mask[m + j * X()] )			//if the pixel is masked
@@ -5,13 +5,16 @@
 #include "../envi/bil.h"
 #include "../envi/hsi.h"
 #include <cstring>
+#include <complex>
 #include <utility>
 //CUDA
-#ifdef CUDA_FOUND
-	#include <cuda_runtime.h>
-	#include "cublas_v2.h"
-#endif
+//#ifdef CUDA_FOUND
+#include <stim/cuda/cudatools/error.h>
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+#include "cufft.h"
+//#endif
 namespace stim{
@@ -257,7 +260,7 @@ public:
 	}
 	//given a Y ,return a ZX slice
-	bool read_plane_y(T * p, unsigned long long y){
+	bool read_plane_y(T * p, size_t y){
 		return binary<T>::read_plane_2(p, y);
 	}
@@ -954,39 +957,43 @@ public:
 	/// @param m is a pointer to pre-allocated memory of size [B * sizeof(double)] that stores the mean spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
+	/// @param std is a pointer to pre-allocated memory of size [B * sizeof(double)] that receives sqrt(E[x^2] - mean^2) for each band
-	bool avg_band(double* p, unsigned char* mask = NULL, bool PROGRESS = false){
+	bool mean_spectrum(double* m, double* std, unsigned char* mask = NULL, bool PROGRESS = false){
 		unsigned long long XY = X() * Y();							//calculate the total number of pixels in the HSI
 		T* temp = (T*)malloc(sizeof(T) * Z());						//allocate space for the current spectrum to be read
-		memset(p, 0, sizeof(double) * Z());							//initialize the average spectrum to zero (0)
-		//for (unsigned j = 0; j < Z(); j++){
-		//	p[j] = 0;
-		//}
+		memset(m, 0, Z() * sizeof(double));							//set the mean spectrum to zero
+		double* e_x2 = (double*)malloc(Z() * sizeof(double));		//allocate space for E[x^2]
+		memset(e_x2, 0, Z() * sizeof(double));						//set all values for E[x^2] to zero
 		unsigned long long count = nnz(mask);									//calculate the number of masked pixels
-
+		double x;
 		for (unsigned long long i = 0; i < XY; i++){							//for each pixel in the HSI
 			if (mask == NULL || mask[i] != 0){						//if the pixel is masked
 				pixel(temp, i);										//get the spectrum
 				for (unsigned long long j = 0; j < Z(); j++){					//for each spectral component
-					p[j] += (double)temp[j] / (double)count;		//add the weighted value to the average
+					x = temp[j];
+					m[j] += x / (double)count;		//add the weighted value to the average
+					e_x2[j] += x*x / (double)count;
 				}
 			}
 			if(PROGRESS) progress = (double)(i+1) / XY * 100;		//increment the progress
 		}
+		//calculate the standard deviation
+		for(size_t i = 0; i < Z(); i++)
+			std[i] = sqrt(e_x2[i] - m[i] * m[i]);
+
+		free(e_x2);													//release the E[x^2] accumulator (previously leaked on every call)
 		free(temp);
 		return true;
 	}
-#ifdef CUDA_FOUND
+//#ifdef CUDA_FOUND
 	/// Calculate the covariance matrix for masked pixels using cuBLAS
 	/// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
-	bool co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
+	int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
 		cudaError_t cudaStat;
 		cublasStatus_t stat;
 		cublasHandle_t handle;
-		progress = 0;													//initialize the progress to zero (0)
 		unsigned long long XY = X() * Y();									//calculate the number of elements in a band image
 		unsigned long long B = Z();											//calculate the number of spectral elements
@@ -1004,10 +1011,9 @@ public:
 		double axpy_alpha = -1;												//multiplication factor for the average spectrum (in order to perform a subtraction)
 		stat = cublasCreate(&handle);										//create a cuBLAS instance
-		if (stat != CUBLAS_STATUS_SUCCESS) {								//test the cuBLAS instance to make sure it is valid
-			printf ("CUBLAS initialization failed\n");
-			return EXIT_FAILURE;
-		}
+		if (stat != CUBLAS_STATUS_SUCCESS) return 1;						//test the cuBLAS instance to make sure it is valid
+
+		else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
 		for (unsigned long long xy = 0; xy < XY; xy++){										//for each pixel
 			if (mask == NULL || mask[xy] != 0){
 				pixeld(s, xy);																	//retreive the spectrum at the current xy pixel location
@@ -1031,26 +1037,45 @@ public:
 			}
 		}
-		return true;
+		return 0;
 	}
-#endif
+//#endif
 	/// Calculate the covariance matrix for all masked pixels in the image with 64-bit floating point precision.
 	/// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
 	/// @param avg is a pointer to memory of size B that stores the average spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
-	bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
-
-#ifdef CUDA_FOUND
-		int dev_count;
-		cudaGetDeviceCount(&dev_count);									//get the number of CUDA devices
-		cudaDeviceProp prop;
-		cudaGetDeviceProperties(&prop, 0);								//get the property of the first device
-		if(dev_count > 0 && prop.major != 9999)							//if the first device is not an emulator
-			return co_matrix_cublas(co, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
-#endif
+	bool co_matrix(double* co, double* avg, unsigned char *mask, bool use_gpu = true, bool PROGRESS = false){
 		progress = 0;
+
+		if(use_gpu){
+			int dev_count;
+			HANDLE_ERROR(cudaGetDeviceCount(&dev_count));						//get the number of CUDA devices
+			std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl;		//output the number of CUDA devices
+			cudaDeviceProp prop;
+			int best_device_id = 0;													//stores the best CUDA device
+			float best_device_cc = 0.0f;												//stores the compute capability of the best device
+			std::cout<<"CUDA devices----"<<std::endl;
+			for(int d = 0; d < dev_count; d++){									//for each CUDA device
+				cudaGetDeviceProperties(&prop, d);								//get the property of the first device
+				float cc = prop.major + prop.minor / 10.0f;						//calculate the compute capability
+				std::cout<<d<<":  ["<<prop.major<<"."<<prop.minor<<"]      "<<prop.name<<std::endl;	//display the device information
+				if(cc > best_device_cc){
+					best_device_cc = cc;										//if this is better than the previous device, use it
+					best_device_id = d;
+				}
+			}		
+		
+			if(dev_count > 0 && prop.major != 9999){							//if the first device is not an emulator
+				std::cout<<"Using device "<<best_device_id<<std::endl;
+				HANDLE_ERROR(cudaSetDevice(best_device_id));
+				int status = co_matrix_cublas(co, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
+				if(status == 0) return true;									//if the cuBLAS function returned correctly, we're done
+			}																	//otherwise continue using the CPU
+		
+			std::cout<<"No supported CUDA devices found or cuBLAS failed. Using CPU"<<std::endl;
+		}
 		//memory allocation
 		unsigned long long XY = X() * Y();
 		unsigned long long B = Z();
@@ -1092,10 +1117,10 @@ public:
 	}
-#ifdef CUDA_FOUND
+//#ifdef CUDA_FOUND
 	/// Calculate the covariance matrix of Noise for masked pixels using cuBLAS
 	/// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
-	bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
+	int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
 		cudaError_t cudaStat;
 		cublasStatus_t stat;
@@ -1123,11 +1148,9 @@ public:
 		double ger_alpha = 1.0/(double)XY;									//scale the outer product by the inverse of the number of samples (mean outer product)
 		double axpy_alpha = -1;												//multiplication factor for the average spectrum (in order to perform a subtraction)
-		stat = cublasCreate(&handle);										//create a cuBLAS instance
-		if (stat != CUBLAS_STATUS_SUCCESS) {								//test the cuBLAS instance to make sure it is valid
-			printf ("CUBLAS initialization failed\n");
-			return EXIT_FAILURE;
-		}
+		CUBLAS_HANDLE_ERROR(stat = cublasCreate(&handle));					//create a cuBLAS instance and keep its status (stat was tested below without being assigned by this call)
+		if (stat != CUBLAS_STATUS_SUCCESS) return 1;						//test the cuBLAS instance to make sure it is valid
+
 		for (unsigned long long xy = 0; xy < XY; xy++){										//for each pixel
 			if (mask == NULL || mask[xy] != 0){
 				pixeld(s, xy);                                                             //retreive the spectrum at the current xy pixel location
@@ -1158,27 +1181,44 @@ public:
 			}
 		}
-		return true;
+		return 0;
 	}
-#endif
+//#endif
 	/// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision.
 	/// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
 	/// @param avg is a pointer to memory of size B that stores the average spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
-	bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
-
-#ifdef CUDA_FOUND
-		int dev_count;
-		cudaGetDeviceCount(&dev_count);									//get the number of CUDA devices
-		cudaDeviceProp prop;
-		cudaGetDeviceProperties(&prop, 0);								//get the property of the first device
-		if(dev_count > 0 && prop.major != 9999)							//if the first device is not an emulator
-			return coNoise_matrix_cublas(coN, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
-#endif
-
-
+	bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool use_gpu = true, bool PROGRESS = false){
+
+		if(use_gpu){
+			int dev_count;
+			HANDLE_ERROR(cudaGetDeviceCount(&dev_count));						//get the number of CUDA devices
+			std::cout<<"Number of CUDA devices: "<<dev_count<<std::endl;		//output the number of CUDA devices
+			cudaDeviceProp prop;
+			int best_device_id = 0;													//stores the best CUDA device
+			float best_device_cc = 0.0f;												//stores the compute capability of the best device
+			std::cout<<"CUDA devices:"<<std::endl;
+			for(int d = 0; d < dev_count; d++){									//for each CUDA device
+				cudaGetDeviceProperties(&prop, d);								//get the property of the first device
+				float cc = prop.major + prop.minor / 10.0f;						//calculate the compute capability
+				std::cout<<d<<":   ("<<prop.major<<"."<<prop.minor<<")      "<<prop.name<<std::endl;	//display the device information
+				if(cc > best_device_cc){
+					best_device_cc = cc;										//if this is better than the previous device, use it
+					best_device_id = d;
+				}
+			}		
+		
+			if(dev_count > 0 && prop.major != 9999){							//if the first device is not an emulator
+				std::cout<<"Using device "<<best_device_id<<std::endl;
+				HANDLE_ERROR(cudaSetDevice(best_device_id));
+				int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
+				if(status == 0) return true;									//if the cuBLAS function returned correctly, we're done
+			}																	//otherwise continue using the CPU
+		
+			std::cout<<"cuBLAS initialization failed - using CPU"<<std::endl;
+		}
 		progress = 0;
 		//memory allocation
@@ -1443,7 +1483,7 @@ public:
 		unsigned long long jump_sample = ( (Z() - b1) + b0 ) * sizeof(T);
 		//distance between sample spectra in adjacent lines
-		unsigned long long jump_line = (X() - x1) * Z() * sizeof(T);
+		unsigned long long jump_line = ( X() - x1 + x0 ) * Z() * sizeof(T);
 		//unsigned long long sp = y0 * X() + x0;		//start pixel
@@ -1682,7 +1722,117 @@ public:
 		return true;
 	}
+	/// Compute the magnitude of the Fourier transform of every spectrum in the file using cuFFT and
+	///	write the requested range of frequency bins to a binary output file.
+
+	/// @param outname is the name of the binary file that receives the result
+	/// @param bandmin is the index of the first FFT bin written for each spectrum
+	/// @param bandmax is the index of the last FFT bin written for each spectrum (inclusive)
+	/// @param samples is the number of leading samples of each spectrum that are transformed (0 = all Z() samples)
+	/// @param ratio is an optional host pointer to a background image of rx * ry spectra with Z() values each;
+	///		when it is given, the output is -log(|F| / |F_ratio|) instead of |F|
+	/// @param rx is the width of the background image (it is tiled across the image using x % rx)
+	/// @param ry is the height of the background image (it is tiled across the image using y % ry)
+	/// @param PROGRESS turns on updates of the progress counter
+	/// @param device is the ID of the CUDA device used for the transform
+	/// @return 0 when processing completes (every error terminates the program)
+	int fft(std::string outname, size_t bandmin, size_t bandmax, size_t samples = 0, T* ratio = NULL, size_t rx = 0, size_t ry = 0, bool PROGRESS = false, int device = 0){
+		if(device < 0){													//a negative device ID means that no GPU is available
+			std::cout<<"ERROR: GPU required for FFT (uses cuFFT)."<<std::endl;
+			exit(1);
+		}
+		if(samples == 0) samples = Z();									//if the number of samples isn't specified, use all of them
+		if(samples > Z()){
+			std::cout<<"ERROR: stim::envi doesn't support FFT padding just yet."<<std::endl;
+			exit(1);
+		}
+		if(ratio && (rx == 0 || ry == 0)){								//the background is indexed using (x % rx) and (y % ry), so both sizes must be nonzero
+			std::cout<<"ERROR: the ratio image passed to stim::envi::fft() has a zero size."<<std::endl;
+			exit(1);
+		}
+		int nd;															//stores the number of CUDA devices
+		HANDLE_ERROR(cudaGetDeviceCount(&nd));							//get the number of CUDA devices
+		if(device >= nd){												//test for the existence of the requested device
+			std::cout<<"ERROR: requested CUDA device for stim::envi::fft() doesn't exist"<<std::endl;
+			exit(1);
+		}
+		HANDLE_ERROR(cudaSetDevice(device));							//set the CUDA device
+		cudaDeviceProp prop;
+		HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));			//get the CUDA device properties
+
+		//NOTE(review): the device buffers are cast to cufftReal/cufftComplex below, which presumably
+		//	requires T to be a 32-bit float - confirm before using this function with bip<double>
+		size_t XY = X() * Y();											//calculate the number of spectra
+		size_t B = Z();
+		size_t S = samples;
+		size_t S_fft = S/2 + 1;											//number of complex values produced by a real-to-complex FFT of S samples
+		size_t fft_size = S * sizeof(T);								//number of bytes for each FFT
+		size_t cuda_bytes = prop.totalGlobalMem;						//get the number of bytes of global memory available
+		size_t cuda_use = (size_t)floor(cuda_bytes * 0.2);				//only use 20% of the device memory for the input batch
+		size_t nS = cuda_use / fft_size;								//calculate the number of spectra that can be loaded onto the GPU as a single batch
+		if(nS == 0){													//a batch of zero spectra would never make progress in the loop below
+			std::cout<<"ERROR: insufficient GPU memory to process a single spectrum in stim::envi::fft()"<<std::endl;
+			exit(1);
+		}
+		if(XY > 0 && nS > XY) nS = XY;									//never allocate a batch that is larger than the image
+		size_t batch_bytes = nS * fft_size;								//calculate the size of a batch (in bytes)
+		size_t fft_bytes = nS * S_fft * sizeof(cufftComplex);
+		T* batch = (T*) malloc(batch_bytes);							//allocate space in host memory to store a batch
+		std::complex<T>* batch_fft = (std::complex<T>*) malloc(fft_bytes);
+		T* temp_spec = (T*) malloc(Z() * sizeof(T));					//allocate space to hold a single pixel
+		if(batch == NULL || batch_fft == NULL || temp_spec == NULL){
+			std::cout<<"ERROR: unable to allocate host memory in stim::envi::fft()"<<std::endl;
+			exit(1);
+		}
+		memset(batch, 0, batch_bytes);
+		T* gpu_batch;													//device pointer to the batch
+		HANDLE_ERROR(cudaMalloc(&gpu_batch, batch_bytes));				//allocate space on the device for the FFT batch
+		cufftComplex* gpu_batch_fft;									//allocate space for the FFT result
+		HANDLE_ERROR(cudaMalloc(&gpu_batch_fft, fft_bytes));
+		int N[1];														//create an array with the interferogram size (required for cuFFT input)
+		N[0] = (int)S;													//set the only array value to the interferogram size
+
+		//if a background is provided for a ratio
+		std::complex<T>* ratio_fft = NULL;								//create a pointer for the FFT of the ratio image (if it exists)
+		if(ratio){
+			size_t bkg_bytes = rx * ry * S * sizeof(T);								//calculate the total number of bytes in the background image
+			T* bkg_copy = (T*) malloc(bkg_bytes);									//allocate space to copy the background
+			if(S == Z()) memcpy(bkg_copy, ratio, bkg_bytes);						//if the number of samples used in processing equals the number of available samples
+			else{
+				for(size_t xyi = 0; xyi < rx*ry; xyi++)								//otherwise copy the first S samples of each background spectrum
+					memcpy(&bkg_copy[xyi * S], &ratio[xyi * B], S * sizeof(T));
+			}
+			T* gpu_ratio;
+			HANDLE_ERROR(cudaMalloc(&gpu_ratio, bkg_bytes));
+			HANDLE_ERROR(cudaMemcpy(gpu_ratio, bkg_copy, bkg_bytes, cudaMemcpyHostToDevice));
+			free(bkg_copy);															//the host copy is no longer needed once it is on the device
+			cufftHandle bkg_plan;
+			CUFFT_HANDLE_ERROR(cufftPlanMany(&bkg_plan, 1, N, NULL, 1, N[0], NULL, 1, N[0], CUFFT_R2C, (int)(rx * ry)));
+			size_t bkg_fft_bytes = rx * ry * S_fft * sizeof(cufftComplex);
+			T* gpu_ratio_fft;
+			HANDLE_ERROR(cudaMalloc(&gpu_ratio_fft, bkg_fft_bytes));
+			CUFFT_HANDLE_ERROR(cufftExecR2C(bkg_plan, (cufftReal*)gpu_ratio, (cufftComplex*)gpu_ratio_fft));
+			ratio_fft = (std::complex<T>*) malloc(bkg_fft_bytes);
+			HANDLE_ERROR(cudaMemcpy(ratio_fft, gpu_ratio_fft, bkg_fft_bytes, cudaMemcpyDeviceToHost));
+			HANDLE_ERROR(cudaFree(gpu_ratio));
+			HANDLE_ERROR(cudaFree(gpu_ratio_fft));
+			CUFFT_HANDLE_ERROR(cufftDestroy(bkg_plan));
+		}
+		cufftHandle plan;												//create a CUFFT plan
+		CUFFT_HANDLE_ERROR(cufftPlanMany(&plan, 1, N, NULL, 1, N[0], NULL, 1, N[0], CUFFT_R2C, (int)nS));
+
+		std::ofstream outfile(outname, std::ios::binary);				//open a file for writing
+
+		size_t xy = 0;
+		size_t bs;														//stores the number of spectra in the current batch
+		size_t s, b;
+		size_t bandkeep = bandmax - bandmin + 1;						//number of FFT bins saved for each spectrum
+		size_t x, y;
+		size_t ratio_i = 0;
+		while(xy < XY){													//while there are unprocessed spectra
+			bs = min(XY - xy, nS);										//calculate the number of spectra to include in the batch
+			for(s = 0; s < bs; s++){									//for each spectrum in the batch
+				pixel(temp_spec, xy + s);								//read a pixel from disk
+				memcpy(&batch[s * S], temp_spec, S * sizeof(T));		//keep only the first S samples of the spectrum
+			}
+			HANDLE_ERROR(cudaMemcpy(gpu_batch, batch, batch_bytes, cudaMemcpyHostToDevice));
+			CUFFT_HANDLE_ERROR(cufftExecR2C(plan, (cufftReal*)gpu_batch, gpu_batch_fft));			//execute the (implicitly forward) transform
+			HANDLE_ERROR(cudaMemcpy(batch_fft, gpu_batch_fft, fft_bytes, cudaMemcpyDeviceToHost));	//copy the result back to the host
+			for(s = 0; s < bs; s++){															//for each spectrum in the batch
+				y = (xy + s)/X();
+				x = xy + s - y * X();
+				if(ratio_fft)	ratio_i = (y % ry) * rx + (x % rx);								//if a background is used, calculate the coordinates into it
+				for(b = 0; b < S_fft; b++){														//for each FFT bin
+					if(ratio_fft)
+						batch[s * S + b] = -log(abs(batch_fft[s * S_fft + b]) / abs(ratio_fft[ratio_i * S_fft + b]));
+					else
+						batch[s * S + b] = abs(batch_fft[s * S_fft + b]);		//calculate the magnitude of the spectrum
+				}
+				outfile.write((char*)&batch[s * S + bandmin], bandkeep * sizeof(T));							//save the resulting spectrum
+			}
+			xy += bs;													//increment xy by the number of spectra processed
+			if(PROGRESS) progress = (double)xy / (double)XY * 100;
+		}
+		outfile.close();
+		free(ratio_fft);
+		free(batch_fft);
+		free(batch);
+		free(temp_spec);												//release the single-pixel buffer
+		CUFFT_HANDLE_ERROR(cufftDestroy(plan));							//release the cuFFT plan
+		HANDLE_ERROR(cudaFree(gpu_batch));
+		HANDLE_ERROR(cudaFree(gpu_batch_fft));
+		return 0;
+	}
 	/// Close the file.
 	bool close(){
@@ -104,6 +104,7 @@ public:
 		//if wavelength is smaller than the first one in header file
 		if ( w[page] > wavelength ){
 			band_index(p, page);
+			if(PROGRESS) progress = 100;
 			return true;
 		}
@@ -114,6 +115,7 @@ public:
 			//	(the wavelength is out of bounds)
 			if (page == Z()) {
 				band_index(p, Z()-1);		//return the last band
+				if(PROGRESS) progress = 100;
 				return true;
 			}
 		}
@@ -561,12 +563,12 @@ public:
 		free(src[1]);
 		free(dst[0]);
 		free(dst[1]);
-		//if(VERBOSE){
+		if(VERBOSE){
 			std::cout<<"total time to execute bsq::bip(): "<<t_total<<" ms"<<std::endl;
 			std::cout<<"     total time spent processing: "<<pt_total<<" ms"<<std::endl;
 			std::cout<<"        total time spent reading: "<<rt_total<<" ms"<<std::endl;
 			std::cout<<"        total time spent writing: "<<wt_total<<" ms"<<std::endl;
-		//}
+		}
 		return true;															//return true
 	}
@@ -1120,27 +1122,61 @@ public:
 	/// @param p is a pointer to pre-allocated memory of size [B * sizeof(T)] that stores the mean spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
-	bool avg_band(double* p, unsigned char* mask = NULL, bool PROGRESS = false){
+	bool mean_spectrum(double* m, double* std, unsigned char* mask = NULL, bool PROGRESS = false){
+		//for each band i, m[i] receives the mean and std[i] receives the standard deviation of the masked pixels
 		unsigned long long XY = X() * Y();
-		unsigned long long count = 0;						//count will store the number of masked pixels
+		unsigned long long count = nnz(mask);						//count will store the number of masked pixels
 		T* temp = (T*)malloc(sizeof(T) * XY);
-		//calculate this loop counts the number of true pixels in the mask
-		for (unsigned j = 0; j < XY; j++){
-			if (mask == NULL || mask[j] != 0){
-				count++;
-			}
-		}
+		
 		//this loops goes through each band in B (Z())
 		//	masked (or valid) pixels from that band are averaged and the average is stored in p
+		double e_x;										//stores E[x]
+		double e_x2;										//stores E[x^2]
+		double x;
+		double variance;									//stores E[x^2] - E[x]^2
 		for (unsigned long long i = 0; i < Z(); i++){
-			p[i] = 0;
+			e_x = 0;
+			e_x2 = 0;
 			band_index(temp, i);				//get the band image and store it in temp
 			for (unsigned long long j = 0; j < XY; j++){	//loop through temp, averaging valid pixels
 				if (mask == NULL || mask[j] != 0){
-					p[i] += (double)temp[j] / (double)count;
+					x = (double)temp[j];
+					e_x += x / (double)count;				//sum the expected value of x
+					e_x2 += (x * x) / (double)count;		//sum the expected value of x^2
 				}
 			}
-			if(PROGRESS) progress = (double)(i+1) / Z() * 100;
+			m[i] = e_x;												//store the mean
+			variance = e_x2 - e_x * e_x;							//calculate the variance
+			if(variance < 0) variance = 0;							//rounding can make the variance slightly negative, which would give NaN
+			std[i] = sqrt(variance);								//calculate the standard deviation
+			if(PROGRESS) progress = (double)(i+1) / Z() * 100;		//update the progress counter
+		}
+		free(temp);
+		return true;
+	}
+
+	/// Calculate the median value of all masked (or valid) pixels in each band and return the median spectrum
+
+	/// @param m is a pointer to pre-allocated memory of size [B * sizeof(double)] that stores the median spectrum
+	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
+	bool median_spectrum(double* m, unsigned char* mask = NULL, bool PROGRESS = false){
+		size_t XY = X() * Y();
+		size_t count = nnz(mask);						//count will store the number of masked pixels
+		T* temp = (T*)malloc(sizeof(T) * XY);
+		
+		std::vector<T> band_values(count);				//create an STD vector of band values
+
+		//this loops goes through each band in B (Z())
+		//	masked (or valid) pixels from that band are averaged and the average is stored in p
+		size_t k;
+		for (size_t i = 0; i < Z(); i++){							//for each band
+			band_index(temp, i);									//get the band image and store it in temp
+			k = 0;													//initialize the band_value index to zero
+			for (size_t j = 0; j < XY; j++){						//loop through temp, averaging valid pixels
+				if (mask == NULL || mask[j] != 0){
+					band_values[k] = temp[j];				//store the value in the band_values array
+					k++;											//increment the band_values index
+				}
+			}
+			std::sort(band_values.begin(), band_values.end());		//sort all of the values in the band
+			m[i] = band_values[ count/2 ];							//store the center value in the array
+			if(PROGRESS) progress = (double)(i+1) / Z() * 100;		//update the progress counter
 		}
 		free(temp);
 		return true;
@@ -1203,6 +1239,52 @@ public:
 		return true;
 	}
+	///Crop out several subimages and assemble a new image from these concatenated subimages
+
+	/// Each nonzero mask pixel is the center of one (sx x sy) subimage. The subimages are stacked vertically, so
+	///	every output band is sx pixels wide and (N * sy) pixels tall, where N is the value returned by nnz(mask).
+	///	Masked pixels whose subimage does not fit inside the image are skipped, which leaves the last
+	///	subimages of the output filled with zeros.
+
+	/// @param outfile is the file name for the output image
+	/// @param sx is the width of each subimage
+	/// @param sy is the height of each subimage
+	/// @param mask is the mask used to define subimage positions extracted from the input file
+	/// @param PROGRESS turns on updates of the progress counter
+	void subimages(std::string outfile, size_t sx, size_t sy, unsigned char* mask, bool PROGRESS = false){
+
+		size_t XY = X() * Y();									//number of pixels in a band (and in the mask)
+		size_t N = nnz(mask);									//get the number of subimages
+		T* dst = (T*) malloc(N * sx * sy * sizeof(T));			//allocate space for a single band of the output image
+		memset(dst, 0, N*sx*sy*sizeof(T));						//initialize the band image to zero
+
+		std::ofstream out(outfile, std::ios::binary);			//open a file for writing
+
+		T* src = (T*) malloc(XY * sizeof(T));
+
+		for(size_t b = 0; b < Z(); b++){						//for each band
+			band_index(src, b);									//load the band image
+			size_t i = 0;										//create an image index and initialize it to zero
+			size_t n = 0;
+			while(n < N && i < XY){								//for each subimage (stop at the end of the mask, since rejected pixels never increment n)
+				if(mask[i]){									//if the pixel is masked, copy the surrounding pixels into the destination band
+					size_t yi = i / X();						//determine the y position of the current pixel
+					size_t xi = i - yi * X();					//determine the x position of the current pixel
+					if( xi > sx/2 && xi + sx/2 < X() &&			//if the subimage is completely within the bounds of the original image
+						yi > sy/2 && yi + sy/2 < Y()){			//	(written as a sum so that an oversized subimage can't underflow)
+						size_t cx = xi - sx/2;					//calculate the corner position for the subimage
+						size_t cy = yi - sy/2;
+						for(size_t syi = 0; syi < sy; syi++){					//for each line in the subimage
+							size_t src_i = (cy + syi) * X() + cx;
+							size_t dst_i = (n * sy + syi) * sx;
+							memcpy(&dst[dst_i],  &src[src_i], sx * sizeof(T));	//copy one line from the subimage to the destination image
+						}
+						n++;
+					}
+				}
+				i++;
+				if(PROGRESS) progress = (double)(b * N + n) / (double)(N * Z()) * 100;	//fraction of all (band, subimage) pairs completed
+			}//end while n
+			out.write((const char*)dst, N * sx * sy * sizeof(T));			//write the band to disk
+		}
+		if(PROGRESS) progress = 100;							//skipped subimages would otherwise leave the counter below 100
+		free(dst);												//free memory
+		free(src);
+	}
+
 	/// Remove a list of bands from the ENVI file
 	/// @param outfile is the file name for the output hyperspectral image (with trimmed bands)
@@ -6,6 +6,8 @@
 #include "../envi/bip.h"
 #include "../envi/bil.h"
 #include "../math/fd_coefficients.h"
+#include <stim/parser/filename.h>
+#include <stim/util/filesize.h>
 #include <iostream>
 #include <fstream>
 //#include "../image/image.h"
@@ -76,7 +78,31 @@ public:
 		allocate();
 	}
+	//conversion to bool: true when the file pointer is set, false when it is NULL
+	operator bool(){
+		return file != NULL;
+	}
+
+	//test whether the specified file is an ENVI file: the header has to load and the size of the
+	//	data file has to match the number of bytes that the header describes
+	static bool is_envi(std::string fname, std::string hname = ""){
+		stim::filename data_file(fname);
+		stim::filename header_file;
+		if(hname == ""){								//no header name was given:
+			header_file = data_file;					//	start from the data file name
+			header_file = header_file.extension("hdr");	//	and swap in the .hdr extension
+		}
+		else{
+			header_file = hname;						//use the header name that was passed in
+		}
+
+		stim::envi_header H;
+		if(H.load(header_file) == false){				//a header that doesn't load means this isn't an ENVI file
+			return false;
+		}
+
+		const size_t expected = H.data_bytes();			//size that the header says the data file should have
+		const size_t actual = stim::file_size(fname);	//size of the data file on disk
+		return actual == expected;						//the file is valid only if the two sizes agree
+	}
 	void* malloc_spectrum(){
@@ -359,11 +385,23 @@ public:
 		fseek(f, 9, SEEK_SET);											//seek to the number of bands
 		short b;														//allocate space for the number of bands
-		fread(&b, sizeof(short), 1, f);									//read the number of bands
+		size_t nread = fread(&b, sizeof(short), 1, f);					//read the number of bands
+		if(nread != 1){
+			std::cout<<"Error reading band number from Agilent file."<<std::endl;
+			exit(1);
+		}
 		fseek(f, 13, SEEK_CUR);											//skip the the x and y dimensions
 		short x, y;
-		fread(&x, sizeof(short), 1, f);									//read the image x and y size
-		fread(&y, sizeof(short), 1, f);
+		nread = fread(&x, sizeof(short), 1, f);									//read the image x and y size
+		if(nread != 1){
+			std::cout<<"Error reading X dimension from Agilent file."<<std::endl;
+			exit(1);
+		}
+		nread = fread(&y, sizeof(short), 1, f);
+		if(nread != 1){
+			std::cout<<"Error reading Y dimension from Agilent file."<<std::endl;
+			exit(1);
+		}
 		fclose(f);														//close the file
 		//store the information from the Agilent header in the ENVI header
@@ -1368,12 +1406,12 @@ public:
 	/// @param p is a pointer to pre-allocated memory of size [B * sizeof(T)] that stores the mean spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
-	bool avg_band(double * p, unsigned char* mask, bool PROGRESS = false){
+	bool mean_spectrum(double * p, double* std, unsigned char* mask, bool PROGRESS = false){
 		if (header.interleave == envi_header::BSQ){
 			if (header.data_type == envi_header::float32)
-				return ((bsq<float>*)file)->avg_band(p, mask, PROGRESS);
+				return ((bsq<float>*)file)->mean_spectrum(p, std, mask, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bsq<double>*)file)->avg_band(p,  mask, PROGRESS);
+				return ((bsq<double>*)file)->mean_spectrum(p, std, mask, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -1381,9 +1419,9 @@ public:
 		}
 		else if (header.interleave == envi_header::BIL){
 			if (header.data_type == envi_header::float32)
-				return ((bil<float>*)file)->avg_band(p,  mask, PROGRESS);
+				return ((bil<float>*)file)->mean_spectrum(p, std, mask, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bil<double>*)file)->avg_band(p,  mask, PROGRESS);
+				return ((bil<double>*)file)->mean_spectrum(p, std, mask, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -1391,14 +1429,36 @@ public:
 		}
 		else if (header.interleave == envi_header::BIP){
 			if (header.data_type == envi_header::float32)
-				return ((bip<float>*)file)->avg_band(p, mask, PROGRESS);
+				return ((bip<float>*)file)->mean_spectrum(p, std, mask, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				return ((bip<double>*)file)->mean_spectrum(p, std, mask, PROGRESS);
+			else{
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+		return false;
+	}
+
+	/// Calculate the median value of all masked (or valid) pixels in each band and return the median spectrum
+
+	/// @param m is a pointer to pre-allocated memory of size [B * sizeof(double)] that stores the median spectrum
+	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
+	bool median_spectrum(double* m, unsigned char* mask, bool PROGRESS = false){
+		if (header.interleave == envi_header::BSQ){
+			if (header.data_type == envi_header::float32)
+				return ((bsq<float>*)file)->median_spectrum(m, mask, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bip<double>*)file)->avg_band(p, mask, PROGRESS);
+				return ((bsq<double>*)file)->median_spectrum(m, mask, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
 			}
 		}
+		else{
+			std::cout<<"ERROR: median calculation is only supported for BSQ interleave types. Convert to process."<<std::endl;
+			exit(1);
+		}
 		return false;
 	}
@@ -1407,16 +1467,16 @@ public:
 	/// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
 	/// @param avg is a pointer to memory of size B that stores the average spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
-	bool co_matrix(double* co, double* avg, unsigned char* mask, bool PROGRESS = false){
+	bool co_matrix(double* co, double* avg, unsigned char* mask, bool use_gpu, bool PROGRESS = false){
 		if (header.interleave == envi_header::BSQ){
 			std::cout<<"ERROR: calculating the covariance matrix for a BSQ file is impractical; convert to BIL or BIP first"<<std::endl;
 			exit(1);
 		}
 		else if (header.interleave == envi_header::BIL){
 			if (header.data_type == envi_header::float32)
-				return ((bil<float>*)file)->co_matrix(co, avg, mask, PROGRESS);
+				return ((bil<float>*)file)->co_matrix(co, avg, mask, use_gpu, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bil<double>*)file)->co_matrix(co, avg, mask, PROGRESS);
+				return ((bil<double>*)file)->co_matrix(co, avg, mask, use_gpu, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -1424,9 +1484,9 @@ public:
 		}
 		else if (header.interleave == envi_header::BIP){
 			if (header.data_type == envi_header::float32)
-				return ((bip<float>*)file)->co_matrix(co, avg, mask, PROGRESS);
+				return ((bip<float>*)file)->co_matrix(co, avg, mask, use_gpu, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bip<double>*)file)->co_matrix(co, avg, mask, PROGRESS);
+				return ((bip<double>*)file)->co_matrix(co, avg, mask, use_gpu, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -1440,7 +1500,7 @@ public:
 	/// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
 	/// @param avg is a pointer to memory of size B that stores the average spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
-	bool coNoise_matrix(double* coN, double* avg, unsigned char* mask, bool PROGRESS = false){
+	bool coNoise_matrix(double* coN, double* avg, unsigned char* mask, bool use_gpu = true, bool PROGRESS = false){
 		if (header.interleave == envi_header::BSQ){
 			std::cout<<"ERROR: calculating the covariance matrix of noise for a BSQ file is impractical; convert to BIP first"<<std::endl;
 			exit(1);
@@ -1454,9 +1514,9 @@ public:
 		else if (header.interleave == envi_header::BIP){
 			if (header.data_type == envi_header::float32)
-				return ((bip<float>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS);
+				return ((bip<float>*)file)->coNoise_matrix(coN, avg, mask, use_gpu, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bip<double>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS);
+				return ((bip<double>*)file)->coNoise_matrix(coN, avg, mask, use_gpu, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -1524,6 +1584,41 @@ public:
 		return false;
 	}
+	/// Crop a subimage for every masked pixel and save the concatenated subimages as a new ENVI file.
+
+	/// @param outfile is the name of the output data file (the header is saved as outfile + ".hdr")
+	/// @param nx is the width of each subimage
+	/// @param ny is the height of each subimage
+	/// @param mask is a pointer to memory of size [X * Y]; every nonzero pixel defines one subimage
+	/// @param PROGRESS turns on updates of the progress counter
+	void subimages(std::string outfile, size_t nx, size_t ny, unsigned char* mask, bool PROGRESS = false){
+
+		if(mask == NULL){												//the mask is read below, so it is required
+			std::cout<<"ERROR: a mask is required to specify the subimage positions."<<std::endl;
+			exit(1);
+		}
+
+		size_t n_sub = 0;												//initialize the number of subimages to zero
+		for(size_t i = 0; i < header.lines * header.samples; i++)		//for each pixel in the mask
+			if(mask[i]) n_sub++;										//if the pixel is valid, add a subimage
+
+
+		//save the header for the cropped file
+		stim::envi_header new_header = header;
+		new_header.samples = nx;										//the output is one subimage wide
+		new_header.lines = n_sub * ny;									//the subimages are stacked along the line (y) direction
+
+
+		if (header.interleave == envi_header::BSQ){
+			if (header.data_type == envi_header::float32)
+				((bsq<float>*)file)->subimages(outfile, nx, ny, mask, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				((bsq<double>*)file)->subimages(outfile, nx, ny, mask, PROGRESS);
+			else{
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+		else{															//BIL and BIP files are not supported
+			std::cout<<"ERROR: subimages are only supported for BSQ interleave types. Convert to process."<<std::endl;
+			exit(1);
+		}
+
+		new_header.save(outfile + ".hdr");									//save the header for the output file
+	}
+
+
 	/// Remove a list of bands from the ENVI file
 	/// @param outfile is the file name for the output hyperspectral image (with trimmed bands)
@@ -1801,6 +1896,44 @@ public:
 		}
 		exit(1);
 	}
+
+	
+
+
+	/// Calculate the FFT of each spectrum in a BIP file and save the selected range of the result as a new ENVI file.
+
+	/// @param outfile is the name of the output data file (the header is saved as outfile + ".hdr")
+	/// @param band_min is the lowest value to keep, given in the FFT domain (reciprocal of the wavelength units)
+	/// @param band_max is the highest value to keep in the FFT domain (clamped to the maximum of the FFT range)
+	/// @param samples is the number of samples of each spectrum that are transformed (0 = all bands)
+	/// @param ratio is an optional pointer to a background image that is forwarded to bip::fft()
+	/// @param rx is the width of the background image
+	/// @param ry is the height of the background image
+	/// @param PROGRESS turns on updates of the progress counter
+	/// @param cuda_device is the CUDA device that is forwarded to bip::fft()
+	void fft(std::string outfile, double band_min, double band_max, size_t samples = 0, void* ratio = NULL, size_t rx = 0, size_t ry = 0, bool PROGRESS = false, int cuda_device = 0){
+		//validate the file type first, so that no header is written for a file that can't be processed
+		if(header.interleave != envi_header::BIP){
+			std::cout<<"ERROR: only BIP files supported for FFT"<<std::endl;
+			exit(1);
+		}
+		if(header.data_type != envi_header::float32 && header.data_type != envi_header::float64){
+			std::cout << "ERROR: unidentified data type" << std::endl;
+			exit(1);
+		}
+		if(header.wavelength.size() < 2){											//the band spacing is calculated from the first two wavelengths
+			std::cout<<"ERROR: the header requires at least two wavelength values to calculate an FFT"<<std::endl;
+			exit(1);
+		}
+
+		if(samples == 0) samples = header.bands;
+		double delta = header.wavelength[1] - header.wavelength[0];					//calculate spacing in the current domain
+		double span = samples * delta;												//calculate the span in the current domain
+		double fft_delta = 1.0 / span;												//calculate the spacing in the FFT domain
+		double fft_max = fft_delta * samples/2;										//calculate the maximum range of the FFT
+
+		if(band_min < 0) band_min = 0;												//a negative value can't be converted to a band index
+		if(band_max > fft_max) band_max = fft_max;									//the user gave a band outside of the FFT range, reset the band to the maximum available
+		if(band_max < band_min){
+			std::cout<<"ERROR: the band range requested for the FFT is empty"<<std::endl;
+			exit(1);
+		}
+		size_t start_i = (size_t)std::ceil(band_min / fft_delta);					//calculate the first band to store
+		size_t end_i = (size_t)std::floor(band_max / fft_delta);					//calculate the last band to store
+		if(end_i < start_i){														//the range falls between two FFT samples (the unsigned size below would wrap)
+			std::cout<<"ERROR: the band range requested for the FFT is empty"<<std::endl;
+			exit(1);
+		}
+		size_t size_i = end_i - start_i + 1;										//calculate the number of bands to store
+
+		envi_header new_header = header;
+		new_header.bands = size_i;
+		new_header.set_wavelengths(start_i * fft_delta, fft_delta);
+		new_header.wavelength_units = "inv_" + header.wavelength_units;
+		new_header.save(outfile + ".hdr");
+		
+		if (header.data_type == envi_header::float32)
+			((bip<float>*)file)->fft(outfile, start_i, end_i, samples, (float*)ratio, rx, ry, PROGRESS, cuda_device);
+		else
+			((bip<double>*)file)->fft(outfile, start_i, end_i, samples, (double*)ratio, rx, ry, PROGRESS, cuda_device);
+	}
 };	//end ENVI
 }	//end namespace rts
@@ -78,6 +78,14 @@ struct envi_header
 		load(name);
 	}
+	//fills the wavelength vector with one value per band: start, start + step, start + 2 * step, ...
+	void set_wavelengths(double start, double step){
+		const size_t count = bands;				//one wavelength for each band
+		wavelength.resize(count);
+		size_t i = 0;
+		while(i < count){
+			wavelength[i] = start + i * step;	//uniform spacing from the starting value
+			i++;
+		}
+	}
+
 	std::string trim(std::string line){
 		if(line.length() == 0)
@@ -417,8 +425,13 @@ struct envi_header
 		default:
 			return 0;
 		}
+	}
+	//return the number of bytes that SHOULD be in the data file: (samples * lines * bands) values of valsize() bytes each, plus header_offset
+	size_t data_bytes(){
+		return samples * lines * bands * valsize() + header_offset;
 	}
+	
 	/// Convert an interleave type to a string
 	static std::string interleave_str(interleaveType t){
@@ -142,7 +142,7 @@ public:
 	void mask_finite(unsigned char* out_mask, unsigned char* mask, bool PROGRESS = false){
 		size_t XY = X() * Y();
 		if(mask == NULL)												//if no mask is provided
-			memset(mask, 255, XY * sizeof(unsigned char));				//initialize the mask to 255
+			memset(out_mask, 255, XY * sizeof(unsigned char));				//initialize the mask to 255
 		else															//if a mask is provided
 			memcpy(out_mask, mask, XY * sizeof(unsigned char));			//initialize the current mask to that one
 		T* page = (T*)malloc(R[0] * R[1] * sizeof(T));		//allocate space for a page of data
@@ -12,4 +12,5 @@
    } \
 }
+
 #endif
 \ No newline at end of file
@@ -479,7 +479,7 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 			glEndList();					///finilize the display list.
 			#ifdef DEBUG
 				for(int i = 0; i < numSamplesPos; i++)
-					std::cout << pV[i] << std::endl;
+					std::cout << pV[i].str() << std::endl;
 			#endif
 		}
@@ -1151,8 +1151,8 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 				out[3] = temp[2];
 			}
 			#ifdef DEBUG
-//				std::cout << "out is " << out << std::endl;
-//				std::cout << "when rotating from " << from << " to " << dir << std::endl;
+				std::cout << "out is " << out.str() << std::endl;
+				std::cout << "when rotating from " << from.str() << " to " << dir.str() << std::endl;
 			#endif
 			return out;
 		}
@@ -1545,7 +1545,7 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 				setMagnitude(curSeedMag);
 				#ifdef DEBUG
-					std::cout << "The new seed " << curSeed << curSeedVec << curSeedMag << std::endl;
+					std::cout << "The new seed " << curSeed.str() << curSeedVec.str() << curSeedMag << std::endl;
 				#endif
 //			Bind(direction_texID, direction_buffID, numSamples, n_pixels);
@@ -139,7 +139,7 @@ public:
 	/// @param depth,  number of pixels in depth.
 	void init(int channels, int width, int height, int depth)
 	{
-		R.resize(4);
+		//R.resize(4);
 		R[0] = channels;
 		R[1] = width;
 		R[2] = height;
@@ -10,6 +10,7 @@
 #include <limits>
 #include <typeinfo>
 #include <fstream>
+#include <cstring>
 namespace stim{
 /// This static class provides the STIM interface for loading, saving, and storing 2D images.
@@ -74,18 +75,7 @@ class image{
 #endif
 	/// Returns the value for "white" based on the dynamic range (assumes white is 1.0 for floating point images)
 	T white(){
-
-		if(typeid(T) == typeid(unsigned char))		return UCHAR_MAX;
-		if(typeid(T) == typeid(unsigned short))		return SHRT_MAX;
-		if(typeid(T) == typeid(unsigned))			return UINT_MAX;
-		if(typeid(T) == typeid(unsigned long))		return ULONG_MAX;
-		if(typeid(T) == typeid(unsigned long long))	return ULLONG_MAX;
-		if(typeid(T) == typeid(float))				return 1.0f;
-		if(typeid(T) == typeid(double))				return 1.0;
-
-		std::cout<<"ERROR in stim::image::white - no white value known for this data type"<<std::endl;
-		exit(1);
-
+		//integer pixel types are white at their largest representable value
+		if(std::numeric_limits<T>::is_integer)		return std::numeric_limits<T>::max();
+		//floating point images are normalized, so white stays 1.0 (max() would be FLT_MAX/DBL_MAX)
+		return (T)1;
 	}
+#ifndef RTS_BESSEL_H
+#define RTS_BESSEL_H
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+#include "../math/complex.h"
+#define eps 1e-15
+#define el 0.5772156649015329
+
+
+namespace stim{
+
+static complex<double> cii(0.0,1.0);
+static complex<double> cone(1.0,0.0);
+static complex<double> czero(0.0,0.0);
+
+//  Gamma function of a real argument x.
+//
+//  Returns the sentinel 1e308 when x > 171 (overflow flag) or when x is an
+//  integer that is not positive. Positive integers are evaluated as the
+//  factorial product 2*3*...*(x-1). Every other argument is evaluated from a
+//  25-term polynomial in the fractional part, then extended to |x| > 1 with
+//  the recurrence product and to negative x with the sin(pi*x) relation.
+template< typename P >
+P gamma(P x)
+{
+    // polynomial coefficients: sum(coeff[k]*z^(k+1)) is the reciprocal of the result for |z| <= 1
+    static P coeff[] = {
+        1.0,
+        0.5772156649015329,
+       -0.6558780715202538,
+       -0.420026350340952e-1,
+        0.1665386113822915,
+       -0.421977345555443e-1,
+       -0.9621971527877e-2,
+        0.7218943246663e-2,
+       -0.11651675918591e-2,
+       -0.2152416741149e-3,
+        0.1280502823882e-3,
+       -0.201348547807e-4,
+       -0.12504934821e-5,
+        0.1133027232e-5,
+       -0.2056338417e-6,
+        0.6116095e-8,
+        0.50020075e-8,
+       -0.11812746e-8,
+        0.1043427e-9,
+        0.77823e-11,
+       -0.36968e-11,
+        0.51e-12,
+       -0.206e-13,
+       -0.54e-14,
+        0.14e-14};
+
+    // arguments this large are reported with the overflow flag
+    if (x > 171.0) return 1e308;
+
+    // integer arguments: factorial for positive values, flag otherwise
+    if (x == (int)x) {
+        if (!(x > 0.0)) return 1e308;
+        P factorial = 1.0;
+        for (int i = 2; i < x; ++i) {
+            factorial *= i;
+        }
+        return factorial;
+    }
+
+    // non-integer arguments: reduce to the fractional part when |x| > 1
+    const bool outside_unit = fabs(x) > 1.0;
+    P z = x;
+    P product = 1.0;
+    if (outside_unit) {
+        z = fabs(x);
+        const int whole = (int)z;
+        for (int k = 1; k <= whole; ++k) {
+            product *= (z-k);
+        }
+        z -= whole;
+    }
+
+    // Horner evaluation of the polynomial, highest coefficient first
+    P series = coeff[24];
+    for (int k = 23; k >= 0; --k) {
+        series = series*z+coeff[k];
+    }
+
+    P result = 1.0/(series*z);
+    if (outside_unit) {
+        result *= product;
+        // negative arguments are obtained from the value at |x|
+        if (x < 0.0) {
+            result = -M_PI/(x*result*sin(M_PI*x));
+        }
+    }
+    return result;
+}
+
+//  Computes the order-0 and order-1 Bessel functions of the first kind
+//  (j0, j1) and second kind (y0, y1) at x, together with their derivatives
+//  (j0p, j1p, y0p, y1p).
+//
+//  For x <= 12 power series are summed (at most 30 terms each, stopping once
+//  a term is below 1e-15 relative to the running sum). For larger x,
+//  expansions in inverse powers of x are used with kz terms taken from the
+//  tables a, b, a1 and b1.
+//
+//  Returns 1 without writing any output when x < 0, and 0 otherwise.
+//  At x == 0 the second-kind values and their derivatives are reported with
+//  the -1e308 / 1e308 sentinels.
+template<typename P>
+int bessjy01a(P x,P &j0,P &j1,P &y0,P &y1,
+    P &j0p,P &j1p,P &y0p,P &y1p)
+{
+    P x2,r,ec,w0,w1,r0,r1,cs0,cs1;
+    P cu,p0,q0,p1,q1,t1,t2;
+    int k,kz;
+    // coefficients of x^(-2k-2) in p0 (large-x branch, order 0)
+    static P a[] = {
+        -7.03125e-2,
+         0.112152099609375,
+        -0.5725014209747314,
+         6.074042001273483,
+        -1.100171402692467e2,
+         3.038090510922384e3,
+        -1.188384262567832e5,
+         6.252951493434797e6,
+        -4.259392165047669e8,
+         3.646840080706556e10,
+        -3.833534661393944e12,
+         4.854014686852901e14,
+        -7.286857349377656e16,
+         1.279721941975975e19};
+    // coefficients of x^(-2k-3) in q0 (large-x branch, order 0)
+    static P b[] = {
+         7.32421875e-2,
+        -0.2271080017089844,
+         1.727727502584457,
+        -2.438052969955606e1,
+         5.513358961220206e2,
+        -1.825775547429318e4,
+         8.328593040162893e5,
+        -5.006958953198893e7,
+         3.836255180230433e9,
+        -3.649010818849833e11,
+         4.218971570284096e13,
+        -5.827244631566907e15,
+         9.476288099260110e17,
+        -1.792162323051699e20};
+    // coefficients of x^(-2k-2) in p1 (large-x branch, order 1)
+    static P a1[] = {
+         0.1171875,
+        -0.1441955566406250,
+         0.6765925884246826,
+        -6.883914268109947,
+         1.215978918765359e2,
+        -3.302272294480852e3,
+         1.276412726461746e5,
+        -6.656367718817688e6,
+         4.502786003050393e8,
+        -3.833857520742790e10,
+         4.011838599133198e12,
+        -5.060568503314727e14,
+         7.572616461117958e16,
+        -1.326257285320556e19};
+    // coefficients of x^(-2k-3) in q1 (large-x branch, order 1)
+    static P b1[] = {
+        -0.1025390625,
+         0.2775764465332031,
+        -1.993531733751297,
+         2.724882731126854e1,
+        -6.038440767050702e2,
+         1.971837591223663e4,
+        -8.902978767070678e5,
+         5.310411010968522e7,
+        -4.043620325107754e9,
+         3.827011346598605e11,
+        -4.406481417852278e13,
+         6.065091351222699e15,
+        -9.833883876590679e17,
+         1.855045211579828e20};
+
+    // negative arguments are rejected; outputs are left untouched
+    if (x < 0.0) return 1;
+    // x == 0: fixed values, with +/-1e308 standing in for the unbounded second-kind terms
+    if (x == 0.0) {
+        j0 = 1.0;
+        j1 = 0.0;
+        y0 = -1e308;
+        y1 = -1e308;
+        j0p = 0.0;
+        j1p = 0.5;
+        y0p = 1e308;
+        y1p = 1e308;
+        return 0;
+    }
+    x2 = x*x;
+    if (x <= 12.0) {
+        // series for j0
+        j0 = 1.0;
+        r = 1.0;
+        for (k=1;k<=30;k++) {
+            r *= -0.25*x2/(k*k);
+            j0 += r;
+            if (fabs(r) < fabs(j0)*1e-15) break;
+        }
+        // series for j1 (scaled by 0.5*x after the loop)
+        j1 = 1.0;
+        r = 1.0;
+        for (k=1;k<=30;k++) {
+            r *= -0.25*x2/(k*(k+1));
+            j1 += r;
+            if (fabs(r) < fabs(j1)*1e-15) break;
+        }
+        j1 *= 0.5*x;
+        // ec = log(x/2) + el, where el is the constant #defined at the top of this file
+        ec = log(0.5*x)+el;
+        // series for y0; w0 accumulates the partial sums 1 + 1/2 + ... + 1/k
+        cs0 = 0.0;
+        w0 = 0.0;
+        r0 = 1.0;
+        for (k=1;k<=30;k++) {
+            w0 += 1.0/k;
+            r0 *= -0.25*x2/(k*k);
+            r = r0 * w0;
+            cs0 += r;
+            if (fabs(r) < fabs(cs0)*1e-15) break;
+        }
+        y0 = M_2_PI*(ec*j0-cs0);
+        // series for y1
+        cs1 = 1.0;
+        w1 = 0.0;
+        r1 = 1.0;
+        for (k=1;k<=30;k++) {
+            w1 += 1.0/k;
+            r1 *= -0.25*x2/(k*(k+1));
+            r = r1*(2.0*w1+1.0/(k+1));
+            cs1 += r;
+            if (fabs(r) < fabs(cs1)*1e-15) break;
+        }
+        y1 = M_2_PI * (ec*j1-1.0/x-0.25*x*cs1);
+    }
+    else {
+        // number of table terms used; fewer terms are taken for larger x
+        if (x >= 50.0) kz = 8;          // Can be changed to 10
+        else if (x >= 35.0) kz = 10;    //  "       "        12
+        else kz = 12;                   //  "       "        14
+        // order 0: phase t1 = x - pi/4
+        t1 = x-M_PI_4;
+        p0 = 1.0;
+        q0 = -0.125/x;
+        for (k=0;k<kz;k++) {
+            p0 += a[k]*pow(x,-2*k-2);
+            q0 += b[k]*pow(x,-2*k-3);
+        }
+        cu = sqrt(M_2_PI/x);
+        j0 = cu*(p0*cos(t1)-q0*sin(t1));
+        y0 = cu*(p0*sin(t1)+q0*cos(t1));
+        // order 1: phase t2 = x - 3*pi/4
+        t2 = x-0.75*M_PI;
+        p1 = 1.0;
+        q1 = 0.375/x;
+        for (k=0;k<kz;k++) {
+            p1 += a1[k]*pow(x,-2*k-2);
+            q1 += b1[k]*pow(x,-2*k-3);
+        }
+        j1 = cu*(p1*cos(t2)-q1*sin(t2));
+        y1 = cu*(p1*sin(t2)+q1*cos(t2));
+    }
+    // derivatives follow from the order-0 and order-1 values
+    j0p = -j1;
+    j1p = j0-j1/x;
+    y0p = -y1;
+    y1p = y0-y1/x;
+    return 0;
+}
+//
+//  INPUT:
+//      double x    -- argument of Bessel function
+//
+//  OUTPUT:
+//      double j0   -- Bessel function of 1st kind, 0th order
+//      double j1   -- Bessel function of 1st kind, 1st order
+//      double y0   -- Bessel function of 2nd kind, 0th order
+//      double y1   -- Bessel function of 2nd kind, 1st order
+//      double j0p  -- derivative of Bessel function of 1st kind, 0th order
+//      double j1p  -- derivative of Bessel function of 1st kind, 1st order
+//      double y0p  -- derivative of Bessel function of 2nd kind, 0th order
+//      double y1p  -- derivative of Bessel function of 2nd kind, 1st order
+//
+//  RETURN:
+//      int error code: 0 = OK, 1 = error
+//
+//  This algorithm computes the functions using polynomial approximations.
+//
+//  Polynomial-approximation variant: two fixed sets of polynomials are used,
+//  one in (x/4)^2 for x <= 4 and one in (4/x)^2 for x > 4.
+//  Returns 1 without writing any output when x < 0, and 0 otherwise.
+template<typename P>
+int bessjy01b(P x,P &j0,P &j1,P &y0,P &y1,
+    P &j0p,P &j1p,P &y0p,P &y1p)
+{
+    P t,t2,dtmp,a0,p0,q0,p1,q1,ta0,ta1;
+    // negative arguments are rejected; outputs are left untouched
+    if (x < 0.0) return 1;
+    // x == 0: fixed values, with +/-1e308 standing in for the unbounded second-kind terms
+    if (x == 0.0) {
+        j0 = 1.0;
+        j1 = 0.0;
+        y0 = -1e308;
+        y1 = -1e308;
+        j0p = 0.0;
+        j1p = 0.5;
+        y0p = 1e308;
+        y1p = 1e308;
+        return 0;
+    }
+    if(x <= 4.0) {
+        // small-argument branch: polynomials in t2 = (x/4)^2
+        t = x/4.0;
+        t2 = t*t;
+        j0 = ((((((-0.5014415e-3*t2+0.76771853e-2)*t2-0.0709253492)*t2+
+            0.4443584263)*t2-1.7777560599)*t2+3.9999973021)*t2
+            -3.9999998721)*t2+1.0;
+        j1 = t*(((((((-0.1289769e-3*t2+0.22069155e-2)*t2-0.0236616773)*t2+
+            0.1777582922)*t2-0.8888839649)*t2+2.6666660544)*t2-
+            3.999999971)*t2+1.9999999998);
+        // y0 = (2/pi)*log(x/2)*j0 + polynomial
+        dtmp = (((((((-0.567433e-4*t2+0.859977e-3)*t2-0.94855882e-2)*t2+
+            0.0772975809)*t2-0.4261737419)*t2+1.4216421221)*t2-
+            2.3498519931)*t2+1.0766115157)*t2+0.3674669052;
+        y0 = M_2_PI*log(0.5*x)*j0+dtmp;
+        // y1 = (2/pi)*log(x/2)*j1 + polynomial/x
+        dtmp = (((((((0.6535773e-3*t2-0.0108175626)*t2+0.107657607)*t2-
+            0.7268945577)*t2+3.1261399273)*t2-7.3980241381)*t2+
+            6.8529236342)*t2+0.3932562018)*t2-0.6366197726;
+        y1 = M_2_PI*log(0.5*x)*j1+dtmp/x;
+    }
+    else {
+        // large-argument branch: polynomials in t2 = (4/x)^2 combined with sin/cos of a shifted phase
+        t = 4.0/x;
+        t2 = t*t;
+        a0 = sqrt(M_2_PI/x);
+        p0 = ((((-0.9285e-5*t2+0.43506e-4)*t2-0.122226e-3)*t2+
+             0.434725e-3)*t2-0.4394275e-2)*t2+0.999999997;
+        q0 = t*(((((0.8099e-5*t2-0.35614e-4)*t2+0.85844e-4)*t2-
+            0.218024e-3)*t2+0.1144106e-2)*t2-0.031249995);
+        // order 0: phase ta0 = x - pi/4
+        ta0 = x-M_PI_4;
+        j0 = a0*(p0*cos(ta0)-q0*sin(ta0));
+        y0 = a0*(p0*sin(ta0)+q0*cos(ta0));
+        p1 = ((((0.10632e-4*t2-0.50363e-4)*t2+0.145575e-3)*t2
+            -0.559487e-3)*t2+0.7323931e-2)*t2+1.000000004;
+        q1 = t*(((((-0.9173e-5*t2+0.40658e-4)*t2-0.99941e-4)*t2
+            +0.266891e-3)*t2-0.1601836e-2)*t2+0.093749994);
+        // order 1: phase ta1 = x - 3*pi/4
+        ta1 = x-0.75*M_PI;
+        j1 = a0*(p1*cos(ta1)-q1*sin(ta1));
+        y1 = a0*(p1*sin(ta1)+q1*cos(ta1));
+    }
+    // derivatives follow from the order-0 and order-1 values
+    j0p = -j1;
+    j1p = j0-j1/x;
+    y0p = -y1;
+    y1p = y0-y1/x;
+    return 0;
+}
+//  Returns an integer order nn located by a secant search (at most 20 steps)
+//  for a zero of
+//      0.5*log10(6.28*nn) - nn*log10(1.36*|x|/nn) - mp
+//  starting from (int)(1.1*|x|)+1 and that value plus 5. The search stops as
+//  soon as an iteration reproduces the previous estimate.
+template<typename P>
+int msta1(P x,int mp)
+{
+    const P mag = fabs(x);
+
+    // two starting orders and the objective evaluated at each of them
+    int prev = (int)(1.1*mag)+1;
+    P fprev = 0.5*log10(6.28*prev)-prev*log10(1.36*mag/prev)-mp;
+    int cur = prev+5;
+    P fcur = 0.5*log10(6.28*cur)-cur*log10(1.36*mag/cur)-mp;
+
+    int next = cur;
+    for (int iter = 0; iter < 20; ++iter) {
+        // secant step, truncated to an integer order
+        next = (int)(cur-(cur-prev)/(1.0-fprev/fcur));
+        if (std::abs(next-cur) < 1) break;
+        const P fnext = 0.5*log10(6.28*next)-next*log10(1.36*mag/next)-mp;
+        prev = cur;
+        fprev = fcur;
+        cur = next;
+        fcur = fnext;
+    }
+    return next;
+}
+//  Returns an integer order (plus a margin of 10) located by a secant search
+//  (at most 20 steps) for a zero of
+//      0.5*log10(6.28*nn) - nn*log10(1.36*|x|/nn) - obj
+//  The target obj and the starting order depend on how the same expression,
+//  evaluated at the requested order n, compares with mp/2.
+template<typename P>
+int msta2(P x,int n,int mp)
+{
+    const P mag = fabs(x);
+    const P half_mp = 0.5*mp;
+    const P at_n = 0.5*log10(6.28*n)-n*log10(1.36*mag/n);
+
+    // choose the target value and the first starting order
+    P target;
+    int prev;
+    if (at_n <= half_mp) {
+        target = mp;
+        prev = (int)(1.1*mag);
+        if (prev < 1) prev = 1;
+    }
+    else {
+        target = half_mp+at_n;
+        prev = n;
+    }
+
+    P fprev = 0.5*log10(6.28*prev)-prev*log10(1.36*mag/prev)-target;
+    int cur = prev+5;
+    P fcur = 0.5*log10(6.28*cur)-cur*log10(1.36*mag/cur)-target;
+
+    int next = cur;
+    for (int iter = 0; iter < 20; ++iter) {
+        // secant step, truncated to an integer order
+        next = (int)(cur-(cur-prev)/(1.0-fprev/fcur));
+        if (std::abs(next-cur) < 1) break;
+        const P fnext = 0.5*log10(6.28*next)-next*log10(1.36*mag/next)-target;
+        prev = cur;
+        fprev = fcur;
+        cur = next;
+        fcur = fnext;
+    }
+    return next+10;
+}
+//
+//  INPUT:
+//  double x    -- argument of Bessel function of 1st and 2nd kind.
+//  int n       -- order
+//
+//  OUTPUT:
+//
+//  int nm      -- highest order actually computed (nm <= n)
+//  double jn[] -- Bessel function of 1st kind, orders from 0 to nm
+//  double yn[] -- Bessel function of 2nd kind, orders from 0 to nm
+//  double j'n[]-- derivative of Bessel function of 1st kind,
+//                      orders from 0 to nm
+//  double y'n[]-- derivative of Bessel function of 2nd kind,
+//                      orders from 0 to nm
+//
+//  Computes Bessel functions of all order up to 'n' using recurrence
+//  relations. If 'nm' < 'n' only 'nm' orders are returned.
+//
+//  Implementation notes:
+//   * Returns 1 without writing jn/yn/jnp/ynp when x < 0 or n < 0 (nm is
+//     already set to n at that point); returns 0 otherwise.
+//   * Orders 0 and 1 come from bessjy01a, which always writes index 1, so
+//     every output array needs at least two elements even when n == 0.
+//   * jn for orders >= 2 uses forward recurrence when n < (int)(0.9*x) and
+//     backward recurrence (started at an order chosen by msta1/msta2 and
+//     rescaled to match order 0 or 1) otherwise.
+//   * yn for orders >= 2 always uses forward recurrence.
+template<typename P>
+int bessjyna(int n,P x,int &nm,P *jn,P *yn,
+    P *jnp,P *ynp)
+{
+    P bj0,bj1,f,f0,f1,f2,cs;
+    int i,k,m;
+
+    nm = n;
+    if ((x < 0.0) || (n < 0)) return 1;
+    // x is effectively zero: fixed values, with +/-1e308 as sentinels for the second kind
+    if (x < 1e-15) {
+        for (i=0;i<=n;i++) {
+            jn[i] = 0.0;
+            yn[i] = -1e308;
+            jnp[i] = 0.0;
+            ynp[i] = 1e308;
+        }
+        jn[0] = 1.0;
+        jnp[1] = 0.5;
+        return 0;
+    }
+    // orders 0 and 1 (x >= 1e-15 here, so bessjy01a cannot report an error)
+    bessjy01a(x,jn[0],jn[1],yn[0],yn[1],jnp[0],jnp[1],ynp[0],ynp[1]);
+    if (n < 2) return 0;
+    bj0 = jn[0];
+    bj1 = jn[1];
+    // The cast must apply to the product: "(int)0.9*x" evaluates to 0*x, which
+    // made this branch unreachable for n >= 2.
+    if (n < (int)(0.9*x)) {
+        // forward recurrence from orders 0 and 1
+        for (k=2;k<=n;k++) {
+            jn[k] = 2.0*(k-1.0)*bj1/x-bj0;
+            bj0 = bj1;
+            bj1 = jn[k];
+        }
+    }
+    else {
+        // backward recurrence from starting order m; nm is lowered when m < n
+        m = msta1(x,200);
+        if (m < n) nm = m;
+        else m = msta2(x,n,15);
+        f2 = 0.0;
+        f1 = 1.0e-100;
+        for (k=m;k>=0;k--) {
+            f = 2.0*(k+1.0)/x*f1-f2;
+            if (k <= nm) jn[k] = f;
+            f2 = f1;
+            f1 = f;
+        }
+        // rescale against whichever of J0/J1 has the larger magnitude
+        // (after the loop f is the unscaled order-0 value and f2 the order-1 value)
+        if (fabs(bj0) > fabs(bj1)) cs = bj0/f;
+        else cs = bj1/f2;
+        for (k=0;k<=nm;k++) {
+            jn[k] *= cs;
+        }
+    }
+    // derivatives of the first kind for orders >= 2
+    for (k=2;k<=nm;k++) {
+        jnp[k] = jn[k-1]-k*jn[k]/x;
+    }
+    // second kind: forward recurrence from orders 0 and 1
+    f0 = yn[0];
+    f1 = yn[1];
+    for (k=2;k<=nm;k++) {
+        f = 2.0*(k-1.0)*f1/x-f0;
+        yn[k] = f;
+        f0 = f1;
+        f1 = f;
+    }
+    // derivatives of the second kind for orders >= 2
+    for (k=2;k<=nm;k++) {
+        ynp[k] = yn[k-1]-k*yn[k]/x;
+    }
+    return 0;
+}
+//
+//  Same input and output conventions as above. Different recurrence
+//  relations used for 'x' < 300.
+//
+//  Implementation notes:
+//   * Returns 1 without writing the output arrays when x < 0 or n < 0;
+//     returns 0 otherwise.
+//   * When x <= 300 or n > (int)(0.9*x): backward recurrence for jn,
+//     normalized by the sum s0 accumulated during the recurrence; yn[0] and
+//     yn[1] are built from the sums su and sv. In this branch n == 0 is
+//     promoted to nm = 1, so index 1 of the arrays is written.
+//   * Otherwise: four-term expansions in inverse powers of x for orders 0
+//     and 1, followed by forward recurrence for jn.
+//   * yn for orders >= 2 always uses forward recurrence.
+template<typename P>
+int bessjynb(int n,P x,int &nm,P *jn,P *yn,
+    P *jnp,P *ynp)
+{
+    P t1,t2,f,f1,f2,bj0,bj1,bjk,by0,by1,cu,s0,su,sv;
+    P ec,bs,byk,p0,p1,q0,q1;
+    // coefficients of x^(-2k-2) in p0 (large-x branch, order 0)
+    static P a[] = {
+        -0.7031250000000000e-1,
+         0.1121520996093750,
+        -0.5725014209747314,
+         6.074042001273483};
+    // coefficients of x^(-2k-3) in q0 (large-x branch, order 0)
+    static P b[] = {
+         0.7324218750000000e-1,
+        -0.2271080017089844,
+         1.727727502584457,
+        -2.438052969955606e1};
+    // coefficients of x^(-2k-2) in p1 (large-x branch, order 1)
+    static P a1[] = {
+         0.1171875,
+        -0.1441955566406250,
+         0.6765925884246826,
+        -6.883914268109947};
+    // coefficients of x^(-2k-3) in q1 (large-x branch, order 1)
+    static P b1[] = {
+       -0.1025390625,
+        0.2775764465332031,
+       -1.993531733751297,
+        2.724882731126854e1};
+
+    int i,k,m;
+    nm = n;
+    if ((x < 0.0) || (n < 0)) return 1;
+    // x is effectively zero: fixed values, with +/-1e308 as sentinels for the second kind
+    if (x < 1e-15) {
+        for (i=0;i<=n;i++) {
+            jn[i] = 0.0;
+            yn[i] = -1e308;
+            jnp[i] = 0.0;
+            ynp[i] = 1e308;
+        }
+        jn[0] = 1.0;
+        jnp[1] = 0.5;
+        return 0;
+    }
+    if (x <= 300.0 || n > (int)(0.9*x)) {
+        // at least orders 0 and 1 are produced in this branch
+        if (n == 0) nm = 1;
+        // starting order m for the backward recurrence; nm is lowered when m < nm
+        m = msta1(x,200);
+        if (m < nm) nm = m;
+        else m = msta2(x,nm,15);
+        bs = 0.0;
+        su = 0.0;
+        sv = 0.0;
+        f2 = 0.0;
+        f1 = 1.0e-100;
+        for (k = m;k>=0;k--) {
+            f = 2.0*(k+1.0)/x*f1 - f2;
+            if (k <= nm) jn[k] = f;
+            // even k > 0: contributes to the normalization sum bs and to su
+            if ((k == 2*(int)(k/2)) && (k != 0)) {
+                bs += 2.0*f;
+//                su += pow(-1,k>>1)*f/(double)k;
+                // (-1)*((k & 2)-1) is +1 when bit 1 of k is clear and -1 when it is set
+                su += (-1)*((k & 2)-1)*f/(P)k;
+            }
+            // odd k > 1: contributes to sv
+            else if (k > 1) {
+//                sv += pow(-1,k>>1)*k*f/(k*k-1.0);
+                sv += (-1)*((k & 2)-1)*(P)k*f/(k*k-1.0);
+            }
+            f2 = f1;
+            f1 = f;
+        }
+        // normalization: s0 = (unscaled order-0 value) + 2*(sum of unscaled even orders)
+        s0 = bs+f;
+        for (k=0;k<=nm;k++) {
+            jn[k] /= s0;
+        }
+        // ec = log(x/2) + 0.5772156649015329
+        ec = log(0.5*x) +0.5772156649015329;
+        by0 = M_2_PI*(ec*jn[0]-4.0*su/s0);
+        yn[0] = by0;
+        by1 = M_2_PI*((ec-1.0)*jn[1]-jn[0]/x-4.0*sv/s0);
+        yn[1] = by1;
+    }
+    else {
+        // order 0: phase t1 = x - pi/4
+        t1 = x-M_PI_4;
+        p0 = 1.0;
+        q0 = -0.125/x;
+        for (k=0;k<4;k++) {
+            p0 += a[k]*pow(x,-2*k-2);
+            q0 += b[k]*pow(x,-2*k-3);
+        }
+        cu = sqrt(M_2_PI/x);
+        bj0 = cu*(p0*cos(t1)-q0*sin(t1));
+        by0 = cu*(p0*sin(t1)+q0*cos(t1));
+        jn[0] = bj0;
+        yn[0] = by0;
+        // order 1: phase t2 = x - 3*pi/4
+        t2 = x-0.75*M_PI;
+        p1 = 1.0;
+        q1 = 0.375/x;
+        for (k=0;k<4;k++) {
+            p1 += a1[k]*pow(x,-2*k-2);
+            q1 += b1[k]*pow(x,-2*k-3);
+        }
+        bj1 = cu*(p1*cos(t2)-q1*sin(t2));
+        by1 = cu*(p1*sin(t2)+q1*cos(t2));
+        jn[1] = bj1;
+        yn[1] = by1;
+        // forward recurrence for the first kind
+        for (k=2;k<=nm;k++) {
+            bjk = 2.0*(k-1.0)*bj1/x-bj0;
+            jn[k] = bjk;
+            bj0 = bj1;
+            bj1 = bjk;
+        }
+    }
+    // derivatives of the first kind
+    jnp[0] = -jn[1];
+    for (k=1;k<=nm;k++) {
+        jnp[k] = jn[k-1]-k*jn[k]/x;
+    }
+    // second kind: forward recurrence from orders 0 and 1
+    for (k=2;k<=nm;k++) {
+        byk = 2.0*(k-1.0)*by1/x-by0;
+        yn[k] = byk;
+        by0 = by1;
+        by1 = byk;
+    }
+    // derivatives of the second kind
+    ynp[0] = -yn[1];
+    for (k=1;k<=nm;k++) {
+        ynp[k] = yn[k-1]-k*yn[k]/x;
+    }
+    return 0;
+
+}
+
+//  The following routine computes Bessel Jv(x) and Yv(x) for
+//  arbitrary positive order (v). For negative order, use:
+//
+//      J-v(x) = Jv(x)cos(v pi) - Yv(x)sin(v pi)
+//      Y-v(x) = Jv(x)sin(v pi) + Yv(x)cos(v pi)
+//
+//  Parameters:
+//      v    -- requested order; split into n = (int)v and v0 = v - n
+//      x    -- argument
+//      vm   -- output: highest order computed, n + v0 (n may be lowered
+//              by the backward-recurrence start selection)
+//      jv, yv, djv, dyv -- output arrays; index k holds the value for order
+//              v0 + k. Index 1 is always written, so each array needs at
+//              least two elements.
+//
+//  Returns 1 without writing any output when x < 0 or v < 0, and 0 otherwise.
+template<typename P>
+int bessjyv(P v,P x,P &vm,P *jv,P *yv,
+    P *djv,P *dyv)
+{
+    P v0,vl,vg,vv,a,a0,r,x2,bjv0,bjv1,bjvl,f,f0,f1,f2;
+    P r0,r1,ck,cs,cs0,cs1,sk,qx,px,byv0,byv1,rp,xk,rq;
+    P b,ec,w0,w1,bju0,bju1,pv0,pv1,byvk;
+    int j,k,l,m,n,kz;
+
+    x2 = x*x;
+    // split the order into integer part n and fractional part v0
+    n = (int)v;
+    v0 = v-n;
+    if ((x < 0.0) || (v < 0.0)) return 1;
+    // x is effectively zero: fixed values, with +/-1e308 used as sentinels
+    if (x < 1e-15) {
+        for (k=0;k<=n;k++) {
+            jv[k] = 0.0;
+            yv[k] = -1e308;
+            djv[k] = 0.0;
+            dyv[k] = 1e308;
+            if (v0 == 0.0) {
+                jv[0] = 1.0;
+                djv[1] = 0.5;
+            }
+            else djv[0] = 1e308;
+        }
+        vm = v;
+        return 0;
+    }
+    if (x <= 12.0) {
+        // power series for the first kind at orders v0 (l == 0) and v0+1 (l == 1)
+        for (l=0;l<2;l++) {
+            vl = v0 + l;
+            bjvl = 1.0;
+            r = 1.0;
+            for (k=1;k<=40;k++) {
+                r *= -0.25*x2/(k*(k+vl));
+                bjvl += r;
+                if (fabs(r) < fabs(bjvl)*1e-15) break;
+            }
+            vg = 1.0 + vl;
+            a = pow(0.5*x,vl)/gamma(vg);
+            if (l == 0) bjv0 = bjvl*a;
+            else bjv1 = bjvl*a;
+        }
+    }
+    else {
+        // number of expansion terms; fewer terms are taken for larger x
+        if (x >= 50.0) kz = 8;
+        else if (x >= 35.0) kz = 10;
+        else kz = 11;
+        // large-x expansions for both kinds at orders v0 (j == 0) and v0+1 (j == 1)
+        for (j=0;j<2;j++) {
+            vv = 4.0*(j+v0)*(j+v0);
+            px = 1.0;
+            rp = 1.0;
+            for (k=1;k<=kz;k++) {
+                rp *= (-0.78125e-2)*(vv-pow(4.0*k-3.0,2.0))*
+                    (vv-pow(4.0*k-1.0,2.0))/(k*(2.0*k-1.0)*x2);
+                px += rp;
+            }
+            qx = 1.0;
+            rq = 1.0;
+            for (k=1;k<=kz;k++) {
+                rq *= (-0.78125e-2)*(vv-pow(4.0*k-1.0,2.0))*
+                    (vv-pow(4.0*k+1.0,2.0))/(k*(2.0*k+1.0)*x2);
+                qx += rq;
+            }
+            qx *= 0.125*(vv-1.0)/x;
+            // phase shift depends on the order j+v0
+            xk = x-(0.5*(j+v0)+0.25)*M_PI;
+            a0 = sqrt(M_2_PI/x);
+            ck = cos(xk);
+            sk = sin(xk);
+
+            if (j == 0) {
+                bjv0 = a0*(px*ck-qx*sk);
+                byv0 = a0*(px*sk+qx*ck);
+            }
+            else if (j == 1) {
+                bjv1 = a0*(px*ck-qx*sk);
+                byv1 = a0*(px*sk+qx*ck);
+            }
+        }
+    }
+    jv[0] = bjv0;
+    jv[1] = bjv1;
+    djv[0] = v0*jv[0]/x-jv[1];
+    djv[1] = -(1.0+v0)*jv[1]/x+jv[0];
+    if ((n >= 2) && (n <= (int)(0.9*x))) {
+        // forward recurrence for the first kind
+        f0 = bjv0;
+        f1 = bjv1;
+        for (k=2;k<=n;k++) {
+            f = 2.0*(k+v0-1.0)*f1/x-f0;
+            jv[k] = f;
+            f0 = f1;
+            f1 = f;
+        }
+    }
+    else if (n >= 2) {
+        // backward recurrence from starting order m; n is lowered when m < n
+        m = msta1(x,200);
+        if (m < n) n = m;
+        else m = msta2(x,n,15);
+        f2 = 0.0;
+        f1 = 1.0e-100;
+        for (k=m;k>=0;k--) {
+            f = 2.0*(v0+k+1.0)*f1/x-f2;
+            if (k <= n) jv[k] = f;
+            f2 = f1;
+            f1 = f;
+        }
+        // rescale against whichever of the two lowest orders has the larger magnitude
+        if (fabs(bjv0) > fabs(bjv1)) cs = bjv0/f;
+        else cs = bjv1/f2;
+        for (k=0;k<=n;k++) {
+            jv[k] *= cs;
+        }
+    }
+    // derivatives of the first kind for k >= 2
+    for (k=2;k<=n;k++) {
+        djv[k] = -(k+v0)*jv[k]/x+jv[k-1];
+    }
+    // second kind for x <= 12 (for x > 12 byv0/byv1 were already set above)
+    if (x <= 12.0) {
+        if (v0 != 0.0) {
+            // fractional order: series with -vl in place of vl, then combine with bjv0/bjv1
+            for (l=0;l<2;l++) {
+                vl = v0 +l;
+                bjvl = 1.0;
+                r = 1.0;
+                for (k=1;k<=40;k++) {
+                    r *= -0.25*x2/(k*(k-vl));
+                    bjvl += r;
+                    if (fabs(r) < fabs(bjvl)*1e-15) break;
+                }
+                vg = 1.0-vl;
+                b = pow(2.0/x,vl)/gamma(vg);
+                if (l == 0) bju0 = bjvl*b;
+                else bju1 = bjvl*b;
+            }
+            pv0 = M_PI*v0;
+            pv1 = M_PI*(1.0+v0);
+            byv0 = (bjv0*cos(pv0)-bju0)/sin(pv0);
+            byv1 = (bjv1*cos(pv1)-bju1)/sin(pv1);
+        }
+        else {
+            // integer order: fixed 30-term series (no early exit)
+            // ec = log(x/2) + el, where el is the constant #defined at the top of this file
+            ec = log(0.5*x)+el;
+            cs0 = 0.0;
+            w0 = 0.0;
+            r0 = 1.0;
+            for (k=1;k<=30;k++) {
+                w0 += 1.0/k;
+                r0 *= -0.25*x2/(k*k);
+                cs0 += r0*w0;
+            }
+            byv0 = M_2_PI*(ec*bjv0-cs0);
+            cs1 = 1.0;
+            w1 = 0.0;
+            r1 = 1.0;
+            for (k=1;k<=30;k++) {
+                w1 += 1.0/k;
+                r1 *= -0.25*x2/(k*(k+1));
+                cs1 += r1*(2.0*w1+1.0/(k+1.0));
+            }
+            byv1 = M_2_PI*(ec*bjv1-1.0/x-0.25*x*cs1);
+        }
+    }
+    yv[0] = byv0;
+    yv[1] = byv1;
+    // forward recurrence for the second kind
+    for (k=2;k<=n;k++) {
+        byvk = 2.0*(v0+k-1.0)*byv1/x-byv0;
+        yv[k] = byvk;
+        byv0 = byv1;
+        byv1 = byvk;
+    }
+    // derivatives of the second kind
+    dyv[0] = v0*yv[0]/x-yv[1];
+    for (k=1;k<=n;k++) {
+        dyv[k] = -(k+v0)*yv[k]/x+yv[k-1];
+    }
+    // report the highest order actually computed
+    vm = n + v0;
+    return 0;
+}
+
+//  Spherical variant: evaluates bessjyv at order v + 0.5 and rescales the
+//  results by sqrt(PI/(2z)).
+//
+//      v     -- highest integer order requested
+//      z     -- argument
+//      vm    -- output: highest order reported by bessjyv
+//      cjv, cyv   -- outputs: scaled first/second-kind values, index 0..v
+//      cjvp, cyvp -- outputs: derivatives of the scaled values, index 0..v
+//
+//  Returns the error code from bessjyv (non-zero for z < 0 or a negative
+//  order) without touching the arrays further; returns 0 otherwise.
+//  NOTE(review): z == 0 makes the scale factor divide by zero, and entries
+//  above vm are not written by bessjyv when it lowers the order -- confirm
+//  that callers avoid both cases.
+template<typename P>
+int bessjyv_sph(int v, P z, P &vm, P* cjv,
+    P* cyv, P* cjvp, P* cyvp)
+{
+    //first, compute the bessel functions of fractional order
+    int err = bessjyv<P>(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
+
+    //on invalid input bessjyv writes nothing, so scaling would operate on unset data
+    if(err != 0) return err;
+
+    //iterate through each and scale
+    for(int n = 0; n<=v; n++)
+    {
+
+        cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
+        cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
+
+        //product rule: cjv[n]/cyv[n] already hold the scaled values at this point
+        cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0));
+        cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0));
+    }
+
+    return 0;
+
+}
+
+//  Complex-argument counterpart of bessjy01a: computes the order-0 and
+//  order-1 functions of the first kind (cj0, cj1) and second kind (cy0, cy1)
+//  at z, together with their derivatives (cj0p, cj1p, cy0p, cy1p).
+//
+//  For |z| <= 12 power series are summed (at most 40 terms, stopping once a
+//  term is below eps relative to the running sum); for larger |z| expansions
+//  in inverse powers are used with kz terms of the tables a, b, a1 and b1.
+//  When real(z) < 0 the evaluation uses z1 = -z and the results are adjusted
+//  afterwards. Always returns 0.
+template<typename P>
+int cbessjy01(complex<P> z,complex<P> &cj0,complex<P> &cj1,
+    complex<P> &cy0,complex<P> &cy1,complex<P> &cj0p,
+    complex<P> &cj1p,complex<P> &cy0p,complex<P> &cy1p)
+{
+    complex<P> z1,z2,cr,cp,cs,cp0,cq0,cp1,cq1,ct1,ct2,cu;
+    P a0,w0,w1;
+    int k,kz;
+
+    // coefficients of z1^(-2k-2) in cp0 (large-|z| branch, order 0)
+    static P a[] = {
+        -7.03125e-2,
+         0.112152099609375,
+        -0.5725014209747314,
+         6.074042001273483,
+        -1.100171402692467e2,
+         3.038090510922384e3,
+        -1.188384262567832e5,
+         6.252951493434797e6,
+        -4.259392165047669e8,
+         3.646840080706556e10,
+        -3.833534661393944e12,
+         4.854014686852901e14,
+        -7.286857349377656e16,
+         1.279721941975975e19};
+    // coefficients of z1^(-2k-3) in cq0 (large-|z| branch, order 0)
+    static P b[] = {
+         7.32421875e-2,
+        -0.2271080017089844,
+         1.727727502584457,
+        -2.438052969955606e1,
+         5.513358961220206e2,
+        -1.825775547429318e4,
+         8.328593040162893e5,
+        -5.006958953198893e7,
+         3.836255180230433e9,
+        -3.649010818849833e11,
+         4.218971570284096e13,
+        -5.827244631566907e15,
+         9.476288099260110e17,
+        -1.792162323051699e20};
+    // coefficients of z1^(-2k-2) in cp1 (large-|z| branch, order 1)
+    static P a1[] = {
+         0.1171875,
+        -0.1441955566406250,
+         0.6765925884246826,
+        -6.883914268109947,
+         1.215978918765359e2,
+        -3.302272294480852e3,
+         1.276412726461746e5,
+        -6.656367718817688e6,
+         4.502786003050393e8,
+        -3.833857520742790e10,
+         4.011838599133198e12,
+        -5.060568503314727e14,
+         7.572616461117958e16,
+        -1.326257285320556e19};
+    // coefficients of z1^(-2k-3) in cq1 (large-|z| branch, order 1)
+    static P b1[] = {
+        -0.1025390625,
+         0.2775764465332031,
+        -1.993531733751297,
+         2.724882731126854e1,
+        -6.038440767050702e2,
+         1.971837591223663e4,
+        -8.902978767070678e5,
+         5.310411010968522e7,
+        -4.043620325107754e9,
+         3.827011346598605e11,
+        -4.406481417852278e13,
+         6.065091351222699e15,
+        -9.833883876590679e17,
+         1.855045211579828e20};
+
+    a0 = abs(z);
+    z2 = z*z;
+    z1 = z;
+    // z == 0: fixed values, with +/-1e308 used as sentinels for the second kind
+    if (a0 == 0.0) {
+        cj0 = cone;
+        cj1 = czero;
+        cy0 = complex<P>(-1e308,0);
+        cy1 = complex<P>(-1e308,0);
+        cj0p = czero;
+        cj1p = complex<P>(0.5,0.0);
+        cy0p = complex<P>(1e308,0);
+        cy1p = complex<P>(1e308,0);
+        return 0;
+    }
+    // evaluate with the sign-flipped argument when z has a negative real part
+    if (real(z) < 0.0) z1 = -z;
+    if (a0 <= 12.0) {
+        // series for cj0
+        cj0 = cone;
+        cr = cone;
+        for (k=1;k<=40;k++) {
+            cr *= -0.25*z2/(P)(k*k);
+            cj0 += cr;
+            if (abs(cr) < abs(cj0)*eps) break;
+        }
+        // series for cj1 (scaled by 0.5*z1 after the loop)
+        cj1 = cone;
+        cr = cone;
+        for (k=1;k<=40;k++) {
+            cr *= -0.25*z2/(k*(k+1.0));
+            cj1 += cr;
+            if (abs(cr) < abs(cj1)*eps) break;
+        }
+        cj1 *= 0.5*z1;
+        // series for cy0; w0 accumulates the partial sums 1 + 1/2 + ... + 1/k
+        w0 = 0.0;
+        cr = cone;
+        cs = czero;
+        for (k=1;k<=40;k++) {
+            w0 += 1.0/k;
+            cr *= -0.25*z2/(P)(k*k);
+            cp = cr*w0;
+            cs += cp;
+            if (abs(cp) < abs(cs)*eps) break;
+        }
+        cy0 = M_2_PI*((log(0.5*z1)+el)*cj0-cs);
+        // series for cy1
+        w1 = 0.0;
+        cr = cone;
+        cs = cone;
+        for (k=1;k<=40;k++) {
+            w1 += 1.0/k;
+            cr *= -0.25*z2/(k*(k+1.0));
+            cp = cr*(2.0*w1+1.0/(k+1.0));
+            cs += cp;
+            if (abs(cp) < abs(cs)*eps) break;
+        }
+        cy1 = M_2_PI*((log(0.5*z1)+el)*cj1-1.0/z1-0.25*z1*cs);
+    }
+    else {
+        // number of table terms used; fewer terms are taken for larger |z|
+        if (a0 >= 50.0) kz = 8;         // can be changed to 10
+        else if (a0 >= 35.0) kz = 10;   //   "      "     "  12
+        else kz = 12;                   //   "      "     "  14
+        // order 0: phase ct1 = z1 - pi/4
+        ct1 = z1 - M_PI_4;
+        cp0 = cone;
+        for (k=0;k<kz;k++) {
+            cp0 += a[k]*pow(z1,-2.0*k-2.0);
+        }
+        cq0 = -0.125/z1;
+        for (k=0;k<kz;k++) {
+            cq0 += b[k]*pow(z1,-2.0*k-3.0);
+        }
+        cu = sqrt(M_2_PI/z1);
+        cj0 = cu*(cp0*cos(ct1)-cq0*sin(ct1));
+        cy0 = cu*(cp0*sin(ct1)+cq0*cos(ct1));
+        // order 1: phase ct2 = z1 - 3*pi/4
+        ct2 = z1 - 0.75*M_PI;
+        cp1 = cone;
+        for (k=0;k<kz;k++) {
+            cp1 += a1[k]*pow(z1,-2.0*k-2.0);
+        }
+        cq1 = 0.375/z1;
+        for (k=0;k<kz;k++) {
+            cq1 += b1[k]*pow(z1,-2.0*k-3.0);
+        }
+        cj1 = cu*(cp1*cos(ct2)-cq1*sin(ct2));
+        cy1 = cu*(cp1*sin(ct2)+cq1*cos(ct2));
+    }
+    // map the results computed at z1 = -z back to z; the correction applied to
+    // the second kind depends on the sign of imag(z)
+    if (real(z) < 0.0) {
+        if (imag(z) < 0.0) {
+            cy0 -= 2.0*cii*cj0;
+            cy1 = -(cy1-2.0*cii*cj1);
+        }
+        else if (imag(z) > 0.0) {
+            cy0 += 2.0*cii*cj0;
+            cy1 = -(cy1+2.0*cii*cj1);
+        }
+        cj1 = -cj1;
+    }
+    // derivatives follow from the order-0 and order-1 values
+    cj0p = -cj1;
+    cj1p = cj0-cj1/z;
+    cy0p = -cy1;
+    cy1p = cy0-cy1/z;
+    return 0;
+}
+
+//  Complex-argument Bessel functions of the first and second kind, and their
+//  derivatives, for all integer orders 0..n.
+//
+//      n    -- highest order requested
+//      z    -- complex argument
+//      nm   -- output: highest order actually computed (nm <= n)
+//      cj, cy   -- outputs: first/second-kind values, index = order
+//      cjp, cyp -- outputs: derivatives, index = order
+//
+//  Returns 1 when n < 0 (nothing is written), 0 otherwise.
+//  Orders 0 and 1 come from cbessjy01, which always writes index 1, so every
+//  array needs at least two elements.
+//  NOTE(review): the stabilization loop below can write cj[lb+1] and cy[lb+1]
+//  with lb == nm, i.e. one element past index nm -- confirm that callers
+//  allocate at least n+2 elements.
+template<typename P>
+int cbessjyna(int n,complex<P> z,int &nm,complex<P> *cj,
+    complex<P> *cy,complex<P> *cjp,complex<P> *cyp)
+{
+    complex<P> cbj0,cbj1,cby0,cby1,cj0,cjk,cj1,cf,cf1,cf2;
+    complex<P> cs,cg0,cg1,cyk,cyl1,cyl2,cylk,cp11,cp12,cp21,cp22;
+    complex<P> ch0,ch1,ch2;
+    P a0,yak,ya1,ya0,wa;
+    int m,k,lb,lb0;
+
+    if (n < 0) return 1;
+    a0 = abs(z);
+    nm = n;
+    // z is effectively zero: fixed values, with +/-1e308 used as sentinels
+    if (a0 < 1.0e-100) {
+        for (k=0;k<=n;k++) {
+            cj[k] = czero;
+            cy[k] = complex<P> (-1e308,0);
+            cjp[k] = czero;
+            cyp[k] = complex<P>(1e308,0);
+        }
+        cj[0] = cone;
+        cjp[1] = complex<P>(0.5,0.0);
+        return 0;
+    }
+    // orders 0 and 1
+    cbessjy01(z,cj[0],cj[1],cy[0],cy[1],cjp[0],cjp[1],cyp[0],cyp[1]);
+    cbj0 = cj[0];
+    cbj1 = cj[1];
+    cby0 = cy[0];
+    cby1 = cy[1];
+    if (n <= 1) return 0;
+    // The cast must apply to the product: "(int)0.25*a0" evaluates to 0*a0,
+    // which made this branch unreachable for n >= 2.
+    if (n < (int)(0.25*a0)) {
+        // forward recurrence from orders 0 and 1
+        cj0 = cbj0;
+        cj1 = cbj1;
+        for (k=2;k<=n;k++) {
+            cjk = 2.0*(k-1.0)*cj1/z-cj0;
+            cj[k] = cjk;
+            cj0 = cj1;
+            cj1 = cjk;
+        }
+    }
+    else {
+        // backward recurrence from starting order m; nm is lowered when m < n
+        m = msta1(a0,200);
+        if (m < n) nm = m;
+        else m = msta2(a0,n,15);
+        cf2 = czero;
+        cf1 = complex<P> (1.0e-100,0.0);
+        for (k=m;k>=0;k--) {
+            cf = 2.0*(k+1.0)*cf1/z-cf2;
+            if (k <=nm) cj[k] = cf;
+            cf2 = cf1;
+            cf1 = cf;
+        }
+        // rescale against whichever of orders 0/1 has the larger magnitude
+        if (abs(cbj0) > abs(cbj1)) cs = cbj0/cf;
+        else cs = cbj1/cf2;
+        for (k=0;k<=nm;k++) {
+            cj[k] *= cs;
+        }
+    }
+    // derivatives of the first kind for orders >= 2
+    for (k=2;k<=nm;k++) {
+        cjp[k] = cj[k-1]-(P)k*cj[k]/z;
+    }
+    // second kind: forward recurrence; lb records the last order whose
+    // magnitude fell below both |cy[0]| and the magnitude two orders earlier
+    ya0 = abs(cby0);
+    lb = 0;
+    cg0 = cby0;
+    cg1 = cby1;
+    for (k=2;k<=nm;k++) {
+        cyk = 2.0*(k-1.0)*cg1/z-cg0;
+        yak = abs(cyk);
+        ya1 = abs(cg0);
+        if ((yak < ya0) && (yak < ya1)) lb = k;
+        cy[k] = cyk;
+        cg0 = cg1;
+        cg1 = cyk;
+    }
+    // for non-real z with such a dip beyond order 4, recompute cy around order
+    // lb and repeat until lb stops changing
+    lb0 = 0;
+    if ((lb > 4) && (imag(z) != 0.0)) {
+        while (lb != lb0) {
+            // two auxiliary backward recurrences from order lb with different seeds
+            ch2 = cone;
+            ch1 = czero;
+            lb0 = lb;
+            for (k=lb;k>=1;k--) {
+                ch0 = 2.0*k*ch1/z-ch2;
+                ch2 = ch1;
+                ch1 = ch0;
+            }
+            cp12 = ch0;
+            cp22 = ch2;
+            ch2 = czero;
+            ch1 = cone;
+            for (k=lb;k>=1;k--) {
+                ch0 = 2.0*k*ch1/z-ch2;
+                ch2 = ch1;
+                ch1 = ch0;
+            }
+            cp11 = ch0;
+            cp21 = ch2;
+            // extend cj by one order when lb is the last computed order
+            if (lb == nm)
+                cj[lb+1] = 2.0*lb*cj[lb]/z-cj[lb-1];
+            // rebuild cy[lb] and cy[lb+1] from cj and the auxiliary values,
+            // dividing by whichever of cj[0]/cj[1] has the larger magnitude
+            if (abs(cj[0]) > abs(cj[1])) {
+                cy[lb+1] = (cj[lb+1]*cby0-2.0*cp11/(M_PI*z))/cj[0];
+                cy[lb] = (cj[lb]*cby0+2.0*cp12/(M_PI*z))/cj[0];
+            }
+            else {
+                cy[lb+1] = (cj[lb+1]*cby1-2.0*cp21/(M_PI*z))/cj[1];
+                cy[lb] = (cj[lb]*cby1+2.0*cp22/(M_PI*z))/cj[1];
+            }
+            // downward recurrence from lb to order 0
+            cyl2 = cy[lb+1];
+            cyl1 = cy[lb];
+            for (k=lb-1;k>=0;k--) {
+                cylk = 2.0*(k+1.0)*cyl1/z-cyl2;
+                cy[k] = cylk;
+                cyl2 = cyl1;
+                cyl1 = cylk;
+            }
+            // upward recurrence from lb+1 to order n
+            cyl1 = cy[lb];
+            cyl2 = cy[lb+1];
+            for (k=lb+1;k<n;k++) {
+                cylk = 2.0*k*cyl2/z-cyl1;
+                cy[k+1] = cylk;
+                cyl1 = cyl2;
+                cyl2 = cylk;
+            }
+            // new lb: the last order whose magnitude is below its predecessor's
+            for (k=2;k<=nm;k++) {
+                wa = abs(cy[k]);
+                if (wa < abs(cy[k-1])) lb = k;
+            }
+        }
+    }
+    // derivatives of the second kind for orders >= 2
+    for (k=2;k<=nm;k++) {
+        cyp[k] = cy[k-1]-(P)k*cy[k]/z;
+    }
+    return 0;
+}
+
+template<typename P>  /* cbessjynb: fills cj[k], cy[k] and the derivative arrays cjp[k], cyp[k]
+ *  for the integer orders k = 0..nm at the complex argument z.
+ *  (The names suggest Bessel functions of the first/second kind, J_k and Y_k -- confirm with callers.)
+ *
+ *    n    highest order requested
+ *    z    complex argument
+ *    nm   output: highest order actually stored; starts as n, is raised to 1 when n == 0 in the
+ *         recurrence branch and lowered to msta1(a0,200) when that value is smaller
+ *    cj, cy, cjp, cyp   caller-provided output arrays; indices 0..nm are written (index 1 is
+ *         always written, even for n == 0)
+ *
+ *  Always returns 0. Uses czero, cone, el, msta1 and msta2, which are defined elsewhere in this file.
+ */
+int cbessjynb(int n,complex<P> z,int &nm,complex<P> *cj,
+    complex<P> *cy,complex<P> *cjp,complex<P> *cyp)
+{
+    complex<P> cf,cf0,cf1,cf2,cbs,csu,csv,cs0,ce;
+    complex<P> ct1,cp0,cq0,cp1,cq1,cu,cbj0,cby0,cbj1,cby1;
+    complex<P> cyy,cbjk,ct2;
+    P a0,y0;
+    int k,m;
+    static P a[] = {  // coefficients of z^(-2k-2) in the cp0 series of the large-|z| branch
+        -0.7031250000000000e-1,
+         0.1121520996093750,
+        -0.5725014209747314,
+         6.074042001273483};
+    static P b[] = {  // coefficients of z^(-2k-3) in the cq0 series
+         0.7324218750000000e-1,
+        -0.2271080017089844,
+         1.727727502584457,
+        -2.438052969955606e1};
+    static P a1[] = {  // coefficients of z^(-2k-2) in the cp1 series
+         0.1171875,
+        -0.1441955566406250,
+         0.6765925884246826,
+        -6.883914268109947};
+    static P b1[] = {  // coefficients of z^(-2k-3) in the cq1 series
+       -0.1025390625,
+        0.2775764465332031,
+       -1.993531733751297,
+        2.724882731126854e1};
+
+    y0 = abs(imag(z));  // |Im z|: selects the normalisation sum used below
+    a0 = abs(z);  // |z|: selects the evaluation branch
+    nm = n;
+    if (a0 < 1.0e-100) {  // argument treated as zero: store limiting values and return early
+        for (k=0;k<=n;k++) {
+            cj[k] = czero;
+            cy[k] = complex<P> (-1e308,0);  // large finite stand-in for the singular value
+            cjp[k] = czero;
+            cyp[k] = complex<P>(1e308,0);
+        }
+        cj[0] = cone;
+        cjp[1] = complex<P>(0.5,0.0);  // written even when n == 0, so the arrays need at least 2 slots
+        return 0;
+    }
+    if ((a0 <= 300.0) || (n > (int)(0.25*a0))) {  // backward-recurrence branch
+        if (n == 0) nm = 1;  // cj[1] and cy[1] are read below even if only order 0 was asked for
+        m = msta1(a0,200);  // starting index of the backward recurrence
+        if (m < nm) nm = m;  // fewer orders are returned than requested
+        else m = msta2(a0,nm,15);
+        cbs = czero;
+        csu = czero;
+        csv = czero;
+        cf2 = czero;
+        cf1 = complex<P> (1.0e-100,0.0);  // tiny seed; the result is normalised by cs0 afterwards
+        for (k=m;k>=0;k--) {
+            cf = 2.0*(k+1.0)*cf1/z-cf2;
+            if (k <= nm) cj[k] = cf;  // keep only the orders that are returned
+            if (((k & 1) == 0) && (k != 0)) {  // even k > 0: accumulate cbs and csu
+                if (y0 <= 1.0) {
+                    cbs += 2.0*cf;
+                }
+                else {
+                    cbs += (-1)*((k & 2)-1)*2.0*cf;  // (-1)*((k & 2)-1) is +1 when k%4 == 0, -1 when k%4 == 2
+                }
+                csu += (P)((-1)*((k & 2)-1))*cf/(P)k;
+            }
+            else if (k > 1) {  // odd k >= 3: accumulate csv
+                csv += (P)((-1)*((k & 2)-1)*k)*cf/(P)(k*k-1.0);
+            }
+            cf2 = cf1;
+            cf1 = cf;
+        }
+        if (y0 <= 1.0) cs0 = cbs+cf;  // cf holds the k == 0 term after the loop
+        else cs0 = (cbs+cf)/cos(z);
+        for (k=0;k<=nm;k++) {
+            cj[k] /= cs0;  // scale the unnormalised recurrence values
+        }
+        ce = log(0.5*z)+el;
+        cy[0] = M_2_PI*(ce*cj[0]-4.0*csu/cs0);
+        cy[1] = M_2_PI*(-cj[0]/z+(ce-1.0)*cj[1]-4.0*csv/cs0);
+    }
+    else {  // large |z| with comparatively small n: series in inverse powers of z, then forward recurrence
+        ct1 = z-M_PI_4;
+        cp0 = cone;
+        for (k=0;k<4;k++) {
+            cp0 += a[k]*pow(z,-2.0*k-2.0);
+        }
+        cq0 = -0.125/z;
+        for (k=0;k<4;k++) {
+            cq0 += b[k] *pow(z,-2.0*k-3.0);
+        }
+        cu = sqrt(M_2_PI/z);
+        cbj0 = cu*(cp0*cos(ct1)-cq0*sin(ct1));
+        cby0 = cu*(cp0*sin(ct1)+cq0*cos(ct1));
+        cj[0] = cbj0;
+        cy[0] = cby0;
+        ct2 = z-0.75*M_PI;
+        cp1 = cone;
+        for (k=0;k<4;k++) {
+            cp1 += a1[k]*pow(z,-2.0*k-2.0);
+        }
+        cq1 = 0.375/z;
+        for (k=0;k<4;k++) {
+            cq1 += b1[k]*pow(z,-2.0*k-3.0);
+        }
+        cbj1 = cu*(cp1*cos(ct2)-cq1*sin(ct2));
+        cby1 = cu*(cp1*sin(ct2)+cq1*cos(ct2));
+        cj[1] = cbj1;
+        cy[1] = cby1;
+        for (k=2;k<=n;k++) {  // forward recurrence for cj; nm == n in this branch
+            cbjk = 2.0*(k-1.0)*cbj1/z-cbj0;
+            cj[k] = cbjk;
+            cbj0 = cbj1;
+            cbj1 = cbjk;
+        }
+    }
+    cjp[0] = -cj[1];
+    for (k=1;k<=nm;k++) {
+        cjp[k] = cj[k-1]-(P)k*cj[k]/z;  // derivative from the two neighbouring orders
+    }
+    if (abs(cj[0]) > 1.0)  // recompute cy[1] from cj[0], cj[1] and cy[0] when |cj[0]| exceeds 1
+        cy[1] = (cj[1]*cy[0]-2.0/(M_PI*z))/cj[0];
+    for (k=2;k<=nm;k++) {  // build cy[k] from cj, dividing by whichever of cj[k-1], cj[k-2] is larger
+        if (abs(cj[k-1]) >= abs(cj[k-2]))
+            cyy = (cj[k]*cy[k-1]-2.0/(M_PI*z))/cj[k-1];
+        else
+            cyy = (cj[k]*cy[k-2]-4.0*(k-1.0)/(M_PI*z*z))/cj[k-2];
+        cy[k] = cyy;
+    }
+    cyp[0] = -cy[1];
+    for (k=1;k<=nm;k++) {
+        cyp[k] = cy[k-1]-(P)k*cy[k]/z;  // same derivative relation as for cjp
+    }
+
+    return 0;
+}
+
+// cbessjyva: evaluates cjv[k], cyv[k] and the derivative arrays cjvp[k], cyvp[k] for the
+// orders (k + v0), k = 0..n, at the complex argument z, where n = (int)v and v0 = v - n.
+// (The names and recurrences suggest Bessel functions of the first and second kind of
+// fractional order -- confirm against the callers.)
+//
+//   v     highest order requested
+//   z     complex argument
+//   vm    output: highest order actually stored (n + v0; n is lowered when msta1 limits it)
+//   cjv, cyv, cjvp, cyvp
+//         caller-provided output arrays. Indices 0..n are written, index 1 is always
+//         written, and cjv[n+1] / cyv[n+1] can be written by the refinement loop at the
+//         end, so the arrays need room for at least max(n, 1) + 2 entries.
+//
+// Always returns 0. Relies on czero, cone, cii, el, eps, gamma, msta1 and msta2, which are
+// defined elsewhere in this file.
+template<typename P>
+int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
+    complex<P>*cyv,complex<P>*cjvp,complex<P>*cyvp)
+{
+    complex<P> z1,z2,zk,cjvl,cr,ca,cjv0,cjv1,cpz,crp;
+    complex<P> cqz,crq,ca0,cck,csk,cyv0,cyv1,cju0,cju1,cb;
+    complex<P> cs,cs0,cr0,cs1,cr1,cec,cf,cf0,cf1,cf2;
+    complex<P> cfac0,cfac1,cg0,cg1,cyk,cp11,cp12,cp21,cp22;
+    complex<P> ch0,ch1,ch2,cyl1,cyl2,cylk;
+
+    P a0,v0,pv0,pv1,vl,ga,gb,vg,vv,w0,w1,ya0,yak,ya1,wa;
+    int j,n,k,kz,l,lb,lb0,m;
+
+    a0 = abs(z);
+    z1 = z;
+    z2 = z*z;
+    n = (int)v;         // integer part of the order
+
+
+    v0 = v-n;           // fractional part of the order
+
+    pv0 = M_PI*v0;
+    pv1 = M_PI*(1.0+v0);
+    if (a0 < 1.0e-100) {
+        // argument treated as zero: store limiting values and return early
+        for (k=0;k<=n;k++) {
+            cjv[k] = czero;
+            cyv[k] = complex<P> (-1e308,0);
+            cjvp[k] = czero;
+            cyvp[k] = complex<P> (1e308,0);
+
+        }
+        if (v0 == 0.0) {
+            cjv[0] = cone;
+            cjvp[1] = complex<P> (0.5,0.0);
+        }
+        else {
+            cjvp[0] = complex<P> (1e308,0);
+        }
+        vm = v;
+        return 0;
+    }
+    // the two starting orders are evaluated at z1, which lies in the right half plane;
+    // the values are mapped back to z further below when real(z) < 0
+    if (real(z1) < 0.0) z1 = -z;
+    if (a0 <= 12.0) {
+        // small |z|: power series for the orders v0 (l == 0) and v0 + 1 (l == 1)
+        for (l=0;l<2;l++) {
+            vl = v0+l;
+            cjvl = cone;
+            cr = cone;
+            for (k=1;k<=40;k++) {
+                cr *= -0.25*z2/(k*(k+vl));
+                cjvl += cr;
+                if (abs(cr) < abs(cjvl)*eps) break;     // term no longer contributes
+            }
+            vg = 1.0 + vl;
+            ga = gamma(vg);
+            ca = pow(0.5*z1,vl)/ga;
+            if (l == 0) cjv0 = cjvl*ca;
+            else cjv1 = cjvl*ca;
+        }
+    }
+    else {
+        // large |z|: series in inverse powers of z; fewer terms (kz) are used as |z| grows
+        if (a0 >= 50.0) kz = 8;
+        else if (a0 >= 35.0) kz = 10;
+        else kz = 11;
+        for (j=0;j<2;j++) {
+            vv = 4.0*(j+v0)*(j+v0);
+            cpz = cone;
+            crp = cone;
+            for (k=1;k<=kz;k++) {
+                crp = -0.78125e-2*crp*(vv-pow(4.0*k-3.0,2.0))*
+                    (vv-pow(4.0*k-1.0,2.0))/(k*(2.0*k-1.0)*z2);
+                cpz += crp;
+            }
+            cqz = cone;
+            crq = cone;
+            for (k=1;k<=kz;k++) {
+                crq = -0.78125e-2*crq*(vv-pow(4.0*k-1.0,2.0))*
+                    (vv-pow(4.0*k+1.0,2.0))/(k*(2.0*k+1.0)*z2);
+                cqz += crq;
+            }
+            cqz *= 0.125*(vv-1.0)/z1;
+            zk = z1-(0.5*(j+v0)+0.25)*M_PI;
+            ca0 = sqrt(M_2_PI/z1);
+            cck = cos(zk);
+            csk = sin(zk);
+            if (j == 0) {
+                cjv0 = ca0*(cpz*cck-cqz*csk);
+                // FIX: this read cpz*csk+cqz+cck; the second term is the product cqz*cck,
+                // exactly as in the j == 1 branch below
+                cyv0 = ca0*(cpz*csk+cqz*cck);
+            }
+            else {
+                cjv1 = ca0*(cpz*cck-cqz*csk);
+                cyv1 = ca0*(cpz*csk+cqz*cck);
+            }
+        }
+    }
+    if (a0 <= 12.0) {
+        // small |z|: the cyv starting values still have to be computed
+        if (v0 != 0.0) {
+            // non-integer order: series with -vl in place of vl, then combine with cjv0/cjv1
+            for (l=0;l<2;l++) {
+                vl = v0+l;
+                cjvl = cone;
+                cr = cone;
+                for (k=1;k<=40;k++) {
+                    cr *= -0.25*z2/(k*(k-vl));
+                    cjvl += cr;
+                    if (abs(cr) < abs(cjvl)*eps) break;
+                }
+                vg = 1.0-vl;
+                gb = gamma(vg);
+                cb = pow(2.0/z1,vl)/gb;
+                if (l == 0) cju0 = cjvl*cb;
+                else cju1 = cjvl*cb;
+            }
+            cyv0 = (cjv0*cos(pv0)-cju0)/sin(pv0);
+            cyv1 = (cjv1*cos(pv1)-cju1)/sin(pv1);
+        }
+        else {
+            // integer order (v0 == 0): logarithmic series
+            cec = log(0.5*z1)+el;
+            cs0 = czero;
+            w0 = 0.0;
+            cr0 = cone;
+            for (k=1;k<=30;k++) {
+                w0 += 1.0/k;
+                cr0 *= -0.25*z2/(P)(k*k);
+                cs0 += cr0*w0;
+            }
+            cyv0 = M_2_PI*(cec*cjv0-cs0);
+            cs1 = cone;
+            w1 = 0.0;
+            cr1 = cone;
+            for (k=1;k<=30;k++) {
+                w1 += 1.0/k;
+                cr1 *= -0.25*z2/(k*(k+1.0));
+                cs1 += cr1*(2.0*w1+1.0/(k+1.0));
+            }
+            cyv1 = M_2_PI*(cec*cjv1-1.0/z1-0.25*z1*cs1);
+        }
+    }
+    if (real(z) < 0.0) {
+        // the starting values were computed at z1 = -z; map them back to z.
+        // Nothing is applied when imag(z) == 0.
+        cfac0 = exp(pv0*cii);
+        cfac1 = exp(pv1*cii);
+        if (imag(z) < 0.0) {
+            cyv0 = cfac0*cyv0-(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
+            cyv1 = cfac1*cyv1-(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
+            cjv0 /= cfac0;
+            cjv1 /= cfac1;
+        }
+        else if (imag(z) > 0.0) {
+            cyv0 = cyv0/cfac0+(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
+            cyv1 = cyv1/cfac1+(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
+            cjv0 *= cfac0;
+            cjv1 *= cfac1;
+        }
+    }
+    cjv[0] = cjv0;
+    cjv[1] = cjv1;
+    if ((n >= 2) && (n <= (int)(0.25*a0))) {
+        // order small compared with |z|: forward recurrence for cjv
+        cf0 = cjv0;
+        cf1 = cjv1;
+        for (k=2;k<= n;k++) {
+            cf = 2.0*(k+v0-1.0)*cf1/z-cf0;
+            cjv[k] = cf;
+            cf0 = cf1;
+            cf1 = cf;
+        }
+    }
+    else if (n >= 2) {
+        // otherwise: backward recurrence from index m, rescaled to match the starting values
+        m = msta1(a0,200);
+        if (m < n) n = m;                       // fewer orders are returned than requested
+        else  m = msta2(a0,n,15);
+        cf2 = czero;
+        cf1 = complex<P>(1.0e-100,0.0);         // tiny seed; scaled by cs afterwards
+        for (k=m;k>=0;k--) {
+            cf = 2.0*(v0+k+1.0)*cf1/z-cf2;
+            if (k <= n) cjv[k] = cf;
+            cf2 = cf1;
+            cf1 = cf;
+        }
+        // after the loop cf is the unscaled order-0 value and cf2 the unscaled order-1 value;
+        // normalise with whichever starting value is larger in magnitude
+        if (abs(cjv0) > abs(cjv1)) cs = cjv0/cf;
+        else cs = cjv1/cf2;
+        for (k=0;k<=n;k++) {
+            cjv[k] *= cs;
+        }
+    }
+    cjvp[0] = v0*cjv[0]/z-cjv[1];
+    for (k=1;k<=n;k++) {
+        cjvp[k] = -(k+v0)*cjv[k]/z+cjv[k-1];
+    }
+    cyv[0] = cyv0;
+    cyv[1] = cyv1;
+    ya0 = abs(cyv0);
+    lb = 0;
+    cg0 = cyv0;
+    cg1 = cyv1;
+    // forward recurrence for cyv; lb records the last index whose magnitude is below both
+    // |cyv0| and the value two steps back
+    for (k=2;k<=n;k++) {
+        cyk = 2.0*(v0+k-1.0)*cg1/z-cg0;
+        yak = abs(cyk);
+        ya1 = abs(cg0);
+        if ((yak < ya0) && (yak< ya1)) lb = k;
+        cyv[k] = cyk;
+        cg0 = cg1;
+        cg1 = cyk;
+    }
+    lb0 = 0;
+    if ((lb > 4) && (imag(z) != 0.0)) {
+        // refinement for complex z: recompute cyv[lb] and cyv[lb+1] from cjv and the
+        // recurrence products cp11..cp22, rebuild the array from there in both directions,
+        // and repeat until lb stops changing
+        while(lb != lb0) {
+            ch2 = cone;
+            ch1 = czero;
+            lb0 = lb;
+            for (k=lb;k>=1;k--) {
+                ch0 = 2.0*(k+v0)*ch1/z-ch2;
+                ch2 = ch1;
+                ch1 = ch0;
+            }
+            cp12 = ch0;
+            cp22 = ch2;
+            ch2 = czero;
+            ch1 = cone;
+            for (k=lb;k>=1;k--) {
+                ch0 = 2.0*(k+v0)*ch1/z-ch2;
+                ch2 = ch1;
+                ch1 = ch0;
+            }
+            cp11 = ch0;
+            cp21 = ch2;
+            if (lb == n)
+                cjv[lb+1] = 2.0*(lb+v0)*cjv[lb]/z-cjv[lb-1];   // one element past index n
+            if (abs(cjv[0]) > abs(cjv[1])) {
+                cyv[lb+1] = (cjv[lb+1]*cyv0-2.0*cp11/(M_PI*z))/cjv[0];
+                cyv[lb] = (cjv[lb]*cyv0+2.0*cp12/(M_PI*z))/cjv[0];
+            }
+            else {
+                cyv[lb+1] = (cjv[lb+1]*cyv1-2.0*cp21/(M_PI*z))/cjv[1];
+                cyv[lb] = (cjv[lb]*cyv1+2.0*cp22/(M_PI*z))/cjv[1];
+            }
+            cyl2 = cyv[lb+1];
+            cyl1 = cyv[lb];
+            for (k=lb-1;k>=0;k--) {             // downward from lb
+                cylk = 2.0*(k+v0+1.0)*cyl1/z-cyl2;
+                cyv[k] = cylk;
+                cyl2 = cyl1;
+                cyl1 = cylk;
+            }
+            cyl1 = cyv[lb];
+            cyl2 = cyv[lb+1];
+            for (k=lb+1;k<n;k++) {              // upward from lb + 1
+                cylk = 2.0*(k+v0)*cyl2/z-cyl1;
+                cyv[k+1] = cylk;
+                cyl1 = cyl2;
+                cyl2 = cylk;
+            }
+            for (k=2;k<=n;k++) {
+                wa = abs(cyv[k]);
+                if (wa < abs(cyv[k-1])) lb = k;
+            }
+        }
+    }
+    cyvp[0] = v0*cyv[0]/z-cyv[1];
+    for (k=1;k<=n;k++) {
+        cyvp[k] = cyv[k-1]-(k+v0)*cyv[k]/z;
+    }
+    vm = n+v0;
+    return 0;
+}
+
+// Spherical counterpart of cbessjyva: evaluates the order-(v + 1/2) functions with
+// cbessjyva and rescales orders 0..v in place by sqrt(PI / (2 z)).
+// The derivative arrays additionally receive the term -1/(2 z) times the already-scaled
+// function value. Arguments are forwarded unchanged to cbessjyva; always returns 0.
+template<typename P>
+int cbessjyva_sph(int v,complex<P> z,P &vm,complex<P>*cjv,
+    complex<P>*cyv,complex<P>*cjvp,complex<P>*cyvp)
+{
+    // half-integer-order values, written straight into the output arrays
+    cbessjyva<P>(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
+
+    // both factors depend only on z, so they are formed once
+    const complex<P> scale = sqrt(stim::PI/(z * 2.0));
+    const complex<P> neg_half_inv = -1.0 / (z * 2.0);
+
+    int order = 0;
+    while (order <= v)
+    {
+        // the function values must be scaled first: the derivative update reads them
+        cjv[order] = cjv[order] * scale;
+        cyv[order] = cyv[order] * scale;
+
+        cjvp[order] = neg_half_inv * cjv[order] + cjvp[order] * scale;
+        cyvp[order] = neg_half_inv * cyv[order] + cyvp[order] * scale;
+
+        ++order;
+    }
+
+    return 0;
+}
+
+}	//end namespace rts
+
+
+#endif
@@ -13,7 +13,7 @@ namespace stim
 	///@param dimz is the size of in* in the z direction.
         ///@param stdx is the standard deviation (in pixels) along the x axis.
         ///@param stdy is the standard deviation (in pixels) along the y axis.
-        ///@param nstds specifies the number of standard deviations of the Gaussian that will be k    ept in the kernel.
+        ///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel.
 	template<typename T, typename K>
 	void cpu_gauss3(T* in, K dimx, K dimy, K dimz, K stdx, K stdy, K stdz, size_t nstds = 3)
 	{
@@ -37,6 +37,20 @@ struct matrix
 		return *this;
 	}
+	//fill this matrix from the packed values of a symmetric matrix: rhs lists the lower
+	//	triangle column by column (N values for column 0, N-1 for column 1, ...), and every
+	//	value is written to both (row, col) and its mirror position (col, row)
+	CUDA_CALLABLE void setsym(T rhs[(N*N+N)/2]){
+		size_t src = 0;									//position of the next packed value
+		for(size_t col = 0; col < N; col++){
+			for(size_t row = col; row < N; row++){		//from the diagonal down to the last row
+				M[row * N + col] = rhs[src];
+				M[col * N + row] = rhs[src];			//same element again when row == col
+				src++;
+			}
+		}
+	}
+
 	CUDA_CALLABLE T& operator()(int row, int col)
 	{
 		return M[col * N + row];
@@ -91,6 +105,14 @@ struct matrix
 		return ss.str();
 	}
+
+	//returns a matrix with ones on the main diagonal; all other entries are whatever the
+	//	scalar assignment "= 0" leaves in them
+	static matrix<T, N> identity() {
+		matrix<T, N> eye;
+		eye = 0;
+		size_t d = 0;
+		while (d < N) {
+			eye.M[d * N + d] = 1;				//diagonal element (d, d)
+			d++;
+		}
+		return eye;
+	}
 };
 }	//end namespace rts
+#ifndef STIM_MATRIX_SYM_H
+#define STIM_MATRIX_SYM_H
+
+#include <stim/cuda/cudatools/callable.h>
+#include <stim/math/matrix.h>
+
+/* This class represents a symmetric D x D matrix (a symmetric rank-2 tensor in D dimensions),
+suitable for representing tensor fields such as structure and diffusion tensors.
+*/
+namespace stim{
+
+template <typename T, int D>
+class matrix_sym{
+
+protected:
+	//values are stored in column-major order as a lower-triangular matrix
+	//	(D values for column 0, D-1 for column 1, ..., 1 for column D-1)
+	T M[D*(D + 1)/2];
+
+	//returns the position in M of element (r, c); (r, c) and (c, r) map to the same position
+	//	marked CUDA_CALLABLE because the CUDA_CALLABLE members below call it
+	CUDA_CALLABLE static size_t idx(size_t r, size_t c) {
+		//if the index is in the upper-triangular portion, swap the indices
+		if(r < c){
+			size_t t = r;
+			r = c;
+			c = t;
+		}
+
+		size_t ci = (c + 1) * (D + (D - c))/2 - 1;		//index to the end of column c
+		size_t i = ci - (D - r - 1);
+		return i;
+	}
+
+	//calculate the row and column (lower triangle, r >= c) given a position in M
+	//	inverse of idx(); walks the column starts with integer arithmetic only
+	//	(the earlier closed form used -1/2 and 1/2, which are integer divisions equal to 0)
+	CUDA_CALLABLE static void indices(size_t& r, size_t& c, size_t idx) {
+		size_t col = 0;
+		size_t start = 0;								//position of the diagonal element of column col
+		while (col + 1 < (size_t)D && idx >= start + ((size_t)D - col)) {
+			start += (size_t)D - col;					//column col holds D - col values
+			col++;
+		}
+		c = col;
+		r = col + (idx - start);						//each column starts at its diagonal element
+	}
+
+public:
+	//return the symmetric matrix associated with this tensor
+	stim::matrix<T, D> mat() {
+		stim::matrix<T, D> r;
+		r.setsym(M);
+		return r;
+	}
+
+	//access element (r, c); both triangles refer to the same stored value
+	CUDA_CALLABLE T& operator()(int r, int c) {
+		return M[idx(r, c)];
+	}
+
+	//assign the scalar rhs to every stored value
+	CUDA_CALLABLE matrix_sym<T, D> operator=(T rhs) {
+		int Nsq = D*(D+1)/2;
+		for(int i=0; i<Nsq; i++)
+			M[i] = rhs;
+
+		return *this;
+	}
+
+	//copy every stored value from rhs
+	CUDA_CALLABLE matrix_sym<T, D> operator=(matrix_sym<T, D> rhs) {
+		size_t N = D * (D + 1) / 2;
+		for (size_t i = 0; i < N; i++) M[i] = rhs.M[i];
+		return *this;
+	}
+
+	//sum of the diagonal values
+	CUDA_CALLABLE T trace() {
+		T tr = 0;
+		for (size_t i = 0; i < D; i++)		//for each diagonal value
+			tr += M[idx(i, i)];				//add the value on the diagonal
+		return tr;
+	}
+
+	//multiply every stored value of B by the scalar rhs, in place
+	//	note that B is modified, not the object the method is called on
+	CUDA_CALLABLE void operator_product(matrix_sym<T, D> &B, T rhs) {
+		int Nsq = D*(D+1)/2;
+		for(int i=0; i<Nsq; i++)
+			B.M[i] *= rhs;
+	}
+
+	//return the tensor as a string, one "| ... |" line per row of the full D x D matrix
+	std::string str() {
+		std::stringstream ss;
+		for(int r = 0; r < D; r++){
+			ss << "| ";
+			for(int c=0; c<D; c++)
+			{
+				ss << (*this)(r, c) << " ";
+			}
+			ss << "|" << std::endl;
+		}
+
+		return ss.str();
+	}
+
+	//returns an identity matrix
+	static matrix_sym<T, D> identity() {
+		matrix_sym<T, D> I;
+		I = 0;
+		for (size_t i = 0; i < D; i++)
+			I.M[matrix_sym<T, D>::idx(i, i)] = 1;
+		return I;
+	}
+};
+
+
+
+}	//end namespace stim
+
+
+#endif
+#ifndef STIM_TENSOR2_H
+#define STIM_TENSOR2_H
+
+#include "matrix_sym.h"
+
+namespace stim {
+
+/*This class represents a symmetric rank-2 2D tensor, useful for structure tensors
+*/
+template<typename T>
+class tensor2 : public matrix_sym<T, 2> {
+
+protected:
+
+public:
+
+	//calculate the eigenvectors and eigenvalues of the tensor
+	//	v      receives one eigenvector per column, in the same order as the eigenvalues;
+	//	       the columns are unit vectors only when the tensor is diagonal
+	//	lambda receives the eigenvalues on its diagonal (larger one first), zeros elsewhere
+	CUDA_CALLABLE void eig(stim::matrix<T, 2>& v, stim::matrix<T, 2>& lambda) {
+
+		//the tensor | a b ; b c | is stored as M[0] = a, M[1] = b, M[2] = c
+		//	M lives in a dependent base class, so it has to be reached through this->
+		const T a = this->M[0];
+		const T b = this->M[1];
+		const T c = this->M[2];
+
+		lambda = 0;							//initialize the eigenvalue matrix to zero
+
+		T t = a + c;						//calculate the trace of the tensor
+		T d = a * c - b * b;				//calculate the determinant of the tensor
+
+		//t*t/4 - d equals ((a - c)/2)^2 + b^2, so a negative value can only be round-off
+		T disc = t * t / 4 - d;
+		if (disc < 0) disc = 0;
+		T root = sqrt(disc);
+
+		lambda(0, 0) = t / 2 + root;
+		lambda(1, 1) = t / 2 - root;
+
+		if (b == 0) {
+			//diagonal tensor: the eigenvectors are the coordinate axes
+			v = stim::matrix<T, 2>::identity();
+			if (a < c) {
+				//lambda(0, 0) is then c, whose eigenvector is the second axis: swap the columns
+				v(0, 0) = 0;	v(1, 0) = 1;
+				v(0, 1) = 1;	v(1, 1) = 0;
+			}
+		}
+		else {
+			//the second row of (A - lambda I) x = 0 gives x = (lambda - c, b)
+			//	FIX: the determinant d was subtracted here instead of the element c
+			v(0, 0) = lambda(0, 0) - c;
+			v(0, 1) = lambda(1, 1) - c;
+			v(1, 0) = v(1, 1) = b;
+		}
+	}
+
+	//copy the stored values from a symmetric 2x2 matrix
+	CUDA_CALLABLE tensor2<T> operator=(stim::matrix_sym<T, 2> rhs){
+		stim::matrix_sym<T, 2>::operator=(rhs);
+		return *this;
+	}
+};
+
+
+}	//end namespace stim
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_TENSOR3_H
+#define STIM_TENSOR3_H
+
+#include "matrix_sym.h"
+#include <stim/math/constants.h>
+
+namespace stim {
+
+	/*This class represents a symmetric rank-2 3D tensor, useful for structure tensors
+	*/
+
+	//Matrix ID cheat sheet
+	//	| 0  1  2 |
+	//	| 1  3  4 |
+	//	| 2  4  5 |
+	template<typename T>
+	class tensor3 : public matrix_sym<T, 3> {
+
+	protected:
+
+		//helper for eig(): forms the product P = A * B and stores the unit-length version of
+		//	the largest column of P in column "col" of V
+		//	when P is entirely zero (repeated eigenvalues) the column of V is set to zero
+		//	instead of dividing by zero
+		CUDA_CALLABLE static void eigcol(stim::matrix<T, 3>& A, stim::matrix<T, 3>& B, stim::matrix<T, 3>& V, int col) {
+			T P[9];												//product, column-major: P[3 * c + r]
+			for (int i = 0; i < 9; i++) {
+				P[i] = 0;
+				for (int j = 0; j < 3; j++)
+					P[i] += A(i % 3, j) * B(j, i / 3);
+			}
+
+			int best = 0;										//column of P with the largest squared length
+			T best_sq = 0;
+			for (int k = 0; k < 3; k++) {
+				T sq = P[3 * k] * P[3 * k] + P[3 * k + 1] * P[3 * k + 1] + P[3 * k + 2] * P[3 * k + 2];
+				if (sq > best_sq) {
+					best_sq = sq;
+					best = k;
+				}
+			}
+
+			T scale = 0;
+			if (best_sq > 0) scale = 1 / sqrt(best_sq);
+			for (int r = 0; r < 3; r++)
+				V(r, col) = P[3 * best + r] * scale;
+		}
+
+	public:
+
+		//calculates the determinant of the tensor
+		//	(M is a member of a dependent base class, so it is reached through this->)
+		CUDA_CALLABLE T det() {
+			const T* m = this->M;
+			return m[0] * m[3] * m[5] + 2 * (m[1] * m[4] * m[2]) - m[2] * m[3] * m[2] - m[1] * m[1] * m[5] - m[0] * m[4] * m[4];
+		}
+
+		//calculate the eigenvalues for the tensor, returned in ascending order
+		//	(lam[0] <= lam[1] <= lam[2])
+		//adapted from https://en.wikipedia.org/wiki/Eigenvalue_algorithm
+
+		CUDA_CALLABLE stim::vec3<T> lambda() {
+			const T* m = this->M;
+			stim::vec3<T> lam;
+			T p1 = m[1] * m[1] + m[2] * m[2] + m[4] * m[4];		//calculate the sum of the squared off-diagonal values
+			if (p1 == 0) {										//if this value is zero, the matrix is diagonal
+				lam[0] = m[0];									//the eigenvalues are the diagonal values
+				lam[1] = m[3];
+				lam[2] = m[5];
+				//sort them so that this branch returns the same ordering as the general case
+				T t;
+				if (lam[0] > lam[1]) { t = lam[0]; lam[0] = lam[1]; lam[1] = t; }
+				if (lam[1] > lam[2]) { t = lam[1]; lam[1] = lam[2]; lam[2] = t; }
+				if (lam[0] > lam[1]) { t = lam[0]; lam[0] = lam[1]; lam[1] = t; }
+				return lam;										//return the eigenvalue vector
+			}
+
+			T tr = matrix_sym<T, 3>::trace();					//calculate the trace of the matrix
+			T q = tr / 3;
+			T p2 = (m[0] - q) * (m[0] - q) + (m[3] - q) * (m[3] - q) + (m[5] - q) * (m[5] - q) + 2 * p1;
+			T p = sqrt(p2 / 6);									//p > 0 here because p1 > 0
+			tensor3<T> B = *this;								// B = A
+			B.M[0] = (B.M[0] - q);
+			B.M[3] = (B.M[3] - q);
+			B.M[5] = (B.M[5] - q);
+			matrix_sym<T, 3>::operator_product(B, 1/p);			// B = (1/p) * (A - q*I)
+			T r = B.det() / 2;									//calculate det(B) / 2
+
+			// In exact arithmetic for a symmetric matrix - 1 <= r <= 1
+			// but computation error can leave it slightly outside this range.
+			T phi;
+			if (r <= -1) phi = stim::PI / 3;
+			else if (r >= 1) phi = 0;
+			else phi = acos(r) / 3;
+
+			// the eigenvalues satisfy lam[2] >= lam[1] >= lam[0]
+			lam[2] = q + 2 * p * cos(phi);
+			lam[0] = q + 2 * p * cos(phi + (2 * stim::PI / 3));
+			lam[1] = 3 * q - (lam[2] + lam[0]);					//the trace is the sum of the eigenvalues
+
+			return lam;
+		}
+
+		//returns the eigenvectors as the columns of a matrix; column k belongs to lambda[k]
+		//	lambda must hold the eigenvalues of this tensor, e.g. the result of lambda()
+		//	each column is a normalized column of the product of (A - lambda[i] * I) over the
+		//	two other eigenvalues; it is zero when that product vanishes (repeated eigenvalues)
+		CUDA_CALLABLE stim::matrix<T, 3> eig(stim::vec3<T>& lambda) {
+			stim::matrix<T, 3> V;
+
+			stim::matrix<T, 3> M1 = matrix_sym<T, 3>::mat();
+			stim::matrix<T, 3> M2 = matrix_sym<T, 3>::mat();
+			stim::matrix<T, 3> M3 = matrix_sym<T, 3>::mat();	// fill a tensor with symmetric values
+
+			M1.operator_minus(M1, lambda[0]);					// M1 = A - lambda[0] * I
+			M2.operator_minus(M2, lambda[1]);					// M2 = A - lambda[1] * I
+			M3.operator_minus(M3, lambda[2]);					// M3 = A - lambda[2] * I
+
+			eigcol(M2, M3, V, 0);								//eigenvector for lambda[0]
+			eigcol(M1, M3, V, 1);								//eigenvector for lambda[1]
+			eigcol(M1, M2, V, 2);								//eigenvector for lambda[2]
+
+			return V;											//return the eigenvector matrix
+		}
+
+		//as above, with the eigenvalues computed here
+		//	(replaces a default argument that bound a temporary to a non-const reference)
+		CUDA_CALLABLE stim::matrix<T, 3> eig() {
+			stim::vec3<T> lam = lambda();
+			return eig(lam);
+		}
+
+		// return one specific eigenvector (column n of the eigenvector matrix)
+		CUDA_CALLABLE stim::vec3<T> eig(int n, stim::vec3<T>& lambda) {
+			stim::matrix<T, 3> V = eig(lambda);
+			stim::vec3<T> v;
+			for(int i = 0; i < 3; i++)
+				v[i] = V(i, n);
+			return v;
+		}
+
+		CUDA_CALLABLE stim::vec3<T> eig(int n) {
+			stim::vec3<T> lam = lambda();
+			return eig(n, lam);
+		}
+
+		//anisotropy measures; each expects the eigenvalues in ascending order and has an
+		//	overload without arguments that computes them with lambda()
+
+		//linear anisotropy: (lambda[2] - lambda[1]) / (sum of the eigenvalues)
+		CUDA_CALLABLE T linear(stim::vec3<T>& lambda) {
+			T cl = (lambda[2] - lambda[1]) / (lambda[0] + lambda[1] + lambda[2]);
+			return cl;
+		}
+
+		CUDA_CALLABLE T linear() {
+			stim::vec3<T> lam = lambda();
+			return linear(lam);
+		}
+
+		//planar anisotropy: 2 * (lambda[1] - lambda[0]) / (sum of the eigenvalues)
+		CUDA_CALLABLE T Planar(stim::vec3<T>& lambda) {
+			T cp = 2 * (lambda[1] - lambda[0]) / (lambda[0] + lambda[1] + lambda[2]);
+			return cp;
+		}
+
+		CUDA_CALLABLE T Planar() {
+			stim::vec3<T> lam = lambda();
+			return Planar(lam);
+		}
+
+		//spherical anisotropy: 3 * lambda[0] / (sum of the eigenvalues)
+		CUDA_CALLABLE T spherical(stim::vec3<T>& lambda) {
+			T cs = 3 * lambda[0] / (lambda[0] + lambda[1] + lambda[2]);
+			return cs;
+		}
+
+		CUDA_CALLABLE T spherical() {
+			stim::vec3<T> lam = lambda();
+			return spherical(lam);
+		}
+
+		//fractional anisotropy:
+		//	sqrt(1/2) * sqrt(sum of squared pairwise differences) / sqrt(sum of squares)
+		//	FIX: the value was never returned, and sqrt(1/2) evaluated to sqrt(0)
+		CUDA_CALLABLE T fa(stim::vec3<T>& lambda) {
+			T d21 = lambda[2] - lambda[1];
+			T d10 = lambda[1] - lambda[0];
+			T d02 = lambda[0] - lambda[2];
+			T num = d21 * d21 + d10 * d10 + d02 * d02;
+			T den = lambda[2] * lambda[2] + lambda[1] * lambda[1] + lambda[0] * lambda[0];
+			return sqrt((T)0.5) * sqrt(num) / sqrt(den);
+		}
+
+		CUDA_CALLABLE T fa() {
+			stim::vec3<T> lam = lambda();
+			return fa(lam);
+		}
+
+		//assign a scalar to every stored value
+		CUDA_CALLABLE tensor3<T> operator=(T rhs) {
+			stim::matrix_sym<T, 3>::operator=(rhs);
+			return *this;
+		}
+
+		//copy the stored values from a symmetric 3x3 matrix
+		CUDA_CALLABLE tensor3<T> operator=(stim::matrix_sym<T, 3> rhs) {
+			stim::matrix_sym<T, 3>::operator=(rhs);
+			return *this;
+		}
+	};
+
+
+}	//end namespace stim
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_VEC3_H
+#define STIM_VEC3_H
+
+
+#include <stim/cuda/cudatools/callable.h>
+#include <cmath>
+
+
+namespace stim{
+
+
+/// A class designed to act as a 3D vector with CUDA compatibility
+template<typename T>
+class vec3{
+
+protected:
+	T ptr[3];				//the three components, in x, y, z order
+
+public:
+
+	/// Default constructor: the components are left uninitialized
+	CUDA_CALLABLE vec3(){}
+
+	/// Sets all three components to v
+	CUDA_CALLABLE vec3(T v){
+		ptr[0] = ptr[1] = ptr[2] = v;
+	}
+
+	CUDA_CALLABLE vec3(T x, T y, T z){
+		ptr[0] = x;
+		ptr[1] = y;
+		ptr[2] = z;
+	}
+
+	//copy constructor
+	CUDA_CALLABLE vec3( const vec3<T>& other){
+		ptr[0] = other.ptr[0];
+		ptr[1] = other.ptr[1];
+		ptr[2] = other.ptr[2];
+	}
+
+	//access an element using an index (no bounds check)
+	CUDA_CALLABLE T& operator[](size_t idx){
+		return ptr[idx];
+	}
+
+	/// Pointer to the first of the three components
+	CUDA_CALLABLE T* data(){
+		return ptr;
+	}
+
+	/// Casting operator. Creates a new vector with a new type U.
+	//	goes through the public operator[]: vec3<U> is a different class when U != T, so
+	//	its protected ptr member is not accessible from here
+	template< typename U >
+	CUDA_CALLABLE operator vec3<U>(){
+		vec3<U> result;
+		result[0] = (U)ptr[0];
+		result[1] = (U)ptr[1];
+		result[2] = (U)ptr[2];
+
+		return result;
+	}
+
+	// computes the squared Euclidean length (useful for several operations where only >, =, or < matter)
+	CUDA_CALLABLE T len_sq() const{
+		return ptr[0] * ptr[0] + ptr[1] * ptr[1] + ptr[2] * ptr[2];
+	}
+
+	/// computes the Euclidean length of the vector
+	CUDA_CALLABLE T len() const{
+		return sqrt(len_sq());
+	}
+
+
+	/// Convert the vector from cartesian to spherical coordinates (x, y, z -> r, theta, phi)
+	//	theta = atan2(y, x); phi = acos(z / r), or 0 when r == 0
+	CUDA_CALLABLE vec3<T> cart2sph() const{
+		vec3<T> sph;
+		sph.ptr[0] = len();
+		sph.ptr[1] = std::atan2(ptr[1], ptr[0]);
+		if(sph.ptr[0] == 0)
+			sph.ptr[2] = 0;
+		else
+			sph.ptr[2] = std::acos(ptr[2] / sph.ptr[0]);
+		return sph;
+	}
+
+	/// Convert the vector from spherical to cartesian coordinates (r, theta, phi -> x, y, z)
+	CUDA_CALLABLE vec3<T> sph2cart() const{
+		vec3<T> cart;
+		cart.ptr[0] = ptr[0] * std::cos(ptr[1]) * std::sin(ptr[2]);
+		cart.ptr[1] = ptr[0] * std::sin(ptr[1]) * std::sin(ptr[2]);
+		cart.ptr[2] = ptr[0] * std::cos(ptr[2]);
+
+		return cart;
+	}
+
+	/// Computes the normalized vector (where each coordinate is divided by the L2 norm)
+	//	no check is made for a zero-length vector
+	CUDA_CALLABLE vec3<T> norm() const{
+		T l = len();						//compute the vector length
+		return (*this) / l;
+	}
+
+	/// Computes the cross product of a 3-dimensional vector
+	CUDA_CALLABLE vec3<T> cross(const vec3<T> rhs) const{
+
+		vec3<T> result;
+
+		result[0] = (ptr[1] * rhs.ptr[2] - ptr[2] * rhs.ptr[1]);
+		result[1] = (ptr[2] * rhs.ptr[0] - ptr[0] * rhs.ptr[2]);
+		result[2] = (ptr[0] * rhs.ptr[1] - ptr[1] * rhs.ptr[0]);
+
+		return result;
+	}
+
+	/// Compute the Euclidean inner (dot) product
+	CUDA_CALLABLE T dot(vec3<T> rhs) const{
+		return ptr[0] * rhs.ptr[0] + ptr[1] * rhs.ptr[1] + ptr[2] * rhs.ptr[2];
+	}
+
+	/// Arithmetic addition operator
+
+	/// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator+(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs[0];
+		result.ptr[1] = ptr[1] + rhs[1];
+		result.ptr[2] = ptr[2] + rhs[2];
+		return result;
+	}
+
+	/// Arithmetic addition to a scalar
+
+	/// @param rhs is the scalar added to every component
+	CUDA_CALLABLE vec3<T> operator+(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs;
+		result.ptr[1] = ptr[1] + rhs;
+		result.ptr[2] = ptr[2] + rhs;
+		return result;
+	}
+
+	/// Arithmetic subtraction operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator-(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs[0];
+		result.ptr[1] = ptr[1] - rhs[1];
+		result.ptr[2] = ptr[2] - rhs[2];
+		return result;
+	}
+	/// Arithmetic subtraction of a scalar
+
+	/// @param rhs is the scalar subtracted from every component
+	CUDA_CALLABLE vec3<T> operator-(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs;
+		result.ptr[1] = ptr[1] - rhs;
+		result.ptr[2] = ptr[2] - rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar multiplication operator
+
+	/// @param rhs is the scalar every component is multiplied by
+	CUDA_CALLABLE vec3<T> operator*(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] * rhs;
+		result.ptr[1] = ptr[1] * rhs;
+		result.ptr[2] = ptr[2] * rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar division operator (implemented as multiplication by 1/rhs)
+
+	/// @param rhs is the scalar every component is divided by
+	CUDA_CALLABLE vec3<T> operator/(T rhs) const{
+		return (*this) * ((T)1.0/rhs);
+	}
+
+	/// Multiplication by a scalar, followed by assignment
+	CUDA_CALLABLE vec3<T> operator*=(T rhs){
+		ptr[0] = ptr[0] * rhs;
+		ptr[1] = ptr[1] * rhs;
+		ptr[2] = ptr[2] * rhs;
+		return *this;
+	}
+
+	/// Addition and assignment
+	//	FIX: the whole vector rhs was added to each scalar component, which does not compile
+	//	once the operator is instantiated; the matching component is added instead
+	CUDA_CALLABLE vec3<T> operator+=(vec3<T> rhs){
+		ptr[0] = ptr[0] + rhs.ptr[0];
+		ptr[1] = ptr[1] + rhs.ptr[1];
+		ptr[2] = ptr[2] + rhs.ptr[2];
+		return *this;
+	}
+
+	/// Assign a scalar to all values
+	CUDA_CALLABLE vec3<T> & operator=(T rhs){
+		ptr[0] = rhs;
+		ptr[1] = rhs;
+		ptr[2] = rhs;
+		return *this;
+	}
+
+	/// Casting and assignment
+	//	reads rhs through the public operator[] because vec3<Y>::ptr is protected
+	template<typename Y>
+	CUDA_CALLABLE vec3<T> & operator=(vec3<Y> rhs){
+		ptr[0] = (T)rhs[0];
+		ptr[1] = (T)rhs[1];
+		ptr[2] = (T)rhs[2];
+		return *this;
+	}
+
+	/// Unary minus (returns the negative of the vector)
+	CUDA_CALLABLE vec3<T> operator-() const{
+		vec3<T> result;
+		result.ptr[0] = -ptr[0];
+		result.ptr[1] = -ptr[1];
+		result.ptr[2] = -ptr[2];
+		return result;
+	}
+
+	/// Outputs the vector as a string of the form "[x, y, z]"
+	//	(the merge-conflict markers that surrounded this method have been resolved; both
+	//	sides of the conflict contained the same code)
+	std::string str() const{
+		std::stringstream ss;
+
+		const size_t N = 3;
+
+		ss<<"[";
+		for(size_t i=0; i<N; i++)
+		{
+			ss<<ptr[i];
+			if(i != N-1)
+				ss<<", ";
+		}
+		ss<<"]";
+
+		return ss.str();
+	}
+
+	/// Number of components (always 3)
+	size_t size() const{ return 3; }
+
+	};						//end class vec3
+}							//end namespace stim
+
+/// Scalar-times-vector product with the scalar on the left-hand side.
+/// Forwards to vec3<T>::operator*(T), so lhs * rhs gives the same result as rhs * lhs.
+template <typename T>
+stim::vec3<T> operator*(T lhs, stim::vec3<T> rhs){
+	stim::vec3<T> scaled = rhs * lhs;
+	return scaled;
+}
+
+//stream insertion: writes the text produced by rhs.str() and returns the stream so that
+//	insertions can be chained
+template<typename T>
+std::ostream& operator<<(std::ostream& os, stim::vec3<T> const& rhs){
+	return os << rhs.str();
+}
+
+#endif
+#ifndef STIM_VEC3_H
+#define STIM_VEC3_H
+
+
+#include <stim/cuda/cudatools/callable.h>
+#include <cmath>
+
+
+namespace stim{
+
+
+/// A class designed to act as a 3D vector with CUDA compatibility
+template<typename T>
+class vec3{
+
+protected:
+	T ptr[3];
+
+public:
+
+	CUDA_CALLABLE vec3(){}
+
+	CUDA_CALLABLE vec3(T v){
+		ptr[0] = ptr[1] = ptr[2] = v;
+	}
+
+	CUDA_CALLABLE vec3(T x, T y, T z){
+		ptr[0] = x;
+		ptr[1] = y;
+		ptr[2] = z;
+	}
+
+	//copy constructor
+	CUDA_CALLABLE vec3( const vec3<T>& other){
+		ptr[0] = other.ptr[0];
+		ptr[1] = other.ptr[1];
+		ptr[2] = other.ptr[2];
+	}
+
+	//access an element using an index
+	CUDA_CALLABLE T& operator[](size_t idx){
+		return ptr[idx];
+	}
+
+	CUDA_CALLABLE T* data(){
+		return ptr;
+	}
+
+/// Casting operator. Creates a new vector with a new type U.
+	template< typename U >
+	CUDA_CALLABLE operator vec3<U>(){
+		vec3<U> result;
+		result.ptr[0] = (U)ptr[0];
+		result.ptr[1] = (U)ptr[1];
+		result.ptr[2] = (U)ptr[2];
+
+		return result;
+	}
+
+	// computes the squared Euclidean length (useful for several operations where only >, =, or < matter)
+	CUDA_CALLABLE T len_sq() const{
+		return ptr[0] * ptr[0] + ptr[1] * ptr[1] + ptr[2] * ptr[2];
+	}
+
+	/// computes the Euclidean length of the vector
+	CUDA_CALLABLE T len() const{
+		return sqrt(len_sq());
+	}
+	
+
+	/// Convert the vector from cartesian to spherical coordinates (x, y, z -> r, theta, phi where theta = [0, 2*pi])
+	CUDA_CALLABLE vec3<T> cart2sph() const{
+		vec3<T> sph;
+		sph.ptr[0] = len();
+		sph.ptr[1] = std::atan2(ptr[1], ptr[0]);
+		if(sph.ptr[0] == 0)
+			sph.ptr[2] = 0;
+		else
+			sph.ptr[2] = std::acos(ptr[2] / sph.ptr[0]);
+		return sph;
+	}
+
+	/// Convert the vector from cartesian to spherical coordinates (r, theta, phi -> x, y, z where theta = [0, 2*pi])
+	CUDA_CALLABLE vec3<T> sph2cart() const{
+		vec3<T> cart;
+		cart.ptr[0] = ptr[0] * std::cos(ptr[1]) * std::sin(ptr[2]);
+		cart.ptr[1] = ptr[0] * std::sin(ptr[1]) * std::sin(ptr[2]);
+		cart.ptr[2] = ptr[0] * std::cos(ptr[2]);
+
+		return cart;
+	}
+
+	/// Computes the normalized vector (where each coordinate is divided by the L2 norm)
+	CUDA_CALLABLE vec3<T> norm() const{
+        vec3<T> result;
+        T l = len();						//compute the vector length
+        return (*this) / l;
+	}
+
+	/// Computes the cross product of a 3-dimensional vector
+	CUDA_CALLABLE vec3<T> cross(const vec3<T> rhs) const{
+
+		vec3<T> result;
+
+		result[0] = (ptr[1] * rhs.ptr[2] - ptr[2] * rhs.ptr[1]);
+		result[1] = (ptr[2] * rhs.ptr[0] - ptr[0] * rhs.ptr[2]);
+		result[2] = (ptr[0] * rhs.ptr[1] - ptr[1] * rhs.ptr[0]);
+
+		return result;
+	}
+
+	/// Compute the Euclidean inner (dot) product
+    CUDA_CALLABLE T dot(vec3<T> rhs) const{
+        return ptr[0] * rhs.ptr[0] + ptr[1] * rhs.ptr[1] + ptr[2] * rhs.ptr[2];
+    }
+
+	/// Arithmetic addition operator
+
+    /// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator+(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs[0];
+		result.ptr[1] = ptr[1] + rhs[1];
+		result.ptr[2] = ptr[2] + rhs[2];
+		return result;
+	}
+
+	/// Arithmetic addition to a scalar
+
+	/// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator+(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs;
+		result.ptr[1] = ptr[1] + rhs;
+		result.ptr[2] = ptr[2] + rhs;
+		return result;
+	}
+
+	/// Arithmetic subtraction operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator-(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs[0];
+		result.ptr[1] = ptr[1] - rhs[1];
+		result.ptr[2] = ptr[2] - rhs[2];
+		return result;
+	}
+	/// Arithmetic subtraction to a scalar
+
+	/// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator-(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs;
+		result.ptr[1] = ptr[1] - rhs;
+		result.ptr[2] = ptr[2] - rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar multiplication operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator*(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] * rhs;
+		result.ptr[1] = ptr[1] * rhs;
+		result.ptr[2] = ptr[2] * rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar division operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator/(T rhs) const{
+		return (*this) * ((T)1.0/rhs);
+	}
+
+	/// Multiplication by a scalar, followed by assignment
+	CUDA_CALLABLE vec3<T> operator*=(T rhs){
+		ptr[0] = ptr[0] * rhs;
+		ptr[1] = ptr[1] * rhs;
+		ptr[2] = ptr[2] * rhs;
+		return *this;
+	}
+
+	/// Addition and assignment: adds rhs to this vector element by element
+
+	/// @param rhs is the vector added to this one
+	/// Returns a copy of the updated vector.
+	CUDA_CALLABLE vec3<T> operator+=(vec3<T> rhs){
+		ptr[0] = ptr[0] + rhs.ptr[0];		//use the matching element of rhs, not the whole vector
+		ptr[1] = ptr[1] + rhs.ptr[1];
+		ptr[2] = ptr[2] + rhs.ptr[2];
+		return *this;
+	}
+
+	/// Assign a scalar to all values
+
+	/// @param rhs is the value copied into each of the three elements
+	/// Returns a reference to this vector.
+	CUDA_CALLABLE vec3<T> & operator=(T rhs){
+		ptr[0] = rhs;
+		ptr[1] = rhs;
+		ptr[2] = rhs;
+		return *this;
+	}
+
+	/// Casting and assignment: copies a vector of element type Y, casting each element to T
+	/// Elements are read through the public operator[]: vec3<Y> is a different class
+	/// from vec3<T>, so its protected ptr array is not accessible from this member.
+	/// @param rhs is the source vector (taken by value)
+	template<typename Y>
+	CUDA_CALLABLE vec3<T> & operator=(vec3<Y> rhs){
+		ptr[0] = (T)rhs[0];
+		ptr[1] = (T)rhs[1];
+		ptr[2] = (T)rhs[2];
+		return *this;
+	}
+
+	/// Unary minus (returns the negative of the vector)
+	CUDA_CALLABLE vec3<T> operator-() const{
+		vec3<T> result;
+		result.ptr[0] = -ptr[0];
+		result.ptr[1] = -ptr[1];
+		result.ptr[2] = -ptr[2];
+		return result;
+	}
+
+	/// Outputs the vector as a string of the form "[x, y, z]"
+	/// This member is not marked CUDA_CALLABLE.
+	std::string str() const{
+		std::stringstream ss;
+
+		const size_t N = 3;
+
+		ss<<"[";
+		for(size_t i=0; i<N; i++)
+		{
+			ss<<ptr[i];
+			if(i != N-1)				//no separator after the last element
+				ss<<", ";
+		}
+		ss<<"]";
+
+		return ss.str();
+	}
+
+	size_t size(){ return 3; }
+
+	};						//end class vec3
+}							//end namespace stim
+
+/// Multiply a vector by a constant when the vector is on the right hand side
+template <typename T>
+stim::vec3<T> operator*(T lhs, stim::vec3<T> rhs){
+    return rhs * lhs;
+}
+
+//stream operator
+template<typename T>
+std::ostream& operator<<(std::ostream& os, stim::vec3<T> const& rhs){
+	os<<rhs.str();
+	return os;
+}
+
+#endif
+#ifndef STIM_VEC3_H
+#define STIM_VEC3_H
+
+
+#include <stim/cuda/cudatools/callable.h>
+
+
+namespace stim{
+
+
+/// A class designed to act as a 3D vector with CUDA compatibility
+template<typename T>
+class vec3{
+
+protected:
+	T ptr[3];
+
+public:
+
+	CUDA_CALLABLE vec3(){}
+
+	CUDA_CALLABLE vec3(T v){
+		ptr[0] = ptr[1] = ptr[2] = v;
+	}
+
+	CUDA_CALLABLE vec3(T x, T y, T z){
+		ptr[0] = x;
+		ptr[1] = y;
+		ptr[2] = z;
+	}
+
+	//copy constructor
+	CUDA_CALLABLE vec3( const vec3<T>& other){
+		ptr[0] = other.ptr[0];
+		ptr[1] = other.ptr[1];
+		ptr[2] = other.ptr[2];
+	}
+
+	//access an element using an index
+	CUDA_CALLABLE T& operator[](size_t idx){
+		return ptr[idx];
+	}
+
+	CUDA_CALLABLE T* data(){
+		return ptr;
+	}
+
+/// Casting operator. Creates a new vector whose elements are converted to type U.
+/// Elements are written through the public operator[]: vec3<U> is a different class
+/// from vec3<T>, so its protected ptr array is not accessible from this member.
+	template< typename U >
+	CUDA_CALLABLE operator vec3<U>(){
+		vec3<U> result;
+		result[0] = (U)ptr[0];
+		result[1] = (U)ptr[1];
+		result[2] = (U)ptr[2];
+
+		return result;
+	}
+
+	// computes the squared Euclidean length (useful for several operations where only >, =, or < matter)
+	CUDA_CALLABLE T len_sq() const{
+		return ptr[0] * ptr[0] + ptr[1] * ptr[1] + ptr[2] * ptr[2];
+	}
+
+	/// computes the Euclidean length of the vector
+	CUDA_CALLABLE T len() const{
+		return sqrt(len_sq());
+	}
+	
+
+	/// Convert the vector from cartesian to spherical coordinates (x, y, z -> r, theta, phi where theta = [0, 2*pi])
+	CUDA_CALLABLE vec3<T> cart2sph() const{
+		vec3<T> sph;
+		sph.ptr[0] = len();
+		sph.ptr[1] = std::atan2(ptr[1], ptr[0]);
+		if(sph.ptr[0] == 0)
+			sph.ptr[2] = 0;
+		else
+			sph.ptr[2] = std::acos(ptr[2] / sph.ptr[0]);
+		return sph;
+	}
+
+	/// Convert the vector from spherical to cartesian coordinates (r, theta, phi -> x, y, z)
+	/// The components are read as ptr[0] = r, ptr[1] = theta (angle in the x-y plane) and
+	/// ptr[2] = phi (angle measured from the z axis).
+	CUDA_CALLABLE vec3<T> sph2cart() const{
+		vec3<T> cart;
+		cart.ptr[0] = ptr[0] * std::cos(ptr[1]) * std::sin(ptr[2]);		//x = r cos(theta) sin(phi)
+		cart.ptr[1] = ptr[0] * std::sin(ptr[1]) * std::sin(ptr[2]);		//y = r sin(theta) sin(phi)
+		cart.ptr[2] = ptr[0] * std::cos(ptr[2]);						//z = r cos(phi)
+
+		return cart;
+	}
+
+	/// Computes the normalized vector (where each coordinate is divided by the L2 norm)
+	/// The length is not checked: for a zero-length vector the division is by zero.
+	CUDA_CALLABLE vec3<T> norm() const{
+		return (*this) / len();				//divide every coordinate by the vector length
+	}
+
+	/// Computes the cross product of a 3-dimensional vector
+	CUDA_CALLABLE vec3<T> cross(const vec3<T> rhs) const{
+
+		vec3<T> result;
+
+		result[0] = (ptr[1] * rhs.ptr[2] - ptr[2] * rhs.ptr[1]);
+		result[1] = (ptr[2] * rhs.ptr[0] - ptr[0] * rhs.ptr[2]);
+		result[2] = (ptr[0] * rhs.ptr[1] - ptr[1] * rhs.ptr[0]);
+
+		return result;
+	}
+
+	/// Compute the Euclidean inner (dot) product
+    CUDA_CALLABLE T dot(vec3<T> rhs) const{
+        return ptr[0] * rhs.ptr[0] + ptr[1] * rhs.ptr[1] + ptr[2] * rhs.ptr[2];
+    }
+
+	/// Arithmetic addition operator
+
+    /// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator+(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs[0];
+		result.ptr[1] = ptr[1] + rhs[1];
+		result.ptr[2] = ptr[2] + rhs[2];
+		return result;
+	}
+
+	/// Arithmetic addition to a scalar
+
+	/// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator+(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs;
+		result.ptr[1] = ptr[1] + rhs;
+		result.ptr[2] = ptr[2] + rhs;
+		return result;
+	}
+
+	/// Arithmetic subtraction operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator-(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs[0];
+		result.ptr[1] = ptr[1] - rhs[1];
+		result.ptr[2] = ptr[2] - rhs[2];
+		return result;
+	}
+	/// Arithmetic subtraction to a scalar
+
+	/// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator-(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs;
+		result.ptr[1] = ptr[1] - rhs;
+		result.ptr[2] = ptr[2] - rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar multiplication operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator*(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] * rhs;
+		result.ptr[1] = ptr[1] * rhs;
+		result.ptr[2] = ptr[2] * rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar division operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator/(T rhs) const{
+		return (*this) * ((T)1.0/rhs);
+	}
+
+	/// Multiplication by a scalar, followed by assignment
+	CUDA_CALLABLE vec3<T> operator*=(T rhs){
+		ptr[0] = ptr[0] * rhs;
+		ptr[1] = ptr[1] * rhs;
+		ptr[2] = ptr[2] * rhs;
+		return *this;
+	}
+
+	/// Addition and assignment: adds rhs to this vector element by element
+
+	/// @param rhs is the vector added to this one
+	/// Returns a copy of the updated vector.
+	CUDA_CALLABLE vec3<T> operator+=(vec3<T> rhs){
+		ptr[0] = ptr[0] + rhs.ptr[0];		//use the matching element of rhs, not the whole vector
+		ptr[1] = ptr[1] + rhs.ptr[1];
+		ptr[2] = ptr[2] + rhs.ptr[2];
+		return *this;
+	}
+
+	/// Assign a scalar to all values
+
+	/// @param rhs is the value copied into each of the three elements
+	/// Returns a reference to this vector.
+	CUDA_CALLABLE vec3<T> & operator=(T rhs){
+		ptr[0] = rhs;
+		ptr[1] = rhs;
+		ptr[2] = rhs;
+		return *this;
+	}
+
+	/// Casting and assignment: copies a vector of element type Y, casting each element to T
+	/// Elements are read through the public operator[]: vec3<Y> is a different class
+	/// from vec3<T>, so its protected ptr array is not accessible from this member.
+	/// @param rhs is the source vector (taken by value)
+	template<typename Y>
+	CUDA_CALLABLE vec3<T> & operator=(vec3<Y> rhs){
+		ptr[0] = (T)rhs[0];
+		ptr[1] = (T)rhs[1];
+		ptr[2] = (T)rhs[2];
+		return *this;
+	}
+
+	/// Unary minus (returns the negative of the vector)
+	CUDA_CALLABLE vec3<T> operator-() const{
+		vec3<T> result;
+		result.ptr[0] = -ptr[0];
+		result.ptr[1] = -ptr[1];
+		result.ptr[2] = -ptr[2];
+		return result;
+	}
+
+#ifndef __NVCC__
+	/// Outputs the vector as a string
+	std::string str() const{
+		std::stringstream ss;
+
+		const size_t N = 3;
+
+		ss<<"[";
+		for(size_t i=0; i<N; i++)
+		{
+			ss<<ptr[i];
+			if(i != N-1)
+				ss<<", ";
+		}
+		ss<<"]";
+
+		return ss.str();
+	}
+#endif
+
+	size_t size(){ return 3; }
+
+	};						//end class vec3
+}							//end namespace stim
+
+/// Multiply a vector by a constant when the vector is on the right hand side
+template <typename T>
+stim::vec3<T> operator*(T lhs, stim::vec3<T> rhs){
+    return rhs * lhs;
+}
+
+//stream operator
+template<typename T>
+std::ostream& operator<<(std::ostream& os, stim::vec3<T> const& rhs){
+	os<<rhs.str();
+	return os;
+}
+
+#endif
+#ifndef STIM_VEC3_H
+#define STIM_VEC3_H
+
+
+#include <stim/cuda/cudatools/callable.h>
+
+
+namespace stim{
+
+
+/// A class designed to act as a 3D vector with CUDA compatibility
+template<typename T>
+class vec3{
+
+protected:
+	T ptr[3];
+
+public:
+
+	CUDA_CALLABLE vec3(){}
+
+	CUDA_CALLABLE vec3(T v){
+		ptr[0] = ptr[1] = ptr[2] = v;
+	}
+
+	CUDA_CALLABLE vec3(T x, T y, T z){
+		ptr[0] = x;
+		ptr[1] = y;
+		ptr[2] = z;
+	}
+
+	//copy constructor
+	CUDA_CALLABLE vec3( const vec3<T>& other){
+		ptr[0] = other.ptr[0];
+		ptr[1] = other.ptr[1];
+		ptr[2] = other.ptr[2];
+	}
+
+	//access an element using an index
+	CUDA_CALLABLE T& operator[](size_t idx){
+		return ptr[idx];
+	}
+
+	CUDA_CALLABLE T* data(){
+		return ptr;
+	}
+
+/// Casting operator. Creates a new vector whose elements are converted to type U.
+/// Elements are written through the public operator[]: vec3<U> is a different class
+/// from vec3<T>, so its protected ptr array is not accessible from this member.
+	template< typename U >
+	CUDA_CALLABLE operator vec3<U>(){
+		vec3<U> result;
+		result[0] = (U)ptr[0];
+		result[1] = (U)ptr[1];
+		result[2] = (U)ptr[2];
+
+		return result;
+	}
+
+	// computes the squared Euclidean length (useful for several operations where only >, =, or < matter)
+	CUDA_CALLABLE T len_sq() const{
+		return ptr[0] * ptr[0] + ptr[1] * ptr[1] + ptr[2] * ptr[2];
+	}
+
+	/// computes the Euclidean length of the vector
+	CUDA_CALLABLE T len() const{
+		return sqrt(len_sq());
+	}
+	
+
+	/// Convert the vector from cartesian to spherical coordinates (x, y, z -> r, theta, phi where theta = [0, 2*pi])
+	CUDA_CALLABLE vec3<T> cart2sph() const{
+		vec3<T> sph;
+		sph.ptr[0] = len();
+		sph.ptr[1] = std::atan2(ptr[1], ptr[0]);
+		if(sph.ptr[0] == 0)
+			sph.ptr[2] = 0;
+		else
+			sph.ptr[2] = std::acos(ptr[2] / sph.ptr[0]);
+		return sph;
+	}
+
+	/// Convert the vector from spherical to cartesian coordinates (r, theta, phi -> x, y, z)
+	/// The components are read as ptr[0] = r, ptr[1] = theta (angle in the x-y plane) and
+	/// ptr[2] = phi (angle measured from the z axis).
+	CUDA_CALLABLE vec3<T> sph2cart() const{
+		vec3<T> cart;
+		cart.ptr[0] = ptr[0] * std::cos(ptr[1]) * std::sin(ptr[2]);		//x = r cos(theta) sin(phi)
+		cart.ptr[1] = ptr[0] * std::sin(ptr[1]) * std::sin(ptr[2]);		//y = r sin(theta) sin(phi)
+		cart.ptr[2] = ptr[0] * std::cos(ptr[2]);						//z = r cos(phi)
+
+		return cart;
+	}
+
+	/// Computes the normalized vector (where each coordinate is divided by the L2 norm)
+	/// The length is not checked: for a zero-length vector the division is by zero.
+	CUDA_CALLABLE vec3<T> norm() const{
+		return (*this) / len();				//divide every coordinate by the vector length
+	}
+
+	/// Computes the cross product of a 3-dimensional vector
+	CUDA_CALLABLE vec3<T> cross(const vec3<T> rhs) const{
+
+		vec3<T> result;
+
+		result[0] = (ptr[1] * rhs.ptr[2] - ptr[2] * rhs.ptr[1]);
+		result[1] = (ptr[2] * rhs.ptr[0] - ptr[0] * rhs.ptr[2]);
+		result[2] = (ptr[0] * rhs.ptr[1] - ptr[1] * rhs.ptr[0]);
+
+		return result;
+	}
+
+	/// Compute the Euclidean inner (dot) product
+    CUDA_CALLABLE T dot(vec3<T> rhs) const{
+        return ptr[0] * rhs.ptr[0] + ptr[1] * rhs.ptr[1] + ptr[2] * rhs.ptr[2];
+    }
+
+	/// Arithmetic addition operator
+
+    /// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator+(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs[0];
+		result.ptr[1] = ptr[1] + rhs[1];
+		result.ptr[2] = ptr[2] + rhs[2];
+		return result;
+	}
+
+	/// Arithmetic addition to a scalar
+
+	/// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator+(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs;
+		result.ptr[1] = ptr[1] + rhs;
+		result.ptr[2] = ptr[2] + rhs;
+		return result;
+	}
+
+	/// Arithmetic subtraction operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator-(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs[0];
+		result.ptr[1] = ptr[1] - rhs[1];
+		result.ptr[2] = ptr[2] - rhs[2];
+		return result;
+	}
+	/// Arithmetic subtraction to a scalar
+
+	/// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator-(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs;
+		result.ptr[1] = ptr[1] - rhs;
+		result.ptr[2] = ptr[2] - rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar multiplication operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator*(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] * rhs;
+		result.ptr[1] = ptr[1] * rhs;
+		result.ptr[2] = ptr[2] * rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar division operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator/(T rhs) const{
+		return (*this) * ((T)1.0/rhs);
+	}
+
+	/// Multiplication by a scalar, followed by assignment
+	CUDA_CALLABLE vec3<T> operator*=(T rhs){
+		ptr[0] = ptr[0] * rhs;
+		ptr[1] = ptr[1] * rhs;
+		ptr[2] = ptr[2] * rhs;
+		return *this;
+	}
+
+	/// Addition and assignment: adds rhs to this vector element by element
+
+	/// @param rhs is the vector added to this one
+	/// Returns a copy of the updated vector.
+	CUDA_CALLABLE vec3<T> operator+=(vec3<T> rhs){
+		ptr[0] = ptr[0] + rhs.ptr[0];		//use the matching element of rhs, not the whole vector
+		ptr[1] = ptr[1] + rhs.ptr[1];
+		ptr[2] = ptr[2] + rhs.ptr[2];
+		return *this;
+	}
+
+	/// Assign a scalar to all values
+
+	/// @param rhs is the value copied into each of the three elements
+	/// Returns a reference to this vector.
+	CUDA_CALLABLE vec3<T> & operator=(T rhs){
+		ptr[0] = rhs;
+		ptr[1] = rhs;
+		ptr[2] = rhs;
+		return *this;
+	}
+
+	/// Casting and assignment: copies a vector of element type Y, casting each element to T
+	/// Elements are read through the public operator[]: vec3<Y> is a different class
+	/// from vec3<T>, so its protected ptr array is not accessible from this member.
+	/// @param rhs is the source vector (taken by value)
+	template<typename Y>
+	CUDA_CALLABLE vec3<T> & operator=(vec3<Y> rhs){
+		ptr[0] = (T)rhs[0];
+		ptr[1] = (T)rhs[1];
+		ptr[2] = (T)rhs[2];
+		return *this;
+	}
+
+	/// Unary minus (returns the negative of the vector)
+	CUDA_CALLABLE vec3<T> operator-() const{
+		vec3<T> result;
+		result.ptr[0] = -ptr[0];
+		result.ptr[1] = -ptr[1];
+		result.ptr[2] = -ptr[2];
+		return result;
+	}
+
+//#ifndef __NVCC__
+	/// Outputs the vector as a string
+	std::string str() const{
+		std::stringstream ss;
+
+		const size_t N = 3;
+
+		ss<<"[";
+		for(size_t i=0; i<N; i++)
+		{
+			ss<<ptr[i];
+			if(i != N-1)
+				ss<<", ";
+		}
+		ss<<"]";
+
+		return ss.str();
+	}
+//#endif
+
+	size_t size(){ return 3; }
+
+	};						//end class vec3
+}							//end namespace stim
+
+/// Multiply a vector by a constant when the vector is on the right hand side
+template <typename T>
+stim::vec3<T> operator*(T lhs, stim::vec3<T> rhs){
+    return rhs * lhs;
+}
+
+//stream operator
+template<typename T>
+std::ostream& operator<<(std::ostream& os, stim::vec3<T> const& rhs){
+	os<<rhs.str();
+	return os;
+}
+
+#endif
+#ifndef STIM_VEC3_H
+#define STIM_VEC3_H
+
+
+#include <stim/cuda/cudatools/callable.h>
+#include <cmath>
+
+
+namespace stim{
+
+
+/// A class designed to act as a 3D vector with CUDA compatibility
+template<typename T>
+class vec3{
+
+protected:
+	T ptr[3];
+
+public:
+
+	CUDA_CALLABLE vec3(){}
+
+	CUDA_CALLABLE vec3(T v){
+		ptr[0] = ptr[1] = ptr[2] = v;
+	}
+
+	CUDA_CALLABLE vec3(T x, T y, T z){
+		ptr[0] = x;
+		ptr[1] = y;
+		ptr[2] = z;
+	}
+
+	//copy constructor
+	CUDA_CALLABLE vec3( const vec3<T>& other){
+		ptr[0] = other.ptr[0];
+		ptr[1] = other.ptr[1];
+		ptr[2] = other.ptr[2];
+	}
+
+	//access an element using an index
+	CUDA_CALLABLE T& operator[](size_t idx){
+		return ptr[idx];
+	}
+
+	CUDA_CALLABLE T* data(){
+		return ptr;
+	}
+
+/// Casting operator. Creates a new vector whose elements are converted to type U.
+/// Elements are written through the public operator[]: vec3<U> is a different class
+/// from vec3<T>, so its protected ptr array is not accessible from this member.
+	template< typename U >
+	CUDA_CALLABLE operator vec3<U>(){
+		vec3<U> result;
+		result[0] = (U)ptr[0];
+		result[1] = (U)ptr[1];
+		result[2] = (U)ptr[2];
+
+		return result;
+	}
+
+	// computes the squared Euclidean length (useful for several operations where only >, =, or < matter)
+	CUDA_CALLABLE T len_sq() const{
+		return ptr[0] * ptr[0] + ptr[1] * ptr[1] + ptr[2] * ptr[2];
+	}
+
+	/// computes the Euclidean length of the vector
+	CUDA_CALLABLE T len() const{
+		return sqrt(len_sq());
+	}
+	
+
+	/// Convert the vector from cartesian to spherical coordinates (x, y, z -> r, theta, phi where theta = [0, 2*pi])
+	CUDA_CALLABLE vec3<T> cart2sph() const{
+		vec3<T> sph;
+		sph.ptr[0] = len();
+		sph.ptr[1] = std::atan2(ptr[1], ptr[0]);
+		if(sph.ptr[0] == 0)
+			sph.ptr[2] = 0;
+		else
+			sph.ptr[2] = std::acos(ptr[2] / sph.ptr[0]);
+		return sph;
+	}
+
+	/// Convert the vector from spherical to cartesian coordinates (r, theta, phi -> x, y, z)
+	/// The components are read as ptr[0] = r, ptr[1] = theta (angle in the x-y plane) and
+	/// ptr[2] = phi (angle measured from the z axis).
+	CUDA_CALLABLE vec3<T> sph2cart() const{
+		vec3<T> cart;
+		cart.ptr[0] = ptr[0] * std::cos(ptr[1]) * std::sin(ptr[2]);		//x = r cos(theta) sin(phi)
+		cart.ptr[1] = ptr[0] * std::sin(ptr[1]) * std::sin(ptr[2]);		//y = r sin(theta) sin(phi)
+		cart.ptr[2] = ptr[0] * std::cos(ptr[2]);						//z = r cos(phi)
+
+		return cart;
+	}
+
+	/// Computes the normalized vector (where each coordinate is divided by the L2 norm)
+	/// The length is not checked: for a zero-length vector the division is by zero.
+	CUDA_CALLABLE vec3<T> norm() const{
+		return (*this) / len();				//divide every coordinate by the vector length
+	}
+
+	/// Computes the cross product of a 3-dimensional vector
+	CUDA_CALLABLE vec3<T> cross(const vec3<T> rhs) const{
+
+		vec3<T> result;
+
+		result[0] = (ptr[1] * rhs.ptr[2] - ptr[2] * rhs.ptr[1]);
+		result[1] = (ptr[2] * rhs.ptr[0] - ptr[0] * rhs.ptr[2]);
+		result[2] = (ptr[0] * rhs.ptr[1] - ptr[1] * rhs.ptr[0]);
+
+		return result;
+	}
+
+	/// Compute the Euclidean inner (dot) product
+    CUDA_CALLABLE T dot(vec3<T> rhs) const{
+        return ptr[0] * rhs.ptr[0] + ptr[1] * rhs.ptr[1] + ptr[2] * rhs.ptr[2];
+    }
+
+	/// Arithmetic addition operator
+
+    /// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator+(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs[0];
+		result.ptr[1] = ptr[1] + rhs[1];
+		result.ptr[2] = ptr[2] + rhs[2];
+		return result;
+	}
+
+	/// Arithmetic addition to a scalar
+
+	/// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator+(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] + rhs;
+		result.ptr[1] = ptr[1] + rhs;
+		result.ptr[2] = ptr[2] + rhs;
+		return result;
+	}
+
+	/// Arithmetic subtraction operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator-(vec3<T> rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs[0];
+		result.ptr[1] = ptr[1] - rhs[1];
+		result.ptr[2] = ptr[2] - rhs[2];
+		return result;
+	}
+	/// Arithmetic subtraction to a scalar
+
+	/// @param rhs is the right-hand-side operator for the addition
+	CUDA_CALLABLE vec3<T> operator-(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] - rhs;
+		result.ptr[1] = ptr[1] - rhs;
+		result.ptr[2] = ptr[2] - rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar multiplication operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator*(T rhs) const{
+		vec3<T> result;
+		result.ptr[0] = ptr[0] * rhs;
+		result.ptr[1] = ptr[1] * rhs;
+		result.ptr[2] = ptr[2] * rhs;
+		return result;
+	}
+
+	/// Arithmetic scalar division operator
+
+	/// @param rhs is the right-hand-side operator for the subtraction
+	CUDA_CALLABLE vec3<T> operator/(T rhs) const{
+		return (*this) * ((T)1.0/rhs);
+	}
+
+	/// Multiplication by a scalar, followed by assignment
+	CUDA_CALLABLE vec3<T> operator*=(T rhs){
+		ptr[0] = ptr[0] * rhs;
+		ptr[1] = ptr[1] * rhs;
+		ptr[2] = ptr[2] * rhs;
+		return *this;
+	}
+
+	/// Addition and assignment: adds rhs to this vector element by element
+
+	/// @param rhs is the vector added to this one
+	/// Returns a copy of the updated vector.
+	CUDA_CALLABLE vec3<T> operator+=(vec3<T> rhs){
+		ptr[0] = ptr[0] + rhs.ptr[0];		//use the matching element of rhs, not the whole vector
+		ptr[1] = ptr[1] + rhs.ptr[1];
+		ptr[2] = ptr[2] + rhs.ptr[2];
+		return *this;
+	}
+
+	/// Assign a scalar to all values
+
+	/// @param rhs is the value copied into each of the three elements
+	/// Returns a reference to this vector.
+	CUDA_CALLABLE vec3<T> & operator=(T rhs){
+		ptr[0] = rhs;
+		ptr[1] = rhs;
+		ptr[2] = rhs;
+		return *this;
+	}
+
+	/// Casting and assignment: copies a vector of element type Y, casting each element to T
+	/// Elements are read through the public operator[]: vec3<Y> is a different class
+	/// from vec3<T>, so its protected ptr array is not accessible from this member.
+	/// @param rhs is the source vector (taken by value)
+	template<typename Y>
+	CUDA_CALLABLE vec3<T> & operator=(vec3<Y> rhs){
+		ptr[0] = (T)rhs[0];
+		ptr[1] = (T)rhs[1];
+		ptr[2] = (T)rhs[2];
+		return *this;
+	}
+
+	/// Unary minus (returns the negative of the vector)
+	CUDA_CALLABLE vec3<T> operator-() const{
+		vec3<T> result;
+		result.ptr[0] = -ptr[0];
+		result.ptr[1] = -ptr[1];
+		result.ptr[2] = -ptr[2];
+		return result;
+	}
+
+	/// Outputs the vector as a string
+	std::string str() const{
+		std::stringstream ss;
+
+		const size_t N = 3;
+
+		ss<<"[";
+		for(size_t i=0; i<N; i++)
+		{
+			ss<<ptr[i];
+			if(i != N-1)
+				ss<<", ";
+		}
+		ss<<"]";
+
+		return ss.str();
+	}
+
+	size_t size(){ return 3; }
+
+	};						//end class vec3
+}							//end namespace stim
+
+/// Multiply a vector by a constant when the vector is on the right hand side
+template <typename T>
+stim::vec3<T> operator*(T lhs, stim::vec3<T> rhs){
+    return rhs * lhs;
+}
+
+//stream operator
+template<typename T>
+std::ostream& operator<<(std::ostream& os, stim::vec3<T> const& rhs){
+	os<<rhs.str();
+	return os;
+}
+
+#endif
@@ -5,6 +5,7 @@
 #include <cmath>
 #include <sstream>
 #include <vector>
+#include <algorithm>
 #include <stim/cuda/cudatools/callable.h>
 #include <stim/math/vec3.h>
@@ -74,11 +75,11 @@ struct vec : public std::vector<T>
 			at(i) = other[i];
 		}
 	}
-
+
 //	vec( vec3<T>& other){
 //		resize(3);							//resize the current vector to match the copy
 //		for(size_t i=0; i<3; i++){	//copy each element
-//			at(i) = other[i];
+//			at(i) = other[i];
 //		}
 //	}
@@ -139,16 +140,16 @@ struct vec : public std::vector<T>
 	}
-	
-	vec<T> cyl2cart() const
-	{
-		vec<T> cyl;
-		cyl.push_back(at(0)*std::sin(at(1)));
-		cyl.push_back(at(0)*std::cos(at(1)));
-		cyl.push_back(at(2));
-		return(cyl);
-		
-	}
+	
+	vec<T> cyl2cart() const
+	{
+		vec<T> cyl;
+		cyl.push_back(at(0)*std::sin(at(1)));
+		cyl.push_back(at(0)*std::cos(at(1)));
+		cyl.push_back(at(2));
+		return(cyl);
+		
+	}
 	/// Convert the vector from cartesian to spherical coordinates (x, y, z -> r, theta, phi where theta = [0, 2*pi])
 	vec<T> cart2sph() const
 	{
@@ -335,16 +336,16 @@ struct vec : public std::vector<T>
 		return *this;
 	}
-	/// Cast to a vec3
-	operator stim::vec3<T>(){
-		stim::vec3<T> r;
-		size_t N = std::min<size_t>(size(), 3);
-		for(size_t i = 0; i < N; i++)
-			r[i] = at(i);
-		return r;
-	}
-
-
+	/// Cast to a vec3
+	operator stim::vec3<T>(){
+		stim::vec3<T> r;
+		size_t N = std::min(size(), (size_t)3);
+		for(size_t i = 0; i < N; i++)
+			r[i] = at(i);
+		return r;
+	}
+
+
 	/// Casting and assignment
 	template<typename Y>
 	vec<T> & operator=(vec<Y> rhs){
@@ -355,16 +356,16 @@ struct vec : public std::vector<T>
 			at(i) = rhs[i];
 		return *this;
 	}
-
-	/// Assign a vec = vec3
-	template<typename Y>
-	vec<T> & operator=(vec3<Y> rhs)
-	{
-		resize(3);
-		for(size_t i=0; i<3; i++)
-			at(i) = rhs[i];
-		return *this;
-	}
+
+	/// Assign a vec = vec3
+	template<typename Y>
+	vec<T> & operator=(vec3<Y> rhs)
+	{
+		resize(3);
+		for(size_t i=0; i<3; i++)
+			at(i) = rhs[i];
+		return *this;
+	}
 	/// Unary minus (returns the negative of the vector)
 	vec<T> operator-() const{
@@ -13,6 +13,44 @@
 #include <Windows.h>
 #endif
+/**The arglist class implements command line arguments.
+    Example:
+
+    1) Create an arglist instance:
+
+        stim::arglist args;
+
+    2) Add arguments:
+
+        args.add("help", "prints this help");
+        args.add("foo", "foo takes a single integer value", "", "[intval]");
+        args.add("bar", "bar takes two floating point values", "", "[value1], [value2]");
+
+    3) Parse the command line:
+
+        args.parse(argc, argv);
+
+    4) You generally want to immediately test for help and output available arguments:
+
+        if(args["help"].is_set())
+            std::cout<<args.str();
+
+
+
+    5)  Retrieve values:
+
+        int foo;
+        float bar1, bar2;
+        if(args["foo"])
+            foo = args["foo"].as_int();
+        if(args["bar"]){
+            bar1 = args["bar"].as_float(0);
+            bar2 = args["bar"].as_float(1);
+        }
+
+
+**/
+
 namespace stim{
 	class cmd_option
@@ -258,10 +296,12 @@ namespace stim{
             flag = true;
 		}
-		bool is_set()
-		{
+		bool is_set() const{
             return flag;
         }
+        operator bool() const{
+            return is_set();
+        }
 	};
@@ -271,43 +311,7 @@ namespace stim{
 		size_t index;
 	};
-    /**The arglist class implements command line arguments.
-        Example:
-
-        1) Create an arglist instance:
-
-            stim::arglist args;
-
-        2) Add arguments:
-            args.add("help", "prints this help");
-            args.add("foo", "foo takes a single integer value", "", "[intval]");
-            args.add("bar", "bar takes two floating point values", "", "[value1], [value2]");
-
-        3) Parse the command line:
-
-            args.parse(argc, argv);
-
-        4) You generally want to immediately test for help and output available arguments:
-
-            if(args["help"].is_set())
-                std::cout<<args.str();
-
-
-
-        5)  Retrieve values:
-
-            int foo;
-            float bar1, bar2;
-            if(args["foo"])
-                foo = args["foo"].as_int();
-            if(args["bar"]){
-                bar1 = args["bar"].as_float(0);
-                bar2 = args["bar"].as_float(1);
-            }
-
-
-    **/
 	class arglist
 	{
@@ -528,21 +532,21 @@ namespace stim{
 	std::vector<std::string> arg_vector(){
 		return args;
 	}
-        ///Returns an object describing the argument
-
-        /// @param _name is the name of the requested argument
-        cmd_option operator[](std::string _name){
-			std::vector<cmd_option>::iterator it;
-            it = find(opts.begin(), opts.end(), _name);// - opts.begin();
+    ///Returns an object describing the argument
-            if(it == opts.end()){
-                std::cout<<"ERROR - Unspecified parameter name: "<<_name<<std::endl;
-                exit(1);
-            }
+    /// @param _name is the name of the requested argument
+    cmd_option operator[](std::string _name){
+		std::vector<cmd_option>::iterator it;
+        it = find(opts.begin(), opts.end(), _name);// - opts.begin();
-            return *it;
+        if(it == opts.end()){
+            std::cout<<"ERROR - Unspecified parameter name: "<<_name<<std::endl;
+            exit(1);
         }
+        return *it;
+    }
+
 	};
+/// Reconstruct a 1D function from a 2D symmetric function. This function takes a 2D image f(x,y) as input and
+///		builds a 1D function f(r) where r = sqrt(x^2 + y^2) to approximate this 2D function.
+///	This is useful for several applications, such as:
+///		1) Calculating a 1D function from a noisy 2D image, when you know the 2D image is supposed to be symmetric
+///		2) Calculating the average value for every r = sqrt(x^2 + y^2)
+
+/// Given a set of function samples equally spaced by dx, calculate the lower of the two samples bracketing x and the
+/// 	proximity ratio alpha. This can be used to linearly interpolate between an array of equally spaced values.
+/// 	Given the query value x, the interpolated value can be calculated as
+/// 	r = values[sample] * alpha + values[sample + 1] * (1 - alpha)
+/// @param sample (output) is the index of the bin at or immediately below the query point x, i.e. floor(x / dx)
+/// @param alpha (output) is the weight of bin [sample]: 1 when x lies exactly on that bin, approaching 0 as x
+/// 	approaches bin [sample + 1]
+/// @param dx is the spacing between values (must be non-zero, since it is used as a divisor)
+/// @param x is the query point
+template<typename T>
+void lerp_alpha(T& sample, T& alpha, T dx, T x){
+	sample = std::floor(x/dx);						//index of the sample at or below x
+	alpha = 1 - (x - (sample * dx)) / dx;			//weight of the lower sample: one minus the fractional position of x inside the bin
+}
+
+/// Reconstruct a radial function f(r) from a square, radially symmetric image f(x, y).
+/// This function assumes that the input image is square, that the # of samples are odd, and that r=0 is at the center
+/// @param fr is an array of X elements that will store the reconstructed function
+/// @param dr (output) is the spacing (in pixels) between samples in fr; it is chosen so that fr[X-1] corresponds to
+///		the largest radius in the image (the distance from the center pixel to a corner pixel)
+/// @param fxy is the X*X input image, addressed as fxy[y * X + x]
+/// @param X is the (odd) number of pixels along each image dimension; the program exits if X is even
+template<typename T>
+void cpu_func1_from_symmetric2(T* fr, T& dr, T* fxy, size_t X){
+
+	if(X%2 == 0){ 													//the 2D function must be odd (a sample must be available for r=0)
+		std::cerr<<"Error, X = "<<X<<" must be odd."<<std::endl;
+		exit(1);
+	}
+	size_t center = X/2;											//index of the center pixel (r = 0) along each axis
+	size_t C = center + 1;											//width of the folded image: one entry per integer offset 0..center
+	size_t N = C * C;												//number of values in the folded function
+
+	// The first step is to fold the function 8 times to take advantage of symmetry in the grid
+	T* folded = (T*) malloc(sizeof(T) * N );						//allocate space for the folded function
+	memset(folded, 0, sizeof(T) * N);
+	char* count = (char*) malloc( N );								//allocate space for a counter for the folded function (at most 8 hits per entry)
+	memset(count, 0, N);											//the counter holds N bytes, not N values of type T
+	size_t xi, yi;													//indices into the image f(xi, yi)
+	size_t xii, yii;												//indices into the folded image (offsets from the center pixel)
+	for(xi = 0; xi < X; xi++){
+		for(yi = 0; yi < X; yi++){
+			//fold the function along the x and y axes: the folded index is the distance from the center pixel
+			xii = (xi > center) ? xi - center : center - xi;
+			yii = (yi > center) ? yi - center : center - yi;
+
+			if(xii < yii) std::swap(xii, yii);						//fold the function again along the 45-degree line so that xii >= yii
+
+			folded[yii * C + xii] += fxy[yi * X + xi];				//add the value f(x, y) to the folded function
+			count[yii * C + xii] += 1;								//add a counter to the counter table
+		}
+	}
+
+	//divide out the counter to correct the folded function
+	for(size_t i = 0; i < N; i++){
+		if(count[i] != 0)											//entries with xii < yii are never written and stay zero
+			folded[i] /= (T)count[i];
+	}
+
+	T max_r = (T)(std::sqrt(2.0) * (double)center);					//largest radius in the image: center pixel to corner pixel
+	dr = (X > 1) ? max_r / (T)(X - 1) : (T)1;						//spacing between output samples (any non-zero value works for a 1x1 image)
+
+	T* fA = (T*) malloc( sizeof(T) * X);							//allocate space for a counter function storing alpha weights
+	memset(fA, 0, sizeof(T) * X);									//zero out the alpha array
+	memset(fr, 0, sizeof(T) * X);									//zero out the output function
+
+	T r;															//register to store the value of r at each point
+	T bin;															//lower bin returned by lerp_alpha (same type as its other arguments)
+	size_t sample;
+	T alpha;
+	for(xi = 0; xi < C; xi++){
+		for(yi = 0; yi <= xi; yi++){								//visit every populated entry of the folded function, diagonal included
+			r = (T)std::sqrt((double)(xi*xi + yi*yi));				//calculate the value of r for the current (x, y)
+			lerp_alpha(bin, alpha, dr, r);							//calculate the lowest nearby sample index and the associated alpha weight
+			sample = (size_t)bin;
+			if(sample >= X){										//guard against round-off pushing the corner sample past the last bin
+				sample = X - 1;
+				alpha = 1;
+			}
+			fr[sample] += folded[yi * C + xi] * alpha;				//sum the weighted value from the folded function
+			fA[sample] += alpha;									//sum the weight
+
+			if(sample + 1 < X){											//if we aren't dealing with the last bin
+				fr[sample + 1] += folded[yi * C + xi] * ((T)1 - alpha);	//calculate the weighted value for the second point
+				fA[sample + 1] += (T)1 - alpha;							//add the second alpha value
+			}
+		}
+	}
+
+	//divide out the alpha values (bins that received no weight are left at zero)
+	for(size_t i = 0; i < X; i++){
+		if(fA[i] > 0)
+			fr[i] /= fA[i];
+	}
+
+	//free allocated memory
+	free(folded);
+	free(count);
+	free(fA);
+}
 \ No newline at end of file
+// each CUDA thread uses a traversal stack of stack_size entries (currently 50, see the #define below); increase stack_size if you need a deeper tree
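+// data should be stored in row-major order with one point per row, i.e. the D coordinates of a point are contiguous
+// x1,x2,x3,x4,x5......   <- coordinates of the first point
+// y1,y2,y3,y4,y5......   <- coordinates of the second point
+// ....................
+// ....................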
+
+#ifndef KDTREE_H
+#define KDTREE_H
+#define stack_size 50
+
+#include "device_launch_parameters.h"
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include "cuda_runtime.h"
+#include <vector>
+#include <cstring>
+#include <float.h>
+#include <iostream>
+#include <algorithm>
+#include <stim/cuda/cudatools/error.h>
+#include <stim/visualization/aabbn.h>
+
+namespace stim {
+	namespace kdtree {
+		template<typename T, int D>											// typename refers to float or double while D refers to dimension of points
+		struct point {
+			T dim[D];														// create a structure to store every one input point
+		};
+
+		/// A single node of the CPU KD-tree. Nodes refer to each other both by pointer and by integer index.
+		template<typename T>
+		class kdnode {
+		public:
+			kdnode()														// constructor: every field starts in a well-defined "unlinked" state
+				: idx(-1),													// no index has been assigned yet
+				parent_idx(-1), left_idx(-1), right_idx(-1),				// -1 marks a missing parent/child index
+				parent(NULL), left(NULL), right(NULL),						// NULL marks a missing parent/child pointer
+				split_value(-1),											// set split_value to default -1
+				level(0) {													// tree level defaults to the root level
+			}
+			int idx;														// index of current node
+			int parent_idx, left_idx, right_idx;							// index of parent, left and right nodes
+			kdnode *parent, *left, *right;									// parent, left and right kdnodes
+			T split_value;													// splitting value of current node
+			std::vector <size_t> indices;									// indices of the points held by this node
+			size_t level;													// tree level of current node
+		};
+	}				// end of namespace kdtree
+
+	template <typename T, int D = 3>										// set dimension of data to default 3
+	class cpu_kdtree {
+	protected:
+		int current_axis;													// current judging axis
+		int n_id;															// store the total number of nodes
+		std::vector < typename kdtree::point<T, D> > *tmp_points;			// transfer or temperary points
+		std::vector < typename kdtree::point<T, D> > cpu_tmp_points;		// for cpu searching
+		kdtree::kdnode<T> *root;											// root node
+		static cpu_kdtree<T, D> *cur_tree_ptr;
+	public:
+		cpu_kdtree() {														// constructor for creating an empty cpu_kdtree
+			cur_tree_ptr = this;											// register this tree with the static pointer used by sort_points()
+			n_id = 0;														// no nodes have been created yet
+			current_axis = 0;
+			tmp_points = NULL;												// no point set is attached until cpu_create() is called
+			root = NULL;													// an empty tree has no root; the destructor relies on this
+		}
+		~cpu_kdtree() {											  			// destructor: deletes every node reachable from the root
+			if (root == NULL) return;										// the tree was never built, so there is nothing to free
+			std::vector <kdtree::kdnode<T>*> pending;						// nodes that still have to be deleted
+			pending.push_back(root);
+			while (!pending.empty()) {
+				kdtree::kdnode<T> *cur = pending.back();
+				pending.pop_back();
+				if (cur->left)												// queue the children before the node itself is destroyed
+					pending.push_back(cur->left);
+				if (cur->right)
+					pending.push_back(cur->right);
+				delete cur;
+			}
+			root = NULL;
+		}
+		/// Build the tree level by level from a set of reference points.
+		/// @param reference_points is the point set to index; only its address is stored (in tmp_points), so it has to
+		///		stay alive for as long as tmp_points is used
+		/// @param max_levels nodes whose level is >= max_levels are never split
+		void cpu_create(std::vector < typename kdtree::point<T, D> > &reference_points, size_t max_levels) {
+			tmp_points = &reference_points;
+			root = new kdtree::kdnode<T>();									// initializing the root node
+			root->idx = n_id++;												// the root takes the next free node index (0 for a fresh tree)
+			root->level = 0;												// tree level begins at 0
+			root->indices.resize(reference_points.size());					// the root initially holds every point
+			for (size_t i = 0; i < reference_points.size(); i++) {
+				root->indices[i] = i;										// set indices of input points
+			}
+			std::vector <kdtree::kdnode<T>*> next_nodes;					// nodes of the level currently being processed
+			next_nodes.push_back(root);										// push back the root node
+			while (next_nodes.size()) {
+				std::vector <kdtree::kdnode<T>*> next_search_nodes;			// nodes collected for the next level
+				while (next_nodes.size()) {									// drain the current level; children are collected in next_search_nodes
+					kdtree::kdnode<T> *current_node = next_nodes.back();	// handle nodes one by one, taking the most recently pushed first
+					next_nodes.pop_back();									// pop out current node in order to store next round of nodes
+					if (current_node->level < max_levels) {
+						if (current_node->indices.size() > 1) {				// split only if the node contains more than one point
+							kdtree::kdnode<T> *left = new kdtree::kdnode<T>();
+							kdtree::kdnode<T> *right = new kdtree::kdnode<T>();
+							left->idx = n_id++;								// children receive consecutive node indices, left first
+							right->idx = n_id++;
+							split(current_node, left, right);				// distribute the node's points between left and right
+							std::vector <size_t> temp;						// empty vector used to release the node's index storage
+							//temp.resize(current_node->indices.size());
+							current_node->indices.swap(temp);				// interior nodes keep no point indices
+							current_node->left = left;
+							current_node->right = right;
+							current_node->left_idx = left->idx;
+							current_node->right_idx = right->idx;
+							if (right->indices.size())						// empty children are kept in the tree but not processed further
+								next_search_nodes.push_back(right);			// right is pushed first so that left is popped first
+							if (left->indices.size())
+								next_search_nodes.push_back(left);
+						}
+					}
+				}
+				next_nodes = next_search_nodes;								// go deeper within the tree
+			}
+		}
+		/// Comparison function for std::sort: orders two point indices by their coordinate along the current axis.
+		/// It reads the point set and axis of the tree registered in cur_tree_ptr, which split() sets before sorting.
+		static bool sort_points(const size_t a, const size_t b) {
+			std::vector < typename kdtree::point<T, D> > &pts = *cur_tree_ptr->tmp_points;
+			return pts[a].dim[cur_tree_ptr->current_axis] < pts[b].dim[cur_tree_ptr->current_axis];
+		}
+		/// Split the points of a node between two child nodes.
+		/// @param cur is the node being split; its split_value is set to the median coordinate along the axis (level % D)
+		/// @param left receives the points whose coordinate is strictly below split_value
+		/// @param right receives all remaining points
+		void split(kdtree::kdnode<T> *cur, kdtree::kdnode<T> *left, kdtree::kdnode<T> *right) {
+			std::vector < typename kdtree::point<T, D> > &pts = *tmp_points;
+			current_axis = cur->level % D;												// axis used to split at this level
+			cur_tree_ptr = this;														// sort_points() is static: make sure it reads THIS tree, not the most recently constructed one
+			left->parent = cur;                                                         // link both children to the node being split
+			right->parent = cur;
+			left->level = cur->level + 1;												// children live one level deeper
+			right->level = cur->level + 1;
+			left->parent_idx = cur->idx;                                                // set their parent node's index
+			right->parent_idx = cur->idx;
+			if (cur->indices.empty()) return;											// no points: there is no median to compute
+			std::sort(cur->indices.begin(), cur->indices.end(), sort_points);			// sort the point indices along the current axis
+			size_t mid_value = cur->indices[cur->indices.size() / 2];                   // index of the median point (upper median for an even count)
+			cur->split_value = pts[mid_value].dim[current_axis];						// the median coordinate becomes the splitting plane
+			for (size_t i = 0; i < cur->indices.size(); i++) {							// split into left and right half-space one by one
+				size_t idx = cur->indices[i];
+				if (pts[idx].dim[current_axis] < cur->split_value)
+					left->indices.push_back(idx);
+				else
+					right->indices.push_back(idx);
+			}
+		}
+		/// Build the tree from a flat array of reference points.
+		/// @param h_reference_points is an array of reference_count * D values; point j occupies elements [j*D, j*D + D)
+		/// @param reference_count is the number of points in the array
+		/// @param max_levels is the deepest tree level at which nodes are still split
+		void create(T *h_reference_points, size_t reference_count, size_t max_levels) {
+			cpu_tmp_points.resize(reference_count);								// keep the points in the member copy used by the search functions
+			for (size_t j = 0; j < reference_count; j++)
+				for (size_t i = 0; i < D; i++)
+					cpu_tmp_points[j].dim[i] = h_reference_points[j * D + i];
+			cpu_create(cpu_tmp_points, max_levels);								// tmp_points now refers to the member, so it stays valid after this call returns
+		}
+		int get_num_nodes() const {														// number of nodes created so far (n_id is incremented once per created node)
+			return n_id;
+		}
+		kdtree::kdnode<T>* get_root() const {											// root node pointer as assigned by cpu_create()
+			return root;
+		}
+        T cpu_distance(const kdtree::point<T, D> &a, const kdtree::point<T, D> &b) {
+			// Squared Euclidean distance between a and b: the sum of squared per-axis differences (no square root is taken).
+			T sum = 0;
+			for (size_t axis = 0; axis < D; ++axis) {
+				const T delta = a.dim[axis] - b.dim[axis];
+				sum += delta * delta;
+			}
+			return sum;
+		}
+		/// Descend from a node to the leaf whose region contains the query and find the closest point stored in that leaf.
+		/// @param cur is the node at which the descent starts
+		/// @param query is the query point
+		/// @param index (output) is the index of the closest point in the leaf (0 if the leaf is empty)
+		/// @param distance (output) is the squared distance to that point (FLT_MAX if the leaf is empty)
+		/// @param node (output) is the leaf that was reached
+		void cpu_search_at_node(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, size_t *index, T *distance, kdtree::kdnode<T> **node) {
+			T best_distance = FLT_MAX;                                              // initialize the best distance to max of floating point
+			size_t best_index = 0;
+			const std::vector < typename kdtree::point<T, D> > &pts = cpu_tmp_points;	// refer to the stored points; copying the whole vector on every query is unnecessary
+			while (cur->left != NULL) {												// children are always created in pairs, so testing left is sufficient
+				size_t split_axis = cur->level % D;
+				if (query.dim[split_axis] < cur->split_value)						// follow the side of the splitting plane that contains the query
+					cur = cur->left;
+				else
+					cur = cur->right;
+			}
+			*node = cur;															// report the leaf that was reached
+			for (size_t i = 0; i < cur->indices.size(); i++) {
+				size_t idx = cur->indices[i];
+				T d = cpu_distance(query, pts[idx]);								// compute the squared distance to each point in the leaf
+				if (d < best_distance) {
+					best_distance = d;
+					best_index = idx;                                               // record the nearest neighbor index
+				}
+			}
+			*index = best_index;
+			*distance = best_distance;
+		}
+		/// Search the subtree below a node for the point closest to the query, visiting only the branches that can
+		/// contain a point within the given range of the query.
+		/// @param cur is the root of the subtree to search
+		/// @param query is the query point
+		/// @param range is the search radius compared against the (unsquared) distance to each splitting plane
+		/// @param index (output) is the index of the closest point found (0 if none was found)
+		/// @param distance (output) is the squared distance to that point (FLT_MAX if none was found)
+		void cpu_search_at_node_range(kdtree::kdnode<T> *cur, const kdtree::point<T, D> &query, T range, size_t *index, T *distance) {
+			T best_distance = FLT_MAX;                                              // initialize the best distance to max of floating point
+			size_t best_index = 0;
+			const std::vector < typename kdtree::point<T, D> > &pts = cpu_tmp_points;	// refer to the stored points; copying the whole vector on every call is unnecessary
+			std::vector < typename kdtree::kdnode<T>*> next_node;					// nodes of the level currently being visited
+			next_node.push_back(cur);
+			while (!next_node.empty()) {
+				std::vector<typename kdtree::kdnode<T>*> next_search;				// nodes to visit on the next level
+				while (!next_node.empty()) {
+					cur = next_node.back();
+					next_node.pop_back();
+					size_t split_axis = cur->level % D;
+					if (cur->left == NULL) {										// leaf: test every point it holds
+						for (size_t i = 0; i < cur->indices.size(); i++) {
+							size_t idx = cur->indices[i];
+							T d = cpu_distance(query, pts[idx]);
+							if (d < best_distance) {
+								best_distance = d;
+								best_index = idx;
+							}
+						}
+					}
+					else {
+						T d = query.dim[split_axis] - cur->split_value;				// signed distance from the query to the splitting plane
+						/// there are three possibilities: on either left or right, and on both left and right
+						if (fabs(d) > range) {										// the plane is farther away than the range: only the query's own side can qualify
+							if (d < 0)
+								next_search.push_back(cur->left);
+							else
+								next_search.push_back(cur->right);
+						}
+						else {                                                      // the range crosses the plane: both sides have to be visited
+							next_search.push_back(cur->left);
+							next_search.push_back(cur->right);
+						}
+					}
+				}
+				next_node = next_search;                                            // continue with the next level
+			}
+			*index = best_index;
+			*distance = best_distance;
+		}
+		/// Find the nearest reference point for each query point using the CPU tree.
+		/// @param h_query_points is an array of query_count * D values; query j occupies elements [j*D, j*D + D)
+		/// @param query_count is the number of query points
+		/// @param h_indices (output) receives the index of the nearest reference point for each query
+		/// @param h_distances (output) receives best_distance for each query, which is the value returned by
+		///		cpu_distance(), i.e. the SQUARED distance (no square root is applied before it is stored)
+		/// NOTE(review): the device-side search() stores sqrt(best_distance) instead - confirm which convention callers expect
+		void cpu_search(T *h_query_points, size_t query_count, size_t *h_indices, T *h_distances) {
+			/// first convert the input query point into specific type
+			kdtree::point<T, D> query;
+			for (size_t j = 0; j < query_count; j++) {
+				for (size_t i = 0; i < D; i++)
+					query.dim[i] = h_query_points[j * D + i];
+				/// find the nearest node, this will be the upper bound for the next time searching
+				kdtree::kdnode<T> *best_node = NULL;
+				T best_distance = FLT_MAX;
+				size_t best_index = 0;
+				T radius = 0;																				// radius for range
+				cpu_search_at_node(root, query, &best_index, &best_distance, &best_node);                   // descend to the leaf containing the query to get a first candidate
+				radius = sqrt(best_distance);                                                               // the true nearest point may lie in another region within this radius
+				/// find other possibilities
+				kdtree::kdnode<T> *cur = best_node;
+				while (cur->parent != NULL) {																// walk back up from the leaf to the root
+					/// go up
+					kdtree::kdnode<T> *parent = cur->parent;                                                // travel back to every node that we pass through
+					size_t split_axis = (parent->level) % D;
+					/// search other nodes
+					size_t tmp_index;																		// only read when tmp_distance was lowered by a range search
+					T tmp_distance = FLT_MAX;
+					if (fabs(parent->split_value - query.dim[split_axis]) <= radius) {						// the sibling subtree can only matter if the splitting plane is within the radius
+						/// search opposite node
+						if (parent->left != cur)
+							cpu_search_at_node_range(parent->left, query, radius, &tmp_index, &tmp_distance);        // we came from the right child, so search the left one
+						else
+							cpu_search_at_node_range(parent->right, query, radius, &tmp_index, &tmp_distance);
+					}
+					if (tmp_distance < best_distance) {
+						best_distance = tmp_distance;
+						best_index = tmp_index;
+					}
+					cur = parent;
+				}
+				h_indices[j] = best_index;
+				h_distances[j] = best_distance;
+			}
+		}
+	};				//end class kdtree
+
+	template <typename T, int D>
+	cpu_kdtree<T, D>* cpu_kdtree<T, D>::cur_tree_ptr = NULL;												// definition of cur_tree_ptr pointer points to the current class
+
+	/// Flattened tree node used on the device: nodes refer to each other by array index instead of by pointer.
+	template <typename T>
+	struct cuda_kdnode {
+		int parent, left, right;																			// node-array indices of the parent and children (the search code treats -1 as "none")
+		T split_value;																						// splitting value of this node
+		size_t num_index;																					// number of indices it has
+		int index;																							// position of this node's first entry in the shared index array (create() stores -1 when the node has none)
+		size_t level;																						// tree level; the split axis is level % D
+	};
+
+	/// Squared Euclidean distance between two points (sum of squared per-axis differences, no square root).
+	template <typename T, int D>
+    __device__ T gpu_distance(kdtree::point<T, D> &a, kdtree::point<T, D> &b) {
+		T sum = 0;
+
+		for (size_t axis = 0; axis < D; ++axis) {
+			T delta = a.dim[axis] - b.dim[axis];
+			sum += delta * delta;
+		}
+		return sum;
+	}
+	/// Descend from node cur to a node without a left child and find the closest reference point stored there.
+	/// @param nodes is the flattened node array
+	/// @param indices is the shared array of point indices referenced by cuda_kdnode::index / num_index
+	/// @param d_reference_points is the array of reference points
+	/// @param cur is the index of the node at which the descent starts
+	/// @param d_query_point is the query point
+	/// @param d_index (output) is the index of the closest point found (0 if the node holds no points)
+	/// @param d_distance (output) is the squared distance to that point (FLT_MAX if the node holds no points)
+	/// @param d_node (output) is the index of the node that was reached
+	template <typename T, int D>
+	__device__ void search_at_node(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, int cur, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, int *d_node) {
+		T best_distance = FLT_MAX;
+		size_t best_index = 0;
+
+		while (true) {																						// loop until a node without a left child is reached
+			int split_axis = nodes[cur].level % D;
+			if (nodes[cur].left == -1) {																	// no left child: treat this node as a leaf
+				*d_node = cur;
+				for (int i = 0; i < nodes[cur].num_index; i++) {
+					size_t idx = indices[nodes[cur].index + i];												// the node's point indices are stored contiguously starting at nodes[cur].index
+					T dist = gpu_distance<T, D>(d_query_point, d_reference_points[idx]);
+					if (dist < best_distance) {
+						best_distance = dist;
+						best_index = idx;
+					}
+				}
+			break;
+			}
+			else if (d_query_point.dim[split_axis] < nodes[cur].split_value) {								// follow the side of the splitting plane that contains the query
+				cur = nodes[cur].left;
+			}
+			else {
+				cur = nodes[cur].right;
+			}
+		}
+		*d_distance = best_distance;
+		*d_index = best_index;
+	}
+	/// Search the subtree below node cur for the reference point closest to the query, visiting only branches that
+	/// can contain a point within the given range. Traversal is iterative: each thread uses its own slice of
+	/// stack_size entries in next_nodes and next_search_nodes.
+	/// @param nodes is the flattened node array
+	/// @param indices is the shared array of point indices referenced by cuda_kdnode::index / num_index
+	/// @param d_reference_points is the array of reference points
+	/// @param d_query_point is the query point
+	/// @param cur is the index of the subtree root
+	/// @param range is the search radius compared against the distance to each splitting plane
+	/// @param d_index (output) is the index of the closest point found (0 if none was found)
+	/// @param d_distance (output) is the squared distance to that point (FLT_MAX if none was found)
+	/// @param id selects this thread's slice [id * stack_size, (id + 1) * stack_size) of the two stack arrays
+	/// @param next_nodes holds the nodes of the level currently being visited
+	/// @param next_search_nodes holds the nodes collected for the next level
+	/// @param Judge is an optional overflow counter (may be NULL); it is incremented whenever children have to be
+	///		dropped because the slice is full, in which case the result may be incomplete
+	template <typename T, int D>
+	__device__ void search_at_node_range(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, int cur, T range, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
+		T best_distance = FLT_MAX;
+		size_t best_index = 0;
+
+		int *cur_level = next_nodes + id * stack_size;														// this thread's slice of the "current level" stack
+		int *next_level = next_search_nodes + id * stack_size;												// this thread's slice of the "next level" stack
+		int cur_count = 0;																					// number of entries in cur_level
+		cur_level[cur_count++] = cur;
+
+		while (cur_count) {
+			int next_count = 0;																				// number of entries in next_level
+			while (cur_count) {
+				cur = cur_level[--cur_count];																// pop the most recently pushed node
+				int split_axis = nodes[cur].level % D;
+
+				if (nodes[cur].left == -1) {																// no left child: test every point stored in this node
+					for (size_t i = 0; i < nodes[cur].num_index; i++) {
+						size_t idx = indices[nodes[cur].index + i];											// all indices are stored in one array, pick up from every node's beginning index
+						T d = gpu_distance<T, D>(d_query_point, d_reference_points[idx]);
+						if (d < best_distance) {
+							best_distance = d;
+							best_index = idx;
+						}
+					}
+				}
+				else {
+					T d = d_query_point.dim[split_axis] - nodes[cur].split_value;							// signed distance from the query to the splitting plane
+
+					int children[2];																		// children that have to be visited, in push order
+					int num_children = 0;
+					if (fabs(d) > range) {																	// plane farther away than the range: only the query's own side can qualify
+						children[num_children++] = (d < 0) ? nodes[cur].left : nodes[cur].right;
+					}
+					else {																					// the range crosses the plane: visit both sides
+						children[num_children++] = nodes[cur].right;
+						children[num_children++] = nodes[cur].left;
+					}
+
+					if (next_count + num_children > stack_size) {											// check BEFORE writing so a full slice never spills into another thread's slice
+						printf("Thread conflict might be caused by thread %d, so please try smaller input max_tree_levels\n", (int)id);
+						if (Judge != NULL)
+							atomicAdd(Judge, 1);															// several threads may report an overflow at the same time
+					}
+					else {
+						for (int c = 0; c < num_children; c++)
+							next_level[next_count++] = children[c];
+					}
+				}
+			}
+			for (int i = 0; i < next_count; i++)															// the collected nodes become the current level
+				cur_level[i] = next_level[i];
+			cur_count = next_count;
+		}
+		*d_distance = best_distance;
+		*d_index = best_index;
+	}
+	/// Find the reference point nearest to a single query point.
+	/// The query is first located in a leaf (search_at_node, starting at node 0); the path back to the root is then
+	/// walked, and the sibling subtree is searched whenever its splitting plane lies within the current radius.
+	/// @param d_index (output) is the index of the nearest reference point
+	/// @param d_distance (output) is the distance to that point (the square root of the best squared distance)
+	/// @param id, next_nodes, next_search_nodes, Judge are forwarded unchanged to search_at_node_range
+	template <typename T, int D>
+	__device__ void search(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> &d_query_point, size_t *d_index, T *d_distance, size_t id, int *next_nodes, int *next_search_nodes, int *Judge) {
+		int best_node = 0;
+		T best_distance = FLT_MAX;
+		size_t best_index = 0;
+		T radius = 0;
+
+		search_at_node<T, D>(nodes, indices, d_reference_points, 0, d_query_point, &best_index, &best_distance, &best_node);
+		radius = sqrt(best_distance);																															// search radius derived from the first candidate
+		int cur = best_node;
+
+		while (nodes[cur].parent != -1) {																														// walk up until the node without a parent is reached
+			int parent = nodes[cur].parent;
+			int split_axis = nodes[parent].level % D;
+
+			T tmp_dist = FLT_MAX;
+			size_t tmp_idx;																																		// only read when tmp_dist was lowered by a range search
+			if (fabs(nodes[parent].split_value - d_query_point.dim[split_axis]) <= radius) {
+				if (nodes[parent].left != cur)																													// search the sibling of the node we came from
+					search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].left, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
+				else
+					search_at_node_range(nodes, indices, d_reference_points, d_query_point, nodes[parent].right, radius, &tmp_idx, &tmp_dist, id, next_nodes, next_search_nodes, Judge);
+			}
+			if (tmp_dist < best_distance) {
+				best_distance = tmp_dist;
+				best_index = tmp_idx;
+			}
+			cur = parent;
+		}
+		*d_distance = sqrt(best_distance);
+		*d_index = best_index;
+	}
+	/// Kernel: each thread handles the query point whose index equals its global thread index and writes the
+	/// result to d_indices[idx] and d_distances[idx].
+	template <typename T, int D>
+	__global__ void search_batch(cuda_kdnode<T> *nodes, size_t *indices, kdtree::point<T, D> *d_reference_points, kdtree::point<T, D> *d_query_points, size_t d_query_count, size_t *d_indices, T *d_distances, int *next_nodes, int *next_search_nodes, int *Judge) {
+		size_t idx = blockIdx.x * blockDim.x + threadIdx.x;																										 // global thread index
+		if (idx >= d_query_count) return;																														 // surplus threads of the last block have no query to process
+
+		search<T, D>(nodes, indices, d_reference_points, d_query_points[idx], &d_indices[idx], &d_distances[idx], idx, next_nodes, next_search_nodes, Judge);    // every query points are independent
+	}
+
+	template <typename T, int D = 3>
+	class cuda_kdtree {
+	protected:
+		cuda_kdnode<T> *d_nodes;                                                    																		 
+		size_t *d_index;
+		kdtree::point<T, D>* d_reference_points;
+		size_t npts;
+		int num_nodes;
+	public:
+		/// Create an empty tree with no device allocations, so that destroying a tree on which create() was never
+		/// called does not pass uninitialized pointers to cudaFree().
+		cuda_kdtree() {
+			d_nodes = NULL;
+			d_index = NULL;
+			d_reference_points = NULL;
+			npts = 0;
+			num_nodes = 0;
+		}
+		/// Release the device arrays allocated by create().
+		~cuda_kdtree() {
+			HANDLE_ERROR(cudaFree(d_nodes));
+			HANDLE_ERROR(cudaFree(d_index));
+			HANDLE_ERROR(cudaFree(d_reference_points));
+		}
+
+		/// Create a KD-tree given a pointer to an array of reference points and the number of reference points.
+		/// The tree is built on the host with cpu_kdtree, flattened into arrays and copied to the device.
+		/// @param h_reference_points is a host array containing the reference points in (x0, y0, z0, ...., ) order
+		/// @param reference_count is the number of reference point in the array
+		/// @param max_levels is the deepest number of tree levels allowed; the program exits if it is larger than 10
+		void create(T *h_reference_points, size_t reference_count, size_t max_levels = 3) {
+			if (max_levels > 10) {
+				std::cout<<"The max_tree_levels should be smaller!"<<std::endl;
+				exit(1);
+			}
+			//bb.init(&h_reference_points[0]);
+			//aaboundingboxing<T, D>(bb, h_reference_points, reference_count);
+
+			std::vector < typename kdtree::point<T, D> > reference_points(reference_count);																				// repack the flat input array into point structures
+			for (size_t j = 0; j < reference_count; j++)
+				for (size_t i = 0; i < D; i++)
+					reference_points[j].dim[i] = h_reference_points[j * D + i];
+			cpu_kdtree<T, D> tree;																																// creating a tree on cpu
+			tree.cpu_create(reference_points, max_levels);																											// building a tree on cpu
+			kdtree::kdnode<T> *d_root = tree.get_root();																										// host pointer to the root of the CPU tree
+			num_nodes = tree.get_num_nodes();
+			npts = reference_count;
+
+			HANDLE_ERROR(cudaMalloc((void**)&d_nodes, sizeof(cuda_kdnode<T>) * num_nodes));																		// allocate the device arrays
+			HANDLE_ERROR(cudaMalloc((void**)&d_index, sizeof(size_t) * npts));
+			HANDLE_ERROR(cudaMalloc((void**)&d_reference_points, sizeof(kdtree::point<T, D>) * npts));
+
+			std::vector < cuda_kdnode<T> > tmp_nodes(num_nodes);																								// host staging copy of the flattened node array
+			std::vector <size_t> indices(npts);																													// host staging copy of the shared index array
+			std::vector <kdtree::kdnode<T>*> next_nodes;
+			size_t cur_pos = 0;																																	// next free position in the shared index array
+			next_nodes.push_back(d_root);
+			while (next_nodes.size()) {
+				std::vector <typename kdtree::kdnode<T>*> next_search_nodes;
+				while (next_nodes.size()) {
+					kdtree::kdnode<T> *cur = next_nodes.back();
+					next_nodes.pop_back();
+					int id = cur->idx;																															// each node is stored at the position given by its index
+					tmp_nodes[id].level = cur->level;
+					tmp_nodes[id].parent = cur->parent_idx;
+					tmp_nodes[id].left = cur->left_idx;
+					tmp_nodes[id].right = cur->right_idx;
+					tmp_nodes[id].split_value = cur->split_value;
+					tmp_nodes[id].num_index = cur->indices.size();																								// number of index
+					if (cur->indices.size()) {
+						for (size_t i = 0; i < cur->indices.size(); i++)
+							indices[cur_pos + i] = cur->indices[i];
+
+						tmp_nodes[id].index = (int)cur_pos;																										// position of this node's first point index in the shared index array
+						cur_pos += cur->indices.size();																											// the next node's indices follow directly after
+					}
+					else {
+						tmp_nodes[id].index = -1;																												// the node holds no points
+					}
+
+					if (cur->left)
+						next_search_nodes.push_back(cur->left);
+
+					if (cur->right)
+						next_search_nodes.push_back(cur->right);
+				}
+				next_nodes = next_search_nodes;
+			}
+			HANDLE_ERROR(cudaMemcpy(d_nodes, &tmp_nodes[0], sizeof(cuda_kdnode<T>) * tmp_nodes.size(), cudaMemcpyHostToDevice));
+			HANDLE_ERROR(cudaMemcpy(d_index, &indices[0], sizeof(size_t) * indices.size(), cudaMemcpyHostToDevice));
+			HANDLE_ERROR(cudaMemcpy(d_reference_points, &reference_points[0], sizeof(kdtree::point<T, D>) * reference_points.size(), cudaMemcpyHostToDevice));
+		}
+
+		/// Search the KD tree for nearest neighbors to a set of specified query points
+		/// @param h_query_points an array of query points in (x0, y0, z0, ...) order
+		/// @param query_count is the number of query points (nothing is done when it is 0)
+		/// @param indices are the indices to the nearest reference point for each query points
+		/// @param distances is an array containing the distance between each query point and the nearest reference point
+		/// The outputs are only written when no thread reported a traversal-stack overflow during the search.
+		void search(T *h_query_points, size_t query_count, size_t *indices, T *distances) {
+			if (query_count == 0) return;																														// avoids a zero-sized launch and a division by zero below
+
+			std::vector < typename kdtree::point<T, D> > query_points(query_count);
+			for (size_t j = 0; j < query_count; j++)
+				for (size_t i = 0; i < D; i++)
+					query_points[j].dim[i] = h_query_points[j * D + i];
+
+			unsigned int threads = (unsigned int)(query_points.size() > 1024 ? 1024 : query_points.size());
+			unsigned int blocks = (unsigned int)(query_points.size() / threads + (query_points.size() % threads ? 1 : 0));
+
+			kdtree::point<T, D> *d_query_points;																												// create a pointer pointing to query points on gpu
+			size_t *d_indices;
+			T *d_distances;
+
+			int *next_nodes;																																	// create two STACK-like array
+			int *next_search_nodes;
+
+			int *Judge = NULL;																																	// device-side counter of traversal-stack overflows
+			int h_judge = 0;																																	// host copy of that counter
+
+			HANDLE_ERROR(cudaMalloc((void**)&d_query_points, sizeof(kdtree::point<T, D>) * query_points.size()));
+			HANDLE_ERROR(cudaMalloc((void**)&d_indices, sizeof(size_t) * query_points.size()));
+			HANDLE_ERROR(cudaMalloc((void**)&d_distances, sizeof(T) * query_points.size()));
+			HANDLE_ERROR(cudaMalloc((void**)&next_nodes, threads * blocks * stack_size * sizeof(int)));															// one slice of stack_size entries per launched thread
+			HANDLE_ERROR(cudaMalloc((void**)&next_search_nodes, threads * blocks * stack_size * sizeof(int)));
+			HANDLE_ERROR(cudaMalloc((void**)&Judge, sizeof(int)));																								// the kernel dereferences Judge, so it has to be real device memory
+			HANDLE_ERROR(cudaMemset(Judge, 0, sizeof(int)));
+			HANDLE_ERROR(cudaMemcpy(d_query_points, &query_points[0], sizeof(kdtree::point<T, D>) * query_points.size(), cudaMemcpyHostToDevice));
+
+			search_batch<<<blocks, threads>>> (d_nodes, d_index, d_reference_points, d_query_points, query_points.size(), d_indices, d_distances, next_nodes, next_search_nodes, Judge);
+			HANDLE_ERROR(cudaGetLastError());																													// report launch failures
+
+			HANDLE_ERROR(cudaMemcpy(&h_judge, Judge, sizeof(int), cudaMemcpyDeviceToHost));																		// fetch the overflow counter once the kernel has finished
+			if (h_judge == 0) {																																	// do the following work if the thread works safely
+				HANDLE_ERROR(cudaMemcpy(indices, d_indices, sizeof(size_t) * query_points.size(), cudaMemcpyDeviceToHost));
+				HANDLE_ERROR(cudaMemcpy(distances, d_distances, sizeof(T) * query_points.size(), cudaMemcpyDeviceToHost));
+			}
+
+			HANDLE_ERROR(cudaFree(Judge));
+			HANDLE_ERROR(cudaFree(next_nodes));
+			HANDLE_ERROR(cudaFree(next_search_nodes));
+			HANDLE_ERROR(cudaFree(d_query_points));
+			HANDLE_ERROR(cudaFree(d_indices));
+			HANDLE_ERROR(cudaFree(d_distances));
+		}
+
+		/// Return the number of points in the KD tree (the reference_count recorded by create())
+		size_t num_points() {
+			return npts;
+		}
+
+		/// Compute an axis-aligned bounding box around the reference points stored on the device.
+		/// The points are copied back to the host, the box is seeded with the first point and grown with the rest.
+		stim::aabbn<T, D> getbox() {
+			size_t N = npts;
+			T* cpu_ref = (T*)malloc(N * D * sizeof(T));					//allocate space on the CPU for the reference points
+			HANDLE_ERROR(cudaMemcpy(cpu_ref, d_reference_points, N * D * sizeof(T), cudaMemcpyDeviceToHost));	//copy from GPU to CPU
+
+			stim::aabbn<T, D> bb(cpu_ref);								//start the box at the first reference point
+
+			for (size_t i = 1; i < N; i++) {							//for each remaining reference point
+				bb.insert(&cpu_ref[i * D]);
+			}
+			free(cpu_ref);												//the host copy is only needed while the box is built
+			return bb;
+		}
+
+		/// Generate an implicit distance field for the KD-tree: the distance to the nearest reference point is
+		/// evaluated at the center of every cell of a regular 3D grid.
+		/// @param dist (output) is an array of dims[0] * dims[1] * dims[2] values, with x varying fastest
+		/// @param dims is an array of three grid resolutions (x, y, z)
+		/// @param bb is the bounding box covered by the grid
+		void dist_field3(T* dist, size_t* dims, stim::aabbn<T, 3> bb) {
+			size_t N = 1;									//number of query points that make up the distance field
+			for (size_t d = 0; d < 3; d++) N *= dims[d];	//calculate the total number of query points
+			if (N == 0) return;								//an empty grid has no query points
+
+			//calculate the grid spatial parameters
+			T dx = 0;
+			if (dims[0] > 1) dx = bb.length(0) / dims[0];
+			T dy = 0;
+			if (dims[1] > 1) dy = bb.length(1) / dims[1];
+			T dz = 0;
+			if (dims[2] > 1) dz = bb.length(2) / dims[2];
+
+			T* Q = (T*)malloc(N * 3 * sizeof(T));				//allocate space for the query points
+			size_t i;
+			for (size_t z = 0; z < dims[2]; z++) {				//for each query point (which is a point in the grid)
+				for (size_t y = 0; y < dims[1]; y++) {
+					for (size_t x = 0; x < dims[0]; x++) {
+						i = z * dims[1] * dims[0] + y * dims[0] + x;
+						Q[i * 3 + 0] = bb.low[0] + x * dx + dx / 2;	//sample at the center of each grid cell
+						Q[i * 3 + 1] = bb.low[1] + y * dy + dy / 2;
+						Q[i * 3 + 2] = bb.low[2] + z * dz + dz / 2;
+					}
+				}
+			}
+			size_t* temp = (size_t*)malloc(N * sizeof(size_t));	//allocate space to store the indices (unused)
+			search(Q, N, temp, dist);
+			free(temp);											//release the temporary host buffers
+			free(Q);
+		}
+
+		//generate an implicit distance field for the KD-tree over the bounding box of its own reference points
+		void dist_field3(T* dist, size_t* dims) {
+			stim::aabbn<T, D> bb = getbox();					//get a bounding box around the tree
+			dist_field3(dist, dims, bb);						//the three-argument overload takes an aabbn<T, 3>
+		}
+
+	};
+}				//end namespace stim
+#endif
 \ No newline at end of file
+#ifndef STIM_UTIL_FILESIZE_H
+#define STIM_UTIL_FILESIZE_H
+
+#ifdef _WIN32
+#include <Windows.h>
+#else
+#include <sys/types.h>
+#include <sys/stat.h>
+#endif
+
+namespace stim{
+/// Return the size of a file in bytes.
+/// @param filename is the path of the file to query
+/// @return the file size, or 0 if the file cannot be opened / queried
+static size_t file_size(std::string filename){
+#ifdef _WIN32
+	//use the narrow-character API explicitly: filename is a std::string, and CreateFile maps to the wide version when UNICODE is defined
+	HANDLE hFile = CreateFileA(filename.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+	if(hFile == INVALID_HANDLE_VALUE) return 0;
+	LARGE_INTEGER size;
+	if(!GetFileSizeEx(hFile, &size)){
+		CloseHandle(hFile);
+		return 0;
+	}
+	CloseHandle(hFile);
+	return (size_t)size.QuadPart;
+#else
+	struct stat sb;
+	if(stat(filename.c_str(), &sb) != 0) return 0;		//on failure sb is not filled in, so report 0 like the Windows branch does
+	return (size_t)sb.st_size;
+#endif
+}
+
+}	//end namespace stim
+
+
+
+#endif
@@ -2,51 +2,31 @@
 #define STIM_AABB3_H
 #include <stim/cuda/cudatools/callable.h>
+#include <stim/visualization/aabbn.h>
 namespace stim{
-/// Structure for a 3D axis aligned bounding box
+	/// aabb3<T> names the D = 3 instantiation of stim::aabbn, i.e. aabbn<T, 3>
+	template<typename T>
+	using aabb3 = aabbn<T, 3>;
+/*/// Structure for a 3D axis aligned bounding box
 template<typename T>
-struct aabb3{
-
-//protected:
-
-	T low[3];						//top left corner position
-	T high[3];							//dimensions along x and y and z
-
-//public:
-
-	CUDA_CALLABLE aabb3(T x, T y, T z){					//initialize an axis aligned bounding box of size 0 at the given position
-		low[0] = high[0] = x;					//set the position to the user specified coordinates
-		low[1] = high[1] = y;
-		low[2] = high[2] = z;
+struct aabb3 : public aabbn<T, 3>{
+
+	aabb3() : aabbn() {}
+	aabb3(T x0, T y0, T z0, T x1, T y1, T z1){
+		low[0] = x0;
+		low[1] = y0;
+		low[2] = z0;
+		high[0] = x0;
+		high[1] = x1;
+		high[2] = x2;
 	}
-	//insert a point into the bounding box, growing the box appropriately
-	CUDA_CALLABLE void insert(T x, T y, T z){
-		if(x < low[0]) low[0] = x;
-		if(y < low[1]) low[1] = y;
-		if(z < low[2]) low[2] = z;
-
-		if(x > high[0]) high[0] = x;		
-		if(y > high[1]) high[1] = y;
-		if(z > high[2]) high[2] = z;
-	}
-
-	//trim the bounding box so that the lower bounds are (x, y, z)
-	CUDA_CALLABLE void trim_low(T x, T y, T z){
-		if(low[0] < x) low[0] = x;
-		if(low[1] < y) low[1] = y;
-		if(low[2] < z) low[2] = z;
-	}
+	aabb3 aabbn<T, 3>() {
-	CUDA_CALLABLE void trim_high(T x, T y, T z){
-		if(high[0] > x) high[0] = x;
-		if(high[1] > y) high[1] = y;
-		if(high[2] > z) high[2] = z;
 	}
-};
+};*/
 }
+#ifndef STIM_AABBN_H
+#define STIM_AABBN_H
+
+#include <vector>
+#include <stim/cuda/cudatools/callable.h>
+
+namespace stim{
+
+/// Structure for an axis aligned bounding box in D dimensions
+/// T is the coordinate type; low[d] and high[d] hold the lower and upper bound along dimension d
+template<typename T, size_t D>
+struct aabbn{
+
+//protected:
+
+	T low[D];					//lower bound of the box along each dimension
+	T high[D];					//upper bound of the box along each dimension
+
+	//collapse the box onto a single point: both bounds take the D coordinates read from i
+	CUDA_CALLABLE void init(T* i) {
+		for (size_t d = 0; d < D; d++)
+			low[d] = high[d] = i[d];
+	}
+
+	//default constructor: low and high are left uninitialized
+	CUDA_CALLABLE aabbn() {}
+
+	//create a box of size 0 located at the point i (an array of D coordinates)
+	CUDA_CALLABLE aabbn(T* i) {
+		init(i);
+	}
+
+	//set the bounds of dimension 0 only; any other dimension is left uninitialized
+	CUDA_CALLABLE aabbn(T x0, T x1) {
+		low[0] = x0;
+		high[0] = x1;
+	}
+
+	//set the bounds of dimensions 0 and 1 (writes index 1, so it requires D >= 2)
+	CUDA_CALLABLE aabbn(T x0, T y0, T x1, T y1) : aabbn(x0, x1) {
+		low[1] = y0;
+		high[1] = y1;
+	}
+
+	//set the bounds of dimensions 0, 1 and 2 (writes index 2, so it requires D >= 3)
+	CUDA_CALLABLE aabbn(T x0, T y0, T z0, T x1, T y1, T z1) : aabbn(x0, y0, x1, y1) {
+		low[2] = z0;
+		high[2] = z1;
+	}
+
+
+	//insert a point into the bounding box, growing the box appropriately
+	CUDA_CALLABLE void insert(T* p){
+		for(size_t d = 0; d < D; d++){
+			if(p[d] < low[d]) low[d] = p[d];
+			if(p[d] > high[d]) high[d] = p[d];
+		}
+	}
+
+	//trim the bounding box so that no lower bound is below b(x, y, z, ...)
+	CUDA_CALLABLE void trim_low(T* b){
+		for(size_t d = 0; d < D; d++)
+			if(low[d] < b[d]) low[d] = b[d];
+	}
+
+	//trim the bounding box so that no upper bound is above b(x, y, z, ...)
+	CUDA_CALLABLE void trim_high(T* b){
+		for(size_t d = 0; d < D; d++)
+			if(high[d] > b[d]) high[d] = b[d];		//clamp the upper bound (the lower bound is handled by trim_low)
+	}
+
+	//extent of the box along dimension d (d is not range-checked)
+	CUDA_CALLABLE T length(size_t d) const {
+		return high[d] - low[d];
+	}
+
+	//return a copy of the box scaled by s about its center: the center of each
+	//	dimension is kept and its extent is multiplied by s
+	CUDA_CALLABLE aabbn<T, D> operator*(T s) const {
+		aabbn<T, D> newbox;
+		for (size_t d = 0; d < D; d++) {
+			T c = (low[d] + high[d]) / 2;
+			T l = high[d] - low[d];
+			newbox.low[d] = c - l * s / 2;
+			newbox.high[d] = c + l * s / 2;
+		}
+		return newbox;
+	}
+
+	//translate the box along dimension d a distance of v (d is not range-checked)
+	CUDA_CALLABLE void translate(size_t d, T v) {
+		low[d] += v;							//only dimension d moves; the other dimensions are untouched
+		high[d] += v;
+	}
+
+};
+
+}
+
+
+#endif
 \ No newline at end of file
@@ -4,6 +4,9 @@
 #include <stim/math/circle.h>
 #include <stim/biomodels/centerline.h>
+/*
+	
+*/
 namespace stim
 {
@@ -12,13 +15,13 @@ class cylinder
  : public centerline<T>
 {
 	private:
-		stim::circle<T> s;			//an arbitrary circle
-		std::vector<stim::circle<T> > e;	//an array of circles that store the centerline
+		stim::circle<T> s;							//an arbitrary circle
+		std::vector<stim::circle<T> > e;			//an array of circles that store the centerline
 		std::vector<stim::vec3<T> > norms;
 		std::vector<stim::vec<T> > Us;
-		std::vector<stim::vec<T> > mags;
-		std::vector< T > L;			//length of the cylinder at each position.
+		std::vector<stim::vec<T> > mags;			//stores a list of magnitudes for each point in the centerline (assuming mags[0] is the radius)
+		std::vector< T > L;							//length of the cylinder at each position (pre-integration)
 		using stim::centerline<T>::c;
@@ -61,9 +64,9 @@ class cylinder
 				return;
 			//calculate each L.
-			L.resize(inP.size());
-			T temp = (T)0;
-			L[0] = 0;
+			L.resize(inP.size());						//the number of precomputed lengths will equal the number of points
+			T temp = (T)0;								//length up to that point
+			L[0] = temp;
 			for(size_t i = 1; i < L.size(); i++)
 			{
 				temp += (inP[i-1] - inP[i]).len();
@@ -234,7 +237,7 @@ class cylinder
 		cylinder(std::vector< stim::vec3<T> > inP)
 			: centerline<T>(inP)
 		{
-			std::vector< T > inM;						//create an array of arbitrary magnitudes
+			std::vector< stim::vec<T> > inM;						//create an array of arbitrary magnitudes
 			stim::vec<T> zero;
 			zero.push_back(0);
@@ -476,30 +479,30 @@ class cylinder
 			std::vector< vec3<T> > result;
-			vec3<T> p0 = e[0].P;								//initialize p0 to the first point on the centerline
+			vec3<T> p0 = e[0].P;									//initialize p0 to the first point on the centerline
 			vec3<T> p1;
-			unsigned N = size();							//number of points in the current centerline
+			unsigned N = size();									//number of points in the current centerline
 			//for each line segment on the centerline
 			for(unsigned int i = 1; i < N; i++){
-				p1 = e[i].P;								//get the second point in the line segment
+				p1 = e[i].P;										//get the second point in the line segment
-				vec3<T> v = p1 - p0;							//calculate the vector between these two points
-				T d = v.len();								//calculate the distance between these two points (length of the line segment)
+				vec3<T> v = p1 - p0;								//calculate the vector between these two points
+				T d = v.len();										//calculate the distance between these two points (length of the line segment)
 				size_t nsteps = (size_t)std::ceil(d / spacing);		//calculate the number of steps to take along the segment to meet the spacing criteria
-				T stepsize = (T)1.0 / nsteps;			//calculate the parametric step size between new centerline points
+				T stepsize = (T)1.0 / nsteps;						//calculate the parametric step size between new centerline points
 				//for each step along the line segment
 				for(unsigned s = 0; s < nsteps; s++){
-					T alpha = stepsize * s;					//calculate the fraction of the distance along the line segment covered
-					result.push_back(p0 + alpha * v);	//push the point at alpha position along the line segment
+					T alpha = stepsize * s;							//calculate the fraction of the distance along the line segment covered
+					result.push_back(p0 + alpha * v);				//push the point at alpha position along the line segment
 				}
-				p0 = p1;								//shift the points to move to the next line segment
+				p0 = p1;											//shift the points to move to the next line segment
 			}
-			result.push_back(e[size() - 1].P);			//push the last point in the centerline
+			result.push_back(e[size() - 1].P);						//push the last point in the centerline
 			return cylinder<T>(result);