Showing 127 changed files Show diff stats
cmake/FindGLEW.cmake
cmake/FindSTIM.cmake
matlab/bsq2tensorflow.m
matlab/cls_ConfusionMatrix.m
matlab/cls_MeanClassFeatures.m
matlab/cls_PlotConfusionMatrix.m
matlab/enviSaveRawcamille.m
matlab/hyperRaman.m
matlab/inferno.m
matlab/magma.m
matlab/plasma.m
matlab/spe2envicamille.m
matlab/stim_images2matrix.m
matlab/viridis.m
matlab/wavenumber.dat
python/classify.py
python/digitalstain.py
python/envi.py
python/example.py
python/hyperspectral.py
+# Copyright (c) 2012-2016 DreamWorks Animation LLC
+#
+# All rights reserved. This software is distributed under the
+# Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+#
+# Redistributions of source code must retain the above copyright
+# and license notice and the following restrictions and disclaimer.
+#
+# *     Neither the name of DreamWorks Animation nor the names of
+# its contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+# LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
 #
-# Windows users: define the GLEW_PATH environment variable to point
-# to the root glew directory, which contains:
-#		lib/Release/Win32/glew32.lib AND/OR lib/Release/x64/glew32.lib
-#		include/GL/glew.h
-#Try to find GLEW library and include path.
-# Once done this will define
+#-*-cmake-*-
+# - Find GLEW
+#
+# Author : Nicholas Yue yue.nicholas@gmail.com
+#
+# This auxiliary CMake file helps find the GLEW headers and libraries
 #
-# GLEW_FOUND
-# GLEW_INCLUDE_DIR
-# GLEW_LIBRARY
-# 
+# GLEW_FOUND            set if GLEW is found.
+# GLEW_INCLUDE_DIR      GLEW's include directory
+# GLEW_GLEW_LIBRARY     GLEW library
+# GLEW_GLEWmx_LIBRARY   GLEWmx library (Multiple Rendering Contexts)
+
+# Pull in find_package_handle_standard_args(). This is a helper module, not
+# a package, so it is loaded with include() rather than find_package().
+include ( FindPackageHandleStandardArgs )
+
+# Locate the GLEW installation root: the directory that contains
+# include/GL/glew.h. Only the GLEW_ROOT environment variable is searched;
+# CMake's default and system-environment search locations are disabled.
+find_path ( GLEW_LOCATION include/GL/glew.h
+  PATHS "$ENV{GLEW_ROOT}"
+  NO_DEFAULT_PATH
+  NO_SYSTEM_ENVIRONMENT_PATH
+  )
+
+# Sets GLEW_FOUND from GLEW_LOCATION and reports the result, honouring the
+# REQUIRED and QUIET arguments of the calling find_package().
+find_package_handle_standard_args ( GLEW
+  REQUIRED_VARS GLEW_LOCATION
+  )
+
+# Everything inside this block is derived from the GLEW_LOCATION root.
+IF ( GLEW_LOCATION )
+
+  # The headers live in the include/ directory directly under the root.
+  SET( GLEW_INCLUDE_DIR "${GLEW_LOCATION}/include" CACHE STRING "GLEW include path")
-IF (WIN32)
-	FIND_PATH( GLEW_INCLUDE_DIR GL/glew.h
-		$ENV{GLEW_PATH}/include
-		$ENV{PROGRAMFILES}/GLEW/include
-		${PROJECT_SOURCE_DIR}/src/nvgl/glew/include
-		DOC "The directory where GL/glew.h resides")
-	if( CMAKE_SIZEOF_VOID_P EQUAL 8 )
-		FIND_LIBRARY( GLEW_LIBRARY
-			NAMES glew GLEW glew32 glew32s
-			PATHS
-			$ENV{GLEW_PATH}/lib/Release/x64
-			$ENV{PROGRAMFILES}/GLEW/lib
-			${PROJECT_SOURCE_DIR}/src/nvgl/glew/bin
-			${PROJECT_SOURCE_DIR}/src/nvgl/glew/lib
-			DOC "The GLEW library")
-	else( CMAKE_SIZEOF_VOID_P EQUAL 8 )
-		FIND_LIBRARY( GLEW_LIBRARY
-			NAMES glew GLEW glew32 glew32s
-			PATHS
-			$ENV{GLEW_PATH}/lib/Release/Win32
-			$ENV{PROGRAMFILES}/GLEW/lib
-			${PROJECT_SOURCE_DIR}/src/nvgl/glew/bin
-			${PROJECT_SOURCE_DIR}/src/nvgl/glew/lib
-			DOC "The GLEW library")
-	endif( CMAKE_SIZEOF_VOID_P EQUAL 8 )
-ELSE (WIN32)
-	FIND_PATH( GLEW_INCLUDE_DIR GL/glew.h
-		/usr/include
-		/usr/local/include
-		/sw/include
-		/opt/local/include
-		DOC "The directory where GL/glew.h resides")
-	FIND_LIBRARY( GLEW_LIBRARY
-		NAMES GLEW glew
-		PATHS
-		/usr/lib64
-		/usr/lib
-		/usr/local/lib64
-		/usr/local/lib
-		/sw/lib
-		/opt/local/lib
-		DOC "The GLEW library")
-ENDIF (WIN32)
+  # Remember the caller's suffix list. The branches below narrow it to force
+  # a particular library flavour; it is restored once the searches are done.
+  set ( ORIGINAL_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES} )
+  if ( GLEW_USE_STATIC_LIBS )
+    # ---- static libraries requested ----
+    if ( APPLE )
+      set ( CMAKE_FIND_LIBRARY_SUFFIXES ".a" )
+      find_library ( GLEW_LIBRARY_PATH GLEW
+        PATHS ${GLEW_LOCATION}/lib
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+      find_library ( GLEWmx_LIBRARY_PATH GLEWmx
+        PATHS ${GLEW_LOCATION}/lib
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+    elseif ( WIN32 )
+      # Static link libraries carry an "S" suffix in their base name.
+      set ( CMAKE_FIND_LIBRARY_SUFFIXES ".lib" )
+      find_library ( GLEW_LIBRARY_PATH GLEW32S
+        PATHS ${GLEW_LOCATION}/lib
+        )
+      find_library ( GLEWmx_LIBRARY_PATH GLEW32MXS
+        PATHS ${GLEW_LOCATION}/lib
+        )
+    else ()
+      # Remaining platforms (Linux and other Unices)
+      set ( CMAKE_FIND_LIBRARY_SUFFIXES ".a" )
+      find_library ( GLEW_LIBRARY_PATH GLEW
+        PATHS ${GLEW_LOCATION}/lib
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+      find_library ( GLEWmx_LIBRARY_PATH GLEWmx
+        PATHS ${GLEW_LOCATION}/lib
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+    endif ()
+  else ()
+    # ---- shared libraries (default) ----
+    if ( APPLE )
+      set ( CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" )
+      find_library ( GLEW_LIBRARY_PATH GLEW
+        PATHS ${GLEW_LOCATION}/lib
+        )
+      find_library ( GLEWmx_LIBRARY_PATH GLEWmx
+        PATHS ${GLEW_LOCATION}/lib
+        )
+    elseif ( WIN32 )
+      # Import libraries used at link time
+      set ( CMAKE_FIND_LIBRARY_SUFFIXES ".lib" )
+      find_library ( GLEW_LIBRARY_PATH GLEW32
+        PATHS ${GLEW_LOCATION}/lib
+        )
+      find_library ( GLEWmx_LIBRARY_PATH GLEW32mx
+        PATHS ${GLEW_LOCATION}/lib
+        )
+      # DLLs loaded at run time, looked up only in <root>/bin
+      set ( CMAKE_FIND_LIBRARY_SUFFIXES ".dll" )
+      find_library ( GLEW_DLL_PATH GLEW32
+        PATHS ${GLEW_LOCATION}/bin
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+      find_library ( GLEWmx_DLL_PATH GLEW32mx
+        PATHS ${GLEW_LOCATION}/bin
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+    else ()
+      # Unices: the suffix list is left as the caller configured it
+      find_library ( GLEW_LIBRARY_PATH GLEW
+        PATHS ${GLEW_LOCATION}/lib
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+      find_library ( GLEWmx_LIBRARY_PATH GLEWmx
+        PATHS ${GLEW_LOCATION}/lib
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        )
+    endif ()
+  endif ()
+  # The suffix list MUST be put back so later find_library() calls made by
+  # the including project are not affected.
+  set ( CMAKE_FIND_LIBRARY_SUFFIXES ${ORIGINAL_CMAKE_FIND_LIBRARY_SUFFIXES} )
-IF (GLEW_INCLUDE_DIR)
-	SET( GLEW_FOUND 1 CACHE STRING "Set to 1 if GLEW is found, 0 otherwise")
-ELSE (GLEW_INCLUDE_DIR)
-	SET( GLEW_FOUND 0 CACHE STRING "Set to 1 if GLEW is found, 0 otherwise")
-ENDIF (GLEW_INCLUDE_DIR)
+  # Export the library paths located above as cache entries.
+  SET( GLEW_GLEW_LIBRARY ${GLEW_LIBRARY_PATH} CACHE STRING "GLEW library")
+  SET( GLEW_GLEWmx_LIBRARY ${GLEWmx_LIBRARY_PATH} CACHE STRING "GLEWmx library")
-MARK_AS_ADVANCED( 
-	GLEW_FOUND 
-	GLEW_INCLUDE_DIR
-	GLEW_LIBRARY
-)
 \ No newline at end of file
+ENDIF () # GLEW_LOCATION
-include(FindPackageHandleStandardArgs)
-
-set(STIM_INCLUDE_DIR $ENV{STIMLIB_PATH})
-
-find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR)
-
-if(STIM_FOUND)
-    set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIR})
-endif()
 \ No newline at end of file
+# Finds the STIM library.
+#
+# STIM is located through the STIMLIB_PATH environment variable, which must
+# name the directory containing the stim subdirectory (the stim repository).
+# Nothing is downloaded: when the location is not known, a hint is printed
+# that explains where the library can be obtained.
+#
+# Result variables:
+#   STIM_FOUND         set by find_package_handle_standard_args
+#   STIM_INCLUDE_DIR   taken from the STIMLIB_PATH environment variable
+#   STIM_INCLUDE_DIRS  copy of STIM_INCLUDE_DIR, set only when STIM is found
+
+include(FindPackageHandleStandardArgs)
+
+set(STIM_INCLUDE_DIR $ENV{STIMLIB_PATH})
+
+# Print the hint before the standard handler runs: with REQUIRED the handler
+# aborts the configure step, so a message placed after it would not be shown.
+if(NOT STIM_INCLUDE_DIR AND NOT STIM_FIND_QUIETLY)
+	message("STIM library not found. Set the STIMLIB_PATH environment variable to the STIMLIB location.")
+	message("STIMLIB can be found here: https://git.stim.ee.uh.edu/codebase/stimlib")
+endif()
+
+# Sets STIM_FOUND and reports the result exactly once.
+find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR)
+
+if(STIM_FOUND)
+    set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIR})
+endif()
+function T = bsq2tensorflow(I, n)
+%BSQ2TENSORFLOW Split an array along its second dimension into n columns.
+%   T = BSQ2TENSORFLOW(I, n) cuts dimension 2 of I into n slabs of equal
+%   width and stores slab k, flattened in column-major order, as column k
+%   of the output matrix T.
+
+    nx = size(I, 1);
+    ny = size(I, 2) / n;            %width of one slab along Y
+    nb = size(I, 3);
+
+    T = zeros(nx * ny * nb, n);     %one column per slab
+    for k = 1:n
+        firstCol = (k - 1) * ny + 1;
+        slab = I(:, firstCol : firstCol + ny - 1, :);
+        T(:, k) = slab(:);
+    end
+end
+        
+        
 \ No newline at end of file
+function M = cls_ConfusionMatrix(GT, T)
+%CLS_CONFUSIONMATRIX Count co-occurrences of labels in GT and T.
+%   M = CLS_CONFUSIONMATRIX(GT, T) takes the classes to be the unique
+%   values of GT. M(i, j) is the number of elements whose GT value is
+%   class i and whose T value is class j.
+
+labels = unique(GT);            %class IDs are taken from GT only
+numLabels = length(labels);
+M = zeros(numLabels);           %square matrix, one row/column per class
+
+for r = 1:numLabels
+    inClassR = (GT == labels(r));
+    for c = 1:numLabels
+        M(r, c) = nnz(inClassR & (T == labels(c)));
+    end
+end
 \ No newline at end of file
+function S = cls_MeanClassFeatures(F, T)
+%CLS_MEANCLASSFEATURES Mean feature vector of every class.
+%   S = CLS_MEANCLASSFEATURES(F, T) takes a feature matrix F with one
+%   sample per row and a target vector T holding the class of each row.
+%   Column k of S is the mean of the rows of F that belong to the k-th
+%   unique value of T.
+
+C = unique(T);                          %get the class IDs
+nc = length(C);
+
+S = zeros(nc, size(F, 2));              %allocate space for the mean feature vectors
+for c = 1:nc                            %for each class
+    %average over rows explicitly: without the dimension argument a class
+    %with a single sample would be averaged across its features instead
+    S(c, :) = mean(F(T == C(c), :), 1);
+end
+
+S = S';
 \ No newline at end of file
+function cls_PlotConfusionMatrix(M)
+%CLS_PLOTCONFUSIONMATRIX Show a confusion matrix as two bar charts.
+%   The upper chart shows M with every entry divided by its column sum
+%   (plotted transposed); the lower chart shows M with every entry divided
+%   by its row sum.
+
+%column-normalized view
+colTotals = sum(M, 1);
+byColumn = M ./ repmat(colTotals, size(M, 1), 1);
+subplot(2, 1, 1);
+bar(byColumn');
+
+%row-normalized view
+rowTotals = sum(M, 2);
+byRow = M ./ repmat(rowTotals, 1, size(M, 2));
+subplot(2, 1, 2);
+bar(byRow);
 \ No newline at end of file
+%saves an ENVI file without any manipulation, assumes (X, Y, S)
+% enviSaveRaw(M, filename, headername,wavenumber data filename)
+%
+%   M                   array to store; its class selects the ENVI data type
+%   filename            name of the raw data file
+%   camilleheader       (optional) header file name, default [filename '.hdr']
+%   wavenumberfilename  (optional) name, without the '.dat' extension, of a
+%                       file readable by csvread holding one value per band;
+%                       when omitted no wavelength list is written
+function enviSaveRawcamille(M, filename, camilleheader,wavenumberfilename)
+
+%if a header isn't provided, assume it's just the filename
+% with '.hdr' added to the end
+if nargin < 3 || isempty(camilleheader)
+    camilleheader = [filename '.hdr'];
+end
+
+%load the wavenumbers (optional); force a column so the count below is
+% right for row and column files alike
+wn = [];
+if nargin >= 4 && ~isempty(wavenumberfilename)
+    wn = csvread([wavenumberfilename '.dat']);
+    wn = wn(:);
+    if numel(wn) ~= size(M, 3)
+        warning('enviSaveRawcamille:bandMismatch', ...
+            '%d wavenumbers were read but the data has %d bands.', ...
+            numel(wn), size(M, 3));
+    end
+end
+
+%map the MATLAB class onto the ENVI data type code; done before any file
+% is touched so that an unsupported class leaves nothing half-written
+switch class(M)
+    case 'uint8'
+        envi_type = 1;
+    case 'int16'
+        envi_type = 2;
+    case 'int32'
+        envi_type = 3;
+    case 'single'
+        if isreal(M)
+            envi_type = 4;
+        else
+            envi_type = 6;
+        end
+    case 'double'
+        if isreal(M)
+            envi_type = 5;
+        else
+            envi_type = 9;
+        end
+    case 'uint16'
+        envi_type = 12;
+    case 'uint32'
+        envi_type = 13;
+    case 'int64'
+        envi_type = 14;
+    case 'uint64'
+        envi_type = 15;
+    otherwise
+        error('enviSaveRawcamille:unsupportedClass', ...
+            'Data of class "%s" has no ENVI data type.', class(M));
+end
+
+%open a file for writing
+fid = fopen(filename, 'w');
+if fid < 0
+    error('enviSaveRawcamille:cannotOpen', 'Cannot open "%s" for writing.', filename);
+end
+
+%write the data to disk
+% NOTE(review): for complex M, confirm that fwrite stores both parts in
+% the layout ENVI types 6 and 9 expect.
+fwrite(fid, M, class(M));
+
+%close the file
+fclose(fid);
+
+%open a header file for writing
+fid = fopen(camilleheader, 'w');
+if fid < 0
+    error('enviSaveRawcamille:cannotOpen', 'Cannot open "%s" for writing.', camilleheader);
+end
+fprintf(fid, 'ENVI\n');
+fprintf(fid, 'description = {}\n');
+fprintf(fid, 'samples = %d\n', size(M, 1));
+fprintf(fid, 'lines = %d\n', size(M, 2));
+fprintf(fid, 'bands = %d\n', size(M, 3));
+fprintf(fid, 'header offset = 0\n');
+fprintf(fid, 'file type = ENVI Standard\n');
+
+fprintf(fid, 'data type = %d\n', envi_type);
+
+fprintf(fid, 'interleave = bsq\n');
+fprintf(fid, 'sensor type = Unknown\n');
+fprintf(fid, 'byte order = 0\n');
+fprintf(fid, 'x start = 0\n');
+fprintf(fid, 'y start = 0\n');
+fprintf(fid, 'wavelength units = Unknown\n');
+fprintf(fid, 'z plot titles = {Unknown, Unknown}\n');
+fprintf(fid, 'pixel size = {1, 1, units=Meters}\n');
+
+% print the wavelengths for each band (only when they were supplied)
+if ~isempty(wn)
+    fprintf(fid, 'wavelength = {\n');
+    for i = 1:numel(wn)-1
+        fprintf(fid, '%8.3f ,\n',wn(i));
+    end
+    fprintf(fid, '%8.3f \n',wn(end));
+
+    fprintf(fid, '}');
+end
+fclose(fid);
 \ No newline at end of file
+function [im]=hyperRaman(filemask)
+%HYPERRAMAN Assemble an image cube from a set of SPE files.
+%   im = HYPERRAMAN(filemask) reads every file matching filemask with
+%   readspe, orders the files by their date number and stacks them along
+%   the second dimension of the output. Each file is taken to hold an
+%   L-by-B array, giving an L-by-T-by-B cube for T files. One band of the
+%   result is displayed in a new figure.
+
+    filelist = dir(filemask);
+    if isempty(filelist)
+        error('hyperRaman:noFiles', 'No files match "%s".', filemask);
+    end
+
+    %dir returns bare file names, so keep the directory part of the mask
+    folder = fileparts(filemask);
+
+    %get a list of date numbers
+    datenums = [filelist.datenum];
+
+    %sort the file order based on acquisition time
+    [~, id] = sort(datenums);
+
+    %get the number of files
+    T = length(id);                 %size of the image along transverse direction X, number of steps when scanned
+
+    %load the first file to determine the spectral and Y-axis size
+    temp = readspe(fullfile(folder, filelist(1).name));
+    L = size(temp, 1);              %size of the image along longitudinal direction Y (laser line)
+    B = size(temp, 2);              %number of bands in the image
+
+    %create the cube
+    outvar = zeros(L, T, B);
+
+    %for each line
+    for x = 1:T
+
+        %read a SPE file
+        img = readspe(fullfile(folder, filelist(id(x)).name));
+        outvar(:,x,:)= permute(img, [1 3 2]);   %L-by-B becomes L-by-1-by-B
+    end
+    im=outvar;
+
+    %show band 150, or the last band when the cube has fewer than 150
+    band = min(150, B);
+    figure
+    imagesc(outvar(:,:,band)), colorbar, axis normal, axis equal off
+    colormap inferno;
+function [cm_data]=inferno(m)
+%INFERNO Colormap lookup table.
+%   cm_data = INFERNO() returns the built-in table defined below unchanged
+%   (one [R G B] row per entry).
+%   cm_data = INFERNO(m) returns m rows obtained by resampling that table;
+%   the interpolation is carried out in HSV space.
+
+cm = [[  1.46159096e-03,   4.66127766e-04,   1.38655200e-02],
+       [  2.26726368e-03,   1.26992553e-03,   1.85703520e-02],
+       [  3.29899092e-03,   2.24934863e-03,   2.42390508e-02],
+       [  4.54690615e-03,   3.39180156e-03,   3.09092475e-02],
+       [  6.00552565e-03,   4.69194561e-03,   3.85578980e-02],
+       [  7.67578856e-03,   6.13611626e-03,   4.68360336e-02],
+       [  9.56051094e-03,   7.71344131e-03,   5.51430756e-02],
+       [  1.16634769e-02,   9.41675403e-03,   6.34598080e-02],
+       [  1.39950388e-02,   1.12247138e-02,   7.18616890e-02],
+       [  1.65605595e-02,   1.31362262e-02,   8.02817951e-02],
+       [  1.93732295e-02,   1.51325789e-02,   8.87668094e-02],
+       [  2.24468865e-02,   1.71991484e-02,   9.73274383e-02],
+       [  2.57927373e-02,   1.93306298e-02,   1.05929835e-01],
+       [  2.94324251e-02,   2.15030771e-02,   1.14621328e-01],
+       [  3.33852235e-02,   2.37024271e-02,   1.23397286e-01],
+       [  3.76684211e-02,   2.59207864e-02,   1.32232108e-01],
+       [  4.22525554e-02,   2.81385015e-02,   1.41140519e-01],
+       [  4.69146287e-02,   3.03236129e-02,   1.50163867e-01],
+       [  5.16437624e-02,   3.24736172e-02,   1.59254277e-01],
+       [  5.64491009e-02,   3.45691867e-02,   1.68413539e-01],
+       [  6.13397200e-02,   3.65900213e-02,   1.77642172e-01],
+       [  6.63312620e-02,   3.85036268e-02,   1.86961588e-01],
+       [  7.14289181e-02,   4.02939095e-02,   1.96353558e-01],
+       [  7.66367560e-02,   4.19053329e-02,   2.05798788e-01],
+       [  8.19620773e-02,   4.33278666e-02,   2.15289113e-01],
+       [  8.74113897e-02,   4.45561662e-02,   2.24813479e-01],
+       [  9.29901526e-02,   4.55829503e-02,   2.34357604e-01],
+       [  9.87024972e-02,   4.64018731e-02,   2.43903700e-01],
+       [  1.04550936e-01,   4.70080541e-02,   2.53430300e-01],
+       [  1.10536084e-01,   4.73986708e-02,   2.62912235e-01],
+       [  1.16656423e-01,   4.75735920e-02,   2.72320803e-01],
+       [  1.22908126e-01,   4.75360183e-02,   2.81624170e-01],
+       [  1.29284984e-01,   4.72930838e-02,   2.90788012e-01],
+       [  1.35778450e-01,   4.68563678e-02,   2.99776404e-01],
+       [  1.42377819e-01,   4.62422566e-02,   3.08552910e-01],
+       [  1.49072957e-01,   4.54676444e-02,   3.17085139e-01],
+       [  1.55849711e-01,   4.45588056e-02,   3.25338414e-01],
+       [  1.62688939e-01,   4.35542881e-02,   3.33276678e-01],
+       [  1.69575148e-01,   4.24893149e-02,   3.40874188e-01],
+       [  1.76493202e-01,   4.14017089e-02,   3.48110606e-01],
+       [  1.83428775e-01,   4.03288858e-02,   3.54971391e-01],
+       [  1.90367453e-01,   3.93088888e-02,   3.61446945e-01],
+       [  1.97297425e-01,   3.84001825e-02,   3.67534629e-01],
+       [  2.04209298e-01,   3.76322609e-02,   3.73237557e-01],
+       [  2.11095463e-01,   3.70296488e-02,   3.78563264e-01],
+       [  2.17948648e-01,   3.66146049e-02,   3.83522415e-01],
+       [  2.24762908e-01,   3.64049901e-02,   3.88128944e-01],
+       [  2.31538148e-01,   3.64052511e-02,   3.92400150e-01],
+       [  2.38272961e-01,   3.66209949e-02,   3.96353388e-01],
+       [  2.44966911e-01,   3.70545017e-02,   4.00006615e-01],
+       [  2.51620354e-01,   3.77052832e-02,   4.03377897e-01],
+       [  2.58234265e-01,   3.85706153e-02,   4.06485031e-01],
+       [  2.64809649e-01,   3.96468666e-02,   4.09345373e-01],
+       [  2.71346664e-01,   4.09215821e-02,   4.11976086e-01],
+       [  2.77849829e-01,   4.23528741e-02,   4.14392106e-01],
+       [  2.84321318e-01,   4.39325787e-02,   4.16607861e-01],
+       [  2.90763373e-01,   4.56437598e-02,   4.18636756e-01],
+       [  2.97178251e-01,   4.74700293e-02,   4.20491164e-01],
+       [  3.03568182e-01,   4.93958927e-02,   4.22182449e-01],
+       [  3.09935342e-01,   5.14069729e-02,   4.23720999e-01],
+       [  3.16281835e-01,   5.34901321e-02,   4.25116277e-01],
+       [  3.22609671e-01,   5.56335178e-02,   4.26376869e-01],
+       [  3.28920763e-01,   5.78265505e-02,   4.27510546e-01],
+       [  3.35216916e-01,   6.00598734e-02,   4.28524320e-01],
+       [  3.41499828e-01,   6.23252772e-02,   4.29424503e-01],
+       [  3.47771086e-01,   6.46156100e-02,   4.30216765e-01],
+       [  3.54032169e-01,   6.69246832e-02,   4.30906186e-01],
+       [  3.60284449e-01,   6.92471753e-02,   4.31497309e-01],
+       [  3.66529195e-01,   7.15785403e-02,   4.31994185e-01],
+       [  3.72767575e-01,   7.39149211e-02,   4.32400419e-01],
+       [  3.79000659e-01,   7.62530701e-02,   4.32719214e-01],
+       [  3.85228383e-01,   7.85914864e-02,   4.32954973e-01],
+       [  3.91452659e-01,   8.09267058e-02,   4.33108763e-01],
+       [  3.97674379e-01,   8.32568129e-02,   4.33182647e-01],
+       [  4.03894278e-01,   8.55803445e-02,   4.33178526e-01],
+       [  4.10113015e-01,   8.78961593e-02,   4.33098056e-01],
+       [  4.16331169e-01,   9.02033992e-02,   4.32942678e-01],
+       [  4.22549249e-01,   9.25014543e-02,   4.32713635e-01],
+       [  4.28767696e-01,   9.47899342e-02,   4.32411996e-01],
+       [  4.34986885e-01,   9.70686417e-02,   4.32038673e-01],
+       [  4.41207124e-01,   9.93375510e-02,   4.31594438e-01],
+       [  4.47428382e-01,   1.01597079e-01,   4.31080497e-01],
+       [  4.53650614e-01,   1.03847716e-01,   4.30497898e-01],
+       [  4.59874623e-01,   1.06089165e-01,   4.29845789e-01],
+       [  4.66100494e-01,   1.08321923e-01,   4.29124507e-01],
+       [  4.72328255e-01,   1.10546584e-01,   4.28334320e-01],
+       [  4.78557889e-01,   1.12763831e-01,   4.27475431e-01],
+       [  4.84789325e-01,   1.14974430e-01,   4.26547991e-01],
+       [  4.91022448e-01,   1.17179219e-01,   4.25552106e-01],
+       [  4.97257069e-01,   1.19379132e-01,   4.24487908e-01],
+       [  5.03492698e-01,   1.21575414e-01,   4.23356110e-01],
+       [  5.09729541e-01,   1.23768654e-01,   4.22155676e-01],
+       [  5.15967304e-01,   1.25959947e-01,   4.20886594e-01],
+       [  5.22205646e-01,   1.28150439e-01,   4.19548848e-01],
+       [  5.28444192e-01,   1.30341324e-01,   4.18142411e-01],
+       [  5.34682523e-01,   1.32533845e-01,   4.16667258e-01],
+       [  5.40920186e-01,   1.34729286e-01,   4.15123366e-01],
+       [  5.47156706e-01,   1.36928959e-01,   4.13510662e-01],
+       [  5.53391649e-01,   1.39134147e-01,   4.11828882e-01],
+       [  5.59624442e-01,   1.41346265e-01,   4.10078028e-01],
+       [  5.65854477e-01,   1.43566769e-01,   4.08258132e-01],
+       [  5.72081108e-01,   1.45797150e-01,   4.06369246e-01],
+       [  5.78303656e-01,   1.48038934e-01,   4.04411444e-01],
+       [  5.84521407e-01,   1.50293679e-01,   4.02384829e-01],
+       [  5.90733615e-01,   1.52562977e-01,   4.00289528e-01],
+       [  5.96939751e-01,   1.54848232e-01,   3.98124897e-01],
+       [  6.03138930e-01,   1.57151161e-01,   3.95891308e-01],
+       [  6.09330184e-01,   1.59473549e-01,   3.93589349e-01],
+       [  6.15512627e-01,   1.61817111e-01,   3.91219295e-01],
+       [  6.21685340e-01,   1.64183582e-01,   3.88781456e-01],
+       [  6.27847374e-01,   1.66574724e-01,   3.86276180e-01],
+       [  6.33997746e-01,   1.68992314e-01,   3.83703854e-01],
+       [  6.40135447e-01,   1.71438150e-01,   3.81064906e-01],
+       [  6.46259648e-01,   1.73913876e-01,   3.78358969e-01],
+       [  6.52369348e-01,   1.76421271e-01,   3.75586209e-01],
+       [  6.58463166e-01,   1.78962399e-01,   3.72748214e-01],
+       [  6.64539964e-01,   1.81539111e-01,   3.69845599e-01],
+       [  6.70598572e-01,   1.84153268e-01,   3.66879025e-01],
+       [  6.76637795e-01,   1.86806728e-01,   3.63849195e-01],
+       [  6.82656407e-01,   1.89501352e-01,   3.60756856e-01],
+       [  6.88653158e-01,   1.92238994e-01,   3.57602797e-01],
+       [  6.94626769e-01,   1.95021500e-01,   3.54387853e-01],
+       [  7.00575937e-01,   1.97850703e-01,   3.51112900e-01],
+       [  7.06499709e-01,   2.00728196e-01,   3.47776863e-01],
+       [  7.12396345e-01,   2.03656029e-01,   3.44382594e-01],
+       [  7.18264447e-01,   2.06635993e-01,   3.40931208e-01],
+       [  7.24102613e-01,   2.09669834e-01,   3.37423766e-01],
+       [  7.29909422e-01,   2.12759270e-01,   3.33861367e-01],
+       [  7.35683432e-01,   2.15905976e-01,   3.30245147e-01],
+       [  7.41423185e-01,   2.19111589e-01,   3.26576275e-01],
+       [  7.47127207e-01,   2.22377697e-01,   3.22855952e-01],
+       [  7.52794009e-01,   2.25705837e-01,   3.19085410e-01],
+       [  7.58422090e-01,   2.29097492e-01,   3.15265910e-01],
+       [  7.64009940e-01,   2.32554083e-01,   3.11398734e-01],
+       [  7.69556038e-01,   2.36076967e-01,   3.07485188e-01],
+       [  7.75058888e-01,   2.39667435e-01,   3.03526312e-01],
+       [  7.80517023e-01,   2.43326720e-01,   2.99522665e-01],
+       [  7.85928794e-01,   2.47055968e-01,   2.95476756e-01],
+       [  7.91292674e-01,   2.50856232e-01,   2.91389943e-01],
+       [  7.96607144e-01,   2.54728485e-01,   2.87263585e-01],
+       [  8.01870689e-01,   2.58673610e-01,   2.83099033e-01],
+       [  8.07081807e-01,   2.62692401e-01,   2.78897629e-01],
+       [  8.12239008e-01,   2.66785558e-01,   2.74660698e-01],
+       [  8.17340818e-01,   2.70953688e-01,   2.70389545e-01],
+       [  8.22385784e-01,   2.75197300e-01,   2.66085445e-01],
+       [  8.27372474e-01,   2.79516805e-01,   2.61749643e-01],
+       [  8.32299481e-01,   2.83912516e-01,   2.57383341e-01],
+       [  8.37165425e-01,   2.88384647e-01,   2.52987700e-01],
+       [  8.41968959e-01,   2.92933312e-01,   2.48563825e-01],
+       [  8.46708768e-01,   2.97558528e-01,   2.44112767e-01],
+       [  8.51383572e-01,   3.02260213e-01,   2.39635512e-01],
+       [  8.55992130e-01,   3.07038188e-01,   2.35132978e-01],
+       [  8.60533241e-01,   3.11892183e-01,   2.30606009e-01],
+       [  8.65005747e-01,   3.16821833e-01,   2.26055368e-01],
+       [  8.69408534e-01,   3.21826685e-01,   2.21481734e-01],
+       [  8.73740530e-01,   3.26906201e-01,   2.16885699e-01],
+       [  8.78000715e-01,   3.32059760e-01,   2.12267762e-01],
+       [  8.82188112e-01,   3.37286663e-01,   2.07628326e-01],
+       [  8.86301795e-01,   3.42586137e-01,   2.02967696e-01],
+       [  8.90340885e-01,   3.47957340e-01,   1.98286080e-01],
+       [  8.94304553e-01,   3.53399363e-01,   1.93583583e-01],
+       [  8.98192017e-01,   3.58911240e-01,   1.88860212e-01],
+       [  9.02002544e-01,   3.64491949e-01,   1.84115876e-01],
+       [  9.05735448e-01,   3.70140419e-01,   1.79350388e-01],
+       [  9.09390090e-01,   3.75855533e-01,   1.74563472e-01],
+       [  9.12965874e-01,   3.81636138e-01,   1.69754764e-01],
+       [  9.16462251e-01,   3.87481044e-01,   1.64923826e-01],
+       [  9.19878710e-01,   3.93389034e-01,   1.60070152e-01],
+       [  9.23214783e-01,   3.99358867e-01,   1.55193185e-01],
+       [  9.26470039e-01,   4.05389282e-01,   1.50292329e-01],
+       [  9.29644083e-01,   4.11479007e-01,   1.45366973e-01],
+       [  9.32736555e-01,   4.17626756e-01,   1.40416519e-01],
+       [  9.35747126e-01,   4.23831237e-01,   1.35440416e-01],
+       [  9.38675494e-01,   4.30091162e-01,   1.30438175e-01],
+       [  9.41521384e-01,   4.36405243e-01,   1.25409440e-01],
+       [  9.44284543e-01,   4.42772199e-01,   1.20354038e-01],
+       [  9.46964741e-01,   4.49190757e-01,   1.15272059e-01],
+       [  9.49561766e-01,   4.55659658e-01,   1.10163947e-01],
+       [  9.52075421e-01,   4.62177656e-01,   1.05030614e-01],
+       [  9.54505523e-01,   4.68743522e-01,   9.98735931e-02],
+       [  9.56851903e-01,   4.75356048e-01,   9.46952268e-02],
+       [  9.59114397e-01,   4.82014044e-01,   8.94989073e-02],
+       [  9.61292850e-01,   4.88716345e-01,   8.42893891e-02],
+       [  9.63387110e-01,   4.95461806e-01,   7.90731907e-02],
+       [  9.65397031e-01,   5.02249309e-01,   7.38591143e-02],
+       [  9.67322465e-01,   5.09077761e-01,   6.86589199e-02],
+       [  9.69163264e-01,   5.15946092e-01,   6.34881971e-02],
+       [  9.70919277e-01,   5.22853259e-01,   5.83674890e-02],
+       [  9.72590351e-01,   5.29798246e-01,   5.33237243e-02],
+       [  9.74176327e-01,   5.36780059e-01,   4.83920090e-02],
+       [  9.75677038e-01,   5.43797733e-01,   4.36177922e-02],
+       [  9.77092313e-01,   5.50850323e-01,   3.90500131e-02],
+       [  9.78421971e-01,   5.57936911e-01,   3.49306227e-02],
+       [  9.79665824e-01,   5.65056600e-01,   3.14091591e-02],
+       [  9.80823673e-01,   5.72208516e-01,   2.85075931e-02],
+       [  9.81895311e-01,   5.79391803e-01,   2.62497353e-02],
+       [  9.82880522e-01,   5.86605627e-01,   2.46613416e-02],
+       [  9.83779081e-01,   5.93849168e-01,   2.37702263e-02],
+       [  9.84590755e-01,   6.01121626e-01,   2.36063833e-02],
+       [  9.85315301e-01,   6.08422211e-01,   2.42021174e-02],
+       [  9.85952471e-01,   6.15750147e-01,   2.55921853e-02],
+       [  9.86502013e-01,   6.23104667e-01,   2.78139496e-02],
+       [  9.86963670e-01,   6.30485011e-01,   3.09075459e-02],
+       [  9.87337182e-01,   6.37890424e-01,   3.49160639e-02],
+       [  9.87622296e-01,   6.45320152e-01,   3.98857472e-02],
+       [  9.87818759e-01,   6.52773439e-01,   4.55808037e-02],
+       [  9.87926330e-01,   6.60249526e-01,   5.17503867e-02],
+       [  9.87944783e-01,   6.67747641e-01,   5.83286889e-02],
+       [  9.87873910e-01,   6.75267000e-01,   6.52570167e-02],
+       [  9.87713535e-01,   6.82806802e-01,   7.24892330e-02],
+       [  9.87463516e-01,   6.90366218e-01,   7.99897176e-02],
+       [  9.87123759e-01,   6.97944391e-01,   8.77314215e-02],
+       [  9.86694229e-01,   7.05540424e-01,   9.56941797e-02],
+       [  9.86174970e-01,   7.13153375e-01,   1.03863324e-01],
+       [  9.85565739e-01,   7.20782460e-01,   1.12228756e-01],
+       [  9.84865203e-01,   7.28427497e-01,   1.20784651e-01],
+       [  9.84075129e-01,   7.36086521e-01,   1.29526579e-01],
+       [  9.83195992e-01,   7.43758326e-01,   1.38453063e-01],
+       [  9.82228463e-01,   7.51441596e-01,   1.47564573e-01],
+       [  9.81173457e-01,   7.59134892e-01,   1.56863224e-01],
+       [  9.80032178e-01,   7.66836624e-01,   1.66352544e-01],
+       [  9.78806183e-01,   7.74545028e-01,   1.76037298e-01],
+       [  9.77497453e-01,   7.82258138e-01,   1.85923357e-01],
+       [  9.76108474e-01,   7.89973753e-01,   1.96017589e-01],
+       [  9.74637842e-01,   7.97691563e-01,   2.06331925e-01],
+       [  9.73087939e-01,   8.05409333e-01,   2.16876839e-01],
+       [  9.71467822e-01,   8.13121725e-01,   2.27658046e-01],
+       [  9.69783146e-01,   8.20825143e-01,   2.38685942e-01],
+       [  9.68040817e-01,   8.28515491e-01,   2.49971582e-01],
+       [  9.66242589e-01,   8.36190976e-01,   2.61533898e-01],
+       [  9.64393924e-01,   8.43848069e-01,   2.73391112e-01],
+       [  9.62516656e-01,   8.51476340e-01,   2.85545675e-01],
+       [  9.60625545e-01,   8.59068716e-01,   2.98010219e-01],
+       [  9.58720088e-01,   8.66624355e-01,   3.10820466e-01],
+       [  9.56834075e-01,   8.74128569e-01,   3.23973947e-01],
+       [  9.54997177e-01,   8.81568926e-01,   3.37475479e-01],
+       [  9.53215092e-01,   8.88942277e-01,   3.51368713e-01],
+       [  9.51546225e-01,   8.96225909e-01,   3.65627005e-01],
+       [  9.50018481e-01,   9.03409063e-01,   3.80271225e-01],
+       [  9.48683391e-01,   9.10472964e-01,   3.95289169e-01],
+       [  9.47594362e-01,   9.17399053e-01,   4.10665194e-01],
+       [  9.46809163e-01,   9.24168246e-01,   4.26373236e-01],
+       [  9.46391536e-01,   9.30760752e-01,   4.42367495e-01],
+       [  9.46402951e-01,   9.37158971e-01,   4.58591507e-01],
+       [  9.46902568e-01,   9.43347775e-01,   4.74969778e-01],
+       [  9.47936825e-01,   9.49317522e-01,   4.91426053e-01],
+       [  9.49544830e-01,   9.55062900e-01,   5.07859649e-01],
+       [  9.51740304e-01,   9.60586693e-01,   5.24203026e-01],
+       [  9.54529281e-01,   9.65895868e-01,   5.40360752e-01],
+       [  9.57896053e-01,   9.71003330e-01,   5.56275090e-01],
+       [  9.61812020e-01,   9.75924241e-01,   5.71925382e-01],
+       [  9.66248822e-01,   9.80678193e-01,   5.87205773e-01],
+       [  9.71161622e-01,   9.85282161e-01,   6.02154330e-01],
+       [  9.76510983e-01,   9.89753437e-01,   6.16760413e-01],
+       [  9.82257307e-01,   9.94108844e-01,   6.31017009e-01],
+       [  9.88362068e-01,   9.98364143e-01,   6.44924005e-01]];
+% No size requested: hand back the table as is.
+if nargin < 1
+    cm_data = cm;
+else
+    % Resample to m entries by interpolating in HSV instead of RGB.
+    hsv=rgb2hsv(cm);
+    % Hue of rows 144..end is shifted up by 1 before interpolating
+    % (presumably the row where the hue wraps around - TODO confirm).
+    hsv(144:end,1)=hsv(144:end,1)+1; % hardcoded
+    cm_data=interp1(linspace(0,1,size(cm,1)),hsv,linspace(0,1,m));
+    % Undo the shift: bring hues above 1 back into range.
+    cm_data(cm_data(:,1)>1,1)=cm_data(cm_data(:,1)>1,1)-1;
+    cm_data=hsv2rgb(cm_data);
+  
+end
+end
 \ No newline at end of file
+function [cm_data]=magma(m)
+
+cm = [[  1.46159096e-03,   4.66127766e-04,   1.38655200e-02],
+    [  2.25764007e-03,   1.29495431e-03,   1.83311461e-02],
+    [  3.27943222e-03,   2.30452991e-03,   2.37083291e-02],
+    [  4.51230222e-03,   3.49037666e-03,   2.99647059e-02],
+    [  5.94976987e-03,   4.84285000e-03,   3.71296695e-02],
+    [  7.58798550e-03,   6.35613622e-03,   4.49730774e-02],
+    [  9.42604390e-03,   8.02185006e-03,   5.28443561e-02],
+    [  1.14654337e-02,   9.82831486e-03,   6.07496380e-02],
+    [  1.37075706e-02,   1.17705913e-02,   6.86665843e-02],
+    [  1.61557566e-02,   1.38404966e-02,   7.66026660e-02],
+    [  1.88153670e-02,   1.60262753e-02,   8.45844897e-02],
+    [  2.16919340e-02,   1.83201254e-02,   9.26101050e-02],
+    [  2.47917814e-02,   2.07147875e-02,   1.00675555e-01],
+    [  2.81228154e-02,   2.32009284e-02,   1.08786954e-01],
+    [  3.16955304e-02,   2.57651161e-02,   1.16964722e-01],
+    [  3.55204468e-02,   2.83974570e-02,   1.25209396e-01],
+    [  3.96084872e-02,   3.10895652e-02,   1.33515085e-01],
+    [  4.38295350e-02,   3.38299885e-02,   1.41886249e-01],
+    [  4.80616391e-02,   3.66066101e-02,   1.50326989e-01],
+    [  5.23204388e-02,   3.94066020e-02,   1.58841025e-01],
+    [  5.66148978e-02,   4.21598925e-02,   1.67445592e-01],
+    [  6.09493930e-02,   4.47944924e-02,   1.76128834e-01],
+    [  6.53301801e-02,   4.73177796e-02,   1.84891506e-01],
+    [  6.97637296e-02,   4.97264666e-02,   1.93735088e-01],
+    [  7.42565152e-02,   5.20167766e-02,   2.02660374e-01],
+    [  7.88150034e-02,   5.41844801e-02,   2.11667355e-01],
+    [  8.34456313e-02,   5.62249365e-02,   2.20755099e-01],
+    [  8.81547730e-02,   5.81331465e-02,   2.29921611e-01],
+    [  9.29486914e-02,   5.99038167e-02,   2.39163669e-01],
+    [  9.78334770e-02,   6.15314414e-02,   2.48476662e-01],
+    [  1.02814972e-01,   6.30104053e-02,   2.57854400e-01],
+    [  1.07898679e-01,   6.43351102e-02,   2.67288933e-01],
+    [  1.13094451e-01,   6.54920358e-02,   2.76783978e-01],
+    [  1.18405035e-01,   6.64791593e-02,   2.86320656e-01],
+    [  1.23832651e-01,   6.72946449e-02,   2.95879431e-01],
+    [  1.29380192e-01,   6.79349264e-02,   3.05442931e-01],
+    [  1.35053322e-01,   6.83912798e-02,   3.14999890e-01],
+    [  1.40857952e-01,   6.86540710e-02,   3.24537640e-01],
+    [  1.46785234e-01,   6.87382323e-02,   3.34011109e-01],
+    [  1.52839217e-01,   6.86368599e-02,   3.43404450e-01],
+    [  1.59017511e-01,   6.83540225e-02,   3.52688028e-01],
+    [  1.65308131e-01,   6.79108689e-02,   3.61816426e-01],
+    [  1.71713033e-01,   6.73053260e-02,   3.70770827e-01],
+    [  1.78211730e-01,   6.65758073e-02,   3.79497161e-01],
+    [  1.84800877e-01,   6.57324381e-02,   3.87972507e-01],
+    [  1.91459745e-01,   6.48183312e-02,   3.96151969e-01],
+    [  1.98176877e-01,   6.38624166e-02,   4.04008953e-01],
+    [  2.04934882e-01,   6.29066192e-02,   4.11514273e-01],
+    [  2.11718061e-01,   6.19917876e-02,   4.18646741e-01],
+    [  2.18511590e-01,   6.11584918e-02,   4.25391816e-01],
+    [  2.25302032e-01,   6.04451843e-02,   4.31741767e-01],
+    [  2.32076515e-01,   5.98886855e-02,   4.37694665e-01],
+    [  2.38825991e-01,   5.95170384e-02,   4.43255999e-01],
+    [  2.45543175e-01,   5.93524384e-02,   4.48435938e-01],
+    [  2.52220252e-01,   5.94147119e-02,   4.53247729e-01],
+    [  2.58857304e-01,   5.97055998e-02,   4.57709924e-01],
+    [  2.65446744e-01,   6.02368754e-02,   4.61840297e-01],
+    [  2.71994089e-01,   6.09935552e-02,   4.65660375e-01],
+    [  2.78493300e-01,   6.19778136e-02,   4.69190328e-01],
+    [  2.84951097e-01,   6.31676261e-02,   4.72450879e-01],
+    [  2.91365817e-01,   6.45534486e-02,   4.75462193e-01],
+    [  2.97740413e-01,   6.61170432e-02,   4.78243482e-01],
+    [  3.04080941e-01,   6.78353452e-02,   4.80811572e-01],
+    [  3.10382027e-01,   6.97024767e-02,   4.83186340e-01],
+    [  3.16654235e-01,   7.16895272e-02,   4.85380429e-01],
+    [  3.22899126e-01,   7.37819504e-02,   4.87408399e-01],
+    [  3.29114038e-01,   7.59715081e-02,   4.89286796e-01],
+    [  3.35307503e-01,   7.82361045e-02,   4.91024144e-01],
+    [  3.41481725e-01,   8.05635079e-02,   4.92631321e-01],
+    [  3.47635742e-01,   8.29463512e-02,   4.94120923e-01],
+    [  3.53773161e-01,   8.53726329e-02,   4.95501096e-01],
+    [  3.59897941e-01,   8.78311772e-02,   4.96778331e-01],
+    [  3.66011928e-01,   9.03143031e-02,   4.97959963e-01],
+    [  3.72116205e-01,   9.28159917e-02,   4.99053326e-01],
+    [  3.78210547e-01,   9.53322947e-02,   5.00066568e-01],
+    [  3.84299445e-01,   9.78549106e-02,   5.01001964e-01],
+    [  3.90384361e-01,   1.00379466e-01,   5.01864236e-01],
+    [  3.96466670e-01,   1.02902194e-01,   5.02657590e-01],
+    [  4.02547663e-01,   1.05419865e-01,   5.03385761e-01],
+    [  4.08628505e-01,   1.07929771e-01,   5.04052118e-01],
+    [  4.14708664e-01,   1.10431177e-01,   5.04661843e-01],
+    [  4.20791157e-01,   1.12920210e-01,   5.05214935e-01],
+    [  4.26876965e-01,   1.15395258e-01,   5.05713602e-01],
+    [  4.32967001e-01,   1.17854987e-01,   5.06159754e-01],
+    [  4.39062114e-01,   1.20298314e-01,   5.06555026e-01],
+    [  4.45163096e-01,   1.22724371e-01,   5.06900806e-01],
+    [  4.51270678e-01,   1.25132484e-01,   5.07198258e-01],
+    [  4.57385535e-01,   1.27522145e-01,   5.07448336e-01],
+    [  4.63508291e-01,   1.29892998e-01,   5.07651812e-01],
+    [  4.69639514e-01,   1.32244819e-01,   5.07809282e-01],
+    [  4.75779723e-01,   1.34577500e-01,   5.07921193e-01],
+    [  4.81928997e-01,   1.36891390e-01,   5.07988509e-01],
+    [  4.88088169e-01,   1.39186217e-01,   5.08010737e-01],
+    [  4.94257673e-01,   1.41462106e-01,   5.07987836e-01],
+    [  5.00437834e-01,   1.43719323e-01,   5.07919772e-01],
+    [  5.06628929e-01,   1.45958202e-01,   5.07806420e-01],
+    [  5.12831195e-01,   1.48179144e-01,   5.07647570e-01],
+    [  5.19044825e-01,   1.50382611e-01,   5.07442938e-01],
+    [  5.25269968e-01,   1.52569121e-01,   5.07192172e-01],
+    [  5.31506735e-01,   1.54739247e-01,   5.06894860e-01],
+    [  5.37755194e-01,   1.56893613e-01,   5.06550538e-01],
+    [  5.44015371e-01,   1.59032895e-01,   5.06158696e-01],
+    [  5.50287252e-01,   1.61157816e-01,   5.05718782e-01],
+    [  5.56570783e-01,   1.63269149e-01,   5.05230210e-01],
+    [  5.62865867e-01,   1.65367714e-01,   5.04692365e-01],
+    [  5.69172368e-01,   1.67454379e-01,   5.04104606e-01],
+    [  5.75490107e-01,   1.69530062e-01,   5.03466273e-01],
+    [  5.81818864e-01,   1.71595728e-01,   5.02776690e-01],
+    [  5.88158375e-01,   1.73652392e-01,   5.02035167e-01],
+    [  5.94508337e-01,   1.75701122e-01,   5.01241011e-01],
+    [  6.00868399e-01,   1.77743036e-01,   5.00393522e-01],
+    [  6.07238169e-01,   1.79779309e-01,   4.99491999e-01],
+    [  6.13617209e-01,   1.81811170e-01,   4.98535746e-01],
+    [  6.20005032e-01,   1.83839907e-01,   4.97524075e-01],
+    [  6.26401108e-01,   1.85866869e-01,   4.96456304e-01],
+    [  6.32804854e-01,   1.87893468e-01,   4.95331769e-01],
+    [  6.39215638e-01,   1.89921182e-01,   4.94149821e-01],
+    [  6.45632778e-01,   1.91951556e-01,   4.92909832e-01],
+    [  6.52055535e-01,   1.93986210e-01,   4.91611196e-01],
+    [  6.58483116e-01,   1.96026835e-01,   4.90253338e-01],
+    [  6.64914668e-01,   1.98075202e-01,   4.88835712e-01],
+    [  6.71349279e-01,   2.00133166e-01,   4.87357807e-01],
+    [  6.77785975e-01,   2.02202663e-01,   4.85819154e-01],
+    [  6.84223712e-01,   2.04285721e-01,   4.84219325e-01],
+    [  6.90661380e-01,   2.06384461e-01,   4.82557941e-01],
+    [  6.97097796e-01,   2.08501100e-01,   4.80834678e-01],
+    [  7.03531700e-01,   2.10637956e-01,   4.79049270e-01],
+    [  7.09961888e-01,   2.12797337e-01,   4.77201121e-01],
+    [  7.16387038e-01,   2.14981693e-01,   4.75289780e-01],
+    [  7.22805451e-01,   2.17193831e-01,   4.73315708e-01],
+    [  7.29215521e-01,   2.19436516e-01,   4.71278924e-01],
+    [  7.35615545e-01,   2.21712634e-01,   4.69179541e-01],
+    [  7.42003713e-01,   2.24025196e-01,   4.67017774e-01],
+    [  7.48378107e-01,   2.26377345e-01,   4.64793954e-01],
+    [  7.54736692e-01,   2.28772352e-01,   4.62508534e-01],
+    [  7.61077312e-01,   2.31213625e-01,   4.60162106e-01],
+    [  7.67397681e-01,   2.33704708e-01,   4.57755411e-01],
+    [  7.73695380e-01,   2.36249283e-01,   4.55289354e-01],
+    [  7.79967847e-01,   2.38851170e-01,   4.52765022e-01],
+    [  7.86212372e-01,   2.41514325e-01,   4.50183695e-01],
+    [  7.92426972e-01,   2.44242250e-01,   4.47543155e-01],
+    [  7.98607760e-01,   2.47039798e-01,   4.44848441e-01],
+    [  8.04751511e-01,   2.49911350e-01,   4.42101615e-01],
+    [  8.10854841e-01,   2.52861399e-01,   4.39304963e-01],
+    [  8.16914186e-01,   2.55894550e-01,   4.36461074e-01],
+    [  8.22925797e-01,   2.59015505e-01,   4.33572874e-01],
+    [  8.28885740e-01,   2.62229049e-01,   4.30643647e-01],
+    [  8.34790818e-01,   2.65539703e-01,   4.27671352e-01],
+    [  8.40635680e-01,   2.68952874e-01,   4.24665620e-01],
+    [  8.46415804e-01,   2.72473491e-01,   4.21631064e-01],
+    [  8.52126490e-01,   2.76106469e-01,   4.18572767e-01],
+    [  8.57762870e-01,   2.79856666e-01,   4.15496319e-01],
+    [  8.63320397e-01,   2.83729003e-01,   4.12402889e-01],
+    [  8.68793368e-01,   2.87728205e-01,   4.09303002e-01],
+    [  8.74176342e-01,   2.91858679e-01,   4.06205397e-01],
+    [  8.79463944e-01,   2.96124596e-01,   4.03118034e-01],
+    [  8.84650824e-01,   3.00530090e-01,   4.00047060e-01],
+    [  8.89731418e-01,   3.05078817e-01,   3.97001559e-01],
+    [  8.94700194e-01,   3.09773445e-01,   3.93994634e-01],
+    [  8.99551884e-01,   3.14616425e-01,   3.91036674e-01],
+    [  9.04281297e-01,   3.19609981e-01,   3.88136889e-01],
+    [  9.08883524e-01,   3.24755126e-01,   3.85308008e-01],
+    [  9.13354091e-01,   3.30051947e-01,   3.82563414e-01],
+    [  9.17688852e-01,   3.35500068e-01,   3.79915138e-01],
+    [  9.21884187e-01,   3.41098112e-01,   3.77375977e-01],
+    [  9.25937102e-01,   3.46843685e-01,   3.74959077e-01],
+    [  9.29845090e-01,   3.52733817e-01,   3.72676513e-01],
+    [  9.33606454e-01,   3.58764377e-01,   3.70540883e-01],
+    [  9.37220874e-01,   3.64929312e-01,   3.68566525e-01],
+    [  9.40687443e-01,   3.71224168e-01,   3.66761699e-01],
+    [  9.44006448e-01,   3.77642889e-01,   3.65136328e-01],
+    [  9.47179528e-01,   3.84177874e-01,   3.63701130e-01],
+    [  9.50210150e-01,   3.90819546e-01,   3.62467694e-01],
+    [  9.53099077e-01,   3.97562894e-01,   3.61438431e-01],
+    [  9.55849237e-01,   4.04400213e-01,   3.60619076e-01],
+    [  9.58464079e-01,   4.11323666e-01,   3.60014232e-01],
+    [  9.60949221e-01,   4.18323245e-01,   3.59629789e-01],
+    [  9.63310281e-01,   4.25389724e-01,   3.59469020e-01],
+    [  9.65549351e-01,   4.32518707e-01,   3.59529151e-01],
+    [  9.67671128e-01,   4.39702976e-01,   3.59810172e-01],
+    [  9.69680441e-01,   4.46935635e-01,   3.60311120e-01],
+    [  9.71582181e-01,   4.54210170e-01,   3.61030156e-01],
+    [  9.73381238e-01,   4.61520484e-01,   3.61964652e-01],
+    [  9.75082439e-01,   4.68860936e-01,   3.63111292e-01],
+    [  9.76690494e-01,   4.76226350e-01,   3.64466162e-01],
+    [  9.78209957e-01,   4.83612031e-01,   3.66024854e-01],
+    [  9.79645181e-01,   4.91013764e-01,   3.67782559e-01],
+    [  9.81000291e-01,   4.98427800e-01,   3.69734157e-01],
+    [  9.82279159e-01,   5.05850848e-01,   3.71874301e-01],
+    [  9.83485387e-01,   5.13280054e-01,   3.74197501e-01],
+    [  9.84622298e-01,   5.20712972e-01,   3.76698186e-01],
+    [  9.85692925e-01,   5.28147545e-01,   3.79370774e-01],
+    [  9.86700017e-01,   5.35582070e-01,   3.82209724e-01],
+    [  9.87646038e-01,   5.43015173e-01,   3.85209578e-01],
+    [  9.88533173e-01,   5.50445778e-01,   3.88365009e-01],
+    [  9.89363341e-01,   5.57873075e-01,   3.91670846e-01],
+    [  9.90138201e-01,   5.65296495e-01,   3.95122099e-01],
+    [  9.90871208e-01,   5.72706259e-01,   3.98713971e-01],
+    [  9.91558165e-01,   5.80106828e-01,   4.02441058e-01],
+    [  9.92195728e-01,   5.87501706e-01,   4.06298792e-01],
+    [  9.92784669e-01,   5.94891088e-01,   4.10282976e-01],
+    [  9.93325561e-01,   6.02275297e-01,   4.14389658e-01],
+    [  9.93834412e-01,   6.09643540e-01,   4.18613221e-01],
+    [  9.94308514e-01,   6.16998953e-01,   4.22949672e-01],
+    [  9.94737698e-01,   6.24349657e-01,   4.27396771e-01],
+    [  9.95121854e-01,   6.31696376e-01,   4.31951492e-01],
+    [  9.95480469e-01,   6.39026596e-01,   4.36607159e-01],
+    [  9.95809924e-01,   6.46343897e-01,   4.41360951e-01],
+    [  9.96095703e-01,   6.53658756e-01,   4.46213021e-01],
+    [  9.96341406e-01,   6.60969379e-01,   4.51160201e-01],
+    [  9.96579803e-01,   6.68255621e-01,   4.56191814e-01],
+    [  9.96774784e-01,   6.75541484e-01,   4.61314158e-01],
+    [  9.96925427e-01,   6.82827953e-01,   4.66525689e-01],
+    [  9.97077185e-01,   6.90087897e-01,   4.71811461e-01],
+    [  9.97186253e-01,   6.97348991e-01,   4.77181727e-01],
+    [  9.97253982e-01,   7.04610791e-01,   4.82634651e-01],
+    [  9.97325180e-01,   7.11847714e-01,   4.88154375e-01],
+    [  9.97350983e-01,   7.19089119e-01,   4.93754665e-01],
+    [  9.97350583e-01,   7.26324415e-01,   4.99427972e-01],
+    [  9.97341259e-01,   7.33544671e-01,   5.05166839e-01],
+    [  9.97284689e-01,   7.40771893e-01,   5.10983331e-01],
+    [  9.97228367e-01,   7.47980563e-01,   5.16859378e-01],
+    [  9.97138480e-01,   7.55189852e-01,   5.22805996e-01],
+    [  9.97019342e-01,   7.62397883e-01,   5.28820775e-01],
+    [  9.96898254e-01,   7.69590975e-01,   5.34892341e-01],
+    [  9.96726862e-01,   7.76794860e-01,   5.41038571e-01],
+    [  9.96570645e-01,   7.83976508e-01,   5.47232992e-01],
+    [  9.96369065e-01,   7.91167346e-01,   5.53498939e-01],
+    [  9.96162309e-01,   7.98347709e-01,   5.59819643e-01],
+    [  9.95932448e-01,   8.05527126e-01,   5.66201824e-01],
+    [  9.95680107e-01,   8.12705773e-01,   5.72644795e-01],
+    [  9.95423973e-01,   8.19875302e-01,   5.79140130e-01],
+    [  9.95131288e-01,   8.27051773e-01,   5.85701463e-01],
+    [  9.94851089e-01,   8.34212826e-01,   5.92307093e-01],
+    [  9.94523666e-01,   8.41386618e-01,   5.98982818e-01],
+    [  9.94221900e-01,   8.48540474e-01,   6.05695903e-01],
+    [  9.93865767e-01,   8.55711038e-01,   6.12481798e-01],
+    [  9.93545285e-01,   8.62858846e-01,   6.19299300e-01],
+    [  9.93169558e-01,   8.70024467e-01,   6.26189463e-01],
+    [  9.92830963e-01,   8.77168404e-01,   6.33109148e-01],
+    [  9.92439881e-01,   8.84329694e-01,   6.40099465e-01],
+    [  9.92089454e-01,   8.91469549e-01,   6.47116021e-01],
+    [  9.91687744e-01,   8.98627050e-01,   6.54201544e-01],
+    [  9.91331929e-01,   9.05762748e-01,   6.61308839e-01],
+    [  9.90929685e-01,   9.12915010e-01,   6.68481201e-01],
+    [  9.90569914e-01,   9.20048699e-01,   6.75674592e-01],
+    [  9.90174637e-01,   9.27195612e-01,   6.82925602e-01],
+    [  9.89814839e-01,   9.34328540e-01,   6.90198194e-01],
+    [  9.89433736e-01,   9.41470354e-01,   6.97518628e-01],
+    [  9.89077438e-01,   9.48604077e-01,   7.04862519e-01],
+    [  9.88717064e-01,   9.55741520e-01,   7.12242232e-01],
+    [  9.88367028e-01,   9.62878026e-01,   7.19648627e-01],
+    [  9.88032885e-01,   9.70012413e-01,   7.27076773e-01],
+    [  9.87690702e-01,   9.77154231e-01,   7.34536205e-01],
+    [  9.87386827e-01,   9.84287561e-01,   7.42001547e-01],
+    [  9.87052509e-01,   9.91437853e-01,   7.49504188e-01]];
+
+
+% A length was requested: resample the table in HSV space; otherwise the
+% stored table is returned as-is.
+if nargin >= 1
+    hsvTable = rgb2hsv(cm);
+    % Rows 170:end get +1 added to their hue so the hue column does not jump
+    % at the wrap-around (row 169 has G<B, row 170 has G>B).
+    hsvTable(170:end,1) = hsvTable(170:end,1) + 1; % hardcoded
+    srcPos = linspace(0,1,size(cm,1));
+    dstPos = linspace(0,1,m);
+    hsvResampled = interp1(srcPos, hsvTable, dstPos);
+    % Shift hues that ended up above 1 back into range before converting.
+    overflow = hsvResampled(:,1) > 1;
+    hsvResampled(overflow,1) = hsvResampled(overflow,1) - 1;
+    cm_data = hsv2rgb(hsvResampled);
+else
+    cm_data = cm;
+end
+end
 \ No newline at end of file
+function cm_data=plasma(m)
+%PLASMA Return the plasma colormap as rows of RGB values.
+%
+%   cm_data = plasma() returns the stored 256-by-3 table cm unchanged.
+%
+%   cm_data = plasma(m) resamples the table to m rows. The table is
+%   converted to HSV, interpolated with interp1 over linspace(0,1,m) and
+%   converted back to RGB.
+
+cm = [[  5.03832136e-02,   2.98028976e-02,   5.27974883e-01],
+       [  6.35363639e-02,   2.84259729e-02,   5.33123681e-01],
+       [  7.53531234e-02,   2.72063728e-02,   5.38007001e-01],
+       [  8.62217979e-02,   2.61253206e-02,   5.42657691e-01],
+       [  9.63786097e-02,   2.51650976e-02,   5.47103487e-01],
+       [  1.05979704e-01,   2.43092436e-02,   5.51367851e-01],
+       [  1.15123641e-01,   2.35562500e-02,   5.55467728e-01],
+       [  1.23902903e-01,   2.28781011e-02,   5.59423480e-01],
+       [  1.32380720e-01,   2.22583774e-02,   5.63250116e-01],
+       [  1.40603076e-01,   2.16866674e-02,   5.66959485e-01],
+       [  1.48606527e-01,   2.11535876e-02,   5.70561711e-01],
+       [  1.56420649e-01,   2.06507174e-02,   5.74065446e-01],
+       [  1.64069722e-01,   2.01705326e-02,   5.77478074e-01],
+       [  1.71573925e-01,   1.97063415e-02,   5.80805890e-01],
+       [  1.78950212e-01,   1.92522243e-02,   5.84054243e-01],
+       [  1.86212958e-01,   1.88029767e-02,   5.87227661e-01],
+       [  1.93374449e-01,   1.83540593e-02,   5.90329954e-01],
+       [  2.00445260e-01,   1.79015512e-02,   5.93364304e-01],
+       [  2.07434551e-01,   1.74421086e-02,   5.96333341e-01],
+       [  2.14350298e-01,   1.69729276e-02,   5.99239207e-01],
+       [  2.21196750e-01,   1.64970484e-02,   6.02083323e-01],
+       [  2.27982971e-01,   1.60071509e-02,   6.04867403e-01],
+       [  2.34714537e-01,   1.55015065e-02,   6.07592438e-01],
+       [  2.41396253e-01,   1.49791041e-02,   6.10259089e-01],
+       [  2.48032377e-01,   1.44393586e-02,   6.12867743e-01],
+       [  2.54626690e-01,   1.38820918e-02,   6.15418537e-01],
+       [  2.61182562e-01,   1.33075156e-02,   6.17911385e-01],
+       [  2.67702993e-01,   1.27162163e-02,   6.20345997e-01],
+       [  2.74190665e-01,   1.21091423e-02,   6.22721903e-01],
+       [  2.80647969e-01,   1.14875915e-02,   6.25038468e-01],
+       [  2.87076059e-01,   1.08554862e-02,   6.27294975e-01],
+       [  2.93477695e-01,   1.02128849e-02,   6.29490490e-01],
+       [  2.99855122e-01,   9.56079551e-03,   6.31623923e-01],
+       [  3.06209825e-01,   8.90185346e-03,   6.33694102e-01],
+       [  3.12543124e-01,   8.23900704e-03,   6.35699759e-01],
+       [  3.18856183e-01,   7.57551051e-03,   6.37639537e-01],
+       [  3.25150025e-01,   6.91491734e-03,   6.39512001e-01],
+       [  3.31425547e-01,   6.26107379e-03,   6.41315649e-01],
+       [  3.37683446e-01,   5.61830889e-03,   6.43048936e-01],
+       [  3.43924591e-01,   4.99053080e-03,   6.44710195e-01],
+       [  3.50149699e-01,   4.38202557e-03,   6.46297711e-01],
+       [  3.56359209e-01,   3.79781761e-03,   6.47809772e-01],
+       [  3.62553473e-01,   3.24319591e-03,   6.49244641e-01],
+       [  3.68732762e-01,   2.72370721e-03,   6.50600561e-01],
+       [  3.74897270e-01,   2.24514897e-03,   6.51875762e-01],
+       [  3.81047116e-01,   1.81356205e-03,   6.53068467e-01],
+       [  3.87182639e-01,   1.43446923e-03,   6.54176761e-01],
+       [  3.93304010e-01,   1.11388259e-03,   6.55198755e-01],
+       [  3.99410821e-01,   8.59420809e-04,   6.56132835e-01],
+       [  4.05502914e-01,   6.78091517e-04,   6.56977276e-01],
+       [  4.11580082e-01,   5.77101735e-04,   6.57730380e-01],
+       [  4.17642063e-01,   5.63847476e-04,   6.58390492e-01],
+       [  4.23688549e-01,   6.45902780e-04,   6.58956004e-01],
+       [  4.29719186e-01,   8.31008207e-04,   6.59425363e-01],
+       [  4.35733575e-01,   1.12705875e-03,   6.59797077e-01],
+       [  4.41732123e-01,   1.53984779e-03,   6.60069009e-01],
+       [  4.47713600e-01,   2.07954744e-03,   6.60240367e-01],
+       [  4.53677394e-01,   2.75470302e-03,   6.60309966e-01],
+       [  4.59622938e-01,   3.57374415e-03,   6.60276655e-01],
+       [  4.65549631e-01,   4.54518084e-03,   6.60139383e-01],
+       [  4.71456847e-01,   5.67758762e-03,   6.59897210e-01],
+       [  4.77343929e-01,   6.97958743e-03,   6.59549311e-01],
+       [  4.83210198e-01,   8.45983494e-03,   6.59094989e-01],
+       [  4.89054951e-01,   1.01269996e-02,   6.58533677e-01],
+       [  4.94877466e-01,   1.19897486e-02,   6.57864946e-01],
+       [  5.00677687e-01,   1.40550640e-02,   6.57087561e-01],
+       [  5.06454143e-01,   1.63333443e-02,   6.56202294e-01],
+       [  5.12206035e-01,   1.88332232e-02,   6.55209222e-01],
+       [  5.17932580e-01,   2.15631918e-02,   6.54108545e-01],
+       [  5.23632990e-01,   2.45316468e-02,   6.52900629e-01],
+       [  5.29306474e-01,   2.77468735e-02,   6.51586010e-01],
+       [  5.34952244e-01,   3.12170300e-02,   6.50165396e-01],
+       [  5.40569510e-01,   3.49501310e-02,   6.48639668e-01],
+       [  5.46157494e-01,   3.89540334e-02,   6.47009884e-01],
+       [  5.51715423e-01,   4.31364795e-02,   6.45277275e-01],
+       [  5.57242538e-01,   4.73307585e-02,   6.43443250e-01],
+       [  5.62738096e-01,   5.15448092e-02,   6.41509389e-01],
+       [  5.68201372e-01,   5.57776706e-02,   6.39477440e-01],
+       [  5.73631859e-01,   6.00281369e-02,   6.37348841e-01],
+       [  5.79028682e-01,   6.42955547e-02,   6.35126108e-01],
+       [  5.84391137e-01,   6.85790261e-02,   6.32811608e-01],
+       [  5.89718606e-01,   7.28775875e-02,   6.30407727e-01],
+       [  5.95010505e-01,   7.71902878e-02,   6.27916992e-01],
+       [  6.00266283e-01,   8.15161895e-02,   6.25342058e-01],
+       [  6.05485428e-01,   8.58543713e-02,   6.22685703e-01],
+       [  6.10667469e-01,   9.02039303e-02,   6.19950811e-01],
+       [  6.15811974e-01,   9.45639838e-02,   6.17140367e-01],
+       [  6.20918555e-01,   9.89336721e-02,   6.14257440e-01],
+       [  6.25986869e-01,   1.03312160e-01,   6.11305174e-01],
+       [  6.31016615e-01,   1.07698641e-01,   6.08286774e-01],
+       [  6.36007543e-01,   1.12092335e-01,   6.05205491e-01],
+       [  6.40959444e-01,   1.16492495e-01,   6.02064611e-01],
+       [  6.45872158e-01,   1.20898405e-01,   5.98867442e-01],
+       [  6.50745571e-01,   1.25309384e-01,   5.95617300e-01],
+       [  6.55579615e-01,   1.29724785e-01,   5.92317494e-01],
+       [  6.60374266e-01,   1.34143997e-01,   5.88971318e-01],
+       [  6.65129493e-01,   1.38566428e-01,   5.85582301e-01],
+       [  6.69845385e-01,   1.42991540e-01,   5.82153572e-01],
+       [  6.74522060e-01,   1.47418835e-01,   5.78688247e-01],
+       [  6.79159664e-01,   1.51847851e-01,   5.75189431e-01],
+       [  6.83758384e-01,   1.56278163e-01,   5.71660158e-01],
+       [  6.88318440e-01,   1.60709387e-01,   5.68103380e-01],
+       [  6.92840088e-01,   1.65141174e-01,   5.64521958e-01],
+       [  6.97323615e-01,   1.69573215e-01,   5.60918659e-01],
+       [  7.01769334e-01,   1.74005236e-01,   5.57296144e-01],
+       [  7.06177590e-01,   1.78437000e-01,   5.53656970e-01],
+       [  7.10548747e-01,   1.82868306e-01,   5.50003579e-01],
+       [  7.14883195e-01,   1.87298986e-01,   5.46338299e-01],
+       [  7.19181339e-01,   1.91728906e-01,   5.42663338e-01],
+       [  7.23443604e-01,   1.96157962e-01,   5.38980786e-01],
+       [  7.27670428e-01,   2.00586086e-01,   5.35292612e-01],
+       [  7.31862231e-01,   2.05013174e-01,   5.31600995e-01],
+       [  7.36019424e-01,   2.09439071e-01,   5.27908434e-01],
+       [  7.40142557e-01,   2.13863965e-01,   5.24215533e-01],
+       [  7.44232102e-01,   2.18287899e-01,   5.20523766e-01],
+       [  7.48288533e-01,   2.22710942e-01,   5.16834495e-01],
+       [  7.52312321e-01,   2.27133187e-01,   5.13148963e-01],
+       [  7.56303937e-01,   2.31554749e-01,   5.09468305e-01],
+       [  7.60263849e-01,   2.35975765e-01,   5.05793543e-01],
+       [  7.64192516e-01,   2.40396394e-01,   5.02125599e-01],
+       [  7.68090391e-01,   2.44816813e-01,   4.98465290e-01],
+       [  7.71957916e-01,   2.49237220e-01,   4.94813338e-01],
+       [  7.75795522e-01,   2.53657797e-01,   4.91170517e-01],
+       [  7.79603614e-01,   2.58078397e-01,   4.87539124e-01],
+       [  7.83382636e-01,   2.62499662e-01,   4.83917732e-01],
+       [  7.87132978e-01,   2.66921859e-01,   4.80306702e-01],
+       [  7.90855015e-01,   2.71345267e-01,   4.76706319e-01],
+       [  7.94549101e-01,   2.75770179e-01,   4.73116798e-01],
+       [  7.98215577e-01,   2.80196901e-01,   4.69538286e-01],
+       [  8.01854758e-01,   2.84625750e-01,   4.65970871e-01],
+       [  8.05466945e-01,   2.89057057e-01,   4.62414580e-01],
+       [  8.09052419e-01,   2.93491117e-01,   4.58869577e-01],
+       [  8.12611506e-01,   2.97927865e-01,   4.55337565e-01],
+       [  8.16144382e-01,   3.02368130e-01,   4.51816385e-01],
+       [  8.19651255e-01,   3.06812282e-01,   4.48305861e-01],
+       [  8.23132309e-01,   3.11260703e-01,   4.44805781e-01],
+       [  8.26587706e-01,   3.15713782e-01,   4.41315901e-01],
+       [  8.30017584e-01,   3.20171913e-01,   4.37835947e-01],
+       [  8.33422053e-01,   3.24635499e-01,   4.34365616e-01],
+       [  8.36801237e-01,   3.29104836e-01,   4.30905052e-01],
+       [  8.40155276e-01,   3.33580106e-01,   4.27454836e-01],
+       [  8.43484103e-01,   3.38062109e-01,   4.24013059e-01],
+       [  8.46787726e-01,   3.42551272e-01,   4.20579333e-01],
+       [  8.50066132e-01,   3.47048028e-01,   4.17153264e-01],
+       [  8.53319279e-01,   3.51552815e-01,   4.13734445e-01],
+       [  8.56547103e-01,   3.56066072e-01,   4.10322469e-01],
+       [  8.59749520e-01,   3.60588229e-01,   4.06916975e-01],
+       [  8.62926559e-01,   3.65119408e-01,   4.03518809e-01],
+       [  8.66077920e-01,   3.69660446e-01,   4.00126027e-01],
+       [  8.69203436e-01,   3.74211795e-01,   3.96738211e-01],
+       [  8.72302917e-01,   3.78773910e-01,   3.93354947e-01],
+       [  8.75376149e-01,   3.83347243e-01,   3.89975832e-01],
+       [  8.78422895e-01,   3.87932249e-01,   3.86600468e-01],
+       [  8.81442916e-01,   3.92529339e-01,   3.83228622e-01],
+       [  8.84435982e-01,   3.97138877e-01,   3.79860246e-01],
+       [  8.87401682e-01,   4.01761511e-01,   3.76494232e-01],
+       [  8.90339687e-01,   4.06397694e-01,   3.73130228e-01],
+       [  8.93249647e-01,   4.11047871e-01,   3.69767893e-01],
+       [  8.96131191e-01,   4.15712489e-01,   3.66406907e-01],
+       [  8.98983931e-01,   4.20391986e-01,   3.63046965e-01],
+       [  9.01807455e-01,   4.25086807e-01,   3.59687758e-01],
+       [  9.04601295e-01,   4.29797442e-01,   3.56328796e-01],
+       [  9.07364995e-01,   4.34524335e-01,   3.52969777e-01],
+       [  9.10098088e-01,   4.39267908e-01,   3.49610469e-01],
+       [  9.12800095e-01,   4.44028574e-01,   3.46250656e-01],
+       [  9.15470518e-01,   4.48806744e-01,   3.42890148e-01],
+       [  9.18108848e-01,   4.53602818e-01,   3.39528771e-01],
+       [  9.20714383e-01,   4.58417420e-01,   3.36165582e-01],
+       [  9.23286660e-01,   4.63250828e-01,   3.32800827e-01],
+       [  9.25825146e-01,   4.68103387e-01,   3.29434512e-01],
+       [  9.28329275e-01,   4.72975465e-01,   3.26066550e-01],
+       [  9.30798469e-01,   4.77867420e-01,   3.22696876e-01],
+       [  9.33232140e-01,   4.82779603e-01,   3.19325444e-01],
+       [  9.35629684e-01,   4.87712357e-01,   3.15952211e-01],
+       [  9.37990034e-01,   4.92666544e-01,   3.12575440e-01],
+       [  9.40312939e-01,   4.97642038e-01,   3.09196628e-01],
+       [  9.42597771e-01,   5.02639147e-01,   3.05815824e-01],
+       [  9.44843893e-01,   5.07658169e-01,   3.02433101e-01],
+       [  9.47050662e-01,   5.12699390e-01,   2.99048555e-01],
+       [  9.49217427e-01,   5.17763087e-01,   2.95662308e-01],
+       [  9.51343530e-01,   5.22849522e-01,   2.92274506e-01],
+       [  9.53427725e-01,   5.27959550e-01,   2.88883445e-01],
+       [  9.55469640e-01,   5.33093083e-01,   2.85490391e-01],
+       [  9.57468770e-01,   5.38250172e-01,   2.82096149e-01],
+       [  9.59424430e-01,   5.43431038e-01,   2.78700990e-01],
+       [  9.61335930e-01,   5.48635890e-01,   2.75305214e-01],
+       [  9.63202573e-01,   5.53864931e-01,   2.71909159e-01],
+       [  9.65023656e-01,   5.59118349e-01,   2.68513200e-01],
+       [  9.66798470e-01,   5.64396327e-01,   2.65117752e-01],
+       [  9.68525639e-01,   5.69699633e-01,   2.61721488e-01],
+       [  9.70204593e-01,   5.75028270e-01,   2.58325424e-01],
+       [  9.71835007e-01,   5.80382015e-01,   2.54931256e-01],
+       [  9.73416145e-01,   5.85761012e-01,   2.51539615e-01],
+       [  9.74947262e-01,   5.91165394e-01,   2.48151200e-01],
+       [  9.76427606e-01,   5.96595287e-01,   2.44766775e-01],
+       [  9.77856416e-01,   6.02050811e-01,   2.41387186e-01],
+       [  9.79232922e-01,   6.07532077e-01,   2.38013359e-01],
+       [  9.80556344e-01,   6.13039190e-01,   2.34646316e-01],
+       [  9.81825890e-01,   6.18572250e-01,   2.31287178e-01],
+       [  9.83040742e-01,   6.24131362e-01,   2.27937141e-01],
+       [  9.84198924e-01,   6.29717516e-01,   2.24595006e-01],
+       [  9.85300760e-01,   6.35329876e-01,   2.21264889e-01],
+       [  9.86345421e-01,   6.40968508e-01,   2.17948456e-01],
+       [  9.87332067e-01,   6.46633475e-01,   2.14647532e-01],
+       [  9.88259846e-01,   6.52324832e-01,   2.11364122e-01],
+       [  9.89127893e-01,   6.58042630e-01,   2.08100426e-01],
+       [  9.89935328e-01,   6.63786914e-01,   2.04858855e-01],
+       [  9.90681261e-01,   6.69557720e-01,   2.01642049e-01],
+       [  9.91364787e-01,   6.75355082e-01,   1.98452900e-01],
+       [  9.91984990e-01,   6.81179025e-01,   1.95294567e-01],
+       [  9.92540939e-01,   6.87029567e-01,   1.92170500e-01],
+       [  9.93031693e-01,   6.92906719e-01,   1.89084459e-01],
+       [  9.93456302e-01,   6.98810484e-01,   1.86040537e-01],
+       [  9.93813802e-01,   7.04740854e-01,   1.83043180e-01],
+       [  9.94103226e-01,   7.10697814e-01,   1.80097207e-01],
+       [  9.94323596e-01,   7.16681336e-01,   1.77207826e-01],
+       [  9.94473934e-01,   7.22691379e-01,   1.74380656e-01],
+       [  9.94553260e-01,   7.28727890e-01,   1.71621733e-01],
+       [  9.94560594e-01,   7.34790799e-01,   1.68937522e-01],
+       [  9.94494964e-01,   7.40880020e-01,   1.66334918e-01],
+       [  9.94355411e-01,   7.46995448e-01,   1.63821243e-01],
+       [  9.94140989e-01,   7.53136955e-01,   1.61404226e-01],
+       [  9.93850778e-01,   7.59304390e-01,   1.59091984e-01],
+       [  9.93482190e-01,   7.65498551e-01,   1.56890625e-01],
+       [  9.93033251e-01,   7.71719833e-01,   1.54807583e-01],
+       [  9.92505214e-01,   7.77966775e-01,   1.52854862e-01],
+       [  9.91897270e-01,   7.84239120e-01,   1.51041581e-01],
+       [  9.91208680e-01,   7.90536569e-01,   1.49376885e-01],
+       [  9.90438793e-01,   7.96858775e-01,   1.47869810e-01],
+       [  9.89587065e-01,   8.03205337e-01,   1.46529128e-01],
+       [  9.88647741e-01,   8.09578605e-01,   1.45357284e-01],
+       [  9.87620557e-01,   8.15977942e-01,   1.44362644e-01],
+       [  9.86509366e-01,   8.22400620e-01,   1.43556679e-01],
+       [  9.85314198e-01,   8.28845980e-01,   1.42945116e-01],
+       [  9.84031139e-01,   8.35315360e-01,   1.42528388e-01],
+       [  9.82652820e-01,   8.41811730e-01,   1.42302653e-01],
+       [  9.81190389e-01,   8.48328902e-01,   1.42278607e-01],
+       [  9.79643637e-01,   8.54866468e-01,   1.42453425e-01],
+       [  9.77994918e-01,   8.61432314e-01,   1.42808191e-01],
+       [  9.76264977e-01,   8.68015998e-01,   1.43350944e-01],
+       [  9.74443038e-01,   8.74622194e-01,   1.44061156e-01],
+       [  9.72530009e-01,   8.81250063e-01,   1.44922913e-01],
+       [  9.70532932e-01,   8.87896125e-01,   1.45918663e-01],
+       [  9.68443477e-01,   8.94563989e-01,   1.47014438e-01],
+       [  9.66271225e-01,   9.01249365e-01,   1.48179639e-01],
+       [  9.64021057e-01,   9.07950379e-01,   1.49370428e-01],
+       [  9.61681481e-01,   9.14672479e-01,   1.50520343e-01],
+       [  9.59275646e-01,   9.21406537e-01,   1.51566019e-01],
+       [  9.56808068e-01,   9.28152065e-01,   1.52409489e-01],
+       [  9.54286813e-01,   9.34907730e-01,   1.52921158e-01],
+       [  9.51726083e-01,   9.41670605e-01,   1.52925363e-01],
+       [  9.49150533e-01,   9.48434900e-01,   1.52177604e-01],
+       [  9.46602270e-01,   9.55189860e-01,   1.50327944e-01],
+       [  9.44151742e-01,   9.61916487e-01,   1.46860789e-01],
+       [  9.41896120e-01,   9.68589814e-01,   1.40955606e-01],
+       [  9.40015097e-01,   9.75158357e-01,   1.31325517e-01]];
+   
+% A length was requested: resample the table in HSV space; otherwise the
+% stored table is returned as-is.
+if nargin >= 1
+    hsvTable = rgb2hsv(cm);
+    % Rows 153:end get +1 added to their hue so the hue column does not jump
+    % at the wrap-around (row 152 has G<B, row 153 has G>B).
+    hsvTable(153:end,1) = hsvTable(153:end,1) + 1; % hardcoded
+    srcPos = linspace(0,1,size(cm,1));
+    dstPos = linspace(0,1,m);
+    hsvResampled = interp1(srcPos, hsvTable, dstPos);
+    % Shift hues that ended up above 1 back into range before converting.
+    overflow = hsvResampled(:,1) > 1;
+    hsvResampled(overflow,1) = hsvResampled(overflow,1) - 1;
+    cm_data = hsv2rgb(hsvResampled);
+else
+    cm_data = cm;
+end
+end
 \ No newline at end of file
+function spe2envicamille(filemask, outfile,wavenumberfilename)
+%SPE2ENVICAMILLE Assemble a set of SPE files into one ENVI cube on disk.
+%
+%   spe2envicamille(filemask, outfile, wavenumberfilename) reads every file
+%   matched by dir(filemask) with readspe, orders the files by the time
+%   stamp reported by dir (used here as the acquisition order), and stores
+%   file number y as line I(:, y, :) of an X x Y x B cube, where
+%       X is size(img, 1) of one SPE file
+%       Y is the number of matched files
+%       B is size(img, 2) of one SPE file (number of bands)
+%   The cube is converted to single and passed to enviSaveRawcamille
+%   together with outfile, the header name [outfile '.hdr'] and
+%   wavenumberfilename.
+%
+%   All files are assumed to have the same size as the first one.
+
+    filelist = dir(filemask);
+    filelist = filelist(~[filelist.isdir]);     %ignore directories matched by the mask
+
+    if isempty(filelist)
+        error('spe2envicamille:noFiles', 'No files match "%s".', filemask);
+    end
+
+    %dir() returns bare file names, so rebuild the full path of each file;
+    %otherwise a mask pointing outside the current directory cannot be read
+    if isfield(filelist, 'folder')
+        folders = {filelist.folder};
+    else
+        %older MATLAB releases have no folder field: use the mask's directory
+        folders = repmat({fileparts(filemask)}, 1, numel(filelist));
+    end
+    paths = cellfun(@fullfile, folders, {filelist.name}, 'UniformOutput', false);
+
+    %sort the file order based on the time stamp reported by dir()
+    [~, id] = sort([filelist.datenum]);
+    paths = paths(id);
+
+    %get the number of files
+    Y = numel(paths);               %size of the image along Y
+
+    %load one file to determine the spectral and X-axis size
+    temp = readspe(paths{1});
+    X = size(temp, 1);              %size of the image along X
+    B = size(temp, 2);              %number of bands in the image
+
+    %create the cube
+    I = zeros(X, Y, B);
+
+    %for each line
+    for y = 1:Y
+
+        %read a SPE file
+        img = readspe(paths{y});
+
+        %turn the X x B frame into an X x 1 x B slab and store it as line y
+        I(:, y, :) = permute(img, [1 3 2]);
+    end
+ 
+    enviSaveRawcamille(single(I), outfile, [outfile '.hdr'],wavenumberfilename);
+
+    
+
+function S = stim_images2matrix(filemask)
+%STIM_IMAGES2MATRIX Load a set of images into a 3D uint8 matrix.
+%
+%   S = stim_images2matrix(filemask) reads every file returned by
+%   dir(filemask) and stores image i in S(:, :, i). RGB images are
+%   converted to grayscale when loaded, so the resulting matrix is always
+%   3D with size X x Y x Z, where:
+%       X is size(I, 1) of the first image
+%       Y is size(I, 2) of the first image
+%       Z is the number of images
+%
+%   All images are assumed to be the same size as the first one (though
+%   they do not have to be the same file format). S is allocated as uint8,
+%   so pixel values of any other class are cast to uint8 on assignment.
+
+    files = dir(filemask);
+    files = files(~[files.isdir]);      %ignore directories matched by the mask
+
+    if isempty(files)
+        error('stim_images2matrix:noFiles', 'No files match "%s".', filemask);
+    end
+
+    %the first image determines the size of every slice
+    I = imread(fullfile(files(1).folder, files(1).name));
+    X = size(I, 1);
+    Y = size(I, 2);
+    Z = length(files);
+
+    S = zeros(X, Y, Z, 'uint8');
+
+    h = waitbar(0, ['Loading ' num2str(Z) ' images...']);
+    for i = 1:Z
+        %load image number i of the list
+        I = imread(fullfile(files(i).folder, files(i).name));
+
+        %only three-channel images need converting; single-channel images
+        %are stored as they are
+        if size(I, 3) == 3
+            I = rgb2gray(I);
+        end
+
+        S(:, :, i) = I;
+        waitbar(i/Z, h);
+    end
+    close(h);
+end
+    
+
+function cm_data=viridis(m)
+%VIRIDIS Return the viridis colormap as a matrix with one RGB row per color.
+%   cm_data = viridis() returns the 256-row table defined below unchanged.
+%   cm_data = viridis(m) resamples the table to m rows; the interpolation
+%   is carried out on the HSV representation of the table.
+
+% reference table: each row is [R G B] with components in the range [0, 1]
+cm = [[ 0.26700401,  0.00487433,  0.32941519],
+       [ 0.26851048,  0.00960483,  0.33542652],
+       [ 0.26994384,  0.01462494,  0.34137895],
+       [ 0.27130489,  0.01994186,  0.34726862],
+       [ 0.27259384,  0.02556309,  0.35309303],
+       [ 0.27380934,  0.03149748,  0.35885256],
+       [ 0.27495242,  0.03775181,  0.36454323],
+       [ 0.27602238,  0.04416723,  0.37016418],
+       [ 0.2770184 ,  0.05034437,  0.37571452],
+       [ 0.27794143,  0.05632444,  0.38119074],
+       [ 0.27879067,  0.06214536,  0.38659204],
+       [ 0.2795655 ,  0.06783587,  0.39191723],
+       [ 0.28026658,  0.07341724,  0.39716349],
+       [ 0.28089358,  0.07890703,  0.40232944],
+       [ 0.28144581,  0.0843197 ,  0.40741404],
+       [ 0.28192358,  0.08966622,  0.41241521],
+       [ 0.28232739,  0.09495545,  0.41733086],
+       [ 0.28265633,  0.10019576,  0.42216032],
+       [ 0.28291049,  0.10539345,  0.42690202],
+       [ 0.28309095,  0.11055307,  0.43155375],
+       [ 0.28319704,  0.11567966,  0.43611482],
+       [ 0.28322882,  0.12077701,  0.44058404],
+       [ 0.28318684,  0.12584799,  0.44496   ],
+       [ 0.283072  ,  0.13089477,  0.44924127],
+       [ 0.28288389,  0.13592005,  0.45342734],
+       [ 0.28262297,  0.14092556,  0.45751726],
+       [ 0.28229037,  0.14591233,  0.46150995],
+       [ 0.28188676,  0.15088147,  0.46540474],
+       [ 0.28141228,  0.15583425,  0.46920128],
+       [ 0.28086773,  0.16077132,  0.47289909],
+       [ 0.28025468,  0.16569272,  0.47649762],
+       [ 0.27957399,  0.17059884,  0.47999675],
+       [ 0.27882618,  0.1754902 ,  0.48339654],
+       [ 0.27801236,  0.18036684,  0.48669702],
+       [ 0.27713437,  0.18522836,  0.48989831],
+       [ 0.27619376,  0.19007447,  0.49300074],
+       [ 0.27519116,  0.1949054 ,  0.49600488],
+       [ 0.27412802,  0.19972086,  0.49891131],
+       [ 0.27300596,  0.20452049,  0.50172076],
+       [ 0.27182812,  0.20930306,  0.50443413],
+       [ 0.27059473,  0.21406899,  0.50705243],
+       [ 0.26930756,  0.21881782,  0.50957678],
+       [ 0.26796846,  0.22354911,  0.5120084 ],
+       [ 0.26657984,  0.2282621 ,  0.5143487 ],
+       [ 0.2651445 ,  0.23295593,  0.5165993 ],
+       [ 0.2636632 ,  0.23763078,  0.51876163],
+       [ 0.26213801,  0.24228619,  0.52083736],
+       [ 0.26057103,  0.2469217 ,  0.52282822],
+       [ 0.25896451,  0.25153685,  0.52473609],
+       [ 0.25732244,  0.2561304 ,  0.52656332],
+       [ 0.25564519,  0.26070284,  0.52831152],
+       [ 0.25393498,  0.26525384,  0.52998273],
+       [ 0.25219404,  0.26978306,  0.53157905],
+       [ 0.25042462,  0.27429024,  0.53310261],
+       [ 0.24862899,  0.27877509,  0.53455561],
+       [ 0.2468114 ,  0.28323662,  0.53594093],
+       [ 0.24497208,  0.28767547,  0.53726018],
+       [ 0.24311324,  0.29209154,  0.53851561],
+       [ 0.24123708,  0.29648471,  0.53970946],
+       [ 0.23934575,  0.30085494,  0.54084398],
+       [ 0.23744138,  0.30520222,  0.5419214 ],
+       [ 0.23552606,  0.30952657,  0.54294396],
+       [ 0.23360277,  0.31382773,  0.54391424],
+       [ 0.2316735 ,  0.3181058 ,  0.54483444],
+       [ 0.22973926,  0.32236127,  0.54570633],
+       [ 0.22780192,  0.32659432,  0.546532  ],
+       [ 0.2258633 ,  0.33080515,  0.54731353],
+       [ 0.22392515,  0.334994  ,  0.54805291],
+       [ 0.22198915,  0.33916114,  0.54875211],
+       [ 0.22005691,  0.34330688,  0.54941304],
+       [ 0.21812995,  0.34743154,  0.55003755],
+       [ 0.21620971,  0.35153548,  0.55062743],
+       [ 0.21429757,  0.35561907,  0.5511844 ],
+       [ 0.21239477,  0.35968273,  0.55171011],
+       [ 0.2105031 ,  0.36372671,  0.55220646],
+       [ 0.20862342,  0.36775151,  0.55267486],
+       [ 0.20675628,  0.37175775,  0.55311653],
+       [ 0.20490257,  0.37574589,  0.55353282],
+       [ 0.20306309,  0.37971644,  0.55392505],
+       [ 0.20123854,  0.38366989,  0.55429441],
+       [ 0.1994295 ,  0.38760678,  0.55464205],
+       [ 0.1976365 ,  0.39152762,  0.55496905],
+       [ 0.19585993,  0.39543297,  0.55527637],
+       [ 0.19410009,  0.39932336,  0.55556494],
+       [ 0.19235719,  0.40319934,  0.55583559],
+       [ 0.19063135,  0.40706148,  0.55608907],
+       [ 0.18892259,  0.41091033,  0.55632606],
+       [ 0.18723083,  0.41474645,  0.55654717],
+       [ 0.18555593,  0.4185704 ,  0.55675292],
+       [ 0.18389763,  0.42238275,  0.55694377],
+       [ 0.18225561,  0.42618405,  0.5571201 ],
+       [ 0.18062949,  0.42997486,  0.55728221],
+       [ 0.17901879,  0.43375572,  0.55743035],
+       [ 0.17742298,  0.4375272 ,  0.55756466],
+       [ 0.17584148,  0.44128981,  0.55768526],
+       [ 0.17427363,  0.4450441 ,  0.55779216],
+       [ 0.17271876,  0.4487906 ,  0.55788532],
+       [ 0.17117615,  0.4525298 ,  0.55796464],
+       [ 0.16964573,  0.45626209,  0.55803034],
+       [ 0.16812641,  0.45998802,  0.55808199],
+       [ 0.1666171 ,  0.46370813,  0.55811913],
+       [ 0.16511703,  0.4674229 ,  0.55814141],
+       [ 0.16362543,  0.47113278,  0.55814842],
+       [ 0.16214155,  0.47483821,  0.55813967],
+       [ 0.16066467,  0.47853961,  0.55811466],
+       [ 0.15919413,  0.4822374 ,  0.5580728 ],
+       [ 0.15772933,  0.48593197,  0.55801347],
+       [ 0.15626973,  0.4896237 ,  0.557936  ],
+       [ 0.15481488,  0.49331293,  0.55783967],
+       [ 0.15336445,  0.49700003,  0.55772371],
+       [ 0.1519182 ,  0.50068529,  0.55758733],
+       [ 0.15047605,  0.50436904,  0.55742968],
+       [ 0.14903918,  0.50805136,  0.5572505 ],
+       [ 0.14760731,  0.51173263,  0.55704861],
+       [ 0.14618026,  0.51541316,  0.55682271],
+       [ 0.14475863,  0.51909319,  0.55657181],
+       [ 0.14334327,  0.52277292,  0.55629491],
+       [ 0.14193527,  0.52645254,  0.55599097],
+       [ 0.14053599,  0.53013219,  0.55565893],
+       [ 0.13914708,  0.53381201,  0.55529773],
+       [ 0.13777048,  0.53749213,  0.55490625],
+       [ 0.1364085 ,  0.54117264,  0.55448339],
+       [ 0.13506561,  0.54485335,  0.55402906],
+       [ 0.13374299,  0.54853458,  0.55354108],
+       [ 0.13244401,  0.55221637,  0.55301828],
+       [ 0.13117249,  0.55589872,  0.55245948],
+       [ 0.1299327 ,  0.55958162,  0.55186354],
+       [ 0.12872938,  0.56326503,  0.55122927],
+       [ 0.12756771,  0.56694891,  0.55055551],
+       [ 0.12645338,  0.57063316,  0.5498411 ],
+       [ 0.12539383,  0.57431754,  0.54908564],
+       [ 0.12439474,  0.57800205,  0.5482874 ],
+       [ 0.12346281,  0.58168661,  0.54744498],
+       [ 0.12260562,  0.58537105,  0.54655722],
+       [ 0.12183122,  0.58905521,  0.54562298],
+       [ 0.12114807,  0.59273889,  0.54464114],
+       [ 0.12056501,  0.59642187,  0.54361058],
+       [ 0.12009154,  0.60010387,  0.54253043],
+       [ 0.11973756,  0.60378459,  0.54139999],
+       [ 0.11951163,  0.60746388,  0.54021751],
+       [ 0.11942341,  0.61114146,  0.53898192],
+       [ 0.11948255,  0.61481702,  0.53769219],
+       [ 0.11969858,  0.61849025,  0.53634733],
+       [ 0.12008079,  0.62216081,  0.53494633],
+       [ 0.12063824,  0.62582833,  0.53348834],
+       [ 0.12137972,  0.62949242,  0.53197275],
+       [ 0.12231244,  0.63315277,  0.53039808],
+       [ 0.12344358,  0.63680899,  0.52876343],
+       [ 0.12477953,  0.64046069,  0.52706792],
+       [ 0.12632581,  0.64410744,  0.52531069],
+       [ 0.12808703,  0.64774881,  0.52349092],
+       [ 0.13006688,  0.65138436,  0.52160791],
+       [ 0.13226797,  0.65501363,  0.51966086],
+       [ 0.13469183,  0.65863619,  0.5176488 ],
+       [ 0.13733921,  0.66225157,  0.51557101],
+       [ 0.14020991,  0.66585927,  0.5134268 ],
+       [ 0.14330291,  0.66945881,  0.51121549],
+       [ 0.1466164 ,  0.67304968,  0.50893644],
+       [ 0.15014782,  0.67663139,  0.5065889 ],
+       [ 0.15389405,  0.68020343,  0.50417217],
+       [ 0.15785146,  0.68376525,  0.50168574],
+       [ 0.16201598,  0.68731632,  0.49912906],
+       [ 0.1663832 ,  0.69085611,  0.49650163],
+       [ 0.1709484 ,  0.69438405,  0.49380294],
+       [ 0.17570671,  0.6978996 ,  0.49103252],
+       [ 0.18065314,  0.70140222,  0.48818938],
+       [ 0.18578266,  0.70489133,  0.48527326],
+       [ 0.19109018,  0.70836635,  0.48228395],
+       [ 0.19657063,  0.71182668,  0.47922108],
+       [ 0.20221902,  0.71527175,  0.47608431],
+       [ 0.20803045,  0.71870095,  0.4728733 ],
+       [ 0.21400015,  0.72211371,  0.46958774],
+       [ 0.22012381,  0.72550945,  0.46622638],
+       [ 0.2263969 ,  0.72888753,  0.46278934],
+       [ 0.23281498,  0.73224735,  0.45927675],
+       [ 0.2393739 ,  0.73558828,  0.45568838],
+       [ 0.24606968,  0.73890972,  0.45202405],
+       [ 0.25289851,  0.74221104,  0.44828355],
+       [ 0.25985676,  0.74549162,  0.44446673],
+       [ 0.26694127,  0.74875084,  0.44057284],
+       [ 0.27414922,  0.75198807,  0.4366009 ],
+       [ 0.28147681,  0.75520266,  0.43255207],
+       [ 0.28892102,  0.75839399,  0.42842626],
+       [ 0.29647899,  0.76156142,  0.42422341],
+       [ 0.30414796,  0.76470433,  0.41994346],
+       [ 0.31192534,  0.76782207,  0.41558638],
+       [ 0.3198086 ,  0.77091403,  0.41115215],
+       [ 0.3277958 ,  0.77397953,  0.40664011],
+       [ 0.33588539,  0.7770179 ,  0.40204917],
+       [ 0.34407411,  0.78002855,  0.39738103],
+       [ 0.35235985,  0.78301086,  0.39263579],
+       [ 0.36074053,  0.78596419,  0.38781353],
+       [ 0.3692142 ,  0.78888793,  0.38291438],
+       [ 0.37777892,  0.79178146,  0.3779385 ],
+       [ 0.38643282,  0.79464415,  0.37288606],
+       [ 0.39517408,  0.79747541,  0.36775726],
+       [ 0.40400101,  0.80027461,  0.36255223],
+       [ 0.4129135 ,  0.80304099,  0.35726893],
+       [ 0.42190813,  0.80577412,  0.35191009],
+       [ 0.43098317,  0.80847343,  0.34647607],
+       [ 0.44013691,  0.81113836,  0.3409673 ],
+       [ 0.44936763,  0.81376835,  0.33538426],
+       [ 0.45867362,  0.81636288,  0.32972749],
+       [ 0.46805314,  0.81892143,  0.32399761],
+       [ 0.47750446,  0.82144351,  0.31819529],
+       [ 0.4870258 ,  0.82392862,  0.31232133],
+       [ 0.49661536,  0.82637633,  0.30637661],
+       [ 0.5062713 ,  0.82878621,  0.30036211],
+       [ 0.51599182,  0.83115784,  0.29427888],
+       [ 0.52577622,  0.83349064,  0.2881265 ],
+       [ 0.5356211 ,  0.83578452,  0.28190832],
+       [ 0.5455244 ,  0.83803918,  0.27562602],
+       [ 0.55548397,  0.84025437,  0.26928147],
+       [ 0.5654976 ,  0.8424299 ,  0.26287683],
+       [ 0.57556297,  0.84456561,  0.25641457],
+       [ 0.58567772,  0.84666139,  0.24989748],
+       [ 0.59583934,  0.84871722,  0.24332878],
+       [ 0.60604528,  0.8507331 ,  0.23671214],
+       [ 0.61629283,  0.85270912,  0.23005179],
+       [ 0.62657923,  0.85464543,  0.22335258],
+       [ 0.63690157,  0.85654226,  0.21662012],
+       [ 0.64725685,  0.85839991,  0.20986086],
+       [ 0.65764197,  0.86021878,  0.20308229],
+       [ 0.66805369,  0.86199932,  0.19629307],
+       [ 0.67848868,  0.86374211,  0.18950326],
+       [ 0.68894351,  0.86544779,  0.18272455],
+       [ 0.69941463,  0.86711711,  0.17597055],
+       [ 0.70989842,  0.86875092,  0.16925712],
+       [ 0.72039115,  0.87035015,  0.16260273],
+       [ 0.73088902,  0.87191584,  0.15602894],
+       [ 0.74138803,  0.87344918,  0.14956101],
+       [ 0.75188414,  0.87495143,  0.14322828],
+       [ 0.76237342,  0.87642392,  0.13706449],
+       [ 0.77285183,  0.87786808,  0.13110864],
+       [ 0.78331535,  0.87928545,  0.12540538],
+       [ 0.79375994,  0.88067763,  0.12000532],
+       [ 0.80418159,  0.88204632,  0.11496505],
+       [ 0.81457634,  0.88339329,  0.11034678],
+       [ 0.82494028,  0.88472036,  0.10621724],
+       [ 0.83526959,  0.88602943,  0.1026459 ],
+       [ 0.84556056,  0.88732243,  0.09970219],
+       [ 0.8558096 ,  0.88860134,  0.09745186],
+       [ 0.86601325,  0.88986815,  0.09595277],
+       [ 0.87616824,  0.89112487,  0.09525046],
+       [ 0.88627146,  0.89237353,  0.09537439],
+       [ 0.89632002,  0.89361614,  0.09633538],
+       [ 0.90631121,  0.89485467,  0.09812496],
+       [ 0.91624212,  0.89609127,  0.1007168 ],
+       [ 0.92610579,  0.89732977,  0.10407067],
+       [ 0.93590444,  0.8985704 ,  0.10813094],
+       [ 0.94563626,  0.899815  ,  0.11283773],
+       [ 0.95529972,  0.90106534,  0.11812832],
+       [ 0.96489353,  0.90232311,  0.12394051],
+       [ 0.97441665,  0.90358991,  0.13021494],
+       [ 0.98386829,  0.90486726,  0.13689671],
+       [ 0.99324789,  0.90615657,  0.1439362 ]];
+
+if nargin < 1
+    % no size requested: hand back the table unchanged
+    cm_data = cm;
+else
+    % resample to m rows: convert the table to HSV, interpolate every column
+    % over a normalized [0, 1] position axis, then convert back to RGB
+    hsv=rgb2hsv(cm);
+    cm_data=interp1(linspace(0,1,size(cm,1)),hsv,linspace(0,1,m));
+    cm_data=hsv2rgb(cm_data);
+  
+end
+end
 \ No newline at end of file
+128.23,131.48,134.74,138,141.25,144.5,147.75,151,154.25,157.5,160.74,163.98,167.23,170.47,173.71,176.95,180.18,183.42,186.65,189.88,193.11,196.34,199.57,202.8,206.02,209.25,212.47,215.69,218.91,222.13,225.34,228.56,231.77,234.98,238.19,241.4,244.61,247.82,251.02,254.23,257.43,260.63,263.83,267.03,270.22,273.42,276.61,279.81,283,286.19,289.37,292.56,295.75,298.93,302.11,305.29,308.47,311.65,314.83,318,321.18,324.35,327.52,330.69,333.86,337.02,340.19,343.35,346.51,349.68,352.84,355.99,359.15,362.31,365.46,368.61,371.76,374.91,378.06,381.21,384.35,387.5,390.64,393.78,396.92,400.06,403.19,406.33,409.46,412.6,415.73,418.86,421.99,425.11,428.24,431.36,434.48,437.61,440.72,443.84,446.96,450.08,453.19,456.3,459.41,462.52,465.63,468.74,471.84,474.95,478.05,481.15,484.25,487.35,490.45,493.54,496.64,499.73,502.82,505.91,509,512.08,515.17,518.25,521.34,524.42,527.5,530.58,533.65,536.73,539.8,542.88,545.95,549.02,552.08,555.15,558.22,561.28,564.34,567.41,570.47,573.52,576.58,579.64,582.69,585.74,588.79,591.84,594.89,597.94,600.99,604.03,607.07,610.11,613.15,616.19,619.23,622.26,625.3,628.33,631.36,634.39,637.42,640.45,643.47,646.5,649.52,652.54,655.56,658.58,661.6,664.61,667.63,670.64,673.65,676.66,679.67,682.68,685.68,688.69,691.69,694.69,697.69,700.69,703.69,706.68,709.68,712.67,715.66,718.65,721.64,724.63,727.62,730.6,733.58,736.56,739.54,742.52,745.5,748.48,751.45,754.42,757.39,760.36,763.33,766.3,769.27,772.23,775.19,778.16,781.12,784.07,787.03,789.99,792.94,795.9,798.85,801.8,804.75,807.69,810.64,813.58,816.53,819.47,822.41,825.35,828.28,831.22,834.16,837.09,840.02,842.95,845.88,848.81,851.73,854.66,857.58,860.5,863.42,866.34,869.26,872.17,875.09,878,880.91,883.82,886.73,889.64,892.55,895.45,898.35,901.26,904.16,907.06,909.95,912.85,915.74,918.64,921.53,924.42,927.31,930.2,933.08,935.97,938.85,941.73,944.61,947.49,950.37,953.24,956.12,958.99,961.86,964.74,967.6,970.47,973.34,976.2,979.07,981.93,984.79,987.65,990.5,993.36,996.22,999.07,1001.9,1004.8,1007.6,1010.5,1013.3,10
16.2,1019,1021.8,1024.7,1027.5,1030.4,1033.2,1036,1038.9,1041.7,1044.5,1047.4,1050.2,1053,1055.8,1058.7,1061.5,1064.3,1067.1,1069.9,1072.8,1075.6,1078.4,1081.2,1084,1086.8,1089.6,1092.4,1095.2,1098,1100.8,1103.6,1106.4,1109.2,1112,1114.8,1117.6,1120.4,1123.2,1126,1128.8,1131.6,1134.4,1137.1,1139.9,1142.7,1145.5,1148.3,1151,1153.8,1156.6,1159.4,1162.1,1164.9,1167.7,1170.4,1173.2,1176,1178.7,1181.5,1184.2,1187,1189.8,1192.5,1195.3,1198,1200.8,1203.5,1206.3,1209,1211.8,1214.5,1217.2,1220,1222.7,1225.5,1228.2,1230.9,1233.7,1236.4,1239.1,1241.9,1244.6,1247.3,1250,1252.8,1255.5,1258.2,1260.9,1263.6,1266.4,1269.1,1271.8,1274.5,1277.2,1279.9,1282.6,1285.3,1288,1290.7,1293.4,1296.1,1298.8,1301.5,1304.2,1306.9,1309.6,1312.3,1315,1317.7,1320.4,1323.1,1325.8,1328.4,1331.1,1333.8,1336.5,1339.2,1341.8,1344.5,1347.2,1349.9,1352.5,1355.2,1357.9,1360.5,1363.2,1365.9,1368.5,1371.2,1373.9,1376.5,1379.2,1381.8,1384.5,1387.1,1389.8,1392.4,1395.1,1397.7,1400.4,1403,1405.7,1408.3,1410.9,1413.6,1416.2,1418.9,1421.5,1424.1,1426.7,1429.4,1432,1434.6,1437.3,1439.9,1442.5,1445.1,1447.7,1450.4,1453,1455.6,1458.2,1460.8,1463.4,1466,1468.7,1471.3,1473.9,1476.5,1479.1,1481.7,1484.3,1486.9,1489.5,1492.1,1494.7,1497.3,1499.8,1502.4,1505,1507.6,1510.2,1512.8,1515.4,1517.9,1520.5,1523.1,1525.7,1528.3,1530.8,1533.4,1536,1538.5,1541.1,1543.7,1546.2,1548.8,1551.4,1553.9,1556.5,1559.1,1561.6,1564.2,1566.7,1569.3,1571.8,1574.4,1576.9,1579.5,1582,1584.6,1587.1,1589.6,1592.2,1594.7,1597.3,1599.8,1602.3,1604.9,1607.4,1609.9,1612.5,1615,1617.5,1620,1622.6,1625.1,1627.6,1630.1,1632.6,1635.1,1637.7,1640.2,1642.7,1645.2,1647.7,1650.2,1652.7,1655.2,1657.7,1660.2,1662.7,1665.2,1667.7,1670.2,1672.7,1675.2,1677.7,1680.2,1682.7,1685.2,1687.6,1690.1,1692.6,1695.1,1697.6,1700.1,1702.5,1705,1707.5,1710,1712.4,1714.9,1717.4,1719.8,1722.3,1724.8,1727.2,1729.7,1732.1,1734.6,1737.1,1739.5,1742,1744.4,1746.9,1749.3,1751.8,1754.2,1756.7,1759.1,1761.6,1764,1766.4,1768.9,1771.3,1773.8,1776.2,1778.6,1781.1,1783.5,1785.9,1788.3,17
90.8,1793.2,1795.6,1798,1800.5,1802.9,1805.3,1807.7,1810.1,1812.5,1815,1817.4,1819.8,1822.2,1824.6,1827,1829.4,1831.8,1834.2,1836.6,1839,1841.4,1843.8,1846.2,1848.6,1851,1853.3,1855.7,1858.1,1860.5,1862.9,1865.3,1867.7,1870,1872.4,1874.8,1877.2,1879.5,1881.9,1884.3,1886.6,1889,1891.4,1893.7,1896.1,1898.5,1900.8,1903.2,1905.5,1907.9,1910.3,1912.6,1915,1917.3,1919.7,1922,1924.4,1926.7,1929,1931.4,1933.7,1936.1,1938.4,1940.7,1943.1,1945.4,1947.7,1950.1,1952.4,1954.7,1957.1,1959.4,1961.7,1964,1966.3,1968.7,1971,1973.3,1975.6,1977.9,1980.2,1982.6,1984.9,1987.2,1989.5,1991.8,1994.1,1996.4,1998.7,2001,2003.3,2005.6,2007.9,2010.2,2012.5,2014.8,2017,2019.3,2021.6,2023.9,2026.2,2028.5,2030.7,2033,2035.3,2037.6,2039.9,2042.1,2044.4,2046.7,2048.9,2051.2,2053.5,2055.7,2058,2060.3,2062.5,2064.8,2067,2069.3,2071.6,2073.8,2076.1,2078.3,2080.6,2082.8,2085.1,2087.3,2089.5,2091.8,2094,2096.3,2098.5,2100.7,2103,2105.2,2107.4,2109.7,2111.9,2114.1,2116.4,2118.6,2120.8,2123,2125.3,2127.5,2129.7,2131.9,2134.1,2136.3,2138.6,2140.8,2143,2145.2,2147.4,2149.6,2151.8,2154,2156.2,2158.4,2160.6,2162.8,2165,2167.2,2169.4,2171.6,2173.8,2175.9,2178.1,2180.3,2182.5,2184.7,2186.9,2189,2191.2,2193.4,2195.6,2197.7,2199.9,2202.1,2204.3,2206.4,2208.6,2210.8,2212.9,2215.1,2217.3,2219.4,2221.6,2223.7,2225.9,2228,2230.2,2232.3,2234.5,2236.6,2238.8,2240.9,2243.1,2245.2,2247.4,2249.5,2251.6,2253.8,2255.9,2258,2260.2,2262.3,2264.4,2266.6,2268.7,2270.8,2272.9,2275.1,2277.2,2279.3,2281.4,2283.5,2285.7,2287.8,2289.9,2292,2294.1,2296.2,2298.3,2300.4,2302.5,2304.6,2306.7,2308.8,2310.9,2313,2315.1,2317.2,2319.3,2321.4,2323.5,2325.6,2327.7,2329.8,2331.8,2333.9,2336,2338.1,2340.2,2342.2,2344.3,2346.4,2348.5,2350.5,2352.6,2354.7,2356.7,2358.8,2360.9,2362.9,2365,2367.1,2369.1,2371.2,2373.2,2375.3,2377.3,2379.4,2381.4,2383.5,2385.5,2387.6,2389.6,2391.7,2393.7,2395.7,2397.8,2399.8,2401.9,2403.9,2405.9,2408,2410,2412,2414,2416.1,2418.1,2420.1,2422.1,2424.2,2426.2,2428.2,2430.2,2432.2,2434.2,2436.2,2438.3,2440.3,2442.3,2444.
3,2446.3,2448.3,2450.3,2452.3,2454.3,2456.3,2458.3,2460.3,2462.3,2464.3,2466.3,2468.2,2470.2,2472.2,2474.2,2476.2,2478.2,2480.1,2482.1,2484.1,2486.1,2488,2490,2492,2494,2495.9,2497.9,2499.9,2501.8,2503.8,2505.8,2507.7,2509.7,2511.6,2513.6,2515.5,2517.5,2519.4,2521.4,2523.3,2525.3,2527.2,2529.2,2531.1,2533.1,2535,2537,2538.9,2540.8,2542.8,2544.7,2546.6,2548.6,2550.5,2552.4,2554.3,2556.3,2558.2,2560.1,2562,2563.9,2565.9,2567.8,2569.7,2571.6,2573.5,2575.4,2577.3,2579.2,2581.2,2583.1,2585,2586.9,2588.8,2590.7,2592.6,2594.5,2596.4,2598.2,2600.1,2602,2603.9,2605.8,2607.7,2609.6,2611.5,2613.3,2615.2,2617.1,2619,2620.9,2622.7,2624.6,2626.5,2628.3,2630.2,2632.1,2634,2635.8,2637.7,2639.5,2641.4,2643.3,2645.1,2647,2648.8,2650.7,2652.5,2654.4,2656.2,2658.1,2659.9,2661.8,2663.6,2665.5,2667.3,2669.1,2671,2672.8,2674.7,2676.5,2678.3,2680.1,2682,2683.8,2685.6,2687.5,2689.3,2691.1,2692.9,2694.7,2696.6,2698.4,2700.2,2702,2703.8,2705.6,2707.4,2709.2,2711.1,2712.9,2714.7,2716.5,2718.3,2720.1,2721.9,2723.7,2725.5,2727.2,2729,2730.8,2732.6,2734.4,2736.2,2738,2739.8,2741.5,2743.3,2745.1,2746.9,2748.7,2750.4,2752.2,2754,2755.8,2757.5,2759.3,2761.1,2762.8,2764.6,2766.3,2768.1,2769.9,2771.6,2773.4,2775.1,2776.9,2778.6,2780.4,2782.1,2783.9,2785.6,2787.4,2789.1,2790.9,2792.6,2794.4,2796.1,2797.8,2799.6,2801.3,2803,2804.8,2806.5,2808.2,2809.9,2811.7,2813.4,2815.1,2816.8,2818.6,2820.3,2822,2823.7,2825.4,2827.1,2828.8,2830.6,2832.3,2834,2835.7,2837.4,2839.1,2840.8,2842.5,2844.2,2845.9,2847.6,2849.3,2851,2852.7,2854.3,2856,2857.7,2859.4,2861.1,2862.8,2864.5,2866.1,2867.8,2869.5,2871.2,2872.8,2874.5,2876.2,2877.8,2879.5,2881.2,2882.8,2884.5,2886.2,2887.8,2889.5,2891.2,2892.8,2894.5,2896.1,2897.8,2899.4,2901.1,2902.7,2904.4,2906,2907.7,2909.3,2910.9,2912.6,2914.2,2915.9,2917.5,2919.1,2920.8,2922.4,2924,2925.7,2927.3,2928.9,2930.5,2932.2,2933.8,2935.4,2937,2938.6,2940.2,2941.9,2943.5,2945.1,2946.7,2948.3,2949.9,2951.5,2953.1,2954.7,2956.3,2957.9,2959.5,2961.1,2962.7,2964.3,2965.9,2967.5,2969.1,2970.7
,2972.3,2973.8,2975.4,2977,2978.6,2980.2,2981.8,2983.3,2984.9,2986.5,2988.1,2989.6,2991.2,2992.8,2994.3,2995.9,2997.5,2999,3000.6,3002.1,3003.7,3005.3,3006.8,3008.4,3009.9,3011.5,3013,3014.6,3016.1,3017.7,3019.2,3020.8,3022.3,3023.8,3025.4,3026.9,3028.5,3030,3031.5,3033.1,3034.6,3036.1,3037.6,3039.2,3040.7,3042.2,3043.7,3045.3,3046.8,3048.3,3049.8,3051.3,3052.8,3054.3,3055.9,3057.4,3058.9,3060.4,3061.9,3063.4,3064.9,3066.4,3067.9,3069.4,3070.9,3072.4,3073.9,3075.4,3076.8,3078.3,3079.8,3081.3,3082.8,3084.3,3085.8,3087.2,3088.7,3090.2,3091.7,3093.1,3094.6,3096.1,3097.6,3099,3100.5,3102,3103.4,3104.9,3106.4,3107.8,3109.3,3110.7,3112.2,3113.6,3115.1,3116.6,3118,3119.5,3120.9,3122.3,3123.8,3125.2,3126.7,3128.1,3129.6,3131,3132.4,3133.9,3135.3,3136.7,3138.2,3139.6,3141,3142.4,3143.9,3145.3,3146.7,3148.1,3149.6,3151,3152.4,3153.8,3155.2,3156.6,3158,3159.4,3160.9,3162.3,3163.7,3165.1,3166.5,3167.9,3169.3,3170.7,3172.1,3173.5,3174.9,3176.2,3177.6,3179,3180.4,3181.8,3183.2,3184.6,3185.9,3187.3,3188.7,3190.1,3191.5,3192.8,3194.2,3195.6,3197,3198.3,3199.7,3201.1,3202.4,3203.8,3205.1,3206.5,3207.9,3209.2,3210.6,3211.9,3213.3,3214.6
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Jul 23 16:04:33 2017
+
+@author: david
+"""
+
+import numpy
+import colorsys
+import sklearn
+import sklearn.metrics
+import scipy
+import scipy.misc
+import envi
+import hyperspectral
+import random
+import progressbar
+import matplotlib.pyplot as plt
+
+#generate N qualitative colors and return the value for color c
+def qualcolor(c, N):
+    dN = numpy.ceil(numpy.sqrt(N)).astype(numpy.int32)
+    h = c/N
+    
+    sp = c/N * 2 * numpy.pi * dN + numpy.pi/2
+    s = numpy.sin(sp) * 0.25 + 0.75
+    
+    vp = c/N * 2 * numpy.pi * dN
+    v = numpy.sin(vp) * 0.25 + 0.75
+    
+    rgb = numpy.array(colorsys.hsv_to_rgb(h, s, v))
+    return rgb * 255
+
+#generate a 2D color class map using a stack of binary class images
+#input: C is a C x Y x X binary image
+#output: an RGB color image with a unique color for each class
+def class2color(C):
+    
+    #determine the number of classes
+    nc = C.shape[0]
+    
+    s = C.shape[1:]
+    s = numpy.append(s, 3)
+
+    #generate an RGB image
+    RGB = numpy.zeros(s, dtype=numpy.ubyte)
+    
+    #for each class
+    for c in range(0, nc):
+        color = qualcolor(c, nc)
+        RGB[C[c, ...], :] = color
+    
+    return RGB
+
+#create a function that loads a set of class images as a stack of binary masks
+#input: list of class image names
+#output: C x Y x X binary image specifying class/pixel membership
+#example: image2class(("class_coll.bmp", "class_epith.bmp"))
+def filenames2class(masks):
+    #get num of mask file names
+    num_masks = len(masks)
+
+    if num_masks == 0:
+        print("ERROR: mask filenames not provided")
+        print("Usage example: image2class(('class_coll.bmp', 'class_epith.bmp'))")
+        return
+
+    classimages = []
+    bar = progressbar.ProgressBar(max_value=num_masks)
+    for m in range(0, num_masks):
+        img = scipy.misc.imread(masks[m], flatten=True).astype(numpy.bool)
+        classimages.append(img)
+        bar.update(m+1)
+
+    result = numpy.stack(classimages)
+    sum_images = numpy.sum(result.astype(numpy.uint32), 0)
+
+    #identify and remove redundant pixels
+    bad_idx = sum_images > 1
+    result[:, bad_idx] = 0
+
+    return result
+
+
+#create a class mask stack from an C x Y x X probability image
+#input: C x Y x X image giving the probability P(c |x,y)
+#output: C x Y x X binary class image
+def prob2class(prob_image):
+    class_image = numpy.zeros(prob_image.shape, dtype=numpy.bool)
+    #get nonzero indices
+    nnz_idx = numpy.transpose(numpy.nonzero(numpy.sum(prob_image, axis=0)))
+    
+    #set pixel corresponding to max probability to 1
+    for idx in nnz_idx:
+        idx_max_prob = numpy.argmax(prob_image[:, idx[0], idx[1]])
+        class_image[idx_max_prob, idx[0], idx[1]] = 1
+
+    return class_image
+
+#calculate an ROC curve given a probability image and mask of "True" values
+#input:
+#       P is a Y x X probability image specifying P(c | x,y)
+#       t_vals is a Y x X binary image specifying points where x,y = c
+#       mask is a mask specifying all pixels to be considered (positives and negatives)
+#           use this mask to limit analysis to regions of the image that have been classified
+#output: fpr, tpr, thresholds
+#       fpr is the false-positive rate (x-axis of an ROC curve)
+#       tpr is the true-positive rate (y-axis of an ROC curve)
+#       thresholds stores the threshold associated with each point on the ROC curve
+#
+#note: the AUC can be calculated as auc = sklearn.metrics.auc(fpr, tpr)
+def prob2roc(P, t_vals, mask=[]):
+    
+    if not P.shape == t_vals.shape:
+        print("ERROR: the probability and mask images must be the same shape")
+        return
+    
+    #if a mask image isn't provided, create one for the entire image
+    if mask == []:
+        mask = numpy.ones(t_vals.shape, dtype=numpy.bool)
+    
+    #create masks for the positive and negative probability scores
+    mask_p = t_vals
+    mask_n = mask - mask * t_vals
+    
+    #calculate the indices for the positive and negative scores
+    idx_p = numpy.nonzero(mask_p)
+    idx_n = numpy.nonzero(mask_n)
+    
+    Pp = P[idx_p]
+    Pn = P[idx_n]
+
+    Lp = numpy.ones((Pp.shape), dtype=numpy.bool)
+    Ln = numpy.zeros((Pn.shape), dtype=numpy.bool)
+    
+    scores = numpy.concatenate((Pp, Pn))
+    labels = numpy.concatenate((Lp, Ln))
+    
+    return sklearn.metrics.roc_curve(labels, scores)
+
+#convert a label image to a C x Y x X class image
+def label2class(L, background=[]):
+    unique = numpy.unique(L)
+
+    if not background == []:                                                #if a background value is specified
+        unique = numpy.delete(unique, numpy.nonzero(unique == background))  #remove it from the label array
+    s = L.shape
+    s = numpy.append(numpy.array((len(unique))), s)
+    C = numpy.zeros(s, dtype=numpy.bool)
+    for i in range(0, len(unique)):
+        C[i, :, :] = L == unique[i]
+    return C
+
+#randomizes a given mask to include a subset of n pixels in the original
+def random_mask(M, n):
+    idx = numpy.flatnonzero(M)
+    new_idx = numpy.random.permutation(idx)
+    new_mask = numpy.zeros(M.shape, dtype=numpy.bool)
+    new_mask[numpy.unravel_index(new_idx[0:n], new_mask.shape)] = True
+    return new_mask
+
+#perform classification of an ENVI image using batch processing
+# input:    E is the ENVI object (file is assumed to be loaded)
+#           C is a classifier - anything in sklearn should work
+#           batch is the batch size
+def envi_batch_predict(E, C, batch=10000):
+
+    Fv = E.loadbatch(batch)
+    i = 0
+    Tv = []
+    plt.ion()
+    bar = progressbar.ProgressBar(max_value=numpy.count_nonzero(E.mask))
+    while not Fv == []:
+        Fv = numpy.nan_to_num(Fv)                                                     #remove infinite values        
+        if i == 0:
+            Tv = C.predict(Fv.transpose())
+        else:
+            Tv = numpy.concatenate((Tv, C.predict(Fv.transpose()).transpose()), 0)
+        tempmask = E.batchmask()
+        Lv = hyperspectral.unsift2(Tv, tempmask)
+        Cv = label2class(Lv.squeeze(), background=0)
+        RGB = class2color(Cv)
+        plt.imshow(RGB)
+        plt.pause(0.05)
+        Fv = E.loadbatch(batch)   
+        i = i + 1
+        bar.update(len(Tv))
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jul 25 16:28:37 2017
+
+@author: david
+"""
+
+import hyperspectral
+import envi
+import classify
+import numpy
+import scipy
+import scipy.misc
+import sklearn
+import sklearn.naive_bayes
+import sklearn.neural_network
+import glob
+import matplotlib.pyplot as plt
+import random
+
+def generate_stain(envifile, stainfile, maskfile="", trainmask="", N=5000, batch_size=10000, validate=True):
+    if trainmask == "":
+        E = envi.envi(envifile)
+    else:
+        mask = scipy.misc.imread(trainmask, flatten=True)
+        E = envi.envi(envifile, mask=mask)
+        
+    mask = classify.random_mask(E.mask, N)
+    scipy.misc.imsave("random.bmp", mask)
+    
+    Ft = E.loadmask(mask).transpose()
+
+    stain = numpy.rollaxis(scipy.misc.imread(stainfile), 2)
+    Tt = hyperspectral.sift2(stain, mask).transpose()
+
+    print("Training MLPRegressor...")
+    CLASS = sklearn.neural_network.MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(), random_state=1, verbose=True)
+    CLASS.fit(Ft, Tt)
+
+    if validate == False:
+        return CLASS
+    
+    print("Validating Stain...")
+    plt.ion()    
+    if not maskfile == "":
+        E.close()                                                                  #close the ENVI file
+        mask = scipy.misc.imread(maskfile, flatten=True)
+        print(numpy.count_nonzero(mask))
+        E = envi.envi(envifile, mask=mask)
+    
+    Fv = E.loadbatch(batch_size)                                                #load the first batch
+    n = 0
+    while not Fv == []:                                                         #loop until an empty batch is returned
+        if n == 0:
+            Tv = CLASS.predict(Fv.transpose()).transpose()
+        else:
+            Tv = numpy.append(Tv, CLASS.predict(Fv.transpose()).transpose(), 1)                        #append the predicted labels from this batch to those of previous batches
+        COLORS = hyperspectral.unsift2(Tv, E.batchmask())                                    #convert the matrix of class labels to a 2D array
+        RGB = numpy.rollaxis(COLORS, 0, 3).astype(numpy.ubyte)
+        plt.imshow(RGB)                                                             #display it
+        plt.pause(0.05)
+        Fv = E.loadbatch(batch_size)                                                         #load the next batch
+        n = n + 1
+    return CLASS, RGB
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jul 21 20:18:01 2017
+
+@author: david
+"""
+
+import os
+import numpy
+import scipy
+import matplotlib.pyplot as plt
+import progressbar
+import sys
+
+class envi_header:
+    """In-memory model of an ENVI header (.hdr) text file.
+
+    Constructing without a filename yields the defaults assigned by
+    initialize(); constructing with a filename starts from those defaults and
+    then parses the file with load().
+    """
+
+    #ENVI "data type" codes paired with the numpy type used to read each one
+    _TYPE_TABLE = (
+        (1, numpy.uint8),                   #ENVI bytes are unsigned 8-bit values
+        (2, numpy.int16),
+        (3, numpy.int32),
+        (4, numpy.float32),
+        (5, numpy.float64),
+        (6, numpy.complex64),
+        (9, numpy.complex128),
+        (12, numpy.uint16),
+        (13, numpy.uint32),
+        (14, numpy.int64),
+        (15, numpy.uint64),
+    )
+
+    #header keys that hold one integer, paired with the attribute that stores it
+    _INT_FIELDS = (
+        ("samples", "samples"),
+        ("lines", "lines"),
+        ("bands", "bands"),
+        ("header offset", "header_offset"),
+        ("byte order", "byte_order"),
+        ("x start", "x_start"),
+        ("y start", "y_start"),
+    )
+
+    #header keys that hold one free-text value, paired with the attribute that stores it
+    _STR_FIELDS = (
+        ("sensor type", "sensor_type"),
+        ("wavelength units", "wavelength_units"),
+    )
+
+    def __init__(self, filename = ""):
+        self.initialize()                   #start from defaults so keys missing from the file are still defined
+        if filename != "":
+            self.load(filename)
+
+    #initialization function
+    def initialize(self):
+        """Reset every header field to its default value."""
+        self.samples = int(0)
+        self.lines = int(0)
+        self.bands = int(0)
+        self.header_offset = int(0)
+        self.data_type = int(4)             #ENVI code until load() replaces it with a numpy type
+        self.interleave = "bsq"
+        self.sensor_type = "Unknown"
+        self.byte_order = int(0)
+        self.x_start = int(0)
+        self.y_start = int(0)
+        self.z_plot_titles = "Unknown, Unknown"
+        self.pixel_size = [float(0), float(0)]
+        self.pixel_size_units = "Meters"
+        self.wavelength_units = "Wavenumber"
+        self.description = "no description"
+        self.band_names = []
+        self.wavelength = []
+
+    #convert an ENVI data_type value to a numpy data type
+    def get_numpy_type(self, val):
+        """Return the numpy type for ENVI code `val`, or None if the code is not in the table."""
+        for code, nptype in self._TYPE_TABLE:
+            if val == code:
+                return nptype
+        return None
+
+    def get_envi_type(self, val):
+        """Return the ENVI code for numpy type/dtype `val`, or None if it is not in the table.
+
+        numpy.byte (signed 8-bit) is still reported as code 1, as before.
+        """
+        if val == numpy.byte:
+            return 1
+        for code, nptype in self._TYPE_TABLE:
+            if val == nptype:
+                return code
+        return None
+
+    #returns the text to the right of '=' (or the last token if there is no '=')
+    @staticmethod
+    def _value(line):
+        if '=' in line:
+            return line.split('=', 1)[1].strip()
+        return line.split()[-1].strip()
+
+    #returns the text between the last '{' and last '}' of a (possibly multi-line)
+    #entry starting at lines[li], together with the index of the entry's final line
+    @staticmethod
+    def _read_braced(lines, li):
+        parts = [lines[li]]
+        #blank lines are tolerated and the scan never runs past the end of the file
+        while not lines[li].strip().endswith('}') and li + 1 < len(lines):
+            li += 1
+            parts.append(lines[li])
+        text = ''.join(map(str.strip, parts))
+        return text[text.rindex('{') + 1 : text.rindex('}')], li
+
+    def load(self, fname):
+        """Parse the ENVI header file `fname` into this object.
+
+        Prints an error and returns without changing anything when the first
+        line is not "ENVI". Keys that are not recognised are ignored.
+        """
+        with open(fname) as f:              #the handle is released even on the early return below
+            l = f.readlines()
+        if not l or l[0].strip() != "ENVI":
+            print("ERROR: not an ENVI file")
+            return
+
+        li = 1
+        while li < len(l):
+            line = l[li]
+            int_attr = next((a for k, a in self._INT_FIELDS if line.startswith(k)), None)
+            str_attr = next((a for k, a in self._STR_FIELDS if line.startswith(k)), None)
+
+            #handle the simple conditions
+            if int_attr is not None:
+                setattr(self, int_attr, int(self._value(line)))
+            elif line.startswith("data type"):
+                self.data_type = self.get_numpy_type(int(self._value(line)))
+            elif line.startswith("interleave"):
+                self.interleave = self._value(line).lower()     #readers compare against lower-case names
+            elif str_attr is not None:                          #"wavelength units" is matched here, before "wavelength"
+                setattr(self, str_attr, self._value(line))
+
+            #handle the brace-delimited conditions
+            elif line.startswith("z plot titles"):
+                self.z_plot_titles, li = self._read_braced(l, li)
+            elif line.startswith("pixel size"):
+                text, li = self._read_braced(l, li)
+                s = text.split(',')
+                self.pixel_size = [float(s[0]), float(s[1])]
+                if len(s) > 2 and '=' in s[2]:                  #the units entry is optional
+                    self.pixel_size_units = s[2][s[2].rindex('=') + 1:].strip()
+            elif line.startswith("description"):
+                self.description, li = self._read_braced(l, li)
+            elif line.startswith("band names"):
+                names, li = self._read_braced(l, li)
+                self.band_names = list(map(str.strip, names.split(',')))
+            elif line.startswith("wavelength"):
+                waves, li = self._read_braced(l, li)
+                self.wavelength = [float(w) for w in waves.split(',') if w.strip()]
+
+            li += 1
+
+    #save an ENVI header
+    def save(self, fname):
+        """Write the header to `fname`.
+
+        The data type comes from `self.type` when set() has been called and
+        from `self.data_type` otherwise, so a header that was only loaded or
+        default-initialised can be saved as well. Band names and wavelengths
+        are written only when they are non-empty.
+        """
+        dtype = getattr(self, "type", self.data_type)
+        if isinstance(dtype, int):          #default-initialised headers hold the ENVI code itself
+            type_code = dtype
+        else:
+            type_code = self.get_envi_type(dtype)
+
+        with open(fname, "w") as f:
+            f.write("ENVI\n")
+            f.write("description = {" + self.description + "}" + "\n")
+            f.write("samples = " + str(self.samples) + "\n")
+            f.write("lines = " + str(self.lines) + "\n")
+            f.write("bands = " + str(self.bands) + "\n")
+            f.write("header offset = " + str(self.header_offset) + "\n")
+            f.write("file type = ENVI Standard" + "\n")
+            f.write("data type = " + str(type_code) + "\n")
+            f.write("interleave = " + self.interleave + "\n")
+            f.write("sensor type = " + self.sensor_type + "\n")
+            f.write("byte order = " + str(self.byte_order) + "\n")
+            f.write("x start = " + str(self.x_start) + "\n")
+            f.write("y start = " + str(self.y_start) + "\n")
+            f.write("wavelength units = " + self.wavelength_units + "\n")
+            f.write("z plot titles = {" + self.z_plot_titles + "}" + "\n")
+            if self.band_names:
+                f.write("band names = {" + ", ".join(map(str, self.band_names)) + "}" + "\n")
+            if self.wavelength:
+                f.write("wavelength = {" + ", ".join(map(str, self.wavelength)) + "}" + "\n")
+
+    #sets the properties of the header to match those of the input array
+    def set(self, A):
+        """Copy dtype and shape from array `A`, indexed as (bands, lines, samples)."""
+        self.type = A.dtype
+        self.samples = A.shape[2]
+        self.lines = A.shape[1]
+        self.bands = A.shape[0]
+
+        
+class envi:
+    """Reader for a raw ENVI binary file described by an ENVI text header.
+
+    filename    path of the binary file
+    headername  path of the header; defaults to filename + ".hdr"
+    mask        optional numpy array (lines x samples); nonzero entries mark the
+                pixels returned by loadbatch(). An empty mask selects every pixel.
+    """
+
+    def __init__(self, filename, headername = "", mask = []):
+        self.header = None                                                      #stay None if open() fails, so close() is always safe
+        self.file = None
+        self.open(filename, headername)
+        #test the type first: comparing an ndarray against [] is elementwise, not a yes/no answer
+        if isinstance(mask, numpy.ndarray):
+            self.mask = mask
+        elif mask is None or (hasattr(mask, "__len__") and len(mask) == 0):
+            if self.header is None:                                             #open() already reported the problem
+                self.mask = numpy.zeros((0, 0), dtype=bool)
+            else:
+                self.mask = numpy.ones((self.header.lines, self.header.samples), dtype=bool)
+        else:
+            print("ERROR: unrecognized mask format - expecting a boolean array")
+        self.idx = 0                                                               #initialize the batch IDX to 0 for batch reading
+
+    def open(self, filename, headername = ""):
+        """Load the header and open the binary file.
+
+        Prints an error and returns without opening anything when either file is missing.
+        """
+        if headername == "":
+            headername = filename + ".hdr"
+
+        if not os.path.isfile(filename):
+            print("ERROR: " + filename + " not found")
+            return
+        if not os.path.isfile(headername):
+            print("ERROR: " + headername + " not found")
+            return
+
+        self.close()                                                            #release a previously opened file, if any
+        self.header = envi_header(headername)
+        self.file = open(filename, "rb")
+
+    #number of bytes to skip before the image data starts
+    def _offset(self):
+        return int(self.header.header_offset)
+
+    def loadall(self):
+        """Read the whole image and return it as a (bands, lines, samples) array.
+
+        Returns None when the interleave is not bsq, bip or bil. The file
+        position is restored afterwards.
+        """
+        X = self.header.samples
+        Y = self.header.lines
+        B = self.header.bands
+
+        #load the data from the start of the image, wherever the file pointer currently is
+        prev_pos = self.file.tell()
+        self.file.seek(self._offset())
+        D = numpy.fromfile(self.file, dtype=self.header.data_type, count=X * Y * B)
+        self.file.seek(prev_pos)
+
+        if self.header.interleave == "bsq":
+            return numpy.reshape(D, (B, Y, X))
+        elif self.header.interleave == "bip":
+            D = numpy.reshape(D, (Y, X, B))
+            return numpy.rollaxis(D, 2)
+        elif self.header.interleave == "bil":
+            D = numpy.reshape(D, (Y, B, X))
+            return numpy.rollaxis(D, 1)
+
+    #loads all of the pixels where mask != 0 and returns them as a matrix
+    def loadmask(self, mask):
+        """Return a (bands, P) matrix holding the spectra of the P nonzero pixels of `mask`.
+
+        Pixels are ordered row by row. A progress bar is displayed while reading
+        and the file position is restored afterwards.
+        """
+        X = self.header.samples
+        Y = self.header.lines
+        B = self.header.bands
+
+        P = numpy.count_nonzero(mask)           #count the number of nonzero (selected) pixels in the mask
+        M = numpy.zeros((B, P), dtype=self.header.data_type)
+        type_bytes = numpy.dtype(self.header.data_type).itemsize
+        offset = self._offset()
+
+        prev_pos = self.file.tell()
+        self.file.seek(offset)
+        if self.header.interleave == "bip":
+            spectrum = numpy.zeros(B, dtype=self.header.data_type)
+            flatmask = numpy.reshape(mask, (X * Y))
+            i = numpy.flatnonzero(flatmask)
+            bar = progressbar.ProgressBar(max_value = P)
+            for p in range(0, P):
+                self.file.seek(offset + int(i[p]) * B * type_bytes)             #each pixel's spectrum is contiguous
+                self.file.readinto(spectrum)
+                M[:, p] = spectrum
+                bar.update(p+1)
+        elif self.header.interleave == "bsq":
+            band = numpy.zeros(mask.shape, dtype=self.header.data_type)
+            i = numpy.nonzero(mask)
+            bar = progressbar.ProgressBar(max_value=B)
+            for b in range(0, B):
+                self.file.seek(offset + b * X * Y * type_bytes)                 #each band is one contiguous image
+                self.file.readinto(band)
+                M[b, :] = band[i]
+                bar.update(b+1)
+        elif self.header.interleave == "bil":
+            plane = numpy.zeros((B, X), dtype=self.header.data_type)
+            p = 0
+            bar = progressbar.ProgressBar(max_value=Y)
+            for l in range(0, Y):                                               #lines are read sequentially from the offset
+                i = numpy.flatnonzero(mask[l, :])
+                self.file.readinto(plane)
+                M[:, p:p+i.shape[0]] = plane[:, i]
+                p = p + i.shape[0]
+                bar.update(l+1)
+        self.file.seek(prev_pos)
+        return M
+
+    def loadband(self, n):
+        """Return band `n` as a (lines, samples) array for bsq, bil or bip files.
+
+        The file position is restored afterwards.
+        """
+        X = self.header.samples
+        Y = self.header.lines
+        B = self.header.bands
+
+        band = numpy.zeros((Y, X), dtype=self.header.data_type)
+        type_bytes = numpy.dtype(self.header.data_type).itemsize
+        offset = self._offset()
+
+        prev_pos = self.file.tell()
+        if self.header.interleave == "bsq":
+            self.file.seek(offset + n * X * Y * type_bytes)
+            self.file.readinto(band)
+        elif self.header.interleave == "bil":
+            for y in range(0, Y):                                               #row y of band n sits inside line y's (B x X) plane
+                self.file.seek(offset + (y * B + n) * X * type_bytes)
+                self.file.readinto(band[y, :])
+        elif self.header.interleave == "bip":
+            line = numpy.zeros((X, B), dtype=self.header.data_type)
+            for y in range(0, Y):                                               #read one line of spectra, keep channel n
+                self.file.seek(offset + y * X * B * type_bytes)
+                self.file.readinto(line)
+                band[y, :] = line[:, n]
+        self.file.seek(prev_pos)
+        return band
+
+    #create a set of feature/target pairs for classification
+    #input: stack of class masks C x Y x X
+    #output: feature matrix (pixels x features), target array (pixels)
+    #   verify      currently unused; NaN and Inf values are always replaced through numpy.nan_to_num
+    def loadtrain(self, classimages, verify=True):
+
+        # get number of classes
+        C = classimages.shape[0]
+
+        F = []
+        T = []
+        for c in range(0, C):
+            print("\nLoading class " + str(c+1) + "...")
+            f = self.loadmask(classimages[c, :, :])            #load the feature matrix for class c
+            t = numpy.ones((f.shape[1])) * (c+1)         #generate a target array holding the 1-based class label
+            F.append(f)
+            T.append(t)
+
+        return numpy.nan_to_num(numpy.concatenate(F, 1).transpose()), numpy.concatenate(T)
+
+    #read a batch of data based on the mask
+    def loadbatch(self, npixels):
+        """Return the next (bands, n) batch of masked pixels, n <= npixels.
+
+        Returns [] once every masked pixel has been read. Only bip files are
+        supported; for other interleaves an error is printed (bsq, bil) and []
+        is returned so that read-until-empty loops terminate.
+        """
+        i = numpy.flatnonzero(self.mask)                                      #get the indices of valid pixels
+        if self.idx >= len(i):                                                #if all of the pixels have been read, return an empty array
+            return []
+        npixels = min(npixels, len(i) - self.idx)                        #if there aren't enough pixels, change the batch size
+        B = self.header.bands
+
+        if self.header.interleave != "bip":
+            if self.header.interleave == "bsq":
+                print("ERROR: BSQ batch loading isn't implemented yet!")
+            elif self.header.interleave == "bil":
+                print("ERROR: BIL batch loading isn't implemented yet!")
+            return []
+
+        batch = numpy.zeros((B, npixels), dtype=self.header.data_type)          #allocate space for the batch
+        pixel = numpy.zeros((B), dtype=self.header.data_type)                   #allocate space for a single pixel
+        type_bytes = numpy.dtype(self.header.data_type).itemsize                #calculate the size of a single value
+        offset = self._offset()
+        for n in range(0, npixels):                                             #for each pixel in the batch
+            self.file.seek(offset + int(i[self.idx]) * B * type_bytes)          #seek to the current pixel in the file
+            self.file.readinto(pixel)                                           #read a single pixel
+            batch[:, n] = pixel                                                 #save the pixel into the batch matrix
+            self.idx = self.idx + 1
+        return batch
+
+    #returns the current batch index
+    def getidx(self):
+        return self.idx
+
+    #returns an image of the pixels that have been read using batch loading
+    def batchmask(self):
+        #allocate a new mask
+        outmask = numpy.zeros(self.mask.shape, dtype=bool)
+
+        #copy the mask values of the pixels read so far; everything else stays zero
+        idx = self.getidx()
+        i = numpy.nonzero(self.mask)
+        outmask[i[0][0:idx], i[1][0:idx]] = self.mask[i[0][0:idx], i[1][0:idx]]
+        return outmask
+
+    def close(self):
+        """Close the binary file if one is open; safe to call repeatedly."""
+        f = getattr(self, "file", None)
+        if f is not None:
+            f.close()
+
+    def __del__(self):
+        self.close()
+
+#saves an array as an ENVI file
+def save_envi(A, fname):
+    """Save array `A`, indexed as (bands, lines, samples), as an ENVI file pair.
+
+    Writes the header to fname + ".hdr" and the raw array bytes to fname.
+    """
+
+    #create and save a header file
+    header = envi_header()
+    header.set(A)
+    header.save(fname + ".hdr")
+
+    #save the raw data; the context manager closes the file even if the write fails
+    with open(fname, "wb") as out:
+        out.write(A.tobytes())
 \ No newline at end of file
+import numpy
+import classify
+import matplotlib.pyplot as plt
+from envi import envi
+
+#directory holding the class mask images (hard-coded, machine-specific path)
+mask_path = '/home/sberisha/data/masks/'
+#pass the five class mask images to classify.image2class
+#NOTE(review): presumably returns one mask per class stacked along the first axis - confirm against classify.py
+mask_stack = classify.image2class(mask_path + "class_blood.png", mask_path + "class_coll.png", mask_path +  "class_epith.png",
+                         mask_path + "class_lymph.png", mask_path + "class_necrosis.png")
+
+#convert the mask stack with classify.classcolor2 and display the result
+color_image = classify.classcolor2(mask_stack)
+plt.imshow(color_image)
+
+#directory holding the ENVI data set (hard-coded, machine-specific path)
+data_path ='/home/sberisha/data/cnn/brc961-nfp8/envi/'
+
+#build the training pair from the ENVI file and the mask stack
+feature_matrix, target_matrix = classify.generate_training(data_path + 'brc961-nfp8-project-br1003', mask_stack)
+
+#open the ENVI file "cnn-response" (its header is looked up as "cnn-response.hdr") and read it completely
+prob_path = '/home/sberisha/data/'
+prob_envi= envi(prob_path + "cnn-response")
+prob_image = prob_envi.loadall()
+
+#convert the loaded image with classify.prob2class and display the slice at index 4 of the first axis
+class_image = classify.prob2class(prob_image)
+plt.imshow(class_image[4,:,:])
 \ No newline at end of file
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Jul 23 13:52:22 2017
+
+@author: david
+"""
+import numpy
+
+#sift a 2D hyperspectral image into a PxB matrix where P is the number of pixels and B is the number of bands
+def sift2(I, mask = []):
+    """Flatten a hyperspectral image into a matrix of pixel spectra.
+
+    I       array indexed as (bands, rows, columns)
+    mask    optional (rows, columns) array; when given, only pixels whose mask
+            value is nonzero are returned, in row-major order
+
+    Returns a (bands, pixels) matrix.
+    """
+
+    #get the shape of the input array
+    S = I.shape
+
+    #flatten the two spatial axes into one
+    M = numpy.reshape(I, (S[0], S[1] * S[2]))
+
+    #if no mask is provided, just return all pixels
+    #(the size test also works for array masks, where "mask == []" does not give a yes/no answer)
+    if mask is None or numpy.size(mask) == 0:
+        return M
+
+    #if a mask is provided, only return pixels corresponding to that mask
+    flatmask = numpy.reshape(mask, (S[1] * S[2]))
+    return M[:, numpy.flatnonzero(flatmask)]
+
+def unsift2(M, mask):
+    """Scatter a matrix of pixel values back into an image using a mask.
+
+    M       (bands, pixels) matrix, or a 1D array holding one value per pixel
+    mask    2D array; its nonzero entries receive the columns of M in order
+
+    Returns a (bands, rows, columns) array of M's dtype that is zero wherever
+    the mask is zero. A pixel-count mismatch is reported with a printed error.
+    """
+
+    dims = M.shape
+    is_vector = len(dims) == 1
+
+    #the matrix has to supply exactly one entry per masked pixel
+    expected = numpy.count_nonzero(mask)
+    supplied = dims[0] if is_vector else dims[1]
+    if supplied != expected:
+        print("ERROR: expected " + str(expected) + " pixels based on the mask but there are " + str(supplied) + " in the matrix.")
+
+    #a 1D input becomes a single-band image
+    band_count = 1 if is_vector else dims[0]
+    I = numpy.zeros((band_count, mask.shape[0], mask.shape[1]), dtype=M.dtype)
+
+    #write the values at the coordinates of the masked pixels
+    where = numpy.nonzero(mask)
+    I[:, where[0], where[1]] = M
+    return I
+
+#create a function that sifts a color image
+#input: image name, mask  
 \ No newline at end of file
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Sep 16 16:34:49 2017
+
+@author: pavel
+"""
+
+import struct
+import numpy as np
+import scipy as sp
+import networkx as nx
+import matplotlib.pyplot as plt
+import math
+import time
+import spharmonics
+
+'''
+    Definition of the Node class
+    Duplicate of the node class in network
+    Stores the physical position, outgoing edges list and incoming edges list.
+'''
+class Node:
+    """Network vertex: position `p`, outgoing edge list `o`, incoming edge list `i`."""
+
+    def __init__(self, point, outgoing, incoming):
+        #the arguments are stored as given, without copying
+        self.p, self.o, self.i = point, outgoing, incoming
+
+#    def p():
+#        return self.p
+
+'''
+    Definition of the Fiber class.
+    Duplicate of the Fiber class in network
+    Stores the starting vertex, the ending vertex, the points array and the radius array
+'''
+class Fiber:
+    """Network edge: end vertices `v0`/`v1`, a list of 3D `points` and a radius per point (`radii`)."""
+
+    def __init__ (self, p1, p2, pois, rads):
+        self.v0 = p1
+        self.v1 = p2
+        self.points = pois
+        self.radii = rads
+
+    #Euclidean distance between two 3D points
+    @staticmethod
+    def _distance(a, b):
+        return math.sqrt((a[0] - b[0])**2 + (a[1] - b[1])**2 + (a[2] - b[2])**2)
+
+    '''
+        return the length of the fiber.
+    '''
+    def length(self):
+        #sum of the lengths of the segments between consecutive points
+        length = 0
+        for i in range(len(self.points)-1):
+            length = length + self._distance(self.points[i], self.points[i+1])
+
+        return length
+
+    '''
+        returns the tortuosity of the fiber: path length divided by the
+        straight-line distance between its first and last point.
+        Returns infinity when the two end points coincide (the ratio is
+        undefined there), instead of raising ZeroDivisionError.
+    '''
+    def turtuosity(self):
+        distance = self._distance(self.points[0], self.points[-1])
+        if distance == 0:
+            return float('inf')
+        return self.length()/distance
+
+    '''
+        returns the volume of the fiber, treating every segment as a
+        truncated cone between the radii at its two end points.
+    '''
+    def volume(self):
+        volume = 0
+        for i in range(len(self.points)-1):
+            r0 = self.radii[i]
+            r1 = self.radii[i+1]
+            h = self._distance(self.points[i], self.points[i+1])
+            volume = volume + 1.0/3.0 * math.pi * (r0**2 + r1**2 + r0*r1) * h
+
+        return volume
+    
+class NWT:
+    '''
+        Helpers that read and write the binary NWT network format.
+        The functions take no instance and are called as NWT.<name>(...).
+        Every integer and float is stored as a 4-byte little-endian value, so
+        files are written and read the same way on every platform.
+    '''
+
+    @staticmethod
+    def writeHeader(open_file, numVerts, numEdges):
+        '''
+            Writes the header given an open file, the number of vertices and the number of edges.
+            The header is the 72-byte text below followed by the two counts.
+        '''
+        txt = "nwtFileFormat fileid(14B), desc(58B), #vertices(4B), #edges(4B): bindata"
+        open_file.write(txt.encode())
+        open_file.write(struct.pack('<ii', numVerts, numEdges))
+
+    @staticmethod
+    def writeVertex(open_file, vertex):
+        '''
+            Writes a single vertex to a file: position, the number of outgoing
+            and incoming edges, then the outgoing and the incoming edge indices.
+        '''
+        open_file.write(struct.pack('<3f', vertex.p[0], vertex.p[1], vertex.p[2]))
+        open_file.write(struct.pack('<ii', len(vertex.o), len(vertex.i)))
+        for index in vertex.o:
+            open_file.write(struct.pack('<i', int(index)))
+        for index in vertex.i:
+            open_file.write(struct.pack('<i', int(index)))
+
+    @staticmethod
+    def writeFiber(open_file, edge):
+        '''
+            Writes a single fiber to a file: both vertex indices, the number of
+            points, then (x, y, z, radius) for every point.
+        '''
+        open_file.write(struct.pack('<iii', int(edge.v0), int(edge.v1), len(edge.points)))
+        for point, radius in zip(edge.points, edge.radii):
+            open_file.write(struct.pack('<4f', point[0], point[1], point[2], radius))
+
+    @staticmethod
+    def exportNWT(str, vertices, edges):
+        '''
+            Writes the entire network to the file named str given the vertices array and the edges array.
+        '''
+        with open(str, "wb") as file:
+            NWT.writeHeader(file, len(vertices), len(edges))
+            for vertex in vertices:
+                NWT.writeVertex(file, vertex)
+            for edge in edges:
+                NWT.writeFiber(file, edge)
+
+    #reads the values described by struct format `fmt` from an open file;
+    #a truncated file raises struct.error
+    @staticmethod
+    def _unpack(open_file, fmt):
+        return struct.unpack(fmt, open_file.read(struct.calcsize(fmt)))
+
+    @staticmethod
+    def readVertex(open_file):
+        '''
+            Reads a single vertex from an open file and returns a Node object.
+        '''
+        points = np.array(NWT._unpack(open_file, '<3f'), dtype=float)
+        numO, numI = NWT._unpack(open_file, '<ii')
+        outgoing = np.array(NWT._unpack(open_file, '<%di' % numO), dtype=int)
+        incoming = np.array(NWT._unpack(open_file, '<%di' % numI), dtype=int)
+        return Node(points, outgoing, incoming)
+
+    @staticmethod
+    def readFiber(open_file):
+        '''
+            Reads a single fiber from an open file and returns a Fiber object.
+        '''
+        vtx0, vtx1, numVerts = NWT._unpack(open_file, '<iii')
+        pts = []
+        rads = []
+        for j in range(numVerts):
+            x, y, z, radius = NWT._unpack(open_file, '<4f')
+            pts.append(np.array([x, y, z], dtype=float))
+            rads.append(radius)
+        return Fiber(vtx0, vtx1, pts, rads)
+
+    #NOTE: this class defines no function that imports a whole NWT file;
+    #Network.__init__ reads the header, the vertices and the fibers.
+
+class Network:
+    
+    def __init__(self, filename, clock=False):
+        """Load a network from the NWT file `filename`.
+
+        Fills `self.N` with the vertices and `self.F` with the fibers read
+        through NWT.readVertex / NWT.readFiber. When `clock` is true the
+        elapsed loading time is printed.
+        """
+        t0 = time.time() if clock else None
+
+        with open(filename, "rb") as stream:
+            stream.read(72)                                                     #skip the text header
+            vertex_count = int.from_bytes(stream.read(4), byteorder='little')
+            edge_count = int.from_bytes(stream.read(4), byteorder='little')
+
+            #vertices are stored first, followed by the fibers
+            self.N = [NWT.readVertex(stream) for _ in range(vertex_count)]
+            self.F = [NWT.readFiber(stream) for _ in range(edge_count)]
+
+        if clock:
+            print("Network initialization: "  + str(time.time() - t0) + "s")
+    
+    '''
+    Creates a graph from a list of nodes and a list of edges.
+    Uses edge length as weight.
+    Returns a NetworkX Object.
+    '''
+#    def createLengthGraph(self):
+#        G = nx.Graph()
+#        for i in range(len(self.nodeList)):
+#            G.add_node(i, p=V[i].p)
+#        for i in range(len(self.edgeList)):
+#            G.add_edge(self.edgeList[i].v0, self.edgeList[i].v1, weight = E[i].length())
+#            
+#        return G
+#    '''
+#    Creates a graph from a list of nodes and a list of edges.
+#    Uses edge turtuosity as weight.
+#    Returns a NetworkX Object.
+#    '''    
+#    def createTortuosityGraph(nodeList, edgeList):
+#        G = nx.Graph()
+#        for i in range(len(nodeList)):
+#            G.add_node(i, p=V[i].p)
+#        for i in range(len(edgeList)):
+#            G.add_edge(edgeList[i].v0, edgeList[i].v1, weight = E[i].turtuosity())
+#            
+#        return G
+    
+#    '''
+#    Creates a graph from a list of nodes and a list of edges.
+#    Uses edge volume as weight.
+#    Returns a NetworkX Object.
+#    '''    
+#    def createVolumeGraph(nodeList, edgeList):
+#        G = nx.Graph()
+#        for i in range(len(nodeList)):
+#            G.add_node(i, p=V[i].p)
+#        for i in range(len(edgeList)):
+#            G.add_edge(edgeList[i].v0, edgeList[i].v1, weight = E[i].volume())
+#            
+#        return G
+#'''
+#Returns the positions dictionary for the Circular layout.
+#'''    
+#def getCircularLayout(graph, dim, radius):
+#    return nx.circular_layout(graph, dim, radius)
+#
+#'''
+#Return the positions dictionary for the Spring layout.
+#'''    
+#def getSpringLayout(graph, pos, iterations, scale):
+#    return nx.spring_layout(graph, 2, None, pos, iterations, 'weight', scale, None)
+#        
+#'''
+#Draws the graph.
+#'''        
+#def drawGraph(graph, pos):
+#    nx.draw(graph, pos)
+#    return
+
+    def aabb(self):
+        """Return (lower, upper), the axis-aligned bounding box of the node positions.
+
+        Both results are copies of the first node's position that have been
+        widened component by component, so no node is modified.
+        """
+        lower = self.N[0].p.copy()
+        upper = lower.copy()
+        for node in self.N:
+            for axis in range(len(lower)):
+                value = node.p[axis]
+                lower[axis] = min(lower[axis], value)       #keeps the current bound unless value is strictly smaller
+                upper[axis] = max(upper[axis], value)       #keeps the current bound unless value is strictly larger
+        return lower, upper
+    
+    #calculate the distance field at a given resolution
+    #   R (triple) resolution along each dimension
+    def distancefield(self, R=(100, 100, 100)):
+        """Sample the distance to the nearest fiber point on a regular grid.
+
+        R   (triple) number of samples along x, y and z; the grid spans the
+            bounding box returned by aabb()
+
+        Returns an array of distances with one entry per grid point, laid out
+        as produced by numpy.meshgrid(x, y, z).
+        """
+        import scipy.spatial                        #make sure the spatial submodule is loaded
+
+        #collect every fiber point in the network into one array for the KD tree
+        P = np.array([p for e in self.F for p in e.points])
+
+        #generate a KD-Tree out of the network point array
+        tree = scipy.spatial.cKDTree(P)
+
+        #generate a meshgrid of the requested resolution surrounding the network
+        lower, upper = self.aabb()                  #get the space occupied by the network
+        x = np.linspace(lower[0], upper[0], R[0])   #get the grid points for uniform sampling of this space
+        y = np.linspace(lower[1], upper[1], R[1])
+        z = np.linspace(lower[2], upper[2], R[2])
+        X, Y, Z = np.meshgrid(x, y, z)
+
+        #stack the coordinates so that the last axis holds (x, y, z) of each grid point
+        Q = np.stack((X, Y, Z), 3)
+
+        #query the distance from every grid point to its nearest fiber point
+        D, _ = tree.query(Q)
+
+        return D
+    
+    #returns the number of points in the network
+    def npoints(self):
+        """Return the point count: every fiber's points minus its two end points, plus one per node."""
+        interior = sum(len(fiber.points) - 2 for fiber in self.F)
+        return interior + len(self.N)
+    
+    #returns all of the points in the network
+    def points(self):
+        """Return a (3, npoints()) array whose leading columns are the interior fiber points.
+
+        Only the points strictly between each fiber's two end points are
+        written; the remaining columns keep their initial value of zero.
+        """
+        P = np.zeros((3, self.npoints()))
+
+        column = 0
+        for fiber in self.F:
+            for pt in fiber.points[1:-1]:           #skip the first and the last point of the fiber
+                P[:, column] = pt
+                column += 1
+        return P
+    
+    #returns the number of linear segments in the network
+    def nsegments(self):
+        """Return the number of line segments: one fewer than the point count of each fiber, summed."""
+        return sum(len(fiber.points) - 1 for fiber in self.F)
+    
+    #return a list of line segments representing the network
+    def segments(self, dtype=np.float32):
+        """Return (start, end): two (nsegments(), 3) arrays of the given dtype.
+
+        Row k of `start` and row k of `end` hold the two consecutive fiber
+        points that bound segment k; segments are listed fiber by fiber.
+        """
+        count = self.nsegments()
+        start = np.zeros((count, 3), dtype=dtype)
+        end = np.zeros((count, 3), dtype=dtype)
+
+        row = 0
+        for fiber in self.F:
+            pts = fiber.points
+            for ip in range(len(pts) - 1):          #pair every point with its successor
+                start[row, :] = pts[ip]
+                end[row, :] = pts[ip + 1]
+                row += 1
+
+        return start, end
+    
+    #function returns the fiber associated with a given 1D line segment index
+    def segment2fiber(self, idx):
+        """Return (points, fiber_index) of the fiber that contains line segment `idx`.
+
+        `idx` counts segments across all fibers in order. Returns None when
+        `idx` is not smaller than the total number of segments.
+        """
+        covered = 0                                 #number of segments in the fibers visited so far
+        for fiber_index, fiber in enumerate(self.F):
+            covered += len(fiber.points) - 1
+            if covered > idx:
+                return fiber.points, fiber_index
+        
+    def vectors(self, clock=False, dtype=np.float32):
+        if clock:
+            start_time = time.time()
+        start, end = self.segments(dtype)                #retrieve all of the line segments
+        v = end - start                             #calculate the resulting vectors
+        l = np.sqrt(v[:, 0]**2 + v[:,1]**2 + v[:,2]**2) #calculate the fiber lengths
+        z = l==0                                    #look for any zero values
+        nz = z.sum()
+        if nz > 0:
+            print("WARNING: " + str(nz) + " line segment(s) of length zero were found in the network and will be removed" )
+            
+        if clock:
+            print("Network::vectors: " + str(time.time() - start_time) + "s")
+            
+        return np.delete(v, np.where(z), 0)
+    
+    #scale all values in the network by tuple S = (sx, sy, sz)
+    def scale(self, S):
+        for f in self.F:
+            for p in f.points:
+                p[0] = p[0] * S[0]
+                p[1] = p[1] * S[1]
+                p[2] = p[2] * S[2]
+                
+        for n in self.N:
+            n.p[0] = n.p[0] * S[0]
+            n.p[1] = n.p[1] * S[1]
+            n.p[2] = n.p[2] * S[2]
+        
+    
+    #calculate the adjacency weighting function for the network given a set of vectors X = (x, y, z) and weight exponent k
+    def adjacencyweight(self, P, k=200, length_threshold = 25, dtype=np.float32):
+        V = self.vectors(dtype)                                                 #get the vectors representing each segment
+        #V = V[0:n_vectors, :]
+        L = np.expand_dims(np.sqrt((V**2).sum(1)), 1)                           #calculate the length of each vector
+        
+        outliers = L > length_threshold                                         #remove outliers based on the length_threshold
+        V = np.delete(V, np.where(outliers), 0)
+        L = np.delete(L, np.where(outliers))
+        V = V/L[:,None]                                                         #normalize the vectors
+        
+        P = np.stack(spharmonics.sph2cart(1, P[0], P[1]), P[0].ndim)        
+        PV = P[...,None,:] * V
+        cos_alpha = PV.sum(PV.ndim-1)
+        W = np.abs(cos_alpha) ** k
+
+        return W, L
+        
+
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Jan 19 2018
+
+@author: Jiabing
+"""
+
+import struct
+import numpy as np
+import scipy as sp
+import networkx as nx
+import matplotlib.pyplot as plt
+import math
+
+class Point:
+       def __init__(self, x, y, z, radius):
+           self.x = x
+           self.y = y
+           self.z = z
+           self.r = radius
+                      
+            
+class Fiber:
+       def __init__(self, fiber_idx,point_idx):     
+           self.fidx = fiber_idx
+           self.pidx = point_idx
+           
+           
+
+class Node:
+        def __init__(self, point_idx, fidx):
+            self.pidx= point_idx
+            self.fidx = fidx
+            
+            
+#class NWT:
+    
+#class network(point, Fiber, Node):
 \ No newline at end of file
+# -*- coding: utf-8 -*-
+"""
+Created on Thu May 18 15:56:20 2017
+
+@author: david
+"""
+
+import numpy as np
+import scipy as sp
+import scipy.misc
+import scipy.ndimage
+import matplotlib.pyplot as plt
+import os
+
+#calculate a 3D image from a 2D binary mask
+def binary(infile, sigma=2):
+    
+    I = sp.misc.imread(infile).astype(np.bool)
+    
+    #if the image has more than one channel, just keep the first one
+    if(I.ndim == 3):
+        I = I[:, :, 0]
+    
+    L = []
+    while np.count_nonzero(I) != 0:
+        L.append(I)
+        I = sp.ndimage.binary_erosion(I)
+        
+    #create a 3D image representing the new stack    
+    S = np.zeros( (I.shape[0], I.shape[1], len(L) * 2 - 1) )
+    
+    #for each image in the list
+    for i in range(0, len(L)):
+        if(i == 0):
+            S[:, :, len(L) - 1] = L[0]
+        else:
+            S[:, :, len(L) - 1 + i] = L[i]
+            S[:, :, len(L) - 1 - i] = L[i]
+            
+    S = sp.ndimage.filters.gaussian_filter(S, sigma)
+    return S
+
+#generate a 3D image stack from a 2D binary mask
+def binary_stack(infile, outdir, sigma=2):
+    outfile_base = os.path.basename(infile)
+    outfile_prefix, outfile_ext = os.path.splitext(outfile_base)
+    
+    S = binary(infile, sigma)
+
+    zcount = len(str(S.shape[2]))
+    for f in range(0, S.shape[2]):
+        fname = outdir + "/" + outfile_prefix + str(f).zfill(zcount) + outfile_ext
+    
+        sp.misc.imsave(fname, S[:, :, f])
+
+
+
+
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 18 16:31:36 2017
+
+@author: david
+"""
+
+import numpy
+import scipy
+import matplotlib.pyplot as plt
+from matplotlib import cm, colors
+from mpl_toolkits.mplot3d import Axes3D
+import math
+import time
+
+
+def sph2cart(r, theta, phi):
+    x = r * numpy.cos(theta) * numpy.sin(phi)
+    y = r * numpy.sin(theta) * numpy.sin(phi)
+    z = r * numpy.cos(phi)
+    
+    return x, y, z
+
+def cart2sph(x, y, z):
+    r = numpy.sqrt(x**2+y**2+z**2)
+    theta = numpy.arctan2(y,x)
+    phi = numpy.arccos(z/r)
+    #if(x == 0):
+    #    phi = 0
+    #else:
+    #    phi = numpy.arccos(z/r)
+    #phi = numpy.arctan2(numpy.sqrt(x**2 + y**2), z)
+    return r, theta, phi
+
+
+def sh(theta, phi, l, m):
+    
+    if m < 0:
+        return numpy.sqrt(2) * (-1)**m * scipy.special.sph_harm(abs(m), l, theta, phi).imag
+    elif m > 0:
+        return numpy.sqrt(2) * (-1)**m * scipy.special.sph_harm(m, l, theta, phi).real
+    else:
+        return scipy.special.sph_harm(0, l, theta, phi).real
+    
+#calculate a spherical harmonic value from a set of coefficients and coordinates P = (theta, phi)    
+def sh_coeff(P, C):
+    
+    s = numpy.zeros(P[0].shape)
+    c = range(len(C))
+    
+    for c in range(len(C)):
+        l, m = i2lm(c)                  #get the 2D indices
+        s = s + C[c] * sh(P[0], P[1], l, m)
+        
+    return s
+
+#plot a spherical harmonic function on a sphere using N points
+def sh_plot(C, N):       
+    phi = numpy.linspace(0, numpy.pi, N)
+    theta = numpy.linspace(0, 2*numpy.pi, N)
+    phi, theta = numpy.meshgrid(phi, theta)
+    
+    # The Cartesian coordinates of the unit sphere
+    x = numpy.sin(phi) * numpy.cos(theta)
+    y = numpy.sin(phi) * numpy.sin(theta)
+    z = numpy.cos(phi)
+    
+    # Calculate the spherical harmonic Y(l,m) and normalize to [0,1]
+    fcolors = sh_coeff(theta, phi, C)
+    fmax, fmin = fcolors.max(), fcolors.min()
+    fcolors = (fcolors - fmin)/(fmax - fmin)
+    
+    # Set the aspect ratio to 1 so our sphere looks spherical
+    fig = plt.figure(figsize=plt.figaspect(1.))
+    ax = fig.add_subplot(111, projection='3d')
+    ax.plot_surface(x, y, z,  rstride=1, cstride=1, facecolors=cm.seismic(fcolors))
+    # Turn off the axis planes
+    ax.set_axis_off()
+    plt.show()
+    
+def i2lm(i):
+    l = numpy.floor(numpy.sqrt(i))
+    m = i - l *(l + 1)
+    return l, m
+
+def lm2i(l, m):
+    return l * (l+1) + m
+
+#generates a set of spherical harmonic coefficients from samples using linear least squares
+def linfit(P, s, nc, clock=False):
+    if clock:
+        start_time = time.time()
+        
+    #allocate space for the matrix and RHS values
+    A = numpy.zeros((nc, nc))
+    b = numpy.zeros(nc)
+    
+    #calculate each of the matrix coefficients
+        #(see SH technical report in the vascular_viz repository)
+    for i in range(nc):
+        li, mi = i2lm(i)
+        yi = sh(P[0], P[1], li, mi)
+        for j in range(nc):        
+            lj, mj = i2lm(j)
+            yj = sh(P[0], P[1], lj, mj)
+            A[i, j] = numpy.sum(yi * yj)
+        b[i] = numpy.sum(yi * s)            #calculate the RHS value
+    
+    #calculate the RHS values
+    #for j in range(nc):
+    #    lj, mj = i2lm(j)
+    #    yj = sh(theta, phi, lj, mj)
+    #    b[j] = numpy.sum(yj * s)
+    
+    if clock:
+        print("SH::linfit:matrix "+str(time.time() - start_time)+"s")
+    #solve the system of linear equations
+    R = numpy.linalg.solve(A, b)
+    
+    if clock:
+        print("SH::linfit:solution "+str(time.time() - start_time)+"s")
+    return R
+
+#generate a scatter plot in 3D using spherical coordinates
+def scatterplot3d(P):
+    r, theta, phi = P
+    #convert all of the samples to cartesian coordinates
+    X, Y, Z = sph2cart(r, theta, phi)
+    
+    fig = plt.figure()
+    ax = fig.add_subplot(111, projection='3d')
+    ax.scatter(X, Y, Z)
+    plt.show()
+    
+
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Mar 12 21:54:40 2017
+
+@author: david
+"""
+
+import numpy as np
+import scipy as sp
+import scipy.ndimage
+import progressbar
+import glob
+
+def st2(I, s=1, dtype=np.float32):   
+    
+    #calculates the 2D structure tensor for an image using a gaussian window with standard deviation s
+    
+    #calculate the gradient
+    dI = np.gradient(I.astype(dtype))
+      
+    #calculate the dimensions of the tensor field
+    field_dims = [dI[0].shape[0], dI[0].shape[1], 3]
+    
+    #allocate space for the tensor field
+    Tg = np.zeros(field_dims, dtype=dtype)
+    
+    #calculate the gradient components of the tensor
+    ti = 0
+    for i in range(2):
+        for j in range(i + 1):
+            Tg[:, :, ti] = dI[j] * dI[i]
+            ti = ti + 1
+    
+    #if the user does not want a blurred field
+    if(s == 0):
+        return Tg
+        
+    #blur the tensor field
+    T = np.zeros(field_dims, dtype=dtype)
+    
+    for i in range(3):
+        T[:, :, i] = scipy.ndimage.filters.gaussian_filter(Tg[:, :, i], [s, s])
+
+    
+    return T
+
+def st3(I, s=1, v=[1, 1, 1], dtype=np.float32):
+    #calculate the structure tensor field for the 3D input image I given the window size s and voxel size v
+    #check the format for the window size
+        
+    v = np.array(v)
+    print("\nCalculating gradient...")
+    dI = np.gradient(I.astype(dtype), v[0], v[1], v[2])
+    #calculate the dimensions of the tensor field
+    field_dims = [dI[0].shape[0], dI[0].shape[1], dI[0].shape[2], 6]
+    
+    #allocate space for the tensor field
+    Tg = np.zeros(field_dims, dtype=np.float32)
+    
+    #calculate the gradient components of the tensor
+    ti = 0
+    print("Calculating tensor components...")
+    bar = progressbar.ProgressBar()
+    bar.max_value = 6
+    for i in range(3):
+        for j in range(i + 1):
+            Tg[:, :, :, ti] = dI[j] * dI[i]
+            ti = ti + 1
+            bar.update(ti)
+    
+    #blur the tensor field
+    T = np.zeros(field_dims, dtype=np.float32)
+    
+    print("\nConvolving tensor field...")
+    bar = progressbar.ProgressBar()
+    bar.max_value = 6
+    sigma = s / v
+    print(sigma)
+    for i in range(6):
+        T[:, :, :, i] = scipy.ndimage.filters.gaussian_filter(Tg[:, :, :, i], sigma)
+        bar.update(i+1)
+        
+    return T
+
+def st(I, s=1):
+    if I.ndim == 3:
+        return st3(I, s)
+    elif I.ndim == 2:
+        return st2(I, s)
+    else:
+        print("Image must be 2D or 3D")
+    return
+        
+   
+    
+def sym2mat(T):
+    #Calculate the full symmetric matrix from a list of lower triangular elements.
+    #The lower triangular components in the input field T are assumed to be the
+    #   final dimension of the input matrix.
+    
+    #       | 1  2  4  7  |
+    #       | 0  3  5  8  |
+    #       | 0  0  6  9  |
+    #       | 0  0  0  10 |
+   
+    in_s = T.shape
+    
+    #get the number of tensor elements
+    n = in_s[T.ndim - 1]
+    
+    #calculate the dimension of the symmetric matrix
+    d = int(0.5 * (np.sqrt(8. * n + 1.) - 1.))
+    
+    #calculate the dimensions of the output field
+    out_s = list(in_s)[:-1] + [d] + [d]
+
+    #allocate space for the output field
+    R = np.zeros(out_s)
+    
+    ni = 0
+    for i in range(d):
+        for j in range(i + 1):
+            R[..., i, j] = T[..., ni]
+            if i != j:
+                R[..., j, i] = T[..., ni]
+            ni = ni + 1
+    
+    return R   
+
+def vec(S, vector=0):
+   
+    if(S.ndim != 3):
+        print("ERROR: a 2D slice is expected")
+        return
+      
+    #convert the field to a full rank-2 tensor
+    T = sym2mat(S);
+    del(S)
+    
+    #calculate the eigenvectors and eigenvalues
+    l, v = np.linalg.eig(T)
+    
+    #get the dimension of the tensor field
+    d = T.shape[2]
+    
+    #allocate space for the vector field
+    V = np.zeros([T.shape[0], T.shape[1], 3])
+
+    #arrange the indices for each pixel from smallest to largest eigenvalue    
+    idx = l.argsort()
+    
+    for di in range(d):
+        b = idx[:, :, -1-vector] == di
+        V[b, 0:d] = v[b, :, di]
+    
+    return V
+   
+def loadstack(filemask):
+    #Load an image stack as a 3D grayscale array
+    
+    #get a list of all files matching the given mask
+    files = [file for file in glob.glob(filemask)]
+    
+    #calculate the size of the output stack
+    I = scipy.misc.imread(files[0])
+    X = I.shape[0]
+    Y = I.shape[1]
+    Z = len(files)
+    
+    #allocate space for the image stack
+    M = np.zeros([X, Y, Z]).astype('float32')
+    
+    #create a progress bar
+    bar = progressbar.ProgressBar()
+    bar.max_value = Z
+    
+    #for each file
+    for z in range(Z):
+        #load the file and save it to the image stack
+        M[:, :, z] = scipy.misc.imread(files[z], flatten="True").astype('float32')
+        bar.update(z+1)
+    return M
+
+#calculate the anisotropy of a structure tensor given the tensor field S
+def anisotropy3(S):
+
+    Sf = sym2mat(S)
+    
+    #calculate the eigenvectors and eigenvalues
+    l, v = np.linalg.eig(Sf)
+    
+    #store the sorted eigenvalues
+    ls = np.sort(l)
+    l0 = ls[:, :, 0]
+    l1 = ls[:, :, 1]
+    l2 = ls[:, :, 2]
+    
+    #calculate the linear anisotropy
+    Cl = (l2 - l1)/(l2 + l1 + l0)
+    
+    #calculate the planar anisotropy
+    Cp = 2 * (l1 - l0) / (l2 + l1 + l0)
+    
+    #calculate the spherical anisotropy
+    Cs = 3 * l0 / (l2 + l1 + l0)
+    
+    #calculate the fractional anisotropy
+    l_hat = (l0 + l1 + l2)/3
+    fa_num = (l2 - l_hat) ** 2 + (l1 - l_hat) ** 2 + (l0 - l_hat) ** 2;
+    fa_den = l0 ** 2 + l1 ** 2 + l2 ** 2
+    FA = np.sqrt(3./2.) * np.sqrt(fa_num) / np.sqrt(fa_den)
+    
+    return FA, Cl, Cp, Cs
+
+#calculate the fractional anisotropy
+def fa(S):
+    Sf = sym2mat(S)
+    
+    #calculate the eigenvectors and eigenvalues
+    l, v = np.linalg.eig(Sf)
+    
+    #store the sorted eigenvalues
+    ls = np.sort(l)
+    l0 = ls[:, :, 0]
+    l1 = ls[:, :, 1]
+    
+    #if this is a 2D tensor, calculate and return the coherence
+    if(S.shape[2] == 3):
+        C = ((l0 - l1) / (l0 + l1)) ** 2
+        return C
+        
+    #if this is a 3D tensor    
+    elif(S.shape[2] == 6):
+        l2 = ls[:, :, 2]
+        
+        #calculate the fractional anisotropy
+        l_hat = (l0 + l1 + l2)/3
+        fa_num = (l2 - l_hat) ** 2 + (l1 - l_hat) ** 2 + (l0 - l_hat) ** 2;
+        fa_den = l0 ** 2 + l1 ** 2 + l2 ** 2
+        FA = np.sqrt(3./2.) * np.sqrt(fa_num) / np.sqrt(fa_den)
+        return FA
+
+#calculate the specified eigenvalue for the tensor field
+def eigenval(S, ev):
+    Sf = sym2mat(S)
+    
+     #calculate the eigenvectors and eigenvalues
+    l, v = np.linalg.eig(Sf)
+    
+    #store the sorted eigenvalues
+    ls = np.sort(l)
+    evals = ls[:, :, ev]
+
+    return evals
+
+def amira(filename, T):
+    #generates a tensor field that can be imported into Amira
+    
+    #   0    dx dx   ----> 0
+    #   1    dx dy   ----> 1
+    #   2    dy dy   ----> 3
+    #   3    dx dz   ----> 2
+    #   4    dy dz   ----> 4
+    #   5    dz dz   ----> 5
+    
+    #swap the 2nd and 3rd tensor components
+    A = np.copy(T)
+    A[..., 3] = T[..., 2]
+    A[..., 2] = T[..., 3]
+    
+    #roll the tensor axis so that it is the leading component
+    #A = numpy.rollaxis(A, A.ndim - 1)
+    A.tofile(filename)
+    print("\n", A.shape)
+
+def resample3(T, s=2):
+    #resample a tensor field by an integer factor s
+    #This function first convolves the field with a box filter and then
+    #   re-samples to create a smaller field
+    
+    #check the format for the window size
+    if type(s) is not list:
+        s = [s] * 3
+    elif len(s) == 1:
+        s = s * 3
+    elif len(s) == 2:
+        s.insert(1, s[0])
+    s = np.array(s)
+    
+    bar = progressbar.ProgressBar()
+    bar.max_value = T.shape[3]
+    
+    #blur with a uniform box filter of size r
+    for t in range(T.shape[3]):
+        T[..., t] = scipy.ndimage.filters.uniform_filter(T[..., t], 2 * s)
+        bar.update(t+1)
+        
+    #resample at a rate of r
+    R = T[::s[0], ::s[1], ::s[2], :]
+    return R
+
+def color3(prefix, T, vector='largest', aniso=True):
+    #Saves a stack of color images corresponding to the eigenvector and optionally scaled by anisotropy
+    
+    bar = progressbar.ProgressBar()
+    bar.max_value = T.shape[2]
+    
+    #for each z-axis slice
+    for z in range(T.shape[2]):
+        S = T[:, :, z, :]                           #get the slice
+        V = st2vec(S, vector='smallest')   #calculate the vector
+        C = np.absolute(V)                       #calculate the absolute value
+        
+        if aniso == True:                              #if the image is scaled by anisotropy
+            FA, Cl, Cp, Cs = anisotropy(S)          #calculate the anisotropy of the slice
+            if vector == 'largest':
+                A = Cl
+            elif vector == 'smallest':
+                A = Cp
+        else:                                       #otherwise just scale by 1
+            A = np.ones(T.shape[0], T.shape[1])
+        image = C * np.expand_dims(A, 3)
+        
+        filename = prefix + str(z).zfill(4) + ".bmp"
+        scipy.misc.imsave(filename, image)
+        bar.update(z + 1)
+        
+def st2stack(T, outfile, **kwargs):
+    eigenvector = False                                             #initialize the colormap flags to false
+    aniso_color = False
+    aniso_alpha = False
+    #image = False
+    aniso_pwr = 1
+    cimage_pwr = 1
+    aimage_pwr = 1
+    anisostretch = 1                                                #set the contrast stretch parameter
+    alpha_channel = False
+    alpha_image = False
+    color_image = False
+    
+    for k,v in kwargs.items():                                  #for each argument
+        if(k == "ev"):                                               #if the user wants a colormap based on an eigenvector
+            eigenvector = True                                     #set the eigenvector flag to true
+            ev = v                                                 #save the desired eigenvector
+        if(k == "aniso"):                                            #if the user wants to factor in anisotropy
+            aniso = True                                      #set the anisotropy flag to true
+            aniso_channel = v                                      #save the anisotropy channel
+        if(k == "aniso_color"):
+            aniso_color = v
+        if(k == "aniso_alpha"):
+            aniso_alpha = v
+        if(k == "apwr"):                                              #if the user wants to amplify the anisotropy
+            aniso_pwr = v                                                 #set the anisotropy exponent
+        if(k == "cipwr"):                                            #if the user specifies the image power
+            cimage_pwr = v
+        if(k == "aipwr"):
+            aimage_pwr = v
+        if(k == "alphaimage"):
+            Ia = v
+            alpha_image = True
+        if(k == "colorimage"):
+            Ic = v
+            color_image = True
+        if(k == "anisostretch"):
+            anisostretch = v
+        if(k == "alpha"):
+            alpha_channel = v
+             
+    bar = progressbar.ProgressBar()
+    bar.max_value = T.shape[2]
+    for i in range(0, T.shape[2]):
+    #for i in range(0, 50):
+    
+        if(alpha_image or alpha_channel):
+            img = np.ones([T.shape[0], T.shape[1], 4])
+        else:
+            img = np.ones([T.shape[0], T.shape[1], 3])
+        if(eigenvector):
+            V = st2vec(T[:, :, i], ev)                         #get the vector field for slice i corresponding to eigenvector ev
+            img[:, :, 0:3] = V                                      #update the image with the vector field information
+        if(aniso):                                             #if the user is requesting anisotropy be incorporated into the image
+            FA, Cl, Cp, Cs = anisotropy(T[:, :, i])        #calculate the anisotropy of the tensors in slice i
+            if(aniso_channel == "fa"):
+                A = FA
+            elif(aniso_channel == "l"):
+                A = Cl
+            elif(aniso_channel == "p"):
+                A = Cp
+            else:
+                A = Cs
+            if(aniso_alpha):
+                print("rendering anisotropy to the alpha channel")
+                img[:, :, 3] = A ** aniso_pwr * anisostretch
+            if(aniso_color):
+                print("rendering anisotropy to the color channel")
+                img[:, :, 0:3] = img[:, :, 0:3] * np.expand_dims(A ** aniso_pwr, 3) * anisostretch               
+        if(alpha_image):
+            img[:, :, 3] = Ia[:, :, i]/255 ** aimage_pwr
+        if(color_image):
+            img[:, :, 0:3] = img[:, :, 0:3] * (np.expand_dims(Ic[:, :, i], 3)/255) ** cimage_pwr    #multiply the vector field by the image intensity
+        #outname = outfile + str(i).zfill(3) + ".bmp"                    #get the file name to be saved
+        outname = outfile.replace("*", str(i).zfill(3))
+        
+        sp.misc.imsave(outname, np.ndarray.astype(np.abs(img)*255, "uint8"))                              #save the output image
+        bar.update(i+1)
+        
+
+#this function takes a 3D image and turns it into a stack of color images based on the structure tensor
+def img2stack(I, outfile, **kwargs):
+    
+    vs = [1, 1, 1]                                                  #set the default voxel size to 1
+    w = 5
+    
+    for k,v in kwargs.items():                                  #for each argument
+        if(k == "voxelsize"):                                       #if the voxel size is specified
+            if(len(v) == 1):                                        #if the user just specifies one value
+                vs = [v] * 3                                        #assume that the voxels are isotropic, create a list of 3 v's
+            elif(len(v) == 2):                                      #if the user specifies two values
+                vs[0] = v[0]                                        #assume that the voxels are isotropic along (x, y) and anisotropic along z
+                vs[1] = v[0]
+                vs[2] = v[1]
+            elif(len(v) == 3):
+                vs = v
+        if(k == "window"):                                          #if the user specifies a window size
+            w = v
+            
+    T = st3(I, w, vs)                                       #calculate the structure tensor
+    
+    st2stack(T, outfile, **kwargs)
+    
+def stack2stack(infile_mask, outfile, **kwargs):
+     
+    I = loadstack(infile_mask)                            #load the file mask
+    for k,v in kwargs.items():                                  #for each argument
+        if(k == "ipwr"):
+            img = I
+    
+    img2stack(I, outfile, image=img, **kwargs)                                #call the function to convert the image to an output ST stack
 \ No newline at end of file
+/*
+Copyright <2017> <David Mayerich>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
 #ifndef STIM_CELLSET_H
 #define STIM_CELLSET_H
@@ -117,7 +127,7 @@ public:
 	}
 	/// Return the maximum value of a field in this cell set
-	double max(std::string field){
+	double maximum(std::string field){
 		size_t idx = fields[field];						//get the field index
 		size_t ncells = cells.size();					//get the total number of cells
 		double maxval, val;								//stores the current and maximum values
@@ -130,7 +140,7 @@ public:
 	}
 	/// Return the maximum value of a field in this cell set
-	double min(std::string field){
+	double minimum(std::string field){
 		size_t idx = fields[field];						//get the field index
 		size_t ncells = cells.size();					//get the total number of cells
 		double minval, val;								//stores the current and maximum values
+/*
+Copyright <2017> <David Mayerich>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
 #ifndef STIM_CENTERLINE_H
 #define STIM_CENTERLINE_H
 #include <vector>
 #include <stim/math/vec3.h>
-//#include <ANN/ANN.h>
+#include <stim/structures/kdtree.cuh>
 namespace stim{
@@ -12,195 +22,499 @@ namespace stim{
  *	class to describe an interconnected (often biological) network.
  */
 template<typename T>
-class centerline{
+class centerline : public std::vector< stim::vec3<T> >{
 protected:
-	unsigned int N;					//number of points in the fiber
-	double **c;						//centerline (array of double pointers)
-//	ANNkd_tree* kdt;				//kd-tree stores all points in the fiber for fast searching
-
-	/// Initialize an empty fiber
-	void init()
-	{
-		N=0;
-		c=NULL;
-//		kdt = NULL;
-	}
-	/// Initialize a fiber with N centerline points (all located at [0, 0, 0] with radius 0)
-	void init(unsigned int n)
-	{
+	std::vector<T> L;										//stores the integrated length along the fiber (used for parameterization)
-		N = n;												//set the number of points
-//		kdt = NULL;
-		c = (double**) malloc(sizeof(double*) * N);			//allocate the array pointer
+	///Unit tangent of the centerline at point i.
+	///End points take the direction of their single adjoining segment; interior points take the normalized sum of the unit directions of the two adjoining segments. Fewer than two points yields the zero vector.
+	vec3<T> d(size_t i) {
+		const size_t n = size();
+		if (n <= 1) return vec3<T>(0, 0, 0);							//no segment exists, so there is no direction to report
+		if (n == 2 || i == 0) return (at(1) - at(0)).norm();			//two-point fibers and the first point both follow the first segment
+		const size_t last = n - 1;
+		if (i == last) return (at(last) - at(last - 1)).norm();		//the last point follows the last segment
-		for(unsigned int i = 0; i < N; i++)					//allocate space for each point
-			c[i] = (double*) malloc(sizeof(double) * 3);
+		//interior point: combine the unit directions of the incoming and outgoing segments
+		vec3<T> incoming = (at(i) - at(i - 1)).norm();
+		vec3<T> outgoing = (at(i + 1) - at(i)).norm();
+		vec3<T> combined = incoming + outgoing;
+		return combined.norm();
 	}
-	/// Copies an existing fiber to the current fiber
-
-	/// @param cpy stores the new copy of the fiber
-	void copy( const stim::centerline<T>& cpy, bool kd = 0){
-
-		///allocate space for the new fiber
-		init(cpy.N);
-
-		///copy the points
-		for(unsigned int i = 0; i < N; i++){
-			for(unsigned int d = 0; d < 3; d++)		//for each dimension
-				c[i][d] = cpy.c[i][d];				//copy the coordinate
+	/// Recompute the cumulative length table L so that L[i] is the path length from point 0 to point i.
+	/// @param start first index to recompute; entries before it are assumed to be correct already. 0 rebuilds the whole table.
+	void update_L(size_t start = 0) {
+		L.resize(size());									//keep exactly one length entry per centerline point
+		if (L.empty()) return;								//an empty centerline has no L[0] to write
+		if (start == 0) {
+			L[0] = 0;											//the first point sits at length zero
+			start++;
 		}
-//		if(kd)
-//			gen_kdtree();							//generate the kd tree for the new fiber
-	}
-	/// generate a KD tree for points on fiber
-//	void gen_kdtree()
-//	{
-//		int n_data = N; //create an array of data points
-//		ANNpointArray pts = (ANNpointArray)c;			//cast the centerline list to an ANNpointArray
-//		kdt = new ANNkd_tree(pts, n_data, 3);			//build a KD tree
-//	}
-
-	/// find distance between two points
-	double dist(double* p0, double* p1){
-
-		double sum = 0; // initialize variables
-		float v;
-		for(unsigned int d = 0; d < 3; d++)
-		{
-			v = p1[d] - p0[d];
-			sum +=v * v;
+		stim::vec3<T> d;
+		for (size_t i = start; i < size(); i++) {		//for each line segment ending at point i
+			d = at(i) - at(i - 1);
+			L[i] = L[i - 1] + d.len();				//running total: previous length plus this segment
 		}
-		return sqrt(sum);
 	}
-	/// This function retreives the index for the fiber point closest to q
-
-	/// @param q is a reference point used to find the closest point on the fiber center line
-//	unsigned int ann( stim::vec<double> q ){
-//		ANNidxArray idx = new ANNidx[1];			//variable used to hold the nearest point
-//		ANNdistArray sq_dist = new ANNdist[1];		//variable used to hold the squared distance to the nearest point
-//		kdt->annkSearch(q.data(), 1, idx, sq_dist);	//search the KD tree for the nearest neighbor
-//		return *idx;
-//	}
+	/// (Re)build the length table; a centerline without points has nothing to initialize.
+	void init() {
+		if (size() != 0)
+			update_L();
+	}
 	/// Returns a stim::vec representing the point at index i (range-checked through std::vector::at)
 	/// @param i is an index of the desired centerline point
 	stim::vec<T> get_vec(unsigned i){
-		stim::vec3<T> r;
-		r.resize(3);
-		r[0] = c[i][0];
-		r[1] = c[i][1];
-		r[2] = c[i][2];
+		return std::vector< stim::vec3<T> >::at(i);	//NOTE(review): the element is a stim::vec3<T> but the return type is stim::vec<T>; this relies on a conversion defined outside this file - confirm
+	}
+
+	///Finds the index of the last point whose cumulative length does not exceed l, i.e. the start of the segment that contains length l.
+	///Uses a binary search, which is valid because L holds running totals of non-negative segment lengths and is therefore sorted.
+	///@param l: length along the centerline
+	///@return the segment start index; the last index when l is at or beyond the total length; 0 when fewer than two lengths are stored
+	size_t findIdx(T l) {
+		if (L.size() < 2) return 0;							//no segment to search (also avoids L.size() - 1 wrapping around on an empty table)
+		//first entry after index 0 that is strictly greater than l; the point just before it starts the containing segment
+		typename std::vector<T>::iterator it = std::upper_bound(L.begin() + 1, L.end(), l);
+		return (size_t)(it - L.begin()) - 1;
+	}
-		return r;
+	///Returns the position at length l along the fiber by linear interpolation inside one segment.
+	///@param l: length along the centerline at which to evaluate the position.
+	///@param idx: index of the point that starts the segment containing l (the closest point at or before l).
+	///@return the interpolated position; the point at idx itself when it has no following point or the segment has zero length.
+	stim::vec3<T> p(T l, int idx) {
+		stim::vec3<T> v1 = at(idx);							//range-checked, so a bad idx fails here instead of reading L out of bounds
+		if ((size_t)idx + 1 >= L.size()) return v1;			//idx is the last point: there is no segment to interpolate along
+		T seg = L[idx + 1] - L[idx];						//length of the segment [idx, idx + 1]
+		if (seg <= 0) return v1;							//coincident points: avoid dividing by zero
+		T rat = (l - L[idx]) / seg;							//fraction of the way along the segment
+		stim::vec3<T> v2 = at(idx + 1);
+		return(v1 + (v2 - v1)*rat);
 	}
 public:
-	centerline(){
+	using std::vector< stim::vec3<T> >::at;
+	using std::vector< stim::vec3<T> >::size;
+
+	/// Construct a centerline with no points.
+	centerline() : std::vector< stim::vec3<T> >() {
+		this->init();										//returns immediately for an empty point list
+	}
+	/// Construct a centerline holding n default-constructed points and build its length table.
+	centerline(size_t n) : std::vector< stim::vec3<T> >(n){
+		this->init();
+	}
+	/// Construct a centerline from an existing list of points (copied) and build its length table.
+	centerline(std::vector<stim::vec3<T> > pos)
+		: std::vector<stim::vec3<T> >(pos)
+	{
 		init();
 	}
+	
+	/// Append a point and extend the length table by the one new entry.
+	/// This hides (does not override) std::vector::push_back, so it only runs when called through the centerline type.
+	void push_back(stim::vec3<T> p) {
+		const size_t appended = size();						//index the new point is about to occupy
+		std::vector< stim::vec3<T> >::push_back(p);
+		update_L(appended);
+	}
+
+	///Returns the position at a normalized parameter along the centerline.
+	///The position is linearly interpolated between the two points that bracket the requested length.
+	///@param pvalue: location along the centerline, from 0 (first point) to 1 (last point); values outside this range are clamped to the end points.
+	stim::vec3<T> p(T pvalue) {
+		if (pvalue <= 0.0) return at(0);			//at or before the start: first point
+		if (pvalue >= 1.0) return back();			//at or past the end: last point
+		if (size() < 2) return at(0);				//a single point has no segment to interpolate (at() throws if there are no points at all)
-	/// Copy constructor
-	centerline(const stim::centerline<T> &obj){
-		copy(obj);
+		T l = pvalue*L[L.size() - 1];				//convert the parameter into a length along the fiber
+		int idx = findIdx(l);						//start of the segment containing l
+		return p(l, idx);
 	}
-	//temp constructor for graph visualization
-	centerline(int n)
-	{
-		init(n);
+	///Refresh the derived data of the centerline (currently only the length table L); call after editing points in place
+	void update() {
+		this->init();
+	}
+	///Return the length of the entire centerline (the last entry of the length table), or 0 when no lengths have been computed
+	T length() {
+		return L.empty() ? T(0) : L.back();			//back() on an empty vector is undefined behavior
 	}
-	/// Constructor takes a list of stim::vec points, the radius at each point is set to zero
-	centerline(std::vector< stim::vec<T> > p, bool kd = 0){
-		init(p.size());		//initialize the fiber
-		//for each point, set the centerline position and radius
-		for(unsigned int i = 0; i < N; i++){
+	/// Stitch a centerline to itself or to a second centerline at the point where they come closest.
+	/// With one argument, c1 is tested for an end that curls back onto the fiber; if one is found, the fiber is returned as a loop part and a tail part, otherwise it is returned unchanged.
+	/// With two arguments, the closest pair of points between c1 and c2 is located. If the pair involves at least one end point and lies within the distance threshold, the fiber owning that end gains the other fiber's vertex, and a stitch landing in the interior of the other fiber splits it in two. Otherwise both fibers are returned unchanged.
+	///@param c1: first centerline
+	///@param c2: optional second centerline (an empty centerline selects the self-stitch case)
+	///@return the centerlines produced by the stitch
+	static std::vector< stim::centerline<T> > stitch(stim::centerline<T> c1, stim::centerline<T> c2 = stim::centerline<T>()) {
+
+		std::vector< stim::centerline<T> > result;
+		stim::centerline<T> new_centerline;
+		stim::vec3<T> new_vertex;
+
+		// if only one centerline, stitch itself!
+		if (c2.size() == 0) {
+			size_t num = c1.size();
+			const size_t none = (size_t)-1;				// marks "no candidate found" (cannot collide with a real index)
+			size_t id = none;							// store the downsample start position
+			T threshold = 0;
+			if (num < 4) {								// if the number of vertex is less than 4, do nothing
+				result.push_back(c1);
+				return result;
+			}
+			else {
+				// test geometry start vertex
+				stim::vec3<T> v1 = c1[1] - c1[0];		// vector from c1[0] to c1[1]
+				for (size_t p = 2; p < num; p++) {		// first vertex lying behind the start direction (more than 90 degrees away)
+					stim::vec3<T> v2 = c1[p] - c1[0];
+					float cosine = v2.dot(v1);
+					if (cosine < 0) {
+						id = p;
+						threshold = v2.len();
+						break;
+					}
+				}
+				if (id != none) {						// find a downsample position on the centerline
+					T* c;
+					c = (T*)malloc(sizeof(T) * (num - id) * 3);
+					for (size_t p = id; p < num; p++) {
+						for (size_t d = 0; d < 3; d++) {
+							c[(p - id) * 3 + d] = c1[p][d];	// the buffer only holds points id..num-1, so it is indexed relative to id
+						}
+					}
+					stim::kdtree<T, 3> kdt;
+					kdt.create(c, num - id, 5);			// create tree
-			//set the centerline position
-			for(unsigned int d = 0; d < 3; d++)
-				c[i][d] = (double) p[i][d];
+					T* query = (T*)malloc(sizeof(T) * 3);
+					for (size_t d = 0; d < 3; d++)
+						query[d] = c1[0][d];
+					size_t index;
+					T dist;
-			//set the radius
-		}
-		//generate a kd tree
-//		if(kd)
-//			gen_kdtree();
-	}
+					kdt.search(query, 1, &index, &dist);
-	/// constructor takes a list of points
-	centerline(std::vector< stim::vec3< T > > pos, bool kd = 0){
-		init(pos.size());		//initialize the fiber
+					free(query);
+					free(c);
+					index += id;						// the tree was built from c1[id..], so map its result back to an index into c1
-		//for each point, set the centerline position and radius
-		for(unsigned int i = 0; i < N; i++){
-			//set the centerline position
-			for(unsigned int d = 0; d < 3; d++)
-				c[i][d] = (double) pos[i][d];
-			//set the radius
+					if (dist > threshold) {
+						result.push_back(c1);
+					}
+					else {
+						// the loop part
+						new_vertex = c1[index];
+						new_centerline.push_back(new_vertex);
+						for (size_t p = 0; p < index + 1; p++) {
+							new_vertex = c1[p];
+							new_centerline.push_back(new_vertex);
+						}
+						result.push_back(new_centerline);
+						new_centerline.clear();
+
+						// the tail part
+						for (size_t p = index; p < num; p++) {
+							new_vertex = c1[p];
+							new_centerline.push_back(new_vertex);
+						}
+						result.push_back(new_centerline);
+					}
+				}
+				else {	// there is one potential problem that two positions have to be stitched
+						// test geometry end vertex
+					stim::vec3<T> v1 = c1[num - 2] - c1[num - 1];
+					for (size_t p = num - 2; p > 0; p--) {		// first vertex lying behind the end direction (more than 90 degrees away)
+						stim::vec3<T> v2 = c1[p - 1] - c1[num - 1];
+						float cosine = v2.dot(v1);
+						if (cosine < 0) {
+							id = p;
+							threshold = v2.len();
+							break;
+						}
+					}
+					if (id != none) {						// find a downsample position
+						T* c;
+						c = (T*)malloc(sizeof(T) * (id + 1) * 3);
+						for (size_t p = 0; p < id + 1; p++) {
+							for (size_t d = 0; d < 3; d++) {
+								c[p * 3 + d] = c1[p][d];
+							}
+						}
+						stim::kdtree<T, 3> kdt;
+						kdt.create(c, id + 1, 5);				// create tree
+
+						T* query = (T*)malloc(sizeof(T) * 1 * 3);
+						for (size_t d = 0; d < 3; d++)
+							query[d] = c1[num - 1][d];
+						size_t index;
+						T dist;
+
+						kdt.search(query, 1, &index, &dist);
+
+						free(query);
+						free(c);
+
+						if (dist > threshold) {
+							result.push_back(c1);
+						}
+						else {
+							// the tail part
+							for (size_t p = 0; p < index + 1; p++) {
+								new_vertex = c1[p];
+								new_centerline.push_back(new_vertex);
+							}
+							result.push_back(new_centerline);
+							new_centerline.clear();
+
+							// the loop part
+							for (size_t p = index; p < num; p++) {
+								new_vertex = c1[p];
+								new_centerline.push_back(new_vertex);
+							}
+							new_vertex = c1[index];
+							new_centerline.push_back(new_vertex);
+							result.push_back(new_centerline);
+						}
+					}
+					else {	// no stitch position
+						result.push_back(c1);
+					}
+				}
+			}
 		}
-		//generate a kd tree
-		//if(kd)
-		//	gen_kdtree();
-	}
-	/// Assignment operation
-	centerline& operator=(const centerline &rhs){
-		if(this == &rhs) return *this;			//test for and handle self-assignment
-		copy(rhs);
-		return *this;
-	}
+		// two centerlines
+		else {
+			// find stitch position based on nearest neighbors
+			size_t num1 = c1.size();
+			size_t num2 = c2.size();
+			if (num1 < 2 || num2 < 2) {						// fewer than two points: no segment to measure or to take an end direction from
+				result.push_back(c1);
+				result.push_back(c2);
+				return result;
+			}
+			T* c = (T*)malloc(sizeof(T) * num1 * 3);		// c1 as reference point
+			for (size_t p = 0; p < num1; p++)				// flatten the centerline into the x,y,z array the kd-tree expects
+				for (size_t d = 0; d < 3; d++)
+					c[p * 3 + d] = c1[p][d];
+			stim::kdtree<T, 3> kdt;							// kdtree object
+			kdt.create(c, num1, 5);							// create tree
-	/// Return the point on the fiber closest to q
-	/// @param q is the query point used to locate the nearest point on the fiber centerline
-//	stim::vec<T> nearest(stim::vec<T> q){
-//
-//		stim::vec<double> temp( (double) q[0], (double) q[1], (double) q[2]);
-//
-//		unsigned int idx = ann(temp);		//determine the index of the nearest neighbor
-//
-//		return stim::vec<T>((T) c[idx][0], (T) c[idx][1], (T) c[idx][2]);	//return the nearest centerline point
-//	}
-
-	/// Return the point index on the fiber closest to q
-	/// @param q is the query point used to locate the nearest point on the fiber centerline
-//	unsigned int nearest_idx(stim::vec<T> q){
-//
-//		stim::vec<double> temp((double) q[0], (double) q[1], (double) q[2]);
-//
-//		unsigned int idx = ann(temp);		//determine the index of the nearest neighbor
-//
-//		return idx;	//return the nearest centerline point index
-//	}
-
-	/// Returns the fiber centerline as an array of stim::vec points
-	std::vector< stim::vec<T> > get_centerline(){
-
-		//create an array of stim vectors
-		std::vector< stim::vec3<T> > pts(N);
-
-		//cast each point to a stim::vec, keeping only the position information
-		for(unsigned int i = 0; i < N; i++)
-			pts[i] = stim::vec3<T>((T) c[i][0], (T) c[i][1], (T) c[i][2]);
-
-		//return the centerline array
-		return pts;
+			T* query = (T*)malloc(sizeof(T) * num2 * 3);	// c2 as query point
+			for (size_t p = 0; p < num2; p++) {
+				for (size_t d = 0; d < 3; d++) {
+					query[p * 3 + d] = c2[p][d];
+				}
+			}
+			std::vector<size_t> index(num2);
+			std::vector<T> dist(num2);
+
+			kdt.search(query, num2, &index[0], &dist[0]);	// find the nearest neighbors in c1 for c2
+
+			// clear up
+			free(query);
+			free(c);
+
+			// find the average vertex distance of one centerline
+			T sigma1 = 0;
+			T sigma2 = 0;
+			for (size_t p = 0; p < num1 - 1; p++)
+				sigma1 += (c1[p] - c1[p + 1]).len();
+			for (size_t p = 0; p < num2 - 1; p++)
+				sigma2 += (c2[p] - c2[p + 1]).len();
+			sigma1 /= (num1 - 1);
+			sigma2 /= (num2 - 1);
+			float threshold = 4 * (sigma1 + sigma2) / 2;			// four times the mean vertex spacing of the two fibers
+
+			// locate the closest c2 vertex once; it gives both the minimum distance and the stitch position
+			typename std::vector<T>::iterator smallest = std::min_element(dist.begin(), dist.end());
+			T min_d = *smallest;
+
+			if (min_d > threshold) {								// if the minimum distance is too large
+				result.push_back(c1);
+				result.push_back(c2);
+
+#ifdef DEBUG
+				std::cout << "The distance between these two centerlines is too large" << std::endl;
+#endif
+			}
+			else {
+				size_t i = (size_t)(smallest - dist.begin());		// index of the min-distance entry in the distance list
+
+				// stitch position in c1 and c2
+				size_t id1 = index[i];
+				size_t id2 = i;
+
+				// a stitch between two interior vertices is not accepted;
+				// at least one of the two stitch vertices has to be an end point
+				if (id1 != 0 && id1 != num1 - 1 && id2 != 0 && id2 != num2 - 1) {
+					result.push_back(c1);
+					result.push_back(c2);
+				}
+				else {
+					if (id1 == 0 || id1 == num1 - 1) {			// if the stitch vertex is the first or last vertex of c1
+						// for c2, consider two cases(one degenerate case)
+						if (id2 == 0 || id2 == num2 - 1) {		// case 1, if stitch position is also on the end of c2
+							// we have to decide which centerline get a new vertex, based on direction
+							// for c1, compute the direction change angle
+							stim::vec3<T> v1, v2;
+							float alpha1, alpha2;				// cosine of the direction change
+							if (id1 == 0)
+								v1 = (c1[1] - c1[0]).norm();
+							else
+								v1 = (c1[num1 - 2] - c1[num1 - 1]).norm();
+							v2 = (c2[id2] - c1[id1]).norm();
+							alpha1 = v1.dot(v2);
+							if (id2 == 0)
+								v1 = (c2[1] - c2[0]).norm();
+							else
+								v1 = (c2[num2 - 2] - c2[num2 - 1]).norm();
+							v2 = (c1[id1] - c2[id2]).norm();
+							alpha2 = v1.dot(v2);
+							if (std::fabs(alpha1) > std::fabs(alpha2)) {		// add the vertex to c1 in order to get smooth connection (fabs: unqualified abs may pick the integer overload)
+								// push back c1
+								if (id1 == 0) {									// stitch vertex on c2 -> start of c1 -> end of c1
+									new_vertex = c2[id2];
+									new_centerline.push_back(new_vertex);
+									for (size_t p = 0; p < num1; p++) {
+										new_vertex = c1[p];
+										new_centerline.push_back(new_vertex);
+									}
+								}
+								else {											// start of c1 -> end of c1 -> stitch vertex on c2
+									for (size_t p = 0; p < num1; p++) {
+										new_vertex = c1[p];
+										new_centerline.push_back(new_vertex);
+									}
+									new_vertex = c2[id2];
+									new_centerline.push_back(new_vertex);
+								}
+								result.push_back(new_centerline);
+								new_centerline.clear();
+
+								// push back c2
+								for (size_t p = 0; p < num2; p++) {
+									new_vertex = c2[p];
+									new_centerline.push_back(new_vertex);
+								}
+								result.push_back(new_centerline);
+							}
+							else {												// add the vertex to c2 in order to get smooth connection
+								// push back c1
+								for (size_t p = 0; p < num1; p++) {
+									new_vertex = c1[p];
+									new_centerline.push_back(new_vertex);
+								}
+								result.push_back(new_centerline);
+								new_centerline.clear();
+
+								// push back c2
+								if (id2 == 0) {									// stitch vertex on c1 -> start of c2 -> end of c2
+									new_vertex = c1[id1];
+									new_centerline.push_back(new_vertex);
+									for (size_t p = 0; p < num2; p++) {
+										new_vertex = c2[p];
+										new_centerline.push_back(new_vertex);
+									}
+								}
+								else {											// start of c2 -> end of c2 -> stitch vertex on c1
+									for (size_t p = 0; p < num2; p++) {
+										new_vertex = c2[p];
+										new_centerline.push_back(new_vertex);
+									}
+									new_vertex = c1[id1];
+									new_centerline.push_back(new_vertex);
+								}
+								result.push_back(new_centerline);
+							}
+						}
+						else {												// case 2, the stitch position is in the interior of c2
+							// push back c1
+							if (id1 == 0) {									// stitch vertex on c2 -> start of c1 -> end of c1
+								new_vertex = c2[id2];
+								new_centerline.push_back(new_vertex);
+								for (size_t p = 0; p < num1; p++) {
+									new_vertex = c1[p];
+									new_centerline.push_back(new_vertex);
+								}
+							}
+							else {											// start of c1 -> end of c1 -> stitch vertex on c2
+								for (size_t p = 0; p < num1; p++) {
+									new_vertex = c1[p];
+									new_centerline.push_back(new_vertex);
+								}
+								new_vertex = c2[id2];
+								new_centerline.push_back(new_vertex);
+							}
+							result.push_back(new_centerline);
+							new_centerline.clear();
+
+							// push back c2
+							for (size_t p = 0; p < id2 + 1; p++) {			// first part
+								new_vertex = c2[p];
+								new_centerline.push_back(new_vertex);
+							}
+							result.push_back(new_centerline);
+							new_centerline.clear();
+
+							for (size_t p = id2; p < num2; p++) {			// second part
+								new_vertex = c2[p];
+								new_centerline.push_back(new_vertex);
+							}
+							result.push_back(new_centerline);
+						}
+					}
+					else {							// the stitch vertex is an end of c2 and lands in the interior of c1
+						// push back c2
+						if (id2 == 0) {										// stitch vertex on c1 -> start of c2 -> end of c2
+							new_vertex = c1[id1];
+							new_centerline.push_back(new_vertex);
+							for (size_t p = 0; p < num2; p++) {
+								new_vertex = c2[p];
+								new_centerline.push_back(new_vertex);
+							}
+						}
+						else {												// start of c2 -> end of c2 -> stitch vertex on c1
+							for (size_t p = 0; p < num2; p++) {
+								new_vertex = c2[p];
+								new_centerline.push_back(new_vertex);
+							}
+							new_vertex = c1[id1];
+							new_centerline.push_back(new_vertex);
+						}
+						// emitted for both orientations of c2 (previously the id2 == 0 case returned nothing at all)
+						result.push_back(new_centerline);
+						new_centerline.clear();
+
+						// push back c1
+						for (size_t p = 0; p < id1 + 1; p++) {			// first part
+							new_vertex = c1[p];
+							new_centerline.push_back(new_vertex);
+						}
+						result.push_back(new_centerline);
+						new_centerline.clear();
+
+						for (size_t p = id1; p < num1; p++) {			// second part
+							new_vertex = c1[p];
+							new_centerline.push_back(new_vertex);
+						}
+						result.push_back(new_centerline);
+					}
+				}
+			}
+		}
+		return result;
 	}
 	/// Split the fiber at the specified index. If the index is an end point, only one fiber is returned
 	std::vector< stim::centerline<T> > split(unsigned int idx){
-		std::vector< stim::centerline<T> > fl;		//create an array to store up to two fibers
+		std::vector< stim::centerline<T> > fl;				//create an array to store up to two fibers
+		size_t N = size();
 		//if the index is an end point, only the existing fiber is returned
 		if(idx == 0 || idx == N-1){
@@ -216,123 +530,84 @@ public:
 			fl.resize(2);								//set the array size to 2
-			fl[0].init(N1);								//set the size of each fiber
-			fl[1].init(N2);
+			fl[0] = stim::centerline<T>(N1);			//set the size of each fiber
+			fl[1] = stim::centerline<T>(N2);
 			//copy both halves of the fiber
-			unsigned int i, d;
+			unsigned int i;
 			//first half
-			for(i = 0; i < N1; i++){					//for each centerline point
-				for(d = 0; d < 3; d++)
-					fl[0].c[i][d] = c[i][d];			//copy each coordinate
-			}
+			for(i = 0; i < N1; i++)					//for each centerline point
+				fl[0][i] = std::vector< stim::vec3<T> >::at(i);
+			fl[0].init();							//initialize the length vector
 			//second half
-			for(i = 0; i < N2; i++){
-				for(d = 0; d < 3; d++)
-					fl[1].c[i][d] = c[idx + i][d];
-
-			}
+			for(i = 0; i < N2; i++)
+				fl[1][i] = std::vector< stim::vec3<T> >::at(idx+i);
+			fl[1].init();							//initialize the length vector
 		}
 		return fl;										//return the array
 	}
-	/// Calculates the set of fibers resulting from a connection between the current fiber and a fiber f
-
-	/// @param f is the fiber that will be connected to the current fiber
-/*	std::vector< stim::centerline<T> > connect( stim::centerline<T> &f, double dist){
-
-		double min_dist;
-		unsigned int idx0, idx1;
-
-		//go through each point in the query fiber, looking for the indices for the closest points
-		for(unsigned int i = 0; i < f.n_pts(); i++){
-			//Run through all points and find the index with the closest point, then partition the fiber and return two fibers.
-
-		}
-
-
-
-	}
-*/
 	/// Outputs the fiber as a string
 	std::string str(){
 		std::stringstream ss;
-
-		//create an iterator for the point list
-		//typename std::list< point<T> >::iterator i;
-		for(unsigned int i = 0; i < N; i++){
-			ss<<"  [  ";
-			for(unsigned int d = 0; d < 3; d++){
-				ss<<c[i][d]<<"  ";
-			}
-		}
+		//header with the point count, then one point per line, then a closing rule
+		ss << "---------[" << this->size() << "]---------" << std::endl;
+		typename std::vector< stim::vec3<T> >::iterator it;
+		for (it = this->begin(); it != this->end(); ++it)
+			ss << *it << std::endl;
+		ss << "--------------------" << std::endl;
 		return ss.str();
 	}
-	/// Returns the number of centerline points in the fiber
-	unsigned int size(){
-		return N;
-	}
-
-
-	/// Bracket operator returns the element at index i
-
-	/// @param i is the index of the element to be returned as a stim::vec
-	stim::vec<T> operator[](unsigned i){
-		return get_vec(i);
-	}
 	/// Back method returns the last point in the fiber
-	stim::vec<T> back(){
-		return get_vec(N-1);
+	stim::vec3<T> back(){
+		return *(this->end() - 1);					//copy of the final element; like std::vector::back, the fiber must not be empty
 	}
+
 		////resample a fiber in the network
 	stim::centerline<T> resample(T spacing)
-	{
-		std::cout<<"fiber::resample()"<<std::endl;
-
-		std::vector<T> v(3);    //v-direction vector of the segment
-		stim::vec<T> p(3);      //- intermediate point to be added
-		stim::vec<T> p1(3);   // p1 - starting point of an segment on the fiber,
-		stim::vec<T> p2(3);   // p2 - ending point,
-		double sum=0;  //distance summation
-		std::vector<stim::vec<T> > fiberPositions = centerline();
-		std::vector<stim::vec<T> > newPointList; // initialize list of new resampled points on the fiber
+	{	
+		//std::cout<<"fiber::resample()"<<std::endl;
+
+		stim::vec3<T> v;    //v-direction vector of the segment
+		stim::vec3<T> p;      //- intermediate point to be added
+		stim::vec3<T> p1;   // p1 - starting point of an segment on the fiber,
+		stim::vec3<T> p2;   // p2 - ending point,
+		//double sum=0;  //distance summation
+
+		size_t N = size();
+
+		centerline<T> new_c; // initialize list of new resampled points on the fiber
 		// for each point on the centerline (skip if it is the last point on centerline)
-		//unsigned int N = fiberPositions.size(); // number of points on the fiber
 		for(unsigned int f=0; f< N-1; f++)
-		{
+		{			
+			p1 = at(f); 
+			p2 = at(f+1);
+			v = p2 - p1;
-			p1 = fiberPositions[f]; p2 = fiberPositions[f + 1]; v = p2 - p1;
-			for(unsigned int d = 0; d < 3; d++){
-				sum +=v[d] * v[d];}              //length of segment-distance between starting and ending point
-
-			T lengthSegment = sqrt(sum);  //find Length of the segment as distance between the starting and ending points of the segment
-
-			if(lengthSegment >= spacing) // if length of the segment is greater than standard deviation resample
-				{
-					// repeat resampling until accumulated stepsize is equsl to length of the segment
-					for(T step=0.0; step<lengthSegment; step+=spacing)
-					{
-						// calculate the resampled point by travelling step size in the direction of normalized gradient vector
-						for(unsigned int i=0; i<3;i++)
-							{
-								p[i] = p1[i] + v[i]*(step/lengthSegment);
-							} //for each dimension
-						// add this resampled points to the new fiber list
-						newPointList.push_back(p);
-					}
+			T lengthSegment = v.len();			//find Length of the segment as distance between the starting and ending points of the segment
+
+			if(lengthSegment >= spacing){ // if length of the segment is greater than standard deviation resample
+				
+				// repeat resampling until accumulated stepsize is equsl to length of the segment
+				for(T step=0.0; step<lengthSegment; step+=spacing){
+					// calculate the resampled point by travelling step size in the direction of normalized gradient vector
+					p = p1 + v * (step / lengthSegment);
+					
+					// add this resampled points to the new fiber list
+					new_c.push_back(p);
 				}
-			else       // length of the segment is now less than standard deviation, push the ending point of the segment and proceed to the next point in the fiber
-				newPointList.push_back(fiberPositions[f+1]);
 			}
-		newPointList.push_back(fiberPositions[N-1]);   //add the last point on the fiber to the new fiber list
-		centerline newFiber(newPointList);
-		return newFiber;
+			else       // length of the segment is now less than standard deviation, push the ending point of the segment and proceed to the next point in the fiber
+				new_c.push_back(at(f));
+		}
+		new_c.push_back(at(N-1));   //add the last point on the fiber to the new fiber list
+		//centerline newFiber(newPointList);
+		return new_c;
 	}
 };
+/*
+Copyright <2017> <David Mayerich>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef STIM_CENTERLINE_H
+#define STIM_CENTERLINE_H
+
+#include <vector>
+#include <stim/math/vec3.h>
+
+namespace stim{
+
+/**	This class stores information about a single fiber represented as a set of geometric points
+ *	between two branch or end points. This class is used as a fundamental component of the stim::network
+ *	class to describe an interconnected (often biological) network.
+ */
+template<typename T>
+class centerline{
+
+protected:
+	unsigned int N;					//number of points in the fiber
+	double **c;						//centerline (array of double pointers)
+
+	/// Reset the fiber to the empty state: zero points and a NULL coordinate array (nothing is freed here)
+	void init(){
+		c = NULL;
+		N = 0;
+	}
+
+	/// Initialize a fiber with n centerline points, all located at [0, 0, 0].
+	/// Any storage the fiber already owned is not released here.
+	/// @param n number of points to allocate
+	void init(unsigned int n){
+
+		N = n;												//set the number of points
+		c = (double**) malloc(sizeof(double*) * N);			//allocate the array of per-point pointers
+
+		for(unsigned int i = 0; i < N; i++)					//allocate zero-filled space for each point
+			c[i] = (double*) calloc(3, sizeof(double));		//calloc so the documented [0, 0, 0] start actually holds (malloc left the coordinates uninitialized)
+	}
+
+	/// Makes this fiber a deep copy of another one: fresh storage is allocated and every coordinate is duplicated.
+	/// Storage this fiber held before the call is not released here.
+	/// @param cpy is the fiber to copy from
+	/// @param kd is not used
+	void copy( const stim::centerline<T>& cpy, bool kd = 0){
+
+		init(cpy.N);								//allocate room for as many points as the source has
+
+		for(unsigned int i = 0; i < N; i++){		//duplicate one point per iteration
+			const double* src = cpy.c[i];
+			double* dst = c[i];
+			dst[0] = src[0];
+			dst[1] = src[1];
+			dst[2] = src[2];
+		}
+	}
+
+	/// Euclidean distance between two points, each given as an array of three doubles.
+	/// @param p0 first point
+	/// @param p1 second point
+	/// @return the straight-line distance between p0 and p1
+	double dist(double* p0, double* p1){
+
+		double sum = 0;								//running sum of squared coordinate differences
+		for(unsigned int d = 0; d < 3; d++)
+		{
+			double v = p1[d] - p0[d];				//kept in double: a float temporary would round the difference before squaring
+			sum += v * v;
+		}
+		return sqrt(sum);
+	}
+
+	/// Returns a stim::vec representing the point at index i, built from the three stored double coordinates
+
+	/// @param i is an index of the desired centerline point (the index is not range-checked here)
+	stim::vec<T> get_vec(unsigned i){
+		stim::vec3<T> r;
+		r.resize(3);						//NOTE(review): assumes stim::vec3 provides resize(); it is declared outside this file - confirm
+		r[0] = c[i][0];
+		r[1] = c[i][1];
+		r[2] = c[i][2];
+
+		return r;							//NOTE(review): r is a stim::vec3<T> but the return type is stim::vec<T>; relies on a conversion defined elsewhere - confirm
+	}
+
+
+public:
+
+	/// Construct an empty fiber (zero points, NULL coordinate array)
+	centerline(){
+		this->init();
+	}
+
+	/// Copy constructor: allocates new storage and duplicates the coordinates of obj
+	centerline(const stim::centerline<T> &obj){
+		this->copy(obj);
+	}
+
+	/// Initialize a centerline with n points.
+	/// A negative n is treated as zero; passed straight through it would wrap around to a huge unsigned point count.
+	centerline(int n){
+		init(n > 0 ? (unsigned int) n : 0u);
+	}
+
+	/// Builds a fiber from a list of stim::vec points; components 0, 1 and 2 of each point are stored as doubles
+	/// @param p is the list of points to copy
+	/// @param kd is not used
+	centerline(std::vector< stim::vec<T> > p, bool kd = 0){
+		init(p.size());								//one coordinate triple per input point
+
+		for(unsigned int i = 0; i < N; i++){
+			double* dst = c[i];						//storage of point i
+			dst[0] = (double) p[i][0];
+			dst[1] = (double) p[i][1];
+			dst[2] = (double) p[i][2];
+		}
+	}
+
+	/// Builds a fiber from a list of stim::vec3 points; the three components of each point are stored as doubles
+	/// @param pos is the list of points to copy
+	/// @param kd is not used
+	centerline(std::vector< stim::vec3< T > > pos, bool kd = 0){
+		init(pos.size());							//one coordinate triple per input point
+
+		for(unsigned int i = 0; i < N; i++){
+			double* dst = c[i];						//storage of point i
+			dst[0] = (double) pos[i][0];
+			dst[1] = (double) pos[i][1];
+			dst[2] = (double) pos[i][2];
+		}
+	}
+
+	/// Assignment operation: copies the points of rhs into this fiber
+	/// @param rhs is the fiber to copy from
+	/// @return a reference to this fiber
+	centerline& operator=(const centerline &rhs){
+		if(this != &rhs)						//self-assignment leaves the fiber untouched
+			copy(rhs);
+		return *this;
+	}
+
+
+	/// Returns the fiber centerline as an array of stim::vec points
+
+	/// @return one 3-component stim::vec per centerline point, with each stored coordinate cast to T
+	std::vector< stim::vec<T> > get_centerline(){
+
+		//the element type has to match the declared return type: a std::vector of stim::vec3
+		//cannot be returned as a std::vector of stim::vec
+		std::vector< stim::vec<T> > pts;
+		pts.reserve(N);
+
+		stim::vec<T> p(3);						//reusable 3-component point
+		for(unsigned int i = 0; i < N; i++){
+			for(unsigned int d = 0; d < 3; d++)
+				p[d] = (T) c[i][d];				//keep only the position, cast to the template type
+			pts.push_back(p);
+		}
+
+		//return the centerline array
+		return pts;
+	}
+
+	/// Split the fiber at the specified index. If the index is an end point, only one fiber is returned
+
+	/// The point at idx is shared: it is the last point of the first fiber and the first point of the second.
+	/// @param idx is the index of the point at which the fiber is split
+	/// @return two fibers, or a single copy of this fiber when idx is an end point, when idx lies
+	///         beyond the last point, or when the fiber is empty
+	std::vector< stim::centerline<T> > split(unsigned int idx){
+
+		std::vector< stim::centerline<T> > fl;		//create an array to store up to two fibers
+
+		//N == 0 is tested first so that N - 1 is never evaluated for an empty fiber (it would wrap around);
+		//idx >= N - 1 covers both the last point and any out-of-range index
+		if(idx == 0 || N == 0 || idx >= N - 1){
+			fl.resize(1);							//only the existing fiber is returned
+			fl[0] = *this;							//copy the current fiber
+		}
+
+		//if the index is an interior point
+		else{
+
+			unsigned int N1 = idx + 1;					//points 0..idx go to the first fiber
+			unsigned int N2 = N - idx;					//points idx..N-1 go to the second fiber
+
+			fl.resize(2);								//set the array size to 2
+
+			//NOTE(review): resize() default-constructs both fibers before init(n) runs on them;
+			//confirm that calling init(n) afterwards does not leak what the default constructor set up
+			fl[0].init(N1);								//set the size of each fiber
+			fl[1].init(N2);
+
+			unsigned int i, d;
+
+			//first half
+			for(i = 0; i < N1; i++){					//for each centerline point
+				for(d = 0; d < 3; d++)
+					fl[0].c[i][d] = c[i][d];			//copy each coordinate
+			}
+
+			//second half, starting at the split point itself
+			for(i = 0; i < N2; i++){
+				for(d = 0; d < 3; d++)
+					fl[1].c[i][d] = c[idx + i][d];
+			}
+		}
+
+		return fl;										//return the array
+
+	}
+
+	/// Outputs the fiber as a string
+
+	/// @return every point written as an opening "  [  " followed by its three coordinates,
+	///         each coordinate followed by two spaces
+	std::string str(){
+		std::stringstream out;
+
+		unsigned int i = 0;
+		while(i < N){
+			out<<"  [  ";												//marks the start of a point
+			out<<c[i][0]<<"  "<<c[i][1]<<"  "<<c[i][2]<<"  ";			//x, y and z
+			++i;
+		}
+
+		return out.str();
+	}
+	/// Returns the number of centerline points in the fiber
+	/// @return the stored point count N
+	unsigned int size(){
+		return N;
+	}
+
+
+	/// Bracket operator returns the element at index i
+
+	/// @param i is the index of the element to be returned as a stim::vec
+	/// @return the result of get_vec(i); it is returned by value, so the stored point cannot be modified through it
+	stim::vec<T> operator[](unsigned i){
+		return get_vec(i);
+	}
+
+	/// Back method returns the last point in the fiber
+	/// @return the point at index N-1
+	/// NOTE(review): N-1 is not a valid index when the fiber has no points; callers presumably guarantee N > 0
+	stim::vec<T> back(){
+		return get_vec(N-1);
+	}
+	/// Resamples the fiber so that long segments are subdivided at a fixed spacing
+
+	/// Every segment whose length is at least spacing contributes points at distances 0, spacing,
+	/// 2*spacing, ... (strictly below the segment length) measured from its starting point.
+	/// A shorter segment contributes only its ending point. The last point of the fiber is always appended.
+	/// NOTE(review): this sampling policy is kept from the original code; a short segment followed by a
+	/// long one places the shared point in the list twice - confirm whether that is intended.
+	/// @param spacing is the distance between consecutive samples along a segment
+	/// @return a new centerline built from the resampled points; a copy of this fiber when the fiber is
+	///         empty or spacing is not positive
+	stim::centerline<T> resample(T spacing)
+	{
+		std::cout<<"fiber::resample()"<<std::endl;
+
+		//an empty fiber has nothing to resample (and N-1 would wrap around); a spacing that is
+		//not positive would never advance the sampling loop below
+		if(N == 0 || !(spacing > 0))
+			return *this;
+
+		stim::vec<T> p(3);								//point being added to the new fiber
+		std::vector<stim::vec<T> > newPointList;		//list of resampled points
+
+		//for each segment (pair of consecutive points) on the centerline
+		for(unsigned int f = 0; f + 1 < N; f++)
+		{
+			double v[3];								//direction vector of the segment
+			double sum = 0;								//squared length, restarted for every segment
+			for(unsigned int d = 0; d < 3; d++){
+				v[d] = c[f + 1][d] - c[f][d];
+				sum += v[d] * v[d];
+			}
+
+			T lengthSegment = (T) sqrt(sum);			//distance between the starting and ending points
+
+			if(lengthSegment >= spacing)				//the segment is long enough to be subdivided
+			{
+				//walk from the starting point towards the ending point in steps of spacing
+				for(T step = 0.0; step < lengthSegment; step += spacing)
+				{
+					for(unsigned int i = 0; i < 3; i++)
+						p[i] = (T)(c[f][i] + v[i] * (step / lengthSegment));
+					newPointList.push_back(p);
+				}
+			}
+			else										//short segment: keep only its ending point
+			{
+				for(unsigned int i = 0; i < 3; i++)
+					p[i] = (T) c[f + 1][i];
+				newPointList.push_back(p);
+			}
+		}
+
+		//add the last point on the fiber to the new fiber list
+		for(unsigned int i = 0; i < 3; i++)
+			p[i] = (T) c[N - 1][i];
+		newPointList.push_back(p);
+
+		centerline newFiber(newPointList);
+		return newFiber;
+	}
+
+};
+
+
+
+}	//end namespace stim
+
+
+
+#endif
-#pragma once
-#include <fstream>									// Required for ofstream, etc.
-#include <iomanip>									// Required for setw
-#include <iostream>									// Required for cout, cin, etc.
-#include <tuple>									// Required for returning multiple values from a function
-
-using namespace std;
-
-
-class flow
-{
-public:
-	void backupToTxt(unsigned int nL, double **D, char filename[]);
-	tuple<int, int> copySrcDesRadLen(char filename[]);
-	void copyToArray(int *src, int *dest, double *radii, double *len);
-	int getDangleNodes(int datarow, int numNodes, int *row, int *column, int *dangleNodes);
-	void inversion(double **a, int n, double **b);
-
-protected:
-	float determinant(double **a, int n);
-	int minor(double **src, double **dest, int row, int col, int order);
-};
-
-/* Function to find the dangle nodes in a network */
-// Created by Cherub P. Harder (8/10/2015), U of Houston
-// Modified by Cherub P. Harder on 8/12/2015
-int flow::getDangleNodes(int datarow, int numNodes, int *column1, int *column2, int *dangleNodes)
-{
-	int count = datarow, diff1 = 0, diff2 = 0, numPress = 0, st = 0;
-
-	// Find matching nodes within column2
-	for( int i = 0; i < count; i++ )
-	{
-		for( int y = i+1; y < datarow; y++ )
-		{
-			if( column2[i] == column2[y] )			// Is there a match?
-			{
-				st = column2[i];					// Save the matching node
-//				cout << endl << column2[i] << " = " << column2[y] << endl; // Display the matching nodes
-				memmove(column2+i, column2+i+1, (datarow-(i+1)) * sizeof(column2[0])); // Move up the rows
-													// taking the places of the rows before them starting
-													// with where the matching node is located
-				column2[datarow-1] = st;			// Place the matching node at the very end of the array--
-													// this is for comparison purpose so that the other match-
-													// ing node will be moved as well and then omitted later.
-				diff1++;							// Count the matching node
-
-				// Display the updated array (with the matching node moved to the bottommost row)
-/*				cout << "Updated array:" << endl;
-				for( int k = 0; k < datarow; k++ )
-					cout << column2[k] << endl;
+/*
+Copyright <2017> <David Mayerich>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
-				// Decrement the counters
-				// NOTE: The counters need to be decremented because the rows had been moved up, so the same
-				// locations need to be read again because they contain different values now after the move.
-				i--;								// Decrement i to read the node that took over the place
-													// of the matching node. Otherwise, it will be skipped.
-				y--;								// Decrement y to read the following node for comparison
-				count--;							// The maximum count need to be decremented so that the
-													// matching nodes that had been moved will not be read again.
-													// However, the maximum count (datarow) for finding a match
-													// will not be decremented because the remaining matching
-													// node that has not been moved yet needs to be moved and
-													// the only way to do that is to match it with its duplicate.
-			}
-		}
-	}
-	
-	// Store the nodes that have no duplicates
-	// NOTE: This will only save the nodes that have not been moved to the bottom.
-//	cout << "\ndangleNodes array:" << endl;
-	for( int j = 0; j < datarow-diff1; j++ )
-	{
-		dangleNodes[numPress] = column2[j];
-//		cout << dangleNodes[j] << endl;				// DELETE!!!
-		numPress++;									// Count the non-duplicated node
-	}
-
-	// Find if the non-duplicated nodes have a match from column1
-	count = datarow-diff1;							// Reinitialize the counter
-
-	for( int i = 0; i < count; i++ )
-	{
-		for( int j = 0; j < datarow; j++ )
-		{
-			if( dangleNodes[i] == column1[j] )		// Is there a match?
-			{
-				st = column1[j];					// Save the matching node
-//				cout << endl << dangleNodes[i] << " = " << column1[j] << endl; // Display the matching nodes
-				memmove(dangleNodes+i, dangleNodes+i+1, (datarow-diff1-(i+1)) * sizeof(dangleNodes[0]));
-				dangleNodes[count-1] = st;			// Move the matching node to the bottom of the array
-				diff2++;							// Count the matching node
-
-				// Display the updated array
-/*				cout << "Updated dangleNodes array:" << endl;
-				for( int k = 0; k < count-1; k++ )
-				{
-					cout << dangleNodes[k] << endl;
+#ifndef STIM_FLOW_H
+#define STIM_FLOW_H
+
+#include <vector>
+#include <algorithm>
+
+//STIM include
+#include <stim/math/vec3.h>
+#include <stim/parser/arguments.h>
+#include <stim/biomodels/network.h>
+
+#ifdef __CUDACC__
+#include <cublas_v2.h>
+#include <stim/cuda/cudatools/error.h>
+#endif
+
+namespace stim {
+	/// Plain aggregate holding three values of independent types
+	template <typename A, typename B, typename C>
+	struct triple {
+		A first;		// first stored value
+		B second;		// second stored value
+		C third;		// third stored value
+	};
+
+	/// Plain data record for one bridge: vertex indices and coordinates plus per-bridge scalar quantities
+	template <typename T>
+	struct bridge {
+		std::vector<unsigned> v;	// vertices' indices
+		std::vector<typename stim::vec3<T> > V;	// vertices' coordinates
+		T l;		// length
+		T r;		// radius (a single value is stored per bridge)
+		T deltaP;	// pressure drop
+		T Q;		// volume flow rate
+	};
+
+	template <typename T>
+	class flow {
+
+	private:
+		
+		// Build the minor matrix of src for element [row][col]: every element of the order x order
+		// matrix src is copied into dest except those in row `row` and column `col`.
+		// dest must provide storage for (order - 1) x (order - 1) elements.
+		void get_minor(T** src, T** dest, int row, int col, int order) {
+
+			// position in dest of the next element to be written
+			int rowCount = 0;
+			int colCount = 0;
+
+			for (int i = 0; i < order; i++) {
+				if (i != row) {
+					colCount = 0;
+					for (int j = 0; j < order; j++) {
+						// copy only the elements outside the excluded column
+						if (j != col) {
+							dest[rowCount][colCount] = src[i][j];
+							colCount++;
+						}
+					}
+					rowCount++;
 				}
-*/
-				// Decrement the counters
-				i--;
-				j--;
-				count--;
-				numPress--;							// Decrement to the exact number of dangle nodes
 			}
 		}
-	}
-	return numPress;								// Number of dangle nodes
-}
+		// Calculate the determinant of the order x order matrix mat by recursive
+		// cofactor expansion along row 0. Returns 1 when order <= 0.
+		T determinant(T** mat, int order) {
+			// an empty matrix has determinant 1 (empty product); returning here also keeps a
+			// non-positive order away from the (order - 1)-sized allocations below
+			if (order <= 0)
+				return 1;
+			// degenerate case when n = 1
+			if (order == 1)
+				return mat[0][0];
-// Function to make a backup copy of the contents of a matrix to a .txt file
-// Created by Cherub P. Harder (8/10/2015), U of Houston
-void flow::backupToTxt(unsigned int nL, double **D, char filename[])
-{
-	ofstream output_file(filename);
-	
-	for( unsigned int i = 0; i < nL; i++ )
-	{
-		for( int j = 0; j < 4; j++ )
-		{
-			if( j < 3 )
-				output_file << D[i][j] << "\t";
-
-			else
-				output_file << D[i][j];
-		}
+			T det = 0.0;		// determinant value
-		output_file << "\n";
-	}
+			// allocate the (order - 1) x (order - 1) scratch matrix that receives each minor
+			T** minor = (T**)malloc((order - 1) * sizeof(T*));
+			for (int i = 0; i < order - 1; i++)
+				minor[i] = (T*)malloc((order - 1) * sizeof(T));
-	output_file.close( );
-}
+			for (int i = 0; i < order; i++) {
-// Function to make separate copies of the source nodes, destination nodes, radii, and lengths
-// Created by Cherub P. Harder (8/10/2015), U of Houston
-tuple<int, int> flow::copySrcDesRadLen(char filename[])
-{
-	int cnt = 0, numElements = 0, numNodes = 0;
-	float number = 0.0;
-	ofstream srcData("srcCol.txt");					// A .txt file to store the source nodes
-	ofstream desData("destCol.txt");				// A .txt file to store the destination nodes
-	ofstream radiiData("radii.txt");				// A .txt file to store the radii
-	ofstream lenData("lengths.txt");				// A .txt file to store the lengths
-	FILE *fp = fopen(filename, "r");				// Create a variable of type FILE* and open the file using
-													// the fopen function and assign the file to the variable
-	// Check if the file exists
-	if(fp == NULL)									// Alternative: if(!fp)
-	{
-		printf("Error! File does not exist.\n");
-		getchar( );									// Pause
-		exit(-1);									// NOTE: Must include stdlib.h.
-	}
-
-	// Store data to their respective .txt files
-	while(fscanf(fp, "%f", &number) == 1)
-	{
-		cnt++;										// Increment counter
-
-		// Store to srcCol.txt
-		if(cnt == 1)
-			srcData << number << endl;
-
-		// Store to destCol.txt
-		if(cnt == 2)
-			desData << number << endl;
-
-		// Save the current number of nodes
-		if(cnt < 3)
-		{
-			if(number > numNodes)
-				numNodes = (int)number;
-		}
+				// get minor of element(0, i)
+				get_minor(mat, minor, 0, i, order);
-		// Store to radii.txt
-		if(cnt == 3)
-			radiiData << number << endl;
+				// recursion: the sign alternates along row 0, starting with + at column 0
+				det += (i % 2 == 1 ? -1.0 : 1.0) * mat[0][i] * determinant(minor, order - 1);
+			}
-		// Store to lengths.txt
-		if(cnt == 4)
-		{
-			lenData << number << endl;
+			// release memory
+			for (int i = 0; i < order - 1; i++)
+				free(minor[i]);
+			free(minor);
-			numElements++;							// Count the elements
-			cnt = 0;								// Reset counter
+			return det;
 		}
-	}
-
-	srcData.close( );
-	desData.close( );
-	radiiData.close( );
-	lenData.close( );
-
-	return make_tuple(numNodes, numElements);		// Return two values
-}
-
-
-// Function to copy data for .txt files to their respective arrays
-// Created by Cherub P. Harder (8/11/2015), U of Houston
-void flow::copyToArray(int *src, int *dest, double *radii, double *len)
-{
-	int v = 0;
-	double tmp = 0, R = 0, L = 0;
-	
-	// Store source node values to the array src
-	ifstream readSrc("srcCol.txt");
-
-	while( readSrc >> tmp )
-	{
-		src[v] = (int)tmp;
-		v++;
-	}
-
-	readSrc.close( );
-	// Store destination node values to the array dest
-	v = 0;											// Reset counter
-	ifstream readDest("destCol.txt");
+	public:
+		T** C;																	// Conductance
+		std::vector<typename stim::triple<unsigned, unsigned, float> > Q;		// volume flow rate
+		std::vector<T> QQ;														// Q' vector
+		std::vector<T> P;														// initial pressure
+		std::vector<T> pressure;												// final pressure
-	while( readDest >> tmp )
-	{
-		dest[v] = (int)tmp;
-		v++;
-	}
+		//std::vector<typename stim::triple<unsigned, unsigned, T> > V;		 // velocity
+		//std::vector<typename stim::triple<unsigned, unsigned, T> > Q;		 // volume flow rate
+		//std::vector<typename stim::triple<unsigned, unsigned, T> > deltaP; // pressure drop
-	readDest.close( );
+		flow() {}				// default constructor: empty body, so C is not allocated until init() is called
-	// Store radius values to the array radii
-	v = 0;											// Reset counter
-	ifstream readRad("radii.txt");
-
-	while( readRad >> tmp )
-	{
-		radii[v] = tmp;
-		v++;
-	}
+		// Allocate the value-initialized n_v x n_v matrix C and size the per-vertex vectors
+		// (QQ, P, pressure) to n_v and the per-edge vector Q to n_e.
+		void init(unsigned n_e, unsigned n_v) {
+
+			C = new T*[n_v]();
+			unsigned row = 0;
+			while (row < n_v) {
+				C[row] = new T[n_v]();		// one value-initialized row per vertex
+				++row;
+			}
-	readRad.close( );
+			// per-vertex quantities
+			QQ.resize(n_v);
+			P.resize(n_v);
+			pressure.resize(n_v);
-	// Store length values to the array len
-	v = 0;											// Reset counter
-	ifstream readLen("lengths.txt");
+			// per-edge quantity
+			Q.resize(n_e);
+		}
-	while( readLen >> tmp )
-	{
-		len[v] = tmp;
-		v++;
-	}
+		// Set every element of the leading n_v x n_v part of C to zero.
+		void reset(unsigned n_v) {
+
+			unsigned i = 0;
+			while (i < n_v) {
+				T* row = C[i];				// row i of the matrix
+				for (unsigned j = 0; j != n_v; ++j)
+					row[j] = 0;
+				++i;
+			}
+		}
-	readLen.close( );
-}
+		// Release the first n_v rows of C and then the row table itself.
+		// C is not reset afterwards, so it must not be used again before init() is called.
+		void clear(unsigned n_v) {
+
+			unsigned row = 0;
+			while (row < n_v) {
+				delete[] C[row];
+				++row;
+			}
+			delete[] C;
+		}
+		/// Calculate the inverse of A and store the result in C
+
+		/// @param A is the order x order input matrix, addressed as A[row][col]
+		/// @param order is the number of rows (and columns) of A; nothing is written when order <= 0
+		/// @param C receives the inverse as a flat row-major array of order * order elements
+		///          (inside this function the parameter hides the member C)
+		void inversion(T** A, int order, T* C) {
+
+			// nothing to invert; this also keeps the size computations below from going negative
+			if (order <= 0)
+				return;
+
+#ifdef __CUDACC__
+
+			// NOTE(review): the S-prefixed cuBLAS routines used below operate on float,
+			// so this branch presumably requires T = float - confirm
+
+			// convert from double pointer to single pointer, make it flat
+			T* Aflat = (T*)malloc(order * order * sizeof(T));
+			for (int i = 0; i < order; i++)
+				for (int j = 0; j < order; j++)
+					Aflat[i * order + j] = A[i][j];
+
+			// create device pointer
+			T* d_Aflat;		// flat original matrix
+			T* d_Cflat;	// flat inverse matrix
+			T** d_A;		// put the flat original matrix into another array of pointer
+			T** d_C;
+			int *d_P;
+			int *d_INFO;
+
+			// allocate memory on device
+			HANDLE_ERROR(cudaMalloc((void**)&d_Aflat, order * order * sizeof(T)));
+			HANDLE_ERROR(cudaMalloc((void**)&d_Cflat, order * order * sizeof(T)));
+			HANDLE_ERROR(cudaMalloc((void**)&d_A, sizeof(T*)));
+			HANDLE_ERROR(cudaMalloc((void**)&d_C, sizeof(T*)));
+			HANDLE_ERROR(cudaMalloc((void**)&d_P, order * 1 * sizeof(int)));
+			HANDLE_ERROR(cudaMalloc((void**)&d_INFO, 1 * sizeof(int)));
+
+			// copy matrix from host to device
+			HANDLE_ERROR(cudaMemcpy(d_Aflat, Aflat, order * order * sizeof(T), cudaMemcpyHostToDevice));
+
+			// store the device addresses of the flat matrices in the one-element batch pointer arrays
+			HANDLE_ERROR(cudaMemcpy(d_A, &d_Aflat, sizeof(T*), cudaMemcpyHostToDevice));
+			HANDLE_ERROR(cudaMemcpy(d_C, &d_Cflat, sizeof(T*), cudaMemcpyHostToDevice));
-// Function to find the inverse of a square matrix
-void flow::inversion(double **a, int n, double **b)
-{
-	// Get 1 over the determinant of A
-	double det = (double)(1.0/determinant(a, n));
-//	cerr << "\n1/det(C) = " << det << endl;				// DELETE!!!
-
-    // Memory allocation
-    double *tmp = new double[(n-1) * (n-1)];
-    double **m = new double * [n-1];
-    for( int i = 0; i < n-1; i++ )
-		m[i] = tmp + ( i * (n-1) );
- 
-    for( int j = 0; j < n; j++)
-    {
-		for( int i = 0; i < n; i++ )
-        {
-			// Get the cofactor (matrix) of a(j,i)
-            minor(a, m, j, i, n);
-            b[i][j] = det * determinant( m, n-1 );
-            if( (i+j)%2 == 1 )
-                b[i][j] = -b[i][j];
-        }
-    }
- 
-    // Release memory
-    // Delete [] minor[0];
-    delete [] tmp;
-    delete [] m;
-}
+			// calculate the inverse of matrix based on cuBLAS (LU factorization of a batch of one matrix)
+			cublasHandle_t handle;
+			CUBLAS_HANDLE_ERROR(cublasCreate_v2(&handle));	// create cuBLAS handle object
+			CUBLAS_HANDLE_ERROR(cublasSgetrfBatched(handle, order, d_A, order, d_P, d_INFO, 1));
-// Function to find the determinant of a matrix using recursion
-// Contribution by Edward Popko
-// Modified by Cherub P. Harder (7/15/2015), U of Houston
-// Arguments: a(double **) - pointer to a pointer of an arbitrary square matrix
-//			  n(int) - dimension of the square matrix
-float flow::determinant(double **a, int n)
-{
-	int i, j, j1, j2;								// General loop and matrix subscripts
-    double det = 0;									// Initialize determinant
-    double **m = NULL;								// Pointer to pointer to implement 2D square array
-
-	// Display contents of matrix C (DELETE!!!)
-/*	std::cout << "\nThe updated matrix C:\n";
-	for( int j = 0; j < n; ++j )
-	{
-		std::cerr << "\t";
-
-		for( int k = 0; k < n; ++k )
-			std::cerr << left << setw(15) << a[j][k];
-
-		std::cerr << endl;
-	}
-
-	getchar();					// DELETE!!!*/
-
-	if(n < 1) { }									// Error condition - should never get here
-
-    else if (n == 1)								// Should never get here
-	{
-		det = a[0][0];
-	}
-
-    else if(n == 2)									// Basic 2x2 sub-matrix determinate definition
-	{												// When n == 2, this ends the recursion series
-		det = a[0][0] * a[1][1] - a[1][0] * a[0][1];
-	}
-													// Recursion continues, solve next sub-matrix
-    else											// Solve the next minor by building a sub-matrix
-	{
-		det = 0;									// Initialize determinant of sub-matrix
-
-        for (j1 = 0; j1 < n; j1++)					// For each column in sub-matrix get space for the
-		{											// pointer list
-			m = (double **) malloc((n-1) * sizeof(double *));
-
-            for (i = 0; i < n-1; i++)
-				m[i] = (double *) malloc((n-1)* sizeof(double));
-                       //     i[0][1][2][3]  first malloc
-                       //  m -> +  +  +  +   space for 4 pointers
-                       //       |  |  |  |          j  second malloc
-                       //       |  |  |  +-> _ _ _ [0] pointers to
-                       //       |  |  +----> _ _ _ [1] and memory for
-                       //       |  +-------> _ a _ [2] 4 doubles
-                       //       +----------> _ _ _ [3]
-                       //
-                       //                   a[1][2]
-                       // Build sub-matrix with minor elements excluded
-            
-			for (i = 1; i < n; i++)
+			// the factorization reports INFO = k > 0 when U(k,k) is exactly zero; this can happen
+			// for any k, not only k == order, so every positive value marks a singular matrix
+			int INFO = 0;
+			HANDLE_ERROR(cudaMemcpy(&INFO, d_INFO, sizeof(int), cudaMemcpyDeviceToHost));
+			if (INFO > 0)
 			{
-				j2 = 0 ;							// Start at first sum-matrix column position
-													// Loop to copy source matrix less one column
-                for (j = 0; j < n; j++)
-				{
-					if (j == j1) continue;			// Do not copy the minor column element
-					
-					m[i-1][j2] = a[i][j];			// Copy source element into new sub-matrix
-													// i-1 because new sub-matrix is one row
-													// (and column) smaller with excluded minors
-                    j2++;							// Move to next sub-matrix column position
+				std::cout << "Factorization Failed : Matrix is singular." << std::endl;
+				cudaDeviceReset();
+				exit(1);
+			}
+
+			CUBLAS_HANDLE_ERROR(cublasSgetriBatched(handle, order, (const T **)d_A, order, d_P, d_C, order, d_INFO, 1));
+
+			CUBLAS_HANDLE_ERROR(cublasDestroy_v2(handle));
+
+			// read back the device address of the inverse from the batch pointer array
+			HANDLE_ERROR(cudaMemcpy(&d_Cflat, d_C, sizeof(T*), cudaMemcpyDeviceToHost));
+
+			// copy inverse matrix from device to host
+			HANDLE_ERROR(cudaMemcpy(C, d_Cflat, order * order * sizeof(T), cudaMemcpyDeviceToHost));
+
+			// clear up
+			free(Aflat);
+			HANDLE_ERROR(cudaFree(d_Aflat));
+			HANDLE_ERROR(cudaFree(d_Cflat));
+			HANDLE_ERROR(cudaFree(d_A));
+			HANDLE_ERROR(cudaFree(d_C));
+			HANDLE_ERROR(cudaFree(d_P));
+			HANDLE_ERROR(cudaFree(d_INFO));
+
+#else
+			// reciprocal of the determinant of A; a singular A gives non-finite results here
+			double inv_det = 1.0 / determinant(A, order);
+
+			// a 1x1 matrix has no minors: its inverse is the reciprocal of its only element
+			if (order == 1) {
+				C[0] = (T)inv_det;
+				return;
+			}
+
+			// memory allocation: one contiguous buffer addressed through a table of row pointers
+			T* tmp = (T*)malloc((order - 1)*(order - 1) * sizeof(T));
+			T** minor = (T**)malloc((order - 1) * sizeof(T*));
+			for (int i = 0; i < order - 1; i++)
+				minor[i] = tmp + (i * (order - 1));
+
+			for (int j = 0; j < order; j++) {
+				for (int i = 0; i < order; i++) {
+					// element (i,j) of the inverse is the signed minor of A(j,i) divided by det(A)
+					get_minor(A, minor, j, i, order);
+					T cofactor = determinant(minor, order - 1);
+					if ((i + j) % 2 == 1)
+						cofactor = -cofactor;
+					// C is a flat array, so element (i,j) is stored at i * order + j
+					C[i * order + j] = (T)(inv_det * cofactor);
 				}
 			}
-			
-			det += (double)pow(-1.0, 1.0 + j1 + 1.0) * a[0][j1] * determinant(m, n-1);
-													// Sum x raised to y power
-													// recursively get determinant of next
-													// sub-matrix which is now one
-													// row & column smaller
-
-            for (i = 0; i < n-1; i++) free(m[i]);	// Free the storage allocated to
-													// this minor's set of pointers
-            free(m);								// Free the storage for the original
-													// pointer to pointer
+
+			// release memory
+			free(tmp);
+			free(minor);
+#endif
 		}
-	}
-	
-	return(det);
+	};
 }
-
-// Function to calculate the cofactor of element (row, col)
-int flow::minor(double **src, double **dest, int row, int col, int order)
-{
-	// Indicate which col and row is being copied to dest
-    int colCount=0,rowCount=0;
- 
-    for(int i = 0; i < order; i++)
-    {
-        if(i != row)
-        {
-            colCount = 0;
-            for(int j = 0; j < order; j++)
-            {
-                // When j is not the element
-                if( j != col )
-                {
-                    dest[rowCount][colCount] = src[i][j];
-                    colCount++;
-                }
-            }
-
-            rowCount++;
-		}
-    }
- 
-    return 1;
-}
 \ No newline at end of file
+#endif
+
+
+
+//// calculate the flow rate of 3D model(circle cross section)
+//void calculate_flow_rate(unsigned e, T r) {
+//	stim::triple<unsigned, unsigned, T> tmp_Q;
+//	tmp_Q.first = V[e].first;			// copy the vertices information
+//	tmp_Q.second = V[e].second;
+//	tmp_Q.third = V[e].third * stim::PI * pow(r, 2);	// UNITS: uL/s
+//	Q.push_back(tmp_Q);					// push back the volume flow rate information for every edge
+//}
+
+//// calculate the flow rate of 2D model(rectangular cross section)
+//void calculate_flow_rate(unsigned e, T r, T h) {
+//	stim::triple<unsigned, unsigned, T> tmp_Q;
+//	tmp_Q.first = V[e].first;			// copy the vertices information
+//	tmp_Q.second = V[e].second;
+//	tmp_Q.third = V[e].third * h * r;					// UNITS: uL/s = mm^3/s
+//	Q.push_back(tmp_Q);					// push back the volume flow rate information for every edge
+//}
+
+//// calculate the pressure drop of 3D model(circle cross section)
+//void calculate_deltaP(unsigned e, T u, T l, T r) {
+//	stim::triple<unsigned, unsigned, T> tmp_deltaP;
+//	tmp_deltaP.first = V[e].first;		// copy the vertices information
+//	tmp_deltaP.second = V[e].second;
+//	tmp_deltaP.third = (8 * u * l * Q[e].third) / (stim::PI * pow(r, 4));		// UNITS: g/mm/s^2 = Pa
+//	deltaP.push_back(tmp_deltaP);		// push back the volume flow rate information for every edge
+//}
+
+//// calculate the pressure drop of 2D model(rectangular cross section)
+//void calculate_deltaP(unsigned e, T u, T l, T r, T h) {
+//	stim::triple<unsigned, unsigned, T> tmp_deltaP;
+//	tmp_deltaP.first = V[e].first;		// copy the vertices information
+//	tmp_deltaP.second = V[e].second;
+//	tmp_deltaP.third = (12 * u * l * Q[e].third) / (h * pow(r, 3));	// UNITS: g/mm/s^2 = Pa
+//	deltaP.push_back(tmp_deltaP);		// push back the volume flow rate information for every edge
+//}
+
+//// better way to do this???
+//// find the maximum and minimum pressure positions
+//void find_max_min_pressure(size_t n_e, size_t n_v, unsigned& max, unsigned& min) {
+//	std::vector<T> P(n_v, FLT_MAX);
+//	// set one to reference
+//	P[Q[0].first] = 0.0;
+//	unsigned first = 0;
+//	unsigned second = 0;
+//	// calculate all the relative pressure in brute force manner
+//	for (unsigned e = 0; e < n_e; e++) {
+//		// assuming the obj file stores in a straight order, in other words, like swc file
+//		first = Q[e].first;
+//		second = Q[e].second;
+//		if (P[first] != FLT_MAX)		// if pressure at start vertex is known
+//			P[second] = P[first] - deltaP[e].third;
+//		else if (P[second] != FLT_MAX)	// if pressure at end vertex is known
+//			P[first] = P[second] + deltaP[e].third;
+//	}
+
+//	// find the maximum and minimum pressure position
+//	auto m1 = std::max_element(P.begin(), P.end());		// temporarily max number
+//	auto m2 = std::min_element(P.begin(), P.end());		// temporarily min number
+
+//	max = std::distance(P.begin(), m1);
+//	min = std::distance(P.begin(), m2);
+
+//	T tmp_m = *m2;
+//	// Now set the lowest pressure port to reference pressure(0.0 Pa)
+//	for (unsigned i = 0; i < n_v; i++)
+//		P[i] -= tmp_m;
+
+//	for (unsigned i = 0; i < n_v; i++)
+//		pressure.push_back(P[i]);
+//}
 \ No newline at end of file
+/*
+Copyright <2017> <David Mayerich>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#pragma once
+#include <fstream>									// Required for ofstream, etc.
+#include <iomanip>									// Required for setw
+#include <iostream>									// Required for cout, cin, etc.
+#include <tuple>									// Required for returning multiple values from a function
+
+using namespace std;
+
+
+// Declares the member functions of the flow helper class; the definitions follow this declaration.
+class flow
+{
+public:
+	// writes nL rows of four values each from D to the text file `filename`
+	void backupToTxt(unsigned int nL, double **D, char filename[]);
+	// returns a pair of int values
+	tuple<int, int> copySrcDesRadLen(char filename[]);
+	// takes four caller-provided arrays: src, dest, radii and len
+	void copyToArray(int *src, int *dest, double *radii, double *len);
+	// returns the number of dangle nodes stored at the front of dangleNodes
+	int getDangleNodes(int datarow, int numNodes, int *row, int *column, int *dangleNodes);
+	// takes an input matrix a, its dimension n and an output matrix b
+	void inversion(double **a, int n, double **b);
+
+protected:
+	// NOTE: the matrix is double precision but the result is declared as float
+	float determinant(double **a, int n);
+	// takes a source matrix, a destination matrix, a row, a column and the matrix order
+	int minor(double **src, double **dest, int row, int col, int order);
+};
+
+/* Function to find the dangle nodes in a network */
+// Created by Cherub P. Harder (8/10/2015), U of Houston
+// Modified by Cherub P. Harder on 8/12/2015
+//
+// datarow     - number of entries in column1 and in column2
+// numNodes    - not read by this function
+// column1     - node list that the candidate nodes are matched against (only read)
+// column2     - node list searched for duplicates; it is REORDERED IN PLACE (duplicates are moved to the end)
+// dangleNodes - output array; must hold at least datarow entries
+// returns     - numPress, the number of entries counted as dangle nodes
+//
+// Both passes change their loop counters inside the loop body, so the order of the statements matters.
+int flow::getDangleNodes(int datarow, int numNodes, int *column1, int *column2, int *dangleNodes)
+{
+	// diff2 is only incremented and never read
+	int count = datarow, diff1 = 0, diff2 = 0, numPress = 0, st = 0;
+
+	// Find matching nodes within column2
+	for( int i = 0; i < count; i++ )
+	{
+		for( int y = i+1; y < datarow; y++ )
+		{
+			if( column2[i] == column2[y] )			// Is there a match?
+			{
+				st = column2[i];					// Save the matching node
+//				cout << endl << column2[i] << " = " << column2[y] << endl; // Display the matching nodes
+				memmove(column2+i, column2+i+1, (datarow-(i+1)) * sizeof(column2[0])); // Move up the rows
+													// taking the places of the rows before them starting
+													// with where the matching node is located
+				column2[datarow-1] = st;			// Place the matching node at the very end of the array--
+													// this is for comparison purpose so that the other match-
+													// ing node will be moved as well and then omitted later.
+				diff1++;							// Count the matching node
+
+				// Display the updated array (with the matching node moved to the bottommost row)
+/*				cout << "Updated array:" << endl;
+				for( int k = 0; k < datarow; k++ )
+					cout << column2[k] << endl;
+*/
+				// Decrement the counters
+				// NOTE: The counters need to be decremented because the rows had been moved up, so the same
+				// locations need to be read again because they contain different values now after the move.
+				i--;								// Decrement i to read the node that took over the place
+													// of the matching node. Otherwise, it will be skipped.
+				y--;								// Decrement y to read the following node for comparison
+				count--;							// The maximum count need to be decremented so that the
+													// matching nodes that had been moved will not be read again.
+													// However, the maximum count (datarow) for finding a match
+													// will not be decremented because the remaining matching
+													// node that has not been moved yet needs to be moved and
+													// the only way to do that is to match it with its duplicate.
+			}
+		}
+	}
+	
+	// Store the nodes that have no duplicates
+	// NOTE: This will only save the nodes that have not been moved to the bottom.
+//	cout << "\ndangleNodes array:" << endl;
+	for( int j = 0; j < datarow-diff1; j++ )
+	{
+		dangleNodes[numPress] = column2[j];
+//		cout << dangleNodes[j] << endl;				// DELETE!!!
+		numPress++;									// Count the non-duplicated node
+	}
+
+	// Find if the non-duplicated nodes have a match from column1
+	count = datarow-diff1;							// Reinitialize the counter
+
+	for( int i = 0; i < count; i++ )
+	{
+		for( int j = 0; j < datarow; j++ )
+		{
+			if( dangleNodes[i] == column1[j] )		// Is there a match?
+			{
+				st = column1[j];					// Save the matching node
+//				cout << endl << dangleNodes[i] << " = " << column1[j] << endl; // Display the matching nodes
+				memmove(dangleNodes+i, dangleNodes+i+1, (datarow-diff1-(i+1)) * sizeof(dangleNodes[0]));
+				dangleNodes[count-1] = st;			// Move the matching node to the bottom of the array
+				diff2++;							// Count the matching node
+
+				// Display the updated array
+/*				cout << "Updated dangleNodes array:" << endl;
+				for( int k = 0; k < count-1; k++ )
+				{
+					cout << dangleNodes[k] << endl;
+				}
+*/
+				// Decrement the counters
+				// (same reasoning as in the first pass: the entries were shifted up by one position)
+				i--;
+				j--;
+				count--;
+				numPress--;							// Decrement to the exact number of dangle nodes
+			}
+		}
+	}
+
+	return numPress;								// Number of dangle nodes
+}
+
+
+// Function to dump a matrix to a .txt file: the first four columns of each of the nL rows of D
+// are written tab-separated, one row per line.
+// Created by Cherub P. Harder (8/10/2015), U of Houston
+void flow::backupToTxt(unsigned int nL, double **D, char filename[])
+{
+	const int lastCol = 3;							// Index of the final column written for every row
+	ofstream backup(filename);
+
+	for( unsigned int row = 0; row < nL; ++row )
+	{
+		// Every column except the last one is followed by a tab separator
+		for( int col = 0; col < lastCol; ++col )
+			backup << D[row][col] << "\t";
+
+		backup << D[row][lastCol] << "\n";			// The last column terminates the line
+	}
+
+	backup.close( );
+}
+
+
+// Function to make separate copies of the source nodes, destination nodes, radii, and lengths
+// Created by Cherub P. Harder (8/10/2015), U of Houston
+//
+// Reads whitespace-separated numbers from `filename`, treating every four consecutive values as
+// one record (source node, destination node, radius, length). Each field is written, one value
+// per line, to srcCol.txt, destCol.txt, radii.txt and lengths.txt respectively.
+// Returns:   tuple(numNodes, numElements) where numNodes is the largest source/destination value
+//			  seen (truncated to int, never below 0) and numElements is the number of complete
+//			  four-value records.
+// NOTE: If the input file cannot be opened, the function prints an error, waits for a key press
+//		 and terminates the program with exit(-1). The four output files have already been
+//		 created (empty) at that point.
+tuple<int, int> flow::copySrcDesRadLen(char filename[])
+{
+	int cnt = 0, numElements = 0, numNodes = 0;
+	float number = 0.0;
+	ofstream srcData("srcCol.txt");					// A .txt file to store the source nodes
+	ofstream desData("destCol.txt");				// A .txt file to store the destination nodes
+	ofstream radiiData("radii.txt");				// A .txt file to store the radii
+	ofstream lenData("lengths.txt");				// A .txt file to store the lengths
+	FILE *fp = fopen(filename, "r");				// Open the input file for reading
+
+	// Check if the file exists
+	if(fp == NULL)									// Alternative: if(!fp)
+	{
+		printf("Error! File does not exist.\n");
+		getchar( );									// Pause
+		exit(-1);									// NOTE: Must include stdlib.h.
+	}
+
+	// Store data to their respective .txt files
+	while(fscanf(fp, "%f", &number) == 1)
+	{
+		cnt++;										// Position of this value inside the current record
+
+		// Store to srcCol.txt
+		if(cnt == 1)
+			srcData << number << endl;
+
+		// Store to destCol.txt
+		if(cnt == 2)
+			desData << number << endl;
+
+		// Track the largest node id; only the first two fields of a record are node ids
+		if(cnt < 3)
+		{
+			if(number > numNodes)
+				numNodes = (int)number;
+		}
+
+		// Store to radii.txt
+		if(cnt == 3)
+			radiiData << number << endl;
+
+		// Store to lengths.txt
+		if(cnt == 4)
+		{
+			lenData << number << endl;
+
+			numElements++;							// Count the elements
+			cnt = 0;								// Reset counter for the next record
+		}
+	}
+
+	fclose(fp);										// Release the input file handle (it was leaked before)
+
+	srcData.close( );
+	desData.close( );
+	radiiData.close( );
+	lenData.close( );
+
+	return make_tuple(numNodes, numElements);		// Return two values
+}
+
+
+// Function to copy data from the .txt files to their respective arrays
+// Created by Cherub P. Harder (8/11/2015), U of Houston
+//
+// Reads srcCol.txt, destCol.txt, radii.txt and lengths.txt (from the current working directory)
+// and stores the values, in file order, into src, dest, radii and len. Node ids are read as
+// doubles and truncated to int.
+// NOTE: No bounds are checked; each array must be large enough for every value in its file.
+//		 A file that cannot be opened simply leaves its array untouched.
+void flow::copyToArray(int *src, int *dest, double *radii, double *len)
+{
+	int v = 0;										// Write position in the array being filled
+	double tmp = 0;									// Value most recently read from a file
+
+	// Store source node values to the array src
+	ifstream readSrc("srcCol.txt");
+	for( v = 0; readSrc >> tmp; v++ )
+		src[v] = (int)tmp;
+	readSrc.close( );
+
+	// Store destination node values to the array dest
+	ifstream readDest("destCol.txt");
+	for( v = 0; readDest >> tmp; v++ )
+		dest[v] = (int)tmp;
+	readDest.close( );
+
+	// Store radius values to the array radii
+	ifstream readRad("radii.txt");
+	for( v = 0; readRad >> tmp; v++ )
+		radii[v] = tmp;
+	readRad.close( );
+
+	// Store length values to the array len
+	ifstream readLen("lengths.txt");
+	for( v = 0; readLen >> tmp; v++ )
+		len[v] = tmp;
+	readLen.close( );
+}
+
+
+// Function to find the inverse of a square matrix
+// Fills b so that b[i][j] = (-1)^(i+j) * det(a without row j and column i) / det(a).
+// Arguments: a(double **) - the n x n input matrix (not modified)
+//			  n(int)       - dimension of a and b
+//			  b(double **) - caller-allocated n x n matrix that receives the result
+// NOTE: The reciprocal of det(a) is taken without testing det(a) for zero.
+void flow::inversion(double **a, int n, double **b)
+{
+	const int sub = n - 1;							// Order of every minor of a
+
+	// Reciprocal of the determinant of a; every cofactor is scaled by it
+	double invDet = (double)(1.0/determinant(a, n));
+
+	// Scratch matrix for the minors: one contiguous buffer plus a table of row pointers
+	double *cells = new double[sub * sub];
+	double **rows = new double * [sub];
+	for( int r = 0; r < sub; r++ )
+		rows[r] = cells + ( r * sub );
+
+	for( int j = 0; j < n; j++ )
+	{
+		for( int i = 0; i < n; i++ )
+		{
+			// Copy a with row j and column i removed into the scratch matrix
+			minor(a, rows, j, i, n);
+
+			// Scaled determinant of the minor; the sign alternates with the parity of i+j
+			double entry = invDet * determinant( rows, sub );
+			b[i][j] = ( (i+j)%2 == 1 ) ? -entry : entry;
+		}
+	}
+
+	// Release the scratch matrix
+	delete [] cells;
+	delete [] rows;
+}
+
+
+// Function to find the determinant of a matrix using recursion
+// (cofactor expansion along the first row)
+// Contribution by Edward Popko
+// Modified by Cherub P. Harder (7/15/2015), U of Houston
+// Arguments: a(double **) - pointer to a pointer of an arbitrary square matrix
+//			  n(int) - dimension of the square matrix
+// Returns:   the determinant, accumulated in double and narrowed to float on return;
+//			  0 when n < 1
+float flow::determinant(double **a, int n)
+{
+	if( n < 1 )										// Error condition - should never get here
+		return 0;
+
+	if( n == 1 )									// A 1x1 matrix is its own determinant
+		return a[0][0];
+
+	if( n == 2 )									// Basic 2x2 definition; this ends the recursion
+		return a[0][0] * a[1][1] - a[1][0] * a[0][1];
+
+	double total = 0;								// Running sum of the signed cofactor terms
+
+	for( int skip = 0; skip < n; skip++ )			// skip = column of row 0 being expanded
+	{
+		// Allocate an (n-1)x(n-1) sub-matrix: a list of row pointers, then each row
+		double **sub = (double **) malloc((n-1) * sizeof(double *));
+		for( int r = 0; r < n-1; r++ )
+			sub[r] = (double *) malloc((n-1) * sizeof(double));
+
+		// Build the sub-matrix from rows 1..n-1 of a, leaving out column `skip`
+		for( int r = 1; r < n; r++ )
+		{
+			int dst = 0;							// Next free column in the sub-matrix row
+			for( int c = 0; c < n; c++ )
+			{
+				if( c != skip )
+				{
+					sub[r-1][dst] = a[r][c];		// r-1 because the sub-matrix has no row 0
+					dst++;
+				}
+			}
+		}
+
+		// Cofactor sign: +1 for even columns, -1 for odd columns of the first row
+		const double sign = ( skip % 2 == 0 ) ? 1.0 : -1.0;
+		total += sign * a[0][skip] * determinant(sub, n-1);
+
+		// Free the rows first, then the list of row pointers
+		for( int r = 0; r < n-1; r++ )
+			free(sub[r]);
+		free(sub);
+	}
+
+	return total;
+}
+
+
+// Function to build the minor matrix of element (row, col): copies src into dest while leaving
+// out row `row` and column `col`.
+// Arguments: src(double **)  - order x order input matrix
+//			  dest(double **) - receives the (order-1) x (order-1) result
+//			  row, col(int)   - row and column to leave out
+//			  order(int)      - dimension of src
+// Returns:   always 1
+int flow::minor(double **src, double **dest, int row, int col, int order)
+{
+	int outRow = 0;									// Next row of dest to be filled
+
+	for( int i = 0; i < order; i++ )
+	{
+		if( i == row )								// Leave out the excluded row
+			continue;
+
+		int outCol = 0;								// Next column of dest to be filled
+		for( int j = 0; j < order; j++ )
+		{
+			if( j == col )							// Leave out the excluded column
+				continue;
+
+			dest[outRow][outCol] = src[i][j];
+			outCol++;
+		}
+
+		outRow++;
+	}
+
+	return 1;
+}
 \ No newline at end of file
+/*
+Copyright <2017> <David Mayerich>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
 #ifndef STIM_NETWORK_H
 #define STIM_NETWORK_H
@@ -10,19 +19,51 @@
 #include <math.h>
 #include <stim/math/vec3.h>
 #include <stim/visualization/obj.h>
+#include <stim/visualization/swc.h>
 #include <stim/visualization/cylinder.h>
-#include <ANN/ANN.h>
-#include <boost/tuple/tuple.hpp>
+#include <stim/cuda/cudatools/timer.h>
+#include <stim/cuda/cudatools/callable.h>
+#include <stim/structures/kdtree.cuh>
+//********************help function********************
+// gaussian_function: returns exp(-x / (2 * std^2)).
+// NOTE(review): x is not squared here, so callers presumably pass a squared distance -- confirm.
+// Declared inline because this header defines the function at namespace scope; without inline,
+// including the header from more than one translation unit yields duplicate definitions.
+CUDA_CALLABLE inline float gaussianFunction(float x, float std = 25) { return exp(-x / (2 * std*std)); }  // std default sigma value is 25
+
+// compute metric in parallel
+#ifdef __CUDACC__
+// Kernel: for every index below n, stores 1 - gaussianFunction(D[idx], sigma) into M[idx].
+// Each thread handles the single element addressed by its global x index.
+template <typename T>
+__global__ void find_metric_parallel(T* M, size_t n, T* D, float sigma){
+	const size_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+	if(idx < n)								//threads whose index is n or more do nothing
+		M[idx] = 1.0f - gaussianFunction(D[idx], sigma);
+}
+
+//find the corresponding edge index from array index
+//For every x below n: R[x] receives the first e for which I[x] is smaller than the running sum
+//E[0] + ... + E[e]. E presumably holds the number of points per edge -- TODO confirm.
+//R[x] is left untouched when I[x] is not smaller than the sum of all ne entries.
+__global__ void find_edge_index_parallel(size_t* I, size_t n, unsigned* R, size_t* E, size_t ne){
+	size_t x = blockDim.x * blockIdx.x + threadIdx.x;
+	if(x >= n) return;
+	size_t upper = 0;						//running sum of E[0..e]
+	for(unsigned e = 0; e < ne; e++){
+		upper += E[e];
+		if(I[x] < upper){					//first range that contains I[x]
+			R[x] = e;
+			break;
+		}
+	}
+}
+#endif
+//hard-coded factor
+//No value is assigned here: as a namespace-scope int without an initializer it starts at 0 and
+//must be set by the code that uses it.
+//NOTE(review): this is a definition (not an extern declaration) inside a header, so including
+//the header from several translation units may produce duplicate-symbol link errors -- confirm.
+int threshold_fac;
 namespace stim{
 /** This is a class that interfaces with gl_spider in order to store the currently
  *   segmented network. The following data is stored and can be extracted:
  *   1)Network geometry and centerline.
- *   2)Network connectivity (a graph of nodes and edges), reconstructed using ANN library.
+ *   2)Network connectivity (a graph of nodes and edges), reconstructed using kdtree.
 */
-
 template<typename T>
 class network{
@@ -31,18 +72,30 @@ class network{
 	class edge : public cylinder<T>
 	{
 		public:
-		unsigned v[2];		//unique id's designating the starting and ending
+	
+		unsigned int v[2];		//unique id's designating the starting and ending
 		// default constructor
-		edge() : cylinder<T>()
-		{
-			v[1] = -1; v[0] = -1;
+		edge() : cylinder<T>() {
+			v[0] = v[1] = (unsigned)(-1);		//both endpoint ids start at the largest unsigned value
+		}
 		/// Constructor - creates an edge from a list of points by calling the stim::fiber constructor
+/*
+		///@param v0, the starting index.
+		///@param v1, the ending index.
+		///@param sz, the number of point in the fiber.
+		edge(unsigned int v0, unsigned int v1, unsigned int sz) : cylinder<T>(
+		{
+		}
+*/
+		/// Constructor - creates an edge by forwarding both arguments to the cylinder<T>(p, s) constructor.
+		/// NOTE: v[0] and v[1] are not set by this constructor; the caller has to assign them.
+		///@param p presumably the centerline positions -- confirm against stim::cylinder
+		///@param s presumably one radius/magnitude per position (operator>> passes the radii it read)
+		edge(std::vector<stim::vec3<T> > p, std::vector<T> s)
+			: cylinder<T>(p,s)
+		{
+		}
 		///@param p is an array of positions in space
-		edge(std::vector< stim::vec3<T> > p) : cylinder<T>(p){}
+		edge(stim::centerline<T> p) : cylinder<T>(p){}
-		/// Copy constructor creates an edge from a fiber
+		/// Copy constructor creates an edge from a cylinder
 		edge(stim::cylinder<T> f) : cylinder<T>(f) {}
 		/// Resamples an edge by calling the fiber resampling function
@@ -57,10 +110,71 @@ class network{
 		/// Output the edge information as a string
 		std::string str(){
 			std::stringstream ss;
-			ss<<"("<<cylinder<T>::size()<<")\tl = "<<this.length()<<"\t"<<v[0]<<"----"<<v[1];
+			ss<<"("<<cylinder<T>::size()<<")\tl = "<<this->length()<<"\t"<<v[0]<<"----"<<v[1];
 			return ss.str();
 		}
+		/// Splits this edge at point index idx by delegating to cylinder<T>::split and returns
+		/// every resulting piece converted to an edge.
+		std::vector<edge> split(unsigned int idx){
+
+			std::vector< stim::cylinder<T> > pieces = cylinder<T>::split(idx);
+			std::vector<edge> result(pieces.size());
+
+			for(size_t k = 0; k < pieces.size(); k++)
+				result[k] = pieces[k];			//converts through the edge(cylinder) constructor
+
+			return result;
+		}
+
+		/// operator for writing the edge information into a binary .nwt file.
+		/// Record layout: v[0], v[1], number of points (all unsigned int), then for each point
+		/// three T coordinates followed by one T radius.
+		friend std::ofstream& operator<<(std::ofstream& out, const edge& e)
+		{
+			out.write(reinterpret_cast<const char*>(&e.v[0]), sizeof(unsigned int));	///write the starting point.
+			out.write(reinterpret_cast<const char*>(&e.v[1]), sizeof(unsigned int));	///write the ending point.
+			unsigned int sz = e.size();	///write the number of points in the edge.
+			out.write(reinterpret_cast<const char*>(&sz), sizeof(unsigned int));
+			for(unsigned int i = 0; i < sz; i++)	///write each point (index type matches sz)
+			{
+				stim::vec3<T> point = e[i];
+				out.write(reinterpret_cast<const char*>(&point[0]), 3*sizeof(T));	///the three coordinates are written as one run starting at point[0]
+			//	for(int j = 0; j < nmags(); j++)	//future code for multiple mags
+			//	{
+				out.write(reinterpret_cast<const char*>(&e.R[i]), sizeof(T));	///write the radius
+			//	}
+			}
+			return out;	//return stream
+		}
+
+		/// operator for reading an edge from a binary .nwt file.
+		/// Expects the layout produced by operator<<: v[0], v[1], number of points, then for each
+		/// point three T coordinates followed by one T radius.
+		/// If the three header fields cannot be read, e is left untouched and the stream is
+		/// returned in its failed state.
+		friend std::ifstream& operator>>(std::ifstream& in, edge& e)
+		{
+			unsigned int v0 = 0, v1 = 0, sz = 0;
+			in.read(reinterpret_cast<char*>(&v0), sizeof(unsigned int));	//read the starting point.
+			in.read(reinterpret_cast<char*>(&v1), sizeof(unsigned int));	//read the ending point
+			in.read(reinterpret_cast<char*>(&sz), sizeof(unsigned int));	//read the number of points in the edge
+			if(!in)
+				return in;			//truncated or unreadable record: do not size buffers from garbage
+
+			std::vector<stim::vec3<T> > p(sz);
+			std::vector<T> r(sz);
+			for(unsigned int i = 0; i < sz; i++)		//set the points and radii to the newly read values
+			{
+				stim::vec3<T> point;
+				in.read(reinterpret_cast<char*>(&point[0]), 3*sizeof(T));
+				p[i] = point;
+				T mag;
+		//				for(int j = 0; j < nmags(); j++)		///future code for mags
+		//				{
+				in.read(reinterpret_cast<char*>(&mag), sizeof(T));	
+				r[i] = mag;
+		//				}
+			}
+			e = edge(p,r);
+			e.v[0] = v0; e.v[1] = v1;		//the (p, r) constructor does not set the endpoint ids
+			return in;
+		}
 	};
 	///Node class that stores the physical position of the node as well as the edges it is connected to (edges that connect to it), As well as any additional data necessary.
@@ -70,12 +184,16 @@ class network{
 			//std::vector<unsigned int> edges;					//indices of edges connected to this node.
 			std::vector<unsigned int> e[2];						//indices of edges going out (e[0]) and coming in (e[1])
 			//stim::vec3<T> p;							//position of this node in physical space.
-
+			//default constructor: default-constructs the stim::vec3<T> base; both edge lists start empty
+			vertex() : stim::vec3<T>() {}
 			//constructor takes a stim::vec
 			vertex(stim::vec3<T> p) : stim::vec3<T>(p){}
 			/// Output the vertex information as a string
-			std::string	str(){
+			std::string 
+			str(){
 				std::stringstream ss;
 				ss<<"\t(x, y, z) = "<<stim::vec3<T>::str();
@@ -92,14 +210,46 @@ class network{
 				return ss.str();
 			}
+			///operator for writing the vertex into the stream.
+			///Record layout: three T coordinates, #outgoing edges, #incoming edges (unsigned int each),
+			///then the outgoing edge indices followed by the incoming edge indices.
+			friend std::ofstream& operator<<(std::ofstream& out, const vertex& v)
+			{
+				const unsigned int count[2] = { (unsigned int)v.e[0].size(), (unsigned int)v.e[1].size() };
+
+				out.write(reinterpret_cast<const char*>(&v.ptr[0]), 3*sizeof(T));	///write physical vertex location
+				out.write(reinterpret_cast<const char*>(&count[0]), sizeof(unsigned int));	///write the number of "outgoing edges"
+				out.write(reinterpret_cast<const char*>(&count[1]), sizeof(unsigned int));	///write the number of "incoming edges"
+
+				for(int k = 0; k < 2; k++){		///k = 0: "outgoing edges", k = 1: "incoming edges"
+					if(count[k] == 0)
+						continue;				///an empty list has no first element to take the address of
+					out.write(reinterpret_cast<const char*>(&v.e[k][0]), sizeof(unsigned int)*v.e[k].size());
+				}
+				return out;
+			}
+			///operator for reading the vertex out of the stream.
+			///Expects the layout produced by operator<<: position, the two list sizes, then the
+			///outgoing and the incoming edge indices.
+			friend std::ifstream& operator>>(std::ifstream& in, vertex& v)
+			{
+				in.read(reinterpret_cast<char*>(&v[0]), 3*sizeof(T));	///read the physical position
+
+				unsigned int count[2];
+				in.read(reinterpret_cast<char*>(&count[0]), 2*sizeof(unsigned int));	///read the sizes of the outgoing and incoming edge arrays
+
+				for(int k = 0; k < 2; k++)
+					v.e[k] = std::vector<unsigned int>(count[k]);	///replace both lists with zero-filled ones of the stored size
+
+				for(int k = 0; k < 2; k++){		///k = 0: "outgoing edges", k = 1: "incoming edges"
+					if(v.e[k].size() != 0)
+						in.read(reinterpret_cast<char*>(&v.e[k][0]), count[k] * sizeof(unsigned int));
+				}
+				return in;
+			}
 	};
 protected:
 	std::vector<edge> E;       //list of edges
-	std::vector<vertex> V;	    //list of vertices.
+	std::vector<vertex> V;	   //list of vertices.
 public:
@@ -125,7 +275,66 @@ public:
 		return V.size();
 	}
-	std::vector<vertex> operator*(T s){
+	///Returns the radius at a specific point of an edge
+	///@param e index of the edge in the edge list E (not range-checked here)
+	///@param i index of the point along that edge, forwarded to E[e].r()
+	T get_r(unsigned e, unsigned i) {
+		return E[e].r(i);
+	}
+
+	///Returns the average radius of a specific edge, i.e. the mean of r(p) over all of its points.
+	///An edge without points yields 0 instead of dividing by zero.
+	///@param e index of the edge in the edge list E (not range-checked here)
+	T get_average_r(unsigned e) {
+		unsigned n = E[e].size();
+		if (n == 0)
+			return (T)0;
+
+		T result = 0.0;
+		for (unsigned p = 0; p < n; p++)
+			result += E[e].r(p);
+
+		return (T)result / n;
+	}
+
+	///Returns the length of an edge, as reported by E[e].length()
+	///@param e index of the edge in the edge list E (not range-checked here)
+	T get_l(unsigned e) {
+		return E[e].length();
+	}
+
+	///Returns the index (v[0]) of the start vertex of an edge
+	///NOTE: the id is stored as unsigned and widened to size_t here; a default-constructed edge
+	///holds (unsigned)(-1), so compare against that value rather than against (size_t)(-1).
+	///@param e index of the edge in the edge list E (not range-checked here)
+	size_t get_start_vertex(unsigned e) {
+		return E[e].v[0];
+	}
+
+	///Returns the index (v[1]) of the end vertex of an edge
+	///NOTE: the id is stored as unsigned and widened to size_t here; a default-constructed edge
+	///holds (unsigned)(-1), so compare against that value rather than against (size_t)(-1).
+	///@param e index of the edge in the edge list E (not range-checked here)
+	size_t get_end_vertex(unsigned e) {
+		return E[e].v[1];
+	}
+
+	///Returns the position of one vertex
+	///The vertex is returned as its stim::vec3<T> base, so its edge lists are not part of the result.
+	///@param i index of the vertex in the vertex list V (not range-checked here)
+	stim::vec3<T> get_vertex(unsigned i) {
+		return V[i];
+	}
+
+	///Returns the indices of the boundary vertices, i.e. the vertices that are touched by
+	///exactly one edge (outgoing and incoming edges counted together)
+	std::vector<unsigned> get_boundary_vertex() {
+		std::vector<unsigned> boundary;
+
+		for (unsigned idx = 0; idx < V.size(); idx++) {
+			const size_t degree = V[idx].e[0].size() + V[idx].e[1].size();	// outgoing + incoming
+			if (degree != 1)
+				continue;								// interior or isolated vertex
+			boundary.push_back(idx);
+		}
+
+		return boundary;
+	}
+
+	///Set the radii of an edge from a list of values by forwarding it to cylinder<T>::copy_r
+	///@param e index of the edge in the edge list E (not range-checked here)
+	///@param radius presumably one radius per point of the edge -- confirm against cylinder<T>::copy_r
+	void set_r(unsigned e, std::vector<T> radius) {
+		E[e].cylinder<T>::copy_r(radius);
+	}
+
+	///Set every point of an edge to the same radius
+	void set_r(unsigned e, T radius) {
+		edge& target = E[e];
+		for (size_t p = 0; p < target.size(); p++)
+			target.cylinder<T>::set_r(p, radius);
+	}
+	//scale the network by some constant value
+	//	I don't think these work??????
+	/*std::vector<vertex> operator*(T s){
 		for (unsigned i=0; i< vertices; i ++ ){
 			V[i] = V[i] * s;
 		}
@@ -139,10 +348,9 @@ public:
 			}
 		}
 		return V;
-	}
+	}*/
 	// Returns an average of branching index in the network
-
 	double BranchingIndex(){
 		double B=0;
 		for(unsigned v=0; v < V.size(); v ++){
@@ -154,7 +362,6 @@ public:
 	}
 	// Returns number of branch points in the network
-
 	unsigned int BranchP(){
 		unsigned int B=0;
 		unsigned int c;
@@ -168,7 +375,6 @@ public:
 	}
 	// Returns number of end points (tips) in the network
-
 	unsigned int EndP(){
 		unsigned int B=0;
 		unsigned int c;
@@ -202,7 +408,7 @@ public:
 	//	return s;
 	//}
-
+	//Calculate Metrics---------------------------------------------------
 	// Returns an average of fiber/edge lengths in the network
 	double Lengths(){
 		stim::vec<T> L;
@@ -270,8 +476,10 @@ public:
 	double avg = sumFractDim / E.size();
 	return avg;
 	}
-	stim::cylinder<T> get_cylinder(unsigned f){
-		return E[f];									//return the specified edge (casting it to a fiber)
+
+	//returns the cylinder representing a given fiber (based on edge index e); the edge is copied
+	//as its stim::cylinder<T> base, so the endpoint ids v[0]/v[1] are not part of the result
+	stim::cylinder<T> get_cylinder(unsigned e){
+		return E[e];									//return the specified edge (casting it to a fiber)
 	}
 	//load a network from an OBJ file
@@ -290,9 +498,10 @@ public:
 			std::vector< stim::vec<T> > c;						//allocate an array of points for the vessel centerline
 			O.getLine(l, c);							//get the fiber centerline
-			std::vector< stim::vec3<T> > c3(c.size());
+			stim::centerline<T> c3(c.size());
 			for(size_t j = 0; j < c.size(); j++)
 				c3[j] = c[j];
+			c3.update();
 	//		edge new_edge = c3;		///This is dangerous.
 			edge new_edge(c3);
@@ -313,10 +522,24 @@ public:
 			it = find(id2vert.begin(), id2vert.end(), i[0]);	//look for the first node
 			if(it == id2vert.end()){							//if i[0] hasn't already been used
 				vertex new_vertex = new_edge[0];				//create a new vertex, assign it a position
-				new_vertex.e[0].push_back(E.size());				//add the current edge as outgoing
-				new_edge.v[0] = V.size();					//add the new edge to the edge
-				V.push_back(new_vertex);					//add the new vertex to the vertex list
-				id2vert.push_back(i[0]);					//add the ID to the ID->vertex conversion list
+				bool flag = false;
+				unsigned j = 0;
+				for (; j < V.size(); j++) {						// check whether current vertex is already exist
+					if (new_vertex == V[j]) {
+						flag = true;
+						break;
+					}
+				}
+				if (!flag) {									// unique one
+					new_vertex.e[0].push_back(E.size());				//add the current edge as outgoing
+					new_edge.v[0] = V.size();					//add the new edge to the edge
+					V.push_back(new_vertex);					//add the new vertex to the vertex list
+					id2vert.push_back(i[0]);					//add the ID to the ID->vertex conversion list
+				}
+				else {
+					V[j].e[0].push_back(E.size());
+					new_edge.v[0] = j;
+				}
 			}
 			else{									//if the vertex already exists
 				it_idx = std::distance(id2vert.begin(), it);
@@ -327,10 +550,24 @@ public:
 			it = find(id2vert.begin(), id2vert.end(), i[1]);			//look for the second ID
 			if(it == id2vert.end()){						//if i[1] hasn't already been used
 				vertex new_vertex = new_edge[I-1];				//create a new vertex, assign it a position
-				new_vertex.e[1].push_back(E.size());				//add the current edge as incoming
-				new_edge.v[1] = V.size();                                  	//add the new vertex to the edge
-				V.push_back(new_vertex);					//add the new vertex to the vertex list
-				id2vert.push_back(i[1]);					//add the ID to the ID->vertex conversion list
+				bool flag = false;
+				unsigned j = 0;
+				for (; j < V.size(); j++) {					// check whether current vertex is already exist
+					if (new_vertex == V[j]) {
+						flag = true;
+						break;
+					}
+				}
+				if (!flag) {
+					new_vertex.e[1].push_back(E.size());				//add the current edge as incoming
+					new_edge.v[1] = V.size();                                  	//add the new vertex to the edge
+					V.push_back(new_vertex);					//add the new vertex to the vertex list
+					id2vert.push_back(i[1]);					//add the ID to the ID->vertex conversion list
+				}
+				else {
+					V[j].e[1].push_back(E.size());
+					new_edge.v[1] = j;
+				}
 			}
 			else{									//if the vertex already exists
 				it_idx = std::distance(id2vert.begin(), it);
@@ -341,6 +578,217 @@ public:
 			E.push_back(new_edge);							//push the edge to the list
 		}
+
+		// copy the radii information from OBJ
+		/*if (O.numVT()) {
+			unsigned k = 0;
+			for (unsigned i = 0; i < E.size(); i++) {
+				for (unsigned j = 0; j < E[i].size(); j++) {
+					E[i].cylinder<T>::set_r(j, O.getVT(k)[0] / 2);
+					k++;
+				}
+			}
+		}*/
+		// OBJ class assumes that in L the two values are equal
+		if (O.numVT()) {
+			std::vector< unsigned > id;						//create an array to store the centerline point IDs
+			for (unsigned i = 0; i < O.numL(); i++) {
+				id.clear();
+				O.getLinei(i + 1, id);							//get the list of point IDs for the line
+				for (unsigned j = 0; j < id.size(); j++)
+					E[i].cylinder<T>::set_r(j, O.getVT(id[j] - 1)[0] / 2);
+			}
+		}
+	}
+
+	///loads a .nwt file. Reads the header and loads the data into the network according to the header.
+	///The vertices and edges read from the file are appended to V and E.
+	void
+	loadNwt(std::string filename)
+	{
+		int counts[2];											///counts[0] = #vertices, counts[1] = #edges
+		readHeader(filename, &counts[0]);						///the header is parsed through its own stream
+
+		std::ifstream file(filename.c_str(), std::ios::in | std::ios::binary);
+		file.seekg(14+58+4+4, file.beg);						///skip the header: file id + description + two counts
+
+		vertex v;												///reused for every vertex record
+		for(int remaining = counts[0]; remaining > 0; remaining--)	///for every vertex, read vertex, add to network.
+		{
+			file >> v;
+			V.push_back(v);
+		}
+
+		std::cout << std::endl;
+		for(int remaining = counts[1]; remaining > 0; remaining--)	///for every edge, read edge, add to network.
+		{
+			edge e;
+			file >> e;
+			E.push_back(e);
+		}
+		file.close();
+	}
+
+	///saves a .nwt file. Writes the header through writeHeader() (which creates/truncates the file),
+	///then reopens the file and appends all vertices followed by all edges in binary form.
+	void
+	saveNwt(std::string filename)
+	{
+		writeHeader(filename);
+		std::ofstream file;
+		file.open(filename.c_str(), std::ios::out | std::ios::binary | std::ios::app);	///append: the header is already in the file and must be kept
+		for(size_t i = 0; i < V.size(); i++)	///loop through the Vertices and write each one.
+			file << V[i];
+		for(size_t i = 0; i < E.size(); i++)	///loop through the Edges and write each one.
+			file << E[i];
+		file.close();
+	}
+
+
+	///Writes the header information to a .nwt file (the file is created or truncated).
+	///Layout: 14-byte file id, 58-byte description, #vertices (int), #edges (int).
+	///The two counts are also printed to std::cout.
+	void
+	writeHeader(std::string filename)
+	{
+		const std::string magicString = "nwtFileFormat ";		///identifier for the file (14 characters)
+		const std::string desc = "fileid(14B), desc(58B), #vertices(4B), #edges(4B): bindata";	///58 characters
+		int hNumVertices = V.size();		///number of vertices stored in the file
+		int hNumEdges = E.size();			///number of edges stored in the file
+
+		std::ofstream file(filename.c_str(), std::ios::out | std::ios::binary);
+		std::cout << hNumVertices << " " << hNumEdges << std::endl;
+
+		file.write(magicString.c_str(), 14);									//write the file id
+		file.write(desc.c_str(), 58);											//write the description
+		file.write(reinterpret_cast<const char*>(&hNumVertices), sizeof(int));	//write #vert.
+		file.write(reinterpret_cast<const char*>(&hNumEdges), sizeof(int));		//write #edges
+		file.close();
+	}
+
+	///Reads the header information from a .nwt file.
+	///@param filename path of the .nwt file
+	///@param dims receives dims[0] = number of vertices and dims[1] = number of edges; both are 0
+	///            when the file cannot be opened or is too short to contain the counts
+	void
+	readHeader(std::string filename, int *dims)
+	{
+		char magicString[14];		///id
+		char desc[58];			///description
+		int hNumVertices = 0;		///#vert (stays 0 if the read fails)
+		int hNumEdges = 0;		///#edges (stays 0 if the read fails)
+		std::ifstream file;		////create stream
+		file.open(filename.c_str(), std::ios::in | std::ios::binary);
+		file.read(reinterpret_cast<char*>(&magicString[0]), 14);	///read the file id.
+		file.read(reinterpret_cast<char*>(&desc[0]), 58);		///read the description
+		file.read(reinterpret_cast<char*>(&hNumVertices), sizeof(int));	///read the number of vertices
+		file.read(reinterpret_cast<char*>(&hNumEdges), sizeof(int));	///read the number of edges
+		file.close();							///close the file.
+		dims[0] = hNumVertices;						///fill the returned array.
+		dims[1] = hNumEdges;
+	}
+
+	//load a network from an SWC file
+	//Every edge reported by the stim::swc object becomes one network edge (points plus radii).
+	//The first and last SWC node IDs of each edge become its start (v[0]) and end (v[1]) vertices;
+	//vertices are shared between edges that reference the same SWC node ID. Edges and vertices
+	//are appended to E and V.
+	void load_swc(std::string filename) {
+		stim::swc<T> S;										// create swc variable
+		S.load(filename);									// load the node information
+		S.create_tree();									// link those node according to their linking relationships as a tree
+		S.resample();
+
+		//NT.push_back(S.node[0].type);						// set the neuronal_type value to the first vertex in the network
+		std::vector<unsigned> id2vert;						// this list stores the SWC vertex ID associated with each network vertex
+		unsigned i[2];										// temporary, IDs associated with the first and last points
+
+		for (unsigned int l = 0; l < S.numE(); l++) {		// for every edge
+			//NT.push_back(S.node[l].type);
+
+			std::vector< stim::vec3<T> > c;
+			S.get_points(l, c);								// fetch the points of edge l
+
+			stim::centerline<T> c3(c.size());				// new fiber
+			
+			for (unsigned j = 0; j < c.size(); j++)
+				c3[j] = c[j];								// copy the points
+		
+			c3.update();									// update the L information
+			
+			stim::cylinder<T> C3(c3);						// create a new cylinder in order to copy the origin radius information
+			// update the R information
+			std::vector<T> radius;
+			S.get_radius(l, radius);
+
+			C3.copy_r(radius);
+
+			edge new_edge(C3);								// new edge
+
+			//create an edge from the given centerline
+			unsigned int I = new_edge.size();				//calculate the number of points on the centerline
+			
+			//get the first and last vertex IDs for the line
+			i[0] = S.E[l].front();
+			i[1] = S.E[l].back();
+
+			std::vector<unsigned>::iterator it;				//create an iterator for searching the id2vert array
+			unsigned it_idx;								//create an integer for the id2vert entry index
+
+			//find out if the nodes for this fiber have already been created
+			//NOTE: E.size() is the index the new edge will get once it is pushed at the end of this iteration
+			it = find(id2vert.begin(), id2vert.end(), i[0]);	//look for the first node
+			if (it == id2vert.end()) {							//if i[0] hasn't already been used
+				vertex new_vertex = new_edge[0];				//create a new vertex, assign it a position
+				new_vertex.e[0].push_back(E.size());			//add the current edge as outgoing
+				new_edge.v[0] = V.size();						//add the new vertex to the edge
+				V.push_back(new_vertex);						//add the new vertex to the vertex list
+				id2vert.push_back(i[0]);						//add the ID to the ID->vertex conversion list
+			}
+			else {									//if the vertex already exists
+				it_idx = std::distance(id2vert.begin(), it);
+				V[it_idx].e[0].push_back(E.size());				//add the current edge as outgoing
+				new_edge.v[0] = it_idx;
+			}
+
+			it = find(id2vert.begin(), id2vert.end(), i[1]);	//look for the second ID
+			if (it == id2vert.end()) {							//if i[1] hasn't already been used
+				vertex new_vertex = new_edge[I - 1];			//create a new vertex, assign it a position
+				new_vertex.e[1].push_back(E.size());			//add the current edge as incoming
+				new_edge.v[1] = V.size();                       //add the new vertex to the edge
+				V.push_back(new_vertex);						//add the new vertex to the vertex list
+				id2vert.push_back(i[1]);						//add the ID to the ID->vertex conversion list
+			}
+			else {									//if the vertex already exists
+				it_idx = std::distance(id2vert.begin(), it);
+				V[it_idx].e[1].push_back(E.size());				//add the current edge as incoming
+				new_edge.v[1] = it_idx;
+			}
+
+			E.push_back(new_edge);								//push the edge to the list
+		}
+	}
+
+	/// Get adjacency matrix of the network
+	/// Returns an n x n matrix (n = number of vertices) holding 1 at [i][j] and [j][i] whenever an
+	/// edge leaves vertex i and ends at vertex j, and 0 everywhere else.
+	std::vector< typename std::vector<int> > get_adj_mat() {
+
+		unsigned n = V.size();		// number of vertices in the network
+
+		// n rows of n zeros each
+		std::vector< typename std::vector<int> > adj(n, std::vector<int>(n, 0));
+
+		for (unsigned i = 0; i < n; i++) {						// for every vertex
+			const unsigned outgoing = V[i].e[0].size();			// number of edges leaving vertex i
+			for (unsigned k = 0; k < outgoing; k++) {
+				int edge_idx = V[i].e[0][k];					// k-th outgoing edge of vertex i
+				int vertex_idx = E[edge_idx].v[1];				// vertex at which that edge ends
+				adj[i][vertex_idx] = 1;							// can simply set to 1 if it is simple-graph
+				adj[vertex_idx][i] = 1;							// symmetric
+			}
+		}
+
+		return adj;
 	}
 	/// Output the network as a string
@@ -365,7 +813,7 @@ public:
 	stim::network<T> resample(T spacing){
 		stim::network<T> n;								//create a new network that will be an exact copy, with resampled fibers
 		n.V = V;									//copy all vertices
-
+		//n.NT = NT;										//copy all the neuronal type information
 		n.E.resize(edges());								//allocate space for the edge list
 		//copy all fibers, resampling them in the process
@@ -376,8 +824,6 @@ public:
 		return n;							              	//return the resampled network
 	}
-
-
 	/// Calculate the total number of points on all edges.
 	unsigned total_points(){
 		unsigned n = 0;
@@ -386,16 +832,52 @@ public:
 		return n;
 	}
-	// gaussian function
-	float gaussianFunction(float x, float std=25){ return exp(-x/(2*std*std));} // by default std = 25
+	/// Flatten every point of every edge into dst as consecutive (x, y, z) triples,
+	/// edge by edge in the order the edges are stored in E.
+	/// @param dst is the output array; three values are written per point, so the caller has to
+	///		provide room for 3 * (number of points over all edges) elements
+	void centerline_cloud(T* dst) {
+		T* out = dst;											//write cursor, advanced by one triple per point
+		for (size_t ei = 0; ei < E.size(); ei++) {				//for each edge in the network
+			size_t count = E[ei].size();						//number of points in this edge
+			for (size_t pi = 0; pi < count; pi++) {
+				for (size_t d = 0; d < 3; d++)					//copy the three coordinates
+					out[d] = E[ei][pi][d];
+				out += 3;
+			}
+		}
+	}
-    // stim 3d vector to annpoint of 3 dimensions
-	void stim2ann(ANNpoint &a, stim::vec3<T> b){
+    // Copy the three components of the vec3 b into a[0], a[1] and a[2]; a must point to at least 3 floats.
+    // NOTE(review): a is float* regardless of T, while mapping() passes a T* buffer - confirm this compiles for T != float.
+	void stim2array(float *a, stim::vec3<T> b){
 		a[0] = b[0];
 		a[1] = b[1];
 		a[2] = b[2];
 	}
+	// Flatten all points of edge b into a as consecutive (x, y, z) triples.
+	// a must have room for 3 * b.size() values.
+	void edge2array(T* a, edge b){
+		size_t count = b.size();								// number of points in the edge
+		T* out = a;												// write cursor, one triple per point
+		for(size_t pi = 0; pi < count; pi++, out += 3){
+			for(size_t d = 0; d < 3; d++)
+				out[d] = b[pi][d];
+		}
+	}
+
+	// Collect the r() value of every point of every edge into one list,
+	// ordered by edge and then by point within the edge.
+	std::vector<T> metric() {
+		std::vector<T> values;
+		for (size_t ei = 0; ei < E.size(); ei++)				// for each edge
+			for (size_t pi = 0; pi < E[ei].size(); pi++) {		// for each point of that edge
+				T value = E[ei].r(pi);
+				values.push_back(value);
+			}
+		return values;
+	}
+
 	/// Calculate the average magnitude across the entire network.
 	/// @param m is the magnitude value to use. The default is 0 (usually radius).
 	T average(unsigned m = 0){
@@ -403,7 +885,7 @@ public:
 		T M, L;										//allocate space for the total magnitude and length
 		M = L = 0;									//initialize both the initial magnitude and length to zero
 		for(unsigned e = 0; e < E.size(); e++){						//for each edge in the network
-			M += E[e].integrate(m);							//get the integrated magnitude
+			M += E[e].integrate();							//get the integrated magnitude
 			L += E[e].length();							//get the edge length
 		}
@@ -411,67 +893,391 @@ public:
 	}
 	/// This function compares two networks and returns the percentage of the current network that is missing from A.
-
 	/// @param A is the network to compare to - the field is generated for A
 	/// @param sigma is the user-defined tolerance value - smaller values provide a stricter comparison
-	stim::network<T> compare(stim::network<T> A, float sigma){
+	/// @param device is the CUDA device ID passed to cudaSetDevice(); it is only used when compiled with nvcc (__CUDACC__ defined)
+	/// @return a copy of the current network in which the value written with set_r() at each point is the metric computed from the distance to the nearest point of A
+	stim::network<T> compare(stim::network<T> A, float sigma, int device = -1){
-		stim::network<T> R;								//generate a network storing the result of the comparison
-		R = (*this);									//initialize the result with the current network
+		stim::network<T> R;										//generate a network storing the result of the comparison
+		R = (*this);											//initialize the result with the current network
-		//generate a KD-tree for network A
-		float metric = 0.0;                               				// initialize metric to be returned after comparing the networks
-		ANNkd_tree* kdt;                                 				// initialize a pointer to a kd tree
-		double **c;						                 	// centerline (array of double pointers) - points on kdtree must be double
-		unsigned int n_data = A.total_points();          				// set the number of points
-		c = (double**) malloc(sizeof(double*) * n_data); 				// allocate the array pointer
-		for(unsigned int i = 0; i < n_data; i++)		 			// allocate space for each point of 3 dimensions
-			c[i] = (double*) malloc(sizeof(double) * 3);
+		T *c;						                 			// flat array holding every centerline point of A as (x, y, z)
+		size_t n_data = A.total_points();          				// set the number of points
+		c = (T*) malloc(sizeof(T) * n_data * 3);				// allocate an array to store all points in the data set
+		// NOTE(review): c is never freed; whether stim::kdtree copies or keeps this buffer is not visible here - confirm before adding free(c)
 		unsigned t = 0;
-		for(unsigned e = 0; e < A.E.size(); e++){					//for each edge in the network
-			for(unsigned p = 0; p < A.E[e].size(); p++){				//for each point in the edge
+		for(unsigned e = 0; e < A.E.size(); e++){				//for each edge in the network
+			for(unsigned p = 0; p < A.E[e].size(); p++){		//for each point in the edge
 				for(unsigned d = 0; d < 3; d++){				//for each coordinate
-					c[t][d] = A.E[e][p][d];
+					c[t * 3 + d] = A.E[e][p][d];				//copy the point into the array c
 				}
 				t++;
 			}
 		}
-		//compare each point in the current network to the field produced by A
-		ANNpointArray pts = (ANNpointArray)c;           				// create an array of data points of type double
-		kdt = new ANNkd_tree(pts, n_data, 3);						// build a KD tree using the annpointarray
-		double eps = 0; // error bound
-		ANNdistArray dists = new ANNdist[1];     					// near neighbor distances
-		ANNidxArray nnIdx = new ANNidx[1];						// near neighbor indices // allocate near neigh indices
+		//generate a KD-tree for network A
+		size_t MaxTreeLevels = 3;								// max tree level
+		
+#ifdef __CUDACC__
+		cudaSetDevice(device);
+		stim::kdtree<T, 3> kdt;								// kd tree over the points of A
+
+		kdt.create(c, n_data, MaxTreeLevels);				// build a KD tree
-		stim::vec3<T> p0, p1;
-		float m1;
-		float M = 0;									//stores the total metric value
-		float L = 0;									//stores the total network length
-		ANNpoint queryPt = annAllocPt(3);
 		for(unsigned e = 0; e < R.E.size(); e++){					//for each edge in A
-			R.E[e].add_mag(0);							//add a new magnitude for the metric
+			//R.E[e].add_mag(0);							//add a new magnitude for the metric
+			//size_t errormag_id = R.E[e].nmags() - 1;		//get the id for the new magnitude
+			
+			size_t n = R.E[e].size();						// the number of points in current edge
+			if(n == 0) continue;							// nothing to compare; also avoids threads == 0 (division by zero) below
+			T* queryPt = new T[3 * n];
+			T* m1 = new T[n];
+			T* dists = new T[n];
+			size_t* nnIdx = new size_t[n];
+
+			T* d_dists;										
+			T* d_m1;										
+			cudaMalloc((void**)&d_dists, n * sizeof(T));
+			cudaMalloc((void**)&d_m1, n * sizeof(T));
+
+			edge2array(queryPt, R.E[e]);
+			kdt.search(queryPt, n, nnIdx, dists);		
+
+			cudaMemcpy(d_dists, dists, n * sizeof(T), cudaMemcpyHostToDevice);					// copy dists from host to device
-			for(unsigned p = 0; p < R.E[e].size(); p++){				//for each point in the edge
+			// configuration parameters
+			size_t threads = (1024>n)?n:1024;
+			size_t blocks = n/threads + ((n%threads)?1:0);										// ceil(n / threads); the parentheses matter because ?: binds looser than +
-				p1 = R.E[e][p];							//get the next point in the edge
-				stim2ann(queryPt, p1);
-				kdt->annkSearch( queryPt, 1, nnIdx, dists, eps);		//find the distance between A and the current network
-				m1 = 1.0f - gaussianFunction((float)dists[0], sigma);		//calculate the metric value based on the distance
-				R.E[e].set_mag(m1, p, 1);					//set the error for the second point in the segment
+			find_metric_parallel<<<blocks, threads>>>(d_m1, n, d_dists, sigma);					//calculate the metric value based on the distance
+			cudaMemcpy(m1, d_m1, n * sizeof(T), cudaMemcpyDeviceToHost);
+
+			for(unsigned p = 0; p < n; p++){
+				R.E[e].set_r(p, m1[p]);
 			}
+
+			//d_set_mag<<<blocks, threads>>>(R.E[e].M, errormag_id, n, m1);
+
+			// release the per-edge device and host buffers (the cudaMemcpy above has already completed)
+			cudaFree(d_dists);
+			cudaFree(d_m1);
+			delete[] queryPt;
+			delete[] m1;
+			delete[] dists;
+			delete[] nnIdx;
 		}
+#else
+		stim::kdtree<T, 3> kdt;
+		kdt.create(c, n_data, MaxTreeLevels);
+	
+		for(unsigned e = 0; e < R.E.size(); e++){			//for each edge in the copy of the current network
+
+			size_t n = R.E[e].size();						// the number of points in current edge
+			T* query = new T[3 * n];
+			T* m1 = new T[n];
+			T* dists = new T[n];
+			size_t* nnIdx = new size_t[n];
+
+			edge2array(query, R.E[e]);
+
+			kdt.cpu_search(query, n, nnIdx, dists);			//find the distance between A and the current network
+
+			for(unsigned p = 0; p < R.E[e].size(); p++){
+				m1[p] = 1.0f - gaussianFunction((T)dists[p], sigma);	//calculate the metric value based on the distance
+				R.E[e].set_r(p, m1[p]);					//store the metric for this point
+			}
+
+			// release the per-edge buffers
+			delete[] query;
+			delete[] m1;
+			delete[] dists;
+			delete[] nnIdx;
+		}
+#endif
 		return R;		//return the resulting network
 	}
-	/// Returns the number of magnitude values stored in each edge. This should be uniform across the network.
-	unsigned nmags(){
-		return E[0].nmags();
+	/// This function compares two networks and splits the edges of A according to the nearest neighbor (in B) of each point in each edge.
+	/// The result replaces the current network: E and V are overwritten with the (split) edges and vertices of A.
+	/// @param A is the network to split
+	/// @param B is the corresponding mapping network
+	/// @param sigma is the user-defined tolerance value - smaller values provide a stricter comparison
+	/// @param device is the device that user want to use (passed to cudaSetDevice() when compiled with nvcc)
+	/// @param threshold is compared against the mean metric of each edge; an edge whose mean exceeds it has its split index moved to its last point
+	void split(stim::network<T> A, stim::network<T> B, float sigma, int device, float threshold){
+
+		T *c;													// flat array of all points in B: (x, y, z) per point
+		size_t n_data = B.total_points();          				
+		c = (T*) malloc(sizeof(T) * n_data * 3); 				
+		// NOTE(review): c is never freed; whether stim::kdtree copies or keeps this buffer is not visible here - confirm before adding free(c)
+
+		size_t NB = B.E.size();								// the number of edges in B
+		unsigned t = 0;
+		for(unsigned e = 0; e < NB; e++){					// for every edge in B			
+			for(unsigned p = 0; p < B.E[e].size(); p++){	// for every points in B.E[e]
+				for(unsigned d = 0; d < 3; d++){				
+
+					c[t * 3 + d] = B.E[e][p][d];			// convert to array
+				}
+				t++;
+			}
+		}
+		size_t MaxTreeLevels = 3;							// max tree level
+
+#ifdef __CUDACC__
+		cudaSetDevice(device);
+		stim::kdtree<T, 3> kdt;								// kd tree over the points of B
+	
+		//compare each point of A to the points of B
+		kdt.create(c, n_data, MaxTreeLevels);				// build a KD tree
+
+		std::vector<std::vector<unsigned> > relation;		// the relationship between GT and T corresponding to NN
+		relation.resize(A.E.size());										
+
+		for(unsigned e = 0; e < A.E.size(); e++){			//for each edge in A
+			//A.E[e].add_mag(0);								//add a new magnitude for the metric
+			//size_t errormag_id = A.E[e].nmags() - 1;
+			
+			size_t n = A.E[e].size();						// the number of points in the current edge
+			if(n == 0) continue;							// nothing to search; also avoids threads == 0 (division by zero) below
+
+			T* queryPt = new T[3 * n];							// set of all the points in current edge
+			T* m1 = new T[n];								// array of metrics for every point in current edge
+			T* dists = new T[n];							// store the distances for every point in current edge
+			size_t* nnIdx = new size_t[n];					// store the indices for every point in current edge
+			
+			// define pointers in device
+			T* d_dists;														
+			T* d_m1;
+			size_t* d_nnIdx;
+
+			// allocate memory for defined pointers
+			cudaMalloc((void**)&d_dists, n * sizeof(T));
+			cudaMalloc((void**)&d_m1, n * sizeof(T));
+			cudaMalloc((void**)&d_nnIdx, n * sizeof(size_t));
+
+			edge2array(queryPt, A.E[e]);						// convert edge to array
+			kdt.search(queryPt, n, nnIdx, dists);				// search the tree to find the NN for every point in current edge
+
+			cudaMemcpy(d_dists, dists, n * sizeof(T), cudaMemcpyHostToDevice);					// copy dists from host to device
+			cudaMemcpy(d_nnIdx, nnIdx, n * sizeof(size_t), cudaMemcpyHostToDevice);				// copy Idx from host to device
+
+			// configuration parameters
+			size_t threads = (1024>n)?n:1024;													// test to see whether the number of point in current edge is more than 1024
+			size_t blocks = n/threads + ((n%threads)?1:0);										// ceil(n / threads); the parentheses matter because ?: binds looser than +
+
+			find_metric_parallel<<<blocks, threads>>>(d_m1, n, d_dists, sigma);								// calculate the metrics in parallel
+
+			cudaMemcpy(m1, d_m1, n * sizeof(T), cudaMemcpyDeviceToHost);
+
+			for(unsigned p = 0; p < n; p++){
+				A.E[e].set_r(p, m1[p]);											// set the error(radius) value to every point in current edge
+			}
+
+			relation[e].resize(n);																// resize every edge relation size
+
+			unsigned* d_relation;
+			cudaMalloc((void**)&d_relation, n * sizeof(unsigned));								// allocate memory
+
+			std::vector<size_t> edge_point_num(NB);												// %th element is the number of points that %th edge has
+			for(unsigned ee = 0; ee < NB; ee++)
+				edge_point_num[ee] = B.E[ee].size();
+
+			size_t* d_edge_point_num;
+			cudaMalloc((void**)&d_edge_point_num, NB * sizeof(size_t));
+			cudaMemcpy(d_edge_point_num, &edge_point_num[0], NB * sizeof(size_t), cudaMemcpyHostToDevice);
+
+			find_edge_index_parallel<<<blocks, threads>>>(d_nnIdx, n, d_relation, d_edge_point_num, NB);			// find the edge corresponding to the array index in parallel
+
+			cudaMemcpy(&relation[e][0], d_relation, n * sizeof(unsigned), cudaMemcpyDeviceToHost);	//copy relationship from device to host
+
+			// release the per-edge device and host buffers (the cudaMemcpy above has already completed)
+			cudaFree(d_dists);
+			cudaFree(d_m1);
+			cudaFree(d_nnIdx);
+			cudaFree(d_relation);
+			cudaFree(d_edge_point_num);
+			delete[] queryPt;
+			delete[] m1;
+			delete[] dists;
+			delete[] nnIdx;
+		}
+#else
+		stim::kdtree<T, 3> kdt;
+		kdt.create(c, n_data, MaxTreeLevels);
+	
+		std::vector<std::vector<unsigned> > relation;		// the mapping relationship between two networks
+		relation.resize(A.E.size());										
+		for(unsigned i = 0; i < A.E.size(); i++)
+			relation[i].resize(A.E[i].size());
+
+		for(unsigned e = 0; e < A.E.size(); e++){			//for each edge in A
+			
+			size_t n = A.E[e].size();						//the number of points in the current edge
+
+			T* queryPt = new T[3 * n];
+			T* m1 = new T[n];
+			T* dists = new T[n];							//store the distances
+			size_t* nnIdx = new size_t[n];					//store the indices
+			
+			edge2array(queryPt, A.E[e]);
+			kdt.search(queryPt, n, nnIdx, dists);			// NOTE(review): compare() calls kdt.cpu_search() in its non-CUDA path - confirm search() is the intended call here
+
+			for(unsigned p = 0; p < A.E[e].size(); p++){
+				m1[p] = 1.0f - gaussianFunction((T)dists[p], sigma);	//calculate the metric value based on the distance
+				A.E[e].set_r(p, m1[p]);									//store the metric for this point
+				
+				unsigned id = 0;																	//mapping edge's idx
+				size_t num = 0;																		//total number of points before #th edge
+				for(unsigned i = 0; i < NB; i++){
+					num += B.E[i].size();
+					if(nnIdx[p] < num){																//find the edge it belongs to
+						relation[e][p] = id;
+						break;
+					}
+					id++;																			//current edge won't be the one, move to next edge
+				}
+			}
+
+			// release the per-edge buffers
+			delete[] queryPt;
+			delete[] m1;
+			delete[] dists;
+			delete[] nnIdx;
+		}
+#endif
+		E = A.E;
+		V = A.V;
+
+		// NOTE(review): threshold_fac is not declared inside this function - presumably a member or global; verify
+		unsigned int id = 0;									// split value								
+		for(unsigned e = 0; e < E.size(); e++){					// for every edge
+			for(unsigned p = 0; p < E[e].size() - 1; p++){		// for every point in each edge
+				int t = (int)(E[e].length() / sigma * 2);
+				if (t <= 20)
+					threshold_fac = E[e].size();
+				else
+					threshold_fac = (E[e].length() / sigma * 2)/10;
+				if(relation[e][p] != relation[e][p + 1]){		// the nearest edge changes between p and p + 1
+					id = p + 1;									// candidate split index
+					if(id < threshold_fac || (E[e].size() - id) < threshold_fac)				
+						id = E[e].size() - 1;																			// extreme situation is not acceptable
+					else
+						break;
+				}
+				if(p == E[e].size() - 2)																// if there is no splitting index, set the id to the last point index of current edge
+					id = E[e].size() - 1;
+			}
+			//unsigned errormag_id = E[e].nmags() - 1;
+			T G = 0;																					// test to see whether it has its nearest neighbor
+			for(unsigned i = 0; i < E[e].size(); i++)
+				G += E[e].r(i);																			// won't split special edges
+			if(G / E[e].size() > threshold)															// should based on the color map
+				id = E[e].size() - 1;																	// set split idx to outgoing direction vertex
+
+			std::vector<edge> tmpe;
+			tmpe.resize(2);
+			tmpe = E[e].split(id);
+			vertex tmpv = stim::vec3<T>(-1, -1, 0);														// store the split point as vertex
+			if(tmpe.size() == 2){
+				relation.resize(relation.size() + 1);
+				for(unsigned d = id; d < E[e].size(); d++)
+					relation[relation.size() - 1].push_back(relation[e][d]);
+				tmpe[0].v[0] = E[e].v[0];																// begining vertex of first half edge -> original begining vertex
+				tmpe[1].v[1] = E[e].v[1];																// ending vertex of second half edge -> original ending vertex
+				tmpv = E[e][id];
+				V.push_back(tmpv);
+				tmpe[0].v[1] = (unsigned)V.size() - 1;													// ending vertex of first half edge -> new vertex
+				tmpe[1].v[0] = (unsigned)V.size() - 1;													// begining vertex of second half edge -> new vertex
+				edge tmp(E[e]);
+				E[e] = tmpe[0];																			// replace original edge by first half edge
+				E.push_back(tmpe[1]);																	// push second half edge to the last
+				V[V.size() - 1].e[1].push_back(e);														// push first half edge to the incoming of new vertex
+				V[V.size() - 1].e[0].push_back((unsigned)E.size() - 1);									// push second half edge to the outgoing of new vertex
+				for(unsigned i = 0; i < V[tmp.v[1]].e[1].size(); i++)									// find the incoming edge of original ending vertex
+					if(V[tmp.v[1]].e[1][i] == e)
+						V[tmp.v[1]].e[1][i] = (unsigned)E.size() - 1;									// set to new edge
+			}
+		}
 	}
+
+	/// This function compares two split networks and yields a mapping relationship between them according to NN.
+	/// For every edge of the current network the nearest point of B to the edge's middle point is looked up,
+	/// and the index of the edge of B that owns that point is stored in C.
+	/// @param B is the network that the current network is going to map to
+	/// @param C is the mapping relationship: C[e1] = _e1 means e1 edge in current network is mapping the _e1 edge in B; (unsigned)-1 marks an edge whose mean metric exceeds threshold
+	/// @param device is the device that user want to use (passed to cudaSetDevice() when compiled with nvcc)
+	/// @param threshold is the largest mean r() value an edge may have and still be mapped
+	void mapping(stim::network<T> B, std::vector<unsigned> &C, int device, float threshold){
+		stim::network<T> A;								//work on a copy of the current network
+		A = (*this);
+
+		size_t n = A.E.size();							// the number of edges in A
+		size_t NB = B.E.size();							// the number of edges in B
+
+		C.resize(A.E.size());	
+
+		T *c;						                 	// flat array of all points in B: (x, y, z) per point
+		size_t n_data = B.total_points();          		// set the number of points
+		c = (T*) malloc(sizeof(T) * n_data * 3); 				
+		// NOTE(review): c is never freed; whether stim::kdtree copies or keeps this buffer is not visible here - confirm before adding free(c)
+
+		unsigned t = 0;
+		for(unsigned e = 0; e < NB; e++){					// for each edge in the network
+			for(unsigned p = 0; p < B.E[e].size(); p++){	// for each point in the edge
+				for(unsigned d = 0; d < 3; d++){			// for each coordinate
+
+					c[t * 3 + d] = B.E[e][p][d];
+				}
+				t++;
+			}
+		}
+
+		//generate a KD-tree for network B
+		size_t MaxTreeLevels = 3;									// max tree level
+		
+#ifdef __CUDACC__
+		cudaSetDevice(device);
+		stim::kdtree<T, 3> kdt;								// kd tree over the points of B
+	
+		kdt.create(c, n_data, MaxTreeLevels);				// build a KD tree
+
+		for(unsigned e = 0; e < n; e++){					//for each edge in A
+			//size_t errormag_id = A.E[e].nmags() - 1;		//get the id for the new magnitude
+			
+			//pre-judge to get rid of impossibly mapping edges
+			T M = 0;
+			for(unsigned p = 0; p < A.E[e].size(); p++)
+				M += A.E[e].r(p);
+			M = M / A.E[e].size();
+			if(M > threshold)
+				C[e] = (unsigned)-1;						//set the nearest edge of impossibly mapping edges to maximum of unsigned
+			else{
+				T* queryPt = new T[3];
+				T* dists = new T[1];
+				size_t* nnIdx = new size_t[1];
+
+				stim2array(queryPt, A.E[e][A.E[e].size()/2]);	//query with the middle point of the edge
+				kdt.search(queryPt, 1, nnIdx, dists);
+				
+				unsigned id = 0;							//mapping edge's idx
+				size_t num = 0;								//total number of points before #th edge
+				for(unsigned i = 0; i < NB; i++){
+					num += B.E[i].size();
+					if(nnIdx[0] < num){
+						C[e] = id;
+						break;
+					}
+					id++;
+				}
+
+				// release the per-edge query buffers
+				delete[] queryPt;
+				delete[] dists;
+				delete[] nnIdx;
+			}
+		}
+#else
+		stim::kdtree<T, 3> kdt;
+		kdt.create(c, n_data, MaxTreeLevels);
+		T *dists = new T[1];								// near neighbor distances
+		size_t *nnIdx = new size_t[1];						// near neighbor indices // allocate near neigh indices
+
+		stim::vec3<T> p1;
+		T* queryPt = new T[3];
+
+		for(unsigned int e = 0; e < A.E.size(); e++){			// for each edge in A
+			T M = 0;											// the sum of metrics of current edge (must start at zero)
+			for(unsigned p = 0; p < A.E[e].size(); p++)
+				M += A.E[e].r(p);
+			M = M / A.E[e].size();
+			if(M > threshold)								
+				C[e] = (unsigned)-1;
+			else{											// if it should have corresponding edge in B, then...
+				p1 = A.E[e][A.E[e].size()/2];				// query with the middle point of the edge
+				stim2array(queryPt, p1);
+				kdt.cpu_search(queryPt, 1, nnIdx, dists);	// search the tree		
+				
+				unsigned id = 0;							//mapping edge's idx
+				size_t num = 0;								//total number of points before #th edge
+				for(unsigned i = 0; i < NB; i++){
+					num += B.E[i].size();
+					if(nnIdx[0] < num){
+						C[e] = id;
+						break;
+					}
+					id++;
+				}
+			}
+		}
+
+		// release the query buffers shared by all edges
+		delete[] queryPt;
+		delete[] dists;
+		delete[] nnIdx;
+#endif
+	}
+
+	/// Returns the number of magnitude values stored in each edge. This should be uniform across the network.
+	//unsigned nmags(){
+	//	return E[0].nmags();
+	//}
 	// split a string in text by the character sep
 	stim::vec<T> split(std::string &text, char sep) 
 	{
@@ -488,7 +1294,7 @@ public:
 	void load_txt(std::string filename)
 	{
 		std::vector <std::string> file_contents;
-		std::ifstream file(filename);
+		std::ifstream file(filename.c_str());
 		std::string line;
 		std::vector<unsigned> id2vert;	//this list stores the vertex ID associated with each network vertex
 		//for each line in the text file, store them as strings in file_contents
@@ -539,7 +1345,7 @@ public:
 			for(unsigned int d = 0; d < 3; d++){
 				ss<<p[i][d];
 			}
-			ss < "\n";
+			ss << "\n";
 		}
 		return ss.str();
 	}
@@ -553,8 +1359,8 @@ public:
 	void
 	to_txt(std::string filename)
 	{
-		std::ofstream ofs(filename, std::ofstream::out | std::ofstream::app);
-		int num;
+		std::ofstream ofs(filename.c_str(), std::ofstream::out | std::ofstream::app);
+		//int num;
 		ofs << (E.size()).str() << "\n";
 		for(unsigned int i = 0; i < E.size(); i++)
 		{
@@ -567,7 +1373,8 @@ public:
 		{
 			std::string str;
 			str = V[i].str();
-			removeCharsFromString(str, "[],");
+			char temp[4] = "[],";
+			removeCharsFromString(str, temp);
 			ofs << str << "\n";
 		}
 		ofs.close();
+/*
+Copyright <2017> <David Mayerich>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
 #ifndef STIM_NETWORK_H
 #define STIM_NETWORK_H
 #include <stim/math/vector.h>
 #include <stim/visualization/obj.h>
 #include <list>
-#include <ANN/ANN.h>
+//#include <ANN/ANN.h>
 namespace stim{
@@ -4,14 +4,14 @@
 namespace stim{
 	namespace cuda{
 		template<typename T>
-		__global__ void cuda_cart2polar(T* a, int x, int y, float rotation){
+		__global__ void cuda_cart2polar(T* a, size_t x, size_t y, float rotation){
 			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
+			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
 			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
+			size_t i = yi * x + xi;
 			if(xi >= x|| yi >= y) return;
@@ -27,11 +27,11 @@ namespace stim{
 		template<typename T>
-		void gpu_cart2polar(T* gpuGrad, unsigned int x, unsigned int y, float rotation = 0){
+		void gpu_cart2polar(T* gpuGrad, size_t x, size_t y, float rotation = 0){
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			dim3 blocks(((unsigned int)x/threads.x) + ((unsigned int)x %threads.x == 0 ? 0:1) , (unsigned int)y);
 			//call the kernel to do the multiplication
 			cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y, rotation);
@@ -40,11 +40,11 @@ namespace stim{
 		template<typename T>
-		void cpu_cart2polar(T* a, unsigned int x, unsigned int y){
+		void cpu_cart2polar(T* a, size_t x, size_t y){
 			//calculate the number of bytes in the array
-			unsigned int N = x *y;
-			unsigned int bytes = N * sizeof(T) * 2;
+			size_t N = x *y;
+			size_t bytes = N * sizeof(T) * 2;
 			//allocate memory on the GPU for the array
 			T* gpuA;
@@ -11,7 +11,7 @@ typedef unsigned int uint;
 std::vector< stim::vec<float> >
-find_branch(GLint texbufferID, GLenum texType, unsigned int x, unsigned int y)
+find_branch(GLint texbufferID, GLenum texType, unsigned int x, unsigned int y, int iter = 0)
 {
 	float		sigma		= 2.0;
 	unsigned int	conn		= 7;
@@ -22,7 +22,7 @@ find_branch(GLint texbufferID, GLenum texType, unsigned int x, unsigned int y)
 	stringstream name;
-	cpuCenters = stim::cuda::get_centers(texbufferID, texType, x, y, sizek, sigma, conn, threshold);
+	cpuCenters = stim::cuda::get_centers(texbufferID, texType, x, y, sizek, sigma, conn, threshold, iter);
 	cudaDeviceSynchronize();
@@ -7,4 +7,10 @@
 #define CUDA_CALLABLE
 #endif
+#ifdef __CUDACC__
+#define CUDA_UNCALLABLE __host__ inline
+#else
+#define CUDA_UNCALLABLE
+#endif
+
 #endif
-#ifndef RTS_CUDA_DEVICES
-#define RTS_CUDA_DEVICES
+#ifndef STIM_CUDA_DEVICES
+#define STIM_CUDA_DEVICES
 #include <cuda.h>
 namespace stim{
-extern "C"
-int maxThreadsPerBlock()
-{
-	int device;
-	cudaGetDevice(&device);		//get the id of the current device
-	cudaDeviceProp props;		//device property structure
-	cudaGetDeviceProperties(&props, device);
-	return props.maxThreadsPerBlock;
-}
-
-extern "C"
-size_t sharedMemPerBlock()
-{
-	int device;
-	cudaGetDevice(&device);		//get the id of the current device
-	cudaDeviceProp props;		//device property structure
-	cudaGetDeviceProperties(&props, device);
-	return props.sharedMemPerBlock;
-}
-
-extern "C"
-size_t constMem()
-{
-	int device;
-	cudaGetDevice(&device);		//get the id of the current device
-	cudaDeviceProp props;		//device property structure
-	cudaGetDeviceProperties(&props, device);
-	return props.totalConstMem;
-}
+	//returns the maxThreadsPerBlock property of the current CUDA device (runtime return codes are not checked)
+	extern "C"
+	int maxThreadsPerBlock(){
+		cudaDeviceProp info;					//device property structure
+		int dev_id;
+		cudaGetDevice(&dev_id);					//ask the runtime which device is current
+		cudaGetDeviceProperties(&info, dev_id);	//fill the structure for that device
+		return info.maxThreadsPerBlock;
+	}
+
+	//returns the sharedMemPerBlock property of the current CUDA device (runtime return codes are not checked)
+	extern "C"
+	size_t sharedMemPerBlock(){
+		cudaDeviceProp info;					//device property structure
+		int dev_id;
+		cudaGetDevice(&dev_id);					//ask the runtime which device is current
+		cudaGetDeviceProperties(&info, dev_id);	//fill the structure for that device
+		return info.sharedMemPerBlock;
+	}
+
+	//returns the totalConstMem property of the current CUDA device (runtime return codes are not checked)
+	extern "C"
+	size_t constMem(){
+		cudaDeviceProp info;					//device property structure
+		int dev_id;
+		cudaGetDevice(&dev_id);					//ask the runtime which device is current
+		cudaGetDeviceProperties(&info, dev_id);	//fill the structure for that device
+		return info.totalConstMem;
+	}
+
+	//tests that a given device ID is valid and provides at least the specified compute capability
+	//	d is the device ID to test; valid IDs are 0 .. (device count - 1)
+	//	major, minor give the minimum compute capability (major.minor) that is required
+	//	returns true only if the device exists and its compute capability is >= major.minor
+	bool testDevice(int d, int major, int minor){
+		int nd = 0;
+		if(cudaGetDeviceCount(&nd) != cudaSuccess)		//get the number of CUDA devices
+			return false;								//no usable device count - nothing can be valid
+		if(d < 0 || d >= nd)							//the ID nd itself is one past the last device
+			return false;
+
+		cudaDeviceProp props;
+		if(cudaGetDeviceProperties(&props, d) != cudaSuccess)	//get the device properties structure
+			return false;
+
+		if(props.major > major)
+			return true;
+		return props.major == major && props.minor >= minor;
+	}
+
+	//tests each device ID in a list and returns the number of devices that fit the desired
+	//	compute capability
+	//	dlist is an array of n_devices device IDs; each one is checked with testDevice()
+	int testDevices(int* dlist, unsigned n_devices, int major, int minor){
+		int valid = 0;
+		for(unsigned d = 0; d < n_devices; d++){		//unsigned index matches the type of n_devices
+			if(testDevice(dlist[d], major, minor))
+				valid++;
+		}
+		return valid;
+	}
+
+	//prints a short diagnosis for the given device ID: whether the ID is valid and, if it is,
+	//	the compute capability of the device
+	void printDevice(int device){
+		int nd = 0;						//stays 0 if the count query fails, so every ID is reported as unavailable
+		cudaGetDeviceCount(&nd);		//get the number of CUDA devices
+		printf("CUDA Device Diagnosis: [%i]\n", device);
+		if(device < 0){
+			printf("Device %i is an invalid device ID\n", device);
+		}
+		else if(device >= nd){
+			printf("Device %i is unavailable - only %i devices are detected\n", device, nd);
+		}
+		else{
+			cudaDeviceProp props;
+			cudaGetDeviceProperties(&props, device);	//get the device properties structure
+			printf("compute capability: %i.%i\n", props.major, props.minor);
+		}
+	}
 }	//end namespace stim
 #endif
+#ifndef STIM_CUDA_ERROR_H
+#define STIM_CUDA_ERROR_H
+
 #include <stdio.h>
 #include <iostream>
-using namespace std;
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
 #include "cufft.h"
-
-#ifndef CUDA_HANDLE_ERROR_H
-#define CUDA_HANDLE_ERROR_H
+#include "cublas_v2.h"
 //Error handler behind the HANDLE_ERROR macro: when err is not cudaSuccess it prints the CUDA error string with the file and line; it does not abort
-static void HandleError( cudaError_t err, const char *file,  int line ) {
+static void cuHandleError( cudaError_t err, const char *file,  int line ) {
    	if (err != cudaSuccess) {
             printf("%s in %s at line %d\n", cudaGetErrorString( err ),  file, line );
+
    	}
 }
-#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
-
-static void CufftError( cufftResult err )
+#define HANDLE_ERROR( err ) (cuHandleError( err, __FILE__, __LINE__ ))
+//Error handler behind the CUFFT_HANDLE_ERROR macro: when err is not CUFFT_SUCCESS it prints a
+//description of the error followed by the file and line of the failing call; it does not abort
+static void cufftHandleError( cufftResult err, const char*file, int line )
 {
     if (err != CUFFT_SUCCESS)
     {
         if(err == CUFFT_INVALID_PLAN)
-            cout<<"The plan parameter is not a valid handle."<<endl;
+            std::cout<<"The plan parameter is not a valid handle."<<std::endl;
         else if(err == CUFFT_ALLOC_FAILED)
-            cout<<"Allocation failed."<<endl;
+            std::cout<<"Allocation failed."<<std::endl;
         else if(err == CUFFT_INVALID_VALUE)
-            cout<<"At least one of the parameters idata, odata, and direction is not valid."<<endl;
+            std::cout<<"At least one of the parameters idata, odata, and direction is not valid."<<std::endl;
         else if(err == CUFFT_INTERNAL_ERROR)
-            cout<<"An internal driver error was detected."<<endl;
+            std::cout<<"An internal driver error was detected."<<std::endl;
         else if(err == CUFFT_EXEC_FAILED)
-            cout<<"CUFFT failed to execute the transform on the GPU."<<endl;
+            std::cout<<"CUFFT failed to execute the transform on the GPU."<<std::endl;
         else if(err == CUFFT_SETUP_FAILED)
-            cout<<"The CUFFT library failed to initialize."<<endl;
+            std::cout<<"The CUFFT library failed to initialize."<<std::endl;
         else
-            cout<<"Unknown error: "<<err<<endl;
+            std::cout<<"Unknown error: "<<err<<std::endl;
+        std::cout<<"    in "<<file<<" at line "<<line<<std::endl;		//report where the failing call was made
     }
 }
+#define CUFFT_HANDLE_ERROR( err ) (cufftHandleError( err, __FILE__, __LINE__ ))
+//Error handler behind the CUBLAS_HANDLE_ERROR macro: when err is not CUBLAS_STATUS_SUCCESS it
+//prints the name of the status code followed by the file and line of the failing call; it does not abort
+static void cublasHandleError( cublasStatus_t err, const char*file, int line ){
+	if(err == CUBLAS_STATUS_SUCCESS)
+		return;
+
+	const char* name;								//textual name of the status code
+	if(err == CUBLAS_STATUS_NOT_INITIALIZED)
+		name = "CUBLAS_STATUS_NOT_INITIALIZED";
+	else if(err == CUBLAS_STATUS_ALLOC_FAILED)
+		name = "CUBLAS_STATUS_ALLOC_FAILED";
+	else if(err == CUBLAS_STATUS_INVALID_VALUE)
+		name = "CUBLAS_STATUS_INVALID_VALUE";
+	else if(err == CUBLAS_STATUS_ARCH_MISMATCH)
+		name = "CUBLAS_STATUS_ARCH_MISMATCH";
+	else if(err == CUBLAS_STATUS_MAPPING_ERROR)
+		name = "CUBLAS_STATUS_MAPPING_ERROR";
+	else if(err == CUBLAS_STATUS_EXECUTION_FAILED)
+		name = "CUBLAS_STATUS_EXECUTION_FAILED";
+	else if(err == CUBLAS_STATUS_INTERNAL_ERROR)
+		name = "CUBLAS_STATUS_INTERNAL_ERROR";
+	else
+		name = "Unknown error";
+
+	std::cout<<name<<" in file "<<file<<" line "<<line<<std::endl;	//the line number was previously left out of the message
+}
+#define CUBLAS_HANDLE_ERROR( err ) (cublasHandleError( err, __FILE__, __LINE__ ))
 #endif
@@ -26,6 +26,11 @@ namespace stim
 	float* LoG;
 	float* res;
 	float* centers;
+
+//#ifdef DEBUG
+	float* print;
+//#endif
+
 	stim::cuda::cuda_texture tx;
@@ -44,6 +49,13 @@ namespace stim
 		HANDLE_ERROR(
 			cudaMalloc( (void**) &centers, DIM_Y*DIM_X*sizeof(float))
 		);
+
+//#ifdef DEBUG
+		HANDLE_ERROR(
+			cudaMalloc( (void**) &print, DIM_Y*DIM_X*sizeof(float))
+		);
+//#endif
+
 	//	checkCUDAerrors("Memory Allocation, Result");
 	}
@@ -58,6 +70,11 @@ namespace stim
 		HANDLE_ERROR(
 			cudaFree(centers)
 		);
+//#ifdef DEBUG
+		HANDLE_ERROR(
+			cudaFree(print)
+		);
+//#endif
 			free(LoG);
 	}
@@ -89,7 +106,7 @@ namespace stim
 	//Shared memory would be better.
 	__global__
 	void
-	applyFilter(cudaTextureObject_t texIn, unsigned int DIM_X, unsigned int DIM_Y, int kr, int kl, float *res, float* gpuLoG){
+	applyFilter(cudaTextureObject_t texIn, unsigned int DIM_X, unsigned int DIM_Y, int kr, int kl, float *res, float* gpuLoG, float* p){
 	//R = floor(size/2)
 	//THIS IS A NAIVE WAY TO DO IT, and there is a better way)
@@ -101,16 +118,15 @@ namespace stim
 	//	float val = 0;
 		float tu = (x-kr+xi)/(float)DIM_X;
 		float tv = (y-kr+yi)/(float)DIM_Y;
+		int idx = y*DIM_X+x;
 		shared[xi][yi] = gpuLoG[yi*kl+xi]*(255.0-(float)tex2D<unsigned char>(texIn, tu, tv));
 		__syncthreads();
-	
 		//x = max(0,x);
 		//x = min(x, width-1);
 		//y = max(y, 0);
 		//y = min(y, height - 1);
-		int idx = y*DIM_X+x;
 	//	int k_idx;
                 for(unsigned int step = blockDim.x/2; step >= 1; step >>= 1)
                 {
@@ -135,11 +151,12 @@ namespace stim
                 __syncthreads();
                 if(xi == 0 && yi == 0)
                         res[idx] = shared[0][0];
+
 	}
 	extern "C"
 	float *
-	get_centers(GLint texbufferID, GLenum texType, int DIM_X, int DIM_Y, int sizeK, float sigma, float conn, float threshold)
+	get_centers(GLint texbufferID, GLenum texType, int DIM_X, int DIM_Y, int sizeK, float sigma, float conn, float threshold, int iter = 0)
 	{
 		tx.SetTextureCoordinates(1);
 		tx.SetAddressMode(1, 3);
@@ -153,7 +170,14 @@ namespace stim
 		dim3 numBlocks(DIM_X, DIM_Y);
 		dim3 threadsPerBlock(sizeK, sizeK);
-		applyFilter <<< numBlocks, threadsPerBlock >>> (tx.getTexture(), DIM_X, DIM_Y, floor(sizeK/2), sizeK, res, gpuLoG);
+		applyFilter <<< numBlocks, threadsPerBlock >>> (tx.getTexture(), DIM_X, DIM_Y, floor(sizeK/2), sizeK, res, gpuLoG, print);
+
+		#ifdef DEBUG
+			stringstream name;
+			name.str("");
+			name << "Fiber Cylinder " << iter << ".bmp";
+			stim::gpu2image<float>(res, name.str(), DIM_X, DIM_Y, 0, 255);
+		#endif
 		stim::cuda::gpu_local_max<float>(centers, res, threshold, conn, DIM_X, DIM_Y);
+//This software is derived from Professor Wei-keng Liao's parallel k-means
+//clustering code obtained on November 21, 2010 from
+// http://users.eecs.northwestern.edu/~wkliao/Kmeans/index.html
+//(http://users.eecs.northwestern.edu/~wkliao/Kmeans/simple_kmeans.tar.gz).
+//
+//With his permission, Serban Giuroiu is publishing his CUDA implementation based on his code
+//under the open-source MIT license. See the LICENSE file for more details.
+
+// The original code can be found on Github ( https://github.com/serban/kmeans )
+// Here I have just made a few changes to get it to work
+
+
+
+
+//  Allocates a 2D array of `type` as one contiguous slab of xDim*yDim
+//  elements plus a table of xDim row pointers: name[0] owns the slab and
+//  name[r] points at row r inside it.  Release with free(name[0]) followed
+//  by free(name).
+//
+//  Every macro argument is parenthesised so expression arguments expand
+//  correctly, the element count is computed in size_t so large extents do not
+//  wrap in 32-bit arithmetic, and the loop index has a macro-specific name so
+//  it cannot capture an identifier used in the arguments.
+#define malloc2D(name, xDim, yDim, type) do {                                         \
+    (name) = (type **)malloc((size_t)(xDim) * sizeof(type *));                        \
+    assert((name) != NULL);                                                           \
+    (name)[0] = (type *)malloc((size_t)(xDim) * (size_t)(yDim) * sizeof(type));       \
+    assert((name)[0] != NULL);                                                        \
+    for (size_t malloc2D_row = 1; malloc2D_row < (size_t)(xDim); malloc2D_row++)      \
+        (name)[malloc2D_row] = (name)[malloc2D_row - 1] + (yDim);                     \
+} while (0)
+
+
+
+//  Terminates the program when a CUDA runtime call did not succeed: prints
+//  the CUDA error string together with the call site, then exits with
+//  status 1.  Returns normally when error is cudaSuccess.
+static void handleError(cudaError_t error, const char* file, int line){
+
+	if(error == cudaSuccess)
+		return;
+
+	const char* reason = cudaGetErrorString(error);
+	cout << reason << " in " << file <<  " at line " << line << endl;
+	exit(1);
+}
+
+#define  handle_error(error)  handleError(error, __FILE__ , __LINE__)
+
+
+
+//  Rounds n up to a power of two (n itself when it already is one) by
+//  copying the highest set bit of n-1 into every lower bit position and then
+//  adding one.  The shifts cover 32-bit ints only.
+static inline int nextPowerOfTwo(int n) {
+    int bits = n - 1;
+
+    for (int shift = 1; shift <= 16; shift <<= 1)
+        bits |= bits >> shift;
+
+    return bits + 1;
+}
+
+/*----< euclid_dist_2() >----------------------------------------------------*/
+/* square of Euclid distance between two multi-dimensional points            */
+//  Both arrays are stored coordinate-major: coordinate d of object o lives at
+//  objects[numObjs * d + o], and coordinate d of cluster c at
+//  clusters[numClusters * d + c].  Returns the sum over all coordinates of
+//  the squared difference (no square root is taken).
+__host__ __device__ inline static
+float euclid_dist_2(int    numCoords,
+                    int    numObjs,
+                    int    numClusters,
+                    float *objects,     // [numCoords][numObjs]
+                    float *clusters,    // [numCoords][numClusters]
+                    int    objectId,
+                    int    clusterId)
+{
+    float sumSq = 0.0;
+
+    for (int dim = 0; dim < numCoords; dim++) {
+        const float diff = objects[numObjs * dim + objectId] -
+                           clusters[numClusters * dim + clusterId];
+        sumSq += diff * diff;
+    }
+
+    return sumSq;
+}
+
+/*----< find_nearest_cluster() >---------------------------------------------*/
+//  Assigns every object to its nearest cluster centre and records, per
+//  block, how many objects changed cluster.
+//
+//  One thread handles one object (objectId = blockDim.x * blockIdx.x +
+//  threadIdx.x).  Thread 0 of each block writes the number of threads in that
+//  block whose membership changed to intermediates[blockIdx.x].
+//
+//  The per-block count is taken with __syncthreads_count(), which every
+//  thread of the block reaches, including threads whose objectId is past
+//  numObjs.  This avoids a barrier inside a divergent branch and keeps the
+//  count exact for any block size (a reduction in unsigned char storage would
+//  wrap once more than 255 threads report a change).
+//
+//  With BLOCK_SHARED_MEM_OPTIMIZATION defined, the cluster centres are cached
+//  in dynamic shared memory starting blockDim.x bytes into the allocation,
+//  matching the size the host computes for the launch.
+__global__ static
+void find_nearest_cluster(int numCoords,
+                          int numObjs,
+                          int numClusters,
+                          float *objects,           //  [numCoords][numObjs]
+                          float *deviceClusters,    //  [numCoords][numClusters]
+                          int *membership,          //  [numObjs]
+                          int *intermediates)
+{
+#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
+    extern __shared__ char sharedMemory[];
+
+    //  The first blockDim.x bytes are the per-thread flag area reserved by the
+    //  host; the cluster cache follows it.
+    float *clusters = (float *)(sharedMemory + blockDim.x);
+
+    //  BEWARE: We can overrun our shared memory here if there are too many
+    //  clusters or too many coordinates! For reference, a Tesla C1060 has 16
+    //  KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
+    //  shared memory per block.
+    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
+        for (int j = 0; j < numCoords; j++) {
+            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
+        }
+    }
+    __syncthreads();
+#else
+    float *clusters = deviceClusters;
+#endif
+
+    int objectId = blockDim.x * blockIdx.x + threadIdx.x;
+    int changed  = 0;       //  1 when this thread's object switched cluster
+
+    if (objectId < numObjs) {
+        int   index, i;
+        float dist, min_dist;
+
+        /* find the cluster id that has min distance to object */
+        index    = 0;
+        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
+                                 objects, clusters, objectId, 0);
+
+        for (i=1; i<numClusters; i++) {
+            dist = euclid_dist_2(numCoords, numObjs, numClusters,
+                                 objects, clusters, objectId, i);
+            /* no need square root */
+            if (dist < min_dist) { /* find the min and its array index */
+                min_dist = dist;
+                index    = i;
+            }
+        }
+
+        if (membership[objectId] != index) {
+            changed = 1;
+        }
+
+        /* assign the membership to object objectId */
+        membership[objectId] = index;
+    }
+
+    //  Barrier + population count in one step; executed by ALL threads.
+    int blockChanges = __syncthreads_count(changed);
+
+    if (threadIdx.x == 0) {
+        intermediates[blockIdx.x] = blockChanges;
+    }
+}
+
+//  Sums the first numIntermediates entries of deviceIntermediates and stores
+//  the total in deviceIntermediates[0].
+//
+//  Each thread stages one entry (or 0 when its index is at or beyond
+//  numIntermediates) in dynamic shared memory; the staged values are then
+//  folded pairwise, halving the stride each round.  The dynamic shared array
+//  needs one element per launched thread, and numIntermediates2 *must* be a
+//  power of two.
+__global__ static
+void compute_delta(int *deviceIntermediates,
+                   int numIntermediates,    //  The actual number of intermediates
+                   int numIntermediates2)   //  The next power of two
+{
+    extern __shared__ unsigned int intermediates[];
+
+    const unsigned int tid = threadIdx.x;
+
+    //  Stage this thread's value; padding threads contribute zero.
+    unsigned int staged = 0;
+    if (tid < numIntermediates)
+        staged = deviceIntermediates[tid];
+    intermediates[tid] = staged;
+
+    __syncthreads();
+
+    //  Pairwise fold: after each round the live range is half as long.
+    unsigned int stride = numIntermediates2 / 2;
+    while (stride > 0) {
+        if (tid < stride)
+            intermediates[tid] += intermediates[tid + stride];
+        __syncthreads();
+        stride >>= 1;
+    }
+
+    if (tid == 0)
+        deviceIntermediates[0] = intermediates[0];
+}
+
+/*----< cuda_kmeans() >-------------------------------------------------------*/
+//
+//  ----------------------------------------
+//  DATA LAYOUT
+//
+//  objects         [numObjs][numCoords]
+//  clusters        [numClusters][numCoords]
+//  dimObjects      [numCoords][numObjs]
+//  dimClusters     [numCoords][numClusters]
+//  newClusters     [numCoords][numClusters]
+//  deviceObjects   [numCoords][numObjs]
+//  deviceClusters  [numCoords][numClusters]
+//  ----------------------------------------
+//
+/* return an array of cluster centers of size [numClusters][numCoords]       */
+//  Runs k-means with the assignment step on the GPU.
+//
+//  The first numClusters objects seed the cluster centres.  Each iteration
+//  uploads the centres, launches find_nearest_cluster to assign objects,
+//  launches compute_delta to total the number of changed memberships, and
+//  recomputes the centres on the host.  The loop ends when the fraction of
+//  objects that changed cluster is no longer greater than `threshold`, or
+//  when the `loops` counter is exhausted.
+//
+//  membership[] (numObjs entries) receives the final cluster index of every
+//  object.  The returned array is laid out [numClusters][numCoords] and is
+//  owned by the caller: release it with free(clusters[0]); free(clusters);
+//
+//  Every CUDA call and both kernel launches are checked through
+//  handle_error(), which prints the error and terminates the program.
+float** cuda_kmeans(float **objects,      /* in: [numObjs][numCoords] */
+          unsigned int     numCoords,    /* no. features */
+          unsigned int     numObjs,      /* no. objects */
+          unsigned int     numClusters,  /* no. clusters */
+                   float   threshold,    /* % objects change membership */
+                   int    *membership,   /* out: [numObjs] */
+                   int	   loops)
+{
+    unsigned int i, j;       /* unsigned to match the extents they index */
+    int      index, loop=0;
+    int     *newClusterSize; /* [numClusters]: no. objects assigned in each
+                                new cluster */
+    float    delta;          /* % of objects change their clusters */
+    float  **dimObjects;
+    float  **clusters;       /* out: [numClusters][numCoords] */
+    float  **dimClusters;
+    float  **newClusters;    /* [numCoords][numClusters] */
+
+    float *deviceObjects;
+    float *deviceClusters;
+    int *deviceMembership;
+    int *deviceIntermediates;
+
+    //  Byte counts are computed in size_t so large data sets do not wrap in
+    //  32-bit arithmetic.
+    const size_t objectBytes     = (size_t)numObjs * numCoords * sizeof(float);
+    const size_t clusterBytes    = (size_t)numClusters * numCoords * sizeof(float);
+    const size_t membershipBytes = (size_t)numObjs * sizeof(int);
+
+    //  Copy objects given in [numObjs][numCoords] layout to new
+    //  [numCoords][numObjs] layout
+    malloc2D(dimObjects, numCoords, numObjs, float);
+    for (i = 0; i < numCoords; i++) {
+        for (j = 0; j < numObjs; j++) {
+            dimObjects[i][j] = objects[j][i];
+        }
+    }
+
+    /* pick first numClusters elements of objects[] as initial cluster centers*/
+    malloc2D(dimClusters, numCoords, numClusters, float);
+    for (i = 0; i < numCoords; i++) {
+        for (j = 0; j < numClusters; j++) {
+            dimClusters[i][j] = dimObjects[i][j];
+        }
+    }
+
+    /* initialize membership[] */
+    for (i=0; i<numObjs; i++) membership[i] = -1;
+
+    /* need to initialize newClusterSize and newClusters[0] to all 0 */
+    newClusterSize = (int*) calloc(numClusters, sizeof(int));
+    assert(newClusterSize != NULL);
+
+    malloc2D(newClusters, numCoords, numClusters, float);
+    memset(newClusters[0], 0, clusterBytes);
+
+    //  Query the device that is current for this thread rather than assuming
+    //  device 0.
+    int deviceNum;
+    handle_error(cudaGetDevice(&deviceNum));
+    cudaDeviceProp props;
+    handle_error(cudaGetDeviceProperties(&props, deviceNum));
+
+    //  NOTE(review): if find_nearest_cluster accumulates its per-block change
+    //  count in unsigned char storage, a block of maxThreadsPerBlock threads
+    //  can report more changes than that type can hold -- confirm the kernel's
+    //  counting scheme supports this block size.
+    const unsigned int numThreadsPerClusterBlock = props.maxThreadsPerBlock;
+    const unsigned int numClusterBlocks =
+        ceil(numObjs / (double)numThreadsPerClusterBlock);
+
+#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
+    const unsigned int clusterBlockSharedDataSize =
+        numThreadsPerClusterBlock * sizeof(unsigned char) +
+        numClusters * numCoords * sizeof(float);
+
+    if (clusterBlockSharedDataSize > props.sharedMemPerBlock) {
+		std::cout << "ERROR: insufficient shared memory. Please don't use the definition 'BLOCK_SHARED_MEM_OPTIMIZATION'" << endl;
+		exit(1);
+    }
+#else
+    const unsigned int clusterBlockSharedDataSize =
+        numThreadsPerClusterBlock * sizeof(unsigned char);
+#endif
+
+    //  compute_delta runs as a single block with one thread per intermediate,
+    //  padded up to a power of two for its pairwise reduction.
+    const unsigned int numReductionThreads =
+        nextPowerOfTwo(numClusterBlocks);
+    const unsigned int reductionBlockSharedDataSize =
+        numReductionThreads * sizeof(unsigned int);
+
+    handle_error(cudaMalloc((void**)&deviceObjects, objectBytes));
+    handle_error(cudaMalloc((void**)&deviceClusters, clusterBytes));
+    handle_error(cudaMalloc((void**)&deviceMembership, membershipBytes));
+    handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));
+
+    handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
+              objectBytes, cudaMemcpyHostToDevice));
+    handle_error(cudaMemcpy(deviceMembership, membership,
+              membershipBytes, cudaMemcpyHostToDevice));
+
+    do {
+        handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
+                  clusterBytes, cudaMemcpyHostToDevice));
+
+        find_nearest_cluster
+            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
+            (numCoords, numObjs, numClusters,
+             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);
+
+        //  A bad launch configuration is only visible through
+        //  cudaGetLastError(); execution faults surface at the synchronize.
+        handle_error(cudaGetLastError());
+        handle_error(cudaDeviceSynchronize());
+
+        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
+            (deviceIntermediates, numClusterBlocks, numReductionThreads);
+
+        handle_error(cudaGetLastError());
+        handle_error(cudaDeviceSynchronize());
+
+        int d;
+        handle_error(cudaMemcpy(&d, deviceIntermediates,
+                  sizeof(int), cudaMemcpyDeviceToHost));
+        delta = (float)d;
+
+        handle_error(cudaMemcpy(membership, deviceMembership,
+                  membershipBytes, cudaMemcpyDeviceToHost));
+
+        for (i=0; i<numObjs; i++) {
+            /* find the array index of nearest cluster center */
+            index = membership[i];
+
+            /* update new cluster centers : sum of objects located within */
+            newClusterSize[index]++;
+            for (j=0; j<numCoords; j++)
+                newClusters[j][index] += objects[i][j];
+        }
+
+        //  TODO: Flip the nesting order
+        //  TODO: Change layout of newClusters to [numClusters][numCoords]
+        /* average the sum and replace old cluster centers with newClusters */
+        for (i=0; i<numClusters; i++) {
+            for (j=0; j<numCoords; j++) {
+                /* an empty cluster keeps its previous centre */
+                if (newClusterSize[i] > 0)
+                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
+                newClusters[j][i] = 0.0;   /* set back to 0 */
+            }
+            newClusterSize[i] = 0;   /* set back to 0 */
+        }
+
+        delta /= numObjs;
+    } while (delta > threshold && loop++ < loops);
+
+    /* allocate a 2D space for returning variable clusters[] (coordinates
+       of cluster centers) */
+    malloc2D(clusters, numClusters, numCoords, float);
+    for (i = 0; i < numClusters; i++) {
+        for (j = 0; j < numCoords; j++) {
+            clusters[i][j] = dimClusters[j][i];
+        }
+    }
+
+    handle_error(cudaFree(deviceObjects));
+    handle_error(cudaFree(deviceClusters));
+    handle_error(cudaFree(deviceMembership));
+    handle_error(cudaFree(deviceIntermediates));
+
+    free(dimObjects[0]);
+    free(dimObjects);
+    free(dimClusters[0]);
+    free(dimClusters);
+    free(newClusters[0]);
+    free(newClusters);
+    free(newClusterSize);
+
+    return clusters;
+}
@@ -8,8 +8,6 @@
 #include <stim/cuda/sharedmem.cuh>
 #include <stim/cuda/cudatools/error.h>
-#define pi	3.14159
-
 namespace stim{
 	namespace cuda{
@@ -9,57 +9,50 @@ namespace stim{
 	namespace cuda{
 		template<typename T>
-		__global__ void gradient_2d(T* out, T* in, int x, int y){
+		__global__ void gradient_2d(T* out, T* in, size_t x, size_t y){
 			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
+			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
 			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
+			size_t i = yi * x + xi;
 			//return if the pixel is outside of the image
 			if(xi >= x || yi >= y) return;
 			//calculate indices for the forward difference
-			int i_xp = yi * x + (xi + 1);
-			int i_yp = (yi + 1) * x + xi;
+			size_t i_xp = yi * x + (xi + 1);
+			size_t i_yp = (yi + 1) * x + xi;
 			//calculate indices for the backward difference
-			int i_xn = yi * x + (xi - 1);
-			int i_yn = (yi - 1) * x + xi;
+			size_t i_xn = yi * x + (xi - 1);
+			size_t i_yn = (yi - 1) * x + xi;
 			//use forward differences if a coordinate is zero
 			if(xi == 0)
 				out[i * 2 + 0] = in[i_xp] - in[i];
-			if(yi == 0)
-				out[i * 2 + 1] = in[i_yp] - in[i];
-
-			//use backward differences if the coordinate is at the maximum edge
-			if(xi == x-1)
+			else if (xi == x - 1)
 				out[i * 2 + 0] = in[i] - in[i_xn];
-			if(yi == y-1)
-				out[i * 2 + 1] = in[i] - in[i_yn];
-
-			//otherwise use central differences
-			if(xi > 0 && xi < x-1)
+			else
 				out[i * 2 + 0] = (in[i_xp] - in[i_xn]) / 2;
-			if(yi > 0 && yi < y-1)
+			if(yi == 0)
+				out[i * 2 + 1] = in[i_yp] - in[i];
+			else if(yi == y-1)
+				out[i * 2 + 1] = in[i] - in[i_yn];
+			else
 				out[i * 2 + 1] = (in[i_yp] - in[i_yn]) / 2;
 		}
 		template<typename T>
-		void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){
-
-			//get the number of pixels in the image
-			unsigned int pixels = x * y;
+		void gpu_gradient_2d(T* gpuGrad, T* gpuI, size_t x, size_t y){
 			//get the maximum number of threads per block for the CUDA device
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(max_threads, 1);
-			dim3 blocks(x/threads.x + 1 , y);
+			dim3 blocks((unsigned int)(x/threads.x) + 1 , (unsigned int)y);
 			//call the GPU kernel to determine the gradient
@@ -53,11 +53,31 @@
 		float valIn             = tex2D<unsigned char>(texIn, x, y);
 		float templa		= templ(x, 32)*255.0;
-		print[idx]              = abs(valIn-templa);             ///temporary
+		//print[idx]              = abs(valIn-templa);             ///temporary
+		print[idx]		= abs(valIn);
 		//print[idx]              = abs(templa);             ///temporary
 	}
+	///Writes the absolute value of each sample read from a texture into an
+	///output buffer. One thread handles one (x, y) coordinate; no bounds check
+	///is performed in the kernel.
+	///@param cudaTextureObject_t texIn	--texture object that is sampled at (x, y).
+	///@param float* print			--output buffer, written at index y*dx+x.
+	///@param int dx				--row stride of the output buffer, in elements.
+	__global__
+	void get_diff2 (cudaTextureObject_t texIn, float *print, int dx)
+	{
+		const int col = threadIdx.x + blockIdx.x * blockDim.x;
+		const int row = threadIdx.y + blockIdx.y * blockDim.y;
+
+		const float sample = tex2D<unsigned char>(texIn, col, row);
+		print[row*dx+col] = abs(sample);             ///temporary
+	}
+
 	void test(cudaTextureObject_t tObj, int x, int y, std::string nam)
 	{
@@ -86,3 +106,31 @@
 		cleanUP();
 	}
+
+	///Debug helper: runs get_diff2 over the texture tObj, writing into the
+	///print buffer, and hands that buffer to stim::gpu2image under the name nam.
+	///@param tObj	--texture object passed to the kernel.
+	///@param x, y	--extent passed to initArray, the launch configuration and gpu2image.
+	///@param nam	--output name passed to gpu2image.
+	///@param iter	--not used by this overload.
+	void test(cudaTextureObject_t tObj, int x, int y, std::string nam, int iter)
+	{
+		//initialize the return arrays.
+		initArray(x,y);
+
+		//one block per row (y blocks), one thread per column (x threads)
+		dim3 grid(1, y);
+		dim3 block(x, 1);
+		int max_threads = stim::maxThreadsPerBlock();	//queried but not used below
+
+		get_diff2 <<< grid, block >>> (tObj, print, x);
+		cudaDeviceSynchronize();
+
+		stringstream name;      //for debugging
+		name << nam.c_str();
+		stim::gpu2image<float>(print, name.str(),x,y,0,255);
+
+		cleanUP();
+	}
@@ -4,13 +4,17 @@
 #include <string>
 #include <fstream>
+#include <complex>
+#include <cstring>
+#include <chrono>
 //CUDA
-#ifdef CUDA_FOUND
-	#include <cuda_runtime.h>
-	#include "cufft.h"
-	#include <stim/cuda/cudatools/error.h>
-#endif
+//#ifdef CUDA_FOUND
+#include <cuda_runtime.h>
+#include "cufft.h"
+#include <stim/cuda/cudatools/error.h>
+#include <stim/envi/envi_header.h>
+//#endif
 namespace stim{
@@ -33,15 +37,25 @@ public:
 		return size() * sizeof(T);
 	}
 	void alloc(){
+		if (ptr != NULL) free(ptr);
+		ptr = NULL;
 		ptr = (T*) malloc(bytes());
 	}
-	void alloc(size_t x, size_t y, size_t z){
+	void alloc(size_t x, size_t y, size_t z){		
 		R[0] = x;
 		R[1] = y;
 		R[2] = z;
 		alloc();
 	}
+	/// Returns the image buffer viewed as raw bytes.
+	char* data() {
+		return reinterpret_cast<char*>(ptr);
+	}
+
+	/// Returns the extent stored in R[i]; i is not range-checked.
+	size_t dim(size_t i){
+		const size_t extent = R[i];
+		return extent;
+	}
+
 	/// Create a deep copy of an agileng_binary object
 	void deep_copy(agilent_binary<T>* dst, const agilent_binary<T>* src){
 		dst->alloc(src->R[0], src->R[1], src->R[2]);			//allocate memory
@@ -51,23 +65,28 @@ public:
 	/// Default constructor, sets the resolution to zero and the data pointer to NULL
 	agilent_binary(){
-		memset(R, 0, sizeof(size_t) * 3);				//set the resolution to zero
 		ptr = NULL;
+		memset(R, 0, sizeof(size_t) * 3);				//set the resolution to zero
+		memset(Z, 0, sizeof(double) * 2);		
 	}
 	/// Constructor with resolution
 	agilent_binary(size_t x, size_t y, size_t z){
+		ptr = NULL;
 		alloc(x, y, z);
+		memset(Z, 0, sizeof(double) * 2);
 	}
 	/// Constructor with filename
 	agilent_binary(std::string filename){
 		ptr = NULL;
+		memset(Z, 0, sizeof(double) * 2);
 		load(filename);
 	}
 	/// Copy constructor
 	agilent_binary(const agilent_binary<T> &obj){
+		ptr = NULL;
 		deep_copy(this, &obj);
 	}
@@ -78,32 +97,42 @@ public:
 		return *this;									//return the result
 	}
+	/// True when all three extents R[0], R[1] and R[2] are non-zero.
+	operator bool() {
+		return R[0] != 0 && R[1] != 0 && R[2] != 0;
+	}
+
 	~agilent_binary(){
-		free(ptr);
+		if(ptr != NULL)
+			free(ptr);
 	}
 	void load(std::string filename){
-		if(ptr != NULL) free(ptr);						//if memory has been allocated, free it
+		if(ptr != NULL) free(ptr);									//if memory has been allocated, free it
+		ptr = NULL;
-		fname = filename;						//save the filename
+		fname = filename;											//save the filename
 		short x, y, z;
 		std::ifstream infile(fname, std::ios::binary);				//open the input file
-		infile.seekg(9, std::ios::beg);				//seek past 9 bytes from the beginning of the file
+		if (infile) {
+			infile.seekg(9, std::ios::beg);							//seek past 9 bytes from the beginning of the file
-		infile.read((char*)(&z), 2);							//read two bytes of data (the number of samples is stored as a 16-bit integer)
+			infile.read((char*)(&z), 2);							//read two bytes of data (the number of samples is stored as a 16-bit integer)
-		infile.seekg(13, std::ios::cur);				//skip another 13 bytes
-		infile.read((char*)(&x), 2);				//read the X and Y dimensions
-		infile.read((char*)(&y), 2);
+			infile.seekg(13, std::ios::cur);						//skip another 13 bytes
+			infile.read((char*)(&x), 2);							//read the X and Y dimensions
+			infile.read((char*)(&y), 2);
-		infile.seekg(header, std::ios::beg);			//seek to the start of the data
+			infile.seekg(header, std::ios::beg);					//seek to the start of the data
-		alloc(x, y, z);
-		ptr = (T*) malloc(bytes());							//allocate space for the data
-		infile.read((char*)ptr, bytes());				//read the data		
-		infile.close();
+			alloc(x, y, z);											//allocate the data
+			infile.read((char*)ptr, bytes());						//read the data		
+			infile.close();											//close the file
+			Z[0] = 1;
+			Z[1] = (double)R[2];
+		}
 	}
 	void save(std::string filename){
@@ -167,9 +196,8 @@ public:
 	//pads to the nearest power-of-two
 	void zeropad(){
-		size_t newZ = pow(2, ceil(log(R[2])/log(2)));			//find the nearest power-of-two
+		size_t newZ = (size_t)pow(2, ceil(log(R[2])/log(2)));			//find the nearest power-of-two
 		size_t n = newZ - R[2];									//calculate the number of bands to add
-		std::cout<<"band padding: "<<n<<std::endl;
 		zeropad(n);												//add the padding
 	}
@@ -184,9 +212,20 @@ public:
 			ptr[i] = -log10(ptr[i] / background->ptr[i]);
 	}
-#ifdef CUDA_FOUND
+	//crops the image down to the first n samples along the third dimension (R[2])
+	//	does nothing when n is not smaller than R[2] or when no data is allocated
+	//	the new buffer is allocated here directly, because alloc() releases the current
+	//	buffer before allocating and the old data must stay alive until it has been copied
+	void crop(size_t n) {
+		if (n >= R[2] || ptr == NULL) return;					//nothing to crop
+
+		size_t old_bands = R[2];								//remember the size in case allocation fails
+		R[2] = n;												//update the number of bands (bytes() now gives the cropped size)
+		T* cropped = (T*) malloc(bytes());						//allocate space for the new image
+		if (cropped == NULL) {									//allocation failed: leave the image untouched
+			R[2] = old_bands;
+			return;
+		}
+		memcpy(cropped, ptr, bytes());							//copy the leading part of the old data
+		free(ptr);												//free the old data exactly once
+		ptr = cropped;											//adopt the cropped buffer
+	}
+
+//#ifdef CUDA_FOUND
 	/// Perform an FFT and return a binary file with bands in the specified range
-	agilent_binary<T> fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2){
+	agilent_binary<T> fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2, int device = 0){
 		auto total_start = std::chrono::high_resolution_clock::now();
 		auto start = std::chrono::high_resolution_clock::now();
@@ -201,6 +240,29 @@ public:
        // std::cout << "Transpose data: " << diff.count() << " s\n";
 		start = std::chrono::high_resolution_clock::now();
+		if (device >= 0) {														//if a CUDA device is specified
+			int dev_count;
+			HANDLE_ERROR(cudaGetDeviceCount(&dev_count));						//get the number of CUDA devices
+			//std::cout << "Number of CUDA devices: " << dev_count << std::endl;		//output the number of CUDA devices
+			cudaDeviceProp prop;
+			//std::cout << "CUDA devices----" << std::endl;
+			for (int d = 0; d < dev_count; d++) {									//for each CUDA device
+				cudaGetDeviceProperties(&prop, d);									//get the property of the first device
+																					//float cc = prop.major + prop.minor / 10.0f;						//calculate the compute capability
+				//std::cout << d << ":  [" << prop.major << "." << prop.minor << "]      " << prop.name << std::endl;	//display the device information
+																													//if(cc > best_device_cc){
+																													//	best_device_cc = cc;										//if this is better than the previous device, use it
+																													//	best_device_id = d;
+																													//}
+			}
+			if (dev_count > 0 && dev_count > device) {							//if the first device is not an emulator
+				cudaGetDeviceProperties(&prop, device);						//get the property of the requested CUDA device
+				if (prop.major != 9999) {
+					//std::cout << "Using device " << device << std::endl;
+					HANDLE_ERROR(cudaSetDevice(device));
+				}
+			}
+		}
 		cufftHandle plan;															//allocate space for a cufft plan
 		cufftReal* gpu_data;														//create a pointer to the data
 		size_t batch = R[0] * R[1];													//calculate the batch size (X * Y)
@@ -237,18 +299,19 @@ public:
 		HANDLE_ERROR(cudaMemcpy(cpu_fft, gpu_fft, R[0] * R[1] * (R[2]/2+1) * sizeof(cufftComplex), cudaMemcpyDeviceToHost));	//copy data from the host to the device
 		//double int_delta = 0.00012656;									//interferogram sample spacing in centimeters
-		double int_delta = (1.0 / ELWN) * ((double)UDR / 2.0);			//calculate the interferogram spacing
-		double int_length = int_delta * R[2];							//interferogram length in centimeters
-		double fft_delta = 1/int_length;								//spectrum spacing (in inverse centimeters, wavenumber)
-		double fft_max = fft_delta * R[2]/2;							//get the maximum wavenumber value supported by the specified number of interferogram samples
+		double int_delta = (1.0 / ELWN) * ((double)UDR / 2.0);				//calculate the interferogram spacing
+		double int_length = int_delta * R[2];								//interferogram length in centimeters
+		double fft_delta = 1/int_length;									//spectrum spacing (in inverse centimeters, wavenumber)
+		double fft_max = fft_delta * R[2]/2;								//get the maximum wavenumber value supported by the specified number of interferogram samples
-		if(band_max > fft_max) band_max = fft_max;						//the user gave a band outside of the FFT range, reset the band to the maximum available
+		if(band_max > fft_max) band_max = fft_max;							//the user gave a band outside of the FFT range, reset the band to the maximum available
+		if (band_min < 0) band_min = 0;
 		size_t start_i = (size_t)std::ceil(band_min / fft_delta);				//calculate the first band to store
 		size_t size_i = (size_t)std::floor(band_max / fft_delta) - start_i;		//calculate the number of bands to store
-		size_t end_i = start_i + size_i;								//last band number
+		size_t end_i = start_i + size_i;										//last band number
 		agilent_binary<T> result(R[0], R[1], size_i);
-		result.Z[0] = start_i * fft_delta;								//set the range for the FFT result
+		result.Z[0] = start_i * fft_delta;										//set the range for the FFT result
 		result.Z[1] = end_i * fft_delta;
 		for(size_t b = start_i; b < end_i; b++){
@@ -271,7 +334,22 @@ public:
 		return result;
 	}
-#endif
+
+	//copies the image into bip_ptr in band-interleaved-by-pixel (BIP) order:
+	//	bip_ptr[xy * R[2] + b] = ptr[b * (R[0] * R[1]) + xy]
+	//nothing is written to disk; always returns 0
+	int bip(T* bip_ptr){
+		const size_t pixels = R[0] * R[1];
+		const size_t bands = R[2];
+
+		for(size_t px = 0; px < pixels; px++){
+			T* dst = bip_ptr + px * bands;					//start of this pixel's spectrum in the output
+			for(size_t band = 0; band < bands; band++)
+				dst[band] = ptr[band * pixels + px];
+		}
+		return 0;
+	}
+//#endif
 };
@@ -4,6 +4,7 @@
 #include "../envi/envi_header.h"
 #include "../envi/hsi.h"
 #include "../math/fd_coefficients.h"
+#include <stim/cuda/cudatools/error.h>
 #include <cstring>
 #include <utility>
 #include <deque>
@@ -54,11 +55,12 @@ public:
 			  unsigned long long Y,
 			  unsigned long long B,
 			  unsigned long long header_offset,
-			  std::vector<double> wavelengths){
+			  std::vector<double> wavelengths,
+			  stim::iotype io = stim::io_in){
 		w = wavelengths;
-		return open(filename, vec<unsigned long long>(X, B, Y), header_offset);
+		return open(filename, vec<unsigned long long>(X, B, Y), header_offset, io);
 	}
@@ -118,7 +120,7 @@ public:
 			page++;
 			//if wavelength is larger than the last wavelength in header file
 			if (page == Z()) {
-				band_index(p, Z()-1);
+				band_index(p, Z()-1, PROGRESS);
 				return true;
 			}
 		}
@@ -224,10 +226,44 @@ public:
 	}
 	//given a Y ,return a XZ slice
-	bool read_plane_y(T * p, unsigned long long y){
+	bool read_plane_xz(T * p, size_t y){
 		return binary<T>::read_plane_2(p, y);
 	}
+	//given a Y, return ZX slice (transposed such that the spectrum is the leading dimension)
+	//	p must hold X() * Z() elements
+	//	returns 0 on success, 1 if the temporary buffer cannot be allocated or the plane cannot be read
+	int read_plane_zx(T* p, size_t y){
+		T* temp = (T*) malloc(X() * Z() * sizeof(T));	//allocate space to store the temporary xz plane
+		if(temp == NULL) return 1;
+		if(!binary<T>::read_plane_2(temp, y)){			//load the plane from disk
+			free(temp);
+			return 1;
+		}
+		for(size_t z = 0; z < Z(); z++){
+			for(size_t x = 0; x < X(); x++){			//visit every sample of the row, not only x <= z
+				p[x * Z() + z] = temp[z * X() + x];		//copy to the destination frame
+			}
+		}
+		free(temp);										//release the temporary plane
+		return 0;
+	}
+
+	//load a frame y into a pre-allocated double-precision array
+	//	f must hold X() * Z() elements
+	//	returns 0 on success, 1 if the temporary buffer cannot be allocated or the plane cannot be read
+	int read_plane_xzd(double* f, size_t y){
+		size_t XB = X() * Z();
+		T* temp = (T*) malloc(XB * sizeof(T));			//create a temporary location to store the plane at current precision
+		if(temp == NULL) return 1;
+		if(!read_plane_xz(temp, y)){					//read the plane in its native format, if it fails return a 1
+			free(temp);
+			return 1;
+		}
+		for(size_t i = 0; i < XB; i++) f[i] = temp[i];	//convert the plane to a double
+		free(temp);										//release the temporary plane
+		return 0;
+	}
+
+	//given a Y, return ZX slice (transposed such that the spectrum is the leading dimension)
+	//	the samples are converted to double precision; p must hold X() * Z() elements
+	//	returns 0 on success, 1 if the temporary buffer cannot be allocated or the plane cannot be read
+	int read_plane_zxd(double* p, size_t y){
+		T* temp = (T*) malloc(X() * Z() * sizeof(T));		//allocate space to store the temporary xz plane
+		if(temp == NULL) return 1;
+		if(!binary<T>::read_plane_2(temp, y)){				//load the plane from disk
+			free(temp);
+			return 1;
+		}
+		for(size_t z = 0; z < Z(); z++){
+			for(size_t x = 0; x < X(); x++){
+				p[x * Z() + z] = (double)temp[z * X() + x];	//copy to the destination frame
+			}
+		}
+		free(temp);											//release the temporary plane
+		return 0;
+	}
+
 	/// Perform baseline correction given a list of baseline points and stores the result in a new BSQ file.
@@ -268,7 +304,7 @@ public:
 		for (unsigned long long k =0; k < Y(); k++)
 		{
 			//get the current y slice
-			read_plane_y(c, k);
+			read_plane_xz(c, k);
 			//initialize lownum, highnum, low, high
 			ai = w[0];
@@ -369,7 +405,7 @@ public:
 		for(unsigned long long j = 0; j < Y(); j++)
 		{
-			read_plane_y(c, j);
+			read_plane_xz(c, j);
 			for(unsigned long long i = 0; i < B; i++)
 			{
 				for(unsigned long long m = 0; m < X(); m++)
@@ -426,6 +462,11 @@ public:
 		}*/
 	}
+	bool select(std::string outfile, std::vector<double> bandlist, unsigned char* mask = NULL, bool PROGRESS = NULL) {
+		std::cout << "ERROR: select() not implemented for BIL" << std::endl;
+		exit(1);
+	}
+
 	/// Convert the current BIL file to a BSQ file with the specified file name.
 	/// @param outname is the name of the output BSQ file to be saved to disk.
@@ -469,7 +510,7 @@ public:
 		for ( unsigned long long i = 0; i < Y(); i++)
 		{
-			read_plane_y(p, i);
+			read_plane_xz(p, i);
 			for ( unsigned long long k = 0; k < Z(); k++)
 			{
 				unsigned long long ks = k * X();
@@ -863,7 +904,7 @@ public:
 		for (unsigned long long i = 0; i < Y(); i++)			//for each value in Y() (BIP should be X)
 		{
-			read_plane_y(temp, i);							//retrieve an ZX slice, stored in temp
+			read_plane_xz(temp, i);							//retrieve an ZX slice, stored in temp
 			for ( unsigned long long j = 0; j < Z(); j++)	//for each Z() (Y)
 			{
 				for (unsigned long long k = 0; k < X(); k++) //for each band
@@ -933,7 +974,7 @@ public:
 		//for each slice along the y axis
 		for (unsigned long long y = 0; y < Y(); y++)			//Select a page by choosing Y coordinate, Y()
 		{
-			read_plane_y(slice, y);							//retrieve an ZX page, store in "slice"
+			read_plane_xz(slice, y);							//retrieve an ZX page, store in "slice"
 			//for each sample along X
 			for (unsigned long long x = 0; x < X(); x++)		//Select a pixel by choosing X coordinate in the page, X()
@@ -1004,7 +1045,7 @@ public:
 		double x;											//create a register to store the pixel value
 		for (unsigned long long k = 0; k < Y(); k++){
-			read_plane_y(temp, k);
+			read_plane_xz(temp, k);
 			unsigned long long kx = k * X();
 			for (unsigned long long i = 0; i < X(); i++){
 				if (mask == NULL || mask[kx + i] != 0){
@@ -1025,13 +1066,103 @@ public:
 		return true;
 	}
+	/// Calculate the covariance matrix using cuBLAS, streaming the image one line (XZ plane) at a time.
+	/// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
+	/// @param avg is a pointer to memory of size B that stores the average spectrum
+	/// @param mask is accepted for interface compatibility but is not applied here: every pixel contributes
+	/// @param PROGRESS enables updates of the progress member (0 to 100) after each line
+	/// @return 0 on success, 1 if cuBLAS could not be initialized, a frame could not be read, or the result could not be retrieved
+	int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
+		cublasStatus_t stat;
+		cublasHandle_t handle;
+
+		progress = 0;														//initialize the progress to zero (0)
+		size_t XY = X() * Y();												//calculate the number of elements in a band image
+		size_t XB = X() * Z();												//calculate the number of elements in a frame
+		size_t B = Z();														//calculate the number of spectral elements
+
+		CUBLAS_HANDLE_ERROR(stat = cublasCreate(&handle));					//create the cuBLAS instance first, so an early exit has nothing else to release
+		if (stat != CUBLAS_STATUS_SUCCESS) return 1;						//test the cuBLAS instance to make sure it is valid
+		std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
+
+		double* F = (double*)malloc(sizeof(double) * XB);					//allocate space for the frame that will be pulled from the file
+		if (F == NULL) {
+			cublasDestroy(handle);
+			return 1;
+		}
+		double* F_dev;														//declare a device pointer that will store the frame on the GPU
+		double* A_dev;														//declare a device pointer that will store the covariance matrix on the GPU
+		double* avg_dev;													//declare a device pointer that will store the average spectrum
+		HANDLE_ERROR(cudaMalloc(&F_dev, XB * sizeof(double)));				//allocate space for the frame on the GPU
+		HANDLE_ERROR(cudaMalloc(&A_dev, B * B * sizeof(double)));			//allocate space on the CUDA device for the covariance matrix
+		HANDLE_ERROR(cudaMemset(A_dev, 0, B * B * sizeof(double)));			//initialize the covariance matrix to zero (0)
+		HANDLE_ERROR(cudaMalloc(&avg_dev, XB * sizeof(double)));			//allocate space on the CUDA device for the average spectrum
+		for(size_t x = 0; x < X(); x++)										//make multiple copies of the average spectrum in order to build a matrix
+			HANDLE_ERROR(cudaMemcpy(&avg_dev[x * B], avg, B * sizeof(double), cudaMemcpyHostToDevice));
+
+		double ger_alpha = 1.0/(double)XY;									//scale the outer product by the inverse of the number of samples (mean outer product)
+		double axpy_alpha = -1;												//multiplication factor for the average spectrum (in order to perform a subtraction)
+		double beta = 1.0;
+
+		int result = 0;														//value returned to the caller (0 = success)
+		for(size_t y = 0; y < Y(); y++){									//for each line
+			if(read_plane_zxd(F, y) != 0){									//read a frame from the file
+				result = 1;													//stop on a failed read rather than accumulating undefined data
+				break;
+			}
+			HANDLE_ERROR(cudaMemcpy(F_dev, F, XB * sizeof(double), cudaMemcpyHostToDevice));	//copy the frame to the GPU
+			CUBLAS_HANDLE_ERROR(cublasDgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, (int)B, (int)X(), &axpy_alpha, avg_dev, (int)B, &beta, F_dev, (int)B, F_dev, (int)B));//subtract the mean spectrum
+
+			for(size_t x = 0; x < X(); x++)
+				CUBLAS_HANDLE_ERROR(cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, &F_dev[x*B], 1, A_dev, (int)B));			//perform an outer product
+			if(PROGRESS) progress = (double)(y + 1) / Y() * 100;
+		}
+
+		if(result == 0){
+			stat = cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, co, (int)B);	//copy the result from the GPU to the CPU
+			if(stat != CUBLAS_STATUS_SUCCESS) result = 1;
+		}
+
+		cudaFree(F_dev);													//clean up allocated device memory
+		cudaFree(A_dev);
+		cudaFree(avg_dev);
+		free(F);															//clean up the host frame
+		cublasDestroy(handle);												//release the cuBLAS instance
+
+		if(result != 0) return result;
+
+		for(unsigned long long i = 0; i < B; i++){							//copy the upper triangular portion to the lower triangular portion
+			for(unsigned long long j = i+1; j < B; j++){
+				co[B * i + j] = co[B * j + i];
+			}
+		}
+
+		return 0;
+	}
+
+
 	/// Calculate the covariance matrix for all masked pixels in the image.
 	/// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
 	/// @param avg is a pointer to memory of size B that stores the average spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
-	bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
+	bool co_matrix(double* co, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false){
 		progress = 0;
+
+		if (cuda_device >= 0) {													//if a CUDA device is specified
+			int dev_count;
+			HANDLE_ERROR(cudaGetDeviceCount(&dev_count));						//get the number of CUDA devices
+			std::cout << "Number of CUDA devices: " << dev_count << std::endl;		//output the number of CUDA devices
+			cudaDeviceProp prop;
+			//int best_device_id = 0;													//stores the best CUDA device
+			//float best_device_cc = 0.0f;												//stores the compute capability of the best device
+			std::cout << "CUDA devices----" << std::endl;
+			for (int d = 0; d < dev_count; d++) {									//for each CUDA device
+				cudaGetDeviceProperties(&prop, d);								//get the property of the first device
+				//float cc = prop.major + prop.minor / 10.0f;						//calculate the compute capability
+				std::cout << d << ":  [" << prop.major << "." << prop.minor << "]      " << prop.name << std::endl;	//display the device information
+				//if(cc > best_device_cc){
+				//	best_device_cc = cc;										//if this is better than the previous device, use it
+				//	best_device_id = d;
+				//}
+			}
+			if (dev_count > 0 && dev_count > cuda_device) {							//if the first device is not an emulator
+				cudaGetDeviceProperties(&prop, cuda_device);									//get the property of the requested CUDA device
+				if (prop.major != 9999) {
+					std::cout << "Using device " << cuda_device << std::endl;
+					HANDLE_ERROR(cudaSetDevice(cuda_device));
+					int status = co_matrix_cublas(co, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
+					if (status == 0) return true;									//if the cuBLAS function returned correctly, we're done
+				}
+			}
+		}
+
 		//memory allocation
 		unsigned long long xy = X() * Y();
 		unsigned long long B = Z();
@@ -1089,9 +1220,9 @@ public:
 								   bool PROGRESS = false){
 		//calculate the new image parameters
-		unsigned long long samples = x1 - x0;
-		unsigned long long lines = y1 - y0;
-		unsigned long long bands = b1 - b0;
+		unsigned long long samples = x1 - x0 + 1;
+		unsigned long long lines = y1 - y0 + 1;
+		unsigned long long bands = b1 - b0 + 1;
 		//calculate the size of a line
 		unsigned long long L = samples * sizeof(T);
@@ -1107,19 +1238,19 @@ public:
 		unsigned long long jumpb = (X() - samples) * sizeof(T);
 		//distance needed to jump from the previous line of the last band to the next line of the first band
-		unsigned long long longjump = ((Z() - b1) * X() + b0 * X()) * sizeof(T);
+		unsigned long long longjump = ((Z() - bands) * X()) * sizeof(T);
 		//set the start position for the cropped region
 		file.seekg((y0 * X() * Z() + b0 * X() + x0) * sizeof(T), std::ios::beg);
 		for (unsigned long long x = 0; x < lines; x++)
 		{
-			for (unsigned long long z = b0; z < b1; z++)
+			for (unsigned long long z = b0; z <= b1; z++)
 			{
 				file.read((char *)(temp + z * samples), sizeof(T) * samples);
 				file.seekg(jumpb, std::ios::cur);    //go to the next band
-				if(PROGRESS) progress = (double)(x * Z() + z+1) / (lines * Z()) * 100;
+				if(PROGRESS) progress = (double)(x * (b1 - b0 + 1) + z + 1) / (lines * (b1 - b0 + 1)) * 100;
 			}
 			//write slice data into target file
@@ -1211,6 +1342,27 @@ public:
 		}
 	}
+	///Append two files together along the band dimension
+	/// For each line y, the XZ plane of this file is written to the output, followed by the XZ plane of C.
+	/// @param outfile is the name of the output file to be written
+	/// @param C is the file whose bands are appended to those of this file
+	///        NOTE(review): presumably C must have the same X and Y size as this file - this is not validated here
+	/// @param PROGRESS enables updates of the progress member (0 to 100) after each line
+	void append(std::string outfile, bil<T>* C, bool PROGRESS = false) {
+		std::ofstream out(outfile.c_str(), std::ios::binary);	//open the output file for writing
+		if (!out) {												//nothing can be written if the output file failed to open
+			std::cout << "ERROR opening output file: " << outfile << std::endl;
+			return;
+		}
+		file.seekg(0, std::ios::beg);							//move to the beginning of both files
+		C->file.seekg(0, std::ios::beg);
+		size_t a_bytes = X() * Z() * sizeof(T);					//calculate the number of bytes in a single plane of this file
+		size_t b_bytes = C->X() * C->Z() * sizeof(T);			//calculate the number of bytes in a single plane of the appending file
+		T* a = (T*)malloc(a_bytes);								//allocate space for a plane of the current file
+		T* b = (T*)malloc(b_bytes);								//allocate space for a plane of the appended file
+		if (PROGRESS) progress = 0;
+		for (size_t y = 0; y < Y(); y++) {
+			read_plane_xz(a, y);								//read a plane from the current file
+			out.write((char*)a, a_bytes);						//write the plane to disk
+			C->read_plane_xz(b, y);								//read a plane from the appending file
+			out.write((char*)b, b_bytes);
+			if (PROGRESS) progress = (double)(y + 1) / (double)(Y()) * 100;
+		}
+
+		free(a);												//release the temporary planes
+		free(b);
+		out.close();
+	}
+
 	/// Convolve the given band range with a kernel specified by a vector of coefficients.
 	/// @param outfile is an already open stream to the output file
@@ -1328,7 +1480,7 @@ public:
 		c = (T*)malloc( L );										//allocate space for the slice
 		for(unsigned long long j = 0; j < Y(); j++){				//for each line
-			read_plane_y(c, j);										//load the line into memory
+			read_plane_xz(c, j);										//load the line into memory
 			for(unsigned long long i = 0; i < B; i++){				//for each band
 				for(unsigned long long m = 0; m < X(); m++){		//for each sample
 					if( mask == NULL && mask[m + j * X()] )			//if the pixel is masked
@@ -1358,7 +1510,7 @@ public:
 		c = (T*)malloc( L );										//allocate space for the slice
 		for(unsigned long long j = 0; j < Y(); j++){				//for each line
-			read_plane_y(c, j);										//load the line into memory
+			read_plane_xz(c, j);										//load the line into memory
 			for(unsigned long long i = 0; i < B; i++){				//for each band
 				for(unsigned long long m = 0; m < X(); m++){		//for each sample
 					if( mask == NULL && mask[m + j * X()] )			//if the pixel is masked
@@ -3,8 +3,8 @@
 #ifndef RTS_BINARY_H
 #define RTS_BINARY_H
-#include "../envi/envi_header.h"
-#include "../math/vector.h"
+#include <stim/envi/envi_header.h>
+#include <stim/math/vector.h>
 #include <fstream>
 #include <sys/stat.h>
 #include <cstring>
@@ -134,73 +134,6 @@ public:
 			}
 		}
 	}
-
-	/*// this function updates the optimizer, given the number of bytes processed in an interval and time spent processing
-	size_t update(size_t bytes_processed, size_t ms_spent){
-		interval_B += bytes_processed;		//increment the number of bytes processed
-		interval_ms += ms_spent;			//increment the number of milliseconds spent processing
-
-		//if we have sufficient information to evaluate the optimization function at this point
-		if(interval_ms >= window_ms){					//if sufficient time has passed to get a reliable Bps measurement
-			size_t new_Bps = interval_B / interval_ms;	//calculate the current Bps
-
-			if(sample_step){							//if this is a sample step, collect the information for Bps = f(n0)
-				Bps = new_Bps;							//set the Bps to the evaluated value
-				n[1] = n[0] - dn;								//reduce the batch size by one delta to take a second sample
-				if(n[1] == 0){							//if the resulting batch size is zero
-					n[1] = 2*dn;						//we're at the left edge: set the new sample point to 2*dn
-				}
-
-				interval_B = interval_ms = 0;			//start a new interval at the new sample point
-				sample_step = false;					//next step will calculate the new batch size via optimization
-				return n[1];								//return the new batch size
-			}
-			else{								//if we have sufficient information to evaluate the derivative and optimize
-				double f = (double)new_Bps;				//we have evaluated the function at this location
-				double fprime;
-				if(n[1] < n[0] ){									//if the new point is less than the previous point (usually the case)
-					fprime = (double)(Bps - new_Bps) / (double)dn;	//calculate the forward difference
-				}
-				else{												//if the new point is larger (only happens at the minimum limit)
-					fprime = (double)(new_Bps - Bps) / (double)dn;	//calculate the backward difference
-				}
-				size_t bestn = n[1] - (size_t)(f / fprime);			//calculate the best value for B using Newton's method
-				n[0] = round_limit( (size_t)bestn );						//set the new dependent point
-				sample_step = true;									//the next step will be a sample step
-			}
-
-		}
-		if(sample_step) return n[0];
-		return n[1];										//insufficient information, keep the same batch size
-	}*/
-
-	/*size_t update(size_t bytes_processed, size_t ms_spent){
-		interval_B += bytes_processed;		//increment the number of bytes processed
-		interval_ms += ms_spent;			//increment the number of milliseconds spent processing
-
-		//if( Bps[0] == 0 ){				//if the left boundary hasn't been processed
-
-
-		//if we have sufficient information to evaluate the optimization function at this point
-		if(interval_ms >= window_ms){
-			size_t new_Bps = interval_B / interval_ms;	//calculate the current Bps
-
-			if(Bps[0] == 0)							//if the left interval Bps hasn't been calculated
-				Bps[0] = interval_B / interval_ms;	//that is the interval being processed
-			else
-				Bps[1] = interval_B / interval_ms;	//otherwise the right interval is being processed
-
-			if(Bps[0] != 0 && Bps[1] != 0){			//if both intervals have been processed
-
-
-		}
-	}*/
-
-	/*size_t update(size_t bytes_processed, size_t ms_spent, size_t& data_rate, bool VERBOSE){
-		size_t time = update(bytes_processed, ms_spent, VERBOSE);
-		data_rate = Bps[0];
-		return time;
-	}*/
 };
 /** This class manages the streaming of large multidimensional binary files.
@@ -210,6 +143,8 @@ public:
  *  @param T is the data type used to store data to disk (generally float or double)
  *  @param D is the dimension of the data (default 3)
  */
+
+enum iotype {io_in, io_out};		//I/O mode used when opening a binary file: io_in opens for reading, io_out opens for writing
 template< typename T, unsigned int D = 3 >
 class binary{
@@ -272,19 +207,21 @@ protected:
 	/// Private helper file that opens a specified binary file.
 	/// @param filename is the name of the binary file to stream
-	bool open_file(std::string filename){
+	bool open_file(std::string filename, stim::iotype io = io_in){
 		//open the file as binary for reading and writing
-		file.open(filename.c_str(), std::ios::in | std::ios::out | std::ios::binary);
-
-		//if the file isn't open, the user may only have read access
-		if(!file.is_open()){
-			std::cout<<"class STIM::BINARY - failed to open file, trying for read only"<<std::endl;
+		if(io == io_in)
 			file.open(filename.c_str(), std::ios::in | std::ios::binary);
-			if(!file.is_open()){
-				std::cout<<"               still unable to load the file"<<std::endl;
+		else if (io == io_out) {
+			file.open(filename.c_str(), std::ios::out | std::ios::binary);
+			if (!file.is_open()) {
+				std::cout << "stim::binary ERROR - unable to open file for writing: " << filename << std::endl;
 				return false;
 			}
 		}
+		else {
+			std::cout << "stim::binary ERROR - unrecognized IO format" << std::endl;
+			return false;
+		}
 		//if the file is successful
 		if(file){
@@ -342,20 +279,24 @@ public:
 	/// @param filename is the name of the binary file
 	/// @param r is a STIM vector specifying the size of the binary file along each dimension
 	/// @param h is the length (in bytes) of any header file (default zero)
-	bool open(std::string filename, vec<unsigned long long> r, unsigned long long h = 0){
+	bool open(std::string filename, vec<unsigned long long> r, unsigned long long h = 0, stim::iotype io = stim::io_in){
 		for(unsigned long long i = 0; i < D; i++)		//set the dimensions of the binary file object
 			R[i] = r[i];
 		header = h;				//save the header size
-		if(!open_file(filename)) return false;	//open the binary file
+		if(!open_file(filename), io) return false;	//open the binary file
 		//reset();
 		return test_file_size();
 	}
+	bool is_open() {					//report whether the underlying file stream is currently open
+		return file.is_open();
+	}
+
 	/// Creates a new binary file for streaming
 	/// @param filename is the name of the binary file to be created
@@ -605,7 +546,7 @@ public:
 		size_t size_bytes = sx * sy * sz * sizeof(T);					//size of the block to read in bytes
 		size_t start = z * R[0] * R[1] + y * R[0] + x;						//calculate the start postion
-		size_t start_bytes = start * sizeof(T);							//start position in bytes
+		//size_t start_bytes = start * sizeof(T);							//start position in bytes
 		file.seekg(start * sizeof(T), std::ios::beg);					//seek to the start position
@@ -5,13 +5,17 @@
 #include "../envi/bil.h"
 #include "../envi/hsi.h"
 #include <cstring>
+#include <complex>
 #include <utility>
+#include <algorithm>
 //CUDA
-#ifdef CUDA_FOUND
-	#include <cuda_runtime.h>
-	#include "cublas_v2.h"
-#endif
+//#ifdef CUDA_FOUND
+#include <stim/cuda/cudatools/error.h>
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+#include "cufft.h"
+//#endif
 namespace stim{
@@ -62,14 +66,15 @@ public:
 			  unsigned long long Y,
 			  unsigned long long B,
 			  unsigned long long header_offset,
-			  std::vector<double> wavelengths){
+			  std::vector<double> wavelengths,
+			  stim::iotype io = stim::io_in){
 		//copy the wavelengths to the BSQ file structure
 		w = wavelengths;
 		//copy the offset to the structure
 		offset = header_offset;
-		return open(filename, vec<unsigned long long>(B, X, Y), header_offset);
+		return open(filename, vec<unsigned long long>(B, X, Y), header_offset, io);
 	}
@@ -257,7 +262,7 @@ public:
 	}
 	//given a line index y, read the corresponding ZX slice into the caller-supplied buffer p
-	bool read_plane_y(T * p, unsigned long long y){
+	bool read_plane_y(T * p, size_t y){
 		return binary<T>::read_plane_2(p, y);			//forward the request (and its boolean result) to binary<T>::read_plane_2
 	}
@@ -388,6 +393,50 @@ public:
 		}
 	}
+	/// This function loads a specified set of bands and saves them into a new output file.
+	/// Each requested wavelength is interpolated from the two bands that surround it.
+	/// @param outfile is the name of the output file to be written
+	/// @param bandlist is the list of wavelengths to extract, in the order they are written to each output spectrum
+	/// @param mask is an optional pointer to memory of size [X * Y]; pixels with a zero mask value are written as zeros
+	/// @param PROGRESS enables updates of the progress member (0 to 100) after each pixel
+	/// @return true on success, false if the output file cannot be opened or the spectrum buffers cannot be allocated
+	bool select(std::string outfile, std::vector<double> bandlist, unsigned char* mask = NULL, bool PROGRESS = false) {
+		std::ofstream target(outfile.c_str(), std::ios::binary);	//open the target binary file
+		if (!target) {
+			std::cout << "ERROR opening output file: " << outfile << std::endl;
+			return false;
+		}
+		file.seekg(0, std::ios::beg);								//move the pointer to the current file to the beginning
+
+		size_t B = Z();												//number of spectral components
+		size_t XY = X() * Y();										//calculate the number of pixels
+		size_t Bout = bandlist.size();
+		size_t in_bytes = B * sizeof(T);							//number of bytes in a spectrum
+		size_t out_bytes = Bout * sizeof(T);						//number of bytes in an output spectrum
+
+		T* in = (T*)malloc(in_bytes);								//allocate space for the input spectrum
+		T* out = (T*)malloc(out_bytes);								//allocate space for the output spectrum
+		if ((in == NULL && in_bytes > 0) || (out == NULL && out_bytes > 0)) {	//malloc(0) may legitimately return NULL
+			free(in);
+			free(out);
+			return false;
+		}
+
+		double wc;													//register to store the desired wavelength
+		double w0, w1;												//registers to store the wavelengths surrounding the given band
+		size_t b0, b1;												//indices of the bands surrounding the specified wavelength
+		for (size_t xy = 0; xy < XY; xy++) {						//for each pixel
+			if (mask == NULL || mask[xy]) {							//if the pixel is masked
+				file.read((char*)in, in_bytes);						//read an input spectrum
+				for (size_t b = 0; b < Bout; b++) {					//for each band
+					wc = bandlist[b];								//set the desired wavelength
+					hsi<T>::band_bounds(wc, b0, b1);				//get the surrounding bands
+					w0 = w[b0];										//get the wavelength for the lower band
+					w1 = w[b1];										//get the wavelength for the higher band
+					out[b] = hsi<T>::lerp(wc, in[b0], w0, in[b1], w1);	//interpolate the spectral values to get the desired output value
+				}
+			}
+			else {
+				file.seekg((std::streamoff)in_bytes, std::ios::cur);	//otherwise skip one whole input spectrum (B values of T, not Bout bytes)
+				memset(out, 0, out_bytes);							//write zeros rather than stale data from the previous pixel
+			}
+			target.write((char*)out, out_bytes);					//output the selected bands for this pixel
+			if (PROGRESS) progress = (double)(xy + 1) / (double)XY * 100;		//update the progress
+		}
+
+		free(in);
+		free(out);
+		return true;
+	}
+
 	/// Convert the current BIP file to a BIL file with the specified file name.
@@ -954,7 +1003,7 @@ public:
 	/// @param p is a pointer to pre-allocated memory of size [B * sizeof(T)] that stores the mean spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
-	bool mean_spectrum(double* m, double* std, unsigned char* mask = NULL, bool PROGRESS = false){
+	bool mean_spectrum(double* m, double* std = NULL, unsigned char* mask = NULL, bool PROGRESS = false){
 		unsigned long long XY = X() * Y();							//calculate the total number of pixels in the HSI
 		T* temp = (T*)malloc(sizeof(T) * Z());						//allocate space for the current spectrum to be read
 		memset(m, 0, Z() * sizeof(double));							//set the mean spectrum to zero
@@ -976,22 +1025,23 @@ public:
 		}
 		//calculate the standard deviation
-		for(size_t i = 0; i < Z(); i++)
-			std[i] = sqrt(e_x2[i] - m[i] * m[i]);
+		if (std != NULL) {
+			for (size_t i = 0; i < Z(); i++)
+				std[i] = sqrt(e_x2[i] - m[i] * m[i]);
+		}
 		free(temp);
 		return true;
 	}
-#ifdef CUDA_FOUND
+//#ifdef CUDA_FOUND
 	/// Calculate the covariance matrix for masked pixels using cuBLAS
 	/// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
-	bool co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
+	int co_matrix_cublas(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
 		cudaError_t cudaStat;
 		cublasStatus_t stat;
 		cublasHandle_t handle;
-		progress = 0;													//initialize the progress to zero (0)
 		unsigned long long XY = X() * Y();									//calculate the number of elements in a band image
 		unsigned long long B = Z();											//calculate the number of spectral elements
@@ -1009,10 +1059,9 @@ public:
 		double axpy_alpha = -1;												//multiplication factor for the average spectrum (in order to perform a subtraction)
 		stat = cublasCreate(&handle);										//create a cuBLAS instance
-		if (stat != CUBLAS_STATUS_SUCCESS) {								//test the cuBLAS instance to make sure it is valid
-			printf ("CUBLAS initialization failed\n");
-			return EXIT_FAILURE;
-		}
+		if (stat != CUBLAS_STATUS_SUCCESS) return 1;						//test the cuBLAS instance to make sure it is valid
+
+		//else std::cout<<"Using cuBLAS to calculate the mean covariance matrix..."<<std::endl;
 		for (unsigned long long xy = 0; xy < XY; xy++){										//for each pixel
 			if (mask == NULL || mask[xy] != 0){
 				pixeld(s, xy);																	//retreive the spectrum at the current xy pixel location
@@ -1036,26 +1085,33 @@ public:
 			}
 		}
-		return true;
+		return 0;
 	}
-#endif
+//#endif
 	/// Calculate the covariance matrix for all masked pixels in the image with 64-bit floating point precision.
 	/// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
 	/// @param avg is a pointer to memory of size B that stores the average spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
-	bool co_matrix(double* co, double* avg, unsigned char *mask, bool PROGRESS = false){
-
-#ifdef CUDA_FOUND
-		int dev_count;
-		cudaGetDeviceCount(&dev_count);									//get the number of CUDA devices
-		cudaDeviceProp prop;
-		cudaGetDeviceProperties(&prop, 0);								//get the property of the first device
-		if(dev_count > 0 && prop.major != 9999)							//if the first device is not an emulator
-			return co_matrix_cublas(co, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
-#endif
+	bool co_matrix(double* co, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false){
 		progress = 0;
+
+		if(cuda_device >= 0){													//if a CUDA device is specified
+			int dev_count;
+			HANDLE_ERROR(cudaGetDeviceCount(&dev_count));						//get the number of CUDA devices
+			if(dev_count > 0 && dev_count > cuda_device){							//if the first device is not an emulator
+				cudaDeviceProp prop;
+				cudaGetDeviceProperties(&prop, cuda_device);									//get the property of the requested CUDA device
+				if (prop.major != 9999) {
+					std::cout << "Using CUDA device [" << cuda_device << "] to calculate the mean covariance matrix..."<<std::endl;
+					HANDLE_ERROR(cudaSetDevice(cuda_device));
+					int status = co_matrix_cublas(co, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
+					if (status == 0) return true;									//if the cuBLAS function returned correctly, we're done
+				}
+			}																	//otherwise continue using the CPU		
+			std::cout<<"WARNING: cuBLAS failed, using CPU"<<std::endl;
+		}
 		//memory allocation
 		unsigned long long XY = X() * Y();
 		unsigned long long B = Z();
@@ -1097,10 +1153,7 @@ public:
 	}
-#ifdef CUDA_FOUND
-	/// Calculate the covariance matrix of Noise for masked pixels using cuBLAS
-	/// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
-	bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
+	int coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false) {
 		cudaError_t cudaStat;
 		cublasStatus_t stat;
@@ -1113,9 +1166,10 @@ public:
 		double* s = (double*)malloc(sizeof(double) * B);					//allocate space for the spectrum that will be pulled from the file
 		double* s_dev;														//declare a device pointer that will store the spectrum on the GPU
-        double* s2_dev;														//  device pointer on the GPU
-        cudaStat = cudaMalloc(&s2_dev, B * sizeof(double));					//  allocate space on the CUDA device
-        cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double));               //  initialize s2_dev to zero (0)
+		double* s2 = (double*)malloc(sizeof(double) * B);					//allocate space for the spectrum of second pixel that will be pulled from the file
+		double* s2_dev;														//  device pointer on the GPU
+		cudaStat = cudaMalloc(&s2_dev, B * sizeof(double));					//  allocate space on the CUDA device
+		cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double));               //  initialize s2_dev to zero (0)
 		double* A_dev;														//declare a device pointer that will store the covariance matrix on the GPU
 		double* avg_dev;													//declare a device pointer that will store the average spectrum
@@ -1125,28 +1179,32 @@ public:
 		cudaStat = cudaMalloc(&avg_dev, B * sizeof(double));				//allocate space on the CUDA device for the average spectrum
 		stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1);		//copy the average spectrum to the CUDA device
-		double ger_alpha = 1.0/(double)XY;									//scale the outer product by the inverse of the number of samples (mean outer product)
+		double ger_alpha = 1.0 / (double)XY;									//scale the outer product by the inverse of the number of samples (mean outer product)
 		double axpy_alpha = -1;												//multiplication factor for the average spectrum (in order to perform a subtraction)
-		stat = cublasCreate(&handle);										//create a cuBLAS instance
-		if (stat != CUBLAS_STATUS_SUCCESS) {								//test the cuBLAS instance to make sure it is valid
-			printf ("CUBLAS initialization failed\n");
-			return EXIT_FAILURE;
-		}
-		for (unsigned long long xy = 0; xy < XY; xy++){										//for each pixel
-			if (mask == NULL || mask[xy] != 0){
-				pixeld(s, xy);                                                             //retreive the spectrum at the current xy pixel location
+		CUBLAS_HANDLE_ERROR(cublasCreate(&handle));							//create a cuBLAS instance
+		if (stat != CUBLAS_STATUS_SUCCESS) return 1;						//test the cuBLAS instance to make sure it is valid
-				stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1);						//copy the spectrum from the host to the device
+		for (unsigned long long xy = 0; xy < XY; xy++) {										//for each pixel
+			if (mask == NULL || mask[xy] != 0) {
+				pixeld(s, xy);                                        //retreive the spectrum at the current xy pixel location
+				if (xy < XY - X()) {
+					pixeld(s2, xy + X());                              //retreive the spectrum at the current xy+X pixel location, which is adjacent (bellow) to the pixel at xy location (in y direction)
+				}
+				else {
+					pixeld(s2, xy - X());                               //for the last row we consider the the adjacent pixel which is located above pixel xy
+				}
+				stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1);						//copy the spectrum of first pixel from the host to the device
 				stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1);				//subtract the average spectrum
-                cudaMemcpy(s2_dev, s_dev + 1 , (B-1) * sizeof(double), cudaMemcpyDeviceToDevice);    //copy B-1 elements from shifted source data (s_dev) to device pointer (s2_dev )
-                stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1);	   //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them )
+				stat = cublasSetVector((int)B, sizeof(double), s2, 1, s2_dev, 1);					//copy the spectrum of second pixel from the host to the device
+				stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s2_dev, 1);				//subtract the average spectrum
+				stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1);	   //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (in y direction)
 				stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B);	//calculate the covariance matrix (symmetric outer product)
 			}
-			if(PROGRESS) progress = (double)(xy+1) / XY * 100;													//record the current progress
+			if (PROGRESS) progress = (double)(xy + 1) / XY * 100;													//record the current progress
 		}
@@ -1157,80 +1215,96 @@ public:
 		cudaFree(s2_dev);
 		cudaFree(avg_dev);
-		for(unsigned long long i = 0; i < B; i++){										//copy the upper triangular portion to the lower triangular portion
-			for(unsigned long long j = i+1; j < B; j++){
+		for (unsigned long long i = 0; i < B; i++) {										//copy the upper triangular portion to the lower triangular portion
+			for (unsigned long long j = i + 1; j < B; j++) {
 				coN[B * i + j] = coN[B * j + i];
 			}
 		}
-		return true;
+		return 0;
 	}
-#endif
+	//#endif
 	/// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision.
 	/// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
 	/// @param avg is a pointer to memory of size B that stores the average spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
+	/// @param cuda_device is the index of the CUDA device to try first; a negative value skips the GPU path and uses the CPU directly
+	/// @param PROGRESS enables updates of the progress member while the matrix is being calculated
+	/// The noise is estimated from the difference between each pixel and its vertical neighbor (the row below, or the
+	///		row above for the last row). If the cuBLAS path is unavailable or fails, the CPU implementation below is used.
-	bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
-
-#ifdef CUDA_FOUND
-		int dev_count;
-		cudaGetDeviceCount(&dev_count);									//get the number of CUDA devices
-		cudaDeviceProp prop;
-		cudaGetDeviceProperties(&prop, 0);								//get the property of the first device
-		if(dev_count > 0 && prop.major != 9999)							//if the first device is not an emulator
-			return coNoise_matrix_cublas(coN, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
-#endif
-
-
+	bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, int cuda_device = 0, bool PROGRESS = false) {
+
+		if (cuda_device >= 0) {													//if a CUDA device is specified
+			int dev_count;
+			HANDLE_ERROR(cudaGetDeviceCount(&dev_count));						//get the number of CUDA devices
+			if (dev_count > 0 && dev_count > cuda_device) {						//if the requested device index exists
+				cudaDeviceProp prop;
+				cudaGetDeviceProperties(&prop, cuda_device);					//get the properties of the requested CUDA device
+				if (prop.major != 9999) {										//if the requested device is not an emulator
+					std::cout << "Using CUDA device [" << cuda_device << "] to calculate the noise covariance matrix..." << std::endl;
+					HANDLE_ERROR(cudaSetDevice(cuda_device));
+					int status = coNoise_matrix_cublas(coN, avg, mask, PROGRESS);			//use cuBLAS to calculate the covariance matrix
+					if (status == 0) return true;									//if the cuBLAS function returned correctly, we're done
+				}
+			}																	//otherwise continue using the CPU
+			std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl;
+		}
 		progress = 0;
 		//memory allocation
 		unsigned long long XY = X() * Y();
 		unsigned long long B = Z();
 		T* temp = (T*)malloc(sizeof(T) * B);
+		T* temp2 = (T*)malloc(sizeof(T) * B);								//spectrum of the vertically adjacent pixel
 		unsigned long long count = nnz(mask);								//count the number of masked pixels
-		//initialize covariance matrix of noise
+		//initialize covariance matrix of noise
 		memset(coN, 0, B * B * sizeof(double));
 		//calculate covariance matrix
-		double* coN_half = (double*) malloc(B * B * sizeof(double));			//allocate space for a higher-precision intermediate matrix
-		double* temp_precise = (double*) malloc(B * sizeof(double));
+		double* coN_half = (double*)malloc(B * B * sizeof(double));			//allocate space for a higher-precision intermediate matrix
+		double* temp_precise = (double*)malloc(B * sizeof(double));
+		double* temp_precise2 = (double*)malloc(B * sizeof(double));
 		memset(coN_half, 0, B * B * sizeof(double));							//initialize the high-precision matrix with zeros
 		unsigned long long idx;													//stores i*B to speed indexing
-		for (unsigned long long xy = 0; xy < XY; xy++){
-			if (mask == NULL || mask[xy] != 0){
-				pixel(temp, xy);												//retreive the spectrum at the current xy pixel location
-				for(unsigned long long b = 0; b < B; b++)									//subtract the mean from this spectrum and increase the precision
+		for (unsigned long long xy = 0; xy < XY; xy++) {
+			if (mask == NULL || mask[xy] != 0) {
+				pixel(temp, xy);                                        //retrieve the spectrum at the current xy pixel location
+				if (xy + X() < XY) {
+					pixel(temp2, xy + X());                              //retrieve the spectrum at xy+X, the pixel directly below xy (y direction)
+				}
+				else if (xy >= X()) {
+					pixel(temp2, xy - X());                               //for the last row, use the adjacent pixel located above xy
+				}
+				else {
+					memcpy(temp2, temp, sizeof(T) * B);                   //single-row image: there is no vertical neighbor, so the difference is zero (avoids unsigned underflow of xy - X())
+				}
+				for (unsigned long long b = 0; b < B; b++) {									//subtract the mean from both spectra and increase the precision
 					temp_precise[b] = (double)temp[b] - (double)avg[b];
+					temp_precise2[b] = (double)temp2[b] - (double)avg[b];
+				}
-                for(unsigned long long b2 = 0; b2 < B-1; b2++)	    //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them )
-					temp_precise[b2] -=  temp_precise[b2+1];
+				for (unsigned long long b2 = 0; b2 < B; b2++)	    //Minimum/Maximum Autocorrelation Factors (MAF) method: subtract the adjacent pixel (in y direction) from each pixel
+					temp_precise[b2] -= temp_precise2[b2];
 				idx = 0;
-				for (unsigned long long b0 = 0; b0 < B; b0++){								//for each band
+				for (unsigned long long b0 = 0; b0 < B; b0++) {								//for each band
 					for (unsigned long long b1 = b0; b1 < B; b1++)
 						coN_half[idx++] += temp_precise[b0] * temp_precise[b1];
 				}
 			}
-			if(PROGRESS) progress = (double)(xy+1) / XY * 100;
+			if (PROGRESS) progress = (double)(xy + 1) / XY * 100;
 		}
 		idx = 0;
-		for (unsigned long long i = 0; i < B; i++){										//copy the precision matrix to both halves of the output matrix
-			for (unsigned long long j = i; j < B; j++){
-				coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double) count;
+		for (unsigned long long i = 0; i < B; i++) {										//copy the precision matrix to both halves of the output matrix
+			for (unsigned long long j = i; j < B; j++) {
+				coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double)count;
 			}
 		}
 		free(temp);
+		free(temp2);
 		free(temp_precise);
+		free(temp_precise2);
+		free(coN_half);														//release the high-precision intermediate matrix
 		return true;
 	}
-	#ifdef CUDA_FOUND
     /// Project the spectra onto a set of basis functions
 	/// @param outfile is the name of the new binary output file that will be created
 	/// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering)
@@ -1239,8 +1313,8 @@ public:
 	/// @param mask is a character mask used to limit processing to valid pixels
 	bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){
-		cudaError_t cudaStat;
-		cublasStatus_t stat;
+		//cudaError_t cudaStat;
+		//cublasStatus_t stat;
 		cublasHandle_t handle;
 		std::ofstream target(outfile.c_str(), std::ios::binary);	//open the target binary file
@@ -1251,12 +1325,12 @@ public:
 		double* s = (double*)malloc(sizeof(double) * B);					//allocate space for the spectrum that will be pulled from the file
 		double* s_dev;														//declare a device pointer that will store the spectrum on the GPU
-		cudaStat = cudaMalloc(&s_dev, B * sizeof(double));					//allocate space on the CUDA device for the spectrum
+		HANDLE_ERROR(cudaMalloc(&s_dev, B * sizeof(double)));				//allocate space on the CUDA device for the spectrum
         double* basis_dev;														//  device pointer on the GPU
-        cudaStat = cudaMalloc(&basis_dev, M * B * sizeof(double));					//  allocate space on the CUDA device
-        cudaStat = cudaMemset(basis_dev, 0, M * B * sizeof(double));               //  initialize basis_dev to zero (0)
+        HANDLE_ERROR(cudaMalloc(&basis_dev, M * B * sizeof(double)));					//  allocate space on the CUDA device
+        HANDLE_ERROR(cudaMemset(basis_dev, 0, M * B * sizeof(double)));               //  initialize basis_dev to zero (0)
         /// transposing basis matrix (because cuBLAS is column-major)
@@ -1265,28 +1339,24 @@ public:
         for (int i = 0; i<M; i++)
             for (int j = 0; j<B; j++)
             basis_Transposed[i+j*M] = basis[i*B+j];
+		//copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format)
+		CUBLAS_HANDLE_ERROR(cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M));
-        stat = cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M);  //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format)
-
-		double* center_dev;													//declare a device pointer that will store the center (average)
-		cudaStat = cudaMalloc(&center_dev, B * sizeof(double));				//allocate space on the CUDA device for the center (average)
-		stat = cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1);		//copy the center vector (average) to the CUDA device (from host to device)
+		double* center_dev;																						//declare a device pointer that will store the center (average)
+		HANDLE_ERROR(cudaMalloc(&center_dev, B * sizeof(double)));									//allocate space on the CUDA device for the center (average)
+		CUBLAS_HANDLE_ERROR(cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1));			//copy the center vector (average) to the CUDA device (from host to device)
-        double* A = (double*)malloc(sizeof(double) * M);					//allocate space for the projected pixel on the host
-        double* A_dev;														//declare a device pointer that will store the projected pixel on the GPU
-		cudaStat = cudaMalloc(&A_dev,M * sizeof(double));				    //allocate space on the CUDA device for the projected pixel
-		cudaStat = cudaMemset(A_dev, 0,M * sizeof(double));		        	//initialize the projected pixel to zero (0)
+        double* A = (double*)malloc(sizeof(double) * M);								//allocate space for the projected pixel on the host
+        double* A_dev;																	//declare a device pointer that will store the projected pixel on the GPU
+		HANDLE_ERROR(cudaMalloc(&A_dev,M * sizeof(double)));				    //allocate space on the CUDA device for the projected pixel
+		HANDLE_ERROR(cudaMemset(A_dev, 0,M * sizeof(double)));		        //initialize the projected pixel to zero (0)
 		double axpy_alpha = -1;												//multiplication factor for the center (in order to perform a subtraction)
 		double axpy_alpha2 = 1;												//multiplication factor for the matrix-vector multiplication
         double axpy_beta = 0;												//multiplication factor for the matrix-vector multiplication (there is no second scalor)
-		stat = cublasCreate(&handle);										//create a cuBLAS instance
-		if (stat != CUBLAS_STATUS_SUCCESS) {								//test the cuBLAS instance to make sure it is valid
-			printf ("CUBLAS initialization failed\n");
-			return EXIT_FAILURE;
-		}
+		CUBLAS_HANDLE_ERROR(cublasCreate(&handle));					//create a cuBLAS instance
         T* temp = (T*)malloc(sizeof(T) * M);													//allocate space for the projected pixel to be written on the disc
 		size_t i;
@@ -1294,14 +1364,11 @@ public:
 			if (mask == NULL || mask[xy] != 0){
 				pixeld(s, xy);																	//retreive the spectrum at the current xy pixel location
-				stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1);						    //copy the spectrum from the host to the device
-                stat = cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1);				//subtract the center (average)
-                stat = cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1);         //performs the matrix-vector multiplication
-                stat = cublasGetVector((int)M, sizeof(double), A_dev, 1, A, 1);					//copy the projected pixel to the host (from GPU to CPU)
-
-				//stat = cublasGetVector((int)B, sizeof(double), s_dev, 1, A, 1);					//copy the projected pixel to the host (from GPU to CPU)
-
-                //std::copy<double*, T*>(A, A + M, temp);											
+				CUBLAS_HANDLE_ERROR(cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1));						    //copy the spectrum from the host to the device
+                CUBLAS_HANDLE_ERROR(cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1));					//subtract the center (average)
+                CUBLAS_HANDLE_ERROR(cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1));         //performs the matrix-vector multiplication
+                CUBLAS_HANDLE_ERROR(cublasGetVector((int)M, sizeof(double), A_dev, 1, A, 1));							//copy the projected pixel to the host (from GPU to CPU)
+						
 				for(i = 0; i < M; i++)	temp[i] = (T)A[i];										//casting projected pixel from double to whatever T is
 			}
 			else
@@ -1313,10 +1380,11 @@ public:
 		}
         //clean up allocated device memory
-		cudaFree(A_dev);
-		cudaFree(s_dev);
-        cudaFree(basis_dev);
-		cudaFree(center_dev);
+		HANDLE_ERROR(cudaFree(A_dev));
+		HANDLE_ERROR(cudaFree(s_dev));
+		HANDLE_ERROR(cudaFree(basis_dev));
+		HANDLE_ERROR(cudaFree(center_dev));
+		CUBLAS_HANDLE_ERROR(cublasDestroy(handle));
 		free(A);
 		free(s);
 		free(temp);
@@ -1324,7 +1392,6 @@ public:
 		return true;
 	}
-#endif
 	/// Project the spectra onto a set of basis functions
 	/// @param outfile is the name of the new binary output file that will be created
@@ -1332,16 +1399,22 @@ public:
 	/// @param basis a set of basis vectors that the data set will be projected onto (after centering)
 	/// @param M is the number of basis vectors
 	/// @param mask is a character mask used to limit processing to valid pixels
-	bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){
-
-#ifdef CUDA_FOUND
-		int dev_count;
-		cudaGetDeviceCount(&dev_count);									//get the number of CUDA devices
-		cudaDeviceProp prop;
-		cudaGetDeviceProperties(&prop, 0);								//get the property of the first device
-		if(dev_count > 0 && prop.major != 9999)							//if the first device is not an emulator
-			return project_cublas(outfile,center,basis,M,mask,PROGRESS);	 //use cuBLAS to calculate the covariance matrix
-#endif
+	bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, int cuda_device = 0, bool PROGRESS = false){
+		if (cuda_device >= 0) {													//if a CUDA device is specified
+			int dev_count;
+			HANDLE_ERROR(cudaGetDeviceCount(&dev_count));						//get the number of CUDA devices
+			if (dev_count > 0 && dev_count > cuda_device) {							//if the first device is not an emulator
+				cudaDeviceProp prop;
+				cudaGetDeviceProperties(&prop, cuda_device);									//get the property of the requested CUDA device
+				if (prop.major != 9999) {
+					std::cout << "Using CUDA device [" << cuda_device << "] to perform a basis projection..." << std::endl;
+					HANDLE_ERROR(cudaSetDevice(cuda_device));
+					return project_cublas(outfile, center, basis, M, mask, PROGRESS);
+				}
+			}																	//otherwise continue using the CPU		
+			std::cout << "WARNING: cuBLAS failed, using CPU" << std::endl;
+		}
+		
 		std::ofstream target(outfile.c_str(), std::ios::binary);	//open the target binary file
 		//std::string headername = outfile + ".hdr";					//the header file name
@@ -1426,9 +1499,9 @@ public:
 								   bool PROGRESS = false){
 		//calculate the new number of samples, lines, and bands
-		unsigned long long samples = x1 - x0;
-		unsigned long long lines = y1 - y0;
-		unsigned long long bands = b1 - b0;
+		unsigned long long samples = x1 - x0 + 1;
+		unsigned long long lines = y1 - y0 + 1;
+		unsigned long long bands = b1 - b0 + 1;
 		//calculate the length of one cropped spectrum
 		unsigned long long L = bands * sizeof(T);
@@ -1436,41 +1509,46 @@ public:
 		//unsigned long long L = Z() * sizeof(T);
 		//allocate space for the spectrum
-		T* temp = (T*)malloc(L);
+		char* temp = (char*)malloc(L);
 		//open an output file for binary writing
 		std::ofstream out(outfile.c_str(), std::ios::binary);
 		//seek to the first pixel in the cropped image
-		file.seekg( (y0 * X() * Z() + x0 * Z() + b0) * sizeof(T), std::ios::beg);
+		size_t startx = x0 * Z();
+		size_t starty = y0 * X() * Z();
+		size_t startb = b0;
+		file.seekg( (starty + startx + startb) * sizeof(T), std::ios::beg);
 		//distance between sample spectra in the same line
-		unsigned long long jump_sample = ( (Z() - b1) + b0 ) * sizeof(T);
+		size_t dist_between_samples = Z() - bands;
+		size_t jump_sample = dist_between_samples * sizeof(T);
 		//distance between sample spectra in adjacent lines
-		unsigned long long jump_line = (X() - x1) * Z() * sizeof(T);
+		//unsigned long long jump_line = ( X() - x1 + x0 ) * Z() * sizeof(T);
+		size_t dist_between_lines = X() - samples;
+		size_t jump_line = dist_between_lines * Z() * sizeof(T);
 		//unsigned long long sp = y0 * X() + x0;		//start pixel
 		//for each pixel in the image
-		for (unsigned y = 0; y < lines; y++)
-		{
-			for (unsigned x = 0; x < samples; x++)
-			{
+		for (unsigned y = 0; y < lines; y++) {
+			for (unsigned x = 0; x < samples; x++) {
 				//read the cropped spectral region
-				file.read( (char*) temp, L );
+				file.read(temp, L );
 				//pixel(temp, sp + x + y * X());
-				out.write(reinterpret_cast<const char*>(temp), L);   //write slice data into target file
+				out.write(temp, L);   //write slice data into target file
 				file.seekg(jump_sample, std::ios::cur);
-				if(PROGRESS) progress = (double)((y+1) * samples + x + 1) / (lines * samples) * 100;
+				if(PROGRESS) progress = (double)(y * samples + x + 1) / (lines * samples) * 100;
 			}
 			file.seekg(jump_line, std::ios::cur);
 		}
 		free(temp);
+		out.close();
 		return true;
 	}
@@ -1546,6 +1624,26 @@ public:
 		}
 	}
+	/// Append two files together along the band dimension.
+	/// For every pixel, the spectrum of this file is written first, followed by the spectrum of C.
+	/// @param outfile is the name of the binary output file that will be created
+	/// @param C is the file whose bands are appended to each pixel
+	///		NOTE(review): C is indexed with this file's pixel indices, so it presumably must have the same X and Y size - confirm against callers
+	/// @param PROGRESS enables updates of the progress member while the file is written
+	void append(std::string outfile, bip<T>* C, bool PROGRESS = false) {
+		std::ofstream out(outfile.c_str(), std::ios::binary);	//open the output file for writing
+		file.seekg(0, std::ios::beg);							//move to the beginning of both files
+		C->file.seekg(0, std::ios::beg);
+		size_t a_bytes = Z() * sizeof(T);						//calculate the number of bytes in a single spectrum of this file
+		size_t b_bytes = C->Z() * sizeof(T);					//calculate the number of bytes in a single spectrum of the appending file
+		T* a = (T*)malloc(a_bytes);								//allocate space for a spectrum of the current file
+		T* b = (T*)malloc(b_bytes);								//allocate space for a spectrum of the appended file
+		size_t XY = X() * Y();									//number of pixels to process
+		if (PROGRESS) progress = 0;
+		for (size_t xy = 0; xy < XY; xy++) {
+			spectrum(a, xy);									//read the spectrum at pixel xy from the current file
+			out.write((char*)a, a_bytes);						//write the spectrum to disk
+			C->spectrum(b, xy);									//read the spectrum at pixel xy from the appending file
+			out.write((char*)b, b_bytes);						//write it directly after the spectrum of the current file
+			if (PROGRESS) progress = (double)(xy + 1) / (double)XY * 100;
+		}
+
+		out.close();
+		free(a);												//release both temporary spectrum buffers
+		free(b);
+	}
 	/// Convolve the given band range with a kernel specified by a vector of coefficients.
 	/// @param outfile is an already open stream to the output file
@@ -1687,7 +1785,117 @@ public:
 		return true;
 	}
+	/// Calculate the magnitude of the FFT of every spectrum (cuFFT, real-to-complex) and write a band range of the result to disk.
+	/// @param outname is the name of the binary output file that will be created
+	/// @param bandmin is the first FFT output sample written for each pixel
+	/// @param bandmax is the last FFT output sample written for each pixel (bandmax - bandmin + 1 values are written)
+	///		NOTE(review): only S/2 + 1 output samples are computed per spectrum and the range is not validated here - confirm that callers keep bandmax <= S/2
+	/// @param samples is the number of leading samples of each spectrum that are transformed (0 uses all Z() samples)
+	/// @param ratio is an optional background image of rx * ry spectra with Z() values each; if given, the output is
+	///		-log(|FFT(spectrum)| / |FFT(background)|), where the background is tiled across the image
+	/// @param rx is the width of the background image
+	/// @param ry is the height of the background image
+	/// @param PROGRESS enables updates of the progress member while batches are processed
+	/// @param device is the CUDA device used for the transform (-1 terminates the program, since a GPU is required)
+	/// @return 0 after all spectra have been processed (errors terminate the program)
+	/// NOTE(review): device buffers are cast to cufftReal and sized with cufftComplex, so this presumably assumes T is float - confirm
+	int fft(std::string outname, size_t bandmin, size_t bandmax, size_t samples = 0, T* ratio = NULL, size_t rx = 0, size_t ry = 0, bool PROGRESS = false, int device = 0){
+		if(device == -1){
+			std::cout<<"ERROR: GPU required for FFT (uses cuFFT)."<<std::endl;
+			exit(1);
+		}
+		if(samples == 0) samples = Z();								//if the number of samples is not specified, use all of them
+		if(samples > Z()){
+			std::cout<<"ERROR: stim::envi doesn't support FFT padding just yet."<<std::endl;
+			exit(1);
+		}
+		int nd;															//stores the number of CUDA devices
+		HANDLE_ERROR(cudaGetDeviceCount(&nd));							//get the number of CUDA devices
+		if(device >= nd){												//test for the existence of the requested device
+			std::cout<<"ERROR: requested CUDA device for stim::envi::fft() doesn't exist"<<std::endl;
+			exit(1);
+		}
+		HANDLE_ERROR(cudaSetDevice(device));							//set the CUDA device
+		cudaDeviceProp prop;
+		HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));			//get the CUDA device properties
+
+		size_t B = Z();
+		size_t S = samples;
+		size_t fft_size = S * sizeof(T);								//number of bytes for each FFT
+		size_t cuda_bytes = prop.totalGlobalMem;						//get the total number of bytes of global memory on the device
+		size_t cuda_use = (size_t)floor(cuda_bytes * 0.2);				//only use 20% of the global memory for the input batch
+		size_t nS = cuda_use / fft_size;								//calculate the number of spectra that can be loaded onto the GPU as a single batch
+		size_t batch_bytes = nS * fft_size;								//calculate the size of a batch (in bytes)
+		size_t fft_bytes = nS * (S/2 + 1) * sizeof(cufftComplex);		//size of the transformed batch (S/2 + 1 complex values per spectrum)
+		T* batch = (T*) malloc(batch_bytes);							//allocate space in host memory to store a batch
+		memset(batch, 0, batch_bytes);
+		std::complex<T>* batch_fft = (std::complex<T>*) malloc(fft_bytes);
+		T* gpu_batch;													//device pointer to the batch
+		HANDLE_ERROR(cudaMalloc(&gpu_batch, batch_bytes));				//allocate space on the device for the FFT batch
+		cufftComplex* gpu_batch_fft;									//device pointer to the FFT result
+		HANDLE_ERROR(cudaMalloc(&gpu_batch_fft, fft_bytes));			//allocate space for the FFT result
+		int N[1];														//create an array with the interferogram size (required for cuFFT input)
+		N[0] = (int)S;													//set the only array value to the interferogram size
+
+		//if a background is provided for a ratio
+		std::complex<T>* ratio_fft = NULL;											//create a pointer for the FFT of the ratio image (if it exists)
+		if(ratio){
+			size_t bkg_bytes = rx * ry * S * sizeof(T);								//calculate the total number of bytes in the background image
+			T* bkg_copy = (T*) malloc(bkg_bytes);									//allocate space to copy the background
+			if(S == Z()) memcpy(bkg_copy, ratio, bkg_bytes);						//if the number of samples used in processing equals the number of available samples
+			else{
+				for(size_t xyi = 0; xyi < rx*ry; xyi++)								//otherwise keep only the first S samples of each background spectrum
+					memcpy(&bkg_copy[xyi * S], &ratio[xyi * B], S * sizeof(T));
+			}
+			T* gpu_ratio;
+			HANDLE_ERROR(cudaMalloc(&gpu_ratio, bkg_bytes));
+			HANDLE_ERROR(cudaMemcpy(gpu_ratio, bkg_copy, bkg_bytes, cudaMemcpyHostToDevice));
+			free(bkg_copy);															//the host copy is no longer needed once it is on the device
+			cufftHandle bkg_plan;
+			CUFFT_HANDLE_ERROR(cufftPlanMany(&bkg_plan, 1, N, NULL, 1, N[0], NULL, 1, N[0], CUFFT_R2C, (int)(rx * ry)));
+			size_t bkg_fft_bytes = rx * ry * (S / 2 + 1) * sizeof(cufftComplex);
+			T* gpu_ratio_fft;
+			HANDLE_ERROR(cudaMalloc(&gpu_ratio_fft, bkg_fft_bytes));
+			CUFFT_HANDLE_ERROR(cufftExecR2C(bkg_plan, (cufftReal*)gpu_ratio, (cufftComplex*)gpu_ratio_fft));
+			ratio_fft = (std::complex<T>*) malloc(bkg_fft_bytes);
+			HANDLE_ERROR(cudaMemcpy(ratio_fft, gpu_ratio_fft, bkg_fft_bytes, cudaMemcpyDeviceToHost));
+			HANDLE_ERROR(cudaFree(gpu_ratio));
+			HANDLE_ERROR(cudaFree(gpu_ratio_fft));
+			CUFFT_HANDLE_ERROR(cufftDestroy(bkg_plan));
+		}
+		cufftHandle plan;												//create a CUFFT plan
+		CUFFT_HANDLE_ERROR(cufftPlanMany(&plan, 1, N, NULL, 1, N[0], NULL, 1, N[0], CUFFT_R2C, (int)nS));
+
+		std::ofstream outfile(outname, std::ios::binary);				//open a file for writing
+
+		size_t XY = X() * Y();											//calculate the number of spectra
+		size_t xy = 0;
+		size_t bs;														//stores the number of spectra in the current batch
+		size_t s, b;
+		size_t S_fft = S/2 + 1;											//number of complex values produced for each spectrum
+		size_t bandkeep = bandmax - bandmin + 1;						//number of output values written for each spectrum
+		size_t x, y;
+		size_t ratio_i;
+		T* temp_spec = (T*) malloc(Z() * sizeof(T));					//allocate space to hold a single pixel
+		while(xy < XY){													//while there are unprocessed spectra
+			bs = (std::min)(XY - xy, nS);										//calculate the number of spectra to include in the batch
+			for(s = 0; s < bs; s++){									//for each spectrum in the batch
+				pixel(temp_spec, xy + s);						//read a pixel from disk
+				memcpy(&batch[s * S], temp_spec, S * sizeof(T));		//keep only the first S samples
+				//pixel(&batch[s * S], xy + s);							//read the next spectrum
+			}
+			HANDLE_ERROR(cudaMemcpy(gpu_batch, batch, batch_bytes, cudaMemcpyHostToDevice));
+			CUFFT_HANDLE_ERROR(cufftExecR2C(plan, (cufftReal*)gpu_batch, gpu_batch_fft));			//execute the (implicitly forward) transform
+			HANDLE_ERROR(cudaMemcpy(batch_fft, gpu_batch_fft, fft_bytes, cudaMemcpyDeviceToHost));	//copy the transformed data back to the host
+			for(s = 0; s < bs; s++){															//for each spectrum in the batch
+				y = (xy + s)/X();
+				x = xy + s - y * X();
+				if(ratio_fft)	ratio_i = (y % ry) * rx + (x % rx);								//if a background is used, calculate the (tiled) coordinates into it
+				for(b = 0; b < S/2 + 1; b++){														//for each output sample
+					if(ratio_fft)						
+						batch[s * S + b] = -log(abs(batch_fft[s * S_fft + b]) / abs(ratio_fft[ratio_i * S_fft + b]));
+					else
+						batch[s * S + b] = abs(batch_fft[s * S_fft + b]);		//calculate the magnitude of the spectrum					
+				}
+				outfile.write((char*)&batch[s * S + bandmin], bandkeep * sizeof(T));							//save the resulting spectrum
+			}
+			xy += bs;													//increment xy by the number of spectra processed
+			if(PROGRESS) progress = (double)xy / (double)XY * 100;
+		}
+		outfile.close();
+		free(temp_spec);												//release the single-pixel buffer
+		free(ratio_fft);
+		free(batch_fft);
+		free(batch);
+		HANDLE_ERROR(cudaFree(gpu_batch));
+		HANDLE_ERROR(cudaFree(gpu_batch_fft));
+		CUFFT_HANDLE_ERROR(cufftDestroy(plan));							//release the resources held by the batched FFT plan
+		return 0;
+	}
 	/// Close the file.
 	bool close(){
 #ifndef STIM_BSQ_H
 #define STIM_BSQ_H
-#include "../envi/envi_header.h"
-#include "../envi/hsi.h"
-#include "../envi/bil.h"
+#include <stim/envi/envi_header.h>
+#include <stim/envi/hsi.h>
+#include <stim/envi/bil.h>
 #include <cstring>
 #include <utility>
 #include <vector>
@@ -64,14 +64,15 @@ public:
 			  unsigned long long Y,
 			  unsigned long long B,
 			  unsigned long long header_offset,
-			  std::vector<double> wavelengths){
+			  std::vector<double> wavelengths,
+			  stim::iotype io = stim::io_in){
 		//copy the wavelengths to the BSQ file structure
 		w = wavelengths;
 		//copy the wavelengths to the structure
 		offset = header_offset;
-		return open(filename, vec<unsigned long long>(X, Y, B), header_offset);
+		return open(filename, vec<unsigned long long>(X, Y, B), header_offset, io);
 	}
 	/// Retrieve a single band (based on index) and stores it in pre-allocated memory.
@@ -104,6 +105,7 @@ public:
 		//if wavelength is smaller than the first one in header file
 		if ( w[page] > wavelength ){
 			band_index(p, page);
+			if(PROGRESS) progress = 100;
 			return true;
 		}
@@ -114,6 +116,7 @@ public:
 			//	(the wavelength is out of bounds)
 			if (page == Z()) {
 				band_index(p, Z()-1);		//return the last band
+				if(PROGRESS) progress = 100;
 				return true;
 			}
 		}
@@ -377,6 +380,30 @@ public:
 	}
+	/// Assemble a new image from a list of selected bands and save it to disk.
+	/// @param outfile is the name of the binary output file that will be created
+	/// @param bandlist is the list of bands to keep; each entry is passed to band() to retrieve the corresponding band image
+	/// @param mask is currently not used by this implementation
+	/// @param PROGRESS enables updates of the progress member after each band is written
+	/// @return false if the output file cannot be opened, true otherwise
+	bool select(std::string outfile, std::vector<double> bandlist, unsigned char* mask = NULL, bool PROGRESS = false) {
+		std::ofstream out(outfile.c_str(), std::ios::binary);		//open the output file
+		if (!out) {
+			std::cout << "ERROR opening output file: " << outfile << std::endl;
+			return false;
+		}
+		file.seekg(0, std::ios::beg);							//move the pointer to the current file to the beginning
+
+		size_t XY = X() * Y();						//calculate the number of pixels in a band
+		size_t in_bytes = XY * sizeof(T);				//calculate the size of a band in bytes
+
+		T* in = (T*)malloc(in_bytes);				//allocate space for the band image
+		size_t nb = bandlist.size();				//get the number of bands in the output image
+		for (size_t b = 0; b < nb; b++) {
+			band(in, bandlist[b]);					//get the band associated with the given wavelength
+			out.write((char*)in, in_bytes);		//write the band to the output file
+			if (PROGRESS) progress = (double)(b + 1) / (double)nb * 100;
+		}
+		out.close();
+		free(in);
+		return true;
+	}
+
 	/// Forward a read request to hsi<T>::read() with start arguments (0, start, 0) and size arguments (X(), n, Z()), and return its result.
 	/// NOTE(review): presumably this reads n full-width lines beginning at line 'start' for all bands into dest - confirm against hsi<T>::read()
 	size_t readlines(T* dest, size_t start, size_t n){
 		return hsi<T>::read(dest, 0, start, 0, X(), n, Z());
 	}
@@ -648,13 +675,24 @@ public:
 		//to make sure the left and the right bound are in the bandwidth
 		if (lb < w[0] || rb < w[0] || lb > w[n-1] || rb >w[n-1]){
-			std::cout<<"ERROR: left bound or right bound out of bandwidth"<<std::endl;
-			exit(1);
+			//report every bound that falls outside of the available wavelength range [w[0], w[n - 1]]
+			if (lb < w[0]) {
+				std::cout << "bsq::area ERROR - left bound " << lb << " is below the minimum available wavelength " << w[0] << std::endl;
+			}
+			if (rb < w[0]) {
+				std::cout << "bsq::area ERROR - right bound " << rb << " is below the minimum available wavelength " << w[0] << std::endl;
+			}
+			if (lb > w[n - 1]) {
+				std::cout << "bsq::area ERROR - left bound " << lb << " is above the maximum available wavelength " << w[n - 1] << std::endl;
+			}
+			if (rb > w[n - 1]) {
+				std::cout << "bsq::area ERROR - right bound " << rb << " is above the maximum available wavelength " << w[n - 1] << std::endl;	//the maximum is w[n - 1], not w[0]
+			}
+			return false;
 		}
 		//to make sure right bound is bigger than left bound
 		else if(lb > rb){
-			std::cout<<"ERROR: right bound should be bigger than left bound"<<std::endl;
-			exit(1);
+			std::cout << "bsq::area ERROR - right bound " << rb << " should be larger than left bound " << lb << std::endl;
+			return false;
 		}
 		//find the indices of the left and right baseline points
@@ -994,7 +1032,7 @@ public:
 					matrix[i*Z() + b] = band_image[xy];			//copy it to the appropriate point in the values[] array
 					i++;
 				}
-				if(PROGRESS) progress = (double)(xy+1) / (double)XY * 100;
+				if(PROGRESS) progress = (double)(b * XY + xy+1) / (double)(XY * Z()) * 100;
 			}
 		}
@@ -1196,9 +1234,9 @@ public:
 								   bool PROGRESS = false){
 		//calculate the new number of samples, lines, and bands
-		unsigned long long samples = x1 - x0;
-		unsigned long long lines = y1 - y0;
-		unsigned long long bands = b1 - b0;
+		unsigned long long samples = x1 - x0 + 1;
+		unsigned long long lines = y1 - y0 + 1;
+		unsigned long long bands = b1 - b0 + 1;
 		//calculate the size of a single band
 		unsigned long long L = samples * lines * sizeof(T);
@@ -1219,7 +1257,7 @@ public:
 		file.seekg( (b0 * X() * Y() + y0 * X() + x0) * sizeof(T), std::ios::beg);
 		//for each band
-		for (unsigned long long z = b0; z < b1; z++)
+		for (unsigned long long z = b0; z <= b1; z++)
 		{
 			//std::cout<<z<<std::endl;
 			for (unsigned long long y = 0; y < lines; y++)
@@ -1227,7 +1265,7 @@ public:
 				file.read((char *)(temp + y * samples), sizeof(T) * samples);
 				file.seekg(jumpl, std::ios::cur);    //go to the next band
-				if(PROGRESS) progress = (double)((z - b0 + 1) * lines + y + 1) / ((b1 - b0) * lines) * 100;
+				if(PROGRESS) progress = (double)((z - b0) * lines + y + 1) / ((b1 - b0 + 1) * lines) * 100;
 			}
 			out.write(reinterpret_cast<const char*>(temp), L);   //write slice data into target file
 			file.seekg(jumpb, std::ios::cur);
@@ -1237,6 +1275,52 @@ public:
 		return true;
 	}
+	///Crop out several subimages and assemble a new image from these concatenated subimages
+
+	/// Every nonzero mask pixel is treated as the center of an (sx x sy) subimage. The subimages are stacked
+	/// vertically, so each output band is sx pixels wide and (N * sy) pixels tall, where N is the number of
+	/// nonzero mask pixels. A masked pixel that is too close to the image border to hold a complete subimage
+	/// is skipped; the rows that it would have filled remain zero at the end of each output band.
+	/// @param outfile is the file name for the output image
+	/// @param sx is the width of each subimage
+	/// @param sy is the height of each subimage
+	/// @param mask is the mask used to define subimage positions extracted from the input file
+	/// @param PROGRESS, when true, updates the progress value (in percent) as subimages are copied
+	void subimages(std::string outfile, size_t sx, size_t sy, unsigned char* mask, bool PROGRESS = false){
+
+		size_t N = nnz(mask);									//get the number of subimages
+		size_t XY = X() * Y();									//number of pixels in the mask and in a single input band
+		T* dst = (T*) malloc(N * sx * sy * sizeof(T));			//allocate space for a single band of the output image
+		memset(dst, 0, N*sx*sy*sizeof(T));						//initialize the band image to zero
+
+		std::ofstream out(outfile, std::ios::binary);			//open a file for writing
+
+		T* src = (T*) malloc(XY * sizeof(T));					//allocate space for a single band of the input image
+
+		for(size_t b = 0; b < Z(); b++){						//for each band
+			band_index(src, b);									//load the band image
+			size_t n = 0;										//number of subimages copied so far in this band
+			//visit each mask pixel once; bounding i by XY keeps the scan inside the mask even when
+			//	some masked pixels are skipped and n therefore never reaches N
+			for(size_t i = 0; i < XY && n < N; i++){
+				if(mask[i]){									//if the pixel is masked, copy the surrounding pixels into the destination band
+					size_t yi = i / X();						//determine the y position of the current pixel
+					size_t xi = i - yi * X();					//determine the x position of the current pixel
+					if( xi > sx/2 && xi < X() - sx/2 &&			//if the subimage is completely within the bounds of the original image
+						yi > sy/2 && yi < Y() - sy/2){
+						size_t cx = xi - sx/2;					//calculate the corner position for the subimage
+						size_t cy = yi - sy/2;
+						for(size_t syi = 0; syi < sy; syi++){					//for each line in the subimage
+							size_t src_i = (cy + syi) * X() + cx;				//first pixel of this line in the source band
+							size_t dst_i = (n * sy + syi) * sx;					//first pixel of this line in the stacked output band
+							memcpy(&dst[dst_i],  &src[src_i], sx * sizeof(T));	//copy one line from the subimage to the destination image
+						}
+						n++;
+					}
+				}
+				if(PROGRESS) progress = (double)(b * N + n) / (double)(N * Z()) * 100;	//N > 0 here because the loop requires n < N
+			}
+			out.write((const char*)dst, N * sx * sy * sizeof(T));			//write the band to the output file
+			if(PROGRESS) progress = (double)(b + 1) / (double)Z() * 100;	//the band is complete even if some subimages were skipped
+		}
+		free(dst);												//free memory
+		free(src);
+	}
+
 	/// Remove a list of bands from the ENVI file
 	/// @param outfile is the file name for the output hyperspectral image (with trimmed bands)
@@ -1311,6 +1395,20 @@ public:
 		out.close();
 	}
+	/// Append an image to this one along the band dimension
+
+	/// The raw contents of this file are written to the output, followed by the raw contents of C.
+	/// @param outfile is the file name for the output image
+	/// @param C is the image whose data is written after the data from this file
+	/// @param PROGRESS, when true, updates the progress value (in percent) after each file is copied
+	void append(std::string outfile, bsq<T>* C, bool PROGRESS = false) {
+		std::ofstream out(outfile.c_str(), std::ios::binary);	//open the output file for writing
+		file.clear();											//reset any eof/fail state left by earlier reads; otherwise the
+		C->file.clear();										//	seeks below fail and the copy starts mid-file
+		file.seekg(0, std::ios::beg);							//move to the beginning of both files
+		C->file.seekg(0, std::ios::beg);
+
+		if (PROGRESS) progress = 0;
+		out << file.rdbuf();									//copy the data from this ENVI file
+		if (PROGRESS) progress = (double)Z() / (double)(Z() + C->Z()) * 100;	//Z() of the (Z() + C->Z()) output bands are now written
+		out << C->file.rdbuf();									//copy the data from the appending file
+		if (PROGRESS) progress = 100;
+		out.close();
+	}
+
 	/// Convolve the given band range with a kernel specified by a vector of coefficients.
 	/// @param outfile is an already open stream to the output file
 #ifndef STIM_ENVI_H
 #define STIM_ENVI_H
-#include "../envi/envi_header.h"
-#include "../envi/bsq.h"
-#include "../envi/bip.h"
-#include "../envi/bil.h"
-#include "../math/fd_coefficients.h"
+#include <stim/envi/envi_header.h>
+#include <stim/envi/bsq.h>
+#include <stim/envi/bip.h>
+#include <stim/envi/bil.h>
+#include <stim/math/fd_coefficients.h>
+#include <stim/parser/filename.h>
+#include <stim/util/filesize.h>
 #include <iostream>
 #include <fstream>
 //#include "../image/image.h"
@@ -69,14 +71,67 @@ public:
 		file = NULL;				//set the file pointer to NULL
 	}
-	envi(std::string filename, std::string headername){
+	envi(std::string filename, std::string headername) : envi(){
 		header.load(headername);
 		fname = filename;					//save the filename
 		allocate();
 	}
+	/// Tests whether the current ENVI file is valid (the underlying data file is open)
+
+	/// @return the is_open() state of the underlying BSQ, BIL, or BIP object. Returns false if no file object
+	///			has been allocated or if the data type is not float32 or float64. An unrecognized interleave
+	///			type on an allocated file prints an error and terminates the program.
+	operator bool(){
+		if (file == NULL) return false;						//no file object exists yet, so there is nothing to dereference
+
+		if (header.interleave == envi_header::BSQ) {		//if the infile is bsq file
+			if (header.data_type == envi_header::float32)
+				return ((bsq<float>*)file)->is_open();
+			else if (header.data_type == envi_header::float64)
+				return ((bsq<double>*)file)->is_open();
+			else
+				std::cout << "ERROR: unidentified data type" << std::endl;
+		}
+
+		else if (header.interleave == envi_header::BIL) {		//if the infile is bil file
+			if (header.data_type == envi_header::float32)
+				return ((bil<float>*)file)->is_open();
+			else if (header.data_type == envi_header::float64)
+				return ((bil<double>*)file)->is_open();
+			else
+				std::cout << "ERROR: unidentified data type" << std::endl;
+		}
+
+		else if (header.interleave == envi_header::BIP) {		//if the infile is bip file
+			if (header.data_type == envi_header::float32)
+				return ((bip<float>*)file)->is_open();
+			else if (header.data_type == envi_header::float64)
+				return ((bip<double>*)file)->is_open();
+			else
+				std::cout << "ERROR: unidentified data type" << std::endl;
+		}
+		else {
+			std::cout << "ERROR: unidentified file type" << std::endl;
+			exit(1);
+		}
+		return false;										//reached only when the data type was not recognized
+	}
+
+	/// Checks whether a data file and its header describe a consistent ENVI file
+
+	/// @param fname is the name of the binary data file
+	/// @param hname is the name of the header file; when empty, the data file name with an "hdr" extension is used
+	/// @return true if the header loads and the size of the data file matches the size computed from the header
+	static bool is_envi(std::string fname, std::string hname = ""){
+		stim::filename data_file(fname);
+		stim::filename header_file;
+		if(!hname.empty())
+			header_file = hname;						//the caller named the header explicitly
+		else{
+			header_file = data_file;					//derive the header name from the data file name
+			header_file = header_file.extension("hdr");
+		}
+
+		stim::envi_header H;
+		if(!H.load(header_file))						//a header that cannot be loaded means this is not an ENVI file
+			return false;
+
+		size_t expected = H.data_bytes();				//size that the header says the data file should have
+		size_t actual = stim::file_size(fname);			//size that the data file really has
+		return actual == expected;						//the file is accepted only when the two sizes agree
+
+	}
 	void* malloc_spectrum(){
@@ -308,7 +363,7 @@ public:
 	}
 	/// Open a previously opened ENVI file
-	bool open(){
+	bool open(stim::iotype io = stim::io_in){
 		//load the file
 		if(header.interleave == envi_header::BSQ) {		//if the infile is bsq file
@@ -396,7 +451,7 @@ public:
 	/// @param filename is the name of the ENVI binary file
 	/// @param header is an ENVI header structure
-	bool open(std::string filename, stim::envi_header h){
+	bool open(std::string filename, stim::envi_header h, stim::iotype io = stim::io_in){
 		header = h;							//store the header
@@ -404,7 +459,7 @@ public:
 		allocate();
-		return open();						//open the ENVI file;
+		return open(io);						//open the ENVI file;
 	}
@@ -413,18 +468,21 @@ public:
 	/// @param filename is the name of the ENVI binary file
 	/// @param headername is the name of the ENVI header file
-	bool open(std::string filename, std::string headername){
+	bool open(std::string filename, std::string headername, stim::iotype io = stim::io_in){
 		//allocate memory
 		//allocate();
 		stim::envi_header h;
-		h.load(headername);
+		if (!h.load(headername)) {
+			std::cout << "Error loading header file: " << headername << std::endl;
+			return false;
+		}
 		//load the header
 		//header.load(headername);
-		return open(filename, h);
+		return open(filename, h, io);
 	}
 	/// Normalize a hyperspectral ENVI file given a band number and threshold.
@@ -504,6 +562,41 @@ public:
 		}
 	}
+	/// Write a new ENVI file that contains only the bands requested by the user
+
+	/// A header describing the output is saved to (outfile + ".hdr"); the data itself is produced by the
+	/// select() method of the underlying BSQ, BIL, or BIP object.
+	/// @param outfile is the file name for the output image
+	/// @param bandlist is the list of requested band values; it becomes the wavelength list of the output header
+	/// @param MASK is an optional mask that is forwarded to the underlying select() implementation
+	/// @param PROGRESS is forwarded to the underlying select() implementation
+	/// @return the result of the underlying select() call, or false if the data type or interleave is not recognized
+	bool select(std::string outfile, std::vector<double> bandlist, unsigned char* MASK = NULL, bool PROGRESS = false) {
+		stim::envi_header new_header = header;					//copy all of the data from the current header file to the new one
+		new_header.bands = bandlist.size();						//the number of bands in the new file is equal to the number of bands provided by the user
+		new_header.wavelength = bandlist;						//the wavelength values in the output file are the same as those specified by the user
+		new_header.band_names.clear();							//no band names will be provided in the output file (clear() removes them; empty() only queries)
+		new_header.save(outfile + ".hdr");						//save the output header file
+
+		if (header.interleave == envi_header::BSQ) {		//if the infile is bsq file
+			if (header.data_type == envi_header::float32)
+				return ((bsq<float>*)file)->select(outfile, bandlist, MASK, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				return ((bsq<double>*)file)->select(outfile, bandlist, MASK, PROGRESS);
+			else
+				std::cout << "ERROR: unidentified data type" << std::endl;
+		}
+		else if (header.interleave == envi_header::BIL) {		//if the infile is bil file
+			if (header.data_type == envi_header::float32)
+				return ((bil<float>*)file)->select(outfile, bandlist, MASK, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				return ((bil<double>*)file)->select(outfile, bandlist, MASK, PROGRESS);
+			else
+				std::cout << "ERROR: unidentified data type" << std::endl;
+		}
+		else if (header.interleave == envi_header::BIP) {		//if the infile is bip file
+			if (header.data_type == envi_header::float32)
+				return ((bip<float>*)file)->select(outfile, bandlist, MASK, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				return ((bip<double>*)file)->select(outfile, bandlist, MASK, PROGRESS);
+			else
+				std::cout << "ERROR: unidentified data type" << std::endl;
+		}
+
+		return false;											//nothing was written: unrecognized interleave or data type
+	}
+
 	/// Performs piecewise linear baseline correction of a hyperspectral file/
 	/// @param outfile is the file name for the baseline corrected output
@@ -549,7 +642,9 @@ public:
 		}
 	}
-	void project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask, bool PROGRESS = false){
+	/// Project an array of coefficients onto a basis matrix
+
+	void project(std::string outfile, double* center, double* basis, size_t M, std::vector<double> bands, unsigned char* mask, int cuda_device = 0, bool PROGRESS = false) {
 		if(header.interleave == envi_header::BSQ){		//if the infile is bsq file
 			std::cout<<"ERROR: BSQ projection not supported"<<std::endl;
 			exit(1);
@@ -562,9 +657,9 @@ public:
 		else if(header.interleave == envi_header::BIP){		//if the infile is bip file
 			if(header.data_type ==envi_header::float32)
-				((bip<float>*)file)->project(outfile, center, basis, M, mask, PROGRESS);
+				((bip<float>*)file)->project(outfile, center, basis, M, mask, cuda_device, PROGRESS);
 			else if(header.data_type == envi_header::float64)
-				((bip<double>*)file)->project(outfile, center, basis, M, mask, PROGRESS);
+				((bip<double>*)file)->project(outfile, center, basis, M, mask, cuda_device, PROGRESS);
 			else{
 				std::cout<<"ERROR: unidentified data type"<<std::endl;
 				exit(1);
@@ -573,7 +668,7 @@ public:
 		stim::envi_header out_hdr = header;							
 		out_hdr.bands = M;											//set the number of bands in the output header
-		out_hdr.wavelength.clear();
+		out_hdr.wavelength = bands;
 		out_hdr.band_names.clear();
 		out_hdr.save(outfile + ".hdr");								//save the output header
 	}
@@ -636,7 +731,7 @@ public:
 					((bsq<double>*)file)->bil(outfile, PROGRESS, OPTIMIZATION);
 				else if(interleave == envi_header::BIP){					//ERROR
 					//std::cout<<"ERROR: conversion from BSQ to BIP isn't practical; use BSQ->BIL->BIP instead"<<std::endl;
-					((bsq<float>*)file)->bip(outfile, PROGRESS, OPTIMIZATION);
+					((bsq<double>*)file)->bip(outfile, PROGRESS, OPTIMIZATION);
 					//exit(1);
 				}
 			}
@@ -989,7 +1084,7 @@ public:
 			else if(header.data_type == envi_header::float64)
 				return ((bsq<double>*)file)->ph_to_ph((double*)result, lb1, rb1, pos1, lb2, rb2, pos2, mask);
 			else
-				std::cout<<"ERROR: unidentified data type"<<std::endl;
+				std::cout<<"envi::ph_to_ph ERROR - unidentified data type"<<std::endl;
 		}
 		else if(header.interleave == envi_header::BIL){		//if the infile is bil file
@@ -998,7 +1093,7 @@ public:
 			else if(header.data_type == envi_header::float64)
 				return ((bil<double>*)file)->ph_to_ph((double*)result, lb1, rb1, pos1, lb2, rb2, pos2, mask);
 			else
-				std::cout<<"ERROR: unidentified data type"<<std::endl;
+				std::cout<<"envi::ph_to_ph ERROR - unidentified data type"<<std::endl;
 		}
 		else if(header.interleave == envi_header::BIP){		//if the infile is bip file
@@ -1007,11 +1102,11 @@ public:
 			else if(header.data_type == envi_header::float64)
 				return ((bip<double>*)file)->ph_to_ph((double*)result, lb1, rb1, pos1, lb2, rb2, pos2, mask);
 			else
-				std::cout<<"ERROR: unidentified data type"<<std::endl;
+				std::cout<<"envi::ph_to_ph ERROR - unidentified data type"<<std::endl;
 		}
 		else{
-			std::cout<<"ERROR: unidentified file type"<<std::endl;
+			std::cout<<"envi::ph_to_ph ERROR - unidentified file type"<<std::endl;
 			exit(1);
 		}
 		return false;
@@ -1273,6 +1368,39 @@ public:
 		return false;
 	}
+	/// Finds the two band indices surrounding a wavelength by forwarding to the underlying file object
+
+	/// @param wavelength is the wavelength value being located
+	/// @param low receives the lower band index from the underlying band_bounds() call
+	/// @param high receives the upper band index from the underlying band_bounds() call
+	void band_bounds(double wavelength, size_t& low, size_t& high) {
+		const bool as_bsq = (header.interleave == envi_header::BSQ);
+		const bool as_bil = !as_bsq && (header.interleave == envi_header::BIL);
+		const bool as_bip = !as_bsq && !as_bil && (header.interleave == envi_header::BIP);
+		if (!as_bsq && !as_bil && !as_bip) return;			//unrecognized interleave: nothing is forwarded, low and high are untouched
+
+		const bool is_f32 = (header.data_type == envi_header::float32);
+		const bool is_f64 = !is_f32 && (header.data_type == envi_header::float64);
+		if (!is_f32 && !is_f64) {							//only 32-bit and 64-bit floating point data can be dispatched
+			std::cout << "ERROR: unidentified data type" << std::endl;
+			exit(1);
+		}
+
+		if (as_bsq) {
+			if (is_f32)	((bsq<float>*)file)->band_bounds(wavelength, low, high);
+			else		((bsq<double>*)file)->band_bounds(wavelength, low, high);
+		}
+		else if (as_bil) {
+			if (is_f32)	((bil<float>*)file)->band_bounds(wavelength, low, high);
+			else		((bil<double>*)file)->band_bounds(wavelength, low, high);
+		}
+		else {
+			if (is_f32)	((bip<float>*)file)->band_bounds(wavelength, low, high);
+			else		((bip<double>*)file)->band_bounds(wavelength, low, high);
+		}
+	}
+
 	// Retrieve a spectrum at the specified 1D location
 	/// @param ptr is a pointer to pre-allocated memory of size B*sizeof(T)
@@ -1325,6 +1453,7 @@ public:
 				exit(1);
 			}
 		}
+		free(temp);
 	}
 	/// Retrieve a spectrum from the specified (x, y) location
@@ -1441,16 +1570,16 @@ public:
 	/// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
 	/// @param avg is a pointer to memory of size B that stores the average spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
-	bool co_matrix(double* co, double* avg, unsigned char* mask, bool PROGRESS = false){
+	bool co_matrix(double* co, double* avg, unsigned char* mask, int cuda_device, bool PROGRESS = false){
 		if (header.interleave == envi_header::BSQ){
 			std::cout<<"ERROR: calculating the covariance matrix for a BSQ file is impractical; convert to BIL or BIP first"<<std::endl;
 			exit(1);
 		}
 		else if (header.interleave == envi_header::BIL){
 			if (header.data_type == envi_header::float32)
-				return ((bil<float>*)file)->co_matrix(co, avg, mask, PROGRESS);
+				return ((bil<float>*)file)->co_matrix(co, avg, mask, cuda_device, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bil<double>*)file)->co_matrix(co, avg, mask, PROGRESS);
+				return ((bil<double>*)file)->co_matrix(co, avg, mask, cuda_device, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -1458,9 +1587,9 @@ public:
 		}
 		else if (header.interleave == envi_header::BIP){
 			if (header.data_type == envi_header::float32)
-				return ((bip<float>*)file)->co_matrix(co, avg, mask, PROGRESS);
+				return ((bip<float>*)file)->co_matrix(co, avg, mask, cuda_device, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bip<double>*)file)->co_matrix(co, avg, mask, PROGRESS);
+				return ((bip<double>*)file)->co_matrix(co, avg, mask, cuda_device, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -1474,7 +1603,7 @@ public:
 	/// @param co is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
 	/// @param avg is a pointer to memory of size B that stores the average spectrum
 	/// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
-	bool coNoise_matrix(double* coN, double* avg, unsigned char* mask, bool PROGRESS = false){
+	bool coNoise_matrix(double* coN, double* avg, unsigned char* mask, int cuda_device = 0, bool PROGRESS = false){
 		if (header.interleave == envi_header::BSQ){
 			std::cout<<"ERROR: calculating the covariance matrix of noise for a BSQ file is impractical; convert to BIP first"<<std::endl;
 			exit(1);
@@ -1488,9 +1617,9 @@ public:
 		else if (header.interleave == envi_header::BIP){
 			if (header.data_type == envi_header::float32)
-				return ((bip<float>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS);
+				return ((bip<float>*)file)->coNoise_matrix(coN, avg, mask, cuda_device, PROGRESS);
 			else if (header.data_type == envi_header::float64)
-				return ((bip<double>*)file)->coNoise_matrix(coN, avg, mask, PROGRESS);
+				return ((bip<double>*)file)->coNoise_matrix(coN, avg, mask, cuda_device, PROGRESS);
 			else{
 				std::cout << "ERROR: unidentified data type" << std::endl;
 				exit(1);
@@ -1517,11 +1646,11 @@ public:
 		//save the header for the cropped file
 		stim::envi_header new_header = header;
-		new_header.samples = x1 - x0;
-		new_header.lines = y1 - y0;
-		new_header.bands = b1 - b0;
+		new_header.samples = x1 - x0 + 1;
+		new_header.lines = y1 - y0 + 1;
+		new_header.bands = b1 - b0 + 1;
 		std::vector<double>::const_iterator first = new_header.wavelength.begin() + b0;
-		std::vector<double>::const_iterator last = new_header.wavelength.begin() + b1;
+		std::vector<double>::const_iterator last = new_header.wavelength.begin() + b1 + 1;
 		new_header.wavelength = std::vector<double>(first, last);
 		new_header.save(outfile + ".hdr");
@@ -1558,6 +1687,41 @@ public:
 		return false;
 	}
+	/// Crop out a subimage around each masked pixel and save the concatenated subimages as a new ENVI file
+
+	/// Only BSQ files are supported. The output header describes an image that is nx samples wide and
+	/// (number of nonzero mask pixels * ny) lines tall, and is saved to (outfile + ".hdr").
+	/// @param outfile is the file name for the output image
+	/// @param nx is the width of each subimage
+	/// @param ny is the height of each subimage
+	/// @param mask is a mask with one value per pixel (samples * lines); each nonzero pixel defines one subimage
+	/// @param PROGRESS is forwarded to the underlying subimages() implementation
+	void subimages(std::string outfile, size_t nx, size_t ny, unsigned char* mask, bool PROGRESS = false){
+		
+		size_t nnz = 0;													//initialize the number of subimages to zero
+		for(size_t i = 0; i < header.lines * header.samples; i++)		//for each pixel in the mask
+			if(mask[i]) nnz++;											//if the pixel is valid, add a subimage
+
+
+		//create the header for the output file
+		stim::envi_header new_header = header;
+		new_header.samples = nx;									//the output is one subimage wide
+		new_header.lines = nnz * ny;								//the subimages are stacked vertically
+		
+
+		if (header.interleave == envi_header::BSQ){
+			if (header.data_type == envi_header::float32)
+				((bsq<float>*)file)->subimages(outfile, nx, ny, mask, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				((bsq<double>*)file)->subimages(outfile, nx, ny, mask, PROGRESS);
+			else{
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+		else if (header.interleave == envi_header::BIL){			//the problem here is the interleave, not the data type
+			std::cout << "ERROR: subimages are not supported for BIL files; convert to BSQ first" << std::endl;
+			exit(1);
+		}
+		else if (header.interleave == envi_header::BIP){
+			std::cout << "ERROR: subimages are not supported for BIP files; convert to BSQ first" << std::endl;
+			exit(1);
+		}
+
+		new_header.save(outfile + ".hdr");									//save the header for the output file
+	}
+
 	/// Remove a list of bands from the ENVI file
 	/// @param outfile is the file name for the output hyperspectral image (with trimmed bands)
@@ -1665,6 +1829,67 @@ public:
 		}
 	}
+	/// Append the bands of another ENVI file to this one and save the result as a new file
+
+	/// Both images must have the same width, height, interleave, and data type; otherwise an error is printed
+	/// and the program terminates. The output header is saved to (outfile + ".hdr"). Wavelengths (and band names)
+	/// are concatenated only when both inputs provide them; otherwise that list is removed from the output header.
+	/// @param outfile is the file name for the output image
+	/// @param C is the ENVI file whose bands are appended after the bands of this file
+	/// @param PROGRESS is forwarded to the underlying append() implementation
+	void append(std::string outfile, envi C, bool PROGRESS = false) {
+		if (C.header.samples != header.samples ||									//verify that the images are the same size
+			C.header.lines != header.lines) {
+			std::cout << "ERROR - appended images must be the same size: input = [" << header.samples << " x " << header.lines << "], output = [" << C.header.samples << " x " << C.header.lines << "]" << std::endl;
+			exit(1);
+		}
+		if (C.header.interleave != header.interleave) {
+			std::cout << "ERROR - appended images must have the same interleave format" << std::endl;
+			exit(1);
+		}
+		if (C.header.data_type != header.data_type) {								//C.file is cast to this file's type below, so the types must agree
+			std::cout << "ERROR - appended images must have the same data type" << std::endl;
+			exit(1);
+		}
+
+		stim::envi_header new_header = header;												//create a header for the output image
+		new_header.bands = header.bands + C.header.bands;									//calculate the number of bands in the new image
+
+		if (header.wavelength.size() != 0 && C.header.wavelength.size() != 0) {				//if both files contain wavelength information
+			for (size_t b = 0; b < C.header.wavelength.size(); b++)
+				new_header.wavelength.push_back(C.header.wavelength[b]);					//append the wavelength labels to the new header array
+		}
+		else new_header.wavelength.clear();
+
+		if (header.band_names.size() != 0 && C.header.band_names.size() != 0) {				//if both files contain band name information
+			for (size_t b = 0; b < C.header.band_names.size(); b++)
+				new_header.band_names.push_back(C.header.band_names[b]);					//append the band names to the new header array
+		}
+		else new_header.band_names.clear();													//drop incomplete band names without touching the wavelengths
+
+		new_header.save(outfile + ".hdr");													//save the output
+
+		if (header.interleave == envi_header::BSQ) {
+			if (header.data_type == envi_header::float32)
+				((bsq<float>*)file)->append(outfile, (bsq<float>*)C.file, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				((bsq<double>*)file)->append(outfile, (bsq<double>*)C.file, PROGRESS);
+			else {
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+		else if (header.interleave == envi_header::BIL) {
+			if (header.data_type == envi_header::float32)
+				((bil<float>*)file)->append(outfile, (bil<float>*)C.file, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				((bil<double>*)file)->append(outfile, (bil<double>*)C.file, PROGRESS);
+			else {
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+		else if (header.interleave == envi_header::BIP) {
+			if (header.data_type == envi_header::float32)
+				((bip<float>*)file)->append(outfile, (bip<float>*)C.file, PROGRESS);
+			else if (header.data_type == envi_header::float64)
+				((bip<double>*)file)->append(outfile, (bip<double>*)C.file, PROGRESS);
+			else {
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+
+	}
+
 	/// Convolve the given band range with a kernel specified by a vector of coefficients.
 	/// @param outfile is the combined file to be output
@@ -1835,6 +2060,44 @@ public:
 		}
 		exit(1);
 	}
+
+	
+
+
+	/// Compute the FFT along the band dimension and save the requested range of output bands (BIP files only)
+
+	/// The band spacing is taken from the first two wavelength values in the header. The output header is
+	/// saved to (outfile + ".hdr") with wavelength units prefixed by "inv_".
+	/// @param outfile is the file name for the output image
+	/// @param band_min is the lowest value (in the transformed domain) to store
+	/// @param band_max is the highest value (in the transformed domain) to store; it is clamped to the FFT range
+	/// @param samples is the number of samples used for the FFT; zero selects the number of bands in the file
+	/// @param ratio, rx, ry are forwarded to the underlying fft() implementation (ratio is cast to the file's data type)
+	/// @param PROGRESS is forwarded to the underlying fft() implementation
+	/// @param cuda_device is forwarded to the underlying fft() implementation
+	void fft(std::string outfile, double band_min, double band_max, size_t samples = 0, void* ratio = NULL, size_t rx = 0, size_t ry = 0, bool PROGRESS = false, int cuda_device = 0){
+		if(header.wavelength.size() < 2){											//the spacing below reads wavelength[0] and wavelength[1]
+			std::cout<<"ERROR: at least two wavelength values are required to compute the FFT band spacing"<<std::endl;
+			exit(1);
+		}
+		if(samples == 0) samples = header.bands;
+		double delta = header.wavelength[1] - header.wavelength[0];					//calculate spacing in the current domain
+		double span = samples * delta;												//calculate the span in the current domain
+		double fft_delta = 1.0 / span;												//calculate the spacing between bands in the FFT domain
+		double fft_max = fft_delta * samples/2;										//calculate the maximum range of the FFT
+
+		if(band_max > fft_max) band_max = fft_max;									//the user gave a band outside of the FFT range, reset the band to the maximum available
+		size_t start_i = (size_t)std::ceil(band_min / fft_delta);					//calculate the first band to store
+		size_t size_i = (size_t)std::floor(band_max / fft_delta) - start_i + 1;		//calculate the number of bands to store
+		size_t end_i = start_i + size_i - 1;										//last band number
+
+		envi_header new_header = header;
+		new_header.bands = size_i;
+		new_header.set_wavelengths(start_i * fft_delta, fft_delta);					//label the output bands with uniformly spaced values
+		new_header.wavelength_units = "inv_" + header.wavelength_units;
+		new_header.save(outfile + ".hdr");
+		
+		if (header.interleave == envi_header::BIP){
+			if (header.data_type == envi_header::float32)
+				((bip<float>*)file)->fft(outfile, start_i, end_i, samples, (float*)ratio, rx, ry, PROGRESS, cuda_device);
+			else if (header.data_type == envi_header::float64)
+				((bip<double>*)file)->fft(outfile, start_i, end_i, samples, (double*)ratio, rx, ry, PROGRESS, cuda_device);
+			else{
+				std::cout << "ERROR: unidentified data type" << std::endl;
+				exit(1);
+			}
+		}
+		else{
+			std::cout<<"ERROR: only BIP files supported for FFT"<<std::endl;
+			exit(1);
+		}
+	}
 };	//end ENVI
 }	//end namespace rts
@@ -8,6 +8,7 @@
 #include <vector>
 #include <algorithm>
 #include <stdlib.h>
+#include <cmath>
 //information from an ENVI header file
 //A good resource can be found here: http://www.exelisvis.com/docs/enviheaderfiles.html
@@ -77,6 +78,14 @@ struct envi_header
 		load(name);
 	}
+	//fills the wavelength vector with one uniformly spaced value per band
+	//	start is the value assigned to the first band and step is the increment between consecutive bands
+	void set_wavelengths(double start, double step){
+		const size_t count = bands;				//one wavelength entry is stored for each band
+		wavelength.resize(count);
+		size_t idx = 0;
+		while(idx < count){
+			wavelength[idx] = start + idx * step;
+			++idx;
+		}
+	}
+
 	std::string trim(std::string line){
 		if(line.length() == 0)
@@ -416,8 +425,13 @@ struct envi_header
 		default:
 			return 0;
 		}
+	}
+	//return the number of bytes that SHOULD be in the data file: (samples * lines * bands) values of valsize() bytes each, plus header_offset
+	size_t data_bytes(){
+		return samples * lines * bands * valsize() + header_offset;
 	}
+	
 	/// Convert an interleave type to a string
 	static std::string interleave_str(interleaveType t){
 #ifndef STIM_HSI_H
 #define STIM_HSI_H
-#include "../envi/envi_header.h"
-#include "../envi/binary.h"
+#include <stim/envi/envi_header.h>
+#include <stim/envi/binary.h>
 #include <cstring>
 #include <utility>
@@ -55,43 +55,19 @@ protected:
 		return n;													//return the number of masked pixels
 	}
+	//linearly interpolate the value at position w between the two samples (low_w, low_v) and (high_w, high_v)
 	T lerp(double w, T low_v, double low_w, T high_v, double high_w){
 		if(low_w == high_w) return low_v;										//if the interval is of zero length, just return one of the bounds (this also avoids a division by zero below)
 		double alpha = (w - low_w) / (high_w - low_w);							//calculate the interpolation factor (0 at low_w, 1 at high_w)
 		return (T)((1.0 - alpha) * low_v + alpha * high_v);							//blend the two values in double precision and convert the result back to T
 	}
-	/// Gets the two band indices surrounding a given wavelength
-	void band_bounds(double wavelength, unsigned long long& low, unsigned long long& high){
-		unsigned long long B = Z();
-		for(high = 0; high < B; high++){
-			if(w[high] > wavelength) break;
-		}
-		low = 0;
-		if(high > 0)
-			low = high-1;
-	}
-
-	/// Get the list of band numbers that bound a list of wavelengths
-	void band_bounds(std::vector<double> wavelengths,
-					 std::vector<unsigned long long>& low_bands,
-					 std::vector<unsigned long long>& high_bands){
-
-		unsigned long long W = w.size();									//get the number of wavelengths in the list
-		low_bands.resize(W);												//pre-allocate space for the band lists
-		high_bands.resize(W);
-
-		for(unsigned long long wl = 0; wl < W; wl++){						//for each wavelength
-			band_bounds(wavelengths[wl], low_bands[wl], high_bands[wl]);	//find the low and high bands
-		}
-	}
-
 	/// Returns the interpolated in the given spectrum based on the given wavelength
 	/// @param s is the spectrum in main memory of length Z()
 	/// @param wavelength is the wavelength value to interpolate out
 	T interp_spectrum(T* s, double wavelength){
-		unsigned long long low, high;								//indices for the bands surrounding wavelength
+		size_t low, high;								//indices for the bands surrounding wavelength
 		band_bounds(wavelength, low, high);							//get the surrounding band indices
 		if(high == w.size()) return s[w.size()-1];					//if the high band is above the wavelength range, return the highest wavelength value
@@ -138,11 +114,36 @@ protected:
 	}
 public:
+
+	/// Finds the pair of band indices that surround a given wavelength
+
+	/// @param wavelength is the wavelength value being located
+	/// @param low receives the index just below high (or zero when high is zero)
+	/// @param high receives the index of the first band whose wavelength is greater than the given value,
+	///			or the number of bands if there is no such band
+	void band_bounds(double wavelength, size_t& low, size_t& high) {
+		const size_t count = Z();									//number of bands to search
+		high = 0;
+		while (high < count && !(w[high] > wavelength))				//advance until a band lies above the requested wavelength
+			high++;
+		low = 0;
+		if (high > 0)
+			low = high - 1;
+	}
+
+	/// Get the list of band numbers that bound a list of wavelengths
+
+	/// @param wavelengths is the list of wavelength values to locate
+	/// @param low_bands is resized to match wavelengths and receives the lower bounding band index for each entry
+	/// @param high_bands is resized to match wavelengths and receives the upper bounding band index for each entry
+	void band_bounds(std::vector<double> wavelengths,
+		std::vector<unsigned long long>& low_bands,
+		std::vector<unsigned long long>& high_bands) {
+
+		size_t W = wavelengths.size();										//get the number of wavelengths in the requested list (not in the file)
+		low_bands.resize(W);												//pre-allocate space for the band lists
+		high_bands.resize(W);
+
+		for (size_t wl = 0; wl < W; wl++) {									//for each wavelength
+			size_t low, high;												//the scalar overload takes size_t references, which may differ from unsigned long long
+			band_bounds(wavelengths[wl], low, high);						//find the low and high bands
+			low_bands[wl] = low;
+			high_bands[wl] = high;
+		}
+	}
 			/// Get a mask that has all pixels with inf or NaN values masked out (false)
 	void mask_finite(unsigned char* out_mask, unsigned char* mask, bool PROGRESS = false){
 		size_t XY = X() * Y();
 		if(mask == NULL)												//if no mask is provided
-			memset(mask, 255, XY * sizeof(unsigned char));				//initialize the mask to 255
+			memset(out_mask, 255, XY * sizeof(unsigned char));				//initialize the mask to 255
 		else															//if a mask is provided
 			memcpy(out_mask, mask, XY * sizeof(unsigned char));			//initialize the current mask to that one
 		T* page = (T*)malloc(R[0] * R[1] * sizeof(T));		//allocate space for a page of data
@@ -224,4 +225,4 @@ public:
 }		//end namespace STIM
-#endif
 \ No newline at end of file
+#endif
-#ifndef STIM_GL_SPIDER_H
+ #ifndef STIM_GL_SPIDER_H
 #define STIM_GL_SPIDER_H
 //#include <GL/glew.h>
@@ -27,7 +27,6 @@
 #include <stim/cuda/branch_detection.cuh>
 #include "../../../volume-spider/glnetwork.h"
 #include <stim/visualization/cylinder.h>
-#include <stim/cuda/testKernel.cuh>
 #include <iostream>
 #include <fstream>
 #ifdef TIMING
@@ -40,6 +39,9 @@
 	#include <ctime>
 #endif
+#ifdef DEBUG
+	#include <stim/cuda/testKernel.cuh>
+#endif
 namespace stim
 {
@@ -138,11 +140,13 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 		stim::cuda::cuda_texture t_pos;				//cuda_texture object used as an interface between OpenGL and cuda for position vectors.
 		stim::cuda::cuda_texture t_mag;				//cuda_texture object used as an interface between OpenGL and cuda for size vectors.
 		stim::cuda::cuda_texture t_len;				//cuda_texture object used as an interface between OpenGL and cuda for size vectors.
-		
+
+		int last_fiber;						//variable that tracks the last fiber hit during tracing. -1 if no fiber was hit.
+
 		#ifdef DEBUG
-			stringstream name;
 			int iter;
+			stringstream name;
 			int iter_pos;
 			int iter_dir;
 			int iter_siz;
@@ -292,6 +296,7 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 			DrawLongCylinder(n, l_template, l_square);	///Draw the cylinder.
 			stim::cylinder<float> cyl(cL, cM);
 			std::vector< stim::vec<float> > result = find_branch(cylinder_texID, GL_TEXTURE_2D, n*l_square, (cL.size()-1)*l_template);		///find all the centers in cuda
+			
 			stim::vec3<float> size(S[0]*R[0], S[1]*R[1], S[2]*R[2]);			///the borders of the texture.
 			float pval;									//pvalue associated with the points on the cylinder.
 			if(!result.empty())								///if we have any points
@@ -315,7 +320,8 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 					}
 					stim::vec3<float> v = cyl.surf(pval, result[i][0]);					///find the coordinates of the point at pval on the surface in tissue space.
 					stim::vec3<float> di = cyl.p(pval);							///find the coord of v in tissue space projected on the centerline.
-					float rad = cyl.r(pval)/2;								///find the radius at the pvalue's location
+					float rad = cyl.r(pval);								///find the radius at the pvalue's location
+				//	float rad = cyl.r(pval)/2;								///find the radius at the pvalue's location
 					if(
 					 !(v[0] > size[0] || v[1] > size[1]
 					 || v[2] > size[2] || v[0] < 0
@@ -372,7 +378,7 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 		///Stored in a display list.
 		///uses the default d vector <0,0,1>
 		void
-		genDirectionVectors(float solidAngle = stim::PI/2)
+		genDirectionVectors(float solidAngle = 3*stim::PI/4)
 		{
 			//Set up the vectors necessary for Rectangle creation.
@@ -954,7 +960,7 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 			iter_dir = 0;
 			iter_siz = 0;
 #endif
-			stepsize = 3.0;
+			stepsize = 6.0;
 			n_pixels = 16.0;
 			srand(100);	
@@ -1316,20 +1322,20 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 		void
 		saveNetwork(std::string name)
 		{
-/*			stim::glObj<float> sk;
+			stim::glObj<float> sk1;
 			for(int i = 0; i < nt.sizeE(); i++)
 			{
-				std::vector<stim::vec< float > > cm = nt.getEdgeCenterLineMag(i);
+				std::vector<float> cm = nt.getEdgeCenterLineMag(i);
                  		std::vector<stim::vec3< float > > ce = nt.getEdgeCenterLine(i);
-				sk.Begin(stim::OBJ_LINE);
+				sk1.Begin(stim::OBJ_LINE);
 				for(int j = 0; j < ce.size(); j++)
 				{
-					sk.TexCoord(cm[j][0]);
-					sk.Vertex(ce[j][0], ce[j][1], ce[j][2]);
+					sk1.TexCoord(cm[j]);
+					sk1.Vertex(ce[j][0], ce[j][1], ce[j][2]);
 				}
-				sk.End();
+				sk1.End();
 			}	
-*/			sk.save(name);
+			sk1.save(name);
 		}
 		///Depreciated, but might be reused later()
@@ -1377,20 +1383,31 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 		Step()
 		{
 			#ifdef DEBUG
-			std::cerr << "Took a step" << std::endl;
+			std::cerr << "Took a step";
 			#endif
 			Bind(direction_texID, direction_buffID, numSamples, n_pixels);
 			CHECK_OPENGL_ERROR
 				findOptimalDirection();
 			Unbind();
+			#ifdef DEBUG
+			std::cerr << " " << current_cost;
+			#endif
 			Bind(position_texID, position_buffID, numSamplesPos, n_pixels);
 				findOptimalPosition();
 			Unbind();
+			#ifdef DEBUG
+			std::cerr << " " << current_cost;
+			#endif
 			Bind(radius_texID, radius_buffID, numSamplesMag, n_pixels);
 				findOptimalRadius();
 			Unbind();
+			#ifdef DEBUG
+			std::cerr << " " << current_cost;
+			#endif
 			CHECK_OPENGL_ERROR
-
+			#ifdef DEBUG
+			std::cerr << std::endl;
+			#endif
 			return current_cost;
 		}
@@ -1517,9 +1534,6 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 			while(!Empty())
 			{
 				//clear the currently traced line and start a new one.
-				cL.clear();
-				cM.clear();
-				cD.clear();
 				curSeed = seeds.top();
 				curSeedVec = seedsvecs.top();
 				curSeedMag = seedsmags.top();
@@ -1539,9 +1553,9 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 //				findOptimalDirection();
 //			Unbind();
 //THIS IS EXPERIMENTAL
-			Bind(radius_texID, radius_buffID, numSamplesMag, n_pixels);
-				findOptimalRadius();
-			Unbind();
+		//	Bind(radius_texID, radius_buffID, numSamplesMag, n_pixels);
+		//		findOptimalRadius();
+		//	Unbind();
 //THIS IS EXPERIMENTAL
 //				cL.push_back(curSeed);
@@ -1593,17 +1607,17 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 					 ds[0], ds[1], ds[2],
 					 ups[0], ups[1], ups[2]);
 				///Set the look at distance
-				sk.Render();	///Render the network
-//				nt.Render();								
+//				sk.Render();	///Render the network
+				nt.Render();								
 				CHECK_OPENGL_ERROR
-				glLoadName((int) sk.numL());		///Load all the names
-//				glLoadName(nt.sizeE());
+//				glLoadName((int) sk.numL());		///Load all the names
+				glLoadName(nt.sizeE());
-				sk.RenderLine(cL);			///Render the current line.
-//				nt.RenderLine(cL);	
+//				sk.RenderLine(cL);			///Render the current line.
+				nt.RenderLine(cL);	
 //				glPopName();
 				glFlush();				///Flush the buffer
@@ -1654,55 +1668,51 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 			cM.clear();
 		}
-/*
+
 		void
-		addToNetwork(pair<stim::fiber<float>, int> in, stim::vec3<float> spos, 
-				stim::vec<float> smag, stim::vec3<float> sdir)
+		addToNetwork(std::vector<stim::vec3<float> > L, std::vector<float > M, stim::vec3<float> spos, stim::vec3<float> sdir, float smag)
 		{
-			#ifdef TIMING
-				 double s = std::clock();
-			#endif
-			
-                        std::vector<stim::vec3<float> > ce = in.first.centerline();                
-                        std::vector<stim::vec<float> > cm = in.first.centerlinemag();
 			//if the fiber is longer than 2 steps (the number it takes to diverge)
-			if(ce.size() > 3)
+			if(L.size() > 3)
 			{	
 				//if we did not hit a fiber
-				if(in.second == -1)
+				if(last_fiber == -1)
 				{
-					spos[0] = spos[0]-sdir[0]*smag[0]/2.;
-					spos[1] = spos[1]-sdir[1]*smag[0]/2.;
-					spos[2] = spos[2]-sdir[2]*smag[0]/2.;
-					int h = selectObject(spos, -sdir, smag[0]);
+					spos[0] = spos[0]-sdir[0]*smag;
+					spos[1] = spos[1]-sdir[1]*smag;
+					spos[2] = spos[2]-sdir[2]*smag;
+					int h = selectObject(spos, -sdir, smag);
 					//did we start with a fiber?
 					if(h != -1 && h < nt.sizeE())
-						nt.addEdge(ce, cm, h, -1);
+						nt.addEdge(L, M, h, -1);
 					else
-						nt.addEdge(ce, cm, -1, -1);
+						nt.addEdge(L, M, -1, -1);
 				}
 				//if we hit a fiber?
-				else if(in.second != -1)
+				else if(last_fiber != -1)
 				{
-					nt.addEdge(ce,cm,-1, in.second);
-					spos[0] = spos[0]-sdir[0]*smag[0]/2.;
-					spos[1] = spos[1]-sdir[1]*smag[0]/2.;
-					spos[2] = spos[2]-sdir[2]*smag[0]/2.;
-					int h = selectObject(spos, -sdir, smag[0]);
+					nt.addEdge(L, M, -1, last_fiber);
+					spos[0] = spos[0]-sdir[0]*smag;
+					spos[1] = spos[1]-sdir[1]*smag;
+					spos[2] = spos[2]-sdir[2]*smag;
+					int h = selectObject(spos, -sdir, smag);
 					//did start with a fiber?
 					if(h != -1 && h < nt.sizeE()){	
 			//			std::cout << "got here double" << smag.str() << std::endl;
-						nt.addEdge(ce,cm, h, in.second);	
-					} else { nt.addEdge(ce,cm, -1, -1);}
+						nt.addEdge(L, M, h, last_fiber);	
+					}
+					else
+					{
+					 nt.addEdge(L, M, -1, -1);
+					}
 				}
 			}		
-			#ifdef TIMING
-				double nt = (std::clock() - s) / (double) CLOCKS_PER_SEC;
-				network_time += nt * 1000.0;
+			#ifdef DEBUG
+				iter++;
 			#endif
 		}
-*/
+/*
 		void
 		addToNetwork(std::vector<stim::vec3<float> > L, std::vector<float > M)
 		{
@@ -1722,7 +1732,7 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 				#endif
 			}
 		}
-
+*/
 		void
 		printSizes()
@@ -1735,22 +1745,31 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 		traceLine(stim::vec3<float> pos, float mag, int min_cost)
 		{
 			//starting (seed) position and magnitude.
+			last_fiber = -1;
+			cL.clear();
+			cM.clear();
+			cD.clear();
+
 			stim::vec3<float> spos = getPosition();
+			stim::vec3<float> sdir = getDirection();
 			float smag = getMagnitude();
-			stim::vec3<float> sdir = getDirection();	
-//			Bind();
-//			sk.Begin(stim::OBJ_LINE);
+			setPosition(pos);
+			setMagnitude(mag);
+			cL.push_back(p);
+			cD.push_back(d);
+			cM.push_back(m);
+//			stim::vec3<float> spos = getPosition();
+//			float smag = getMagnitude();
+//			stim::vec3<float> sdir = getDirection();	
-			sk.createFromSelf(GL_SELECT);
-//			nt.createFromSelf(GL_SELECT);
+//			Bind();
+//			sk.Begin(stim::OBJ_LINE);
-			cL.push_back(pos);
-			cM.push_back(mag);
-//			setPosition(pos);
-//			setMagnitude(mag);
+			//sk.createFromSelf(GL_SELECT);
+			nt.createFromSelf(GL_SELECT);
 			int h;
 			bool started = false;
 			bool running = true;
@@ -1761,7 +1780,7 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 				if (cost > min_cost){
 					running = false;
 					branchDetection2();
-					addToNetwork(cL, cM);
+					addToNetwork(cL, cM, spos, sdir, smag);
 					#ifdef DEBUG
 					std::cerr << "the cost of " << cost << " > " << min_cost << std::endl;
 					#endif
@@ -1769,13 +1788,14 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 				} else {
 					//Have we found the edge of the map?
 					pos = getPosition();
-					if(pos[0] > size[0] || pos[1] > size[1]
-					 || pos[2] > size[2] || pos[0] < 0
-					 || pos[1] < 0 || pos[2] < 0)
+					if(p[0] > size[0] || p[1] > size[1]
+					 || p[2] > size[2] || p[0] < 0
+					 || p[1] < 0 || p[2] < 0)
 					{
 						running = false;
 						branchDetection2();
-						addToNetwork(cL, cM);
+				//		addToNetwork(cL, cM);
+						addToNetwork(cL, cM, spos, sdir, smag);
 						#ifdef DEBUG
 						std::cerr << "I hit and edge" << std::endl;
 						#endif
@@ -1790,10 +1810,11 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 					}
 					//Has the template size gotten unreasonable?
 					mag = getMagnitude();
-					if(mag > 75 || mag < 1){
+					if(m > 75 || m < 1){
 						running = false;
 						branchDetection2();
-						addToNetwork(cL, cM);
+				//		addToNetwork(cL, cM);
+						addToNetwork(cL, cM, spos, sdir, smag);
 						#ifdef DEBUG
 						std::cerr << "The templates are too big" << std::endl;
 						#endif
@@ -1807,13 +1828,16 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 							#ifdef DEBUG
 							std::cerr << "I hit the fiber " << h << std::endl;
 							#endif
+							last_fiber = h;
 							running = false;
 							branchDetection2();
-							addToNetwork(cL, cM);
+						//	addToNetwork(cL, cM);
+							addToNetwork(cL, cM, spos, sdir, smag);
 							break;
 						}
 						else {  
-							cL.push_back(stim::vec3<float>(p[0], p[1],p[2]));
+							cL.push_back(p);
+							cD.push_back(d);
 							cM.push_back(m);
 //							Unbind();
 							CHECK_OPENGL_ERROR
@@ -30,9 +30,9 @@ class gl_texture : public virtual image_stack&lt;T, F&gt;
 		GLenum cpu_type;
 		GLenum gpu_type;
 		GLenum format;					//format for the texture (GL_RGBA, GL_LUMINANCE, etc.)
-		using image_stack<T>::R;
+		using image_stack<T,F>::R;
 		//using image_stack<T>::S;
-		using image_stack<T>::ptr;
+		using image_stack<T,F>::ptr;
 		///	Sets the internal texture_type, based on the data dimensions
 		void setTextureType(){
@@ -247,7 +247,7 @@ class gl_texture : public virtual image_stack&lt;T, F&gt;
 		}
 		///returns the dimentions of the data in the x, y, z directions. 
-		vec<int> getSize(){
+		stim::vec<int> getSize(){
 			stim::vec<int> size(R[1], R[2], R[3]);
 			return size;
 		}
@@ -282,7 +282,7 @@ class gl_texture : public virtual image_stack&lt;T, F&gt;
 		///@param file_mask specifies the file(s) to be loaded
 		///	Sets the path and calls the loader on that path.
 		void load_images(std::string file_mask){
-			image_stack<T>::load_images(file_mask);				//load images
+			image_stack<T, F>::load_images(file_mask);				//load images
 			guess_parameters();
 		}
@@ -292,13 +292,18 @@ class gl_texture : public virtual image_stack&lt;T, F&gt;
 			return texID;
 		}
-		
+	
 		T* getData(){
 			return ptr;
 		}
-		
-		
+	
+		void setData(T* rts)
+		{
+				
+		}
+
 	};
+
 }
+#ifndef STIM_GRID3_H
+#define STIM_GRID3_H
+
+#include <stim/grids/grid.h>
+
+namespace stim{
+
+/// Three-dimensional grid built on top of the generic stim::grid<T, 3, F> template.
+/// The class is named grid3 so that it matches the include guard and the return type of
+/// resample_iso(), and so that it does not collide with the stim::grid base template.
+template<typename T, typename F = float>
+class grid3 : public stim::grid<T, 3, F>{
+
+public:
+
+	/// Convert grid coordinates (integers) into world coordinates (F) based on the pixel spacing
+	/// NOTE: not implemented yet - the outputs x, y and z are left untouched.
+	void grid2volume(size_t xi, size_t yi, size_t zi, F& x, F& y, F&z){
+
+	}
+
+	/// Use linear interpolation to get a value from the grid at (x, y, z) in VOLUME space (based on voxel size)
+	/// NOTE: not implemented yet - the body returns nothing, so calling this function is undefined behavior.
+	T lerp(F x, F y, F z){
+
+	}
+	/// Create a resampled grid with isotropic voxel sizes
+	/// NOTE: not implemented yet - the body returns nothing, so calling this function is undefined behavior.
+	grid3<T, F> resample_iso(){
+
+		//find the smallest spacing
+		//create a new grid of the appropriate size
+		//use linear interpolation to resample the old grid into the new grid
+	}
+
+
+};
+}			//end namespace stim
+
+#endif
 \ No newline at end of file
@@ -5,6 +5,7 @@
 #include <stim/parser/filename.h>
 #include <stim/grids/grid.h>
 #include <stim/image/image.h>
+#include <stim/math/vec3.h>
 namespace stim{
@@ -17,13 +18,13 @@ class image_stack : public virtual stim::grid&lt;T, 4, F&gt;{
 protected:
 	//using stim::grid<T, 4>::S;
-	using stim::grid<T, 4>::R;
-	using stim::grid<T, 4>::ptr;
-	using stim::grid<T, 4>::read;
+	using stim::grid<T, 4, F>::R;
+	using stim::grid<T, 4, F>::ptr;
+	using stim::grid<T, 4, F>::S;
 public:
 	//default constructor
-	image_stack() : grid<T, 4>() {
+	image_stack() : grid<T, 4, F>() {
 	}
@@ -37,6 +38,20 @@ public:
 		return R[1] * R[2] * R[3];						//return the number of spatial samples
 	}
+	// get the number of pixels in each dimension
+	size_t nc() {
+		return R[0];
+	}
+	size_t nx() {
+		return R[1];
+	}
+	size_t ny() {
+		return R[2];
+	}
+	size_t nz() {
+		return R[3];
+	}
+
 	/// Returns the number of color channels
 	size_t channels(){
 		return R[0];
@@ -113,7 +128,8 @@ public:
 	/// @param i is the page to be saved
 	void save_image(std::string file_name, unsigned int i){		
 		stim::image<T> I;											//create an image		
-		I.set_interleaved_rgb(&ptr[ i * R[0] * R[1] * R[2] ], R[1], R[2], R[0]);	//retrieve the interlaced data from the image - store it in the grid
+		I.set_interleaved(&ptr[ i * R[0] * R[1] * R[2] ], R[1], R[2], R[0]);	//retrieve the interlaced data from the image - store it in the grid
+//		I.set_interleaved_rgb(&ptr[ i * R[0] * R[1] * R[2] ], R[1], R[2], R[0]);	//retrieve the interlaced data from the image - store it in the grid
 		I.save(file_name);
 	}
@@ -138,13 +154,14 @@ public:
 	/// @param depth,  number of pixels in depth.
 	void init(int channels, int width, int height, int depth)
 	{
-		R.resize(4);
+		//R.resize(4);
 		R[0] = channels;
 		R[1] = width;
 		R[2] = height;
 		R[3] = depth;
 		ptr = (T*)malloc(sizeof(T) * samples());
+		memset(ptr, 0, sizeof(T) * samples());
 	}
 	///Saves the entire stack to a set of images
@@ -152,12 +169,13 @@ public:
 	void save_images(std::string file_mask){
 		stim::filename file_path(file_mask);
+		//stim::filename abs_file_path = file_pat
 		//create a list of file names
 		std::vector<std::string> file_list = stim::wildcards::increment(file_path.str(), 0, R[3]-1, 1);
-
-		for(int i=0; i<R[3]; i++)
+		for (int i = 0; i < R[3]; i++) {
 			save_image(file_list[i], i);
+		}
 	}
 	/// Returns the pixel at the specified point
@@ -165,9 +183,35 @@ public:
 		return ptr[z * R[0] * R[1] * R[2] + y * R[0] * R[1] + x * R[0] + c];
 	}
+	/// Returns the world-space position at an index point (i, j, k)
+	vec3<F> p(size_t i, size_t j, size_t k){
+		vec3<F> result;
+		result[0] = (F)i * S[1];
+		result[1] = (F)j * S[2];
+		result[2] = (F)k * S[3];
+		return result;
+	}
+
+	// set the pixel at the specified point
+	void set(size_t i, size_t j, size_t k, T value, size_t c = 0){
+		ptr[k * R[0] * R[1] * R[2] + j * R[0] * R[1] + i * R[0] + c] = value;
+	}
+	void set(T* Ptr, size_t k) {
+		
+		for (unsigned i = 0; i < R[0] * R[1] * R[2]; i++)
+			ptr[i + k * R[0] * R[1] * R[2]] = Ptr[i];
+	}
+	void copy(T* Ptr) {
+		ptr = Ptr;
+	}
+
+
+	/* This was causing compiler errors. I don't think this function call exists anywhere:
+
 	void read(std::string file, unsigned int X, unsigned int Y, unsigned int Z, unsigned int C = 1, unsigned int header = 0){
 		read(file, stim::vec<unsigned long>(C, X, Y, Z), header);
 	}
+	*/
 	T* data(){
 		return ptr;
+#ifndef STIM_IVOTE2_CUH
+#define STIM_IVOTE2_CUH
+
+#include <iostream>
+#include <fstream>
+#include <stim/cuda/cudatools/error.h>
+#include <stim/cuda/templates/gradient.cuh>
+#include <stim/cuda/arraymath.cuh>
+#include <stim/iVote/ivote2/iter_vote2.cuh>
+#include <stim/iVote/ivote2/local_max.cuh>
+#include <stim/math/constants.h>
+#include <stim/math/vector.h>
+#include <stim/visualization/colormap.h>
+
+
+namespace stim {
+	/// Fills a square lookup table with atan2 angles measured relative to the center of the table.
+	/// @param cpuTable is the caller-allocated output buffer, it receives (2*rmax+1) * (2*rmax+1) values
+	/// @param rmax is the half-width of the table, so the center element sits at (rmax, rmax)
+	template<typename T>
+	void atan_2(T* cpuTable, unsigned int rmax) {
+		const int side = 2 * rmax + 1;						//the table is square: width == height
+		const int center = rmax;							//index of the center element along either axis
+		for (int row = 0; row < side; row++) {				//walk through the table one row at a time
+			const int off_y = center - row;					//vertical offset between the center and this row
+			T* line = cpuTable + row * side;				//first element of the current row
+			for (int col = 0; col < side; col++) {
+				const int off_x = center - col;				//horizontal offset between the center and this column
+				line[col] = atan2((T)off_y, (T)off_x);		//store the angle of the offset vector
+			}
+		}
+	}
+
+	/// CUDA kernel that inverts a 2D image in place: every pixel value v is replaced by (255 - v).
+	/// Each thread handles the single pixel given by its 2D thread/block index; threads whose
+	/// coordinates fall outside of the x-by-y image do nothing.
+	/// @param gpuI points to the image (x * y values), it is modified in place
+	/// @param x is the width of the image in pixels
+	/// @param y is the height of the image in pixels
+	template<typename T>
+	__global__ void cuda_invert(T* gpuI, size_t x, size_t y) {
+		const size_t col = blockIdx.x * blockDim.x + threadIdx.x;	//column handled by this thread
+		const size_t row = blockIdx.y * blockDim.y + threadIdx.y;	//row handled by this thread
+
+		if (col < x && row < y) {									//only touch pixels inside of the image
+			T* pixel = gpuI + (row * x + col);						//1D location of the pixel
+			*pixel = 255 - *pixel;									//invert the pixel intensity
+		}
+	}
+
+
+
+	/// Calculates a threshold using Otsu's method: th_num evenly spaced candidate thresholds between the
+	/// minimum and the maximum value are tested, and the candidate that maximizes the between-class term
+	/// n_below * n_over * (mean_below - mean_over)^2 is returned (the first one if several candidates tie).
+	/// @param pts is the array of values to be thresholded
+	/// @param pixels is the number of values stored in pts
+	/// @param th_num is the number of candidate thresholds that are evaluated
+	/// @return the selected threshold; the minimum value if all values are equal or th_num is 0,
+	///         and 0 if there is no input at all
+	template<typename T>
+	T th_otsu(T* pts, size_t pixels, unsigned int th_num = 20) {
+		if (!pts || pixels == 0) return T(0);		//no data: there is nothing to threshold
+
+		T Imax = pts[0];				//initialize the maximum value to the first one
+		T Imin = pts[0];				//initialize the minimum value to the first one
+		for (size_t n = 1; n < pixels; n++) {		//find both extremes in a single pass
+			if (pts[n] > Imax) Imax = pts[n];
+			if (pts[n] < Imin) Imin = pts[n];
+		}
+
+		//a flat input or an empty candidate list leaves nothing to choose from
+		//(returning here also avoids the divisions by zero further down)
+		if (th_num == 0 || !(Imin < Imax)) return Imin;
+
+		T th_step = ((Imax - Imin) / th_num);
+		unsigned int th_idx = 0;			//index of the best candidate found so far
+		double best_var = -1.0;				//every real between-class term is >= 0, so the first valid candidate always wins
+		for (unsigned int t0 = 0; t0 < th_num; t0++) {
+			T th = t0 * th_step + Imin;
+			size_t n_b = 0, n_o = 0;		//number of elements that are below and over the threshold
+			double s_b = 0.0, s_o = 0.0;	//sum of the values in each cluster (kept in double to limit rounding)
+			for (size_t idx = 0; idx < pixels; idx++) {
+				if (pts[idx] <= th) {
+					s_b += (double)pts[idx];
+					n_b += 1;
+				}
+				else {
+					s_o += (double)pts[idx];
+					n_o += 1;
+				}
+			}
+
+			if (n_b == 0 || n_o == 0) continue;		//an empty cluster has no mean, skip this candidate
+
+			double diff = s_b / (double)n_b - s_o / (double)n_o;	//difference between the two cluster means
+			//the counts are multiplied as doubles: their product does not fit into 32 bits for common image sizes
+			double var_b = (double)n_b * (double)n_o * diff * diff;
+			if (var_b > best_var) {					//strict comparison keeps the first candidate on ties
+				best_var = var_b;
+				th_idx = t0;
+			}
+		}
+
+		T threshold = Imin + (T)(th_idx * th_step);
+		return threshold;
+	}
+
+	/// Performs the 2D iterative voting algorithm on an image stored on the gpu.
+	/// The result (the local maxima of the final vote image, optionally binarized) overwrites gpuI.
+	/// @param gpuI is the x-by-y input image on the gpu, it is replaced by the output
+	/// @param rmax is the half-width of the precomputed atan2 table and is passed on to the voting functions
+	/// @param x and y are the spatial dimensions of the image
+	/// @param invert specifies if the image is inverted (255 - value) before voting
+	/// @param t is the threshold applied to the output: if t > 0, values above t become 1 and all others 0
+	/// @param iter is the number of voting iterations
+	/// @param phi is the initial angle passed to the voting functions, it is reduced by phi/iter after every iteration
+	/// @param conn is passed to gpu_local_max() as the neighborhood parameter
+	/// @param debug is passed to the voting functions to enable their debugging output
+	/// NOTE(review): the gradient and vote buffers are typed float while their sizes use sizeof(T),
+	/// so this function appears to assume that T is float - confirm against the callers.
+	template<typename T>
+	void gpu_ivote2(T* gpuI, unsigned int rmax, size_t x, size_t y, bool invert = false, T t = 0, int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8, bool debug = false) {
+
+		size_t pixels = x * y;				//compute the size of input image
+		//
+		if (invert) {						//if inversion is required call the kernel to invert the image
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads));
+			dim3 blocks((unsigned int)x / threads.x + 1, (unsigned int)y / threads.y + 1);
+			cuda_invert <<< blocks, threads >>> (gpuI, x, y);
+		}
+		//
+		size_t table_side = 2 * (size_t)rmax + 1;										//width and height of the atan2 table
+		size_t table_bytes = table_side * table_side * sizeof(T);						//size of the atan2 table (integer math, no rounding)
+		T* cpuTable = (T*)malloc(table_bytes);											//assign memory on the cpu for atan2 table
+		if (cpuTable == NULL) {
+			std::cout << "Error: unable to allocate the atan2 table in gpu_ivote2()." << std::endl;
+			exit(1);
+		}
+		atan_2<T>(cpuTable, rmax);														//call the function to precompute the atan2 table
+		T* gpuTable;  HANDLE_ERROR(cudaMalloc(&gpuTable, table_bytes));
+		HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, table_bytes, cudaMemcpyHostToDevice));	//copy atan2 table to the gpu
+		free(cpuTable);																	//the cpu copy of the table is no longer needed
+
+		size_t bytes = pixels* sizeof(T);													//calculate the bytes of the input
+		float dphi = phi / iter;															//change in phi for each iteration
+
+		float* gpuGrad; HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes * 2));									//allocate space to store the 2D gradient
+		float* gpuVote; HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));										//allocate space to store the vote image
+
+		stim::cuda::gpu_gradient_2d<float>(gpuGrad, gpuI, x, y);			//calculate the 2D gradient
+		stim::cuda::gpu_cart2polar<float>(gpuGrad, x, y);					//convert cartesian coordinate of gradient to the polar
+
+		for (int i = 0; i < iter; i++) {														//for each iteration
+			HANDLE_ERROR(cudaMemset(gpuVote, 0, bytes));									//reset the vote image to 0
+			stim::cuda::gpu_vote<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y, debug);		//perform voting
+			stim::cuda::gpu_update_dir<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y, debug);	//update the voter directions
+			phi = phi - dphi;																//decrement phi
+		}
+		stim::cuda::gpu_local_max<float>(gpuI, gpuVote, conn, x, y);				//calculate the local maxima
+
+		//release the temporary gpu buffers (previously they were leaked on every call)
+		HANDLE_ERROR(cudaFree(gpuTable));
+		HANDLE_ERROR(cudaFree(gpuGrad));
+		HANDLE_ERROR(cudaFree(gpuVote));
+
+		if (t > 0) {
+			T* pts = (T*)malloc(bytes);													//allocate memory on the cpu to store the output of iterative voting
+			if (pts == NULL) {
+				std::cout << "Error: unable to allocate the output buffer in gpu_ivote2()." << std::endl;
+				exit(1);
+			}
+			HANDLE_ERROR(cudaMemcpy(pts, gpuI, bytes, cudaMemcpyDeviceToHost));			//copy the output from gpu to the cpu memory
+
+			T threshold;
+			threshold = t;
+
+			size_t ind;
+			for (size_t ix = 0; ix < x; ix++) {
+				for (size_t iy = 0; iy < y; iy++) {
+					ind = iy * x + ix;
+					if (pts[ind] > threshold) {
+						pts[ind] = 1;
+					}
+					else pts[ind] = 0;
+				}
+			}
+			HANDLE_ERROR(cudaMemcpy(gpuI, pts, bytes, cudaMemcpyHostToDevice));		//copy the points to the gpu
+			free(pts);																	//release the temporary cpu copy
+		}
+				
+	}
+
+
+	/// Performs the 2D iterative voting algorithm on an image stored on the cpu.
+	/// The image is copied to the gpu, processed by gpu_ivote2() and the result is copied back into cpuI.
+	/// @param cpuI is the x-by-y input image, it is replaced by the output
+	/// @param rmax, x, y, invert, t, iter, phi, conn and debug are passed unchanged to gpu_ivote2()
+	/// @param gpu_time receives the value returned by gpuTimer_end(), covering the gpu allocation,
+	///        both copies and the voting itself
+	template<typename T>
+	void cpu_ivote2(T* cpuI, unsigned int rmax, size_t x, size_t y, float &gpu_time, bool invert = false, T t = 0, int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8, bool debug = false) {
+		size_t bytes = x*y * sizeof(T);
+		T* gpuI;						//allocate space on the gpu to save the input image
+
+		gpuTimer_start();
+		HANDLE_ERROR(cudaMalloc(&gpuI, bytes));
+		HANDLE_ERROR(cudaMemcpy(gpuI, cpuI, bytes, cudaMemcpyHostToDevice));		//copy the image to the gpu
+		stim::gpu_ivote2<T>(gpuI, rmax, x, y, invert, t, iter, phi, conn, debug);				//call the gpu version of the ivote
+		HANDLE_ERROR(cudaMemcpy(cpuI, gpuI, bytes, cudaMemcpyDeviceToHost));		//copy the output to the cpu
+
+		gpu_time = gpuTimer_end();
+
+		HANDLE_ERROR(cudaFree(gpuI));	//release the gpu copy of the image (done after the timer so the measured span is unchanged)
+	}
+}
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_ITER_VOTE2_H
+#define STIM_CUDA_ITER_VOTE2_H
+
+//extern bool DEBUG;
+
+#include "update_dir_bb.cuh"
+#include "vote_atomic_bb.cuh"
+
+namespace stim{
+	namespace cuda{
+	
+	}
+}
+
+
+
+#endif
 \ No newline at end of file
@@ -10,7 +10,7 @@ namespace stim{
 		// this kernel calculates the local maximum for finding the cell centers
 		template<typename T>
-		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, int conn, int x, int y){
+		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x,  int y){
 			// calculate the 2D coordinates for this current thread.
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -20,30 +20,21 @@ namespace stim{
 				return;
 			// convert 2D coordinates to 1D
-			int i = yi * x + xi;			
+		    int i = yi * x + xi;
 			gpuCenters[i] = 0;		//initialize the value at this location to zero
 			T val = gpuVote[i];
-			//compare to the threshold
-			if(val < final_t) return;
-			
-			//define an array to store indices with same vote value
-			/*int * IdxEq;
-			IdxEq = new int  [2*conn];
-			int n = 0;*/
-			
-			for(int xl = xi - conn; xl < xi + conn; xl++){
-				for(int yl = yi - conn; yl < yi + conn; yl++){
+			for(unsigned int xl = xi - conn; xl < xi + conn; xl++){
+				for(unsigned int yl = yi - conn; yl < yi + conn; yl++){
 					if(xl >= 0 && xl < x && yl >= 0 && yl < y){
-						int il = yl * x + xl;
+						unsigned int il = yl * x + xl;
 						if(gpuVote[il] > val){							
 							return;
 							}
 						if (gpuVote[il] == val){
-							/*IdxEq[n] = il;
-							n = n+1;*/
+							
 							 if( il > i){
 								 return;
 							}
@@ -51,29 +42,25 @@ namespace stim{
 					}							
 				}
 			}
-			/*if (n!=0){
-				if(IdxEq[n/2] !=i){
-					return;
-				}
-			}	*/	
-			gpuCenters[i] = 1;
+			
+			gpuCenters[i] = gpuVote[i];
 		}
 		template<typename T>
-		void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
+		void gpu_local_max(T* gpuCenters, T* gpuVote,  unsigned int conn, unsigned int x, unsigned int y){
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			/*dim3 threads(max_threads, 1);
 			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/
-			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
-			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+			dim3 threads((unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
+			dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
 			//call the kernel to find the local maximum.
-			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
+			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, conn, x, y);
 		}
 		template<typename T>
-		void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
+		void cpu_local_max(T* cpuCenters, T* cpuVote, unsigned int conn, unsigned int x, unsigned int y){
 			//calculate the number of bytes in the array
 			unsigned int bytes = x * y * sizeof(T);
@@ -90,7 +77,7 @@ namespace stim{
 			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
 			//call the GPU version of the local max function
-			gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y);
+			gpu_local_max<T>(gpuCenters, gpuVote, conn, x, y);
 			//copy the cell centers data to the CPU
 			cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ;
@@ -40,8 +40,9 @@ namespace stim{
 			bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi)));		//insert the final corner into the bounding box
 			int x_table = 2*rmax +1;
-			int lut_i;
 			T rmax_sq = rmax * rmax;
+
+			int lut_i;
 			T dx_sq, dy_sq;
 			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
@@ -49,11 +50,12 @@ namespace stim{
 			int by, bx;
 			int dx, dy;													//coordinate relative to (xi, yi)
+			
 			T v;
 			T max_v = 0;												//initialize the maximum vote value to zero
 			T alpha;
-			int max_dx = bb.low[0];
-			int max_dy = bb.low[1];
+			int max_dx = bb.low[0] - xi;
+			int max_dy = bb.low[1] - yi;
 			for(by = bb.low[1]; by <= bb.high[1]; by++){					//for each element in the bounding box
 				dy = by - yi;											//calculate the y coordinate of the current point relative to yi
 				dy_sq = dy * dy;
@@ -79,26 +81,26 @@ namespace stim{
 		// this kernel updates the gradient direction by the calculated voting direction.
 		template<typename T>
-		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, size_t x, size_t y){
 			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
+			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
 			if(xi >= x || yi >= y) return;
 			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
+			size_t i = yi * x + xi;
 			//update the gradient image with the vote direction
 			gpuGrad[2*i] = gpuDir[i];
 		}
 		template<typename T>
-		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y, bool DEBUG = false){
 			//calculate the number of bytes in the array
-			unsigned int bytes = x * y * sizeof(T);
+			size_t bytes = x * y * sizeof(T);
 			// allocate space on the GPU for the updated vote direction
 			T* gpuDir;
@@ -106,14 +108,14 @@ namespace stim{
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
-			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+			dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
+			dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
 			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
 			//size_t curtain = 2 * rmax;
 			//size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain);
 			size_t shared_mem_req = table_bytes;// + template_bytes;
-			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
+			if (DEBUG) std::cout << "Shared Memory required: " << shared_mem_req << std::endl;
 			size_t shared_mem = stim::sharedMemPerBlock();
 			if(shared_mem_req > shared_mem){
@@ -122,16 +124,10 @@ namespace stim{
 			}
 			//call the kernel to calculate the new voting direction
-			cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
-			//stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
-
-			//exit(0);
-
-			//threads = dim3( sqrt(max_threads), sqrt(max_threads) );
-			//blocks = dim3(x/threads.x + 1, y/threads.y + 1);
+			cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, (int)x , (int)y);
 			//call the kernel to update the gradient direction
-			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, (int)x , (int)y);
 			//free allocated memory
 			HANDLE_ERROR( cudaFree(gpuDir) );
@@ -9,12 +9,14 @@
 #include <stim/visualization/colormap.h>
 #include <math.h>
+
+
 namespace stim{
 	namespace cuda{
 		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
 		template<typename T>
-		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
+		__global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, size_t x, size_t y, bool gradmag = true){
 			extern __shared__ T S[];
 			T* shared_atan = S;
@@ -22,12 +24,12 @@ namespace stim{
 			stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
 			// calculate the 2D coordinates for this current thread.
-			int xi = blockIdx.x * blockDim.x + threadIdx.x;
-			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+			size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
+			size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
 			if(xi >= x || yi >= y) return;			
 			// convert 2D coordinates to 1D
-			int i = yi * x + xi;
+			size_t i = yi * x + xi;
 			// calculate the voting direction based on the grtadient direction
 			float theta = gpuGrad[2*i];
@@ -50,7 +52,7 @@ namespace stim{
 			bb.trim_low(0, 0);															//make sure the bounding box doesn't go outside the image
 			bb.trim_high(x-1, y-1);
-			int by, bx;
+			size_t by, bx;
 			int dx, dy;					
 			unsigned int ind_g;											//initialize the maximum vote value to zero
@@ -66,7 +68,8 @@ namespace stim{
 					alpha = shared_atan[lut_i];
 					if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
 						ind_g = (by)*x + (bx);
-						atomicAdd(&gpuVote[ind_g], mag);
+						if(gradmag) atomicAdd(&gpuVote[ind_g], mag);			//add the gradient magnitude (if the gradmag flag is enabled)
+						else		atomicAdd(&gpuVote[ind_g], 1.0f);			//otherwise just add 1
 					}
 				}
@@ -75,24 +78,30 @@ namespace stim{
 		}
+		/// Iterative voting for an image
+		/// @param gpuVote is the resulting vote image
+		/// @param gpuGrad is the gradient of the input image
+		/// @param gpuTable is the pre-computed atan2() table
+		/// @param phi is the angle of the vote region
+		/// @param rmax is the estimated radius of the blob (defines the "width" of the vote region)
+		/// @param x and y are the spatial dimensions of the gradient image
+		/// @param gradmag defines whether or not the gradient magnitude is taken into account during the vote
 		template<typename T>
-		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
-
-							
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, size_t x, size_t y, bool DEBUG = false, bool gradmag = true){
 			unsigned int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads( sqrt(max_threads), sqrt(max_threads) );
-			dim3 blocks(x/threads.x + 1, y/threads.y + 1);
+			dim3 threads( (unsigned int)sqrt(max_threads), (unsigned int)sqrt(max_threads) );
+			dim3 blocks((unsigned int)x/threads.x + 1, (unsigned int)y/threads.y + 1);
 			size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
 			size_t shared_mem_req = table_bytes;// + template_bytes;
-			std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;		
+			if (DEBUG) std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
 			size_t shared_mem = stim::sharedMemPerBlock();
 			if(shared_mem_req > shared_mem){
-				std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
+				std::cout<<"Error: insufficient shared memory for this implementation of cuda_vote()."<<std::endl;
 				exit(1);
 			}
 			//call the kernel to do the voting
-			cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+			cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y, gradmag);
 		}
+#ifndef STIM_BMP_H
+#define STIM_BMP_H
+
+#include <fstream>
+#include <iostream>
+
+namespace stim {
+#pragma pack(push, 1)
+	// Byte packing is scoped to the file header only (push/pop) so that it does not leak into
+	// the rest of this header or into any file included afterwards. BITMAPFILEHEADER is the one
+	// structure here that needs it: a 2-byte field is immediately followed by a 4-byte field,
+	// and the on-disk layout is 14 bytes with no alignment gap.
+	typedef unsigned int DWORD;
+	typedef unsigned short WORD;
+	typedef signed int LONG;
+	typedef struct tagBITMAPFILEHEADER {
+		WORD  bfType;						//file signature
+		DWORD bfSize;						//total size of the file (in bytes)
+		WORD  bfReserved1;
+		WORD  bfReserved2;
+		DWORD bfOffBits;					//offset from the start of the file to the pixel data
+	} BITMAPFILEHEADER, *PBITMAPFILEHEADER;
+#pragma pack(pop)
+
+	const unsigned int DIB_BITMAPCOREHEADER = 12;
+	const unsigned int DIB_OS21XBITMAPHEADER = 16;
+	const unsigned int DIB_BITMAPINFOHEADER = 40;
+	const unsigned int DIB_BITMAPV2INFOHEADER = 52;
+	const unsigned int DIB_BITMAPV3INFOHEADER = 56;
+	const unsigned int DIB_OS22XBITMAPHEADER = 64;
+	const unsigned int DIB_BITMAPV4HEADER = 108;
+	const unsigned int DIB_BITMAPV5HEADER = 124;
+
+	typedef struct tagBITMAPCOREHEADER {
+		DWORD bcSize;
+		WORD  bcWidth;
+		WORD  bcHeight;
+		WORD  bcPlanes;
+		WORD  bcBitCount;
+	} BITMAPCOREHEADER, *PBITMAPCOREHEADER;
+
+	typedef struct tagBITMAPINFOHEADER {
+		DWORD biSize;											//40 bytes
+		LONG  biWidth;
+		LONG  biHeight;
+		WORD  biPlanes;
+		WORD  biBitCount;
+		DWORD biCompression;
+		DWORD biSizeImage;
+		LONG  biXPelsPerMeter;
+		LONG  biYPelsPerMeter;
+		DWORD biClrUsed;
+		DWORD biClrImportant;
+	} BITMAPINFOHEADER, *PBITMAPINFOHEADER;
+
+	// From FileFormat.info
+	typedef struct {
+		DWORD Size;            /* Size of this header in bytes */
+		LONG  Width;           /* Image width in pixels */
+		LONG  Height;          /* Image height in pixels */
+		WORD  Planes;          /* Number of color planes */
+		WORD  BitsPerPixel;    /* Number of bits per pixel */
+		DWORD Compression;     /* Compression methods used */
+		DWORD SizeOfBitmap;    /* Size of bitmap in bytes */
+		LONG  HorzResolution;  /* Horizontal resolution in pixels per meter */
+		LONG  VertResolution;  /* Vertical resolution in pixels per meter */
+		DWORD ColorsUsed;      /* Number of colors in the image */
+		DWORD ColorsImportant; /* Minimum number of important colors */
+							   /* Fields added for Windows 4.x follow this line */
+
+		DWORD RedMask;       /* Mask identifying bits of red component */
+		DWORD GreenMask;     /* Mask identifying bits of green component */
+		DWORD BlueMask;      /* Mask identifying bits of blue component */
+		DWORD AlphaMask;     /* Mask identifying bits of alpha component */
+		DWORD CSType;        /* Color space type */
+		LONG  RedX;          /* X coordinate of red endpoint */
+		LONG  RedY;          /* Y coordinate of red endpoint */
+		LONG  RedZ;          /* Z coordinate of red endpoint */
+		LONG  GreenX;        /* X coordinate of green endpoint */
+		LONG  GreenY;        /* Y coordinate of green endpoint */
+		LONG  GreenZ;        /* Z coordinate of green endpoint */
+		LONG  BlueX;         /* X coordinate of blue endpoint */
+		LONG  BlueY;         /* Y coordinate of blue endpoint */
+		LONG  BlueZ;         /* Z coordinate of blue endpoint */
+		DWORD GammaRed;      /* Gamma red coordinate scale value */
+		DWORD GammaGreen;    /* Gamma green coordinate scale value */
+		DWORD GammaBlue;     /* Gamma blue coordinate scale value */
+	} WIN4XBITMAPHEADER;
+
+	typedef struct {
+		DWORD        bV5Size;
+		LONG         bV5Width;
+		LONG         bV5Height;
+		WORD         bV5Planes;
+		WORD         bV5BitCount;
+		DWORD        bV5Compression;
+		DWORD        bV5SizeImage;
+		LONG         bV5XPelsPerMeter;
+		LONG         bV5YPelsPerMeter;
+		DWORD        bV5ClrUsed;
+		DWORD        bV5ClrImportant;
+		DWORD        bV5RedMask;
+		DWORD        bV5GreenMask;
+		DWORD        bV5BlueMask;
+		DWORD        bV5AlphaMask;
+		DWORD        bV5CSType;
+		LONG		 RedX;          /* X coordinate of red endpoint */
+		LONG		 RedY;          /* Y coordinate of red endpoint */
+		LONG		 RedZ;          /* Z coordinate of red endpoint */
+		LONG		 GreenX;        /* X coordinate of green endpoint */
+		LONG		 GreenY;        /* Y coordinate of green endpoint */
+		LONG		 GreenZ;        /* Z coordinate of green endpoint */
+		LONG		 BlueX;         /* X coordinate of blue endpoint */
+		LONG		 BlueY;         /* Y coordinate of blue endpoint */
+		LONG		 BlueZ;         /* Z coordinate of blue endpoint */
+		DWORD        bV5GammaRed;
+		DWORD        bV5GammaGreen;
+		DWORD        bV5GammaBlue;
+		DWORD        bV5Intent;
+		DWORD        bV5ProfileData;
+		DWORD        bV5ProfileSize;
+		DWORD        bV5Reserved;
+	} BITMAPV5HEADER, *PBITMAPV5HEADER;
+
+	
+	//compression methods
+	const unsigned int STIM_BI_RGB = 0;
+	const unsigned int STIM_BI_BITFIELDS = 3;
+
+	/// Reader for uncompressed Windows bitmap (BMP) files.
+	/// Typical use: open() parses the file header and the DIB header, read() copies the pixel rows
+	/// into a caller-provided buffer of at least bytes() bytes, and close() releases the file.
+	class bmp {
+		std::ifstream file;
+	public:
+		unsigned int dib_header_size;						//size of the DIB header (in bytes), used to select the header version
+		size_t bit_pos;					// start position (relative to the beginning of the file) of the bitmap bits
+		size_t total_size;									//total size of the bitmap file (in bytes)
+		size_t width;
+		size_t height;
+		int channels;
+		int bits_per_pixel;
+		unsigned int compression;
+
+		/// Initialize all header fields to zero so that a failed open() never leaves garbage behind
+		bmp() : dib_header_size(0), bit_pos(0), total_size(0), width(0), height(0),
+				channels(0), bits_per_pixel(0), compression(0) {}
+
+		/// Returns the number of bytes of pixel data (without any row padding)
+		size_t bytes() {
+			return width * height * bits_per_pixel / 8;
+		}
+		/// Read the bitmap file header and store the pixel data offset and file size
+		void read_bmpFileHeader() {
+			BITMAPFILEHEADER file_header;
+			file.read((char*)&file_header, sizeof(BITMAPFILEHEADER));
+			bit_pos = file_header.bfOffBits;
+			total_size = file_header.bfSize;
+		}
+		/// Read a 12-byte core header (this header has no compression field, so none is assumed)
+		void read_bmpCoreHeader() {
+			tagBITMAPCOREHEADER header;
+			file.read((char*)&header, sizeof(tagBITMAPCOREHEADER));
+			width = header.bcWidth;
+			height = header.bcHeight;
+			bits_per_pixel = header.bcBitCount;
+			compression = 0;
+		}
+		/// Read a 40-byte info header
+		void read_bmpInfoHeader() {
+			tagBITMAPINFOHEADER info_header;
+			file.read((char*)&info_header, sizeof(tagBITMAPINFOHEADER));
+			width = info_header.biWidth;
+			height = info_header.biHeight;
+			bits_per_pixel = info_header.biBitCount;
+			compression = info_header.biCompression;
+		}
+		/// Read a 108-byte (version 4) header
+		void read_bmpV4Header() {
+			WIN4XBITMAPHEADER header;
+			file.read((char*)&header, sizeof(WIN4XBITMAPHEADER));
+			width = header.Width;
+			height = header.Height;
+			bits_per_pixel = header.BitsPerPixel;
+			compression = header.Compression;
+		}
+		/// Read a 124-byte (version 5) header
+		void read_bmpV5Header() {
+			BITMAPV5HEADER header;
+			file.read((char*)&header, sizeof(BITMAPV5HEADER));
+			width = header.bV5Width;
+			height = header.bV5Height;
+			bits_per_pixel = header.bV5BitCount;
+			compression = header.bV5Compression;
+		}
+		/// Read the DIB information header. The first 4 bytes give the header size, which selects
+		/// the header version; the stream is rewound so the version-specific reader sees the whole header.
+		void read_dib() {								//read the bitmap DIB information header
+			std::streamoff header_pos = file.tellg();
+			file.read((char*)&dib_header_size, sizeof(unsigned int));
+			file.seekg(header_pos);
+			switch (dib_header_size) {
+			case DIB_BITMAPCOREHEADER: read_bmpCoreHeader(); break;
+			case DIB_BITMAPINFOHEADER: read_bmpInfoHeader(); break;
+			case DIB_BITMAPV4HEADER: read_bmpV4Header(); break;
+			case DIB_BITMAPV5HEADER: read_bmpV5Header(); break;
+			default:
+				std::cout << "stim::bmp ERROR: this bitmap header format isn't supported" << std::endl;
+				exit(1);
+			}
+		}
+
+		/// Open the bitmap file and read the header data
+		/// @param filename is the name of the bitmap file
+		/// @return true if the headers were read and the file is uncompressed, false otherwise
+		bool open(std::string filename) {								//open the bitmap file and read the header data
+			file.open(filename, std::ifstream::binary);
+			if (!file) {
+				std::cout << "stim::bmp ERROR: error opening file: " << filename.c_str() << std::endl;
+				return false;
+			}
+			read_bmpFileHeader();										//read the file header
+			if (!file) {												//a truncated file would otherwise leave garbage in the header fields
+				std::cout << "stim::bmp ERROR: unable to read the bitmap file header: " << filename.c_str() << std::endl;
+				return false;
+			}
+			read_dib();
+			if (!file) {
+				std::cout << "stim::bmp ERROR: unable to read the bitmap information header: " << filename.c_str() << std::endl;
+				return false;
+			}
+			if (compression != STIM_BI_RGB) {								//check for compression
+				std::cout << "stim::bmp ERROR: this file is compressed, and compression is not supported" << std::endl;
+				return false;
+			}
+			return true;
+		}
+		/// Close the bitmap file
+		void close() {
+			file.close();
+		}
+
+		/// Copy the bitmap data into a pre-allocated array
+		/// Rows are written to dst in reverse file order (the first row in the file becomes the last row in dst).
+		/// @param dst is a buffer of at least bytes() bytes
+		/// @return true if every row was read, false if the stream failed at any point
+		bool read(char* dst){
+			file.seekg(bit_pos);						//seek to the beginning of the data array
+			size_t row_bytes = width * bits_per_pixel / 8;	//number of bytes in each row
+			size_t padding = (4 - row_bytes % 4) % 4;		//bytes added on disk to round each row up to a multiple of 4
+
+			if(!file){
+				std::cout<<"stim::bmp ERROR: could not read array from file."<<std::endl;
+				return false;
+			}
+			for (size_t h = 0; h < height; h++) {				//for each row in the image
+				file.read(dst + (height - h - 1) * row_bytes, row_bytes);		//read the row of image data
+				if (!file) {									//stop at the first failed row instead of reporting success
+					if (file.eof())	std::cout << "stim::bmp ERROR: array size incorrect, end of file reached while reading bitmap." << std::endl;
+					else if (file.bad()) std::cout << "stim::bmp ERROR: stream integrity failed while reading bitmap array" << std::endl;
+					else std::cout << "stim::bmp ERROR: reading bitmap array failed." << std::endl;
+					return false;
+				}
+				file.seekg(padding, std::ios::cur);				//seek to the end of the row on disk
+			}
+			return true;
+		}
+	};
+
+	/// Save an interleaved 24-bit image as an uncompressed bitmap with a 12-byte core header.
+	/// Rows are written in reverse order (the last row of the buffer is written first) and each
+	/// row on disk is padded with zeros to a multiple of 4 bytes.
+	/// @param filename is the name of the output file
+	/// @param bits points to width * height * 3 bytes of pixel data
+	/// @param width and height are the image dimensions in pixels (each must fit in 16 bits)
+	/// @return true if the file was written successfully, false otherwise
+	/// Declared inline because this definition lives in a header that can be included by several translation units.
+	inline bool save_bmp(std::string filename, char* bits, size_t width, size_t height) {
+		const size_t bits_per_pixel = 24;
+		const size_t row_bytes = width * bits_per_pixel / 8;	//number of bytes in each row
+		const size_t padding = (4 - row_bytes % 4) % 4;			//bytes needed to round each row up to a multiple of 4
+
+		if (width > 0xFFFF || height > 0xFFFF) {				//the core header stores the dimensions as 16-bit values
+			std::cout << "stim::bmp ERROR: image dimensions are too large for this bitmap header" << std::endl;
+			return false;
+		}
+
+		tagBITMAPFILEHEADER file_header = {};													//initialize the file header structure to zero
+		file_header.bfOffBits = sizeof(tagBITMAPFILEHEADER) + sizeof(tagBITMAPCOREHEADER);		//the offset includes both the file and DIB header
+		file_header.bfSize = (stim::DWORD)(file_header.bfOffBits + (row_bytes + padding) * height);			//calculate the size of the bitmap file
+		file_header.bfType = (stim::WORD)0x4D42;												//file signature ("BM")
+
+		tagBITMAPCOREHEADER info_header = {};													//initialize the info header to zero
+		info_header.bcBitCount = (stim::WORD)bits_per_pixel;
+		info_header.bcHeight = (stim::WORD)height;
+		info_header.bcWidth = (stim::WORD)width;
+		info_header.bcSize = sizeof(tagBITMAPCOREHEADER);
+		info_header.bcPlanes = 1;
+
+		std::ofstream outfile(filename, std::ios::binary);										//open the output file for binary writing
+		if (!outfile) {
+			std::cout << "stim::bmp ERROR: error opening file: " << filename.c_str() << std::endl;
+			return false;
+		}
+		outfile.write((char*)&file_header, sizeof(tagBITMAPFILEHEADER));						//write the file header
+		outfile.write((char*)&info_header, sizeof(tagBITMAPCOREHEADER));						//write the information header
+
+		const char pad[3] = { 0, 0, 0 };														//padding is never more than 3 bytes
+		for (size_t h = 0; h < height; h++) {
+			outfile.write((char*)(bits + (height - h - 1) * row_bytes), row_bytes);				//write the bitmap data
+			if (padding != 0) outfile.write(pad, padding);										//write the row padding
+		}
+		return outfile.good();																	//report any write failure to the caller
+	}
+}
+
+#endif
 \ No newline at end of file
 #ifndef STIM_IMAGE_H
 #define STIM_IMAGE_H
+#ifdef _WIN32
+#undef max
+#endif
+
 #ifdef USING_OPENCV
-	#include <opencv2/core/core.hpp>
-	#include <opencv2/highgui/highgui.hpp>
+	//#include <opencv2/core/core.hpp>
+	//#include <opencv2/highgui/highgui.hpp>
+	#include <opencv2/opencv.hpp>
+#else
+	#include <stim/image/bmp.h>
 #endif
 #include <vector>
 #include <iostream>
-#include <limits>
+#include <limits>							//use limits and remove the MIN and MAX macros
 #include <typeinfo>
 #include <fstream>
+#include <cstring>
+
+
+#include <stim/parser/filename.h>
 namespace stim{
 /// This static class provides the STIM interface for loading, saving, and storing 2D images.
@@ -45,6 +56,10 @@ class image{
 	void allocate(){
 		unalloc();
 		img = (T*) malloc( sizeof(T) * R[0] * R[1] * R[2] );	//allocate memory
+		if (img == NULL) {
+			std::cout << "stim::image ERROR - failed to allocate memory for image" << std::endl;
+			exit(1);
+		}
 	}
 	void allocate(size_t x, size_t y, size_t c){	//allocate memory based on the resolution
@@ -52,8 +67,6 @@ class image{
 		allocate();									//allocate memory
 	}
-	size_t bytes(){ return size() * sizeof(T); }
-
 	inline size_t idx(size_t x, size_t y, size_t c = 0) const {
 		return y * R[0] * R[1] + x * R[0] + c;
 	}
@@ -74,23 +87,16 @@ class image{
 #endif
 	/// Returns the value for "white" based on the dynamic range (assumes white is 1.0 for floating point images)
 	T white(){
-
-		if(typeid(T) == typeid(unsigned char))		return UCHAR_MAX;
-		if(typeid(T) == typeid(unsigned short))		return SHRT_MAX;
-		if(typeid(T) == typeid(unsigned))			return UINT_MAX;
-		if(typeid(T) == typeid(unsigned long))		return ULONG_MAX;
-		if(typeid(T) == typeid(unsigned long long))	return ULLONG_MAX;
-		if(typeid(T) == typeid(float))				return 1.0f;
-		if(typeid(T) == typeid(double))				return 1.0;
-
-		std::cout<<"ERROR in stim::image::white - no white value known for this data type"<<std::endl;
-		exit(1);
-
+		if (typeid(T) == typeid(double) || typeid(T) == typeid(float))
+			return (T)1.0;
+		else
+			return std::numeric_limits<T>::max();
 	}
-
 public:
+	size_t bytes() { return size() * sizeof(T); }
+
 	/// Default constructor - creates an empty image object
 	image(){ init(); }							//initialize all variables to zero, don't allocate any memory
@@ -125,19 +131,49 @@ public:
 		free(img);
 	}
-	///Resize an image
+	///Resize an image - this function looks like it hasn't been implemented
 	void resize(size_t x, size_t y, size_t c = 1) {
 		allocate(x, y, c);
 	}
 	stim::image<T>& operator=(const stim::image<T>& I){
-		init();
 		if(&I == this)									//handle self-assignment
 			return *this;
+		init();
 		allocate(I.X(), I.Y(), I.C());
 		memcpy(img, I.img, bytes());
 		return *this;
 	}
+#ifndef USING_OPENCV
+	/// Load a 24-bit uncompressed bitmap file into this image (3 channels).
+	/// Prints an error and exits if the file cannot be opened, is not 24 bits per pixel, or cannot be read.
+	/// @param filename is the name of the bitmap file
+	void load_bmp(std::string filename) {
+		stim::bmp bitmap;
+		if (!bitmap.open(filename)) {										//load the bitmap and read the headers
+			std::cout << "stim::image ERROR: problem loading bitmap image." << std::endl;
+			exit(1);
+		}
+		if (bitmap.bits_per_pixel != 24) {									//the image is allocated with 3 channels, so any other depth would not fit the buffer
+			std::cout << "stim::image ERROR: only 24-bit bitmap images are supported." << std::endl;
+			exit(1);
+		}
+		resize(bitmap.width, bitmap.height, 3);							//resize the current image to match the bitmap
+		if (!bitmap.read((char*)img)) {										//read the bits from file
+			std::cout << "stim::image ERROR: problem loading bitmap image." << std::endl;
+			exit(1);
+		}
+		bitmap.close();													//close the bitmap file
+	}
+#endif
+
+	//determines if a filename represents a valid file format that can be loaded/saved
+	/// @param f is the file name to test (only the extension is examined)
+	/// @return true if the extension is one of the recognized image formats
+	/// "ppm" is accepted in both configurations because it is the extension that load() and
+	/// save() dispatch on when OpenCV is not available.
+	static bool test_filename(std::string f) {
+		stim::filename fname = f;
+		std::string ext = fname.extension();
+#ifdef USING_OPENCV
+		if (ext == "bmp" ||
+			ext == "jpg" ||
+			ext == "png" ||
+			ext == "pbm" ||
+			ext == "ppm" ||
+			ext == "tif" )
+			return true;
+#else
+		if (ext == "pbm" || ext == "ppm" || ext == "bmp")
+			return true;
+#endif
+		return false;
+	}
 	//save a Netpbm file
 	void load_netpbm(std::string filename) {
@@ -146,10 +182,6 @@ public:
 			std::cout << "Error opening input file in image::load_netpbm()" << std::endl;
 			exit(1);
 		}
-		if (sizeof(T) != 1) {
-			std::cout << "Error in image::load_netpbm() - data type must be 8-bit integer." << std::endl;
-			exit(1);
-		}
 		size_t nc;													//allocate space for the number of channels
 		char format[2];												//allocate space to hold the image format tag
@@ -196,9 +228,12 @@ public:
 		}
 		size_t h = atoi(sh.c_str());					//convert the string into an integer
-		allocate(w, h, nc);								//allocate space for the image
-		infile.read((char*)img, size());						//copy the binary data from the file to the image
-		infile.close();
+		allocate(w, h, nc);													//allocate space for the image
+		unsigned char* buffer = (unsigned char*)malloc(w * h * nc);			//create a buffer to store the read data
+		infile.read((char*)buffer, size());									//copy the binary data from the file to the image
+		infile.close();														//close the file
+		for (size_t n = 0; n < size(); n++) img[n] = (T)buffer[n];			//copy the buffer data into the image
+		free(buffer);														//free the buffer array
 	}
@@ -218,6 +253,14 @@ public:
 		}
 	}
 #endif
+	//Copy N data points from source to dest, casting while doing so
+	/// @param source points to N values of type S
+	/// @param dest points to storage for N values of type D
+	/// @param N is the number of values to copy
+	template<typename S, typename D>
+	void type_copy(S* source, D* dest, size_t N) {
+		if (typeid(S) == typeid(D)) {					//if both types are the same
+			memcpy(dest, source, N * sizeof(S));		//just use a memcpy
+			return;										//and stop here, so the data isn't copied a second time below
+		}
+		for (size_t n = 0; n < N; n++)					//otherwise, iterate through each element
+			dest[n] = (D)source[n];							//copy and cast
+	}
 	/// Load an image from a file
 	void load(std::string filename){
 #ifdef USING_OPENCV
@@ -226,17 +269,24 @@ public:
 			std::cout<<"ERROR stim::image::load() - unable to find image "<<filename<<std::endl;
 			exit(1);
 		}
+		int cv_type = cvImage.type();
 		int cols = cvImage.cols;
 		int rows = cvImage.rows;
 		int channels = cvImage.channels();
 		allocate(cols, rows, channels);			//allocate space for the image
-		unsigned char* cv_ptr = (unsigned char*)cvImage.data;
-		if(C() == 1)														//if this is a single-color image, just copy the data
-			memcpy(img, cv_ptr, bytes());
+		size_t img_bytes = bytes();
+		unsigned char* cv_ptr = (unsigned char*)cvImage.data;		
+		if (C() == 1)														//if this is a single-color image, just copy the data
+			type_copy<unsigned char, T>(cv_ptr, img, size());
+		//memcpy(img, cv_ptr, bytes());
 		if(C() == 3)														//if this is a 3-color image, OpenCV uses BGR interleaving
 			from_opencv(cv_ptr, X(), Y());
 #else
-		load_netpbm(filename);
+		stim::filename file(filename);
+		if (file.extension() == "ppm")
+			load_netpbm(filename);
+		else if (file.extension() == "bmp")
+			load_bmp(filename);
 #endif
 	}
@@ -267,9 +317,20 @@ public:
 		outfile.write((const char*)img, size());			//write the binary data
 		outfile.close();
 	}
+#ifndef USING_OPENCV
+	void save_bmp(std::string filename) {
+		stim::save_bmp(filename, (char*)img, width(), height());
+	}
+#endif
 	//save a file
 	void save(std::string filename){
+		stim::filename file(filename);
+		if (file.extension() == "raw" || file.extension() == "") {
+			std::ofstream outfile(filename.c_str(), std::ios::binary);
+			outfile.write((char*)img, sizeof(T) * R[0] * R[1] * R[2]);
+			outfile.close();
+		}
 #ifdef USING_OPENCV
 		//OpenCV uses an interleaved format, so convert first and then output
 		T* buffer = (T*) malloc(bytes());
@@ -282,10 +343,34 @@ public:
 		cv::imwrite(filename, cvImage);
 		free(buffer);
 #else
-		save_netpbm(filename);
+		if (file.extension() == "ppm")
+			save_netpbm(filename);
+		else if (file.extension() == "bmp")
+			save_bmp(filename);
+		else {
+			std::cout << "stim::image ERROR: File type not supported without OpenCV. Make sure to link OpenCV and define USING_OPENCV" << std::endl;
+			exit(1);
+		}
 #endif
 	}
+	/// Returns an image cast to the specified format
+	/// Values outside the range of the destination type are saturated to the nearest
+	/// representable value instead of being passed to an out-of-range cast.
+	/// @tparam U is the data type of the returned image
+	template<typename U>
+	image<U> convert() {
+
+		image<U> new_image(R[1], R[2], R[0]);					//create a new image with the destination data type
+
+		size_t ni = R[0] * R[1] * R[2];							//calculate the number of data points in the image
+
+		const U umax = (std::numeric_limits<U>::max)();			//largest value of the destination type
+		const U umin = std::numeric_limits<U>::is_integer		//smallest value of the destination type
+			? (std::numeric_limits<U>::min)()					//	(min() is the most negative value for integer types)
+			: (U)(-(std::numeric_limits<U>::max)());			//	(for floating point types, min() is the smallest positive value)
+		const double outmax = (double)umax;
+		const double outmin = (double)umin;
+		for (size_t i = 0; i < ni; i++) {							//for each pixel in the image
+			if (img[i] > outmax) new_image(i) = umax;			//if the source pixel is greater than the maximum destination pixel, set the output to maximum
+			else if (img[i] < outmin) new_image(i) = umin;		//if the source pixel is below the minimum destination pixel, set the output to minimum
+			else new_image(i) = (U)img[i];						//otherwise, copy the source value and cast it to the destination value
+		}
+		return new_image;
+	}
+
+
 	void set_interleaved(T* buffer, size_t width, size_t height, size_t channels){
 		allocate(width, height, channels);
 		memcpy(img, buffer, bytes());
@@ -379,18 +464,92 @@ public:
 		return img[idx(x, y, c)];
 	}
+	/// This function returns a pixel reference based on a 1D index into the image
+	/// @param i is the index into the underlying data array (no bounds checking is performed)
+	T& operator()(size_t i) {
+		return img[i];
+	}
+
 	/// Set all elements in the image to a given scalar value
 	/// @param v is the value used to set all values in the image
-	image<T> operator=(T v){
-
+	void set_all(T v) {														//set all elements of the image to a given value v
 		size_t N = size();
-
-		for(size_t n = 0; n < N; n++)
-			img[n] = v;
-
+		for (size_t n = 0; n < N; n++) img[n] = v;
+	}
+	image<T> operator=(T v){
+		set_all(v);
 		return *this;
+	}
+	/// invert the image, given a specified maximum value (ex. maxval = 255, I' = 255 - I)
+	/*image<T> invert(T maxval) {
+		image<T> result(width(), height(), channels());		//create a new image
+		size_t N = size();									//get the number of elements in the image
+		for (size_t n = 0; n < N; n++)
+			result.data()[n] = maxval - img[n];				//perform the inversion and save the result to the new image
+		return result;
+	}*/
+
+	/// Linearly rescale the image so that its current minimum maps to low and its current maximum maps to high
+	/// @param low is the output value assigned to the smallest input value
+	/// @param high is the output value assigned to the largest input value
+	/// @return a new image; if the input is constant, every value is set to low
+	image<T> stretch(T low, T high) {
+		const T in_hi = maxv();						//largest value currently in the image
+		const T in_lo = minv();						//smallest value currently in the image
+		image<T> out = *this;						//start from a copy of this image
+
+		if (in_hi == in_lo) {						//a constant image has no range to stretch
+			out = low;
+			return out;
+		}
+
+		const T in_span = in_hi - in_lo;			//current range of the image
+		const T out_span = high - low;				//requested range of the image
+		const size_t count = size();				//number of values in the image
+		for (size_t k = 0; k < count; ++k)
+			out.data()[k] = out_span * (img[k] - in_lo) / in_span + low;
+		return out;
+	}
+
+	/// Add a border of width w with the given value around the image
+	/// @param w is the border width in pixels added to each side of the image
+	/// @param value is the pixel value used for the border (all channels will be the same)
+	/// @return a new image of size (width + 2w) x (height + 2w) with the same number of channels
+	image<T> border(size_t w, T value = 0) {
+		image<T> result(width() + w * 2, height() + w * 2, channels());		//create an output image
+		result = value;														//assign the border value to all pixels in the new image
+		for (size_t y = 0; y < height(); y++) {								//for each pixel in the original image
+			for (size_t x = 0; x < width(); x++) {
+				for (size_t c = 0; c < channels(); c++) {					//copy every channel (the data is interleaved, so each channel has its own index)
+					size_t n = result.idx(x + w, y + w, c);					//calculate the index of the corresponding value in the result image
+					size_t n0 = idx(x, y, c);								//calculate the index for this value in the original image
+					result.img[n] = img[n0];								//copy the original image to the result image after the border area
+				}
+			}
+		}
+		return result;
+	}
+
+	/// Adds padding for the specified number of pixels on every side, replicating the boundary pixels
+	/// Each output pixel takes the value of the nearest pixel of the original image, so edges are
+	/// extended outward and the corners are filled with the corner pixels. All channels are padded.
+	/// @param p is the number of pixels added to each side of the image
+	/// @return a new image of size (width + 2p) x (height + 2p) with the same number of channels
+	image<T> pad_replicate(size_t p) {
+		image<T> result(width() + p * 2, height() + p * 2, channels());		//create an output image
+		result = 0;
+		if (width() == 0 || height() == 0) return result;					//there are no boundary pixels to replicate in an empty image
+
+		const size_t last_x = width() - 1;									//largest valid coordinates in the original image
+		const size_t last_y = height() - 1;
+		for (size_t y = 0; y < result.height(); y++) {						//for each pixel in the padded image
+			size_t sy = (y < p) ? 0 : y - p;								//clamp the source row to the original image
+			if (sy > last_y) sy = last_y;
+			for (size_t x = 0; x < result.width(); x++) {
+				size_t sx = (x < p) ? 0 : x - p;							//clamp the source column to the original image
+				if (sx > last_x) sx = last_x;
+				for (size_t c = 0; c < channels(); c++)						//copy every channel of the nearest original pixel
+					result.img[result.idx(x, y, c)] = img[idx(sx, sy, c)];
+			}
+		}
+		return result;
+	}
 	/// Copy the given data to the specified channel
@@ -474,7 +633,6 @@ public:
 				max_val = img[n];
 			}
 		}
-
 		return max_val;
 	}
@@ -502,13 +660,48 @@ public:
 		return r;								//return the inverted image
 	}
+	/// Crop a rectangular region out of the image
+	/// @param x0 and y0 are the coordinates of the first pixel of the region
+	/// @param w and h are the width and height of the region in pixels
+	/// @return a new w x h image with the same number of channels
+	/// The region is not bounds-checked, so the caller must make sure it lies inside the image.
+	image<T> crop(size_t x0, size_t y0, size_t w, size_t h){
+		image<T> result(w, h, C());								//create the output cropped image
+
+		size_t srci;
+		size_t dsti;
+		size_t line_elems = w * C();							//calculate the number of values in a line
+		size_t line_bytes = line_elems * sizeof(T);			//memcpy counts bytes, so scale by the size of each value
+		for (size_t yi = 0; yi < h; yi++) {						//for each row in the cropped image
+			srci = (y0 + yi) * X() * C() + x0 * C();			//calculate the source index
+			dsti = yi * line_elems;								//calculate the destination index
+			memcpy(&result.img[dsti], &img[srci], line_bytes);	//copy the data
+		}
+		return result;
+	}
+
+
+	//crop regions given by an array of 1D index values
+	/// @param w and h are the width and height of each cropped region
+	/// @param idx is a list of 1D pixel indices (y * X() + x), each giving the center of a region
+	/// @return one cropped image per index, in the same order
+	/// The regions are not bounds-checked: every center must be at least w/2 columns and h/2 rows
+	/// away from the image border, otherwise the unsigned corner coordinates wrap around.
+	std::vector< image<T> > crop_idx(size_t w, size_t h, std::vector<size_t> idx) {
+		std::vector< image<T> > result(idx.size());										//create an array of image files to return
+		for (size_t i = 0; i < idx.size(); i++) {										//for each specified index point
+			size_t y = idx[i] / X();													//calculate the y coordinate from the 1D index (center of ROI)
+			size_t x = idx[i] - y * X();												//calculate the x coordinate (center of ROI)
+			x -= w / 2;																	//update x and y values to reflect the lower corner of the ROI
+			y -= h / 2;																	//	(the width offsets x and the height offsets y)
+			result[i] = crop(x, y, w, h);												//get the cropped image and store it in the result array
+		}
+		return result;
+	}
+
+
+	//operator functions
+	/// Element-wise sum of this image and rhs
+	/// The result has the dimensions of this image; rhs is read at the same 1D indices.
+	image<T> operator+(image<T> rhs) {
+		image<T> sum(X(), Y(), C());			//allocate space for the resulting image
+		const size_t count = size();			//total number of values in this image
+		for (size_t k = 0; k < count; ++k)
+			sum.img[k] = img[k] + rhs.img[k];	//add the corresponding values
+		return sum;								//return the summed image
+	}
+
 	image<T> srgb2lab(){
 		std::cout<<"ERROR stim::image::srgb2lab - function has been broken, re-implement."<<std::endl;
 		exit(1);
 	}
 	image<T> convolve2(image<T> mask){
-
 		std::cout<<"ERROR stim::image::convolve2 - function has been broken, and shouldn't really be in here."<<std::endl;
 		exit(1);
 	}
+#ifndef RTS_BESSEL_H
+#define RTS_BESSEL_H
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+#include "../math/complex.h"
+#define eps 1e-15
+#define el 0.5772156649015329
+
+
+namespace stim{
+
+static complex<double> cii(0.0,1.0);
+static complex<double> cone(1.0,0.0);
+static complex<double> czero(0.0,0.0);
+
+/// Evaluate the gamma function for a real argument x.
+/// @param x is the argument
+/// @return the computed value; 1e308 is returned as an overflow flag when x > 171 or when
+///         x is zero or a negative integer
+/// Positive integer arguments are computed as a factorial product. Other arguments are
+/// reduced to a fractional part z, a 25-term polynomial in z (coefficients g[]) is evaluated,
+/// and the result is scaled back up using the product r.
+template< typename P >
+P gamma(P x)
+{
+    int i,k,m;
+    P ga,gr,r,z;
+
+    // polynomial coefficients g[0]..g[24], used below as ga = 1/(z * poly(z))
+    static P g[] = {
+        1.0,
+        0.5772156649015329,
+       -0.6558780715202538,
+       -0.420026350340952e-1,
+        0.1665386113822915,
+       -0.421977345555443e-1,
+       -0.9621971527877e-2,
+        0.7218943246663e-2,
+       -0.11651675918591e-2,
+       -0.2152416741149e-3,
+        0.1280502823882e-3,
+       -0.201348547807e-4,
+       -0.12504934821e-5,
+        0.1133027232e-5,
+       -0.2056338417e-6,
+        0.6116095e-8,
+        0.50020075e-8,
+       -0.11812746e-8,
+        0.1043427e-9,
+        0.77823e-11,
+       -0.36968e-11,
+        0.51e-12,
+       -0.206e-13,
+       -0.54e-14,
+        0.14e-14};
+
+    if (x > 171.0) return 1e308;    // This value is an overflow flag.
+    // integer argument: the product 2 * 3 * ... * (x-1) is accumulated for positive x
+    if (x == (int)x) {
+        if (x > 0.0) {
+            ga = 1.0;               // use factorial
+            for (i=2;i<x;i++) {
+               ga *= i;
+            }
+         }
+         else
+            ga = 1e308;
+     }
+     else {
+        // non-integer argument: for |x| > 1, split |x| into its integer part m and fractional
+        // part z, accumulating r = (|x|-1)(|x|-2)...(|x|-m) for the rescaling step below
+        if (fabs(x) > 1.0) {
+            z = fabs(x);
+            m = (int)z;
+            r = 1.0;
+            for (k=1;k<=m;k++) {
+                r *= (z-k);
+            }
+            z -= m;
+        }
+        else
+            z = x;
+        // evaluate the polynomial g[0] + g[1] z + ... + g[24] z^24 from the highest order term down
+        gr = g[24];
+        for (k=23;k>=0;k--) {
+            gr = gr*z+g[k];
+        }
+        ga = 1.0/(gr*z);
+        if (fabs(x) > 1.0) {
+            ga *= r;                // r is only initialized (and only used) when |x| > 1
+            if (x < 0.0) {
+                // negative argument: convert the value computed for |x| into the value for x
+                ga = -M_PI/(x*ga*sin(M_PI*x));
+            }
+        }
+    }
+    return ga;
+}
+
+template<typename P>
+int bessjy01a(P x,P &j0,P &j1,P &y0,P &y1,
+    P &j0p,P &j1p,P &y0p,P &y1p)
+{   /* Evaluates j0, j1, y0, y1 and their derivatives at a real x >= 0
+     * (presumably the order-0 and order-1 Bessel functions of the first
+     * and second kind, as in the parameter legend given for bessjy01b).
+     *
+     *   x <  0  : returns 1 and leaves every output untouched.
+     *   x == 0  : stores fixed limiting values, with +/-1e308 standing in
+     *             for the unbounded y terms, and returns 0.
+     *   x <= 12 : sums power series in x2 = x squared, at most 30 terms
+     *             each, stopping once a term is below 1e-15 of the sum.
+     *   x >  12 : sums kz terms of the a, b, a1, b1 coefficient tables in
+     *             inverse powers of x and combines them with sin and cos.
+     *
+     * The derivative outputs are then formed from the four function
+     * values. Returns 0 on success. `el` and the M_ constants are
+     * defined outside this function.
+     */
+    P x2,r,ec,w0,w1,r0,r1,cs0,cs1;
+    P cu,p0,q0,p1,q1,t1,t2;
+    int k,kz;
+    static P a[] = {
+        -7.03125e-2,
+         0.112152099609375,
+        -0.5725014209747314,
+         6.074042001273483,
+        -1.100171402692467e2,
+         3.038090510922384e3,
+        -1.188384262567832e5,
+         6.252951493434797e6,
+        -4.259392165047669e8,
+         3.646840080706556e10,
+        -3.833534661393944e12,
+         4.854014686852901e14,
+        -7.286857349377656e16,
+         1.279721941975975e19};
+    static P b[] = {
+         7.32421875e-2,
+        -0.2271080017089844,
+         1.727727502584457,
+        -2.438052969955606e1,
+         5.513358961220206e2,
+        -1.825775547429318e4,
+         8.328593040162893e5,
+        -5.006958953198893e7,
+         3.836255180230433e9,
+        -3.649010818849833e11,
+         4.218971570284096e13,
+        -5.827244631566907e15,
+         9.476288099260110e17,
+        -1.792162323051699e20};
+    static P a1[] = {
+         0.1171875,
+        -0.1441955566406250,
+         0.6765925884246826,
+        -6.883914268109947,
+         1.215978918765359e2,
+        -3.302272294480852e3,
+         1.276412726461746e5,
+        -6.656367718817688e6,
+         4.502786003050393e8,
+        -3.833857520742790e10,
+         4.011838599133198e12,
+        -5.060568503314727e14,
+         7.572616461117958e16,
+        -1.326257285320556e19};
+    static P b1[] = {
+        -0.1025390625,
+         0.2775764465332031,
+        -1.993531733751297,
+         2.724882731126854e1,
+        -6.038440767050702e2,
+         1.971837591223663e4,
+        -8.902978767070678e5,
+         5.310411010968522e7,
+        -4.043620325107754e9,
+         3.827011346598605e11,
+        -4.406481417852278e13,
+         6.065091351222699e15,
+        -9.833883876590679e17,
+         1.855045211579828e20};
+
+    if (x < 0.0) return 1; // negative argument: error code 1, outputs untouched
+    if (x == 0.0) {
+        j0 = 1.0;
+        j1 = 0.0;
+        y0 = -1e308; // large finite stand-in for the unbounded value
+        y1 = -1e308;
+        j0p = 0.0;
+        j1p = 0.5;
+        y0p = 1e308;
+        y1p = 1e308;
+        return 0;
+    }
+    x2 = x*x;
+    if (x <= 12.0) {
+        j0 = 1.0;
+        r = 1.0;
+        for (k=1;k<=30;k++) {
+            r *= -0.25*x2/(k*k);
+            j0 += r;
+            if (fabs(r) < fabs(j0)*1e-15) break; // term negligible relative to the sum
+        }
+        j1 = 1.0;
+        r = 1.0;
+        for (k=1;k<=30;k++) {
+            r *= -0.25*x2/(k*(k+1));
+            j1 += r;
+            if (fabs(r) < fabs(j1)*1e-15) break;
+        }
+        j1 *= 0.5*x; // series sum scaled by x/2
+        ec = log(0.5*x)+el; // el comes from outside this function
+        cs0 = 0.0;
+        w0 = 0.0;
+        r0 = 1.0;
+        for (k=1;k<=30;k++) {
+            w0 += 1.0/k; // running sum 1 + 1/2 + ... + 1/k
+            r0 *= -0.25*x2/(k*k);
+            r = r0 * w0;
+            cs0 += r;
+            if (fabs(r) < fabs(cs0)*1e-15) break;
+        }
+        y0 = M_2_PI*(ec*j0-cs0);
+        cs1 = 1.0;
+        w1 = 0.0;
+        r1 = 1.0;
+        for (k=1;k<=30;k++) {
+            w1 += 1.0/k;
+            r1 *= -0.25*x2/(k*(k+1));
+            r = r1*(2.0*w1+1.0/(k+1));
+            cs1 += r;
+            if (fabs(r) < fabs(cs1)*1e-15) break;
+        }
+        y1 = M_2_PI * (ec*j1-1.0/x-0.25*x*cs1);
+    }
+    else {
+        if (x >= 50.0) kz = 8;          // Can be changed to 10
+        else if (x >= 35.0) kz = 10;    //  "       "        12
+        else kz = 12;                   //  "       "        14
+        t1 = x-M_PI_4;
+        p0 = 1.0;
+        q0 = -0.125/x;
+        for (k=0;k<kz;k++) {
+            p0 += a[k]*pow(x,-2*k-2); // even inverse powers of x
+            q0 += b[k]*pow(x,-2*k-3); // odd inverse powers of x
+        }
+        cu = sqrt(M_2_PI/x); // amplitude shared by all four large-x results
+        j0 = cu*(p0*cos(t1)-q0*sin(t1));
+        y0 = cu*(p0*sin(t1)+q0*cos(t1));
+        t2 = x-0.75*M_PI;
+        p1 = 1.0;
+        q1 = 0.375/x;
+        for (k=0;k<kz;k++) {
+            p1 += a1[k]*pow(x,-2*k-2);
+            q1 += b1[k]*pow(x,-2*k-3);
+        }
+        j1 = cu*(p1*cos(t2)-q1*sin(t2));
+        y1 = cu*(p1*sin(t2)+q1*cos(t2));
+    }
+    j0p = -j1; // derivatives are built from the function values above
+    j1p = j0-j1/x;
+    y0p = -y1;
+    y1p = y0-y1/x;
+    return 0;
+}
+//
+//  INPUT:
+//      double x    -- argument of Bessel function
+//
+//  OUTPUT:
+//      double j0   -- Bessel function of 1st kind, 0th order
+//      double j1   -- Bessel function of 1st kind, 1st order
+//      double y0   -- Bessel function of 2nd kind, 0th order
+//      double y1   -- Bessel function of 2nd kind, 1st order
+//      double j0p  -- derivative of Bessel function of 1st kind, 0th order
+//      double j1p  -- derivative of Bessel function of 1st kind, 1st order
+//      double y0p  -- derivative of Bessel function of 2nd kind, 0th order
+//      double y1p  -- derivative of Bessel function of 2nd kind, 1st order
+//
+//  RETURN:
+//      int error code: 0 = OK, 1 = error
+//
+//  This algorithm computes the functions using polynomial approximations.
+//
+template<typename P>
+int bessjy01b(P x,P &j0,P &j1,P &y0,P &y1,
+    P &j0p,P &j1p,P &y0p,P &y1p)
+{
+    // Polynomial-fit evaluation of j0, j1, y0, y1 and their derivatives.
+    //   x <  0 : returns 1, outputs untouched
+    //   x == 0 : fixed limiting values (+/-1e308 for the y terms), returns 0
+    //   x <= 4 : polynomials in (x/4)^2
+    //   x >  4 : polynomials in (4/x)^2 combined with sin/cos
+    // Returns 0 whenever x >= 0.
+    if (x < 0.0) {
+        return 1;
+    }
+    if (x == 0.0) {
+        j0 = 1.0;
+        j1 = 0.0;
+        y0 = -1e308;
+        y1 = -1e308;
+        j0p = 0.0;
+        j1p = 0.5;
+        y0p = 1e308;
+        y1p = 1e308;
+        return 0;
+    }
+
+    P u,uu;
+    if (x <= 4.0) {
+        // Small-argument fits; uu is the squared scaled argument.
+        P corr0,corr1;
+        u = x/4.0;
+        uu = u*u;
+        j0 = ((((((-0.5014415e-3*uu
+            +0.76771853e-2)*uu
+            -0.0709253492)*uu
+            +0.4443584263)*uu
+            -1.7777560599)*uu
+            +3.9999973021)*uu
+            -3.9999998721)*uu
+            +1.0;
+        j1 = u*(((((((-0.1289769e-3*uu
+            +0.22069155e-2)*uu
+            -0.0236616773)*uu
+            +0.1777582922)*uu
+            -0.8888839649)*uu
+            +2.6666660544)*uu
+            -3.999999971)*uu
+            +1.9999999998);
+        // The y values add a fitted correction to a log(x/2) term.
+        corr0 = (((((((-0.567433e-4*uu
+            +0.859977e-3)*uu
+            -0.94855882e-2)*uu
+            +0.0772975809)*uu
+            -0.4261737419)*uu
+            +1.4216421221)*uu
+            -2.3498519931)*uu
+            +1.0766115157)*uu
+            +0.3674669052;
+        y0 = M_2_PI*log(0.5*x)*j0+corr0;
+        corr1 = (((((((0.6535773e-3*uu
+            -0.0108175626)*uu
+            +0.107657607)*uu
+            -0.7268945577)*uu
+            +3.1261399273)*uu
+            -7.3980241381)*uu
+            +6.8529236342)*uu
+            +0.3932562018)*uu
+            -0.6366197726;
+        y1 = M_2_PI*log(0.5*x)*j1+corr1/x;
+    }
+    else {
+        // Large-argument fits: amplitude times a sin/cos mix of two
+        // polynomials in the squared inverse scaled argument.
+        P amp,even0,odd0,even1,odd1,phase0,phase1;
+        u = 4.0/x;
+        uu = u*u;
+        amp = sqrt(M_2_PI/x);
+        even0 = ((((-0.9285e-5*uu
+            +0.43506e-4)*uu
+            -0.122226e-3)*uu
+            +0.434725e-3)*uu
+            -0.4394275e-2)*uu
+            +0.999999997;
+        odd0 = u*(((((0.8099e-5*uu
+            -0.35614e-4)*uu
+            +0.85844e-4)*uu
+            -0.218024e-3)*uu
+            +0.1144106e-2)*uu
+            -0.031249995);
+        phase0 = x-M_PI_4;
+        j0 = amp*(even0*cos(phase0)-odd0*sin(phase0));
+        y0 = amp*(even0*sin(phase0)+odd0*cos(phase0));
+        even1 = ((((0.10632e-4*uu
+            -0.50363e-4)*uu
+            +0.145575e-3)*uu
+            -0.559487e-3)*uu
+            +0.7323931e-2)*uu
+            +1.000000004;
+        odd1 = u*(((((-0.9173e-5*uu
+            +0.40658e-4)*uu
+            -0.99941e-4)*uu
+            +0.266891e-3)*uu
+            -0.1601836e-2)*uu
+            +0.093749994);
+        phase1 = x-0.75*M_PI;
+        j1 = amp*(even1*cos(phase1)-odd1*sin(phase1));
+        y1 = amp*(even1*sin(phase1)+odd1*cos(phase1));
+    }
+
+    // Derivatives follow from the four function values.
+    j0p = -j1;
+    j1p = j0-j1/x;
+    y0p = -y1;
+    y1p = y0-y1/x;
+    return 0;
+}
+template<typename P>
+int msta1(P x,int mp)
+{
+    // Searches, by a secant-style iteration on integer orders, for the
+    // order at which 0.5*log10(6.28*n) - n*log10(1.36*|x|/n) equals mp.
+    // At most 20 refinement passes are made; the last trial order is
+    // returned.
+    const P mag = fabs(x);
+
+    int prev = (int)(1.1*mag)+1;
+    P fPrev = 0.5*log10(6.28*prev)-prev*log10(1.36*mag/prev)-mp;
+    int cur = prev+5;
+    P fCur = 0.5*log10(6.28*cur)-cur*log10(1.36*mag/cur)-mp;
+
+    int trial;
+    int pass = 0;
+    while (pass < 20) {
+        trial = (int)(cur-(cur-prev)/(1.0-fPrev/fCur));
+        const P fTrial = 0.5*log10(6.28*trial)-trial*log10(1.36*mag/trial)-mp;
+        // Stop as soon as the integer estimate no longer moves.
+        if (trial == cur) break;
+        prev = cur;
+        fPrev = fCur;
+        cur = trial;
+        fCur = fTrial;
+        ++pass;
+    }
+    return trial;
+}
+template<typename P>
+int msta2(P x,int n,int mp)
+{
+    // Same integer secant search as msta1, but the target and the first
+    // trial order depend on how the magnitude estimate at order n compares
+    // with mp/2. Returns the final trial order plus 10.
+    const P mag = fabs(x);
+    const P halfDigits = 0.5*mp;
+    const P envAtN = 0.5*log10(6.28*n)-n*log10(1.36*mag/n);
+
+    P target;
+    int prev;
+    if (envAtN <= halfDigits) {
+        target = mp;
+        prev = (int)(1.1*mag);
+        if (prev < 1) prev = 1;
+    }
+    else {
+        target = halfDigits+envAtN;
+        prev = n;
+    }
+
+    P fPrev = 0.5*log10(6.28*prev)-prev*log10(1.36*mag/prev)-target;
+    int cur = prev+5;
+    P fCur = 0.5*log10(6.28*cur)-cur*log10(1.36*mag/cur)-target;
+
+    int trial;
+    int pass = 0;
+    while (pass < 20) {
+        trial = (int)(cur-(cur-prev)/(1.0-fPrev/fCur));
+        const P fTrial = 0.5*log10(6.28*trial)-trial*log10(1.36*mag/trial)-target;
+        // Stop as soon as the integer estimate no longer moves.
+        if (trial == cur) break;
+        prev = cur;
+        fPrev = fCur;
+        cur = trial;
+        fCur = fTrial;
+        ++pass;
+    }
+    return trial+10;
+}
+//
+//  INPUT:
+//  double x    -- argument of Bessel function of 1st and 2nd kind.
+//  int n       -- order
+//
+//  OUTPUT:
+//
+//  int nm      -- highest order actually computed (nm <= n)
+//  double jn[] -- Bessel function of 1st kind, orders from 0 to nm
+//  double yn[] -- Bessel function of 2nd kind, orders from 0 to nm
+//  double j'n[]-- derivative of Bessel function of 1st kind,
+//                      orders from 0 to nm
+//  double y'n[]-- derivative of Bessel function of 2nd kind,
+//                      orders from 0 to nm
+//
+//  Computes Bessel functions of all order up to 'n' using recurrence
+//  relations. If 'nm' < 'n' only 'nm' orders are returned.
+//
+template<typename P>
+int bessjyna(int n,P x,int &nm,P *jn,P *yn,
+    P *jnp,P *ynp)
+{
+    // Fills jn, yn, jnp, ynp for orders 0..nm at a real x >= 0 and returns 0,
+    // or returns 1 when x < 0 or n < 0. nm equals n unless the downward
+    // recurrence has to start below n, in which case it is msta1(x,200).
+    // Orders 0 and 1 always come from bessjy01a (or the x ~ 0 special case),
+    // so index 1 of every output array is written even when n == 0.
+    P bj0,bj1,f,f0,f1,f2,cs;
+    int i,k,m;
+
+    nm = n;
+    if ((x < 0.0) || (n < 0)) return 1;
+    if (x < 1e-15) {
+        // Limiting values at the origin; +/-1e308 stands in for the
+        // unbounded second-kind entries.
+        for (i=0;i<=n;i++) {
+            jn[i] = 0.0;
+            yn[i] = -1e308;
+            jnp[i] = 0.0;
+            ynp[i] = 1e308;
+        }
+        jn[0] = 1.0;
+        jnp[1] = 0.5;
+        return 0;
+    }
+    // x >= 0 was established above, so bessjy01a cannot report an error here.
+    bessjy01a(x,jn[0],jn[1],yn[0],yn[1],jnp[0],jnp[1],ynp[0],ynp[1]);
+    if (n < 2) return 0;
+    bj0 = jn[0];
+    bj1 = jn[1];
+    // Upward recurrence is used only while the order stays below 0.9*x.
+    // The whole product must be truncated: the former `(int)0.9*x` cast the
+    // literal alone to 0, so this branch could never be taken.
+    if (n < (int)(0.9*x)) {
+        for (k=2;k<=n;k++) {
+            jn[k] = 2.0*(k-1.0)*bj1/x-bj0;
+            bj0 = bj1;
+            bj1 = jn[k];
+        }
+    }
+    else {
+        // Downward recurrence from a starting order m chosen by msta1/msta2,
+        // seeded with an arbitrary tiny value and rescaled afterwards.
+        m = msta1(x,200);
+        if (m < n) nm = m;
+        else m = msta2(x,n,15);
+        f2 = 0.0;
+        f1 = 1.0e-100;
+        for (k=m;k>=0;k--) {
+            f = 2.0*(k+1.0)/x*f1-f2;
+            if (k <= nm) jn[k] = f;
+            f2 = f1;
+            f1 = f;
+        }
+        // Normalise against whichever of the order-0/1 values is larger in
+        // magnitude (f holds the unscaled order 0, f2 the unscaled order 1).
+        if (fabs(bj0) > fabs(bj1)) cs = bj0/f;
+        else cs = bj1/f2;
+        for (k=0;k<=nm;k++) {
+            jn[k] *= cs;
+        }
+    }
+    for (k=2;k<=nm;k++) {
+        jnp[k] = jn[k-1]-k*jn[k]/x;
+    }
+    // Second-kind values are always carried upward from orders 0 and 1.
+    f0 = yn[0];
+    f1 = yn[1];
+    for (k=2;k<=nm;k++) {
+        f = 2.0*(k-1.0)*f1/x-f0;
+        yn[k] = f;
+        f0 = f1;
+        f1 = f;
+    }
+    for (k=2;k<=nm;k++) {
+        ynp[k] = yn[k-1]-k*yn[k]/x;
+    }
+    return 0;
+}
+//
+//  Same input and output conventions as above. Different recurrence
+//  relations used for 'x' < 300.
+//
+template<typename P>
+int bessjynb(int n,P x,int &nm,P *jn,P *yn,
+    P *jnp,P *ynp)
+{   /* Fills jn, yn, jnp, ynp for orders 0..nm at a real x >= 0.
+     *
+     * Returns 1 (arrays untouched) when x < 0 or n < 0, otherwise 0.
+     * nm starts as n; in the first branch below it is raised to 1 when
+     * n == 0 and lowered to msta1(x,200) when that is smaller.
+     *
+     *   x < 1e-15 : fixed limiting values are stored, with +/-1e308
+     *               standing in for the unbounded yn and ynp entries.
+     *   x <= 300 or n > (int)(0.9 x) : jn is built by downward recurrence
+     *               from order m and divided by the sum s0; yn[0] and
+     *               yn[1] come from the accumulated sums su and sv.
+     *   otherwise : orders 0 and 1 come from four-term a, b, a1, b1
+     *               expansions and jn is carried upward by recurrence.
+     *
+     * In every non-trivial case yn is then extended upward from yn[0] and
+     * yn[1], and both derivative arrays are formed from neighbouring
+     * orders.
+     * NOTE(review): index 1 of every output array is written even when
+     * n == 0, so callers presumably must supply at least two elements.
+     */
+    P t1,t2,f,f1,f2,bj0,bj1,bjk,by0,by1,cu,s0,su,sv;
+    P ec,bs,byk,p0,p1,q0,q1;
+    static P a[] = {
+        -0.7031250000000000e-1,
+         0.1121520996093750,
+        -0.5725014209747314,
+         6.074042001273483};
+    static P b[] = {
+         0.7324218750000000e-1,
+        -0.2271080017089844,
+         1.727727502584457,
+        -2.438052969955606e1};
+    static P a1[] = {
+         0.1171875,
+        -0.1441955566406250,
+         0.6765925884246826,
+        -6.883914268109947};
+    static P b1[] = {
+       -0.1025390625,
+        0.2775764465332031,
+       -1.993531733751297,
+        2.724882731126854e1};
+
+    int i,k,m;
+    nm = n;
+    if ((x < 0.0) || (n < 0)) return 1;
+    if (x < 1e-15) {
+        for (i=0;i<=n;i++) {
+            jn[i] = 0.0;
+            yn[i] = -1e308;
+            jnp[i] = 0.0;
+            ynp[i] = 1e308;
+        }
+        jn[0] = 1.0;
+        jnp[1] = 0.5;
+        return 0;
+    }
+    if (x <= 300.0 || n > (int)(0.9*x)) {
+        if (n == 0) nm = 1; // order 1 is needed below for yn[1]
+        m = msta1(x,200);
+        if (m < nm) nm = m;
+        else m = msta2(x,nm,15);
+        bs = 0.0;
+        su = 0.0;
+        sv = 0.0;
+        f2 = 0.0;
+        f1 = 1.0e-100; // arbitrary tiny seed; results are divided by s0 later
+        for (k = m;k>=0;k--) {
+            f = 2.0*(k+1.0)/x*f1 - f2;
+            if (k <= nm) jn[k] = f;
+            if ((k == 2*(int)(k/2)) && (k != 0)) { // even k other than 0
+                bs += 2.0*f;
+//                su += pow(-1,k>>1)*f/(double)k;
+                su += (-1)*((k & 2)-1)*f/(P)k; // sign is +1 when bit 1 of k is clear, -1 when set
+            }
+            else if (k > 1) { // odd k >= 3
+//                sv += pow(-1,k>>1)*k*f/(k*k-1.0);
+                sv += (-1)*((k & 2)-1)*(P)k*f/(k*k-1.0);
+            }
+            f2 = f1;
+            f1 = f;
+        }
+        s0 = bs+f; // f holds the unscaled order-0 value after the loop
+        for (k=0;k<=nm;k++) {
+            jn[k] /= s0;
+        }
+        ec = log(0.5*x) +0.5772156649015329;
+        by0 = M_2_PI*(ec*jn[0]-4.0*su/s0);
+        yn[0] = by0;
+        by1 = M_2_PI*((ec-1.0)*jn[1]-jn[0]/x-4.0*sv/s0);
+        yn[1] = by1;
+    }
+    else {
+        t1 = x-M_PI_4;
+        p0 = 1.0;
+        q0 = -0.125/x;
+        for (k=0;k<4;k++) {
+            p0 += a[k]*pow(x,-2*k-2);
+            q0 += b[k]*pow(x,-2*k-3);
+        }
+        cu = sqrt(M_2_PI/x);
+        bj0 = cu*(p0*cos(t1)-q0*sin(t1));
+        by0 = cu*(p0*sin(t1)+q0*cos(t1));
+        jn[0] = bj0;
+        yn[0] = by0;
+        t2 = x-0.75*M_PI;
+        p1 = 1.0;
+        q1 = 0.375/x;
+        for (k=0;k<4;k++) {
+            p1 += a1[k]*pow(x,-2*k-2);
+            q1 += b1[k]*pow(x,-2*k-3);
+        }
+        bj1 = cu*(p1*cos(t2)-q1*sin(t2));
+        by1 = cu*(p1*sin(t2)+q1*cos(t2));
+        jn[1] = bj1;
+        yn[1] = by1;
+        for (k=2;k<=nm;k++) { // upward recurrence for jn
+            bjk = 2.0*(k-1.0)*bj1/x-bj0;
+            jn[k] = bjk;
+            bj0 = bj1;
+            bj1 = bjk;
+        }
+    }
+    jnp[0] = -jn[1];
+    for (k=1;k<=nm;k++) {
+        jnp[k] = jn[k-1]-k*jn[k]/x;
+    }
+    for (k=2;k<=nm;k++) { // upward recurrence for yn from by0, by1
+        byk = 2.0*(k-1.0)*by1/x-by0;
+        yn[k] = byk;
+        by0 = by1;
+        by1 = byk;
+    }
+    ynp[0] = -yn[1];
+    for (k=1;k<=nm;k++) {
+        ynp[k] = yn[k-1]-k*yn[k]/x;
+    }
+    return 0;
+
+}
+
+//  The following routine computes Bessel Jv(x) and Yv(x) for
+//  arbitrary positive order (v). For negative order, use:
+//
+//      J-v(x) = Jv(x)cos(v pi) - Yv(x)sin(v pi)
+//      Y-v(x) = Jv(x)sin(v pi) + Yv(x)cos(v pi)
+//
+template<typename P>
+int bessjyv(P v,P x,P &vm,P *jv,P *yv,
+    P *djv,P *dyv)
+{   /* Fills jv, yv, djv, dyv at index k with the values for order v0+k,
+     * k = 0..n, where n = (int)v and v0 = v-n is the fractional part of
+     * the requested order. Returns 1 when x < 0 or v < 0, otherwise 0.
+     * On a normal return vm = n + v0, where n may have been lowered to
+     * msta1(x,200) in the downward-recurrence branch; in the x < 1e-15
+     * case vm = v.
+     *
+     *   x < 1e-15 : fixed limiting values, +/-1e308 for unbounded entries.
+     *   x <= 12   : orders v0 and v0+1 of jv come from a power series
+     *               (at most 40 terms) scaled by pow(0.5 x, vl)/gamma(vg).
+     *               The matching yv values come from a second series with
+     *               negated order when v0 != 0, and from a logarithmic
+     *               series using el when v0 == 0.
+     *   x > 12    : both jv and yv for orders v0 and v0+1 come from the
+     *               kz-term sums px and qx combined with sin and cos.
+     *
+     * Higher jv orders use upward recurrence when 2 <= n <= (int)(0.9 x),
+     * otherwise downward recurrence from order m, rescaled to match bjv0
+     * or bjv1. Higher yv orders always use upward recurrence.
+     * gamma and el are defined outside this function.
+     * NOTE(review): index 1 of every output array is written even when
+     * n == 0, so callers presumably must supply at least two elements.
+     */
+    P v0,vl,vg,vv,a,a0,r,x2,bjv0,bjv1,bjvl,f,f0,f1,f2;
+    P r0,r1,ck,cs,cs0,cs1,sk,qx,px,byv0,byv1,rp,xk,rq;
+    P b,ec,w0,w1,bju0,bju1,pv0,pv1,byvk;
+    int j,k,l,m,n,kz;
+
+    x2 = x*x;
+    n = (int)v;
+    v0 = v-n;
+    if ((x < 0.0) || (v < 0.0)) return 1;
+    if (x < 1e-15) {
+        for (k=0;k<=n;k++) {
+            jv[k] = 0.0;
+            yv[k] = -1e308;
+            djv[k] = 0.0;
+            dyv[k] = 1e308;
+            if (v0 == 0.0) { // integer order: re-applied each pass so later passes cannot overwrite it
+                jv[0] = 1.0;
+                djv[1] = 0.5;
+            }
+            else djv[0] = 1e308;
+        }
+        vm = v;
+        return 0;
+    }
+    if (x <= 12.0) {
+        for (l=0;l<2;l++) { // l = 0 gives order v0, l = 1 gives order v0+1
+            vl = v0 + l;
+            bjvl = 1.0;
+            r = 1.0;
+            for (k=1;k<=40;k++) {
+                r *= -0.25*x2/(k*(k+vl));
+                bjvl += r;
+                if (fabs(r) < fabs(bjvl)*1e-15) break;
+            }
+            vg = 1.0 + vl;
+            a = pow(0.5*x,vl)/gamma(vg);
+            if (l == 0) bjv0 = bjvl*a;
+            else bjv1 = bjvl*a;
+        }
+    }
+    else {
+        if (x >= 50.0) kz = 8;
+        else if (x >= 35.0) kz = 10;
+        else kz = 11;
+        for (j=0;j<2;j++) { // j = 0 gives order v0, j = 1 gives order v0+1
+            vv = 4.0*(j+v0)*(j+v0);
+            px = 1.0;
+            rp = 1.0;
+            for (k=1;k<=kz;k++) {
+                rp *= (-0.78125e-2)*(vv-pow(4.0*k-3.0,2.0))*
+                    (vv-pow(4.0*k-1.0,2.0))/(k*(2.0*k-1.0)*x2);
+                px += rp;
+            }
+            qx = 1.0;
+            rq = 1.0;
+            for (k=1;k<=kz;k++) {
+                rq *= (-0.78125e-2)*(vv-pow(4.0*k-1.0,2.0))*
+                    (vv-pow(4.0*k+1.0,2.0))/(k*(2.0*k+1.0)*x2);
+                qx += rq;
+            }
+            qx *= 0.125*(vv-1.0)/x;
+            xk = x-(0.5*(j+v0)+0.25)*M_PI;
+            a0 = sqrt(M_2_PI/x);
+            ck = cos(xk);
+            sk = sin(xk);
+
+            if (j == 0) {
+                bjv0 = a0*(px*ck-qx*sk);
+                byv0 = a0*(px*sk+qx*ck);
+            }
+            else if (j == 1) {
+                bjv1 = a0*(px*ck-qx*sk);
+                byv1 = a0*(px*sk+qx*ck);
+            }
+        }
+    }
+    jv[0] = bjv0;
+    jv[1] = bjv1;
+    djv[0] = v0*jv[0]/x-jv[1];
+    djv[1] = -(1.0+v0)*jv[1]/x+jv[0];
+    if ((n >= 2) && (n <= (int)(0.9*x))) { // upward recurrence
+        f0 = bjv0;
+        f1 = bjv1;
+        for (k=2;k<=n;k++) {
+            f = 2.0*(k+v0-1.0)*f1/x-f0;
+            jv[k] = f;
+            f0 = f1;
+            f1 = f;
+        }
+    }
+    else if (n >= 2) { // downward recurrence from order m, rescaled below
+        m = msta1(x,200);
+        if (m < n) n = m; // fewer orders than requested will be returned
+        else m = msta2(x,n,15);
+        f2 = 0.0;
+        f1 = 1.0e-100; // arbitrary tiny seed; removed by the cs rescale
+        for (k=m;k>=0;k--) {
+            f = 2.0*(v0+k+1.0)*f1/x-f2;
+            if (k <= n) jv[k] = f;
+            f2 = f1;
+            f1 = f;
+        }
+        if (fabs(bjv0) > fabs(bjv1)) cs = bjv0/f;
+        else cs = bjv1/f2;
+        for (k=0;k<=n;k++) {
+            jv[k] *= cs;
+        }
+    }
+    for (k=2;k<=n;k++) {
+        djv[k] = -(k+v0)*jv[k]/x+jv[k-1];
+    }
+    if (x <= 12.0) { // byv0, byv1 were already set above when x > 12
+        if (v0 != 0.0) {
+            for (l=0;l<2;l++) {
+                vl = v0 +l;
+                bjvl = 1.0;
+                r = 1.0;
+                for (k=1;k<=40;k++) {
+                    r *= -0.25*x2/(k*(k-vl));
+                    bjvl += r;
+                    if (fabs(r) < fabs(bjvl)*1e-15) break;
+                }
+                vg = 1.0-vl;
+                b = pow(2.0/x,vl)/gamma(vg);
+                if (l == 0) bju0 = bjvl*b;
+                else bju1 = bjvl*b;
+            }
+            pv0 = M_PI*v0;
+            pv1 = M_PI*(1.0+v0);
+            byv0 = (bjv0*cos(pv0)-bju0)/sin(pv0);
+            byv1 = (bjv1*cos(pv1)-bju1)/sin(pv1);
+        }
+        else {
+            ec = log(0.5*x)+el;
+            cs0 = 0.0;
+            w0 = 0.0;
+            r0 = 1.0;
+            for (k=1;k<=30;k++) { // fixed 30 terms, no early exit
+                w0 += 1.0/k;
+                r0 *= -0.25*x2/(k*k);
+                cs0 += r0*w0;
+            }
+            byv0 = M_2_PI*(ec*bjv0-cs0);
+            cs1 = 1.0;
+            w1 = 0.0;
+            r1 = 1.0;
+            for (k=1;k<=30;k++) {
+                w1 += 1.0/k;
+                r1 *= -0.25*x2/(k*(k+1));
+                cs1 += r1*(2.0*w1+1.0/(k+1.0));
+            }
+            byv1 = M_2_PI*(ec*bjv1-1.0/x-0.25*x*cs1);
+        }
+    }
+    yv[0] = byv0;
+    yv[1] = byv1;
+    for (k=2;k<=n;k++) { // upward recurrence for yv
+        byvk = 2.0*(v0+k-1.0)*byv1/x-byv0;
+        yv[k] = byvk;
+        byv0 = byv1;
+        byv1 = byvk;
+    }
+    dyv[0] = v0*yv[0]/x-yv[1];
+    for (k=1;k<=n;k++) {
+        dyv[k] = -(k+v0)*yv[k]/x+yv[k-1];
+    }
+    vm = n + v0;
+    return 0;
+}
+
+template<typename P>
+int bessjyv_sph(int v, P z, P &vm, P* cjv,
+    P* cyv, P* cjvp, P* cyvp)
+{
+    // Converts the half-integer-order results of bessjyv (orders 0.5 up to
+    // v + 0.5) into their sqrt(PI/(2z))-scaled counterparts, in place, for
+    // indices 0..v. The derivative arrays are rescaled as well and receive
+    // the extra -value/(2z) term that comes from differentiating the scale
+    // factor.
+    //
+    // Returns the status of bessjyv: 0 on success, non-zero when bessjyv
+    // rejected its arguments. In that case nothing is rescaled, so the
+    // buffers are never combined with the square root of a negative number.
+    // NOTE(review): z == 0 still divides by zero below; callers presumably
+    // avoid it - confirm.
+    const int status = bessjyv<P>(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
+    if (status != 0) return status;
+
+    //iterate through each and scale
+    for(int n = 0; n<=v; n++)
+    {
+        // Scale the function values first; the derivative formulas below
+        // deliberately use these already-scaled values.
+        cjv[n] = cjv[n] * sqrt(stim::PI/(z * 2.0));
+        cyv[n] = cyv[n] * sqrt(stim::PI/(z * 2.0));
+
+        cjvp[n] = -1.0 / (z * 2.0) * cjv[n] + cjvp[n] * sqrt(stim::PI / (z * 2.0));
+        cyvp[n] = -1.0 / (z * 2.0) * cyv[n] + cyvp[n] * sqrt(stim::PI / (z * 2.0));
+    }
+
+    return 0;
+
+}
+
+template<typename P>
+int cbessjy01(complex<P> z,complex<P> &cj0,complex<P> &cj1,
+    complex<P> &cy0,complex<P> &cy1,complex<P> &cj0p,
+    complex<P> &cj1p,complex<P> &cy0p,complex<P> &cy1p)
+{   /* Complex-argument evaluation of cj0, cj1, cy0, cy1 and the four
+     * derivative outputs at z. Always returns 0.
+     *
+     *   |z| == 0 : fixed limiting values, with +/-1e308 in the real part
+     *              of the cy terms.
+     * Otherwise the work is done on z1, which is z itself, or -z when
+     * real(z) < 0:
+     *   |z| <= 12 : power series in z2 = z squared, at most 40 terms,
+     *               stopped once a term drops below eps times the sum.
+     *   |z| >  12 : kz terms of the a, b, a1, b1 tables in inverse powers
+     *               of z1, combined with sin and cos.
+     * When real(z) < 0 the results are mapped back afterwards: cy0 and cy1
+     * receive a 2.0 cii cj correction whose sign follows imag(z), and cj1
+     * changes sign.
+     * cone, czero, cii, eps and el are defined outside this function.
+     */
+    complex<P> z1,z2,cr,cp,cs,cp0,cq0,cp1,cq1,ct1,ct2,cu;
+    P a0,w0,w1;
+    int k,kz;
+
+    static P a[] = {
+        -7.03125e-2,
+         0.112152099609375,
+        -0.5725014209747314,
+         6.074042001273483,
+        -1.100171402692467e2,
+         3.038090510922384e3,
+        -1.188384262567832e5,
+         6.252951493434797e6,
+        -4.259392165047669e8,
+         3.646840080706556e10,
+        -3.833534661393944e12,
+         4.854014686852901e14,
+        -7.286857349377656e16,
+         1.279721941975975e19};
+    static P b[] = {
+         7.32421875e-2,
+        -0.2271080017089844,
+         1.727727502584457,
+        -2.438052969955606e1,
+         5.513358961220206e2,
+        -1.825775547429318e4,
+         8.328593040162893e5,
+        -5.006958953198893e7,
+         3.836255180230433e9,
+        -3.649010818849833e11,
+         4.218971570284096e13,
+        -5.827244631566907e15,
+         9.476288099260110e17,
+        -1.792162323051699e20};
+    static P a1[] = {
+         0.1171875,
+        -0.1441955566406250,
+         0.6765925884246826,
+        -6.883914268109947,
+         1.215978918765359e2,
+        -3.302272294480852e3,
+         1.276412726461746e5,
+        -6.656367718817688e6,
+         4.502786003050393e8,
+        -3.833857520742790e10,
+         4.011838599133198e12,
+        -5.060568503314727e14,
+         7.572616461117958e16,
+        -1.326257285320556e19};
+    static P b1[] = {
+        -0.1025390625,
+         0.2775764465332031,
+        -1.993531733751297,
+         2.724882731126854e1,
+        -6.038440767050702e2,
+         1.971837591223663e4,
+        -8.902978767070678e5,
+         5.310411010968522e7,
+        -4.043620325107754e9,
+         3.827011346598605e11,
+        -4.406481417852278e13,
+         6.065091351222699e15,
+        -9.833883876590679e17,
+         1.855045211579828e20};
+
+    a0 = abs(z);
+    z2 = z*z;
+    z1 = z;
+    if (a0 == 0.0) {
+        cj0 = cone;
+        cj1 = czero;
+        cy0 = complex<P>(-1e308,0);
+        cy1 = complex<P>(-1e308,0);
+        cj0p = czero;
+        cj1p = complex<P>(0.5,0.0);
+        cy0p = complex<P>(1e308,0);
+        cy1p = complex<P>(1e308,0);
+        return 0;
+    }
+    if (real(z) < 0.0) z1 = -z; // evaluate in the right half-plane, map back at the end
+    if (a0 <= 12.0) {
+        cj0 = cone;
+        cr = cone;
+        for (k=1;k<=40;k++) {
+            cr *= -0.25*z2/(P)(k*k);
+            cj0 += cr;
+            if (abs(cr) < abs(cj0)*eps) break;
+        }
+        cj1 = cone;
+        cr = cone;
+        for (k=1;k<=40;k++) {
+            cr *= -0.25*z2/(k*(k+1.0));
+            cj1 += cr;
+            if (abs(cr) < abs(cj1)*eps) break;
+        }
+        cj1 *= 0.5*z1;
+        w0 = 0.0;
+        cr = cone;
+        cs = czero;
+        for (k=1;k<=40;k++) {
+            w0 += 1.0/k; // running sum 1 + 1/2 + ... + 1/k
+            cr *= -0.25*z2/(P)(k*k);
+            cp = cr*w0;
+            cs += cp;
+            if (abs(cp) < abs(cs)*eps) break;
+        }
+        cy0 = M_2_PI*((log(0.5*z1)+el)*cj0-cs);
+        w1 = 0.0;
+        cr = cone;
+        cs = cone;
+        for (k=1;k<=40;k++) {
+            w1 += 1.0/k;
+            cr *= -0.25*z2/(k*(k+1.0));
+            cp = cr*(2.0*w1+1.0/(k+1.0));
+            cs += cp;
+            if (abs(cp) < abs(cs)*eps) break;
+        }
+        cy1 = M_2_PI*((log(0.5*z1)+el)*cj1-1.0/z1-0.25*z1*cs);
+    }
+    else {
+        if (a0 >= 50.0) kz = 8;         // can be changed to 10
+        else if (a0 >= 35.0) kz = 10;   //   "      "     "  12
+        else kz = 12;                   //   "      "     "  14
+        ct1 = z1 - M_PI_4;
+        cp0 = cone;
+        for (k=0;k<kz;k++) {
+            cp0 += a[k]*pow(z1,-2.0*k-2.0);
+        }
+        cq0 = -0.125/z1;
+        for (k=0;k<kz;k++) {
+            cq0 += b[k]*pow(z1,-2.0*k-3.0);
+        }
+        cu = sqrt(M_2_PI/z1); // amplitude shared by all four large-|z| results
+        cj0 = cu*(cp0*cos(ct1)-cq0*sin(ct1));
+        cy0 = cu*(cp0*sin(ct1)+cq0*cos(ct1));
+        ct2 = z1 - 0.75*M_PI;
+        cp1 = cone;
+        for (k=0;k<kz;k++) {
+            cp1 += a1[k]*pow(z1,-2.0*k-2.0);
+        }
+        cq1 = 0.375/z1;
+        for (k=0;k<kz;k++) {
+            cq1 += b1[k]*pow(z1,-2.0*k-3.0);
+        }
+        cj1 = cu*(cp1*cos(ct2)-cq1*sin(ct2));
+        cy1 = cu*(cp1*sin(ct2)+cq1*cos(ct2));
+    }
+    if (real(z) < 0.0) { // undo the reflection applied to z1
+        if (imag(z) < 0.0) {
+            cy0 -= 2.0*cii*cj0;
+            cy1 = -(cy1-2.0*cii*cj1);
+        }
+        else if (imag(z) > 0.0) {
+            cy0 += 2.0*cii*cj0;
+            cy1 = -(cy1+2.0*cii*cj1);
+        }
+        cj1 = -cj1;
+    }
+    cj0p = -cj1; // derivatives are built from the function values above
+    cj1p = cj0-cj1/z;
+    cy0p = -cy1;
+    cy1p = cy0-cy1/z;
+    return 0;
+}
+
+template<typename P>
+int cbessjyna(int n,complex<P> z,int &nm,complex<P> *cj,
+    complex<P> *cy,complex<P> *cjp,complex<P> *cyp)
+{
+    // Complex-argument sequence evaluation: fills cj, cy, cjp, cyp for
+    // orders 0..nm at z. Returns 1 when n < 0, otherwise 0. nm equals n
+    // unless the downward recurrence has to start below n, in which case it
+    // is msta1(|z|,200).
+    //
+    // Orders 0 and 1 come from cbessjy01. Higher cj orders use upward
+    // recurrence while n < (int)(0.25*|z|) and a rescaled downward recurrence
+    // otherwise. cy is first carried upward; when imag(z) != 0 and the
+    // magnitude dip index lb exceeds 4, cy is rebuilt around lb and the
+    // search is repeated until lb stops changing.
+    //
+    // NOTE(review): index 1 of every output array is written even when
+    // n == 0, and the rebuild step can write cj[nm+1] and cy[nm+1];
+    // callers presumably size the arrays accordingly - confirm.
+    // cone and czero are defined outside this function.
+    complex<P> cbj0,cbj1,cby0,cby1,cj0,cjk,cj1,cf,cf1,cf2;
+    complex<P> cs,cg0,cg1,cyk,cyl1,cyl2,cylk,cp11,cp12,cp21,cp22;
+    complex<P> ch0,ch1,ch2;
+    P a0,yak,ya1,ya0,wa;
+    int m,k,lb,lb0;
+
+    if (n < 0) return 1;
+    a0 = abs(z);
+    nm = n;
+    if (a0 < 1.0e-100) {
+        // Limiting values at the origin; +/-1e308 stands in for the
+        // unbounded second-kind entries.
+        for (k=0;k<=n;k++) {
+            cj[k] = czero;
+            cy[k] = complex<P> (-1e308,0);
+            cjp[k] = czero;
+            cyp[k] = complex<P>(1e308,0);
+        }
+        cj[0] = cone;
+        cjp[1] = complex<P>(0.5,0.0);
+        return 0;
+    }
+    cbessjy01(z,cj[0],cj[1],cy[0],cy[1],cjp[0],cjp[1],cyp[0],cyp[1]);
+    cbj0 = cj[0];
+    cbj1 = cj[1];
+    cby0 = cy[0];
+    cby1 = cy[1];
+    if (n <= 1) return 0;
+    // Upward recurrence is used only while the order stays below 0.25*|z|.
+    // The whole product must be truncated: the former `(int)0.25*a0` cast the
+    // literal alone to 0, so this branch could never be taken.
+    if (n < (int)(0.25*a0)) {
+        cj0 = cbj0;
+        cj1 = cbj1;
+        for (k=2;k<=n;k++) {
+            cjk = 2.0*(k-1.0)*cj1/z-cj0;
+            cj[k] = cjk;
+            cj0 = cj1;
+            cj1 = cjk;
+        }
+    }
+    else {
+        // Downward recurrence from a starting order m chosen by msta1/msta2,
+        // seeded with an arbitrary tiny value and rescaled afterwards.
+        m = msta1(a0,200);
+        if (m < n) nm = m;
+        else m = msta2(a0,n,15);
+        cf2 = czero;
+        cf1 = complex<P> (1.0e-100,0.0);
+        for (k=m;k>=0;k--) {
+            cf = 2.0*(k+1.0)*cf1/z-cf2;
+            if (k <=nm) cj[k] = cf;
+            cf2 = cf1;
+            cf1 = cf;
+        }
+        // Normalise against whichever of the order-0/1 values is larger in
+        // magnitude (cf holds the unscaled order 0, cf2 the unscaled order 1).
+        if (abs(cbj0) > abs(cbj1)) cs = cbj0/cf;
+        else cs = cbj1/cf2;
+        for (k=0;k<=nm;k++) {
+            cj[k] *= cs;
+        }
+    }
+    for (k=2;k<=nm;k++) {
+        cjp[k] = cj[k-1]-(P)k*cj[k]/z;
+    }
+    // Upward recurrence for cy; lb remembers the last order whose magnitude
+    // is below both |cy[0]| and the magnitude two orders earlier.
+    ya0 = abs(cby0);
+    lb = 0;
+    cg0 = cby0;
+    cg1 = cby1;
+    for (k=2;k<=nm;k++) {
+        cyk = 2.0*(k-1.0)*cg1/z-cg0;
+        yak = abs(cyk);
+        ya1 = abs(cg0);
+        if ((yak < ya0) && (yak < ya1)) lb = k;
+        cy[k] = cyk;
+        cg0 = cg1;
+        cg1 = cyk;
+    }
+    lb0 = 0;
+    if ((lb > 4) && (imag(z) != 0.0)) {
+        while (lb != lb0) {
+            // Two auxiliary downward recurrences from order lb with
+            // different seeds supply the cp coefficients used below.
+            ch2 = cone;
+            ch1 = czero;
+            lb0 = lb;
+            for (k=lb;k>=1;k--) {
+                ch0 = 2.0*k*ch1/z-ch2;
+                ch2 = ch1;
+                ch1 = ch0;
+            }
+            cp12 = ch0;
+            cp22 = ch2;
+            ch2 = czero;
+            ch1 = cone;
+            for (k=lb;k>=1;k--) {
+                ch0 = 2.0*k*ch1/z-ch2;
+                ch2 = ch1;
+                ch1 = ch0;
+            }
+            cp11 = ch0;
+            cp21 = ch2;
+            if (lb == nm)
+                cj[lb+1] = 2.0*lb*cj[lb]/z-cj[lb-1];
+            // Recompute cy at lb and lb+1 from cj and the larger of the
+            // order-0/1 cj values.
+            if (abs(cj[0]) > abs(cj[1])) {
+                cy[lb+1] = (cj[lb+1]*cby0-2.0*cp11/(M_PI*z))/cj[0];
+                cy[lb] = (cj[lb]*cby0+2.0*cp12/(M_PI*z))/cj[0];
+            }
+            else {
+                cy[lb+1] = (cj[lb+1]*cby1-2.0*cp21/(M_PI*z))/cj[1];
+                cy[lb] = (cj[lb]*cby1+2.0*cp22/(M_PI*z))/cj[1];
+            }
+            // Refill cy below lb by downward recurrence ...
+            cyl2 = cy[lb+1];
+            cyl1 = cy[lb];
+            for (k=lb-1;k>=0;k--) {
+                cylk = 2.0*(k+1.0)*cyl1/z-cyl2;
+                cy[k] = cylk;
+                cyl2 = cyl1;
+                cyl1 = cylk;
+            }
+            // ... and above lb+1 by upward recurrence.
+            cyl1 = cy[lb];
+            cyl2 = cy[lb+1];
+            for (k=lb+1;k<n;k++) {
+                cylk = 2.0*k*cyl2/z-cyl1;
+                cy[k+1] = cylk;
+                cyl1 = cyl2;
+                cyl2 = cylk;
+            }
+            // Look for the dip again; the outer loop ends once it is stable.
+            for (k=2;k<=nm;k++) {
+                wa = abs(cy[k]);
+                if (wa < abs(cy[k-1])) lb = k;
+            }
+        }
+    }
+    for (k=2;k<=nm;k++) {
+        cyp[k] = cy[k-1]-(P)k*cy[k]/z;
+    }
+    return 0;
+}
+
+template<typename P>
+int cbessjynb(int n,complex<P> z,int &nm,complex<P> *cj,
+    complex<P> *cy,complex<P> *cjp,complex<P> *cyp)
+{   /* Complex-argument sequence evaluation: fills cj, cy, cjp, cyp for
+     * orders 0..nm at z and always returns 0 (n is not range checked
+     * here). nm starts as n; in the first branch below it is raised to 1
+     * when n == 0 and lowered to msta1(|z|,200) when that is smaller.
+     *
+     *   |z| < 1e-100 : fixed limiting values, with +/-1e308 in the real
+     *                  part of the cy and cyp entries.
+     *   |z| <= 300 or n > (int)(0.25 |z|) : cj is built by downward
+     *                  recurrence from order m and divided by cs0. When
+     *                  |imag(z)| > 1 the normalising sum uses alternating
+     *                  signs and is divided by cos(z). cy[0] and cy[1]
+     *                  come from the accumulated sums csu and csv.
+     *   otherwise    : orders 0 and 1 come from four-term a, b, a1, b1
+     *                  expansions and cj is carried upward by recurrence.
+     *
+     * cy[k] for k >= 2 is then derived from cj and a lower-order cy value
+     * (the formula is picked by comparing |cj[k-1]| with |cj[k-2]|), and
+     * the derivative arrays are formed from neighbouring orders.
+     * cone, czero and el are defined outside this function.
+     * NOTE(review): index 1 of every output array is written even when
+     * n == 0, so callers presumably must supply at least two elements.
+     */
+    complex<P> cf,cf0,cf1,cf2,cbs,csu,csv,cs0,ce;
+    complex<P> ct1,cp0,cq0,cp1,cq1,cu,cbj0,cby0,cbj1,cby1;
+    complex<P> cyy,cbjk,ct2;
+    P a0,y0;
+    int k,m;
+    static P a[] = {
+        -0.7031250000000000e-1,
+         0.1121520996093750,
+        -0.5725014209747314,
+         6.074042001273483};
+    static P b[] = {
+         0.7324218750000000e-1,
+        -0.2271080017089844,
+         1.727727502584457,
+        -2.438052969955606e1};
+    static P a1[] = {
+         0.1171875,
+        -0.1441955566406250,
+         0.6765925884246826,
+        -6.883914268109947};
+    static P b1[] = {
+       -0.1025390625,
+        0.2775764465332031,
+       -1.993531733751297,
+        2.724882731126854e1};
+
+    y0 = abs(imag(z));
+    a0 = abs(z);
+    nm = n;
+    if (a0 < 1.0e-100) {
+        for (k=0;k<=n;k++) {
+            cj[k] = czero;
+            cy[k] = complex<P> (-1e308,0);
+            cjp[k] = czero;
+            cyp[k] = complex<P>(1e308,0);
+        }
+        cj[0] = cone;
+        cjp[1] = complex<P>(0.5,0.0);
+        return 0;
+    }
+    if ((a0 <= 300.0) || (n > (int)(0.25*a0))) {
+        if (n == 0) nm = 1; // order 1 is needed below for cy[1]
+        m = msta1(a0,200);
+        if (m < nm) nm = m;
+        else m = msta2(a0,nm,15);
+        cbs = czero;
+        csu = czero;
+        csv = czero;
+        cf2 = czero;
+        cf1 = complex<P> (1.0e-100,0.0); // arbitrary tiny seed; results are divided by cs0 later
+        for (k=m;k>=0;k--) {
+            cf = 2.0*(k+1.0)*cf1/z-cf2;
+            if (k <= nm) cj[k] = cf;
+            if (((k & 1) == 0) && (k != 0)) { // even k other than 0
+                if (y0 <= 1.0) {
+                    cbs += 2.0*cf;
+                }
+                else {
+                    cbs += (-1)*((k & 2)-1)*2.0*cf; // sign is +1 when bit 1 of k is clear, -1 when set
+                }
+                csu += (P)((-1)*((k & 2)-1))*cf/(P)k;
+            }
+            else if (k > 1) { // odd k >= 3
+                csv += (P)((-1)*((k & 2)-1)*k)*cf/(P)(k*k-1.0);
+            }
+            cf2 = cf1;
+            cf1 = cf;
+        }
+        if (y0 <= 1.0) cs0 = cbs+cf; // cf holds the unscaled order-0 value after the loop
+        else cs0 = (cbs+cf)/cos(z);
+        for (k=0;k<=nm;k++) {
+            cj[k] /= cs0;
+        }
+        ce = log(0.5*z)+el;
+        cy[0] = M_2_PI*(ce*cj[0]-4.0*csu/cs0);
+        cy[1] = M_2_PI*(-cj[0]/z+(ce-1.0)*cj[1]-4.0*csv/cs0);
+    }
+    else {
+        ct1 = z-M_PI_4;
+        cp0 = cone;
+        for (k=0;k<4;k++) {
+            cp0 += a[k]*pow(z,-2.0*k-2.0);
+        }
+        cq0 = -0.125/z;
+        for (k=0;k<4;k++) {
+            cq0 += b[k] *pow(z,-2.0*k-3.0);
+        }
+        cu = sqrt(M_2_PI/z);
+        cbj0 = cu*(cp0*cos(ct1)-cq0*sin(ct1));
+        cby0 = cu*(cp0*sin(ct1)+cq0*cos(ct1));
+        cj[0] = cbj0;
+        cy[0] = cby0;
+        ct2 = z-0.75*M_PI;
+        cp1 = cone;
+        for (k=0;k<4;k++) {
+            cp1 += a1[k]*pow(z,-2.0*k-2.0);
+        }
+        cq1 = 0.375/z;
+        for (k=0;k<4;k++) {
+            cq1 += b1[k]*pow(z,-2.0*k-3.0);
+        }
+        cbj1 = cu*(cp1*cos(ct2)-cq1*sin(ct2));
+        cby1 = cu*(cp1*sin(ct2)+cq1*cos(ct2));
+        cj[1] = cbj1;
+        cy[1] = cby1;
+        for (k=2;k<=n;k++) { // upward recurrence for cj (nm == n in this branch)
+            cbjk = 2.0*(k-1.0)*cbj1/z-cbj0;
+            cj[k] = cbjk;
+            cbj0 = cbj1;
+            cbj1 = cbjk;
+        }
+    }
+    cjp[0] = -cj[1];
+    for (k=1;k<=nm;k++) {
+        cjp[k] = cj[k-1]-(P)k*cj[k]/z;
+    }
+    if (abs(cj[0]) > 1.0)
+        cy[1] = (cj[1]*cy[0]-2.0/(M_PI*z))/cj[0]; // replaces cy[1] using cj[0], cj[1] and cy[0]
+    for (k=2;k<=nm;k++) {
+        if (abs(cj[k-1]) >= abs(cj[k-2])) // divide by the larger-magnitude cj term
+            cyy = (cj[k]*cy[k-1]-2.0/(M_PI*z))/cj[k-1];
+        else
+            cyy = (cj[k]*cy[k-2]-4.0*(k-1.0)/(M_PI*z*z))/cj[k-2];
+        cy[k] = cyy;
+    }
+    cyp[0] = -cy[1];
+    for (k=1;k<=nm;k++) {
+        cyp[k] = cy[k-1]-(P)k*cy[k]/z;
+    }
+
+    return 0;
+}
+
+/// Computes the Bessel functions Jv(z) and Yv(z) and their first derivatives for a complex
+/// argument and the sequence of orders v0, v0+1, ..., v0+n, where n = (int)v and v0 = v - n.
+/// @param v is the highest order requested
+/// @param z is the complex argument
+/// @param vm receives the highest order actually computed (n + v0; n can be reduced internally)
+/// @param cjv receives Jv(z); element k holds the value for order v0 + k
+/// @param cyv receives Yv(z), indexed like cjv
+/// @param cjvp receives the derivative Jv'(z), indexed like cjv
+/// @param cyvp receives the derivative Yv'(z), indexed like cjv
+/// @return always 0
+/// Indices up to n + 1 can be written (and index 1 is written even when n is 0), so every
+/// output array must provide at least n + 2 elements.
+template<typename P>
+int cbessjyva(P v,complex<P> z,P &vm,complex<P>*cjv,
+    complex<P>*cyv,complex<P>*cjvp,complex<P>*cyvp)
+{
+    complex<P> z1,z2,zk,cjvl,cr,ca,cjv0,cjv1,cpz,crp;
+    complex<P> cqz,crq,ca0,cck,csk,cyv0,cyv1,cju0,cju1,cb;
+    complex<P> cs,cs0,cr0,cs1,cr1,cec,cf,cf0,cf1,cf2;
+    complex<P> cfac0,cfac1,cg0,cg1,cyk,cp11,cp12,cp21,cp22;
+    complex<P> ch0,ch1,ch2,cyl1,cyl2,cylk;
+
+    P a0,v0,pv0,pv1,vl,ga,gb,vg,vv,w0,w1,ya0,yak,ya1,wa;
+    int j,n,k,kz,l,lb,lb0,m;
+
+    a0 = abs(z);
+    z1 = z;
+    z2 = z*z;
+    n = (int)v;                 //integer part of the order
+
+
+    v0 = v-n;                   //fractional part of the order
+
+    pv0 = M_PI*v0;
+    pv1 = M_PI*(1.0+v0);
+
+    //special case: |z| is effectively zero, so fill in the limiting values and return
+    if (a0 < 1.0e-100) {
+        for (k=0;k<=n;k++) {
+            cjv[k] = czero;
+            cyv[k] = complex<P> (-1e308,0);
+            cjvp[k] = czero;
+            cyvp[k] = complex<P> (1e308,0);
+
+        }
+        if (v0 == 0.0) {
+            cjv[0] = cone;
+            cjvp[1] = complex<P> (0.5,0.0);
+        }
+        else {
+            cjvp[0] = complex<P> (1e308,0);
+        }
+        vm = v;
+        return 0;
+    }
+
+    //work in the right half plane; the result is mapped back for real(z) < 0 further below
+    if (real(z1) < 0.0) z1 = -z;
+
+    //starting values for the two lowest orders (v0 and v0 + 1)
+    if (a0 <= 12.0) {
+        //|z| <= 12: power series for J
+        for (l=0;l<2;l++) {
+            vl = v0+l;
+            cjvl = cone;
+            cr = cone;
+            for (k=1;k<=40;k++) {
+                cr *= -0.25*z2/(k*(k+vl));
+                cjvl += cr;
+                if (abs(cr) < abs(cjvl)*eps) break;
+            }
+            vg = 1.0 + vl;
+            ga = gamma(vg);
+            ca = pow(0.5*z1,vl)/ga;
+            if (l == 0) cjv0 = cjvl*ca;
+            else cjv1 = cjvl*ca;
+        }
+    }
+    else {
+        //|z| > 12: asymptotic expansion for both J and Y, using kz terms
+        if (a0 >= 50.0) kz = 8;
+        else if (a0 >= 35.0) kz = 10;
+        else kz = 11;
+        for (j=0;j<2;j++) {
+            vv = 4.0*(j+v0)*(j+v0);
+            cpz = cone;
+            crp = cone;
+            for (k=1;k<=kz;k++) {
+                crp = -0.78125e-2*crp*(vv-pow(4.0*k-3.0,2.0))*
+                    (vv-pow(4.0*k-1.0,2.0))/(k*(2.0*k-1.0)*z2);
+                cpz += crp;
+            }
+            cqz = cone;
+            crq = cone;
+            for (k=1;k<=kz;k++) {
+                crq = -0.78125e-2*crq*(vv-pow(4.0*k-1.0,2.0))*
+                    (vv-pow(4.0*k+1.0,2.0))/(k*(2.0*k+1.0)*z2);
+                cqz += crq;
+            }
+            cqz *= 0.125*(vv-1.0)/z1;
+            zk = z1-(0.5*(j+v0)+0.25)*M_PI;
+            ca0 = sqrt(M_2_PI/z1);
+            cck = cos(zk);
+            csk = sin(zk);
+            if (j == 0) {
+                cjv0 = ca0*(cpz*cck-cqz*csk);
+                //Y = ca0*(P*sin + Q*cos); the Q and cos terms are multiplied, exactly as in the j == 1 branch
+                cyv0 = ca0*(cpz*csk+cqz*cck);
+            }
+            else {
+                cjv1 = ca0*(cpz*cck-cqz*csk);
+                cyv1 = ca0*(cpz*csk+cqz*cck);
+            }
+        }
+    }
+
+    //|z| <= 12: starting values for Y
+    if (a0 <= 12.0) {
+        if (v0 != 0.0) {
+            //non-integer order: build the series with -vl and combine it with J
+            for (l=0;l<2;l++) {
+                vl = v0+l;
+                cjvl = cone;
+                cr = cone;
+                for (k=1;k<=40;k++) {
+                    cr *= -0.25*z2/(k*(k-vl));
+                    cjvl += cr;
+                    if (abs(cr) < abs(cjvl)*eps) break;
+                }
+                vg = 1.0-vl;
+                gb = gamma(vg);
+                cb = pow(2.0/z1,vl)/gb;
+                if (l == 0) cju0 = cjvl*cb;
+                else cju1 = cjvl*cb;
+            }
+            cyv0 = (cjv0*cos(pv0)-cju0)/sin(pv0);
+            cyv1 = (cjv1*cos(pv1)-cju1)/sin(pv1);
+        }
+        else {
+            //integer order: logarithmic series for Y0 and Y1
+            cec = log(0.5*z1)+el;
+            cs0 = czero;
+            w0 = 0.0;
+            cr0 = cone;
+            for (k=1;k<=30;k++) {
+                w0 += 1.0/k;
+                cr0 *= -0.25*z2/(P)(k*k);
+                cs0 += cr0*w0;
+            }
+            cyv0 = M_2_PI*(cec*cjv0-cs0);
+            cs1 = cone;
+            w1 = 0.0;
+            cr1 = cone;
+            for (k=1;k<=30;k++) {
+                w1 += 1.0/k;
+                cr1 *= -0.25*z2/(k*(k+1.0));
+                cs1 += cr1*(2.0*w1+1.0/(k+1.0));
+            }
+            cyv1 = M_2_PI*(cec*cjv1-1.0/z1-0.25*z1*cs1);
+        }
+    }
+
+    //the starting values were computed at z1 = -z; map them back to the left half plane
+    if (real(z) < 0.0) {
+        cfac0 = exp(pv0*cii);
+        cfac1 = exp(pv1*cii);
+        if (imag(z) < 0.0) {
+            cyv0 = cfac0*cyv0-(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
+            cyv1 = cfac1*cyv1-(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
+            cjv0 /= cfac0;
+            cjv1 /= cfac1;
+        }
+        else if (imag(z) > 0.0) {
+            cyv0 = cyv0/cfac0+(P)2.0*(complex<P>)cii*cos(pv0)*cjv0;
+            cyv1 = cyv1/cfac1+(P)2.0*(complex<P>)cii*cos(pv1)*cjv1;
+            cjv0 *= cfac0;
+            cjv1 *= cfac1;
+        }
+    }
+    cjv[0] = cjv0;
+    cjv[1] = cjv1;
+
+    //higher orders of J
+    if ((n >= 2) && (n <= (int)(0.25*a0))) {
+        //forward recurrence when the order is small compared to |z|
+        cf0 = cjv0;
+        cf1 = cjv1;
+        for (k=2;k<= n;k++) {
+            cf = 2.0*(k+v0-1.0)*cf1/z-cf0;
+            cjv[k] = cf;
+            cf0 = cf1;
+            cf1 = cf;
+        }
+    }
+    else if (n >= 2) {
+        //backward recurrence from the starting index m, rescaled afterwards with cs
+        m = msta1(a0,200);
+        if (m < n) n = m;
+        else  m = msta2(a0,n,15);
+        cf2 = czero;
+        cf1 = complex<P>(1.0e-100,0.0);
+        for (k=m;k>=0;k--) {
+            cf = 2.0*(v0+k+1.0)*cf1/z-cf2;
+            if (k <= n) cjv[k] = cf;
+            cf2 = cf1;
+            cf1 = cf;
+        }
+        //normalize against whichever starting value has the larger magnitude
+        if (abs(cjv0) > abs(cjv1)) cs = cjv0/cf;
+        else cs = cjv1/cf2;
+        for (k=0;k<=n;k++) {
+            cjv[k] *= cs;
+        }
+    }
+
+    //derivatives of J
+    cjvp[0] = v0*cjv[0]/z-cjv[1];
+    for (k=1;k<=n;k++) {
+        cjvp[k] = -(k+v0)*cjv[k]/z+cjv[k-1];
+    }
+
+    //higher orders of Y by forward recurrence; lb records the last index where |Y| dropped
+    //below both |cyv0| and the magnitude two steps earlier
+    cyv[0] = cyv0;
+    cyv[1] = cyv1;
+    ya0 = abs(cyv0);
+    lb = 0;
+    cg0 = cyv0;
+    cg1 = cyv1;
+    for (k=2;k<=n;k++) {
+        cyk = 2.0*(v0+k-1.0)*cg1/z-cg0;
+        yak = abs(cyk);
+        ya1 = abs(cg0);
+        if ((yak < ya0) && (yak< ya1)) lb = k;
+        cyv[k] = cyk;
+        cg0 = cg1;
+        cg1 = cyk;
+    }
+
+    //for complex z with such a dip beyond index 4, rebuild cyv around lb from cjv and the
+    //starting values, then recur outwards in both directions; repeat until lb stops moving
+    lb0 = 0;
+    if ((lb > 4) && (imag(z) != 0.0)) {
+        while(lb != lb0) {
+            ch2 = cone;
+            ch1 = czero;
+            lb0 = lb;
+            for (k=lb;k>=1;k--) {
+                ch0 = 2.0*(k+v0)*ch1/z-ch2;
+                ch2 = ch1;
+                ch1 = ch0;
+            }
+            cp12 = ch0;
+            cp22 = ch2;
+            ch2 = czero;
+            ch1 = cone;
+            for (k=lb;k>=1;k--) {
+                ch0 = 2.0*(k+v0)*ch1/z-ch2;
+                ch2 = ch1;
+                ch1 = ch0;
+            }
+            cp11 = ch0;
+            cp21 = ch2;
+            //this step needs J at index lb + 1, which is one past the last computed order when lb == n
+            if (lb == n)
+                cjv[lb+1] = 2.0*(lb+v0)*cjv[lb]/z-cjv[lb-1];
+            if (abs(cjv[0]) > abs(cjv[1])) {
+                cyv[lb+1] = (cjv[lb+1]*cyv0-2.0*cp11/(M_PI*z))/cjv[0];
+                cyv[lb] = (cjv[lb]*cyv0+2.0*cp12/(M_PI*z))/cjv[0];
+            }
+            else {
+                cyv[lb+1] = (cjv[lb+1]*cyv1-2.0*cp21/(M_PI*z))/cjv[1];
+                cyv[lb] = (cjv[lb]*cyv1+2.0*cp22/(M_PI*z))/cjv[1];
+            }
+            //backward recurrence from lb down to 0
+            cyl2 = cyv[lb+1];
+            cyl1 = cyv[lb];
+            for (k=lb-1;k>=0;k--) {
+                cylk = 2.0*(k+v0+1.0)*cyl1/z-cyl2;
+                cyv[k] = cylk;
+                cyl2 = cyl1;
+                cyl1 = cylk;
+            }
+            //forward recurrence from lb + 1 up to n
+            cyl1 = cyv[lb];
+            cyl2 = cyv[lb+1];
+            for (k=lb+1;k<n;k++) {
+                cylk = 2.0*(k+v0)*cyl2/z-cyl1;
+                cyv[k+1] = cylk;
+                cyl1 = cyl2;
+                cyl2 = cylk;
+            }
+            //find the last index at which |Y| decreases; a change restarts the loop
+            for (k=2;k<=n;k++) {
+                wa = abs(cyv[k]);
+                if (wa < abs(cyv[k-1])) lb = k;
+            }
+        }
+    }
+
+    //derivatives of Y
+    cyvp[0] = v0*cyv[0]/z-cyv[1];
+    for (k=1;k<=n;k++) {
+        cyvp[k] = cyv[k-1]-(k+v0)*cyv[k]/z;
+    }
+    vm = n+v0;
+    return 0;
+}
+
+/// Computes the spherical Bessel functions and their derivatives for orders 0 through v by
+/// evaluating cbessjyva at the orders 0.5, 1.5, ..., v + 0.5 and rescaling the results in place.
+/// The output arrays are indexed by the integer order; the value of cbessjyva's vm is passed through.
+/// @return always 0
+template<typename P>
+int cbessjyva_sph(int v,complex<P> z,P &vm,complex<P>*cjv,
+    complex<P>*cyv,complex<P>*cjvp,complex<P>*cyvp)
+{
+    //cylindrical functions of fractional (half-integer) order
+    cbessjyva<P>(v + 0.5, z, vm, cjv, cyv, cjvp, cyvp);
+
+    //both factors depend only on z, so they are evaluated once for all orders
+    const complex<P> scale = sqrt(stim::PI/(z * 2.0));         //sqrt(pi / (2z))
+    const complex<P> dcoef = -1.0 / (z * 2.0);                 //-1 / (2z)
+
+    for(int order = 0; order <= v; ++order)
+    {
+        //function values: multiply by the scale factor
+        cjv[order] = cjv[order] * scale;
+        cyv[order] = cyv[order] * scale;
+
+        //derivatives: product rule, using the already scaled function values
+        cjvp[order] = dcoef * cjv[order] + cjvp[order] * scale;
+        cyvp[order] = dcoef * cyv[order] + cyvp[order] * scale;
+    }
+
+    return 0;
+}
+
+}	//end namespace rts
+
+
+#endif
@@ -17,10 +17,10 @@ static complex&lt;double&gt; czero(0.0,0.0);
 template< typename P >
 P gamma(P x)
 {
-	const P EPS = numeric_limits<P>::epsilon();
-	const P FPMIN_MAG = numeric_limits<P>::min();
-	const P FPMIN = numeric_limits<P>::lowest();
-	const P FPMAX = numeric_limits<P>::max();
+	const P EPS = std::numeric_limits<P>::epsilon();
+	const P FPMIN_MAG = std::numeric_limits<P>::min();
+	const P FPMIN = std::numeric_limits<P>::lowest();
+	const P FPMAX = std::numeric_limits<P>::max();
     int i,k,m;
     P ga,gr,r,z;
@@ -94,10 +94,10 @@ template&lt;typename P&gt;
 int bessjy01a(P x,P &j0,P &j1,P &y0,P &y1,
     P &j0p,P &j1p,P &y0p,P &y1p)
 {
-	const P EPS = numeric_limits<P>::epsilon();
-	const P FPMIN_MAG = numeric_limits<P>::min();
-	const P FPMIN = numeric_limits<P>::lowest();
-	const P FPMAX = numeric_limits<P>::max();
+	const P EPS = std::numeric_limits<P>::epsilon();
+	const P FPMIN_MAG = std::numeric_limits<P>::min();
+	const P FPMIN = std::numeric_limits<P>::lowest();
+	const P FPMAX = std::numeric_limits<P>::max();
     P x2,r,ec,w0,w1,r0,r1,cs0,cs1;
     P cu,p0,q0,p1,q1,t1,t2;
@@ -606,10 +606,10 @@ int bessjyv(P v,P x,P &amp;vm,P *jv,P *yv,
     P b,ec,w0,w1,bju0,bju1,pv0,pv1,byvk;
     int j,k,l,m,n,kz;
-	const P EPS = numeric_limits<P>::epsilon();
-	const P FPMIN_MAG = numeric_limits<P>::min();
-	const P FPMIN = numeric_limits<P>::lowest();
-	const P FPMAX = numeric_limits<P>::max();
+	const P EPS = std::numeric_limits<P>::epsilon();
+	const P FPMIN_MAG = std::numeric_limits<P>::min();
+	const P FPMIN = std::numeric_limits<P>::lowest();
+	const P FPMAX = std::numeric_limits<P>::max();
     x2 = x*x;
     n = (int)v;
@@ -1508,6 +1508,8 @@ int cbessjyva(P v,complex&lt;P&gt; z,P &amp;vm,complex&lt;P&gt;*cjv,
     return 0;
 }
+///Calculate the spherical bessel functions and their derivatives up to order v
+/// When allocating arrays to store the resulting values, arrays must be of size [v+2]
 template<typename P>
 int cbessjyva_sph(int v,complex<P> z,P &vm,complex<P>*cjv,
     complex<P>*cyv,complex<P>*cjvp,complex<P>*cyvp)
@@ -18,13 +18,14 @@ class circle : plane&lt;T&gt;
 private:
-	stim::vec3<T> Y;
+	//stim::vec3<T> Y;
+	T R;								//radius of the circle
-	CUDA_CALLABLE void
+	/*CUDA_CALLABLE void
 	init()
 	{
 		Y = U.cross(N).norm();
-	}
+	}*/
 public:
 	using stim::plane<T>::n;
@@ -34,6 +35,8 @@ public:
 	using stim::plane<T>::rotate;
 	using stim::plane<T>::setU;
+	using stim::plane<T>::init;
+
 	///base constructor
 	///@param th value of the angle of the starting point from 0 to 360.
 	CUDA_CALLABLE
@@ -42,26 +45,28 @@ public:
 		init();
 	}
-	///create a rectangle given a size and position in Z space.
+	///create a circle given a size and position in Z space.
 	///@param size: size of the rectangle in ND space.
 	///@param z_pos z coordinate of the rectangle.
 	CUDA_CALLABLE
-	circle(T size, T z_pos = (T)0) : plane<T>()
+	circle(T radius, T z_pos = (T)0) : plane<T>(z_pos)
 	{
-		center(stim::vec3<T>(0,0,z_pos));
-		scale(size);
-		init();
+		//center(stim::vec3<T>(0, 0, z_pos));
+		//scale(size);
+		//init();
+		R = radius;
 	}
 	///create a rectangle from a center point, normal
 	///@param c: x,y,z location of the center.
 	///@param n: x,y,z direction of the normal.	
 	CUDA_CALLABLE
-	circle(vec3<T> c, vec3<T> n = vec3<T>(0,0,1)) : plane<T>()
+	circle(vec3<T> c, vec3<T> n = vec3<T>(0,0,1)) : plane<T>(n, c)
 	{
-		center(c);
-		normal(n);
-		init();
+		//center(c);
+		//normal(n);
+		//init();
+		R = (T)1;
 	}
 	/*///create a rectangle from a center point, normal, and size
@@ -84,14 +89,18 @@ public:
 	///@param n: x,y,z direction of the normal.
 	///@param u: x,y,z direction for the zero vector (from where the rotation starts)
 	CUDA_CALLABLE
-	circle(vec3<T> c, T s, vec3<T> n = vec3<T>(0,0,1), vec3<T> u = vec3<T>(1, 0, 0)) : plane<T>()
+	circle(vec3<T> c, T r, vec3<T> n = vec3<T>(0,0,1), vec3<T> u = vec3<T>(1, 0, 0)) : plane<T>()
 	{
-		init();
+		P = c;
+		N = n;
 		setU(u);
+		R = r;
+		//init();
+		//setU(u);
 //		U = u;
-		center(c);
-		normal(n);
-		scale(s);
+		//center(c);
+		//normal(n);
+		//scale(s);
 	}
 	///scales the circle by a certain factor
@@ -99,86 +108,111 @@ public:
 	CUDA_CALLABLE
 	void scale(T factor)
 	{
-		U *= factor;
-		Y *= factor;
+		//U *= factor;
+		//Y *= factor;
+		R *= factor;
+	}
+
+	///set the radius of circle to a certain value 
+	///@param value: the new radius of the circle
+	CUDA_CALLABLE
+	void set_R(T value) 
+	{
+		R = value;
 	}
 	///sets the normal for the cirlce
 	///@param n: x,y,z direction of the normal.
 	CUDA_CALLABLE void
-	normal(vec3<T> n)
-	{
-		rotate(n, Y);
+	normal(vec3<T> n){
+		rotate(n);
 	}
 	///sets the center of the circle.
 	///@param n: x,y,z location of the center.
 	CUDA_CALLABLE void
 	center(vec3<T> p){
-		this->P = p;
+		P = p;
 	}
 	///boolean comparison
 	bool
 	operator==(const circle<T> & rhs)
 	{
-		if(P == rhs.P && U == rhs.U && Y == rhs.Y)
+		if(P == rhs.P && U == rhs.U)
 			return true;
 		else
 			return false;
 	}
+	//returns the point in world space corresponding to the polar coordinate (r, theta)
+	CUDA_CALLABLE stim::vec3<T>
+		p(T r, T theta) {
+		T u = r * cos(theta);				//calculate the coordinates in the planar space defined by the circle
+		T v = r * sin(theta);
+
+		vec3<T> V = U.cross(N);				//calculate the orthogonal vector V
+		return P + U * u + V * v;			//calculate the cartesian coordinate of the specified point
+	}
+
+	//returns the point in world space corresponding to the value theta at radius R
+	CUDA_CALLABLE stim::vec3<T>
+		p(T theta) {
+		return p(R, theta);
+	}
+
+	//get the world space value given the polar coordinates (r, theta)
+
 	///get the world space value given the planar coordinates a, b in [0, 1]
-	CUDA_CALLABLE stim::vec3<T> p(T a, T b)
+	/*CUDA_CALLABLE stim::vec3<T> p(T a, T b)
 	{
 		stim::vec3<T> result;
 		vec3<T> A = this->P - this->U * (T)0.5 - Y * (T)0.5;
 		result = A + this->U * a + Y * b;
 		return result;
-	}
+	}*/
 	///parenthesis operator returns the world space given rectangular coordinates a and b in [0 1]
-	CUDA_CALLABLE stim::vec3<T> operator()(T a, T b)
+	CUDA_CALLABLE stim::vec3<T> operator()(T r, T theta)
 	{
-		return p(a,b);
+		return p(r, theta);
+	}
+
+	//parenthesis operator returns the world space coordinate at the edge of the circle given theta
+	CUDA_CALLABLE stim::vec3<T> operator()(T theta) {
+		return p(theta);
 	}
 	///returns a vector with the points on the initialized circle.
 	///connecting the points results in a circle.
 	///@param n: integer for the number of points representing the circle.
-	std::vector<stim::vec3<T> >
-	getPoints(int n)
-	{
-		std::vector<stim::vec3<T> > result;
-		stim::vec3<T> point;
-		T x,y;
-		float step = 360.0/(float) n;
-		for(float j = 0; j <= 360.0; j += step)
-		{
-			y = 0.5*cos(j*stim::TAU/360.0)+0.5;
-			x = 0.5*sin(j*stim::TAU/360.0)+0.5;
-			result.push_back(p(x,y));
-		}
+	std::vector< stim::vec3<T> > points(unsigned n) {
+		std::vector< stim::vec3<T> > result(n);				//initialize a vector of n points
+
+		float dt = stim::TAU / n;
+		for (unsigned i = 0; i < n; i++)
+			result[i] = p(i * dt);							//calculate a point on the edge of the circle
 		return result;
 	}	
-	
-	///returns a vector with the points on the initialized circle.
-	///connecting the points results in a circle.
-	///@param n: integer for the number of points representing the circle.
-	CUDA_CALLABLE stim::vec3<T>
-	p(T theta)
-	{
-		T x,y;
-		y = 0.5*cos(theta*STIM_TAU/360.0)+0.5;
-		x = 0.5*sin(theta*STIM_TAU/360.0)+0.5;
-		return p(x,y);
-	}
+	///returns a vector with the points on the initialized circle
+	///connecting the points results in a circle
+	///@param n: integer for the number of points representing the circle
+	///the only difference between points and glpoints is that the first point appears twice in the returning lists
+	std::vector< stim::vec3<T> > glpoints(unsigned n) {
+		std::vector< stim::vec3<T> > result(n + 1);
+		float dt = stim::TAU / n;
+		for (unsigned i = 0; i < n; i++)
+			result[i] = p(i * dt);
+		result[n] = p(0);									//close the circle!
+		return result;
+	}
+	
 	std::string str() const
 	{
 		std::stringstream ss;
-		ss << "(P=" << P.str() << ", N=" << N.str() << ", U=" << U.str() << ", Y=" << Y.str();
+		ss << "r = "<<R<<"  (P=" << P.str() << ", N=" << N.str() << ", U=" << U.str() << ")";
 		return ss.str();
 	}
+#ifndef STIM_CIRCLE_H
+#define STIM_CIRCLE_H
+
+#include <stim/cuda/cudatools/callable.h>
+#include <stim/math/plane.h>
+#include <stim/math/vector.h>
+#include <stim/math/triangle.h>
+#include <stim/math/constants.h>
+#include <assert.h>
+#include <algorithm>
+#include <iostream>
+
+namespace stim{
+
+///A circle embedded in 3D space, built on top of stim::plane<T>.
+///The inherited members P, N and U are used together with a second in-plane axis Y = U x N.
+///Planar coordinates (a, b) map to P - U/2 - Y/2 + U*a + Y*b, so (0.5, 0.5) is the point P
+///and the circle sampled by getPoints()/p(theta) spans one full length of U and Y.
+template <typename T>
+class circle : plane<T>
+{
+
+private:
+	
+	stim::vec3<T> Y;						//second in-plane axis (see init())
+
+	///recomputes Y as the normalized cross product of U and N
+	CUDA_CALLABLE void
+	init()
+	{
+		Y = U.cross(N).norm();
+	}
+
+public:
+	using stim::plane<T>::n;
+	using stim::plane<T>::P;
+	using stim::plane<T>::N;
+	using stim::plane<T>::U;
+	using stim::plane<T>::rotate;
+	using stim::plane<T>::setU;
+
+	///base constructor: default plane with Y derived from its U and N
+	CUDA_CALLABLE
+	circle() : plane<T>()
+	{
+		init();
+	}
+
+	///create a circle given a size and position in Z space.
+	///@param size: scale factor applied to both in-plane axes of the circle.
+	///@param z_pos z coordinate of the circle center.
+	CUDA_CALLABLE
+	circle(T size, T z_pos = (T)0) : plane<T>()
+	{
+		center(stim::vec3<T>(0,0,z_pos));
+		init();								//Y has to exist before it is scaled: init() resets it to unit length
+		scale(size);
+	}
+
+	///create a circle from a center point and normal
+	///@param c: x,y,z location of the center.
+	///@param n: x,y,z direction of the normal.	
+	CUDA_CALLABLE
+	circle(vec3<T> c, vec3<T> n = vec3<T>(0,0,1)) : plane<T>()
+	{
+		center(c);
+		normal(n);
+		init();
+	}
+
+	/*///create a rectangle from a center point, normal, and size
+	///@param c: x,y,z location of the center.
+	///@param s: size of the rectangle.
+	///@param n: x,y,z direction of the normal.
+	CUDA_CALLABLE 
+	circle(vec3<T> c, T s, vec3<T> n = vec3<T>(0,0,1)) : plane<T>()
+	{
+		init();
+		center(c);
+		rotate(n, U, Y);
+		scale(s);
+	}
+	*/
+
+	///create a circle from a center point, size, normal, and zero direction
+	///@param c: x,y,z location of the center.
+	///@param s: scale factor applied to both in-plane axes of the circle.
+	///@param n: x,y,z direction of the normal.
+	///@param u: x,y,z direction for the zero vector (from where the rotation starts)
+	CUDA_CALLABLE
+	circle(vec3<T> c, T s, vec3<T> n = vec3<T>(0,0,1), vec3<T> u = vec3<T>(1, 0, 0)) : plane<T>()
+	{
+		setU(u);
+		init();								//derive Y from the U that was just set (computing it earlier left it stale)
+		center(c);
+		normal(n);
+		scale(s);
+	}
+
+	///scales the circle by a certain factor
+	///@param factor: the factor by which the dimensions of the shape are scaled.
+	CUDA_CALLABLE
+	void scale(T factor)
+	{
+		U *= factor;
+		Y *= factor;
+	}
+
+	///sets the normal for the circle
+	///@param n: x,y,z direction of the normal.
+	CUDA_CALLABLE void
+	normal(vec3<T> n)
+	{
+		rotate(n, Y);						//plane<T>::rotate; presumably re-orients N (and U, Y) towards n - confirm against plane.h
+	}
+
+	///sets the center of the circle.
+	///@param p: x,y,z location of the center.
+	CUDA_CALLABLE void
+	center(vec3<T> p){
+		this->P = p;
+	}
+
+	///boolean comparison: two circles are equal when P, U and Y all match
+	bool
+	operator==(const circle<T> & rhs)
+	{
+		return P == rhs.P && U == rhs.U && Y == rhs.Y;
+	}
+
+	///get the world space value given the planar coordinates a, b in [0, 1]
+	CUDA_CALLABLE stim::vec3<T> p(T a, T b)
+	{
+		//A is the corner of the bounding square that corresponds to (a, b) = (0, 0)
+		vec3<T> A = this->P - this->U * (T)0.5 - Y * (T)0.5;
+		return A + this->U * a + Y * b;
+	}
+
+	///parenthesis operator returns the world space given planar coordinates a and b in [0 1]
+	CUDA_CALLABLE stim::vec3<T> operator()(T a, T b)
+	{
+		return p(a,b);
+	}
+
+	///returns a vector with the points on the initialized circle.
+	///connecting the points results in a circle.
+	///@param n: integer for the number of segments representing the circle.
+	///The result holds n + 1 points: the last one repeats the first (angle 360) to close the loop.
+	///For n < 1 only the starting point (angle 0) is returned.
+	std::vector<stim::vec3<T> >
+	getPoints(int n)
+	{
+		std::vector<stim::vec3<T> > result;
+		if(n < 1)							//no valid angular step exists
+		{
+			result.push_back(p((T)0.5, (T)1));			//planar coordinates of angle 0: (0.5*sin(0)+0.5, 0.5*cos(0)+0.5)
+			return result;
+		}
+
+		result.reserve(n + 1);
+		//the angle is derived from the integer index so that rounding cannot change the point count
+		for(int i = 0; i <= n; i++)
+		{
+			double angle = stim::TAU * (double)i / (double)n;
+			T y = (T)(0.5*cos(angle)+0.5);
+			T x = (T)(0.5*sin(angle)+0.5);
+			result.push_back(p(x,y));
+		}
+		return result;
+	}	
+	
+	///returns the world space point on the circle at a given angle.
+	///@param theta: angle in degrees.
+	CUDA_CALLABLE stim::vec3<T>
+	p(T theta)
+	{
+		T x,y;
+		y = 0.5*cos(theta*STIM_TAU/360.0)+0.5;
+		x = 0.5*sin(theta*STIM_TAU/360.0)+0.5;
+		return p(x,y);
+	}
+
+	///returns a human-readable description of the vectors that define the circle
+	std::string str() const
+	{
+		std::stringstream ss;
+		ss << "(P=" << P.str() << ", N=" << N.str() << ", U=" << U.str() << ", Y=" << Y.str() << ")";
+		return ss.str();
+	}
+
+};
+}
+#endif
@@ -4,7 +4,6 @@
 #define STIM_PI		3.1415926535897932384626433832795028841971693993751058209749445923078164062862
 #define STIM_TAU	2 * STIM_PI
-#include "stim/cuda/cudatools/callable.h"
 namespace stim{
 	const double PI		=	3.1415926535897932384626433832795028841971693993751058209749445923078164062862;
 	const double TAU	=	2 * stim::PI;
+#ifndef STIM_CUDA_CONV2_H
+#define STIM_CUDA_CONV2_H
+//#define __CUDACC__
+
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+
+namespace stim {
+	//CUDA kernel for a "valid" 2D convolution: each thread produces one pixel of the
+	//	(sx - kx + 1) x (sy - ky + 1) output image. The input region needed by a block is first
+	//	copied into dynamically allocated shared memory.
+	template<typename T, typename K>
+	__global__ void kernel_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		extern __shared__ T s[];								//shared copy of the input region (size set at launch)
+
+		size_t out_w = sx - kx + 1;								//dimensions of the output image
+		size_t out_h = sy - ky + 1;
+
+		size_t block_x = blockIdx.x * blockDim.x;				//upper left corner of the region handled by this block
+		size_t block_y = blockIdx.y * blockDim.y;
+		if (block_x >= out_w || block_y >= out_h) return;		//the whole block lies outside of the output image
+
+		size_t lane = threadIdx.y * blockDim.x + threadIdx.x;	//linear thread index within the block
+		size_t lanes = blockDim.x * blockDim.y;					//number of threads in the block
+
+		//size of the shared region; min() clips blocks that touch the image boundary
+		size_t tile_w = min(blockDim.x + kx - 1, sx - block_x);
+		size_t tile_h = min(blockDim.y + ky - 1, sy - block_y);
+		stim::cuda::threadedMemcpy2D<T>(s, tile_w, tile_h, in, block_x, block_y, sx, sy, lane, lanes);
+		__syncthreads();										//the region must be complete before any thread reads it
+
+		size_t px = block_x + threadIdx.x;						//output pixel assigned to this thread
+		size_t py = block_y + threadIdx.y;
+		if (px >= out_w || py >= out_h) return;					//this thread has no output pixel
+
+		//accumulate the weighted sum over the kernel footprint, row by row
+		K acc = 0;
+		for (size_t row = 0; row < ky; row++) {
+			size_t tile_row = (threadIdx.y + row) * tile_w + threadIdx.x;	//start of this footprint row in shared memory
+			size_t kernel_row = row * kx;									//start of this row in the kernel
+			for (size_t col = 0; col < kx; col++)
+				acc += s[tile_row + col] * kernel[kernel_row + col];
+		}
+		out[py * out_w + px] = (T)acc;							//store the result in global memory
+	}
+
+	//Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
+	//	Terminates the program if the kernel is larger than the image, if a block does not fit
+	//	into shared memory, or if the kernel launch is rejected.
+	//@param out is a pointer to the output image, which is of size (sx - kx + 1) x (sy - ky + 1)
+	//@param in is a pointer to the input image
+	//@param kernel is a pointer to the kx x ky convolution kernel
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void gpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		if (kx > sx || ky > sy) {							//the unsigned output size below would wrap around
+			std::cout << "Error in stim::gpu_conv2() - the kernel is larger than the input image." << std::endl;
+			exit(1);
+		}
+		int device;
+		HANDLE_ERROR(cudaGetDevice(&device));				//query the device that actually holds the data, not device 0
+		cudaDeviceProp p;
+		HANDLE_ERROR(cudaGetDeviceProperties(&p, device));
+		size_t tmax = p.maxThreadsPerBlock;
+		unsigned int side = (unsigned int)sqrt((double)tmax);	//square blocks: side * side <= tmax
+		dim3 nt(side, side);									//calculate the block dimensions
+		size_t X = sx - kx + 1;								//calculate the size of the output image
+		size_t Y = sy - ky + 1;
+		dim3 nb((unsigned int)(X / nt.x) + 1, (unsigned int)(Y / nt.y) + 1);	//calculate the grid dimensions
+		size_t sm = (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T);		//shared memory bytes required to store block data
+		if (sm > p.sharedMemPerBlock) {
+			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<<nb, nt, sm>>> (out, in, kernel, sx, sy, kx, ky);	//launch the kernel
+		HANDLE_ERROR(cudaGetLastError());					//report an invalid launch configuration immediately
+	}
+//#endif
+	//Convolves a 2D image stored in host memory. The data is copied to the GPU, processed by
+	//	gpu_conv2(), and the result is copied back. Only pixels where the kernel fits completely
+	//	are returned, so the output is smaller than the input by (kx-1, ky-1).
+	//@param out is a pointer to the output image (host memory)
+	//@param in is a pointer to the input image (host memory)
+	//@param kernel is a pointer to the kx x ky convolution kernel (host memory)
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void cpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		const size_t out_w = sx - kx + 1;						//dimensions of the output image
+		const size_t out_h = sy - ky + 1;
+
+		const size_t in_bytes = sx * sy * sizeof(T);			//buffer sizes in bytes
+		const size_t kernel_bytes = kx * ky * sizeof(K);
+		const size_t out_bytes = out_w * out_h * sizeof(T);
+
+		//stage the input image on the device
+		T* dev_in;
+		HANDLE_ERROR(cudaMalloc(&dev_in, in_bytes));
+		HANDLE_ERROR(cudaMemcpy(dev_in, in, in_bytes, cudaMemcpyHostToDevice));
+
+		//stage the kernel on the device
+		K* dev_kernel;
+		HANDLE_ERROR(cudaMalloc(&dev_kernel, kernel_bytes));
+		HANDLE_ERROR(cudaMemcpy(dev_kernel, kernel, kernel_bytes, cudaMemcpyHostToDevice));
+
+		//run the convolution and fetch the result
+		T* dev_out;
+		HANDLE_ERROR(cudaMalloc(&dev_out, out_bytes));
+		gpu_conv2(dev_out, dev_in, dev_kernel, sx, sy, kx, ky);
+		HANDLE_ERROR(cudaMemcpy(out, dev_out, out_bytes, cudaMemcpyDeviceToHost));
+
+		//release the device buffers
+		HANDLE_ERROR(cudaFree(dev_in));
+		HANDLE_ERROR(cudaFree(dev_kernel));
+		HANDLE_ERROR(cudaFree(dev_out));
+
+/* Reference implementation on the CPU (not compiled):
+		K v;												//register stores the integral of the current pixel value
+		size_t yi, xi, kyi, kxi, yi_kyi_sx;
+		for (yi = 0; yi < Y; yi++) {					//for each pixel in the output image
+			for (xi = 0; xi < X; xi++) {
+				v = 0;
+				for (kyi = 0; kyi < ky; kyi++) {		//for each pixel in the kernel
+					yi_kyi_sx = (yi + kyi) * sx;
+					for (kxi = 0; kxi < kx; kxi++) {
+						v += in[yi_kyi_sx + xi + kxi] * kernel[kyi * kx + kxi];
+					}
+				}
+				out[yi * X + xi] = v;						//save the result to the output array
+			}
+		}
+		
+		*/
+	}
+	
+
+}
+
+
+#endif
 \ No newline at end of file
@@ -2,7 +2,7 @@
 #define STIM_CUDA_GAUSS2_H
 #include <stim/image/image.h>
-#include <stim/math/filters/sepconv2.h>
+#include <stim/math/filters/sepconv2.cuh>
 #include <stim/math/constants.h>
 namespace stim {
@@ -18,8 +18,8 @@ namespace stim {
 	///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel
 	template<typename T, typename K>
 	stim::image<T> cpu_gauss2(const stim::image<T>& in, K stdx, K stdy, size_t nstds = 3) {
-		size_t kx = stdx * nstds * 2;					//calculate the kernel sizes
-		size_t ky = stdy * nstds * 2;
+		size_t kx = (size_t)(stdx * nstds * 2) + 1;					//calculate the kernel sizes
+		size_t ky = (size_t)(stdy * nstds * 2) + 1;
 		size_t X = in.width() - kx + 1;					//calculate the size of the output image
 		size_t Y = in.height() - ky + 1;
 		stim::image<T> r(X, Y, in.channels());		//create an output image
@@ -33,10 +33,10 @@ namespace stim {
 		for (size_t yi = 0; yi < ky; yi++)
 			gy[yi] = gauss1d((K)yi, muy, stdy);
-		std::vector< stim::image<T> > IN = in.split();	//split the input image into channels
+		std::vector< stim::image<T> > clist = in.split();	//split the input image into channels
 		std::vector< stim::image<T> > R = r.split();		//split the output image into channels
 		for (size_t c = 0; c < in.channels(); c++)		//for each channel
-			cpu_sepconv2(R[c].data(), IN[c].data(), gx, gy, IN[c].width(), IN[c].height(), kx, ky);
+			cpu_sepconv2(R[c].data(), clist[c].data(), gx, gy, clist[c].width(), clist[c].height(), kx, ky);
 		r.merge(R);										//merge the blurred channels into the final image
 		return r;
+#ifndef STIM_CUDA_GAUSS3_H
+#define STIM_CUDA_GAUSS3_H
+#include <stim/math/filters/sepconv3.cuh>
+#include <stim/math/filters/gauss2.cuh>
+#include <stim/math/constants.h>
+
+namespace stim
+{
+	///Perform a 3D gaussian convolution on an input image.
+	///@param in is a pointer to the input data.
+	///@param dimx is the size of in* in the x direction.
+	///@param dimy is the size of in* in the y direction.
+	///@param dimz is the size of in* in the z direction.
+	///@param stdx is the standard deviation (in pixels) along the x axis.
+	///@param stdy is the standard deviation (in pixels) along the y axis.
+	///@param stdz is the standard deviation (in pixels) along the z axis.
+	///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel.
+	///NOTE(review): the filtered volume is written to a temporary buffer only; the function returns
+	///	void and has no output parameter, so the result never reaches the caller - confirm how it
+	///	is supposed to be returned.
+	template<typename T, typename K>
+	void cpu_gauss3(T* in, K dimx, K dimy, K dimz, K stdx, K stdy, K stdz, size_t nstds = 3)
+	{
+		//Set up the sizes of the gaussian Kernels (always at least one sample, same rule as cpu_gauss2).
+		size_t kx = (size_t)(stdx * nstds * 2) + 1;
+		size_t ky = (size_t)(stdy * nstds * 2) + 1;
+		size_t kz = (size_t)(stdz * nstds * 2) + 1;
+	
+		//Set up the sizes of the new output, which will be kx, ky, kz, smaller than the input.
+		size_t X = dimx - kx +1; 
+		size_t Y = dimy - ky +1; 
+		size_t Z = dimz - kz +1; 
+		T* out = (T*) malloc(X*Y*Z* sizeof(T));
+
+		///Set up the memory that will store the gaussians
+		K* gaussx = (K*)malloc(kx *sizeof(K));
+		K* gaussy = (K*)malloc(ky *sizeof(K));
+		K* gaussz = (K*)malloc(kz *sizeof(K));
+
+		///Set up the midpoints of the gaussians.
+		K midgaussx = (K) kx/ (K)2;
+		K midgaussy = (K) ky/ (K)2;
+		K midgaussz = (K) kz/ (K)2;
+
+		///Evaluate the kernels in each cardinal direction; every kernel is filled over its own length.
+		for(size_t i = 0; i < kx; i++)
+			gaussx[i] = gauss1d((K) i, midgaussx, stdx);
+
+		for(size_t i = 0; i < ky; i++)
+			gaussy[i] = gauss1d((K) i, midgaussy, stdy);
+
+		for(size_t i = 0; i < kz; i++)
+			gaussz[i] = gauss1d((K) i, midgaussz, stdz);
+
+		cpu_sepconv3(out, in, gaussx, gaussy, gaussz, dimx, dimy, dimz, kx, ky, kz);
+
+		///Release the temporary buffers.
+		free(gaussx);
+		free(gaussy);
+		free(gaussz);
+		free(out);
+	}
+}
+#endif
+#ifndef STIM_CUDA_GRADIENT_H
+#define STIM_CUDA_GRADIENT_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+
+namespace stim{
+	namespace cuda{
+
+		///CUDA kernel that computes the gradient of a 2D image, one thread per pixel.
+		///@param out is the output image with two interleaved channels per pixel: out[2*i] is the
+		///	difference along x and out[2*i + 1] is the difference along y
+		///@param in is the single-channel input image, stored row by row
+		///@param x is the width of the image in pixels
+		///@param y is the height of the image in pixels
+		///Central differences are used in the interior and one-sided differences at the borders.
+		///A dimension of size one has no neighbors, so its gradient component is set to zero.
+		template<typename T>
+		__global__ void gradient_2d(T* out, T* in, int x, int y){
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y * blockDim.y + threadIdx.y;
+
+			//return if the pixel is outside of the image
+			if(xi >= x || yi >= y) return;
+
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			//neighbors along x are adjacent in memory
+			if(x == 1)
+				out[i * 2 + 0] = (T)0;							//no neighbor exists in either direction
+			else if(xi == 0)
+				out[i * 2 + 0] = in[i + 1] - in[i];				//forward difference at the left edge
+			else if(xi == x-1)
+				out[i * 2 + 0] = in[i] - in[i - 1];				//backward difference at the right edge
+			else
+				out[i * 2 + 0] = (in[i + 1] - in[i - 1]) / 2;	//central difference
+
+			//neighbors along y are one image row (x elements) apart
+			if(y == 1)
+				out[i * 2 + 1] = (T)0;
+			else if(yi == 0)
+				out[i * 2 + 1] = in[i + x] - in[i];
+			else if(yi == y-1)
+				out[i * 2 + 1] = in[i] - in[i - x];
+			else
+				out[i * 2 + 1] = (in[i + x] - in[i - x]) / 2;
+
+		}
+
+		///Launches gradient_2d for an image that already resides in device memory.
+		///@param gpuGrad is the device pointer that receives the two-channel gradient
+		///@param gpuI is the device pointer to the input image
+		///@param x is the width of the image in pixels
+		///@param y is the height of the image in pixels
+		template<typename T>
+		void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){
+
+			//one-dimensional blocks using as many threads as the device allows
+			const unsigned int block_width = stim::maxThreadsPerBlock();
+			dim3 block_dim(block_width, 1);
+
+			//enough blocks to cover every row: x / block_width + 1 along x, one per image row along y
+			dim3 grid_dim(x / block_dim.x + 1, y);
+
+			//compute the gradient on the device
+			gradient_2d<T> <<< grid_dim, block_dim >>>(gpuGrad, gpuI, x, y);
+
+		}
+
+		///Computes the gradient of a 2D image stored in host memory using the GPU.
+		///@param out is the host pointer that receives the gradient (two values per input pixel)
+		///@param in is the host pointer to the input image
+		///@param x is the width of the image in pixels
+		///@param y is the height of the image in pixels
+		template<typename T>
+		void cpu_gradient_2d(T* out, T* in, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image (size_t so that large images cannot overflow)
+			size_t pixels = (size_t)x * (size_t)y;
+			size_t bytes = pixels * sizeof(T);
+
+			//allocate space on the GPU for the input image
+			T* gpuIn;
+			HANDLE_ERROR(cudaMalloc(&gpuIn, bytes));
+
+			//copy the image data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuIn, in, bytes, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the output gradient image
+			T* gpuOut;
+			HANDLE_ERROR(cudaMalloc(&gpuOut, bytes * 2));		//the output image will have two channels (x, y)
+
+			//call the GPU version of this function
+			gpu_gradient_2d(gpuOut, gpuIn, x, y);	
+
+			//copy the results to the CPU
+			HANDLE_ERROR(cudaMemcpy(out, gpuOut, bytes * 2, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			HANDLE_ERROR(cudaFree(gpuOut));
+			HANDLE_ERROR(cudaFree(gpuIn));
+		}
+
+	}
+}
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_RESAMPLE2_H
+#define STIM_CUDA_RESAMPLE2_H
+
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+
+
+///Downsamples a 2D image by a factor f using a box filter. Any pixels outside of the valid region
+///		(for example if X%f != 0) are chopped.
+///@tparam T is the pixel type of the input and output images
+///@tparam K is the type used to accumulate and average each box (cannot be deduced, specify it explicitly)
+///@param out is a pointer to the output image, stored row by row with size (sx / f) x (sy / f)
+///@param in is a pointer to the input image, stored row by row
+///@param f is the downsampling factor (nothing is written if f is zero)
+///@param sx is the size of the input image along X
+///@param sy is the size of the input image along Y
+template<typename T, typename K>
+void cpu_resample2(T* out, T* in, size_t f, size_t sx, size_t sy) {
+	if (f == 0) return;								//a zero factor describes no valid box
+
+	size_t X = sx / f;								//size of the output image (integer division chops partial boxes)
+	size_t Y = sy / f;
+	K area = (K)(f * f);							//number of input pixels averaged into one output pixel
+
+	for (size_t yo = 0; yo < Y; yo++) {				//for each pixel in the output image
+		for (size_t xo = 0; xo < X; xo++) {
+			K sum = 0;
+			for (size_t yb = 0; yb < f; yb++) {		//for each pixel in the f x f box
+				size_t row = (yo * f + yb) * sx + xo * f;
+				for (size_t xb = 0; xb < f; xb++)
+					sum += (K)in[row + xb];
+			}
+			out[yo * X + xo] = (T)(sum / area);		//store the mean of the box
+		}
+	}
+}
+
+
+#endif
 \ No newline at end of file
 #ifndef STIM_CUDA_SEPCONV2_H
 #define STIM_CUDA_SEPCONV2_H
-#include <stim/math/filters/conv2.h>
+#include <stim/math/filters/conv2.cuh>
 #ifdef __CUDACC__
 #include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
@@ -20,12 +20,12 @@ namespace stim {
 		cudaDeviceProp p;
 		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
 		size_t tmax = p.maxThreadsPerBlock;
-		dim3 nt(sqrt(tmax), sqrt(tmax));						//calculate the block dimensions
+		dim3 nt((unsigned int)sqrt(tmax), (unsigned int)sqrt(tmax));						//calculate the block dimensions
 		size_t X = sx - kx + 1;									//calculate the x size of the output image
 		T* temp;												//declare a temporary variable to store the intermediate image
 		HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T)));	//allocate memory for the intermediate image
-		dim3 nb(X / nt.x + 1, sy / nt.y + 1);							//calculate the grid dimensions
+		dim3 nb((unsigned int)(X / nt.x) + 1, (unsigned int)(sy / nt.y) + 1);							//calculate the grid dimensions
 		size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T);					//shared memory bytes required to store block data
 		if (sm > p.sharedMemPerBlock) {
 			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
@@ -34,7 +34,7 @@ namespace stim {
 		kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1);	//launch the kernel to compute the intermediate image
 		size_t Y = sy - ky + 1;									//calculate the y size of the output image
-		nb.y = Y / nt.y + 1;									//update the grid dimensions to reflect the Y-axis size of the output image
+		nb.y = (unsigned int)(Y / nt.y) + 1;									//update the grid dimensions to reflect the Y-axis size of the output image
 		sm = nt.x * (nt.y + ky - 1) * sizeof(T);				//calculate the amount of shared memory needed for the second pass
 		if (sm > p.sharedMemPerBlock) {
 			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
@@ -86,4 +86,4 @@ namespace stim {
 	}
 }
-#endif
 \ No newline at end of file
+#endif
+#ifndef STIM_CUDA_SEPCONV3_H
+#define STIM_CUDA_SEPCONV3_H
+
+#include <stim/math/filters/conv2.cuh>
+#include <stim/math/filters/sepconv2.cuh>
+#ifdef __CUDACC__
+	#include <stim/cuda/cudatools.h>
+	#include <stim/cuda/sharedmem.cuh>
+#endif
+
+namespace stim
+{
+#ifdef __CUDACC__
+	/// Performs a separable convolution of a 3D image that already resides on the GPU.
+	///	Every Z slice is first convolved along X and Y using gpu_sepconv2(); the stack of filtered
+	///	slices is then treated as an (X*Y) x dimz image and convolved along Z using kernel_conv2.
+	/// @param out is a device pointer to the output image
+	/// @param in is a device pointer to the input image (dimx * dimy * dimz values)
+	/// @param k0 is a device pointer to the x-axis filter (kx values)
+	/// @param k1 is a device pointer to the y-axis filter (ky values)
+	/// @param k2 is a device pointer to the z-axis filter (kz values)
+	/// @param dimx, dimy, dimz are the sizes of the input image along X, Y and Z
+	/// @param kx, ky, kz are the sizes of the filters along X, Y and Z
+	template<typename T, typename K>
+	void gpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
+{
+
+	size_t X = dimx - kx + 1;									//x size of each filtered slice
+	size_t Y = dimy - ky + 1;									//y size of each filtered slice
+
+	T* temp_out;												//intermediate volume holding the XY-filtered slices
+	HANDLE_ERROR(cudaMalloc(&temp_out, X*Y*dimz*sizeof(T)));
+
+	for(size_t i = 0; i < dimz; i++)
+	{
+		//slice i starts exactly i whole slices into each buffer
+		size_t idx_IN  = (dimx*dimy)*i;
+		size_t idx_OUT = (X*Y)*i;
+		gpu_sepconv2(&temp_out[idx_OUT], &in[idx_IN], k0, k1, dimx, dimy, kx, ky);
+	}
+
+	cudaDeviceProp p;
+	HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
+	size_t tmax = p.maxThreadsPerBlock;
+
+	dim3 numThreads((unsigned int)sqrt(tmax), (unsigned int)sqrt(tmax));
+	dim3 numBlocks((unsigned int)(X*Y/numThreads.x) + 1, (unsigned int)(dimz/numThreads.y) + 1);
+	size_t sharedMem = (numThreads.x + kz - 1) * numThreads.y * sizeof(T);
+	if(sharedMem > p.sharedMemPerBlock)
+	{
+		std::cout << "Error in stim::gpu_sepconv3() - insufficient shared memory for this kernel." << std::endl;
+		exit(1);
+	}
+	//convolve along Z: each XY slice is one "row" of an (X*Y) x dimz image
+	kernel_conv2 <<< numBlocks, numThreads, sharedMem >>> (out, temp_out, k2, X*Y, dimz, 1, kz);
+	HANDLE_ERROR(cudaFree(temp_out));
+
+
+}
+#endif
+
+	//Performs a separable convolution of a 3D image. Only valid pixels based on the kernel are returned.
+	//      As a result, the output image will be smaller than the input image by (kx-1, ky-1, kz-1)
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param k0 is the x-axis convolution filter
+	//@param k1 is the y-axis convolution filter
+	//@param k2 is the z-axis convolution filter
+	//@param dimx is the size of the input image along X
+	//@param dimy is the size of the input image along Y
+	//@param dimz is the size of the input image along Z
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	//@param kz is the size of the kernel along Z
+
+	template <typename T, typename K>
+	void cpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
+	{
+		//Set up the sizes of the new output, which will be kx, ky, kz, smaller than the input.
+		size_t X = dimx - kx + 1; 
+		size_t Y = dimy - ky + 1; 
+		size_t Z = dimz - kz + 1;
+
+#ifdef __CUDACC__
+	///Set up all of the memory on the GPU
+	T* gpu_in;
+	HANDLE_ERROR(cudaMalloc(&gpu_in, dimx*dimy*dimz*sizeof(T)));
+	HANDLE_ERROR(cudaMemcpy(gpu_in, in, dimx*dimy*dimz*sizeof(T),cudaMemcpyHostToDevice));
+	K* gpu_kx;
+	HANDLE_ERROR(cudaMalloc(&gpu_kx, kx*sizeof(K)));
+	HANDLE_ERROR(cudaMemcpy(gpu_kx, k0, kx*sizeof(K),cudaMemcpyHostToDevice));
+	K* gpu_ky;
+	HANDLE_ERROR(cudaMalloc(&gpu_ky, ky*sizeof(K)));
+	HANDLE_ERROR(cudaMemcpy(gpu_ky, k1, ky*sizeof(K),cudaMemcpyHostToDevice));
+	K* gpu_kz;
+	HANDLE_ERROR(cudaMalloc(&gpu_kz, kz*sizeof(K)));
+	HANDLE_ERROR(cudaMemcpy(gpu_kz, k2, kz*sizeof(K),cudaMemcpyHostToDevice));
+	T* gpu_out;
+	HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * Z*sizeof(T)));
+
+	///run the kernel
+	gpu_sepconv3(gpu_out, gpu_in, gpu_kx, gpu_ky, gpu_kz, dimx, dimy, dimz, kx, ky, kz);
+
+	///Copy the output
+	HANDLE_ERROR(cudaMemcpy(out, gpu_out, X*Y*Z*sizeof(T), cudaMemcpyDeviceToHost));
+
+	///Free all the memory used.
+	HANDLE_ERROR(cudaFree(gpu_in));
+	HANDLE_ERROR(cudaFree(gpu_kx));
+	HANDLE_ERROR(cudaFree(gpu_ky));
+	HANDLE_ERROR(cudaFree(gpu_kz));
+	HANDLE_ERROR(cudaFree(gpu_out));
+#else
+	(void)Z;												//Z is only needed by the GPU path
+	T* temp = (T*) malloc(X * dimy * sizeof(T));			//one slice after the X-axis pass
+	T* temp3 = (T*) malloc(X * Y * dimz * sizeof(T));		//all slices after the X- and Y-axis passes
+	for(size_t i = 0; i < dimz; i++)
+	{
+		//slice i starts exactly i whole slices into each buffer
+		size_t idx_IN  = (dimx*dimy)*i;
+		size_t idx_OUT = (X*Y)*i;
+		cpu_conv2(temp, &in[idx_IN], k0, dimx, dimy, kx, 1);
+		cpu_conv2(&temp3[idx_OUT], temp, k1, X, dimy, 1, ky);
+	}
+	//convolve along Z, treating the stack of filtered slices as an (X*Y) x dimz image
+	cpu_conv2(out, temp3, k2, X*Y, dimz, 1, kz);
+	free(temp);
+	free(temp3);
+
+#endif
+	}
+}
+
+
+#endif
-#ifndef RTS_MATRIX_H
-#define RTS_MATRIX_H
+#ifndef STIM_MATRIX_H
+#define STIM_MATRIX_H
 //#include "rts/vector.h"
 #include <string.h>
 #include <iostream>
+#include <fstream>
 #include <stim/math/vector.h>
 #include <stim/math/vec3.h>
-#include <stim/cuda/cudatools/callable.h>
+//#include <stim/cuda/cudatools/callable.h>
 namespace stim{
-template <class T, int N>
-struct matrix
-{
+	enum mat4Format {
+		mat4_float64,
+		mat4_float32,
+		mat4_int32,
+		mat4_int16,
+		mat4_uint16,
+		mat4_uint8,
+		mat4_float						//floating point type, determined automatically
+	};
+
+	/// Returns the number of bytes occupied by a single element stored in the given mat4 format.
+	///	Any value without a fixed element size (including mat4_float) yields 0.
+	static size_t mat4Format_size(mat4Format f){
+		if (f == mat4_float64)
+			return 8;
+		if (f == mat4_float32 || f == mat4_int32)
+			return 4;
+		if (f == mat4_int16 || f == mat4_uint16)
+			return 2;
+		if (f == mat4_uint8)
+			return 1;
+		return 0;
+	}
+
+	//class encapsulates a mat4 file, and can be used to write multiple matrices to a single mat4 file
+	class mat4file {
+		std::ofstream matfile;								//stream used to write the file
+
+	public:
+		/// Constructor opens a mat4 file for writing (binary mode); use is_open() to check for success
+		mat4file(std::string filename) {
+			matfile.open(filename, std::ios::binary);
+		}
+
+		/// Returns true if the underlying file stream is open
+		bool is_open() {
+			return matfile.is_open();
+		}
+
+		/// Closes the underlying file stream
+		void close() {
+			matfile.close();
+		}
+
+		/// Appends one matrix (header, name and raw data) to the file.
+		/// @param data is a pointer to the raw matrix data (sx * sy elements of the given format)
+		/// @param varname is the name stored for the matrix
+		/// @param sx is written as the number of rows
+		/// @param sy is written as the number of columns
+		/// @param format is the element type; it must be a concrete type (not mat4_float)
+		/// @return true if everything was written successfully, false otherwise
+		bool writemat(char* data, std::string varname, size_t sx, size_t sy, mat4Format format) {
+			//data format: https://maxwell.ict.griffith.edu.au/spl/matlab-page/matfile_format.pdf (page 32)
+
+			size_t element_bytes = mat4Format_size(format);	//size of a single element
+			if (element_bytes == 0) return false;			//unresolved format: a header could not describe the data
+
+			int MOPT = 0;									//initialize the MOPT type value to zero
+			int m = 0;										//little endian
+			int o = 0;										//reserved, always 0
+			int p = format;
+			int t = 0;
+			MOPT = m * 1000 + o * 100 + p * 10 + t;			//calculate the type value
+			int mrows = (int)sx;
+			int ncols = (int)sy;
+			int imagf = 0;									//assume real (for now)
+			varname.push_back('\0');									//add a null to the string
+			int namlen = (int)varname.size();						//calculate the name size
+
+			size_t bytes = sx * sy * element_bytes;
+			matfile.write((char*)&MOPT, 4);
+			matfile.write((char*)&mrows, 4);
+			matfile.write((char*)&ncols, 4);
+			matfile.write((char*)&imagf, 4);
+			matfile.write((char*)&namlen, 4);
+			matfile.write((char*)&varname[0], namlen);
+			matfile.write((char*)data, bytes);				//write the matrix data
+			return matfile.good();							//false if the file is not open or any write failed
+		}
+	};
+
+	/// Writes a single matrix to a newly opened mat4 file. Nothing is written if the file cannot be opened.
+	static void save_mat4(char* data, std::string filename, std::string varname, size_t sx, size_t sy, mat4Format format){
+		mat4file outfile(filename);									//open the destination file
+		if (!outfile.is_open())										//bail out if the file isn't available
+			return;
+		outfile.writemat(data, varname, sx, sy, format);			//store the matrix
+		outfile.close();											//release the file
+	}
+
+template <class T>
+class matrix {
 	//the matrix will be stored in column-major order (compatible with OpenGL)
-	T M[N*N];
+	T* M;								//pointer to the matrix data
+	size_t R;							//number of rows
+	size_t C;							//number of colums
-	CUDA_CALLABLE matrix()
-	{
-		for(int r=0; r<N; r++)
-			for(int c=0; c<N; c++)
-				if(r == c)
-					(*this)(r, c) = 1;
-				else
-					(*this)(r, c) = 0;
+	size_t bytes() {
+		return R * C * sizeof(T);		//return the number of bytes of matrix data
 	}
+	/*void init(size_t rows, size_t cols){
+		R = rows;
+		C = cols;
+		if (R == 0 || C == 0) M = NULL;
+		else
+			M = (T*)malloc(R * C * sizeof(T));	//allocate space for the matrix
+	}*/
-	CUDA_CALLABLE matrix(T rhs[N*N])
-	{
-		memcpy(M,rhs, sizeof(T)*N*N);
+	T get(const size_t row, const size_t col) const {
+		if (row >= R || col >= C) {
+			std::cout << "ERROR: row or column out of range." << std::endl;
+			exit(1);
+		}
+		return M[col * R + row];
 	}
-	CUDA_CALLABLE matrix<T,N> set(T rhs[N*N])
-	{
-		memcpy(M, rhs, sizeof(T)*N*N);
-		return *this;
+	T& at(size_t row, size_t col){
+		if (row >= R || col >= C) {
+			std::cout << "ERROR: row or column out of range." << std::endl;
+			exit(1);
+		}
+		return M[col * R + row];
+	}
+
+public:
+	matrix() {
+		R = 0;
+		C = 0;
+		M = NULL;
+	}
+
+	matrix(size_t rows, size_t cols) {
+		R = rows;
+		C = cols;
+		M = NULL;
+		if (R * C > 0) 
+			M = (T*) malloc(R * C * sizeof(T));
+	}
+
+	/// Creates a rows x cols matrix and copies rows * cols values from data (column-major order).
+	///	An empty matrix (rows * cols == 0) allocates nothing and does not read from data.
+	matrix(size_t rows, size_t cols, const T* data) {
+		R = rows;
+		C = cols;
+		M = NULL;
+		if (R * C > 0) {
+			M = (T*)malloc(R * C * sizeof(T));
+			memcpy(M, data, R * C * sizeof(T));		//only copy when there is storage to copy into
+		}
+	}
+
+	/// Copy constructor: allocates new storage and duplicates the contents of cpy.
+	///	Copying an empty matrix allocates nothing.
+	matrix(const matrix<T>& cpy){
+		R = cpy.R;
+		C = cpy.C;
+		M = NULL;
+		if (R * C > 0) {
+			M = (T*)malloc(R * C * sizeof(T));
+			memcpy(M, cpy.M, R * C * sizeof(T));	//only copy when there is storage to copy into
+		}
+	}
+
+	~matrix() {
+		if(M) free(M);
+		M = NULL;
+		R = C = 0;
+	}
+
+	size_t rows() const {
+		return R;
 	}
-	CUDA_CALLABLE T& operator()(int row, int col)
-	{
-		return M[col * N + row];
+	size_t cols() const {
+		return C;
 	}
-	CUDA_CALLABLE matrix<T, N> operator=(T rhs)
-	{
-		int Nsq = N*N;
-		for(int i=0; i<Nsq; i++)
-			M[i] = rhs;
+	T& operator()(size_t row, size_t col) {
+		return at(row, col);
+	}
+
+	matrix<T>& operator=(const T rhs) {
+		//init(R, C);
+		size_t N = R * C;
+		for(size_t n=0; n<N; n++)
+			M[n] = rhs;
 		return *this;
 	}
-	template<typename Y>
-	vec<Y> operator*(vec<Y> rhs){
-		unsigned int M = rhs.size();
+	/// Copy assignment: replaces the contents of this matrix with a copy of rhs.
+	///	Storage is managed with malloc()/free() to match the constructors and the destructor.
+	matrix<T>& operator=(const matrix<T>& rhs){
+		if (this != &rhs) {											//if the matrix isn't self-assigned
+			size_t N = rhs.R * rhs.C;								//number of elements to copy
+			T* new_matrix = NULL;
+			if (N > 0) {
+				new_matrix = (T*)malloc(N * sizeof(T));				//allocate new resources
+				memcpy(new_matrix, rhs.M, N * sizeof(T));			//copy the matrix
+			}
+
+			if (M) free(M);											//release the previous array
+			M = new_matrix;
+			R = rhs.R;
+			C = rhs.C;
+		}
+		return *this;
+	}
+	
+	//element-wise operations
+	matrix<T> operator+(const T rhs) const {
+		matrix<T> result(R, C);					//create a result matrix
+		size_t N = R * C;
+
+		for(int i=0; i<N; i++)
+			result.M[i] = M[i] + rhs;			//calculate the operation and assign to result
+
+		return result;
+	}
+
+	matrix<T> operator+(const matrix<T> rhs) const {
+		if (R != rhs.R || C != rhs.C) {
+			std::cout << "ERROR: addition is only defined for matrices that are the same size." << std::endl;
+			exit(1);
+		}
+		matrix<T> result(R, C);					//create a result matrix
+		size_t N = R * C;
+
+		for (int i = 0; i < N; i++)
+			result.M[i] = M[i] + rhs.M[i];			//calculate the operation and assign to result
+
+		return result;
+	}
+
+	matrix<T> operator-(const T rhs) const {
+		return operator+(-rhs);					//add the negative of rhs
+	}
+
+	matrix<T> operator-(const matrix<T> rhs) const {
+		return operator+(-rhs);
+	}
+
+	matrix<T> operator-() const {
+		matrix<T> result(R, C);					//create a result matrix
+		size_t N = R * C;
+
+		for (int i = 0; i < N; i++)
+			result.M[i] = -M[i];			//calculate the operation and assign to result
+
+		return result;
+	}
+
+	matrix<T> operator*(const T rhs) const {
+		matrix<T> result(R, C);					//create a result matrix
+		size_t N = R * C;
+
+		for(int i=0; i<N; i++)
+			result.M[i] = M[i] * rhs;			//calculate the operation and assign to result
+
+		return result;
+	}
+
+	matrix<T> operator/(const T rhs) const {
+		matrix<T> result(R, C);					//create a result matrix
+		size_t N = R * C;
+
+		for(int i=0; i<N; i++)
+			result.M[i] = M[i] / rhs;			//calculate the operation and assign to result
+
+		return result;
+	}
+
+	//matrix multiplication
+	matrix<T> operator*(const matrix<T> rhs) const {
+		if(C != rhs.R){
+			std::cout<<"ERROR: matrix multiplication is undefined for matrices of size ";
+			std::cout<<"[ "<<R<<" x "<<C<<" ] and [ "<<rhs.R<<" x "<<rhs.C<<"]"<<std::endl;
+			exit(1);
+		}
-		vec<Y> result;
-		result.resize(M);
+		matrix<T> result(R, rhs.C);				//create the output matrix
+		T inner;								//stores the running inner product
+		size_t c, r, i;
+		for(c = 0; c < rhs.C; c++){
+			for(r = 0; r < R; r++){
+				inner = (T)0;
+				for(i = 0; i < C; i++){
+					inner += get(r, i) * rhs.get(i, c);
+				}
+				result.M[c * R + r] = inner;
+			}
+		}
+		return result;
+	}
-		for(int r=0; r<M; r++)
-			for(int c=0; c<M; c++)
-				result[r] += (*this)(r, c) * rhs[c];
+	//returns a pointer to the raw matrix data (in column major format)
+	T* data(){
+		return M;
+	}
+	//return a transposed matrix
+	matrix<T> transpose() const {
+		matrix<T> result(C, R);
+		size_t c, r;
+		for(c = 0; c < C; c++){
+			for(r = 0; r < R; r++){
+				result.M[r * C + c] = M[c * R + r];
+			}
+		}
 		return result;
 	}
-	template<typename Y>
-	CUDA_CALLABLE vec3<Y> operator*(vec3<Y> rhs){
-		vec3<Y> result = 0;
-		for(int r=0; r<3; r++)
-			for(int c=0; c<3; c++)
-				result[r] += (*this)(r, c) * rhs[c];
+	// Reshapes the matrix in place
+	void reshape(size_t rows, size_t cols) {
+		R = rows;
+		C = cols;
+	}
+
+	///Calculate and return the determinant of the matrix
+	///	The determinant is computed on a copy of the matrix using fraction-free (Bareiss) elimination
+	///	with row pivoting, so this matrix is left unchanged. The program exits with an error
+	///	if the matrix is not square. An empty matrix yields 0.
+	T det() const {
+		if (R != C) {
+			std::cout << "ERROR: a determinant can only be calculated for a square matrix." << std::endl;
+			exit(1);
+		}
+		if (R == 0) return (T)0;			//nothing to compute for an empty matrix
+		if (R == 1) return M[0];			//if the matrix only contains one value, return it
+
+		const size_t n = R;
+		matrix<T> work(*this);				//eliminate on a copy
+		T* a = work.M;						//column-major: element (row, col) is a[col * n + row]
+		T prev = (T)1;						//pivot from the previous elimination step
+		bool negate = false;				//tracks the sign change caused by row swaps
+
+		for (size_t k = 0; k + 1 < n; k++) {
+			//select the entry with the largest magnitude in column k (rows k..n-1) as the pivot
+			size_t p = k;
+			for (size_t r = k + 1; r < n; r++) {
+				T cand = a[k * n + r];
+				T best = a[k * n + p];
+				if (cand < (T)0) cand = -cand;
+				if (best < (T)0) best = -best;
+				if (cand > best) p = r;
+			}
+			if (a[k * n + p] == (T)0) return (T)0;		//the whole column is zero: the matrix is singular
+
+			if (p != k) {								//swap rows p and k; each swap flips the sign
+				for (size_t c = 0; c < n; c++) {
+					T tmp = a[c * n + k];
+					a[c * n + k] = a[c * n + p];
+					a[c * n + p] = tmp;
+				}
+				negate = !negate;
+			}
+
+			T pivot = a[k * n + k];
+			for (size_t c = k + 1; c < n; c++)
+				for (size_t r = k + 1; r < n; r++)
+					a[c * n + r] = (a[c * n + r] * pivot - a[k * n + r] * a[c * n + k]) / prev;
+			prev = pivot;
+		}
+
+		T d = a[(n - 1) * n + (n - 1)];		//the last diagonal entry holds the determinant (up to sign)
+		if (negate) d = -d;
+		return d;
+	}
+
+	/// Sum all elements in the matrix
+	T sum() const {
+		size_t N = R * C;								//calculate the number of elements in the matrix
+		T s = (T)0;										//allocate a register to store the sum
+		for (size_t n = 0; n < N; n++) s += M[n];		//perform the summation
+		return s;
+	}
+
+	/// Sort rows of the matrix by the specified indices
+	///	Row r of the result is row idx[r] of this matrix, so idx must provide one valid row index per row.
+	///	The result has the same dimensions as this matrix.
+	matrix<T> sort_rows(size_t* idx) const {
+		matrix<T> result(R, C);					//create the output matrix (same shape as the input)
+		size_t r, c;
+		for (c = 0; c < C; c++) {								//for each column
+			for (r = 0; r < R; r++) {							//for each row element
+				result.M[c * R + r] = M[c * R + idx[r]];		//copy each element of the row into its new position
+			}
+		}
+		return result;
+	}
+
+	/// Sort columns of the matrix by the specified indices
+	///	Column c of the result is column idx[c] of this matrix, so idx must provide one valid column index
+	///	per column. The result has the same dimensions as this matrix. The data_type parameter is not used.
+	matrix<T> sort_cols(size_t* idx, size_t data_type = mat4_float) const {
+		matrix<T> result(R, C);												//create the output matrix (same shape as the input)
+		size_t c;
+		for (c = 0; c < C; c++) {											//for each column
+			memcpy(&result.M[c * R], &M[idx[c] * R], sizeof(T) * R);		//copy the entire column from this matrix to the appropriate location
+		}
 		return result;
 	}
-	std::string toStr()
-	{
+	/// Return the column specified by index i
+	///	The result is an R x 1 matrix. The program exits with an error if i is not a valid column index.
+	matrix<T> col(size_t i) {
+		if (i >= C) {
+			std::cout << "ERROR: row or column out of range." << std::endl;
+			exit(1);
+		}
+		matrix<T> c(R, 1);										//create a single column matrix
+		if (R > 0)
+			memcpy(c.data(), &data()[R*i], R * sizeof(T));		//copy the column (a column holds R values)
+		return c;
+	}
+
+	/// Return the row specified by index i
+	matrix<T> row(size_t i) {
+		matrix<T> r(1, C);										//create a single row matrix
+		for (size_t c = 0; c < C; c++)
+			r(0, c) = at(i, c);
+		return r;
+	}
+
+	std::string toStr() const {
 		std::stringstream ss;
-		for(int r = 0; r < N; r++)
-		{
+		for(int r = 0; r < R; r++) {
 			ss << "| ";
-			for(int c=0; c<N; c++)
-			{
-				ss << (*this)(r, c) << " ";
+			for(int c=0; c<C; c++) {
+				ss << M[c * R + r] << " ";
 			}
 			ss << "|" << std::endl;
 		}
-
 		return ss.str();
 	}
+
+	void csv(std::ostream& out) const {
+		//std::stringstream csvss;
+		for (size_t i = 0; i < R; i++) {
+			out << std::fixed << M[i];
+			for (size_t j = 1; j < C; j++)
+				out << ", " << std::fixed << M[j * R + i];
+			out << std::endl;
+		}
+		//return csvss.str();
+	}
+
+	std::string csv() const {
+		std::stringstream csvss;
+		int digits = std::numeric_limits<double>::max_digits10;
+		csvss.precision(digits);
+		csv(csvss);
+		return csvss.str();
+	}
+
+
+
+	//save the data as a CSV file
+	void csv(std::string filename) const {
+		std::ofstream basisfile(filename.c_str());
+		basisfile << csv();
+		basisfile.close();
+	}
+
+	static matrix<T> I(size_t N) {
+		matrix<T> result(N, N);							//create the identity matrix
+		memset(result.M, 0, N * N * sizeof(T));			//set the entire matrix to zero
+		for (size_t n = 0; n < N; n++) {
+			result(n, n) = (T)1;						//set the diagonal component to 1
+		}
+		return result;
+	}
+
+	//loads a matrix from a stream in CSV format
+	void csv(std::istream& in) {
+		size_t c, r;
+		T v;
+		for (r = 0; r < R; r++) {
+			for (c = 0; c < C; c++) {
+				in >> v;
+				if (in.peek() == ',') in.seekg(1, std::ios::cur);
+				at(r, c) = v;;
+			}
+		}
+	}
+
+	void raw(std::string filename) {
+		std::ofstream out(filename, std::ios::binary);
+		if (out) {
+			out.write((char*)data(), rows() * cols() * sizeof(T));
+			out.close();
+		}
+	}
+
+	void mat4(stim::mat4file& file, std::string name = std::string("unknown"), mat4Format format = mat4_float) {
+		//make sure the matrix name is valid (only numbers and letters, with a letter at the beginning
+		for (size_t c = 0; c < name.size(); c++) {
+			if (name[c] < 48 ||												//if the character isn't a number or letter, replace it with '_'
+				(name[c] > 57 && name[c] < 65) ||
+				(name[c] > 90 && name[c] < 97) ||
+				(name[c] > 122)) {
+				name[c] = '_';
+			}
+		}
+		if (name[0] < 65 ||
+			(name[0] > 91 && name[0] < 97) ||
+			name[0] > 122) {
+			name = std::string("m") + name;
+		}
+		if (format == mat4_float) {
+			if (sizeof(T) == 4) format = mat4_float32;
+			else if (sizeof(T) == 8) format = mat4_float64;
+			else {
+				std::cout << "stim::matrix ERROR - incorrect format specified" << std::endl;
+				exit(1);
+			}
+		}
+		//the name is now valid
+
+		//if the size of the array is more than 100,000,000 elements, the matrix isn't supported
+		if (rows() * cols() > 100000000) {											//break the matrix up into multiple parts
+			//mat4file out(filename);													//create a mat4 object to write the matrix
+			if (file.is_open()) {
+				if (rows() < 100000000) {												//if the size of the row is less than 100,000,000, split the matrix up by columns
+					size_t ncols = 100000000 / rows();									//calculate the number of columns that can fit in one matrix
+					size_t nmat = (size_t)std::ceil((double)cols() / (double)ncols);			//calculate the number of matrices required
+					for (size_t m = 0; m < nmat; m++) {									//for each matrix
+						std::stringstream ss;
+						ss << name << "_part_" << m + 1;
+						if (m == nmat - 1)
+							file.writemat((char*)(data() + m * ncols * rows()), ss.str(), rows(), cols() - m * ncols, format);
+						else
+							file.writemat((char*)(data() + m * ncols * rows()), ss.str(), rows(), ncols, format);
+					}
+				}
+			}
+		}
+		//call the mat4 subroutine
+		else
+			//stim::save_mat4((char*)M, filename, name, rows(), cols(), format);
+			file.writemat((char*)data(), name, rows(), cols(), format);
+	}
+
+	// saves the matrix as a Level-4 MATLAB file
+	void mat4(std::string filename, std::string name = std::string("unknown"), mat4Format format = mat4_float) {
+		stim::mat4file matfile(filename);
+
+		if (matfile.is_open()) {
+			mat4(matfile, name, format);
+			matfile.close();
+		}
+	}
 };
 }	//end namespace rts
-template <typename T, int N>
-std::ostream& operator<<(std::ostream& os, stim::matrix<T, N> M)
-{
-    os<<M.toStr();
-    return os;
-}
-
-//#if __GNUC__ > 3 && __GNUC_MINOR__ > 7
-//template<class T, int N> using rtsMatrix = rts::matrix<T, N>;
-//#endif
 #endif
+#ifndef STIM_MATRIX_SQ_H
+#define STIM_MATRIX_SQ_H
+
+//#include "rts/vector.h"
+#include <string.h>
+#include <iostream>
+#include <stim/math/vector.h>
+#include <stim/math/vec3.h>
+#include <stim/cuda/cudatools/callable.h>
+
+namespace stim{
+
+/// Fixed-size N x N square matrix whose values are stored in column-major order.
+template <class T, int N>
+struct matrix_sq
+{
+	//the matrix will be stored in column-major order (compatible with OpenGL)
+	T M[N*N];
+
+	/// Default constructor initializes the matrix to the identity
+	CUDA_CALLABLE matrix_sq()
+	{
+		for(int r=0; r<N; r++)
+			for(int c=0; c<N; c++)
+				if(r == c)
+					(*this)(r, c) = 1;
+				else
+					(*this)(r, c) = 0;
+	}
+
+	/// Constructor copies N*N values from rhs (column-major order)
+	CUDA_CALLABLE matrix_sq(T rhs[N*N])
+	{
+		memcpy(M,rhs, sizeof(T)*N*N);
+	}
+
+	/// Overwrites the matrix with N*N values from rhs (column-major order) and returns a copy of the result
+	CUDA_CALLABLE matrix_sq<T,N> set(T rhs[N*N])
+	{
+		memcpy(M, rhs, sizeof(T)*N*N);
+		return *this;
+	}
+
+	//create a symmetric matrix given the rhs values, given in column-major order
+	//	rhs holds the (N*N+N)/2 values of the lower triangle; each off-diagonal value is mirrored
+	CUDA_CALLABLE void setsym(T rhs[(N*N+N)/2]){
+		const size_t L = (N*N+N)/2;		//store the number of values
+
+		size_t r, c;
+		r = c = 0;
+		for(size_t i = 0; i < L; i++){ 				//for each value
+			if(r == c) M[c * N + r] = rhs[i];
+			else M[c*N + r] = M[r * N + c] = rhs[i];
+			r++;
+			if(r == N) r = ++c;						//end of the column: the next column starts on its diagonal
+		}
+	}
+
+	/// Returns a reference to the element at (row, col); no bounds checking is performed
+	CUDA_CALLABLE T& operator()(int row, int col)
+	{
+		return M[col * N + row];
+	}
+
+	/// Assigns the value rhs to every element of the matrix
+	CUDA_CALLABLE matrix_sq<T, N> operator=(T rhs)
+	{
+		int Nsq = N*N;
+		for(int i=0; i<Nsq; i++)
+			M[i] = rhs;
+
+		return *this;
+	}
+	
+	// M - rhs*I
+	//	returns a new matrix with rhs subtracted from every diagonal element; this matrix is not modified
+	CUDA_CALLABLE matrix_sq<T, N> operator-(T rhs)
+	{
+		matrix_sq<T, N> result = *this;			//work on a copy so the left operand is left untouched
+		for(int i=0; i<N; i++)
+			result.M[i*N+i] -= rhs;
+		return result;
+	}
+
+	/// Multiplies the matrix by a vector; the size of the result matches the size of rhs
+	template<typename Y>
+	vec<Y> operator*(vec<Y> rhs){
+		unsigned int n = rhs.size();			//number of vector elements
+
+		vec<Y> result;
+		result.resize(n);
+
+		for(unsigned int r=0; r<n; r++)
+			for(unsigned int c=0; c<n; c++)
+				result[r] += (*this)(r, c) * rhs[c];
+
+		return result;
+	}
+
+	/// Multiplies the upper-left 3x3 portion of the matrix by a 3-element vector
+	template<typename Y>
+	CUDA_CALLABLE vec3<Y> operator*(vec3<Y> rhs){
+		vec3<Y> result = 0;
+		for(int r=0; r<3; r++)
+			for(int c=0; c<3; c++)
+				result[r] += (*this)(r, c) * rhs[c];
+
+		return result;
+	}
+
+	/// Returns the matrix as a string, one row per line
+	std::string toStr()
+	{
+		std::stringstream ss;
+
+		for(int r = 0; r < N; r++)
+		{
+			ss << "| ";
+			for(int c=0; c<N; c++)
+			{
+				ss << (*this)(r, c) << " ";
+			}
+			ss << "|" << std::endl;
+		}
+
+		return ss.str();
+	}
+
+	/// Returns an N x N identity matrix
+	static matrix_sq<T, N> identity() {
+		matrix_sq<T, N> I;
+		I = 0;
+		for (size_t i = 0; i < N; i++)
+			I.M[i * N + i] = 1;
+		return I;
+	}
+};
+
+}	//end namespace stim
+
+template <typename T, int N>
+std::ostream& operator<<(std::ostream& os, stim::matrix_sq<T, N> M)
+{
+    os<<M.toStr();
+    return os;
+}
+
+//#if __GNUC__ > 3 && __GNUC_MINOR__ > 7
+//template<class T, int N> using rtsMatrix = rts::matrix<T, N>;
+//#endif
+
+#endif
+#ifndef STIM_MATRIX_SYM_H
+#define STIM_MATRIX_SYM_H
+
+#include <stim/cuda/cudatools/callable.h>
+#include <stim/math/matrix.h>
+
+/* This class represents a rank 2, 3-dimensional tensor viable
+for representing tensor fields such as structure and diffusion tensors
+*/
+namespace stim{
+
+/// Symmetric D x D matrix that stores only the D*(D+1)/2 values of its lower triangle.
+template <typename T, int D>
+class matrix_sym{
+
+protected:
+	//values are stored in column-major order as a lower-triangular matrix
+	T M[D*(D + 1)/2];
+
+	//calculate the storage index of the element at row r and column c
+	CUDA_CALLABLE static size_t idx(size_t r, size_t c) {
+		//if the index is in the upper-triangular portion, swap the indices
+		if(r < c){
+			size_t t = r;
+			r = c;
+			c = t;
+		}
+
+		size_t ci = (c + 1) * (D + (D - c))/2 - 1;		//index to the end of column c
+		size_t i = ci - (D - r - 1);
+		return i;
+	}
+
+	//calculate the row and column given an index
+	//static void indices(size_t& r, size_t& c, size_t idx) {
+	//	size_t col = 0;
+	//	for ( ; col < D; col++)
+	//		if(idx <= ((D - col + D) * (col + 1)/2 - 1))
+	//			break;
+
+	//	c = col;
+	//	size_t ci = (D - (col - 1) + D) * col / 2 - 1;   //index to the end of last column col -1
+	//	r = idx - ci + c - 1;
+	//}
+
+	//calculate the row and column (in the lower triangle) given a storage index
+	//	the column is the smallest c whose last stored element lies at or after idx; it is found by solving
+	//	the corresponding quadratic in floating point (integer division would turn the 1/2 factors into 0)
+	//	idx is expected to be less than D*(D+1)/2
+	static void indices(size_t& r, size_t& c, size_t idx) {
+		double disc = 4.0 * D * D + 4.0 * D - (7.0 + 8.0 * (double)idx);
+		double cf = -0.5 * sqrt(disc) + (double)D - 0.5;
+		c = (size_t)ceil(cf);
+		r = idx + c * (c + 1) / 2 - D * c;
+	}
+
+public:
+	//return the symmetric matrix associated with this tensor
+	//	the result is a full D x D matrix with both triangles filled in
+	stim::matrix<T> mat() {
+		stim::matrix<T> r(D, D);
+		for (size_t col = 0; col < (size_t)D; col++)
+			for (size_t row = 0; row < (size_t)D; row++)
+				r(row, col) = M[idx(row, col)];
+		return r;
+	}
+
+	//return a reference to the element at row r and column c; (r, c) and (c, r) refer to the same value
+	CUDA_CALLABLE T& operator()(int r, int c) {		
+		return M[idx(r, c)];
+	}
+
+	//assign the value rhs to every stored element
+	CUDA_CALLABLE matrix_sym<T, D> operator=(T rhs) {
+		int Nsq = D*(D+1)/2;
+		for(int i=0; i<Nsq; i++)
+			M[i] = rhs;
+
+		return *this;
+	}
+
+	//copy every stored element from rhs
+	CUDA_CALLABLE matrix_sym<T, D> operator=(matrix_sym<T, D> rhs) {
+		size_t N = D * (D + 1) / 2;
+		for (size_t i = 0; i < N; i++) M[i] = rhs.M[i];
+		return *this;
+	}
+
+	//return the sum of the diagonal values
+	CUDA_CALLABLE T trace() {
+		T tr = 0;
+		for (size_t i = 0; i < D; i++)		//for each diagonal value
+			tr += M[idx(i, i)];				//add the value on the diagonal
+		return tr;
+	}
+	// overload matrix multiply scalar
+	//	multiplies every stored element of B by rhs in place (this matrix itself is not used)
+	CUDA_CALLABLE void operator_product(matrix_sym<T, D> &B, T rhs) {
+		int Nsq = D*(D+1)/2;
+		for(int i=0; i<Nsq; i++)
+			B.M[i] *= rhs;
+	}
+
+	//return the tensor as a string
+	std::string str() {
+		std::stringstream ss;
+		for(int r = 0; r < D; r++){
+			ss << "| ";
+			for(int c=0; c<D; c++)
+			{
+				ss << (*this)(r, c) << " ";
+			}
+			ss << "|" << std::endl;
+		}
+
+		return ss.str();
+	}
+
+	//returns an identity matrix
+	static matrix_sym<T, D> identity() {
+		matrix_sym<T, D> I;
+		I = 0;
+		for (size_t i = 0; i < D; i++)
+			I.M[matrix_sym<T, D>::idx(i, i)] = 1;
+		return I;
+	}
+};
+
+
+
+}	//end namespace stim
+
+
+#endif
@@ -188,9 +188,9 @@ class plane
 		{
 			quaternion<T> q;
 			q.CreateRotation(N, n);
-			
-			N = q.toMatrix3() * N;
-			U = q.toMatrix3() * U;
+			matrix_sq<T, 3> M = q.toMatrix3();
+			N = M * N;
+			U = M * U;
 		}
 #ifndef RTS_QUATERNION_H
 #define RTS_QUATERNION_H
-#include <stim/math/matrix.h>
+#include <stim/math/matrix_sq.h>
 #include <stim/cuda/cudatools/callable.h>
 namespace stim{
@@ -46,7 +46,9 @@ public:
 		from = from.norm();
 		to = to.norm();
 		vec3<T> r = from.cross(to);			//compute the rotation vector
-		T theta = asin(r.len());				//compute the angle of the rotation about r
+		T l = r.len();
+		if (l > 1) l = 1;					//we have seen degenerate cases where |r| > 1 (probably due to loss of precision in the cross product)
+		T theta = asin(l);				//compute the angle of the rotation about r
 		//deal with a zero vector (both k and kn point in the same direction)
 		if(theta == (T)0){
 			return;
@@ -81,9 +83,9 @@ public:
 		return result;
 	}
-	CUDA_CALLABLE matrix<T, 3> toMatrix3(){
+	CUDA_CALLABLE matrix_sq<T, 3> toMatrix3(){
-		matrix<T, 3> result;
+		matrix_sq<T, 3> result;
 	    T wx, wy, wz, xx, yy, yz, xy, xz, zz, x2, y2, z2;
@@ -114,9 +116,9 @@ public:
 		return result;
 	}
-	CUDA_CALLABLE matrix<T, 4> toMatrix4(){
+	CUDA_CALLABLE matrix_sq<T, 4> toMatrix4(){
-		matrix<T, 4> result;
+		matrix_sq<T, 4> result;
 	    T wx, wy, wz, xx, yy, yz, xy, xz, zz, x2, y2, z2;
 	    // calculate coefficients
+#ifndef STIM_RANDOM
+#define STIM_RANDOM
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cmath>
+#include <vector>
+#include <stim/math/vec3.h>
+#include <stim/math/constants.h>
+
+namespace stim{
+
+template<class T>
+class Random{
+protected:
+	/// Seed the C library generator (rand()) from the current time.
+	void init() {
+		srand((unsigned int)time(NULL));
+	}
+
+	/// Seed the C library generator (rand()) with an explicit value, giving a repeatable sequence.
+	void init(unsigned int seed){
+		srand(seed);
+	}
+
+public:
+	/// Default Constructor: seeds rand() from the current time.
+	Random(){
+		init();
+	}
+
+	/// Constructor from a seed.
+	/// The seed is handed directly to srand(), so equal seeds reproduce the same sequence.
+	Random(unsigned int seed){
+		init(seed);
+	}
+
+	///Returns rand() / RAND_MAX: a uniformly sampled value in [0, 1] (both end points can occur).
+	static
+	T uniformRandom()
+	{
+		return ((T)(rand())) / ((T)(RAND_MAX));
+	}
+
+	///Returns sqrt(-log(u1)) * cos(pi/2 * u2) for two uniform samples u1 and u2.
+	///The angle only spans [0, pi/2], so the returned value is never negative.
+	///Intended for floating-point T.
+	static
+	T normalRandom()
+	{
+		T u1 = uniformRandom();
+		T u2 = uniformRandom();
+		while (!(u1 > (T)0))			//log(0) is -infinity, which would return an infinite sample; redraw instead
+			u1 = uniformRandom();
+		return (T)(cos(2.0*atan(1.0)*u2)*sqrt(-1.0*log(u1)));	//2*atan(1) == pi/2
+	}
+
+	///Returns a vec3 whose x and y are uniform samples in [0, 1]; z is fixed at 1.
+	static
+	stim::vec3<T> uniformRandVector()
+	{
+		stim::vec3<T> r(uniformRandom(), uniformRandom(), 1.0);
+		return r;
+	}
+
+	///Returns a vec3 whose x and y are drawn with normalRandom(); z is fixed at 1.
+	static
+	stim::vec3<T> normalRandVector()
+	{
+		stim::vec3<T> r(normalRandom(), normalRandom(), 1.0);	//use the class precision T (was hard-coded to float)
+		return r;
+	}
+
+	///Place num_samples random samples on the surface of a sphere.
+	/// @param num_samples is the number of points to generate
+	/// @param radius is the radius of the sphere
+	/// @param solidAngle limits the samples to polar angles in [0, solidAngle / 2]
+	///returns an std::vector of vec3's in cartesian coordinates.
+	static std::vector<stim::vec3 <T> >
+	sample_sphere(unsigned int num_samples, T radius = 1, T solidAngle = stim::TAU)
+	{
+		T PHI[2], Z[2], range;			///range of polar angles and the matching z values
+		PHI[0] = solidAngle/2;			///largest polar angle that may be sampled
+		PHI[1] = asin(0);				///smallest polar angle (zero)
+		Z[0] = cos(PHI[0]);				///z value at the largest polar angle
+		Z[1] = cos(PHI[1]);				///z value at the pole
+		range = Z[0] - Z[1];			///signed extent of all possible z values
+
+		T z, theta, phi;				///temporaries for a single sample
+
+		std::vector<stim::vec3<T> > samples;
+		samples.reserve(num_samples);	///the final size is known up front
+
+		for(unsigned int i = 0; i < num_samples; i++)	///for each sample
+		{
+			z = uniformRandom()*range + Z[1];		///sampling z uniformly spreads points evenly over the surface area
+			theta = uniformRandom() * stim::TAU;	///azimuth, uniform over the full circle
+			phi = acos(z);							///polar angle that corresponds to z
+			stim::vec3<T> sph(radius, theta, phi);	///spherical coordinate (r, theta, phi)
+			stim::vec3<T> cart = sph.sph2cart();	///convert to cartesian
+			samples.push_back(cart);				///push into the list
+		}
+
+		return samples;					///return the full list
+	}
+};
+
+}
+
+#endif
 #ifndef STIM_SPH_HARMONICS
 #define STIM_SPH_HARMONICS
-#include <stim/math/vector.h>
+#include <complex>
 #include <boost/math/special_functions/spherical_harmonic.hpp>
+#include <stim/math/constants.h>
+#include <stim/math/random.h>
 #include <vector>
-#define PI 3.14159
 #define WIRE_SCALE 1.001
-namespace stim{
+namespace stim {
-template<class T>
-class spharmonics{
+	template<class T>
+	class spharmonics {
-protected:
+	public:
+		std::vector<T> C;										//list of SH coefficients
-	std::vector<T> C;	//list of SH coefficients
+	protected:
+		unsigned int mcN;										//number of Monte-Carlo samples
+		unsigned int coeff_1d(unsigned int l, int m) {			//convert (l,m) to i (1D coefficient index)
+			//(l + 1)^2 - (l - m) - 1 simplifies to l^2 + l + m; computing it with integers
+			//	avoids a round trip through floating-point pow() and unsigned wrap-around in (l - m)
+			//	expects -l <= m <= l
+			return (unsigned int)((int)(l * l + l) + m);
+		}
+		void coeff_2d(size_t c, unsigned int& l, int& m) {		//convert a 1D coefficient index into (l, m)
+			//tier l holds the indices l^2 ... (l + 1)^2 - 1, so l = ceil(sqrt(c + 1)) - 1
+			const double root = sqrt((double)c + 1);
+			l = (unsigned int)ceil(root) - 1;
+			//offset of c inside its tier, shifted so that it runs over -l ... +l
+			const size_t tier_start = (size_t)(l * l);
+			m = (int)(c - tier_start) - (int)l;
+		}
+
+	public:
+		spharmonics() {
+			mcN = 0;
+		}
+		spharmonics(size_t c) : spharmonics() {
+			resize(c);
+		}
+
+		void push(T c) {
+			C.push_back(c);
+		}
+
+		void resize(unsigned int n) {
+			C.resize(n);
+		}
+
+		void setc(unsigned int l, int m, T value) {
+			unsigned int c = coeff_1d(l, m);
+			C[c] = value;
+		}
+
+		T getc(unsigned int l, int m) {
+			unsigned int c = coeff_1d(l, m);
+			return C[c];
+		}
+
+		void setc(unsigned int c, T value) {
+			C[c] = value;
+		}
-	unsigned int mcN;	//number of Monte-Carlo samples
+		unsigned int getSize() const {
+			return C.size();
+		}
-	//calculate the value of the SH basis function (l, m) at (theta, phi)
+		std::vector<T> getC() const {
+			return C;
+		}
+		//calculate the value of the SH basis function (l, m) at (theta, phi)
 		//here, theta = [0, PI], phi = [0, 2*PI]
-	double SH(int l, int m, double theta, double phi){
-		return boost::math::spherical_harmonic_r(l, m, phi, theta);
-	}
+		T SH(unsigned int l, int m, T theta, T phi) {
+			//real-valued spherical harmonics assembled from Boost's complex-valued ones, see:
+			//		https://en.wikipedia.org/wiki/Spherical_harmonics#Addition_theorem
+			//the angles are forwarded to Boost in the order (phi, theta)
+			if (m == 0)
+				return boost::math::spherical_harmonic(l, m, phi, theta).real();
+
+			//sqrt(2) * (-1)^m is shared by the m < 0 and m > 0 branches
+			const double scale = sqrt(2.0) * pow(-1, m);
+			if (m < 0)
+				return scale * boost::math::spherical_harmonic(l, abs(m), phi, theta).imag();	//negative orders use the imaginary part at |m|
+
+			return scale * boost::math::spherical_harmonic(l, m, phi, theta).real();			//positive orders use the real part
+		}
-	unsigned int coeff_1d(unsigned int l, int m){
-		return pow(l + 1, 2) - (l - m) - 1;
-	}
+		/// Calculate the spherical harmonic result given a 1D coefficient index
+		T SH(size_t c, T theta, T phi) {
+			//translate the flat index into (degree, order), then defer to the (l, m) overload
+			unsigned int degree;
+			int order;
+			coeff_2d(c, degree, order);
+			return SH(degree, order, theta, phi);
+		}
-	
-public:
+		/// Initialize Monte-Carlo sampling of a function using N spherical harmonics coefficients
-	void push(double c){
-		C.push_back(c);
-	}
+		/// @param coefficients is the number of spherical harmonics coefficients used to represent the user function
+		void mcBegin(unsigned int coefficients) {
+			//start every Monte-Carlo run from all-zero coefficients; resize(n, 0) only zeroes
+			//	newly added entries, so values left from an earlier run would leak into this one
+			C.assign(coefficients, 0);
+			mcN = 0;		//no samples accumulated yet
+		}
-	void resize(unsigned int n){
-		C.resize(n);
-	}
+		void mcBegin(unsigned int l, int m) {
+			//(l, m) names the last coefficient to use, so the count is its 1D index plus one
+			//	(same value as the former pow(l + 1, 2) - (l - m), without floating-point math)
+			unsigned int c = coeff_1d(l, m) + 1;
+			mcBegin(c);
+		}
-	void setc(unsigned int l, int m, T value){
-		unsigned int c = coeff_1d(l, m);
-		C[c] = value;
-	}
+		void mcSample(T theta, T phi, T val) {
-	void setc(unsigned int c, T value){
-		C[c] = value;
-	}
+			int l, m;
+			T sh;
-	/// Initialize Monte-Carlo sampling of a function using N spherical harmonics coefficients
+			l = m = 0;
+			for (unsigned int i = 0; i < C.size(); i++) {
-	/// @param N is the number of spherical harmonics coefficients used to represent the user function
-	void mcBegin(unsigned int coefficients){
-		C.resize(coefficients, 0);
-		mcN = 0;
-	}
+				sh = SH(l, m, theta, phi);
+				C[i] += sh * val;
-	void mcBegin(unsigned int l, int m){
-		unsigned int c = pow(l + 1, 2) - (l - m);
-		mcBegin(c);
-	}
+				m++;			//increment m
-	void mcSample(double theta, double phi, double val){
+								//if we're in a new tier, increment l and set m = -l
+				if (m > l) {
+					l++;
+					m = -l;
+				}
+			}	//end for all coefficients
-		int l, m;
-		double sh;
+				//increment the number of samples
+			mcN++;
-		l = m = 0;
-		for(unsigned int i = 0; i < C.size(); i++){
+		}	//end mcSample()
-			sh = SH(l, m, theta, phi);
-			C[i] += sh * val;
+		void mcEnd() {
-			m++;			//increment m
+			//divide all coefficients by the number of samples
+			for (unsigned int i = 0; i < C.size(); i++)
+				C[i] /= mcN;
+		}
-			//if we're in a new tier, increment l and set m = -l
-			if(m > l){		
-				l++;
-				m = -l;
+		/// Generates a PDF describing the probability distribution of points on a spherical surface
+		/// @param sph_pts is a list of points in spherical coordinates (theta, phi) where theta = [0, 2pi] and phi = [0, pi]
+		/// @param l is the maximum degree of the spherical harmonic function
+		/// @param m is the maximum order
+		void pdf(std::vector<stim::vec3<T> > sph_pts, unsigned int l, int m) {
+			mcBegin(l, m);		//begin spherical harmonic sampling
+			unsigned int nP = sph_pts.size();
+			for (unsigned int p = 0; p < nP; p++) {
+				mcSample(sph_pts[p][1], sph_pts[p][2], 1.0);
 			}
-		}	//end for all coefficients
+			mcEnd();
+		}
-		//increment the number of samples
-		mcN++;
+		void pdf(std::vector<stim::vec3<T> > sph_pts, size_t c) {
+			//convert the flat coefficient index into (degree, order) and reuse the (l, m) overload
+			unsigned int degree;
+			int order;
+			coeff_2d(c, degree, order);
+			pdf(sph_pts, degree, order);
+		}
-	}	//end mcSample()
+		/// Project a set of samples onto a spherical harmonic basis
+		void project(std::vector<stim::vec3<T> > sph_pts, unsigned int l, int m) {
+			//each point is stored as (value, theta, phi): component 0 is the sampled value,
+			//	components 1 and 2 are the angles passed on to mcSample()
+			mcBegin(l, m);
+			const unsigned int count = sph_pts.size();
+			unsigned int idx = 0;
+			while (idx < count) {
+				mcSample(sph_pts[idx][1], sph_pts[idx][2], sph_pts[idx][0]);
+				++idx;
+			}
+			mcEnd();		//average the accumulated sums over the number of samples
+		}
+		void project(std::vector<stim::vec3<T> > sph_pts, size_t c) {
+			//convert the flat coefficient index into (degree, order) and reuse the (l, m) overload
+			unsigned int degree;
+			int order;
+			coeff_2d(c, degree, order);
+			project(sph_pts, degree, order);
+		}
-	void mcEnd(){
+		/// Generates a PDF describing the density distribution of points on a sphere
+		/// @param sph_pts is a list of points in cartesian coordinates 
+		/// @param l is the maximum degree of the spherical harmonic function
+		/// @param m is the maximum order
+		/// @param c is the centroid of the points in sph_pts. DEFAULT 0,0,0
+		/// @param n is the number of points of the surface of the sphere used to create the PDF. DEFAULT 1000
+		/// @param norm, a boolean that sets where the output vectors will be normalized between 0 and 1.
+		/// @param 
+		/*void pdf(std::vector<stim::vec3<T> > sph_pts, unsigned int l, int m, stim::vec3<T> c = stim::vec3<T>(0, 0, 0), unsigned int n = 1000, bool norm = true, std::vector<T> w = std::vector<T>())
+		{
+			std::vector<double> weights;		///the weight at each point on the surface of the sphere.
+												//		weights.resize(n);
+			unsigned int nP = sph_pts.size();
+			std::vector<stim::vec3<T> > sphere = stim::Random<T>::sample_sphere(n, 1.0, stim::TAU);
+			if (w.size() < nP)
+				w = std::vector<T>(nP, 1.0);
+
+			for (int i = 0; i < n; i++)
+			{
+				T val = 0;
+				for (int j = 0; j < nP; j++)
+				{
+					stim::vec3<T> temp = sph_pts[j] - c;
+					if (temp.dot(sphere[i]) > 0)
+						val += pow(temp.dot(sphere[i]), 4)*w[j];
+				}
+				weights.push_back(val);
+			}
-		//divide all coefficients by the number of samples
-		for(unsigned int i = 0; i < C.size(); i++)
-			C[i] /= mcN;
-	}
+			mcBegin(l, m);		//begin spherical harmonic sampling
-	/// Generates a PDF describing the probability distribution of points on a spherical surface
+			if (norm)
+			{
+				T min = *std::min_element(weights.begin(), weights.end());
+				T max = *std::max_element(weights.begin(), weights.end());
+				for (unsigned int i = 0; i < n; i++)
+				{
+					stim::vec3<T> sph = sphere[i].cart2sph();
+					mcSample(sph[1], sph[2], (weights[i] - min) / (max - min));
+				}
-	/// @param sph_pts is a list of points in spherical coordinates (theta, phi) where theta = [0, 2pi] and phi = [0, pi]
-	/// @param l is the maximum degree of the spherical harmonic function
-	/// @param m is the maximum order
-	void pdf(std::vector<stim::vec<double> > sph_pts, unsigned int l, int m){
-			
-		mcBegin( l, m );		//begin spherical harmonic sampling
+			}
+			else {
+				for (unsigned int i = 0; i < n; i++)
+				{
+					stim::vec3<T> sph = sphere[i].cart2sph();
+					mcSample(sph[1], sph[2], weights[i]);
+				}
+			}
+			mcEnd();
+		}*/
-		unsigned int nP = sph_pts.size();
+		std::string str() {
-		for(unsigned int p = 0; p < nP; p++){
-			mcSample(sph_pts[p][1], sph_pts[p][2], 1.0);
-		}
+			std::stringstream ss;
-		mcEnd();
-	}
+			int l, m;
+			l = m = 0;
+			for (unsigned int i = 0; i < C.size(); i++) {
-	std::string str(){
+				ss << C[i] << '\t';
-		std::stringstream ss;
+				m++;			//increment m
-		int l, m;
-		l = m = 0;
-		for(unsigned int i = 0; i < C.size(); i++){
-				
-			ss<<C[i]<<'\t';
+								//if we're in a new tier, increment l and set m = -l
+				if (m > l) {
+					l++;
+					m = -l;
-			m++;			//increment m
+					ss << std::endl;
-			//if we're in a new tier, increment l and set m = -l
-			if(m > l){
-				l++;
-				m = -l;
+				}
+			}
+
+			return ss.str();
+
+
+		}
-				ss<<std::endl;
-					
+		/// Returns the value of the function at coordinate (theta, phi)
+		T p(T theta, T phi) {
+			T fx = 0;
+
+			int l = 0;
+			int m = 0;
+			for (unsigned int i = 0; i < C.size(); i++) {
+				fx += C[i] * SH(l, m, theta, phi);
+				m++;
+				if (m > l) {
+					l++;
+					m = -l;
+				}
 			}
+			return fx;
 		}
-		return ss.str();
+		/// Returns the derivative of the spherical function with respect to theta
+		///		return value is in cartesian coordinates
+		vec3<T> dtheta(T theta, T phi, T d = 0.01) {
+			T r = p(theta, phi);											//calculate the value of the spherical function at three points
+			T rt = p(theta + d, phi);
+			//double rp = p(theta, phi + d);
+			vec3<T> s(r, theta, phi);										//get the spherical coordinate position for all three points
+			vec3<T> st(rt, theta + d, phi);
+			//vec3<double> sp(rp, theta, phi + d);
-	}
+			vec3<T> c = s.sph2cart();
+			vec3<T> ct = st.sph2cart();
+			//vec3<double> cp = sp.sph2cart();
-	/// Returns the value of the function at the coordinate (theta, phi)
+			vec3<T> dt = (ct - c)/d;									//calculate the derivative
+			return dt;
+		}
+
+		/// Returns the derivative of the spherical function with respect to phi
+		///		return value is in cartesian coordinates
+		vec3<T> dphi(T theta, T phi, T d = 0.01) {
+			//forward difference: evaluate the function at phi and at phi + d
+			T r0 = p(theta, phi);
+			T r1 = p(theta, phi + d);
+
+			//place both samples on the surface and convert them to cartesian coordinates
+			vec3<T> here = vec3<T>(r0, theta, phi).sph2cart();
+			vec3<T> ahead = vec3<T>(r1, theta, phi + d).sph2cart();
+
+			//difference quotient of the two cartesian positions
+			return (ahead - here) / d;
+		}
+		
+		/// Returns the value of the function at the coordinate (theta, phi)
+		/// @param theta = [0, 2pi]
+		/// @param phi = [0, pi]
+		T operator()(T theta, T phi) {
+			return p(theta, phi);			
+		}
+
+		//overload arithmetic operations
+
+		spharmonics<T> operator*(T rhs) const {
+			//the product has the same number of coefficients, each one scaled by rhs
+			const size_t count = C.size();
+			spharmonics<T> scaled(count);
+			size_t i = 0;
+			while (i < count) {
+				scaled.C[i] = C[i] * rhs;
+				++i;
+			}
+			return scaled;
+		}
+
+
+
+		spharmonics<T> operator+(spharmonics<T> rhs) {
+
+			size_t low = std::min(C.size(), rhs.C.size());		//store the number of coefficients in the lowest object
+			size_t high = std::max(C.size(), rhs.C.size());		//store the number of coefficients in the result
+			bool rhs_lowest = false;				//true if rhs has the lowest number of coefficients
+			if (rhs.C.size() < C.size()) rhs_lowest = true;		//if rhs has a lower number of coefficients, set the flag
-	/// @param theta = [0, 2pi]
-	/// @param phi = [0, pi]
-	double operator()(double theta, double phi){
-		double fx = 0;
-		int l = 0;
-		int m = 0;
-		for(unsigned int i = 0; i < C.size(); i++){
-			fx += C[i] * SH(l, m, theta, phi);
-			m++;
-			if(m > l){
-				l++;
-				m = -l;					
+			spharmonics<T> result(high);								//create a new object
+
+			size_t c;
+			for (c = 0; c < low; c++)		//perform the first batch of additions
+				result.C[c] = C[c] + rhs.C[c];	//perform the addition
+
+			for (c = low; c < high; c++) {
+				if (rhs_lowest)
+					result.C[c] = C[c];
+				else
+					result.C[c] = rhs.C[c];
+			}
+			return result;
+		}
+
+
+
+		spharmonics<T> operator-(spharmonics<T> rhs) {
+			return (*this) + (rhs * (T)(-1));
+		}
+		/// Fill an NxN grid with the spherical function for theta = [0 2pi] and phi = [0 pi]
+		void get_func(T* data, size_t X, size_t Y) {
+			T dt = stim::TAU / (T)X;			//calculate the step size in each direction
+			T dp = stim::PI / (T)(Y - 1);
+			for (size_t ti = 0; ti < X; ti++) {
+				for (size_t pi = 0; pi < Y; pi++) {
+					data[pi * X + ti] = (*this)((T)ti * dt, (T)pi * dp);
+				}
 			}
+		}
+		/// Project a spherical function onto the basis using C coefficients
+		/// @param data is a pointer to the function values in (theta, phi) coordinates
+		/// @param x is the number of samples along theta = [0 2pi), @param y is the number of samples along phi = [0 pi], @param nc is the number of coefficients to compute
+		void project(T* data, size_t x, size_t y, size_t nc) {
+			//data is read as data[phi_i * x + theta_i]: x samples along theta, y samples along phi
+			C.assign(nc, 0);													//start from zero so repeated calls do not accumulate onto old coefficients
+			if (x == 0 || y == 0) return;										//an empty grid contributes nothing
+			//use the same sampling as get_func(): theta covers [0, 2pi) in x steps, phi covers [0, pi] in (y - 1) steps
+			T dtheta = stim::TAU / (T)x;										//calculate the grid spacing along theta
+			T dphi = (y > 1) ? (T)(stim::PI / (T)(y - 1)) : (T)stim::PI;		//calculate the grid spacing along phi (avoid dividing by zero for a single row)
+			T theta, phi;
+			for (size_t c = 0; c < nc; c++) {									//for each coefficient
+				for (size_t theta_i = 0; theta_i < x; theta_i++) {				//for each coordinate in the provided array
+					theta = theta_i * dtheta;									//calculate theta
+					for (size_t phi_i = 0; phi_i < y; phi_i++) {
+						phi = phi_i * dphi;										//calculate phi
+						//Riemann sum of f * Y over the sphere; sin(phi) * dtheta * dphi is the surface element
+						C[c] += data[phi_i * x + theta_i] * SH(c, theta, phi) * dtheta * dphi * sin(phi);
+					}
+				}
+			}
 		}
-		return fx;
-	}
+		/// Generate spherical harmonic coefficients based on a set of N samples
+		/*void fit(std::vector<stim::vec3<T> > sph_pts, unsigned int L, bool norm = true)
+		{
+			//std::vector<T> coeffs;
+
+			//generate a matrix for fitting
+			int B = L*(L+2)+1;					//calculate the matrix size
+			stim::matrix<T> mat(B, B);			//allocate space for the matrix
+
+
+
+			std::vector<T> sums;
+			//int B = l*(l+2)+1;
+			coeffs.resize(B);
+			sums.resize(B);
+			//stim::matrix<T> mat(B, B);
+			for(int i = 0; i < sph_pts.size(); i++)
+			{
+				mcBegin(l,m);
+				mcSample(sph_pts[i][1], sph_pts[i][2], 1.0);
+				for(int j = 0; j < B; j++)
+				{
+					sums[j] += C[j];
+					//      sums[j] += C[j]*sums[j];
+				}       
+				mcEnd();
+			}
+			for(int i = 0; i < B; i++)
+			{
+				for(int j = 0; j < B; j++)
+				{
+					mat(i,j) = sums[i]*sums[j];
+				}
+			}
+
+			if(mat.det() == 0)
+			{
+				std::cerr << " matrix not solvable " << std::endl;
+			}
+			else
+			{
+				//for(int i = 0; i <
+			}
+		}*/
+
+
+
+
-};		//end class sph_harmonics
+	};		//end class sph_harmonics
+#ifndef STIM_TENSOR2_H
+#define STIM_TENSOR2_H
+
+#include "matrix_sym.h"
+
+namespace stim {
+
+/*This class represents a symmetric rank-2 2D tensor, useful for structure tensors
+*/
+template<typename T>
+class tensor2 : public matrix_sym<T, 2> {
+
+protected:
+
+public:
+
+	//calculate the eigenvectors and eigenvalues of the tensor
+	//	the tensor is stored as	| M[0]  M[1] |
+	//							| M[1]  M[2] |
+	//	@param v receives the (un-normalized) eigenvectors, one per column
+	//	@param lambda receives the eigenvalues on its diagonal, largest first;
+	//		column i of v belongs to lambda(i, i)
+	CUDA_CALLABLE void eig(stim::matrix<T, 2>& v, stim::matrix<T, 2>& lambda) {
+
+		const T a = this->M[0];				//this-> is required to reach a member of the dependent base class
+		const T b = this->M[1];
+		const T c = this->M[2];
+
+		lambda = 0;							//initialize the eigenvalue matrix to zero
+
+		T t = a + c;						//calculate the trace of the tensor
+		T d = a * c - b * b;				//calculate the determinant of the tensor
+
+		//the discriminant equals ((a - c) / 2)^2 + b^2, so it can only become negative through rounding
+		T disc = t*t / 4 - d;
+		if (disc < 0) disc = 0;
+		T root = sqrt(disc);
+
+		lambda(0, 0) = t / 2 + root;
+		lambda(1, 1) = t / 2 - root;
+
+		if (b == 0) {
+			//diagonal tensor: the eigenvectors are the coordinate axes
+			v = stim::matrix<T, 2>::identity();
+			if (a < c) {
+				//the larger eigenvalue lambda(0, 0) is c, which belongs to the second axis,
+				//	so exchange the two columns to keep v aligned with lambda
+				v(0, 0) = 0;
+				v(1, 0) = 1;
+				v(0, 1) = 1;
+				v(1, 1) = 0;
+			}
+		}
+		else {
+			//(A - lambda * I) x = 0 is solved by x = (lambda - c, b)
+			//	(the element c = M[2] is subtracted here, not the determinant)
+			v(0, 0) = lambda(0, 0) - c;
+			v(0, 1) = lambda(1, 1) - c;
+			v(1, 0) = v(1, 1) = b;
+		}
+	}
+
+	//assign the values of a symmetric 2x2 matrix to this tensor
+	CUDA_CALLABLE tensor2<T> operator=(stim::matrix_sym<T, 2> rhs){
+		stim::matrix_sym<T, 2>::operator=(rhs);
+		return *this;
+	}
+};
+
+
+}	//end namespace stim
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_TENSOR3_H
+#define STIM_TENSOR3_H
+
+#include "matrix_sym.h"
+#include <stim/math/constants.h>
+
+namespace stim {
+
+	/*This class represents a symmetric rank-2 3D (3x3) tensor, useful for structure tensors
+	*/
+
+	//Matrix ID cheat sheet
+	//	| 0  1  2 |
+	//	| 1  3  4 |
+	//	| 2  4  5 |
+	template<typename T>
+	class tensor3 : public matrix_sym<T, 3> {
+
+	protected:
+
+		//expand the six stored values into a full 3x3 array
+		//	storage order:	| 0  1  2 |
+		//					| 1  3  4 |
+		//					| 2  4  5 |
+		CUDA_CALLABLE void unpack(T A[3][3]) {
+			A[0][0] = this->M[0];
+			A[0][1] = A[1][0] = this->M[1];
+			A[0][2] = A[2][0] = this->M[2];
+			A[1][1] = this->M[3];
+			A[1][2] = A[2][1] = this->M[4];
+			A[2][2] = this->M[5];
+		}
+
+		//compute a unit eigenvector from the two OTHER eigenvalues la and lb:
+		//	every column of (A - la*I)(A - lb*I) is a multiple of the eigenvector that belongs
+		//	to the remaining eigenvalue (Cayley-Hamilton), so the longest column is normalized
+		//	if all columns vanish (repeated eigenvalues), out is set to zero
+		CUDA_CALLABLE static void eigvec(T A[3][3], T la, T lb, T out[3]) {
+			T P[3][3];
+			for (int r = 0; r < 3; r++) {
+				for (int c = 0; c < 3; c++) {
+					T acc = 0;
+					for (int k = 0; k < 3; k++) {
+						T left = A[r][k] - ((r == k) ? la : (T)0);		//element (r, k) of A - la*I
+						T right = A[k][c] - ((k == c) ? lb : (T)0);		//element (k, c) of A - lb*I
+						acc += left * right;
+					}
+					P[r][c] = acc;
+				}
+			}
+
+			int best = 0;										//index of the longest column found so far
+			T best_len2 = 0;									//its squared length
+			for (int c = 0; c < 3; c++) {
+				T len2 = P[0][c] * P[0][c] + P[1][c] * P[1][c] + P[2][c] * P[2][c];
+				if (len2 > best_len2) {
+					best_len2 = len2;
+					best = c;
+				}
+			}
+
+			if (best_len2 > 0) {
+				T len = sqrt(best_len2);
+				for (int r = 0; r < 3; r++)
+					out[r] = P[r][best] / len;
+			}
+			else {
+				out[0] = out[1] = out[2] = 0;					//degenerate case: no direction can be extracted
+			}
+		}
+
+	public:
+
+		//calculates the determinant of the tensor
+		CUDA_CALLABLE T det() {
+			return this->M[0] * this->M[3] * this->M[5]
+				+ 2 * (this->M[1] * this->M[4] * this->M[2])
+				- this->M[2] * this->M[3] * this->M[2]
+				- this->M[1] * this->M[1] * this->M[5]
+				- this->M[0] * this->M[4] * this->M[4];
+		}
+
+		//calculate the eigenvalues for the tensor
+		//adapted from https://en.wikipedia.org/wiki/Eigenvalue_algorithm
+		//	returns the eigenvalues as a vector; in the general (non-diagonal) case they are
+		//	sorted so that lam[0] <= lam[1] <= lam[2], in the diagonal shortcut they are
+		//	returned in diagonal order
+		CUDA_CALLABLE stim::vec3<T> lambda() {
+			stim::vec3<T> lam;
+			T p1 = this->M[1] * this->M[1] + this->M[2] * this->M[2] + this->M[4] * this->M[4];	//sum of the squared off-diagonal values
+			if (p1 == 0) {										//if this value is zero, the matrix is diagonal
+				lam[0] = this->M[0];							//the eigenvalues are the diagonal values
+				lam[1] = this->M[3];
+				lam[2] = this->M[5];
+				return lam;										//return the eigenvalue vector
+			}
+
+			T tr = matrix_sym<T, 3>::trace();					//calculate the trace of the matrix
+			T q = tr / 3;
+			T p2 = (this->M[0] - q) * (this->M[0] - q) + (this->M[3] - q) * (this->M[3] - q) + (this->M[5] - q) * (this->M[5] - q) + 2 * p1;
+			T p = sqrt(p2 / 6);									//p > 0 here because p1 > 0
+			tensor3<T> B = *this;								// B = A
+			B.M[0] = (B.M[0] - q);								// B = A - q*I
+			B.M[3] = (B.M[3] - q);
+			B.M[5] = (B.M[5] - q);
+			matrix_sym<T, 3>::operator_product(B, 1/p);			// B = (1/p) * (A - q*I)
+			T r = B.det() / 2;									//calculate det(B) / 2
+
+			// In exact arithmetic for a symmetric matrix - 1 <= r <= 1
+			// but computation error can leave it slightly outside this range.
+			T phi;
+			if (r <= -1) phi = stim::PI / 3;
+			else if (r >= 1) phi = 0;
+			else phi = acos(r) / 3;
+
+			// the eigenvalues satisfy eig3 >= eig2 >= eig1
+			lam[2] = q + 2 * p * cos(phi);
+			lam[0] = q + 2 * p * cos(phi + (2 * stim::PI / 3));
+			lam[1] = 3 * q - (lam[2] + lam[0]);					//the trace equals the sum of the eigenvalues
+
+			return lam;
+		}
+
+		//calculate the eigenvectors of the tensor
+		//	@param lambda holds the three eigenvalues (as returned by lambda())
+		//	returns a matrix whose column n is the unit eigenvector that belongs to lambda[n]
+		//NOTE(review): the default argument binds a temporary to a non-const reference and would
+		//	supply all-zero eigenvalues - callers should always pass the eigenvalues explicitly
+		CUDA_CALLABLE stim::matrix<T> eig(stim::vec3<T>& lambda = stim::vec3<T>()) {
+			T A[3][3];
+			unpack(A);											//full 3x3 copy of the tensor
+
+			stim::matrix<T> V = matrix_sym<T, 3>::mat();		//only used to obtain a 3x3 matrix; every entry is overwritten below
+
+			T v[3];
+			for (int n = 0; n < 3; n++) {						//column n uses the two eigenvalues other than lambda[n]
+				eigvec(A, lambda[(n + 1) % 3], lambda[(n + 2) % 3], v);
+				V(0, n) = v[0];
+				V(1, n) = v[1];
+				V(2, n) = v[2];
+			}
+			return V;											//return the eigenvector matrix
+		}
+
+		// return one specific eigenvector (column n of the eigenvector matrix, n = 0, 1, 2)
+		CUDA_CALLABLE stim::vec3<T> eig(int n, stim::vec3<T>& lambda = stim::vec3<T>()) {
+			stim::matrix<T> V = eig(lambda);					//same type that eig(lambda) returns
+			stim::vec3<T> v;
+			for(int i = 0; i < 3; i++)
+				v[i] = V(i, n);
+			return v;
+		}
+
+		//linear anisotropy: (l2 - l1) / (l0 + l1 + l2), for eigenvalues sorted l0 <= l1 <= l2
+		CUDA_CALLABLE T linear(stim::vec3<T>& lambda = stim::vec3<T>()) {
+			T cl = (lambda[2] - lambda[1]) / (lambda[0] + lambda[1] + lambda[2]);
+			return cl;
+		}
+
+		//planar anisotropy: 2 * (l1 - l0) / (l0 + l1 + l2)
+		CUDA_CALLABLE T Planar(stim::vec3<T>& lambda = stim::vec3<T>()) {
+			T cp = 2 * (lambda[1] - lambda[0]) / (lambda[0] + lambda[1] + lambda[2]);
+			return cp;
+		}
+
+		//spherical anisotropy: 3 * l0 / (l0 + l1 + l2)
+		CUDA_CALLABLE T spherical(stim::vec3<T>& lambda = stim::vec3<T>()) {
+			T cs = 3 * lambda[0] / (lambda[0] + lambda[1] + lambda[2]);
+			return cs;
+		}
+
+		//fractional anisotropy:
+		//	sqrt(1/2) * sqrt((l2 - l1)^2 + (l1 - l0)^2 + (l0 - l2)^2) / sqrt(l0^2 + l1^2 + l2^2)
+		//	returns 0 when all eigenvalues are zero
+		CUDA_CALLABLE T fa(stim::vec3<T>& lambda = stim::vec3<T>()) {
+			T d21 = lambda[2] - lambda[1];
+			T d10 = lambda[1] - lambda[0];
+			T d02 = lambda[0] - lambda[2];
+			T num = d21 * d21 + d10 * d10 + d02 * d02;
+			T den = lambda[2] * lambda[2] + lambda[1] * lambda[1] + lambda[0] * lambda[0];
+			if (den == 0) return 0;								//avoid 0 / 0 for a zero tensor
+			return (T)sqrt(num / (2 * den));					//the factor 1/2 is applied in floating point (1/2 as integers is 0)
+		}
+
+		//set every stored element of the tensor to rhs
+		CUDA_CALLABLE tensor3<T> operator=(T rhs) {
+			stim::matrix_sym<T, 3>::operator=(rhs);
+			return *this;
+		}
+
+		//assign the values of a symmetric 3x3 matrix to this tensor
+		CUDA_CALLABLE tensor3<T> operator=(stim::matrix_sym<T, 3> rhs) {
+			stim::matrix_sym<T, 3>::operator=(rhs);
+			return *this;
+		}
+	};
+
+
+}	//end namespace stim
+
+
+#endif
 \ No newline at end of file
@@ -5,6 +5,8 @@
 #include <stim/cuda/cudatools/callable.h>
 #include <cmath>
+#include <sstream>
+
 namespace stim{
@@ -68,7 +70,7 @@ public:
 	}
-	/// Convert the vector from cartesian to spherical coordinates (x, y, z -> r, theta, phi where theta = [0, 2*pi])
+	/// Convert the vector from cartesian to spherical coordinates (x, y, z -> r, theta, phi where theta = [-PI, PI])
 	CUDA_CALLABLE vec3<T> cart2sph() const{
 		vec3<T> sph;
 		sph.ptr[0] = len();
@@ -236,9 +238,16 @@ public:
 		return result;
 	}
-//#ifndef __NVCC__
+	CUDA_CALLABLE bool operator==(vec3<T> rhs) const{
+		//exact component-wise comparison (no tolerance); stops at the first mismatch
+		for(int i = 0; i < 3; i++){
+			if(!(rhs[i] == ptr[i]))
+				return false;
+		}
+		return true;
+	}
+
+//#ifndef __CUDACC__
 	/// Outputs the vector as a string
-	std::string str() const{
+std::string str() const{
 		std::stringstream ss;
 		const size_t N = 3;
@@ -5,6 +5,7 @@
 #include <cmath>
 #include <sstream>
 #include <vector>
+#include <algorithm>
 #include <stim/cuda/cudatools/callable.h>
 #include <stim/math/vec3.h>
@@ -338,7 +339,7 @@ struct vec : public std::vector&lt;T&gt;
 	/// Cast to a vec3
 	operator stim::vec3<T>(){
 		stim::vec3<T> r;
-		size_t N = min(size(), (size_t)3);
+		size_t N = std::min(size(), (size_t)3);
 		for(size_t i = 0; i < N; i++)
 			r[i] = at(i);
 		return r;