Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib into bsds500

Tianshu Cheng
2 parents 5343a315 93de94e6
Showing 30 changed files with 265 additions and 145 deletions Show diff stats
stim/cuda/arraymath.cuh
stim/cuda/array_abs.cuh → stim/cuda/arraymath/array_abs.cuh
stim/cuda/array_add.cuh → stim/cuda/arraymath/array_add.cuh
stim/cuda/array_cart2polar.cuh → stim/cuda/arraymath/array_cart2polar.cuh
stim/cuda/array_multiply.cuh → stim/cuda/arraymath/array_multiply.cuh
stim/cuda/cost.h
stim/cuda/cudatools.h
stim/cuda/callable.h → stim/cuda/cudatools/callable.h
stim/cuda/devices.h → stim/cuda/cudatools/devices.h
stim/cuda/error.h → stim/cuda/cudatools/error.h
stim/cuda/glbind.h → stim/cuda/cudatools/glbind.h
stim/cuda/threads.h → stim/cuda/cudatools/threads.h
stim/cuda/timer.h → stim/cuda/cudatools/timer.h
stim/cuda/ivote.cuh
stim/cuda/down_sample.cuh → stim/cuda/ivote/down_sample.cuh
stim/cuda/local_max.cuh → stim/cuda/ivote/local_max.cuh
stim/cuda/update_dir.cuh → stim/cuda/ivote/update_dir.cuh
stim/cuda/vote.cuh → stim/cuda/ivote/vote.cuh
stim/cuda/conv2.cuh → stim/cuda/templates/conv2.cuh
stim/cuda/gaussian_blur.cuh → stim/cuda/templates/conv2sep.cuh
 #ifndef STIM_CUDA_ARRAYMATH_H
 #define STIM_CUDA_ARRAYMATH_H
  
-#include <stim/cuda/array_add.cuh>
-#include <stim/cuda/array_multiply.cuh>
-#include <stim/cuda/array_abs.cuh>
-#include <stim/cuda/array_cart2polar.cuh>
-#include <stim/cuda/gaussian_blur.cuh>
-#include <stim/cuda/conv2.cuh>
+#include <stim/cuda/arraymath/array_add.cuh>
+#include <stim/cuda/arraymath/array_multiply.cuh>
+#include <stim/cuda/arraymath/array_abs.cuh>
+#include <stim/cuda/arraymath/array_cart2polar.cuh>
  
 namespace stim{
 	namespace cuda{
@@ -3,8 +3,7 @@
  
 #include <iostream>
 #include <cuda.h>
-#include <stim/cuda/devices.h>
-#include <stim/cuda/error.h>
+#include <stim/cuda/cudatools.h>
  
 namespace stim{
 	namespace cuda{
@@ -3,8 +3,7 @@
  
 #include <iostream>
 #include <cuda.h>
-#include <stim/cuda/devices.h>
-#include <stim/cuda/error.h>
+#include <stim/cuda/cudatools.h>
  
 namespace stim{
 	namespace cuda{
@@ -3,11 +3,11 @@
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <stdio.h>
-#include "../visualization/colormap.h"
+#include <stim/visualization/colormap.h>
 #include <sstream>
-#include "../math/vector.h"
-#include "../cuda/devices.h"
-#include "../cuda/threads.h"
+#include <stim/math/vector.h>
+#include <stim/cuda/cudatools/devices.h>
+#include <stim/cuda/cudatools/threads.h>
  
 ///Cost function that works with the gl-spider class to find index of the item with min-cost.
 typedef unsigned char uchar;
+#include <stim/cuda/cudatools/callable.h>
+#include <stim/cuda/cudatools/devices.h>
+#include <stim/cuda/cudatools/error.h>
+#include <stim/cuda/cudatools/threads.h>
+#include <stim/cuda/cudatools/timer.h>
 \ No newline at end of file
@@ -9,7 +9,7 @@
  
 //#include <cudaHandleError.h>
 #include "cuda_gl_interop.h"
-#include "../gl/error.h"
+#include <stim/gl/error.h>
  
 namespace stim
 {
-#include "cuda_runtime.h"
+#include "cuda_runtime.h"
 #include "device_launch_parameters.h"
-#include "../cuda/callable.h"
+#include <stim/cuda/cudatools/callable.h>
  
 #ifndef CUDA_THREADS_H
 #define CUDA_THREADS_H
-#ifndef STIM_CUDA_TIMER
-#define STIM_CUDA_TIMER
-
-static cudaEvent_t tStartEvent;
-static cudaEvent_t tStopEvent;
-
-namespace stim{
-
-/// These functions calculate the time between GPU functions in milliseconds
-static void gpuStartTimer()
-{
-	//set up timing events
-	cudaEventCreate(&tStartEvent);
-	cudaEventCreate(&tStopEvent);
-	cudaEventRecord(tStartEvent, 0);
-}
-
-static float gpuStopTimer()
-{
-	cudaEventRecord(tStopEvent, 0);
-	cudaEventSynchronize(tStopEvent);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);
-	cudaEventDestroy(tStartEvent);
-	cudaEventDestroy(tStopEvent);
-	return elapsedTime;
-}
-
-}	//end namespace stim
-
+#ifndef STIM_CUDA_TIMER
+#define STIM_CUDA_TIMER
+
+static cudaEvent_t tStartEvent;
+static cudaEvent_t tStopEvent;
+
+namespace stim{
+/// These functions calculate the time between GPU functions in milliseconds, using the file-scope events tStartEvent and tStopEvent.
+/// gpuStartTimer() creates both events and records the start event on stream 0; pair each call with gpuStopTimer(), which destroys them.
+static void gpuStartTimer()
+{
+	//set up timing events (the return codes of these CUDA calls are not checked)
+	cudaEventCreate(&tStartEvent);
+	cudaEventCreate(&tStopEvent);
+	cudaEventRecord(tStartEvent, 0);
+}
+/// gpuStopTimer() records the stop event on stream 0, waits for it, and returns the time elapsed since the start event was recorded.
+static float gpuStopTimer()
+{
+	cudaEventRecord(tStopEvent, 0);
+	cudaEventSynchronize(tStopEvent);	//block the host until the stop event has been reached
+	float elapsedTime;
+	cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);
+	cudaEventDestroy(tStartEvent);		//both events are re-created by the next gpuStartTimer() call
+	cudaEventDestroy(tStopEvent);
+	return elapsedTime;
+}
+
+}	//end namespace stim
+
 #endif
 \ No newline at end of file
+#ifndef STIM_CUDA_IVOTE_H
+#define STIM_CUDA_IVOTE_H
+
+#include <stim/cuda/ivote/down_sample.cuh>
+#include <stim/cuda/ivote/local_max.cuh>
+#include <stim/cuda/ivote/update_dir.cuh>
+#include <stim/cuda/ivote/vote.cuh>
+
+namespace stim{
+	namespace cuda{
+	
+	}
+}
+
+
+
+#endif
 \ No newline at end of file
@@ -3,9 +3,8 @@
  
 #include <iostream>
 #include <cuda.h>
-#include <stim/cuda/devices.h>
-#include <stim/cuda/timer.h>
-#include <stim/cuda/gaussian_blur.cuh>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/templates/gaussian_blur.cuh>
  
 namespace stim{
 	namespace cuda{
@@ -51,7 +50,7 @@ namespace stim{
 			dim3 threads(max_threads, 1);
 			dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds);
  
-			stim::cuda::gpu_gaussian_blur_2d<float>(gpuI0, sigma_ds,x ,y);
+			stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y);
  
 			//resample the image
 			down_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
@@ -89,7 +88,7 @@ namespace stim{
 			gpu_down_sample<T>(gpuI, gpuI0, resize, x, y);
  
 			//copy the image data to the GPU
-			cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice);
+			cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
  
 			cudaFree(gpuI0);
 			cudeFree(gpuI);
@@ -4,8 +4,7 @@
  
 # include <iostream>
 # include <cuda.h>
-# include <stim/cuda/devices.h>
-# include <stim/cuda/error.h>
+#include <stim/cuda/cudatools.h>
  
 namespace stim{
 	namespace cuda{
@@ -4,8 +4,7 @@
  
 # include <iostream>
 # include <cuda.h>
-# include <stim/cuda/devices.h>
-# include <stim/cuda/error.h>
+#include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
  
 namespace stim{
@@ -4,8 +4,7 @@
  
 # include <iostream>
 # include <cuda.h>
-# include <stim/cuda/devices.h>
-# include <stim/cuda/error.h>
+#include <stim/cuda/cudatools.h>
 #include <stim/cuda/sharedmem.cuh>
  
  
@@ -3,8 +3,7 @@
  
 #include <iostream>
 #include <cuda.h>
-#include <stim/cuda/devices.h>
-#include <stim/cuda/error.h>
+#include <stim/cuda/cudatools.h>
 #include <cmath>
 #include <algorithm>
  
-#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
-#define STIM_CUDA_GAUSSIAN_BLUR_H
+#ifndef STIM_CUDA_CONV2SEP_H
+#define STIM_CUDA_CONV2SEP_H
  
 #include <iostream>
 #include <cuda.h>
-#include <stim/cuda/devices.h>
-#include <stim/cuda/timer.h>
+#include <stim/cuda/cudatools/devices.h>
+#include <stim/cuda/cudatools/timer.h>
 #include <stim/cuda/sharedmem.cuh>
+#include <stim/cuda/cudatools/error.h>
  
 #define pi	3.14159
  
@@ -13,12 +14,13 @@ namespace stim{
 	namespace cuda{
  
 		template<typename T>
-		__global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
+		__global__ void conv2sep_0(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
+										   T* kernel0, unsigned int k0){
  
 			//generate a pointer to shared memory (size will be specified as a kernel parameter)
 			extern __shared__ T s[];
  
-			int kr = sigma * 4;				//calculate the kernel radius
+			int kr = k0/2;				//calculate the kernel radius
  
 			//get a pointer to the gaussian in memory
 			T* g = (T*)&s[blockDim.x + 2 * kr];
@@ -30,30 +32,25 @@ namespace stim{
 			//copy the portion of the image necessary for this block to shared memory
 			stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
  
-			//calculate the thread index and block index
+			//calculate the thread index
 			int ti = threadIdx.x;
  
 			//calculate the spatial coordinate for this thread
 			int xi = bxi + ti;
+			int yi = byi;
  
-			//pre-compute the gaussian values for each kernel point
-			T a = 1.0 / (sigma * sqrt(2 * pi));
-			T c = - 1.0 / (2*sigma*sigma);
-			int ki;
-
-			//use the first 2kr+1 threads to evaluate a gaussian and store the result
-			if(ti <= 2* kr+1){
-				ki = ti - kr;
-				g[ti] = a * exp((ki*ki) * c);
+			
+			//use the first 2kr+1 threads to transfer the kernel to shared memory
+			if(ti < k0){
+				g[ti] = kernel0[ti];
 			}
  
 			//make sure that all writing to shared memory is done before continuing
 			__syncthreads();
  
 			//if the current pixel is outside of the image
-			if(bxi + ti > x || byi > y)
+			if(xi > x || yi > y)
 				return;
-
  
  
 			//calculate the coordinates of the current thread in shared memory
@@ -66,7 +63,7 @@ namespace stim{
 			for(int ki = -kr; ki <= kr; ki++){
 				sum += g[ki + kr] * s[si + ki];
 			}
-
+			
 			//calculate the 1D image index for this thread
 			unsigned int i = byi * x + xi;
  
@@ -75,12 +72,13 @@ namespace stim{
 		}
  
 		template<typename T>
-		__global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
+		__global__ void conv2sep_1(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
+										   T* kernel0, unsigned int k0){
  
 			//generate a pointer to shared memory (size will be specified as a kernel parameter)
 			extern __shared__ T s[];
  
-			int kr = sigma * 4;				//calculate the kernel radius
+			int kr = k0/2;				//calculate the kernel radius
  
 			//get a pointer to the gaussian in memory
 			T* g = (T*)&s[blockDim.y + 2 * kr];
@@ -92,30 +90,25 @@ namespace stim{
 			//copy the portion of the image necessary for this block to shared memory
 			stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
  
-			//calculate the thread index and block index
+			//calculate the thread index
 			int ti = threadIdx.y;
  
 			//calculate the spatial coordinate for this thread
+			int xi = bxi;
 			int yi = byi + ti;
  
-			//pre-compute the gaussian values for each kernel point
-			T a = 1.0 / (sigma * sqrt(2 * pi));
-			T c = - 1.0 / (2*sigma*sigma);
-			int ki;
-
-			//use the first 2kr+1 threads to evaluate a gaussian and store the result
-			if(ti <= 2* kr+1){
-				ki = ti - kr;
-				g[ti] = a * exp((ki*ki) * c);
+			
+			//use the first 2kr+1 threads to transfer the kernel to shared memory
+			if(ti < k0){
+				g[ti] = kernel0[ti];
 			}
  
 			//make sure that all writing to shared memory is done before continuing
 			__syncthreads();
  
 			//if the current pixel is outside of the image
-			if(bxi >= x || yi >= y)
+			if(xi > x || yi > y)
 				return;
-
  
  
 			//calculate the coordinates of the current thread in shared memory
@@ -128,17 +121,55 @@ namespace stim{
 			for(int ki = -kr; ki <= kr; ki++){
 				sum += g[ki + kr] * s[si + ki];
 			}
-
+			
 			//calculate the 1D image index for this thread
-			unsigned int i = yi * x + bxi;
+			unsigned int i = yi * x + xi;
  
 			//output the result to global memory
 			out[i] = sum;
 		}
  
-		/// Applies a Gaussian blur to a 2D image stored on the GPU
 		template<typename T>
-		void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
+		/// Performs a separable 2D convolution on an image that is read through a texture object.
+		/// The first pass convolves along x with kernel0 and writes to `out`; `out` is then copied into
+		/// cuArray, and the second pass convolves along y with kernel1, again writing to `out`.
+		/// @param out is the device pointer that receives the result (x * y elements of type T)
+		/// @param x is the image size along the x-axis, in pixels
+		/// @param y is the image size along the y-axis, in pixels
+		/// @param texObj is the texture object sampled by both passes
+		/// @param cuArray is the CUDA array that receives the x-pass result (presumably the array backing texObj - confirm with the caller)
+		/// @param kernel0 is a device pointer to the k0 coefficients applied along x
+		/// @param k0 is the number of coefficients in kernel0
+		/// @param kernel1 is a device pointer to the k1 coefficients applied along y
+		/// @param k1 is the number of coefficients in kernel1
+		/// NOTE: each pass loads its coefficients with one thread per coefficient, so k0 and k1 must not
+		///		exceed the number of threads per block.
+		void tex_conv2sep(T* out, unsigned int x, unsigned int y,
+						  cudaTextureObject_t texObj, cudaArray* cuArray,
+						  T* kernel0, unsigned int k0,
+						  T* kernel1, unsigned int k1){
+
+			//get the maximum number of threads per block for the CUDA device
+			int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+
+			//calculate the number of blocks
+			dim3 blocks(x / threads.x + 1, y);
+
+			//calculate the shared memory used by each pass: the pixels processed by the block, an apron of
+			//	(k/2) pixels on each side of the window, and the k kernel coefficients.
+			//	Each pass is sized from its own kernel so that the y-pass is not under-allocated when k1 > k0.
+			size_t shared_bytes0 = ((size_t)max_threads + 2 * (size_t)(k0/2) + k0) * sizeof(T);
+			size_t shared_bytes1 = ((size_t)max_threads + 2 * (size_t)(k1/2) + k1) * sizeof(T);
+
+			//blur the image along the x-axis
+			conv2sep_0<T> <<< blocks, threads, shared_bytes0 >>>(out, texObj, x, y, kernel0, k0);
+			HANDLE_ERROR(cudaGetLastError());			//report an invalid launch configuration immediately
+
+			// Copy the x-blurred data from global memory to the texture
+			HANDLE_ERROR(cudaMemcpyToArray(cuArray, 0, 0, out, x * y * sizeof(T),
+							  cudaMemcpyDeviceToDevice));
+			
+			//transpose the block and thread dimensions
+			threads.x = 1;
+			threads.y = max_threads;
+			blocks.x = x;
+			blocks.y = y / threads.y + 1;
+			
+			//blur the image along the y-axis
+			conv2sep_1<T> <<< blocks, threads, shared_bytes1 >>>(out, texObj, x, y, kernel1, k1);
+			HANDLE_ERROR(cudaGetLastError());			//report an invalid launch configuration immediately
+
+		}
+
+		template<typename T>
+		void gpu_conv2sep(T* image, unsigned int x, unsigned int y,
+						  T* kernel0, unsigned int k0,
+						  T* kernel1, unsigned int k1){
  
 			//get the number of pixels in the image
 			unsigned int pixels = x * y;
@@ -176,36 +207,9 @@ namespace stim{
 			cudaTextureObject_t texObj = 0;
 			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
  
+			//call the texture version of the separable convolution function
+			tex_conv2sep(image, x, y, texObj, cuArray, kernel0, k0, kernel1, k1);			
  
-			//get the maximum number of threads per block for the CUDA device
-			int max_threads = stim::maxThreadsPerBlock();
-			dim3 threads(max_threads, 1);
-
-			//calculate the number of blocks
-			dim3 blocks(x / threads.x + 1, y);
-
-			//calculate the shared memory used in the kernel
-			unsigned int pixel_bytes = max_threads * 4;									//bytes devoted to pixel data being processed
-			unsigned int apron_bytes = sigma * 8 * 4;									//bytes devoted to pixels outside the window
-			unsigned int gaussian_bytes = (sigma * 8 + 1) * 4;							//bytes devoted to memory used to store the pre-computed Gaussian window
-			unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes;		//total number of bytes shared memory used
-
-			//blur the image along the x-axis
-			gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
-
-			// Copy the x-blurred data from global memory to the texture
-			cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
-							  cudaMemcpyDeviceToDevice);
-
-			//transpose the block and thread dimensions
-			threads.x = 1;
-			threads.y = max_threads;
-			blocks.x = x;
-			blocks.y = y / threads.y + 1;
-			
-			//blur the image along the y-axis
-			gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
-
 			//free allocated memory
 			cudaFree(cuArray);
  
@@ -213,22 +217,35 @@ namespace stim{
  
 		/// Applies a separable 2D convolution (kernel0 along x, kernel1 along y) to an image stored on the CPU
 		template<typename T>
-		void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
+		void cpu_conv2sep(T* image, unsigned int x, unsigned int y, 
+						  T* kernel0, unsigned int k0,
+						  T* kernel1, unsigned int k1){
  
 			//get the number of pixels in the image
 			unsigned int pixels = x * y;
 			unsigned int bytes = sizeof(T) * pixels;
  
-			//allocate space on the GPU
+			//---------Allocate Image---------
+			//allocate space on the GPU for the image
 			T* gpuI0;
-			cudaMalloc(&gpuI0, bytes);
-			
+			HANDLE_ERROR(cudaMalloc(&gpuI0, bytes));			
  
 			//copy the image data to the GPU
-			cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
+			HANDLE_ERROR(cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice));
+
+			//---------Allocate Kernel--------
+			//allocate and copy the 0 (x) kernel
+			T* gpuK0;
+			HANDLE_ERROR(cudaMalloc(&gpuK0, k0 * sizeof(T)));
+			HANDLE_ERROR(cudaMemcpy(gpuK0, kernel0, k0 * sizeof(T), cudaMemcpyHostToDevice));
+
+			//allocate and copy the 1 (y) kernel
+			T* gpuK1;
+			HANDLE_ERROR(cudaMalloc(&gpuK1, k1 * sizeof(T)));
+			HANDLE_ERROR(cudaMemcpy(gpuK1, kernel1, k1 * sizeof(T), cudaMemcpyHostToDevice));
  
 			//run the GPU-based version of the algorithm
-			gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y);
+			gpu_conv2sep<T>(gpuI0, x, y, gpuK0, k0, gpuK1, k1);
  
 			//copy the image data from the device
 			cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
+#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
+#define STIM_CUDA_GAUSSIAN_BLUR_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <stim/cuda/templates/conv2sep.cuh>		//GPU-based separable convolution algorithm
+
+#define pi	3.14159
+
+namespace stim{
+	namespace cuda{
+
+		/// Fills a host array with samples of a normalized 1D Gaussian whose peak lies at index (width/2).
+		/// @param out is the destination array; it must hold at least `width` elements
+		/// @param sigma is the standard deviation of the Gaussian, measured in samples
+		/// @param width is the number of samples to generate
+		template<typename T>
+		void gen_gaussian(T* out, T sigma, unsigned int width){
+
+			//these terms do not depend on the sample position, so they are evaluated once
+			const double scale = 1.0 / (sigma * sqrt(2 * 3.14159));		//peak height: 1 / (sigma * sqrt(2*pi))
+			const T spread = 2*sigma*sigma;								//denominator of the exponent
+			const float center = (float)(width/2);						//sample index of the peak
+
+			unsigned int i = 0;
+			while(i < width){
+				float d = (float)i - center;						//signed distance from the peak
+				float value = scale * exp( - (d*d) / spread );		//each sample is rounded to single precision
+				out[i] = value;
+				i++;
+			}
+
+		}
+
+		/// Applies a Gaussian blur to a 2D image that is read through an existing texture object.
+		/// A Gaussian kernel of (8 * sigma + 1) samples is generated on the host, uploaded to the GPU,
+		/// and used for both passes of stim::cuda::tex_conv2sep.
+		/// @param out is the device pointer that receives the blurred image
+		/// @param sigma is the standard deviation of the Gaussian, in pixels
+		/// @param x is the image size along the x-axis, in pixels
+		/// @param y is the image size along the y-axis, in pixels
+		/// @param texObj is the texture object passed through to tex_conv2sep
+		/// @param cuArray is the CUDA array passed through to tex_conv2sep
+		template<typename T>
+		void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){
+
+			//allocate space for the kernel on the host
+			//	(the buffer holds elements of type T so that it matches the number of bytes copied to the GPU)
+			unsigned int kwidth = sigma * 8 + 1;
+			size_t kbytes = (size_t)kwidth * sizeof(T);
+			T* kernel0 = (T*) malloc( kbytes );
+
+			//fill the kernel with a gaussian
+			gen_gaussian<T>(kernel0, sigma, kwidth);
+
+			//allocate device memory for the kernel and copy the kernel to the GPU
+			T* gpuKernel0;
+			HANDLE_ERROR(cudaMalloc(&gpuKernel0, kbytes));
+			HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kbytes, cudaMemcpyHostToDevice));
+
+			//the host copy is no longer needed once the (blocking) upload has finished
+			free(kernel0);
+
+			//perform the gaussian blur as a separable convolution
+			stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth);
+
+			HANDLE_ERROR(cudaFree(gpuKernel0));
+
+		}
+
+		/// Applies a Gaussian blur, in place, to a 2D image stored on the GPU.
+		/// A Gaussian kernel of (8 * sigma + 1) samples is generated on the host, uploaded to the GPU,
+		/// and used for both passes of stim::cuda::gpu_conv2sep.
+		/// @param image is the image pointer handed to gpu_conv2sep
+		/// @param sigma is the standard deviation of the Gaussian, in pixels
+		/// @param x is the image size along the x-axis, in pixels
+		/// @param y is the image size along the y-axis, in pixels
+		template<typename T>
+		void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
+
+			//allocate space for the kernel on the host
+			//	(the buffer holds elements of type T so that it matches the number of bytes copied to the GPU)
+			unsigned int kwidth = sigma * 8 + 1;
+			size_t kbytes = (size_t)kwidth * sizeof(T);
+			T* kernel0 = (T*) malloc( kbytes );
+
+			//fill the kernel with a gaussian
+			gen_gaussian<T>(kernel0, sigma, kwidth);
+
+			//copy the kernel to the GPU
+			T* gpuKernel0;
+			HANDLE_ERROR(cudaMalloc(&gpuKernel0, kbytes));
+			HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kbytes, cudaMemcpyHostToDevice));
+
+			//the host copy is no longer needed once the (blocking) upload has finished
+			free(kernel0);
+
+			//perform the gaussian blur as a separable convolution (instantiated for the image type)
+			stim::cuda::gpu_conv2sep<T>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth);
+
+			HANDLE_ERROR(cudaFree(gpuKernel0));
+
+		}
+
+		/// Applies a Gaussian blur to a 2D image stored on the CPU
+		/// A Gaussian kernel of (8 * sigma + 1) samples is generated on the host and handed to
+		/// stim::cuda::cpu_conv2sep for both the x and y passes.
+		/// @param image is the host image pointer handed to cpu_conv2sep
+		/// @param sigma is the standard deviation of the Gaussian, in pixels
+		/// @param x is the image size along the x-axis, in pixels
+		/// @param y is the image size along the y-axis, in pixels
+		template<typename T>
+		void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
+
+			//allocate space for the kernel (elements of type T, matching the image type)
+			unsigned int kwidth = sigma * 8 + 1;
+			T* kernel0 = (T*) malloc( (size_t)kwidth * sizeof(T) );
+
+			//fill the kernel with a gaussian
+			gen_gaussian<T>(kernel0, sigma, kwidth);
+
+			//perform the gaussian blur as a separable convolution (instantiated for the image type)
+			stim::cuda::cpu_conv2sep<T>(image, x, y, kernel0, kwidth, kernel0, kwidth);
+
+			//release the host kernel now that the convolution has finished with it
+			free(kernel0);
+			
+		}
+		
+	};
+};
+
+#endif
 \ No newline at end of file
@@ -3,8 +3,7 @@
  
 #include <iostream>
 #include <cuda.h>
-#include <stim/cuda/devices.h>
-#include <stim/cuda/error.h>
+#include <stim/cuda/cudatools.h>
  
 namespace stim{
 	namespace cuda{
@@ -14,7 +14,7 @@
 #include "stim/math/rect.h"
 #include "stim/math/matrix.h"
 #include "stim/cuda/cost.h"
-#include "stim/cuda/glbind.h"
+#include <stim/cuda/cudatools/glbind.h>
 #include <stim/visualization/obj.h>
 #include <vector>
  
@@ -174,7 +174,7 @@ public:
  
  
 	/// Returns the maximum pixel value in the image
-	T max(){
+	T maxv(){
 		float max = 0;
 		unsigned long N = width() * height();		//get the number of pixels
  
@@ -190,7 +190,7 @@ public:
 	}
  
 	/// Returns the minimum pixel value in the image
-	T min(){
+	T minv(){
 		float min = 0;
 		unsigned long N = width() * height();		//get the number of pixels
  
@@ -5,7 +5,7 @@
 #include <string.h>
 #include <iostream>
 #include <stim/math/vector.h>
-#include "../cuda/callable.h"
+#include <stim/cuda/cudatools/callable.h>
  
 namespace stim{
  
 #ifndef RTS_QUATERNION_H
 #define RTS_QUATERNION_H
  
-#include "../math/matrix.h"
-#include "../cuda/callable.h"
+#include <stim/math/matrix.h>
+#include <stim/cuda/cudatools/callable.h>
  
 namespace stim{
  
@@ -2,7 +2,7 @@
 #define RTS_RECT_H
  
 //enable CUDA_CALLABLE macro
-#include <stim/cuda/callable.h>
+#include <stim/cuda/cudatools/callable.h>
 #include <stim/math/vector.h>
 #include <stim/math/triangle.h>
 #include <stim/math/quaternion.h>
@@ -2,7 +2,7 @@
 #define RTS_TRIANGLE_H
  
 //enable CUDA_CALLABLE macro
-#include <stim/cuda/callable.h>
+#include <stim/cuda/cudatools/callable.h>
 #include <stim/math/vector.h>
 #include <iostream>
  
@@ -5,7 +5,8 @@
 #include <cmath>
 #include <sstream>
 #include <vector>
-#include "../cuda/callable.h"
+
+#include <stim/cuda/cudatools/callable.h>
  
 namespace stim
 {
@@ -4,12 +4,12 @@
 #include <string>
 #include <stdlib.h>
 #ifdef __CUDACC__
-#include "../cuda/error.h"
+#include <stim/cuda/cudatools/error.h>
 #endif
  
 //saving an image to a file uses the CImg library
 	//this currently throws a lot of "unreachable" warnings (as of GCC 4.8.2, nvcc 6.5.12)
-#include "../image/image.h"
+#include <stim/image/image.h>
  
  
 #define BREWER_CTRL_PTS 11