Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib

David Mayerich
2 parents e9bddc57 13fe3c84
Showing 12 changed files with 1281 additions and 2 deletions Show diff stats
stim/cuda/array_abs.cuh
stim/cuda/array_cart2polar.cuh
stim/cuda/array_multiply.cuh
stim/cuda/arraymath.cuh
stim/cuda/down_sample.cuh
stim/cuda/gaussian_blur.cuh
stim/cuda/gradient.cuh
stim/cuda/local_max.cuh
stim/cuda/sharedmem.cuh
stim/cuda/update_dir.cuh
stim/cuda/vote.cuh
stim/parser/filename.h
+#ifndef STIM_CUDA_ARRAY_ABS_H
+#define STIM_CUDA_ARRAY_ABS_H
+
+namespace stim{
+	namespace cuda{
+		template<typename T>
+		__global__ void cuda_abs(T* a, unsigned int N){
+
+			//calculate the 1D index for this thread
+			int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+			if(i < N)
+				a[i] = abs(a[i]);
+		}
+
+
+		template<typename T>
+		void gpu_abs(T* a, unsigned int N){
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks
+			int blocks = N / threads + (N%threads == 0 ? 0:1);
+
+			//call the kernel to do the multiplication
+			cuda_abs <<< blocks, threads >>>(a, N);
+
+		}
+
+
+		template<typename T>
+		void cpu_abs(T* a, unsigned int N){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = N * sizeof(T);
+
+			//allocate memory on the GPU for the array
+			T* gpuA;
+			HANDLE_ERROR( cudaMalloc(&gpuA, bytes) );
+
+			//copy the array to the GPU
+			HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_abs<T>(gpuA, N);
+
+			//copy the array back to the CPU
+			HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) );
+
+			//free allocated memory
+			cudaFree(gpuA);
+
+		}
+	
+	} //end namespace cuda
+} //end namespace stim
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_ARRAY_CART2POLAR_H
+#define STIM_CUDA_ARRAY_CART2POLAR_H
+
+namespace stim{
+	namespace cuda{
+		template<typename T>
+		__global__ void cuda_cart2polar(T* a, unsigned int N){
+
+			
+			//calculate the 1D index for this thread
+			int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+			if(i < N){
+				float x = a[i * 2 + 0];
+				float y = a[i * 2 + 1];
+				float theta = atan2( y, x ) ;
+				float r = sqrt(x * x + y * y);
+				a[i * 2 + 0] = theta;
+				a[i * 2 + 1] = r;
+			}
+		}
+
+
+		template<typename T>
+		void gpu_cart2polar(T* gpuGrad, unsigned int N){
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks
+			int blocks = N / threads + (N % threads == 0 ? 0:1);
+
+			//call the kernel to do the multiplication
+			cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N);
+
+		}
+
+
+		template<typename T>
+		void cpu_cart2polar(T* a, unsigned int N){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = N * sizeof(T) * 2;
+
+			//allocate memory on the GPU for the array
+			T* gpuA;
+			HANDLE_ERROR( cudaMalloc(&gpuA, bytes) );
+
+			//copy the array to the GPU
+			HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_cart2polar<T>(gpuA, N);
+
+			//copy the array back to the CPU
+			HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) );
+
+			//free allocated memory
+			cudaFree(gpuA);
+
+		}
+
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_ARRAY_MULTIPLY_H
+#define STIM_CUDA_ARRAY_MULTIPLY_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void cuda_multiply(T* lhs, T rhs, unsigned int N){
+
+			//calculate the 1D index for this thread
+			int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+			if(i < N)
+				lhs[i] *= rhs;
+		}
+
+		template<typename T>
+		void gpu_multiply(T* lhs, T rhs, unsigned int N){
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks
+			int blocks = N / threads + (N%threads == 0 ? 0:1);
+
+			//call the kernel to do the multiplication
+			cuda_multiply <<< blocks, threads >>>(lhs, rhs, N);
+
+		}
+
+		template<typename T>
+		void cpu_multiply(T* lhs, T rhs, unsigned int N){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = N * sizeof(T);
+
+			//allocate memory on the GPU for the array
+			T* gpuLHS;
+			HANDLE_ERROR( cudaMalloc(&gpuLHS, bytes) );
+
+			//copy the array to the GPU
+			HANDLE_ERROR( cudaMemcpy(gpuLHS, lhs, bytes, cudaMemcpyHostToDevice) );
+
+			//call the GPU version of this function
+			gpu_multiply<T>(gpuLHS, rhs, N);
+
+			//copy the array back to the CPU
+			HANDLE_ERROR( cudaMemcpy(lhs, gpuLHS, bytes, cudaMemcpyDeviceToHost) );
+
+			//free allocated memory
+			cudaFree(gpuLHS);
+		}
+		
+	}
+}
+
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_ARRAYMATH_H
+#define STIM_CUDA_ARRAYMATH_H
+
+#include <stim/cuda/array_multiply.cuh>
+#include <stim/cuda/array_abs.cuh>
+#include <stim/cuda/array_cart2polar.cuh>
+
+namespace stim{
+	namespace cuda{
+	
+	}
+}
+
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_DOWN_SAMPLE_H
+#define STIM_CUDA_DOWN_SAMPLE_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/timer.h>
+#include <stim/cuda/gaussian_blur.cuh>
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
+
+			unsigned int sigma_ds = 1/resize;
+			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
+			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
+			
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x_ds + xi;
+			
+			if(xi< x_ds && yi< y_ds){
+
+				int x_org = xi * sigma_ds;
+				int y_org = yi * sigma_ds;
+				int i_org = y_org * x + x_org;
+				gpuI[i] = gpuI0[i_org];
+			}
+
+		}
+
+
+		/// Applies a Gaussian blur to a 2D image stored on the GPU
+		template<typename T>
+		void gpu_down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
+
+			
+			unsigned int sigma_ds = 1/resize;
+			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
+			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
+			
+			//get the number of pixels in the image
+			unsigned int pixels_ds = x_ds * y_ds;
+			
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds);
+			
+			stim::cuda::gpu_gaussian_blur_2d<float>(gpuI0, sigma_ds,x ,y);
+			
+			//resample the image
+			down_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
+
+		}
+
+		/// Applies a Gaussian blur to a 2D image stored on the CPU
+		template<typename T>
+		void cpu_down_sample(T* re_img, T* image, T resize, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			unsigned int bytes = sizeof(T) * pixels;
+			
+			unsigned int sigma_ds = 1/resize;
+			unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
+			unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
+			unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
+			
+
+
+			//allocate space on the GPU for the original image
+			T* gpuI0;
+			cudaMalloc(&gpuI0, bytes);
+			
+			
+			//copy the image data to the GPU
+			cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
+
+			//allocate space on the GPU for the down sampled image
+			T* gpuI;
+			cudaMalloc(&gpuI, bytes_ds);
+
+			//run the GPU-based version of the algorithm
+			gpu_down_sample<T>(gpuI, gpuI0, resize, x, y);
+
+			//copy the image data to the GPU
+			cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice);
+
+			cudaFree(gpuI0);
+			cudeFree(gpuI);
+		}
+	
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
+#define STIM_CUDA_GAUSSIAN_BLUR_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/timer.h>
+#include <stim/cuda/sharedmem.cuh>
+
+#define pi	3.14159
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
+
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ T s[];
+
+			int kr = sigma * 4;				//calculate the kernel radius
+
+			//get a pointer to the gaussian in memory
+			T* g = (T*)&s[blockDim.x + 2 * kr];
+
+			//calculate the start point for this block
+			int bxi = blockIdx.x * blockDim.x;
+			int byi = blockIdx.y;
+
+			//copy the portion of the image necessary for this block to shared memory
+			stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
+
+			//calculate the thread index and block index
+			int ti = threadIdx.x;
+
+			//calculate the spatial coordinate for this thread
+			int xi = bxi + ti;
+
+			//pre-compute the gaussian values for each kernel point
+			T a = 1.0 / (sigma * sqrt(2 * pi));
+			T c = - 1.0 / (2*sigma*sigma);
+			int ki;
+
+			//use the first 2kr+1 threads to evaluate a gaussian and store the result
+			if(ti <= 2* kr+1){
+				ki = ti - kr;
+				g[ti] = a * exp((ki*ki) * c);
+			}
+
+			//make sure that all writing to shared memory is done before continuing
+			__syncthreads();
+			
+			//if the current pixel is outside of the image
+			if(bxi + ti > x || byi > y)
+				return;
+
+			
+
+			//calculate the coordinates of the current thread in shared memory
+			int si = ti + kr;
+
+			T sum = 0;		//running weighted sum across the kernel
+
+			
+			//for each element of the kernel
+			for(int ki = -kr; ki <= kr; ki++){
+				sum += g[ki + kr] * s[si + ki];
+			}
+
+			//calculate the 1D image index for this thread
+			unsigned int i = byi * x + xi;
+
+			//output the result to global memory
+			out[i] = sum;
+		}
+
+		template<typename T>
+		__global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
+
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ T s[];
+
+			int kr = sigma * 4;				//calculate the kernel radius
+
+			//get a pointer to the gaussian in memory
+			T* g = (T*)&s[blockDim.y + 2 * kr];
+
+			//calculate the start point for this block
+			int bxi = blockIdx.x;
+			int byi = blockIdx.y * blockDim.y;
+
+			//copy the portion of the image necessary for this block to shared memory
+			stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
+
+			//calculate the thread index and block index
+			int ti = threadIdx.y;
+
+			//calculate the spatial coordinate for this thread
+			int yi = byi + ti;
+
+			//pre-compute the gaussian values for each kernel point
+			T a = 1.0 / (sigma * sqrt(2 * pi));
+			T c = - 1.0 / (2*sigma*sigma);
+			int ki;
+
+			//use the first 2kr+1 threads to evaluate a gaussian and store the result
+			if(ti <= 2* kr+1){
+				ki = ti - kr;
+				g[ti] = a * exp((ki*ki) * c);
+			}
+
+			//make sure that all writing to shared memory is done before continuing
+			__syncthreads();
+			
+			//if the current pixel is outside of the image
+			if(bxi >= x || yi >= y)
+				return;
+
+			
+
+			//calculate the coordinates of the current thread in shared memory
+			int si = ti + kr;
+
+			T sum = 0;		//running weighted sum across the kernel
+
+			
+			//for each element of the kernel
+			for(int ki = -kr; ki <= kr; ki++){
+				sum += g[ki + kr] * s[si + ki];
+			}
+
+			//calculate the 1D image index for this thread
+			unsigned int i = yi * x + bxi;
+
+			//output the result to global memory
+			out[i] = sum;
+		}
+
+		/// Applies a Gaussian blur to a 2D image stored on the GPU
+		template<typename T>
+		void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			unsigned int bytes = sizeof(T) * pixels;
+
+			// Allocate CUDA array in device memory
+			
+			//define a channel descriptor for a single 32-bit channel
+			cudaChannelFormatDesc channelDesc =
+					   cudaCreateChannelDesc(32, 0, 0, 0,
+											 cudaChannelFormatKindFloat);
+			cudaArray* cuArray;												//declare the cuda array
+			cudaMallocArray(&cuArray, &channelDesc, x, y);			//allocate the cuda array
+
+			// Copy the image data from global memory to the array
+			cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
+							  cudaMemcpyDeviceToDevice);
+
+			// Specify texture
+			struct cudaResourceDesc resDesc;				//create a resource descriptor
+			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
+			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
+			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
+
+			// Specify texture object parameters
+			struct cudaTextureDesc texDesc;							//create a texture descriptor
+			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
+			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
+			texDesc.addressMode[1]   = cudaAddressModeWrap;
+			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
+			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
+			texDesc.normalizedCoords = 0;							//not using normalized coordinates
+
+			// Create texture object
+			cudaTextureObject_t texObj = 0;
+			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+			
+			//get the maximum number of threads per block for the CUDA device
+			int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+
+			//calculate the number of blocks
+			dim3 blocks(x / threads.x + 1, y);
+
+			//calculate the shared memory used in the kernel
+			unsigned int pixel_bytes = max_threads * 4;									//bytes devoted to pixel data being processed
+			unsigned int apron_bytes = sigma * 8 * 4;									//bytes devoted to pixels outside the window
+			unsigned int gaussian_bytes = (sigma * 8 + 1) * 4;							//bytes devoted to memory used to store the pre-computed Gaussian window
+			unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes;		//total number of bytes shared memory used
+
+			//blur the image along the x-axis
+			gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
+
+			// Copy the x-blurred data from global memory to the texture
+			cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
+							  cudaMemcpyDeviceToDevice);
+
+			//transpose the block and thread dimensions
+			threads.x = 1;
+			threads.y = max_threads;
+			blocks.x = x;
+			blocks.y = y / threads.y + 1;
+			
+			//blur the image along the y-axis
+			gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
+
+			//free allocated memory
+			cudaFree(cuArray);
+
+		}
+
+		/// Applies a Gaussian blur to a 2D image stored on the CPU
+		template<typename T>
+		void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			unsigned int bytes = sizeof(T) * pixels;
+
+			//allocate space on the GPU
+			T* gpuI0;
+			cudaMalloc(&gpuI0, bytes);
+			
+			
+			//copy the image data to the GPU
+			cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
+
+			//run the GPU-based version of the algorithm
+			gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y);
+
+			//copy the image data from the device
+			cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
+
+			//free allocated memory
+			cudaFree(gpuI0);
+		}
+		
+	}
+}
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_GRADIENT_H
+#define STIM_CUDA_GRADIENT_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){
+
+			//calculate the 1D image index for this thread
+			int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+			//convert this to 2D pixel coordinates
+			int yi = i / x;
+			int xi = i - (yi * x);
+
+			//return if the pixel is outside of the image
+			if(xi >= x || yi >= y) return;
+
+			//calculate indices for the forward difference
+			int i_xp = yi * x + (xi + 1);
+			int i_yp = (yi + 1) * x + xi;
+
+			//use forward differences if a coordinate is zero
+			if(xi == 0)
+				out[i * 2 + 0] = in[i_xp] - in[i];
+			if(yi == 0)
+				out[i * 2 + 1] = in[i_yp] - in[i];
+
+			//calculate indices for the backward difference
+			int i_xn = yi * x + (xi - 1);
+			int i_yn = (yi - 1) * x + xi;
+
+			//use backward differences if the coordinate is at the maximum edge
+			if(xi == x-1)
+				out[i * 2 + 0] = in[i] - in[i_xn];
+			if(yi == y-1)
+				out[i * 2 + 1] = in[i] - in[i_yn];
+
+			//otherwise use central differences
+			if(xi > 0 && xi < x-1)
+				out[i * 2 + 0] = (in[i_xp] - in[i_xn]) / 2;
+
+			if(yi > 0 && yi < y-1)
+				out[i * 2 + 1] = (in[i_yp] - in[i_yn]) / 2;
+
+		}
+
+		template<typename T>
+		//void gpu_gradient_2d(T* gpuOut, T* gpuIn, unsigned int x, unsigned int y){
+		void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			
+			//allocate space on the GPU for the input image
+			//T* gpuI;
+			//HANDLE_ERROR(cudaMalloc(&gpuI, bytes));
+
+			//cudaMemcpy(gpuI, gpuI0, bytes, cudaMemcpyDeviceToDevice);
+
+
+			//allocate space on the GPU for the output gradient image
+			//T* gpuGrad;
+			//cudaMalloc(&gpuGrad, bytes * 2);		//the output image will have two channels (x, y)
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks
+			int blocks = pixels / threads + (pixels%threads == 0 ? 0:1);
+
+			//call the GPU kernel to determine the gradient
+			gradient_2d<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y);
+
+		}
+
+		template<typename T>
+		void cpu_gradient_2d(T* out, T* in, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			unsigned int bytes = pixels * sizeof(T);
+
+			//allocate space on the GPU for the input image
+			T* gpuIn;
+			HANDLE_ERROR(cudaMalloc(&gpuIn, bytes));
+
+			//copy the image data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuIn, in, bytes, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the output gradient image
+			T* gpuOut;
+			cudaMalloc(&gpuOut, bytes * 2);		//the output image will have two channels (x, y)
+
+			//call the GPU version of this function
+			gpu_gradient_2d(gpuOut, gpuIn, x, y);	
+
+			//copy the results to the CPU
+			cudaMemcpy(out, gpuOut, bytes * 2, cudaMemcpyDeviceToHost);
+
+			//free allocated memory
+			cudaFree(gpuOut);
+		}
+
+	}
+}
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_LOCAL_MAX_H
+#define STIM_CUDA_LOCAL_MAX_H
+
+
+# include <iostream>
+# include <cuda.h>
+# include <stim/cuda/devices.h>
+# include <stim/cuda/error.h>
+
+namespace stim{
+	namespace cuda{
+
+
+		// this kernel calculates the local maximum for finding the cell centers
+		template<typename T>
+		__global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+
+
+			//calculate the lowest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
+			int xl = xi - conn;
+			int yl = yi - conn;
+			
+			// use zero for the lowest limits if the xi or yi is less than conn.
+			if (xi <= conn)
+				xl = 0;
+			if (yi <= conn)
+				yl = 0;
+
+			//calculate the highest limit of the neighbors for this pixel. the size of neighbors are defined by 'conn'.
+			int xh = xi + conn;
+			int yh = yi + conn;
+
+			// use the image width or image height for the highest limits if the distance of xi or yi to the edges of image is less than conn.
+			if (xi >= x - conn)
+				xh = x;
+			if (yi>= y - conn)
+				yh = y;
+
+			// calculate the limits for finding the local maximum location in the connected neighbors for the current pixel
+			int n_l = yl * x + xl;
+			int n_h = yh * x + xh;
+			
+			//initial the centers image to zero
+			gpuCenters[i] = 0;
+
+
+			int n = n_l;
+
+			float l_value = 0;
+
+			if (i < x * y)
+
+				// check if the vote value for this pixel is greater than threshold, so this pixel may be a local max.
+				if (gpuVote[i]>final_t){
+
+					// compare the vote value for this pixel with the vote values with its neighbors.
+					while (n<=n_h){
+
+						// check if this vote value is a local max in its neighborhood.
+							if (gpuVote[i] < gpuVote[n]){
+								l_value  = 0;
+								n =n_h+1;
+							}
+							else if (n == n_h){
+								l_value = 1;
+								n = n+1;
+							}
+							// check if the current neighbor is the last one at the current row
+							else if ((n - n_l - 2*conn)% x ==0){
+								n = n + x - 2*conn -1;
+								n ++;						
+							}
+							else
+								n ++;
+				}
+					// set the center value for this pixel to high if it's a local max ,and to low if not.
+					gpuCenters[i] = l_value ;
+				}
+
+		}
+
+
+
+		template<typename T>
+		void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
+
+			
+			
+			
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			
+			
+			
+			//call the kernel to find the local maximum.
+			cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
+
+
+		}
+
+
+
+		template<typename T>
+		void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
+		
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			// allocate space on the GPU for the detected cell centes
+			T* gpuCenters;
+			cudaMalloc(&gpuCenters, bytes);		
+
+
+			//allocate space on the GPU for the input Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);		
+
+			
+			//copy the Vote image data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
+
+			
+			//call the GPU version of the local max function
+			gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y);
+
+				
+			//copy the cell centers data to the CPU
+			cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ;
+
+			
+			//free allocated memory
+			cudaFree(gpuCenters);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+
+
+#endif
 \ No newline at end of file
+
+#ifndef STIM_CUDA_SHAREDMEM_H
+#define STIM_CUDA_SHAREDMEM_H
+
+namespace stim{
+	namespace cuda{
+
+		// Copies values from global memory to shared memory, optimizing threads
+		template<typename T>
+		__device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
+										 unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
+										 dim3 threadIdx, dim3 blockDim){
+
+			//calculate the number of iterations required for the copy
+			unsigned int xI, yI;
+			xI = X/blockDim.x + 1;				//number of iterations along X
+			yI = Y/blockDim.y + 1;				//number of iterations along Y
+
+			//for each iteration
+			for(unsigned int xi = 0; xi < xI; xi++){
+				for(unsigned int yi = 0; yi < yI; yi++){
+
+					//calculate the index into shared memory
+					unsigned int sx = xi * blockDim.x + threadIdx.x;
+					unsigned int sy = yi * blockDim.y + threadIdx.y;
+
+					//calculate the index into the texture
+					unsigned int tx = x + sx;
+					unsigned int ty = y + sy;
+
+					//perform the copy
+					if(sx < X && sy < Y)
+						dest[sy * X + sx] = tex2D<T>(src, tx, ty);
+				}
+			}
+		}
+		
+	}
+}
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_UPDATE_DIR_H
+#define STIM_CUDA_UPDATE_DIR_H
+
+
+# include <iostream>
+# include <cuda.h>
+# include <stim/cuda/devices.h>
+# include <stim/cuda/error.h>
+#include <stim/cuda/sharedmem.cuh>
+
+namespace stim{
+	namespace cuda{
+	
+		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
+		template<typename T>
+		__global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ float s_vote[];
+
+			//calculate the start point for this block
+			int bxi = blockIdx.x * blockDim.x;
+			
+			//calculate the width of the shared memory block
+			int swidth = 2 * rmax + blockDim.x;
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			int yi = blockIdx.y;
+
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			// calculate the voting direction based on the grtadient direction
+			float theta = gpuGrad[2*i];
+
+			//initialize the vote direction to zero
+			gpuDir[i] = 0;
+
+			// define a local variable to maximum value of the vote image in the voting area for this voter
+			float max = 0;
+
+			// define two local variables for the x and y coordinations where the maximum happened
+			int id_x = 0;
+			int id_y = 0;
+
+			// compute the size of window which will be checked for finding the voting area for this voter
+			unsigned int x_table = 2*rmax +1;
+			unsigned int rmax_sq = rmax * rmax;
+			int r = (int)rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			int bxs = bxi - rmax;
+			
+			for(int yr = -r; yr <= r; yr++){
+
+				//copy the portion of the image necessary for this block to shared memory
+				__syncthreads();
+				stim::cuda::sharedMemcpy_tex2D<float>(s_vote, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
+				__syncthreads();
+				
+				//if the current thread is outside of the image, it doesn't have to be computed
+				if(xi < x && yi < y){
+
+					for(int xr = -r; xr <= r; xr++){
+
+						unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
+
+						// calculate the angle between the voter and the current pixel in x and y directions
+						float atan_angle = gpuTable[ind_t];
+
+
+						// calculate the voting direction based on the grtadient direction
+						int idx_share_update = xr + tx_rmax ;
+						float share_vote = s_vote[idx_share_update];
+						
+						// check if the current pixel is located in the voting area of this voter.
+						if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+
+						// compare the vote value of this pixel with the max value to find the maxima and its index.
+							if  (share_vote>max) {
+
+								max = share_vote;
+								id_x =  xr;
+								id_y =  yr;
+							}
+						}
+					}
+				}
+			}
+			
+				
+		//float new_angle = atan2(dy, dx);
+		unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
+
+		float new_angle = gpuTable[ind_m];
+
+		gpuDir[i] = new_angle;
+
+		}
+
+		// this kernel updates the gradient direction by the calculated voting direction.
+		template<typename T>
+		__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){
+
+			//************ when the number of threads are (1024,1) *************
+			
+			// calculate the 2D coordinates for this current thread.
+			int xi = blockIdx.x * blockDim.x + threadIdx.x;
+			int yi = blockIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+
+			//update the gradient image with the vote direction
+			gpuGrad[2*i] = gpuDir[i];
+		}
+
+
+		template<typename T>
+		void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			unsigned int bytes = sizeof(T) * pixels;
+
+			
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			
+			// Allocate CUDA array in device memory
+			
+			//define a channel descriptor for a single 32-bit channel
+			cudaChannelFormatDesc channelDesc =
+					   cudaCreateChannelDesc(32, 0, 0, 0,
+											 cudaChannelFormatKindFloat);
+			cudaArray* cuArray;												//declare the cuda array
+			cudaMallocArray(&cuArray, &channelDesc, x, y);			//allocate the cuda array
+
+			// Copy the image data from global memory to the array
+			cudaMemcpyToArray(cuArray, 0, 0, gpuVote, bytes,
+							  cudaMemcpyDeviceToDevice);
+
+			// Specify texture
+			struct cudaResourceDesc resDesc;				//create a resource descriptor
+			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
+			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
+			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
+
+			// Specify texture object parameters
+			struct cudaTextureDesc texDesc;							//create a texture descriptor
+			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
+			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
+			texDesc.addressMode[1]   = cudaAddressModeWrap;
+			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
+			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
+			texDesc.normalizedCoords = 0;							//not using normalized coordinates
+
+			// Create texture object
+			cudaTextureObject_t texObj = 0;
+			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+			// specify  share memory
+			unsigned int share_bytes = (2*rmax + threads.x)*(1)*4;
+			
+			// allocate space on the GPU for the updated vote direction
+			T* gpuDir;
+			cudaMalloc(&gpuDir, bytes);	
+
+			//call the kernel to calculate the new voting direction
+			cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, texObj, gpuGrad, gpuTable, phi, rmax, x , y);
+
+			//call the kernel to update the gradient direction
+			cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
+
+
+			//free allocated memory
+			cudaFree(gpuDir);
+
+		}
+
+
+		template<typename T>
+		void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);
+
+			//copy the input vote image to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));	
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			
+			//call the GPU version of the update direction function
+			gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+				
+			//copy the new gradient image back to the CPU
+			cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+
+
+#endif
 \ No newline at end of file
+#ifndef STIM_CUDA_VOTE_H
+#define STIM_CUDA_VOTE_H
+
+
+# include <iostream>
+# include <cuda.h>
+# include <stim/cuda/devices.h>
+# include <stim/cuda/error.h>
+#include <stim/cuda/sharedmem.cuh>
+
+
+namespace stim{
+	namespace cuda{
+
+		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
+		template<typename T>
+		__global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//generate a pointer to shared memory (size will be specified as a kernel parameter)
+			extern __shared__ float2 s_grad[];
+
+			//calculate the start point for this block
+			int bxi = blockIdx.x * blockDim.x;
+			
+			//calculate the width of the shared memory block
+			int swidth = 2 * rmax + blockDim.x;
+
+			// calculate the 2D coordinates for this current thread.
+			int xi = bxi + threadIdx.x;
+			int yi = blockIdx.y;
+			// convert 2D coordinates to 1D
+			int i = yi * x + xi;
+
+			
+			// define a local variable to sum the votes from the voters
+			float sum = 0;
+
+			// compute the size of window which will be checked for finding the proper voters for this pixel
+			unsigned int x_table = 2*rmax +1;
+
+			unsigned int rmax_sq = rmax * rmax;
+			int r = (int)rmax;
+			int tx_rmax = threadIdx.x + rmax;
+			int bxs = bxi - rmax;
+			
+			
+				for(int yr = -r; yr <= r; yr++){
+
+				//copy the portion of the image necessary for this block to shared memory
+				__syncthreads();
+				stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim);
+				__syncthreads();
+				
+				//if the current thread is outside of the image, it doesn't have to be computed
+				if(xi < x && yi < y){
+					
+					for(int xr = -r; xr <= r; xr++){
+					
+							//find the location of this voter in the atan2 table
+							unsigned int id_t = (yr + rmax) * x_table + xr + rmax;
+
+							// calculate the angle between the pixel and the current voter in x and y directions
+							float atan_angle = gpuTable[id_t];
+							
+					
+							// calculate the voting direction based on the grtadient direction
+							int idx_share = xr + tx_rmax ;
+							float2 g = s_grad[idx_share];
+							float theta = g.x;
+
+							// check if the current voter is located in the voting area of this pixel.
+							if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
+								sum += g.y;		
+
+							}
+				
+					}
+				}
+			}
+
+			gpuVote[i] = sum;
+		}
+
+		template<typename T>
+		void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			unsigned int bytes = sizeof(T) * pixels;
+
+			
+			unsigned int max_threads = stim::maxThreadsPerBlock();
+			//unsigned int thread_dim = sqrt(max_threads);
+			dim3 threads(max_threads, 1);
+			dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
+			
+			// Allocate CUDA array in device memory
+			
+			//define a channel descriptor for a single 32-bit channel
+			cudaChannelFormatDesc channelDesc =
+					   cudaCreateChannelDesc(32, 32, 0, 0,
+											 cudaChannelFormatKindFloat);
+			cudaArray* cuArray;												//declare the cuda array
+			cudaMallocArray(&cuArray, &channelDesc, x, y);			//allocate the cuda array
+
+			// Copy the image data from global memory to the array
+			cudaMemcpyToArray(cuArray, 0, 0, gpuGrad, bytes*2,
+							  cudaMemcpyDeviceToDevice);
+
+			// Specify texture
+			struct cudaResourceDesc resDesc;				//create a resource descriptor
+			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
+			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
+			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
+
+			// Specify texture object parameters
+			struct cudaTextureDesc texDesc;							//create a texture descriptor
+			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
+			texDesc.addressMode[0]   = cudaAddressModeWrap;			//use wrapping (around the edges)
+			texDesc.addressMode[1]   = cudaAddressModeWrap;
+			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
+			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
+			texDesc.normalizedCoords = 0;							//not using normalized coordinates
+
+			// Create texture object
+			cudaTextureObject_t texObj = 0;
+			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+			// specify  share memory
+			unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4;
+			
+
+			//call the kernel to do the voting
+			
+			cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y);
+
+		}
+
+
+		template<typename T>
+		void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
+
+			//calculate the number of bytes in the array
+			unsigned int bytes = x * y * sizeof(T);
+
+			//calculate the number of bytes in the atan2 table
+			unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
+
+			//allocate space on the GPU for the Vote Image
+			T* gpuVote;
+			cudaMalloc(&gpuVote, bytes);		
+
+			//allocate space on the GPU for the input Gradient image
+			T* gpuGrad;
+			HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
+
+			//copy the Gradient Magnitude data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the atan2 table
+			T* gpuTable;
+			HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
+
+			//copy the atan2 values to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
+
+			//cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table );
+
+			
+			//call the GPU version of the vote calculation function
+			gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
+
+				
+			//copy the Vote Data back to the CPU
+			cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
+
+			//free allocated memory
+			cudaFree(gpuTable);
+			cudaFree(gpuVote);
+			cudaFree(gpuGrad);
+		}
+		
+	}
+}
+
+
+
+#endif
 \ No newline at end of file
@@ -19,8 +19,9 @@
 #include <algorithm>
  
 #include "../parser/parser.h"
-
+#ifdef BOOST_PRECOMPILED
 #include <boost/filesystem.hpp>
+#endif
  
 namespace stim{
  
@@ -154,6 +155,7 @@ public:
  
 		return ss.str();
 	}
+#ifdef BOOST_PRECOMPILED
  
 	//get a list of files matching the current template
 	std::vector<stim::filename> get_list(){
@@ -193,7 +195,7 @@ public:
  
 		return file_list;
 	}
-
+#endif
 	//gets the current working directory
 	static stim::filename cwd(){