Merged the changes from Master_Clone_W_Branch into master

Pavel Govyadinov
2 parents eb5dfb2b 27194b56
Showing 11 changed files with 717 additions and 774 deletions Show diff stats
stim/cuda/branch_detection.cuh
stim/cuda/branch_detection2.cuh
stim/cuda/cuda_texture.cuh
stim/cuda/filter.cuh
stim/cuda/filter.h
stim/cuda/sharedmem.cuh
stim/cuda/testKernel.cuh
stim/gl/gl_spider.h
stim/math/circle.h
stim/math/plane.h
stim/visualization/cylinder.h
@@ -5,167 +5,48 @@
 //#include <math.h>
 #include <stim/visualization/colormap.h>
 #include <stim/cuda/cuda_texture.cuh>
-#include <stim/cuda/templates/gradient.cuh>
-#include <stim/cuda/templates/gaussian_blur.cuh>
-#include <stim/cuda/arraymath.cuh>
-#include <stim/cuda/ivote.cuh>
-#include <stim/cuda/testKernel.cuh>
+#include <stim/cuda/filter.cuh>
 typedef unsigned int uint;
 typedef unsigned int uchar;
-stim::cuda::cuda_texture t;	
-float*		gpuTable;
-float*		gpuGrad;
-float*		gpuVote;	
-float*		gpuI;
-float*		gpuCenters;
-
-void atan_2d(float* cpuTable, unsigned int rmax)
-{
-	//initialize the width and height of the window which atan2 are computed in.
-	int xsize = 2*rmax +1;
-	int ysize = 2*rmax +1;
-	
-	// assign the center coordinates of the atan2 window to yi and xi
-	int yi = rmax;
-	int xi = rmax;
-	
-
-	for (int xt = 0; xt < xsize; xt++){
-
-		for(int yt = 0; yt < ysize; yt++){
-
-			//convert the current 2D coordinates to 1D
-			int id = yt * xsize + xt;
-			// calculate the distance between the pixel and the center of the atan2 window
-			float xd = xi - xt;
-			float yd = yi - yt;
-
-			// calculate the angle between the pixel and the center of the atan2 window and store the result.
-			float atan_2d_vote = atan2(yd, xd);
-			cpuTable[id] = atan_2d_vote;
-		}
-	}
-
-}
-
-void initCuda(unsigned int bytes_table, unsigned int bytes_ds)
-{
-	HANDLE_ERROR(
-		cudaMalloc((void**) &gpuTable, bytes_table)
-		);
-	HANDLE_ERROR(
-		cudaMalloc((void**) &gpuI, bytes_ds)
-		);
-	HANDLE_ERROR(
-		cudaMalloc((void**) &gpuGrad,  bytes_ds*2)
-		);
-	HANDLE_ERROR(
-		cudaMalloc((void**) &gpuVote,  bytes_ds)
-		);
-	HANDLE_ERROR(
-		cudaMalloc((void**) &gpuCenters, bytes_ds)
-		);
-}
-
-void cleanCuda()
-{
-	HANDLE_ERROR(
-		cudaFree(gpuTable)
-	);
-	HANDLE_ERROR(
-		cudaFree(gpuGrad)
-	);
-	HANDLE_ERROR(
-		cudaFree(gpuVote)
-	);
-	HANDLE_ERROR(
-		cudaFree(gpuCenters)
-	);
-	HANDLE_ERROR(
-		cudaFree(gpuI)
-	);
-}
 std::vector< stim::vec<float> >
 find_branch(GLint texbufferID, GLenum texType, unsigned int x, unsigned int y)
 {
-	float 		phi	 	= 15.1*M_PI/180;
-	int		iter		= 5;
-	float 		dphi		= phi/iter;
-	float 		rmax 		= 10;
-	float		sigma		= 4;
-	unsigned int 	pixels 		= x * y;
-	unsigned int 	bytes  		= sizeof(float) * pixels;
-	unsigned int 	bytes_table	= sizeof(float) * (2*rmax + 1) * (2*rmax + 1);
-	unsigned int 	x_ds		= (x + (x % 1 == 0 ? 0:1));
-	unsigned int 	y_ds		= (y + (x % 1 == 0 ? 0:1));
-	unsigned int	bytes_ds	= sizeof(float) * x_ds * y_ds;
-	unsigned int	conn		= 5;
-	float		final_t		= 200.0;
-	float*		cpuTable	= (float*) malloc(bytes_table);
-	float*		cpuCenters	= (float*) malloc(bytes_ds);
+	float		sigma		= 2.0;
+	unsigned int	conn		= 7;
+	float		threshold	= 40.0;
+	float*		cpuCenters	= (float*) malloc(x*y*sizeof(float));
+	int		sizek		= 7;
 	stringstream name;
-
-
-	std::vector<stim::vec<float> >  output;
-	initCuda(bytes_table, bytes_ds); 
-
-	atan_2d(cpuTable, rmax);
-	cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice);
-
-
-	t.MapCudaTexture(texbufferID, texType);
-	cudaDeviceSynchronize();
-	stim::cuda::tex_gaussian_blur2<float>(
-		gpuI, sigma, x, y, t.getTexture(), t.getArray()
-		);
+	cpuCenters = stim::cuda::get_centers(texbufferID, texType, x, y, sizek, sigma, conn, threshold);
 	cudaDeviceSynchronize();
-	stim::cuda::gpu_gradient_2d<float>(
-		gpuGrad, gpuI, x, y
-		);
-	cudaDeviceSynchronize();
-	
-	stim::cuda::gpu_cart2polar<float>(gpuGrad, x, y);
-	cudaDeviceSynchronize();
-	cudaDeviceSynchronize();
-	for (int i = 0; i < iter; i++)
-	{
-		stim::cuda::gpu_vote<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);
-	cudaDeviceSynchronize();
-		stim::cuda::gpu_update_dir<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);
-	cudaDeviceSynchronize();
-		phi = phi - dphi;
-	}
+	std::vector<stim::vec<float> >  output;
 	cudaDeviceSynchronize();
-	stim::cuda::gpu_local_max<float>(gpuCenters, gpuVote, final_t, conn, x, y);
-	cudaMemcpy(cpuCenters, gpuCenters, bytes_ds, cudaMemcpyDeviceToHost);
-	for(int i = 0; i < pixels; i++)
+
+	for(int i = 0; i < x; i++)
 	{
-		int ix = (i % x);
-		int iy = (i / x);
-		if((cpuCenters[i] == 1) && (ix > 4) && (ix < x-4))
+		for(int j = 0; j < y; j++)
 		{
-
-			float x_v = (float) ix;
-			float y_v = (float) iy;
-			output.push_back(stim::vec<float>((x_v/(float)x),
-							  (y_v/(float)y), 0.0));	
-
-		}
+			int idx = x*j+i;
+			if(cpuCenters[idx] != 0)
+			{
+				float x_v = (float) i;
+				float y_v = (float) j;
+				output.push_back(stim::vec<float>((x_v/(float)x*360.0),
+								  (y_v), y_v/8));	
+			}
+
+		} 
 	}
-
-
-	t.UnmapCudaTexture();
-	cleanCuda();
-	free(cpuTable);
+	
 	free(cpuCenters);
 	return output;
 }
-#include <stim/cuda/templates/gaussian_blur.cuh>
-#include <stim/cuda/templates/gradient.cuh>
-#include <stim/cuda/arraymath.cuh>
-#include <stim/cuda/ivote.cuh>
-
-
-
-
-
-
-
-
-
-
-void atan_2(float* cpuTable, unsigned int rmax){
-
-	//initialize the width and height of the window which atan2 are computed in.
-	int xsize = 2*rmax +1;
-	int ysize = 2*rmax +1;
-	
-	// assign the center coordinates of the atan2 window to yi and xi
-	int yi = rmax;
-	int xi = rmax;
-	
-
-	for (int xt = 0; xt < xsize; xt++){
-
-		for(int yt = 0; yt < ysize; yt++){
-
-			//convert the current 2D coordinates to 1D
-			int id = yt * xsize + xt;
-			// calculate the distance between the pixel and the center of the atan2 window
-			float xd = xi - xt;
-			float yd = yi - yt;
-
-			// calculate the angle between the pixel and the center of the atan2 window and store the result.
-			float atan_2d_vote = atan2(yd, xd);
-			cpuTable[id] = atan_2d_vote;
-		}
-	}
-
-}
-std::vector<stim::vec<float> > 
-find_branch(GLint texbufferID, GLenum texType, unsigned int x, unsigned int y)
-{
-
-	float* cpuTable		= (float
-
-	unsigned int pixels = x * y;
-	unsigned int bytes = sizeof(float) * pixels;
-
-	//calculate the number of bytes in the atan2 table
-
-	unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(float);
-
-
-
-	//allocate space on the GPU for the atan2 table
-
-	float* gpuTable;
-
-	cudaMalloc(&gpuTable, bytes_table);
-
-
-
-	cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice);
-
-	unsigned int sigma_ds = 1/resize;
-	unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
-	unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
-	unsigned int bytes_ds = sizeof(float) * x_ds * y_ds;
-	
-
-	float* gpuI;
-	cudaMalloc(&gpuI, bytes_ds);
-
-	
-	float* gpuGrad;
-	cudaMalloc(&gpuGrad, bytes_ds*2);
-
-	float* gpuVote;
-	cudaMalloc(&gpuVote, bytes_ds);
-
-	// allocate space on the GPU for the detected cell centes
-
-	float* gpuCenters;
-
-	cudaMalloc(&gpuCenters, bytes_ds);		
-
-
-	stim::cuda::gpu_down_sample<float>(gpuI, gpuI0, resize, x , y);
-	cudaMemcpy(cpuResize, gpuI, bytes_ds, cudaMemcpyDeviceToHost);
-
-x = x_ds;
-	y = y_ds;
-	t = t * resize;
-	//sigma = sigma * resize;
-
-	cudaDeviceSynchronize();
-	stim::cuda::gpu_gaussian_blur2<float>(gpuI,sigma, x, y);
-	cudaDeviceSynchronize();
-	cudaMemcpy(cpuBlur, gpuI, bytes_ds, cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
-	
-	stim::cuda::gpu_gradient_2d<float>(gpuGrad, gpuI, x, y);
-	cudaDeviceSynchronize();
-	cudaMemcpy(cpuGradient, gpuGrad, bytes_ds*2, cudaMemcpyDeviceToHost);
-
-	stim::cuda::gpu_cart2polar<float>(gpuGrad, x, y);
-	cudaDeviceSynchronize();
-	cudaMemcpy(cpuCart2Polar, gpuGrad, bytes_ds*2, cudaMemcpyDeviceToHost);
-	
-
-	//multiply the gradient by a constant and calculate the absolute value (to save an image)	
-
-	stim::cuda::cpu_multiply<float>(cpuCart2Polar, 40, x * y * 2);
-
-	cudaDeviceSynchronize();
-
-	stim::cuda::cpu_abs<float>(cpuCart2Polar, x * y * 2);
-
-	cudaDeviceSynchronize();
-
-		
-	for (int i =0; i<iter; i++){
-		
-		stim::cuda::gpu_vote<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);
-		cudaDeviceSynchronize();
-		stim::cuda::gpu_update_dir<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);
-		cudaDeviceSynchronize();
-		switch (i){
-		case 0 : cudaMemcpy(cpuVote1, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
-			break;
-		case 1 : cudaMemcpy(cpuVote2, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
-			break;
-		case 2 : cudaMemcpy(cpuVote3, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
-			break;
-		case 3 : cudaMemcpy(cpuVote4, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
-			break;
-		case 4 : cudaMemcpy(cpuVote5, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
-			break;
-		default : cudaMemcpy(cpuVote5, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
-			break;
-		}
-		phi = phi - dphi;
-	}
-	
-	stim::cuda::gpu_local_max<float>(gpuCenters, gpuVote, t, conn, x, y);
-	cudaMemcpy(cpuCenters, gpuCenters, bytes_ds, cudaMemcpyDeviceToHost);
-	
-}
@@ -41,6 +41,46 @@ namespace stim
 				texDesc.normalizedCoords	= 0;
 			}
+
+			///Enables or disables normalized texture coordinates.
+			///@param val, true (1) switches normalized coordinates on, false (0) off.
+			void
+			SetTextureCoordinates(bool val)
+			{
+				texDesc.normalizedCoords	= val ? 1 : 0;
+			}
+
+			///Sets the addressing mode used at the borders of the texture for dimension dim.
+			///A dim outside 0-2 or an unknown mode leaves the descriptor unchanged.
+			///@param dim : 0-x, 1-y, 2-z
+			///@param mode: cudaAddressModeWrap = 0,
+			///		cudaAddressModeClamp = 1,
+			///		cudaAddressModeMirror = 2,
+			///		cudaAddressModeBorder = 3,
+			void
+			SetAddressMode(int dim, int mode)
+			{
+				//only the dimensions 0, 1 and 2 are valid (see @param dim); ignore
+				//anything else instead of writing outside of the descriptor.
+				if(dim < 0 || dim > 2)
+					return;
+
+				switch(mode)
+				{
+					case 0:
+						texDesc.addressMode[dim] = cudaAddressModeWrap;
+						break;
+					case 1:
+						texDesc.addressMode[dim] = cudaAddressModeClamp;
+						break;
+					case 2:
+						texDesc.addressMode[dim] = cudaAddressModeMirror;
+						break;
+					case 3:
+						texDesc.addressMode[dim] = cudaAddressModeBorder;
+						break;
+					default:
+						break;
+				}
+			}
+
 //-------------------------------------------------------------------------//
 //-------------------------------CUDA_MAPPING------------------------------//
 //-------------------------------------------------------------------------//
+#ifndef STIM_FILTER_H
+#define STIM_FILTER_H
+
+#include <assert.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stim/visualization/colormap.h>
+#include <sstream>
+#include <stim/cuda/cudatools/devices.h>
+#include <stim/cuda/cudatools/threads.h>
+#include <stim/cuda/cuda_texture.cuh>
+#include <stim/cuda/ivote.cuh>
+#include <stim/cuda/arraymath.cuh>
+
+#define IMAD(a,b,c) ( __mul24((a), (b)) + (c) )
+#define M_PI 3.141592654f
+
+
+namespace stim
+{
+	namespace cuda
+	{
+
+	float* gpuLoG;
+	float* LoG;
+	float* res;
+	float* centers;
+	stim::cuda::cuda_texture tx;
+
+
+
+	///Allocates the buffers used by the filter: the host copy of the LoG kernel
+	///(LoG), its device copy (gpuLoG) and the two device images (res, centers).
+	///@param DIM_X, DIM_Y, dimensions of the res and centers images in elements.
+	///@param kl, side length of the square LoG kernel.
+	void initArray(int DIM_X, int DIM_Y, int kl)
+	{
+		//host side kernel.
+		LoG =  (float*) malloc(kl*kl*sizeof(float));
+
+		//device side kernel.
+		HANDLE_ERROR( cudaMalloc( (void**) &gpuLoG, kl*kl*sizeof(float)) );
+
+		//device side filter response and detected centers.
+		HANDLE_ERROR( cudaMalloc( (void**) &res, DIM_Y*DIM_X*sizeof(float)) );
+		HANDLE_ERROR( cudaMalloc( (void**) &centers, DIM_Y*DIM_X*sizeof(float)) );
+	}
+
+	///Frees the buffers allocated by initArray and resets their handles, so that
+	///calling cleanUp again does not free the same memory twice.
+	///@param src, not used by this function; defaulted so it can be omitted.
+	void cleanUp(cudaGraphicsResource_t src = NULL)
+	{
+		(void) src;		//kept only for interface compatibility.
+
+		HANDLE_ERROR(
+			cudaFree(gpuLoG)
+		);
+		gpuLoG = NULL;
+		HANDLE_ERROR(
+			cudaFree(res)
+		);
+		res = NULL;
+		HANDLE_ERROR(
+			cudaFree(centers)
+		);
+		centers = NULL;
+		free(LoG);
+		LoG = NULL;
+	}
+
+	///Fills LoG with a kl x kl Laplacian of Gaussian kernel and scales it so that
+	///its entries sum to 1.
+	///@param kl, side length of the kernel (number of samples per dimension).
+	///@param sigma, standard deviation of the Gaussian.
+	///@param LoG, output array with room for kl*kl floats; sample (i, j) is
+	///	       stored at index j*kl+i.
+	void
+	filterKernel (float kl, float sigma, float *LoG)
+	{
+		float t = 0.0;
+		//coordinate of the central sample: x and y must be 0 in the middle of the
+		//kernel, because applyFilter pairs entry kl/2 with the filtered pixel.
+		//(the previous i - kl/2 - 0.5 placed the origin one sample off-center.)
+		float kc = (kl - 1)/2;
+		float x, y;
+		int idx;
+		for(int i = 0; i < kl; i++){
+			for(int j = 0; j < kl; j++){
+				idx = j*kl+i;
+				x = i - kc;
+				y = j - kc;
+				LoG[idx] = (-1.0/M_PI/powf(sigma, 4))* (1 - (powf(x,2)+powf(y,2))/2.0/powf(sigma, 2))
+						*expf(-(powf(x,2)+powf(y,2))/2/powf(sigma,2));	
+				t +=LoG[idx];
+			}
+		}
+		
+		//normalize by the sum of all samples.
+		for(int i = 0; i < kl*kl; i++)
+		{
+			LoG[i] = LoG[i]/t;
+		}
+		
+	}
+
+	///Applies the LoG kernel to the inverted texture (255 - texel), one output
+	///pixel per thread block.
+	///Launch layout: grid (DIM_X, DIM_Y), block (kl, kl). Every thread computes one
+	///weighted sample, the block sums the samples in shared memory and thread
+	///(0, 0) writes the sum to res[y*DIM_X+x].
+	///Preconditions: kl <= 7 (size of the static shared tile); the texture is
+	///sampled at pixel/DIM, i.e. with normalized coordinates.
+	///@param texIn, texture object to filter, read as unsigned char.
+	///@param DIM_X, DIM_Y, dimensions of the output image.
+	///@param kr, offset of the central kernel sample (kl/2).
+	///@param kl, side length of the kernel.
+	///@param res, device output image with DIM_X*DIM_Y floats.
+	///@param gpuLoG, device copy of the kl*kl kernel, sample (i, j) at j*kl+i.
+	__global__
+	void
+	applyFilter(cudaTextureObject_t texIn, unsigned int DIM_X, unsigned int DIM_Y, int kr, int kl, float *res, float* gpuLoG){
+		
+		__shared__ float shared[7][7];
+		int x = blockIdx.x;
+		int y = blockIdx.y;
+		int xi = threadIdx.x;
+		int yi = threadIdx.y;
+		float tu = (x-kr+xi)/(float)DIM_X;
+		float tv = (y-kr+yi)/(float)DIM_Y;
+		shared[xi][yi] = gpuLoG[yi*kl+xi]*(255.0-(float)tex2D<unsigned char>(texIn, tu, tv));
+		__syncthreads();
+
+		int idx = y*DIM_X+x;
+
+		//Sum along x. The active range n is halved rounding up, so no element is
+		//dropped when the block size is not a power of two (e.g. 7).
+		//The loop bounds are uniform over the block, so every thread reaches
+		//each __syncthreads().
+		for(unsigned int n = blockDim.x; n > 1; )
+		{
+			unsigned int mid = (n + 1)/2;
+			if(xi + mid < n)
+			{
+				shared[xi][yi] += shared[xi + mid][yi];
+			}
+			__syncthreads();
+			n = mid;
+		}
+
+		//Sum along y in the same way; column 0 now holds the row sums.
+		for(unsigned int n = blockDim.y; n > 1; )
+		{
+			unsigned int mid = (n + 1)/2;
+			if(yi + mid < n)
+			{
+				shared[xi][yi] += shared[xi][yi + mid];
+			}
+			__syncthreads();
+			n = mid;
+		}
+
+		if(xi == 0 && yi == 0)
+			res[idx] = shared[0][0];
+	}
+
+	///Detects centers in a GL texture: filters it with a LoG kernel on the GPU
+	///(applyFilter) and runs gpu_local_max on the filter response.
+	///@param texbufferID, GL id of the texture to process.
+	///@param texType, GL texture target passed to MapCudaTexture.
+	///@param DIM_X, DIM_Y, dimensions of the texture and of the result in pixels.
+	///@param sizeK, side length of the LoG kernel; applyFilter's shared tile
+	///	       limits it to 7.
+	///@param sigma, standard deviation used to build the LoG kernel.
+	///@param conn, threshold, forwarded to gpu_local_max.
+	///@return host array of DIM_X*DIM_Y floats allocated with malloc (the caller
+	///	   frees it), or NULL when that allocation fails.
+	extern "C"
+	float *
+	get_centers(GLint texbufferID, GLenum texType, int DIM_X, int DIM_Y, int sizeK, float sigma, float conn, float threshold)
+	{
+		//allocate the result first, so a failure leaves nothing mapped or allocated.
+		float* result =  (float*) malloc(DIM_X*DIM_Y*sizeof(float));
+		if(result == NULL)
+			return NULL;
+
+		tx.SetTextureCoordinates(1);
+		tx.SetAddressMode(1, 3);
+		tx.MapCudaTexture(texbufferID, texType);
+		
+		initArray(DIM_X, DIM_Y, sizeK);
+
+		filterKernel(sizeK, sigma, LoG);
+		HANDLE_ERROR(
+			cudaMemcpy(gpuLoG, LoG, sizeK*sizeK*sizeof(float), cudaMemcpyHostToDevice)
+		);
+		//one block per pixel, one thread per kernel sample.
+		dim3 numBlocks(DIM_X, DIM_Y);
+		dim3 threadsPerBlock(sizeK, sizeK);
+
+		applyFilter <<< numBlocks, threadsPerBlock >>> (tx.getTexture(), DIM_X, DIM_Y, sizeK/2, sizeK, res, gpuLoG);
+		//a kernel launch does not return an error itself; query it explicitly.
+		HANDLE_ERROR(
+			cudaGetLastError()
+		);
+
+		stim::cuda::gpu_local_max<float>(centers, res, threshold, conn, DIM_X, DIM_Y);
+		
+		HANDLE_ERROR(
+			cudaDeviceSynchronize()
+		);
+
+		HANDLE_ERROR(
+			cudaMemcpy(result, centers, DIM_X*DIM_Y*sizeof(float), cudaMemcpyDeviceToHost)
+		);
+
+		tx.UnmapCudaTexture();
+		//release the buffers from initArray (the function in this file is
+		//cleanUp, not cleanUP).
+		cleanUp(NULL);
+		return result;
+	}
+
+	}
+}
+#endif
-#include <assert.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <stdio.h>
-#include <stim/visualization/colormap.h>
-#include <sstream>
-
-#define IMAD(a,b,c) ( __mul24((a), (b)) + (c) )
-
-int kr;
-int kl;
-float sigma;
-float* LoG;
-float* result;
-cudaArray* srcArray;
-texture<uchar, cudaTextureType2D, cudaReadModeElementType> texIn;
-
-
-__device__ float filterKernel ()
-{
-	float t = 0;
-	idx = j*kl+i;
-	for(int i = 0; i < kl; i++){
-		for(int j = 0; j < kl; j++){
-			x = i - floor(kl);
-			y = j - floor(kl);
-			LoG(idx) = (-1/M_PI/sigma^4)* (1 - (x^2+y^2)/2/sigma^2)
-					*exp(-(x^2+y^2)/2/sigma^2);	
-			t +=LoG(idx);
-		}
-	}
-	LoG =/ t;
-}
-
-void initArray(cudaGraphicsResource_t src, int DIM_X, int DIM_Y)
-{
-	HANDLE_ERROR(
-		cudaGraphicsMapResources(1, &src)
-	);
-	HANDLE_ERROR(
-		cudaGraphicsSubResourceGetMappedArray(&srcArray, src, 0,0)
-		);
-	HANDLE_ERROR(
-		cudaBindTertureToArray(texIn, srcArray)
-		);
-	cudaMalloc( (void**) &LoG, kl*kl*sizeof(float));
-	checkCUDAerrors("Memory Allocation, LoG");
-	cudaMalloc( (void**) &result, DIM_Y*DIM_X*sizeof(float));
-	checkCUDAerrors("Memory Allocation, Result");
-}
-
-void cleanUp(cudaGraphicsResource_t src);
-{
-	HANDLE_ERROR(
-		cudaUnbindTexture(texIn)
-	);
-	HANDLE_ERROR(
-		cudaFree(LoG)
-	);
-	HANDLE_ERROR(
-		cudaFree(result)
-	);
-	HANDLE_ERROR(
-		cudaGraphicsUnmapResources(1, &src)
-	);
-}
-
-//Shared memory would be better.
-__global__
-void
-applyFilter(unsigned int DIM_X, unsigned int DIM_Y){
-//R = floor(size/2)
-//THIS IS A NAIVE WAY TO DO IT, and there is a better way)
-	//__shared__ float shared[(DIM_X+2*R), (DIM_Y+2*R)];
-	
-	const	 int x = IMAD(blockDim.x, blockIdx.x, threadIdx.x);
-	const	 int y = IMAD(blockDim.y, blockIdx.y, threadIdx.y);
-	float val = 0;
-	//x = max(0,x);
-	//x = min(x, width-1);
-	//y = max(y, 0);
-	//y = min(y, height - 1);
-
-	int idx = y*DIM_X+x;
-	//unsigned int bindex = threadIdx.y * blockDim.y + threadIdx.x;
-
-	//float valIn		= tex2D(texIn, x, y);
-	for (int i = -kr; i <= kr; i++){	//rows
-		for (int j = -kr; i <= kr; j++){	//colls
-			k_idx = (j+kr)+(i+kr)*kl;
-			xi = max(0, x+i);
-			xi = min(x+i, DIM_X-1);
-			yj = max(y+j, 0);
-			yj = min(y+j, DIM_Y-1);
-			val += LoG(k_idx)*tex2D(texIn,x+i, y+j);	
-		}
-	}
-
-	result[idx] = val;
-}
@@ -35,34 +35,6 @@ namespace stim{
 			}
 		}
-		template<typename T, typename D>
-		__device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
-										 unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
-										 dim3 threadIdx, dim3 blockDim){
-
-			//calculate the number of iterations required for the copy
-			unsigned int xI, yI;
-			xI = X/blockDim.x + 1;				//number of iterations along X
-			yI = Y/blockDim.y + 1;				//number of iterations along Y
-
-			//for each iteration
-			for(unsigned int xi = 0; xi < xI; xi++){
-				for(unsigned int yi = 0; yi < yI; yi++){
-
-					//calculate the index into shared memory
-					unsigned int sx = xi * blockDim.x + threadIdx.x;
-					unsigned int sy = yi * blockDim.y + threadIdx.y;
-
-					//calculate the index into the texture
-					unsigned int tx = x + sx;
-					unsigned int ty = y + sy;
-
-					//perform the copy
-					if(sx < X && sy < Y)
-						dest[sy * X + sx] = abs(255 - tex2D<D>(src, tx, ty));
-				}
-			}
-		}
 	}
 }
@@ -14,9 +14,9 @@
 	///Initialization function, allocates the memory and passes the necessary
 	///handles from OpenGL and Cuda.
 	///@param DIM_Y			--integer controlling how much memory to allocate.
-	void initArray()
+	void initArray(int x, int y)
 	{
-			cudaMalloc( (void**) &print, 216*16*sizeof(float));     ///temporary
+			cudaMalloc( (void**) &print, x*y*sizeof(float));     ///temporary
 	}
 	///Deinit function that frees the memory used and releases the texture resource
@@ -48,12 +48,14 @@
 	{       
 		int x   = threadIdx.x + blockIdx.x * blockDim.x;
 		int y   = threadIdx.y + blockIdx.y * blockDim.y;
-		int idx = y*16+x;
+		int idx = y*64+x;
+//		int idx = y*32+x;
+//		int idx = y*16+x;
 		float valIn             = tex2D<unsigned char>(texIn, x, y);
 		float templa		= templ(x);
-		//print[idx]              = abs(valIn);             ///temporary
-		print[idx]              = abs(templa);             ///temporary
+		print[idx]              = valIn;             ///temporary
+		//print[idx]              = abs(templa);             ///temporary
 	}
@@ -64,7 +66,7 @@
 	///@param GLenum texType	--either GL_TEXTURE_1D, GL_TEXTURE_2D or GL_TEXTURE_3D
 	///				  may work with other gl texture types, but untested.
 	///@param DIM_Y, the number of samples in the template.
-	void test(GLint texbufferID, GLenum texType)
+	void test(GLint texbufferID, GLenum texType, int x, int y)
 	{
 		//Bind the Texture in GL and allow access to cuda.
@@ -72,16 +74,12 @@
 		//initialize the return arrays.
-		initArray();
-		
-		int x = 16;
-		int y = 27*8;
-		y = 8* 1089; 
+		initArray(x,y);
+		dim3 numBlocks(1, y);
+		dim3 threadsPerBlock(x, 1);
 		int max_threads = stim::maxThreadsPerBlock();
 		//dim3 threads(max_threads, 1);
 		//dim3 blocks(x / threads.x + 1, y);	
-		dim3 numBlocks(1, 1089);
-		dim3 threadsPerBlock(16, 8);
 		//dim3 numBlocks(2, 2);
 		//dim3 threadsPerBlock(8, 108);
@@ -92,7 +90,7 @@
 		cudaDeviceSynchronize();
 		stringstream name;      //for debugging
 		name << "FromTex.bmp";
-		stim::gpu2image<float>(print, name.str(),16,1089*8,0,1.0);
+		stim::gpu2image<float>(print, name.str(),x,y,0,255);
 		tx.UnmapCudaTexture();
 		cleanUP();
@@ -23,6 +23,7 @@
 #include <stim/cuda/branch_detection.cuh>
 #include "../../../volume-spider/fiber.h"
 #include "../../../volume-spider/glnetwork.h"
+#include <stim/visualization/cylinder.h>
 //#include <stim/cuda/testKernel.cuh>
 //#include <stim/cuda/testKernel.cuh>
@@ -81,8 +82,8 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		int numSamplesPos;
 		int numSamplesMag;
-//		float stepsize = 4.0;			//Step size.
-		float stepsize = 3.0;			//Step size.
+		float stepsize = 5.0;			//Step size.
+//		float stepsize = 3.0;			//Step size.
 		int current_cost;			//variable to store the cost of the current step.
@@ -95,7 +96,6 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		std::vector< stim::vec<float> > cD;	//Direction of line currently being traced.
 		std::vector< stim::vec<float> > cM;	//Magnitude of line currently being traced.
-		stim::glObj<float> sk;			//object to store the skeleton.
 		stim::glnetwork<float> nt;		//object for storing the network.
 		stim::vec<float> rev;			//reverse vector;
@@ -169,6 +169,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		///subject to change.
 		///finds branches.
+		///deprecated
 		void
 		branchDetection()
 		{
@@ -196,14 +197,8 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 						   -p[1] + cylp[1]*S[1]*R[1],
 						   -p[2] + cylp[2]*S[2]*R[2]);
 						seeddir = seeddir.norm();
-//					float seedm = m[0]/2.0;
 					float seedm = m[0];
 // Uncomment for global run 
-/*					stim::vec<float> lSeed = getLastSeed();
-					if(sqrt(pow((lSeed[0] - vec[0]),2)
-					 + pow((lSeed[1] - vec[1]),2) + 
-					 pow((lSeed[2] - vec[2]),2)) > m[0]/4.0
-					 &&  */
 					if(
 					 !(vec[0] > size[0] || vec[1] > size[1]
 					 || vec[2] > size[2] || vec[0] < 0
@@ -218,6 +213,56 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		}
+
+		///Finds the branches of the fiber currently stored in cL/cM using the
+		///LoG method (find_branch) and registers every detection that lies inside
+		///the volume as a new seed.
+		///Does nothing when the fiber has fewer than 4 points.
+		///@param n, l_template, l_square, forwarded to DrawLongCylinder and used
+		///	  to compute the size of the image handed to find_branch.
+		void
+		branchDetection2(int n = 8, int l_template = 8, int l_square = 8)
+		{
+			//too short to search.
+			if(cL.size() < 4)
+				return;
+
+			setMatrix(1);
+			DrawLongCylinder(n, l_template, l_square);
+			stim::cylinder<float> cyl(cL, cM);
+			std::vector< stim::vec<float> > result = find_branch(btexbufferID, GL_TEXTURE_2D, n*l_square, (cL.size()-1)*l_template);
+			stim::vec<float> size(S[0]*R[0], S[1]*R[1], S[2]*R[2]);
+			float pval;
+			for(int i = 0; i < result.size(); i++)
+			{
+				//integer part of the third component of the detection.
+				int id = result[i][2];
+				if(id == 0)
+				{
+					pval = (cyl.getl(id+1)*result[i][2])/cyl.getl(cL.size()-1);
+				}
+				else if(fmod(result[i][2], id) != 0)
+				{
+					//interpolate between the lengths at id and id+1.
+					pval = ((cyl.getl(id+1)-cyl.getl(id))*
+						(fmod(result[i][2], id))+cyl.getl(id))/cyl.getl(cL.size()-1);
+				}
+				else
+				{
+					pval = (cyl.getl(id)/cyl.getl(cL.size()-1));
+				}
+
+				stim::vec<float> v = cyl.surf(pval, result[i][0]);
+				stim::vec<float> di = cyl.p(pval);
+				float rad = cyl.r(pval);
+
+				//skip detections that fall outside of the volume.
+				bool outside = v[0] > size[0] || v[1] > size[1]
+					    || v[2] > size[2] || v[0] < 0
+					    || v[1] < 0 || v[2] < 0;
+				if(!outside)
+				{
+					setSeed(v);
+					setSeedVec((v-di).norm());
+					setSeedMag(rad);
+				}
+			}
+		}
+
+
 //--------------------------------------------------------------------------//
 //---------------------TEMPLATE CREATION METHODS----------------------------//
@@ -459,6 +504,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		void
 		GenerateFBO(unsigned int width, unsigned int height, GLuint &textureID, GLuint &framebufferID)
 		{
+			glDeleteFramebuffers(1, &framebufferID);
 			glGenFramebuffers(1, &framebufferID);
 			glBindFramebuffer(GL_FRAMEBUFFER, framebufferID);
 			int numChannels = 1;
@@ -506,45 +552,60 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		}
+		///IF type == 0
 		///Method for using the gl manipulation to align templates from
 		///Template space (-0.5 0.5) to Texture space (0.0, 1.0),
 		///Based on the p of the spider in real space (arbitrary).
+
+		///IF type == 1
+		///Method for using the gl manipulation to set up a matrix 
+		///To transform from tissue space into texture space.
 		///All transformation happen in glMatrixMode(GL_TEXTURE).
-		void setMatrix()
+		///All transformation happen in glMatrixMode(GL_TEXTURE).
+		void setMatrix(int type = 0)
 		{
-			float curTrans[16];			//array to store the matrix values.
-			stim::vec<float> rot = getRotation(d);	//get the rotation parameters for the current direction vector.
-			glMatrixMode(GL_TEXTURE);
-			glLoadIdentity();
+			if(type == 0)
+			{
+				float curTrans[16];			//array to store the matrix values.
+				stim::vec<float> rot = getRotation(d);	//get the rotation parameters for the current direction vector.
+				glMatrixMode(GL_TEXTURE);
+				glLoadIdentity();
-			//Scale by the voxel size and number of slices.
-			glScalef(1.0/S[0]/R[0], 1.0/S[1]/R[1], 1.0/S[2]/R[2]);
-			//translate to the current position of the spider in the texture.
-			glTranslatef(p[0],
-				     p[1],
-				     p[2]);
-			//rotate to the current direction of the spider.
-			glRotatef(rot[0], rot[1], rot[2], rot[3]);
-			//scale to the magnitude of the spider.
-			glScalef(m[0],
-				 m[0],
-				 m[0]);
-			//get and store the current transformation matrix for later use.
-			glGetFloatv(GL_TEXTURE_MATRIX, curTrans);
-			cT.set(curTrans);
-		//	printTransform();
-			
-			CHECK_OPENGL_ERROR
-			//revert back to default gl mode.
-			glMatrixMode(GL_MODELVIEW);
+				//Scale by the voxel size and number of slices.
+				glScalef(1.0/S[0]/R[0], 1.0/S[1]/R[1], 1.0/S[2]/R[2]);
+				//translate to the current position of the spider in the texture.
+				glTranslatef(p[0],
+					     p[1],
+					     p[2]);
+				//rotate to the current direction of the spider.
+				glRotatef(rot[0], rot[1], rot[2], rot[3]);
+				//scale to the magnitude of the spider.
+				glScalef(m[0],
+					 m[0],
+					 m[0]);
+				//get and store the current transformation matrix for later use.
+				glGetFloatv(GL_TEXTURE_MATRIX, curTrans);
+				cT.set(curTrans);
+			//	printTransform();
+				
+				CHECK_OPENGL_ERROR
+				//revert back to default gl mode.
+				glMatrixMode(GL_MODELVIEW);
+			}
+			else if(type == 1)
+			{
+				glMatrixMode(GL_TEXTURE);
+				glLoadIdentity();
+				glScalef(1.0/S[0]/R[0], 1.0/S[1]/R[1], 1.0/S[2]/R[2]);
+				glMatrixMode(GL_MODELVIEW);
+			}
 		}
 		///Method for controling the buffer and texture binding.
 		///Clears the buffer upon binding.
 		void
-		Bind()
+		Bind(float len = 8.0)
 		{
-			float len = 8.0;
 			glBindFramebuffer(GL_FRAMEBUFFER, fboID);//set up GL buffer		
 			glFramebufferTexture2D(
 				GL_FRAMEBUFFER,
@@ -576,9 +637,8 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		///@param GLuint &framebufferID, framebuffer used for storage.
 		///@param int nSamples, number of rectanges to create. 
 		void
-		Bind(GLuint &textureID, GLuint &framebufferID, int nSamples)
+		Bind(GLuint &textureID, GLuint &framebufferID, int nSamples, float len = 8.0)
 		{
-			float len = 8.0;
 			glBindFramebuffer(GL_FRAMEBUFFER, framebufferID);//set up GL buffer
 			glFramebufferTexture2D(
 				GL_FRAMEBUFFER,
@@ -1085,7 +1145,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
                    		{
                            		float x, y, z, u, v, w, m;
                            		myfile >> x >> y >> z >> u >> v >> w >> m;
-					setSeed(x, y , z);
+					setSeed(x, y, z);
 					setSeedVec(u, v, w);
 					setSeedMag(m);
                    		}
@@ -1099,14 +1159,28 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		void
 		saveNetwork(std::string name)
 		{
+			stim::glObj<float> sk;
+			for(int i = 0; i < nt.sizeE(); i++)
+			{
+				std::vector<stim::vec< float > > cm = nt.getEdgeCenterLineMag(i);
+                 		std::vector<stim::vec< float > > ce = nt.getEdgeCenterLine(i);
+				sk.Begin(stim::OBJ_LINE);
+				for(int j = 0; j < ce.size(); j++)
+				{
+					sk.TexCoord(cm[j][0]);
+					sk.Vertex(ce[j][0], ce[j][1], ce[j][2]);
+				}
+				sk.End();
+			}	
 			sk.save(name);
 		}
+		///Deprecated, but might be reused later.
 		///returns a COPY of the entire stim::glObj object.
+		///The skeleton object is no longer a member of the class, so an empty
+		///stim::glObj is returned; falling off the end of a non-void function
+		///would be undefined behavior.
 		stim::glObj<float>
 		getNetwork()
 		{
-			return sk;
+//			return sk;
+			return stim::glObj<float>();
 		}
 		///returns a COPY of the entire stim::glnetwork object.
@@ -1216,6 +1290,37 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		      	 glEnd();  
 			 glEndList();
 		}
+
+		///Draws the cylinder built from cL and cM into the branch framebuffer
+		///(btexbufferID/bfboID) as a grid of quads: one row of quads per pair of
+		///consecutive circles returned by cyl.getPoints(n), l_square wide and
+		///l_template high, textured with the 3D points of those circles.
+		///need to return the cylinder.
+		void
+		DrawLongCylinder(int n = 8, int l_template = 8,int l_square = 8)
+		{
+			int cylLen = cL.size()-1;
+			GenerateFBO(n*l_square, cylLen*l_template, btexbufferID, bfboID);
+			Bind(btexbufferID, bfboID, cylLen, l_template*l_square/2.0);
+			stim::cylinder<float> cyl(cL, cM);
+			std::vector<std::vector<stim::vec<float> > > rings = cyl.getPoints(n);
+			for(int i = 0; i < rings.size()-1; i++)	///number of circles
+			{
+				//vertical extent of this row of quads.
+				float top    = i*(float)l_template;
+				float bottom = top+(float)l_template;
+				for(int j = 0; j < rings[0].size()-1; j++)		///points in the circle
+				{
+					//horizontal extent of this quad.
+					float left  = j*l_square;
+					float right = j*l_square+l_square;
+
+					glBegin(GL_QUADS);
+						glTexCoord3f(rings[i][j][0], rings[i][j][1], rings[i][j][2]);
+						glVertex2f(left, top);
+
+						glTexCoord3f(rings[i][j+1][0], rings[i][j+1][1], rings[i][j+1][2]);
+						glVertex2f(right, top);
+
+						glTexCoord3f(rings[i+1][j+1][0], rings[i+1][j+1][1], rings[i+1][j+1][2]);
+						glVertex2f(right, bottom);
+
+						glTexCoord3f(rings[i+1][j][0], rings[i+1][j][1], rings[i+1][j][2]);
+						glVertex2f(left, bottom);
+					glEnd();
+				}
+			}
+			Unbind();
+		}
 		///@param min_cost the cost value used for tracing
@@ -1223,114 +1328,35 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		void
 		trace(int min_cost)
 		{	
-			Bind();
-			rev = stim::vec<float>(0.0,0.0,1.0);
+//			rev = stim::vec<float>(0.0,0.0,1.0);
 			bool sEmpty = true;
 			float lastmag = 16.0;;
-			while(!seeds.empty())
+			stim::vec<float> curSeed; 
+			stim::vec<float> curSeedVec;			
+			float curSeedMag;
+			while(!Empty())
 			{
 				//clear the currently traced line and start a new one.
 				cL.clear();
 				cM.clear();
-				sk.Begin(stim::OBJ_LINE);
-				stim::vec<float> curSeed = seeds.top();
-//				std::cout << "The current seeds is " << curSeed << std::endl;
-				stim::vec<float> curSeedVec = seedsvecs.top();
-				float curSeedMag = seedsmags.top();
+				cD.clear();
+				curSeed = seeds.top();
+				curSeedVec = seedsvecs.top();
+				curSeedMag = seedsmags.top();
 				seeds.pop();
 				seedsvecs.pop();
 				seedsmags.pop();
 //				std::cout << "The current seed Vector is " << curSeedVec << std::endl;
 				setPosition(curSeed);
 				setDirection(curSeedVec);
-				cL.push_back(curSeed);
-				cM.push_back(curSeedMag);
-				sk.createFromSelf(GL_SELECT);
-				traceLine(min_cost);
-
-				sk.rev();
-		//		std::cout << "reversed" << std::endl;
-				std::reverse(cL.begin(), cL.end());
-				std::reverse(cM.begin(), cM.end());
-				setPosition(curSeed);
-				setDirection(-rev);
-				setMagnitude(16.0);
-				sk.createFromSelf(GL_SELECT);
-				traceLine(min_cost);
-				sk.End();
+				setMagnitude(curSeedMag);
+//				cL.push_back(curSeed);
+//				cM.push_back(curSeedMag);
+//				cD.push_back(curSeedMag);
+				pair<stim::fiber<float>, int> a = traceLine(p, m, min_cost);
 			}
-			Unbind();
 		}
-		///@param min_cost the cost value used for tracing
-		///traces the seedpoint passed to completion in one directions.
-		void
-		traceLine(int min_cost)
-		{
-			stim::vec<float> pos;
-			stim::vec<float> mag;
-			int h;
-			bool started = false;
-			bool running = true;
-			stim::vec<float> size(S[0]*R[0], S[1]*R[1], S[2]*R[2]);
-			while(running)
-			{
-				int cost = Step();
-				if (cost > min_cost){
-					running = false;
-					break;
-				} else {
-					//Have we found an edge?
-					pos = getPosition();
-					if(pos[0] > size[0] || pos[1] > size[1]
-					 || pos[2] > size[2] || pos[0] < 0
-					 || pos[1] < 0 || pos[2] < 0)
-					{
-//					       std::cout << "Found Edge" << std::endl;
-						running = false;
-						break;
-					}
-					//If this is the first step in the trace,
-					// save the direction
-					//(to be used later to trace the fiber in the opposite direction)
-					if(started == false){
-						rev = -getDirection();
-						started = true;
-					}
-//					std::cout << i << p << std::endl;
-					m = getMagnitude();
-					//Has the template size gotten unreasonable?
-					if(m[0] > 75 || m[0] < 1){
-//						std::cout << "Magnitude Limit" << std::endl;
-						running = false;
-						break;
-					}
-					else
-					{
-						h = selectObject(pos, getDirection(), m[0]);
-						//Have we hit something previously traced?
-						if(h != -1){
-							std::cout << "I hit a line" << h << std::endl;
-							running = false;
-							break;
-						}
-						else {          
-							cL.push_back(stim::vec<float>(p[0], p[1],p[2]));//
-							sk.TexCoord(m[0]);
-							sk.Vertex(p[0], p[1], p[2]);
-							Bind(btexbufferID, bfboID, 27);
-							CHECK_OPENGL_ERROR
-							branchDetection();
-							CHECK_OPENGL_ERROR
-							Unbind();
-							CHECK_OPENGL_ERROR
-						}
-				 	}
-                 		}
-         		}
-		}	
-
-
 		int
 		selectObject(stim::vec<float> loc, stim::vec<float> dir, float mag) 
 		{
@@ -1468,10 +1494,10 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 					spos[2] = spos[2]-sdir[2]*smag[0]/2.;
 					int h = selectObject(spos, -sdir, smag[0]);
 					//did start with a fiber?
-					if(h != -1){	
+					if(h != -1 && h != nt.sizeE()){	
 			//			std::cout << "got here double" << smag.str() << std::endl;
 						nt.addEdge(ce,cm, h, in.second);	
-					}
+					} else { nt.addEdge(ce,cm, -1, -1);}
 				}
 			}		
 		}
@@ -1494,7 +1520,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 			stim::vec<float> sdir = getDirection();	
 			Bind();
-			sk.Begin(stim::OBJ_LINE);
+//			sk.Begin(stim::OBJ_LINE);
 //			sk.createFromSelf(GL_SELECT);
@@ -1514,7 +1540,8 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 				int cost = Step();
 				if (cost > min_cost){
 					running = false;
-					sk.End();
+//					sk.End();
+					branchDetection2();
 					pair<stim::fiber<float>, int> a(stim::fiber<float> (cL, cM), -1);
 					addToNetwork(a, spos, smag, sdir);
 					return a;
@@ -1526,9 +1553,8 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 					 || pos[2] > size[2] || pos[0] < 0
 					 || pos[1] < 0 || pos[2] < 0)
 					{
-//					       std::cout << "Found Edge" << std::endl;
 						running = false;
-						sk.End();
+						branchDetection2();
 						pair<stim::fiber<float>, int> a(stim::fiber<float> (cL, cM), -1);
 						addToNetwork(a, spos, smag, sdir);
 						return a;
@@ -1541,13 +1567,11 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 						rev = -getDirection();
 						started = true;
 					}
-//					std::cout << i << p << std::endl;
 					//Has the template size gotten unreasonable?
 					mag = getMagnitude();
 					if(mag[0] > 75 || mag[0] < 1){
-//						std::cout << "Magnitude Limit" << std::endl;
 						running = false;
-						sk.End();
+						branchDetection2();
 						pair<stim::fiber<float>, int> a(stim::fiber<float> (cL, cM), -1);
 						addToNetwork(a, spos, smag, sdir);
 						return a;
@@ -1559,7 +1583,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 						//Have we hit something previously traced?
 						if(h != -1){
 							running = false;
-							sk.End();
+							branchDetection2();
 							pair<stim::fiber<float>, int> a(stim::fiber<float> (cL, cM), h);
 							addToNetwork(a, spos, smag, sdir);
 							return a;
@@ -1568,14 +1592,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 						else {  
 							cL.push_back(stim::vec<float>(p[0], p[1],p[2]));
 							cM.push_back(stim::vec<float>(m[0], m[0]));
-//							cM.push_back(m[0]);
-
-							sk.TexCoord(m[0]);
-							sk.Vertex(p[0], p[1], p[2]);
-							Bind(btexbufferID, bfboID, 27);
-							CHECK_OPENGL_ERROR
-							branchDetection();
-							CHECK_OPENGL_ERROR
+//							Bind(btexbufferID, bfboID, 27);
 							Unbind();
 							CHECK_OPENGL_ERROR
 #ifndef STIM_CIRCLE_H
 #define STIM_CIRCLE_H
-//enable CUDA_CALLABLE macro
 #include <stim/cuda/cudatools/callable.h>
+#include <stim/math/plane.h>
 #include <stim/math/vector.h>
 #include <stim/math/triangle.h>
-#include <stim/math/quaternion.h>
-#include <stim/math/rect.h>
-#include <iostream>
-#include <iomanip>
+#include <assert.h>
 #include <algorithm>
+#include <iostream>
-namespace stim
-{
+namespace stim{
-template <class T>
-struct circle : rect<T>
+template <typename T>
+class circle : plane<T>
 {
-	private:
-		T theta;
-
-	public:
-
-		using stim::rect<T>::p;
-		using stim::rect<T>::normal;
-		using stim::rect<T>::center;
-		using stim::rect<T>::scale;
-		///base constructor
-		///@param th value of the angle of the starting point from 0 to 360.
-		CUDA_CALLABLE circle(float th = 0.0) : rect<T>()
-		{
-			theta = th;
-		}
-		///create a rectangle given a size and position in Z space.
-		///@param size: size of the rectangle in ND space.
-		///@param z_pos z coordinate of the rectangle.
-		///@param th value of the angle of the starting point from 0 to 360.
-		CUDA_CALLABLE circle(T size, T zpos = (T)0, float th = 0.0) : rect<T>(size, zpos)
-		{
-			theta = th;
-		}
+private:
+	
+	stim::vec<T> Y;
-		///create a rectangle from a center point, normal
-		///@param c: x,y,z location of the center.
-		///@param n: x,y,z direction of the normal.
-		///@param th value of the angle of the starting point from 0 to 360.
-		CUDA_CALLABLE circle(vec<T> c, vec<T> n = vec<T>(0,0,1), float th = 0.0) : rect<T>(c, n)
-		{
-			theta = th;
-		}
-		
-		///create a rectangle from a center point, normal, and size
-		///@param c: x,y,z location of the center.
-		///@param s: size of the rectangle.
-		///@param n: x,y,z direction of the normal.
-		///@param th value of the angle of the starting point from 0 to 360.
-		CUDA_CALLABLE circle(vec<T> c, T s, vec<T> n = vec<T>(0,0,1), float th = 0.0):rect<T>(c,s,n)
-		{
-			theta = th;
-		}
+	///derives the second in-plane axis Y from the plane's U and N:
+	///Y is the normalized cross product of U with N.
+	CUDA_CALLABLE void
+	init()
+	{
+		this->Y = this->U.cross(this->N).norm();
+	}
-		///creates a rectangle from a centerpoint and an X and Y direction vectors.
-		///@param center: x,y,z location of the center.
-		///@param directionX: u,v,w direction of the X vector.
-		///@param directionY: u,v,w direction of the Y vector.
-		///@param th value of the angle of the starting point from 0 to 360.
-		CUDA_CALLABLE circle(vec<T> center, vec<T> directionX, vec<T> directionY, float th = 0.0) : rect<T>(center, directionX, directionY)
-		{
-			theta = th;
-		}
+public:
+	using stim::plane<T>::n;
+	using stim::plane<T>::P;
+	using stim::plane<T>::N;
+	using stim::plane<T>::U;
+	using stim::plane<T>::rotate;
+	using stim::plane<T>::setU;
-		///creates a rectangle from a size, centerpoint, X, and Y direction vectors.
-		///@param size of the rectangle in ND space.
-		///@param center: x,y,z location of the center.
-		///@param directionX: u,v,w direction of the X vector.
-		///@param directionY: u,v,w direction of the Y vector.
-		///@param th value of the angle of the starting point from 0 to 360.
-		CUDA_CALLABLE circle(T size, vec<T> center, vec<T> directionX, vec<T> directionY,  float th = 0.0) : rect<T>(size, center, directionX, directionY)
-		{
-			theta = th;
-		}
-		
-		///creates a rectangle from a size, centerpoint, X, and Y direction vectors.
-		///@param size of the rectangle in ND space, size[0] = size in X, size[1] = size in Y.
-		///@param center: x,y,z location of the center.
-		///@param directionX: u,v,w direction of the X vector.
-		///@param directionY: u,v,w direction of the Y vector.
-		///@param th value of the angle of the starting point from 0 to 360.
-		CUDA_CALLABLE circle(vec<T> size, vec<T> center, vec<T> directionX, vec<T> directionY, float th = 0.0) : rect<T>(size, center, directionX, directionY)
-		{
-			theta = th;
-		}
+	///default constructor: builds the base plane<T>() and then calls init() to set Y.
+	///takes no parameters.
+	CUDA_CALLABLE
+	circle() : plane<T>()
+	{
+		init();
+	}
-		///returns a vector with the points on the initialized circle.
-		///connecting the points results in a circle.
-		///@param n: integer for the number of points representing the circle.
-		std::vector<stim::vec<T> >
-		getPoints(int n)
-		{
-			std::vector<stim::vec<T> > result;
-			stim::vec<T> point;
-			T x,y;
-			float step = 360.0/(float) n;
-			for(float j = theta; j <= theta+360.0; j += step)
-			{
-				y = 0.5*cos(j*2.0*M_PI/360.0)+0.5;
-				x = 0.5*sin(j*2.0*M_PI/360.0)+0.5;
-				result.push_back(p(x,y));
-			}
-
-			return result;
-		}
+	///creates a circle from a scale factor and a position along the z axis.
+	///@param size: factor handed to scale(), which multiplies both U and Y.
+	///@param z_pos z coordinate of the center; the center is set to (0, 0, z_pos).
+	CUDA_CALLABLE
+	circle(T size, T z_pos = (T)0) : plane<T>()
+	{
+		init();
+		center(stim::vec<T>(0,0,z_pos));
+		scale(size);
+	}
+
+	///creates a circle from a center point and a normal.
+	///@param c: x,y,z location of the center.
+	///@param n: x,y,z direction of the normal; applied through normal() before init() recomputes Y.
+	CUDA_CALLABLE
+	circle(vec<T> c, vec<T> n = vec<T>(0,0,1)) : plane<T>()
+	{
+		center(c);
+		normal(n);
+		init();
+	}
+
+	///creates a circle from a center point, a scale factor, and a normal.
+	///@param c: x,y,z location of the center.
+	///@param s: factor handed to scale(), which multiplies both U and Y.
+	///@param n: x,y,z direction of the normal; applied with the three-argument plane rotate(n, U, Y).
+	CUDA_CALLABLE 
+	circle(vec<T> c, T s, vec<T> n = vec<T>(0,0,1)) : plane<T>()
+	{
+		init();
+		center(c);
+		rotate(n, U, Y);
+		scale(s);
+	}
+
+	///creates a circle from a center point, a scale factor, a normal, and a starting direction.
+	///Neither n nor u has a default value here: the (c, s, n) overload already supplies
+	///a default normal, and defaulting these two as well made every two- or
+	///three-argument construction ambiguous between the two overloads.
+	///@param c: x,y,z location of the center.
+	///@param s: factor handed to scale(), which multiplies both U and Y.
+	///@param n: x,y,z direction of the normal.
+	///@param u: x,y,z direction for the zero vector (from where the rotation starts)
+	CUDA_CALLABLE
+	circle(vec<T> c, T s, vec<T> n, vec<T> u) : plane<T>()
+	{
+		init();
+		setU(u);
+		center(c);
+		normal(n);
+		scale(s);
+	}
+
+	///scales the circle by multiplying both in-plane axes, U and Y, by the same factor.
+	///@param factor: the factor by which U and Y are multiplied.
+	CUDA_CALLABLE
+	void scale(T factor)
+	{
+		U *= factor;
+		Y *= factor;
+	}
+
+	///sets the normal of the circle by calling the plane's rotate(n, Y), which receives Y by reference.
+	///@param n: x,y,z direction of the normal.
+	CUDA_CALLABLE void
+	normal(vec<T> n)
+	{
+		rotate(n, Y);
+	}
-		///returns a vector with the points on the initialized circle.
-		///connecting the points results in a circle.
-		///@param n: integer for the number of points representing the circle.
-		stim::vec<T>
-		p(T theta)
+	///sets the center of the circle.
+	///The function returns nothing: it was declared as returning T without any
+	///return statement, which is undefined behavior whenever it is called.
+	///@param p: x,y,z location of the center.
+	CUDA_CALLABLE void
+	center(vec<T> p)
+	{
+		this->P = p;
+	}
+
+	///equality test: two circles compare equal when their P, U and Y vectors all match.
+	bool
+	operator==(const circle<T> & rhs)
+	{
+		return (P == rhs.P && U == rhs.U && Y == rhs.Y);
+	}
+
+	///maps the planar coordinates (a, b), each in [0, 1], to a world-space point.
+	///(0.5, 0.5) evaluates to the center P; U and Y are the two in-plane axes.
+	CUDA_CALLABLE stim::vec<T> p(T a, T b)
+	{
+		//origin of the parameter square: half of U and half of Y back from the center
+		vec<T> corner = this->P - this->U * (T)0.5 - Y * (T)0.5;
+		return corner + this->U * a + Y * b;
+	}
+
+	///parenthesis operator: forwards to p(a, b) and returns the world-space point for planar coordinates a and b in [0, 1]
+	CUDA_CALLABLE stim::vec<T> operator()(T a, T b)
+	{
+		return p(a,b);
+	}
+
+	///returns the points of the circle in order; connecting them draws the circle.
+	///The walk covers k = 0..n in steps of 360/n degrees, so n + 1 points are returned
+	///and the last one closes the loop on the first.
+	///An integer counter is used because an accumulated float step can land just
+	///above or below 360 and so add or drop the closing point depending on n.
+	///@param n: integer for the number of segments representing the circle;
+	///          an empty vector is returned when n is not positive.
+	std::vector<stim::vec<T> >
+	getPoints(int n)
+	{
+		std::vector<stim::vec<T> > result;
+		if(n <= 0)
+			return result;
+		result.reserve(n + 1);
+		for(int k = 0; k <= n; k++)
 		{
-			T x,y;
-			y = 0.5*cos(theta*2.0*M_PI/360.0)+0.5;
-			x = 0.5*sin(theta*2.0*M_PI/360.0)+0.5;
-			return p(x,y);
+			//angle of point k in radians
+			double angle = 2.0*M_PI*(double)k/(double)n;
+			T y = (T)(0.5*cos(angle)+0.5);
+			T x = (T)(0.5*sin(angle)+0.5);
+			result.push_back(p(x,y));
 		}
-};
+		return result;
+	}	
+	
+	///returns the world-space point on the circle at the angle theta.
+	///theta is given in degrees (it is converted with 2*pi/360); theta = 0 evaluates p(0.5, 1).
+	///@param theta: angle of the requested point, in degrees.
+	stim::vec<T>
+	p(T theta)
+	{
+		T x,y;
+		y = 0.5*cos(theta*2.0*M_PI/360.0)+0.5;
+		x = 0.5*sin(theta*2.0*M_PI/360.0)+0.5;
+		return p(x,y);
+	}
+};
 }
-
+
 #endif
@@ -194,6 +194,17 @@ class plane
 		}
+		///applies the rotation produced by quaternion CreateRotation(N, n) to the
+		///plane's N and U and to the caller-supplied vector Y.
+		///@param n: the requested normal direction.
+		///@param Y: additional vector, rotated in place together with the plane.
+		CUDA_CALLABLE void rotate(vec<T> n, vec<T> &Y)
+		{
+			//the rotation is built once, from the normal as it is before any update
+			quaternion<T> rot;
+			rot.CreateRotation(N, n);
+
+			N = rot.toMatrix3() * N;
+			U = rot.toMatrix3() * U;
+			Y = rot.toMatrix3() * Y;
+		}
+
 		CUDA_CALLABLE void rotate(vec<T> n, vec<T> &X, vec<T> &Y)
 		{
 			quaternion<T> q;
@@ -12,89 +12,134 @@ class cylinder
 {
 	private:
 		stim::circle<T> s;			//an arbitrary circle
-		std::vector< stim::vec<T> > pos;	//positions of the cylinder.
-		std::vector< stim::vec<T> > mags;	//radii at each position
+		std::vector<stim::circle<T> > e;
+		std::vector<stim::vec<T> > mags;
 		std::vector< T > L;			//length of the cylinder at each position.
-
-		///default init
+	
+		///default init: the body is empty, so no member is modified.
 		void
-		init(){
+		init()
+		{
 		}
 		///inits the cylinder from a list of points (inP) and radii (inM)
 		void
-		init(std::vector<stim::vec<T> > inP, std::vector<stim::vec<T> > inM){
-			pos = inP;
+		init(std::vector<stim::vec<T> > inP, std::vector<stim::vec<T> > inM)
+		{
 			mags = inM;
+			stim::vec<float> v1;
+			stim::vec<float> v2;
+			e.resize(inP.size());
+			if(inP.size() < 2)
+				return;
 			//calculate each L.
-			L.resize(pos.size()-1);
+			L.resize(inP.size());
 			T temp = (T)0;
-			for(int i = 0; i < L.size(); i++)
+			L[0] = 0;
+			for(int i = 1; i < L.size(); i++)
 			{
-				temp += (pos[i] - pos[i+1]).len();
+				temp += (inP[i-1] - inP[i]).len();
 				L[i] = temp;
 			}
-		}
+			stim::vec<T> dr = (inP[1] - inP[0]).norm();
+			s = stim::circle<T>(inP[0], inM[0][0], dr, stim::vec<T>(1,0,0));
+			e[0] = s;
+			for(int i = 1; i < inP.size()-1; i++)
+			{
+				s.center(inP[i]);
+				v1 = (inP[i] - inP[i-1]).norm();
+				v2 = (inP[i+1] - inP[i]).norm();
+				dr = (v1+v2).norm();
+				s.normal(dr);
+				s.scale(inM[i][0]/inM[i-1][0]);
+				e[i] = s;
+			}
+			
+			int j = inP.size()-1;
+			s.center(inP[j]);
+			dr = (inP[j] - inP[j-1]).norm();
+			s.normal(dr);
+			s.scale(inM[j][0]/inM[j-1][0]);
+			e[j] = s;
+		}
+		
 		///returns the direction vector at point idx.
 		stim::vec<T>
-		d(int idx){
-			return (pos[idx] - pos[idx+1]).norm();
+		d(int idx)
+		{
+			if(idx == 0)
+			{
+				return (e[idx+1].P - e[idx].P).norm();
+			}
+			else if(idx == e.size()-1)
+			{
+				return (e[idx].P - e[idx-1].P).norm();
+			}
+			else
+			{
+//				return (e[idx+1].P - e[idx].P).norm();
+				stim::vec<float> v1 = (e[idx].P-e[idx-1].P).norm();
+				stim::vec<float> v2 = (e[idx+1].P-e[idx].P).norm();
+				return (v1+v2).norm();			
+			} 
+	//		return e[idx].N;	
+
 		}
-		///returns the total length of the line at index j.
-		T
-		getl(int j){
-			for(int i = 0; i < j-1; ++i)
+		stim::vec<T>
+		d(T l, int idx)		//direction at length l: the circle normal N, blended between circles idx and idx+1
+		{
+			if(idx == 0 || idx == e.size()-1)	//the first and last index return their own normal without blending
 			{
-				temp += (pos[i] - pos[i+1]).len();
-				L[i] = temp;
+				return e[idx].N;
 			}
+			else
+			{
+				T rat = (l-L[idx])/(L[idx+1]-L[idx]);	//fraction of the segment starting at idx that is covered at length l
+				return(	e[idx].N + (e[idx+1].N - e[idx].N)*rat);	//linear blend; the result is not renormalized here
+			} 
 		}
+
 		///finds the index of the point closest to the length l on the lower bound.
 		///binary search.
 		int
-		findIdx(T l){
-			int i = pos.size()/2;
-			while(i > 0 && i < pos.size())
+		findIdx(T l)
+		{
+			unsigned int i = L.size()/2;
+			unsigned int max = L.size()-1;
+			unsigned int min = 0;
+			while(i > 0 && i < L.size()-1)
 			{
-				if(L[i] < l)
+//				std::cerr << "Trying " << i << std::endl;
+//				std::cerr << "l is " << l << ", L[" << i << "]" << L[i] << std::endl;
+				if(l < L[i])
 				{
-					i = i/2;
+					max = i;
+					i = min+(max-min)/2;
 				}
-				else if(L[i] < l && L[i+1] > l)
+				else if(L[i] <= l && L[i+1] >= l)
 				{
 					break;
 				}
 				else
 				{
-					i = i+i/2;
+					min = i;
+					i = min+(max-min)/2;
 				}
 			}
 			return i;
 		}
-		//initializes the length array given the current set of positions
-		void init_length(){
-			vec<T> p0, p1;
-			p0 = pos[0];						//initialize the first point in the segment to the first point in the cylinder
-			T l;								//allocate space for the segment length
-			for(unsigned p = 1; p < pos.size(); p++){		//for each point in the cylinder
-				p1 = pos[p];					//get the second point in the segment
-				l = (p1 - p0).len();			//calculate the length of the segment
-
-				if(p == 1) L[0] = l;			//set the length for the first segment
-				else L[p-1] = L[p-2] + l;		//calculate and set the running length for each additional segment
-			}
-
-		}
-
 	public:
 		///default constructor: empty body, every member is default-constructed
-		cylinder(){}
+		cylinder()
+		{
+
+		}
 		///constructor to create a cylinder from a set of points, radii, and the number of sides for the cylinder.
 		///@param inP:  Vector of stim vecs composing the points of the centerline.
@@ -127,12 +172,16 @@ class cylinder
 		///interpolates the position along the line.
 		///@param pvalue: the location of the in the cylinder, from 0 (beginning to 1).
 		stim::vec<T>
-		p(T pvalue){
+		p(T pvalue)
+		{
 			if(pvalue < 0.0 || pvalue > 1.0)
-				return;
+			{
+				return stim::vec<float>(-1,-1,-1);
+			}
 			T l = pvalue*L[L.size()-1];
 			int idx = findIdx(l);
-			return (pos[idx] + (pos[idx+1]-pos[idx])*((l-L[idx])/(L[idx+1]- L[idx])));
+				T rat = (l-L[idx])/(L[idx+1]-L[idx]);
+			return(	e[idx].P + (e[idx+1].P-e[idx].P)*rat);
 		}
 		///Returns a position vector at the given length into the fiber (based on the pvalue).
@@ -140,20 +189,25 @@ class cylinder
 		///@param l: the length along the centerline at which the position is evaluated.
 		///@param idx: index of the centerline point that begins the segment containing l; idx+1 is read as well.
 		stim::vec<T>
-		p(T l, int idx){
-			return (pos[idx] + (pos[idx+1]-pos[idx])*((l-L[idx])/(L[idx+1]- L[idx])));
+		p(T l, int idx)
+		{
+				T rat = (l-L[idx])/(L[idx+1]-L[idx]);
+			return(	e[idx].P + (e[idx+1].P-e[idx].P)*rat);
+//			rat is the fraction of the interval [L[idx], L[idx+1]] covered at length l;
+//			the result is the linear blend of the centers of circles idx and idx+1.
 		}
 		///Returns a radius at the given p-value (p value ranges from 0 to 1).
 		///interpolates the radius along the line.
 		///@param pvalue: the location of the in the cylinder, from 0 (beginning to 1).
 		T
-		r(T pvalue){
+		r(T pvalue)
+		{
 			if(pvalue < 0.0 || pvalue > 1.0)
 				return;
 			T l = pvalue*L[L.size()-1];
 			int idx = findIdx(l);
-			return (mags[idx] + (mags[idx+1]-mags[idx])*((l-L[idx])/(L[idx+1]- L[idx])));
+			return (e[idx].U.len() + (e[idx+1].U.len() - e[idx].U.len())*((l-L[idx])/(L[idx+1]- L[idx])));
 		}
 		///Returns a radius at the given length into the fiber (based on the pvalue).
@@ -161,8 +215,10 @@ class cylinder
 		///@param l: the length along the centerline at which the radius is evaluated.
 		///@param idx: index of the centerline point that begins the segment containing l; the radius is the blend of the U lengths of circles idx and idx+1.
 		T
-		r(T l, int idx){
-			return (mags[idx] + (mags[idx+1]-mags[idx])*((l-L[idx])/(L[idx+1]- L[idx])));
+		r(T l, int idx)
+		{
+				T rat = (l-L[idx])/(L[idx+1]-L[idx]);
+			return(	e[idx].U.len() + (e[idx+1].U.len() - e[idx].U.len())*rat);
 		}
 		///	Returns the magnitude at the given index
@@ -192,8 +248,6 @@ class cylinder
 			return mags[0].size();
 		}
-
-
 		///returns the position of the point with a given pvalue and theta on the surface
 		///in x, y, z coordinates. Theta is in degrees from 0 to 360.
 		///@param pvalue: the location of the in the cylinder, from 0 (beginning to 1).
@@ -202,46 +256,45 @@ class cylinder
 		surf(T pvalue, T theta)
 		{
 			if(pvalue < 0.0 || pvalue > 1.0)
-				return;
+			{
+				return stim::vec<float>(-1,-1,-1);
+			} else {
 			T l = pvalue*L[L.size()-1];
 			int idx = findIdx(l);
-			stim::vec<T> ps = p(l, idx);
+			stim::vec<T> ps = p(l, idx); 
 			T m = r(l, idx);
-			stim::vec<T> dr = d(idx);
-			s = stim::circle<T>(ps, m, dr);
+			s = e[idx];
+			s.center(ps);
+			s.normal(d(l, idx));
+			s.scale(m/e[idx].U.len());
 			return(s.p(theta));
+			}
 		}
 		///returns, for every stored circle in e, the list of points produced by that circle's getPoints(sides).
-		///@param sides: the number of sides of each circle.
+		///@param sides: the value forwarded to getPoints() of each circle.
 		std::vector<std::vector<vec<T> > >
 		getPoints(int sides)
 		{
-			if(pos.size() < 2)
+			std::vector<std::vector <vec<T> > > points;
+			points.resize(e.size());
+			for(int i = 0; i < e.size(); i++)
 			{
-				return;
-			} else {
-				std::vector<std::vector <vec<T> > > points;
-				points.resize(pos.size());
-				stim::vec<T> d = (pos[0] - pos[1]).norm();
-				s = stim::circle<T>(pos[0], mags[0][0], d);
-				points[0] = s.getPoints(sides);
-				for(int i = 1; i < pos.size(); i++)
-				{
-					d = (pos[i] - pos[i-1]).norm();
-					s.center(pos[i]);
-					s.normal(d);
-					s.scale(mags[i][0]/mags[i-1][0], mags[i][0]/mags[i-1][0]);
-					points[i] = s.getPoints(sides);
-				}
-				return points;
+				points[i] = e[i].getPoints(sides);
 			}
+			return points;
 		}
+		///returns L[j], the stored running centerline length at point j (no bounds check).
+		T
+		getl(int j)
+		{
+			return (L[j]);
+		}
 		/// Allows a point on the centerline to be accessed using bracket notation; returns the center P of circle i by value
 		vec<T> operator[](unsigned int i){
-			return pos[i];
+			return e[i].P;
 		}
 		/// Returns the total length of the cylinder centerline
@@ -294,13 +347,13 @@ class cylinder
 			std::vector< vec<T> > result;
-			vec<T> p0 = pos[0];								//initialize p0 to the first point on the centerline
+			vec<T> p0 = e[i].P;								//initialize p0 to the first point on the centerline
 			vec<T> p1;
 			unsigned N = size();							//number of points in the current centerline
 			//for each line segment on the centerline
 			for(unsigned int i = 1; i < N; i++){
-				p1 = pos[i];								//get the second point in the line segment
+				p1 = e[i].P;								//get the second point in the line segment
 				vec<T> v = p1 - p0;							//calculate the vector between these two points
 				T d = v.len();								//calculate the distance between these two points (length of the line segment)
@@ -317,12 +370,13 @@ class cylinder
 				p0 = p1;								//shift the points to move to the next line segment
 			}
-			result.push_back(pos[size() - 1]);			//push the last point in the centerline
+			result.push_back(e[size() - 1].P);			//push the last point in the centerline
 			return cylinder<T>(result);
 		}
+		
 };
 }