major bug fixes, stopped using the ivote code for branch detection and started u…

…sing a general Laplacian of Gaussian algorithm (the algorithm inverts the image atm), fixed a minor bug with the dynamic network creation

major bug fixes, stopped using the ivote code for branch detection and started u…
…sing a general Laplacian of Gaussian algorithm (the algorithm inverts the image atm), fixed a minor bug with the dynamic network creation
Pavel Govyadinov
1 parent 8761b649
Showing 7 changed files with 226 additions and 472 deletions Show diff stats
stim/cuda/branch_detection.cuh
stim/cuda/branch_detection2.cuh
stim/cuda/cuda_texture.cuh
stim/cuda/filter.cuh
stim/cuda/filter.h
stim/cuda/sharedmem.cuh
stim/gl/gl_spider.h
@@ -5,157 +5,31 @@
 //#include <math.h>
 #include <stim/visualization/colormap.h>
 #include <stim/cuda/cuda_texture.cuh>
-#include <stim/cuda/templates/gradient.cuh>
-#include <stim/cuda/templates/gaussian_blur.cuh>
-#include <stim/cuda/arraymath.cuh>
-#include <stim/cuda/ivote.cuh>
-#include <stim/cuda/testKernel.cuh>
+#include <stim/cuda/filter.cuh>
 typedef unsigned int uint;
 typedef unsigned int uchar;
  
-stim::cuda::cuda_texture t;	
-float*		gpuTable;
-float*		gpuGrad;
-float*		gpuVote;	
-float*		gpuI;
-float*		gpuCenters;
-
-void atan_2d(float* cpuTable, unsigned int rmax)
-{
-	//initialize the width and height of the window which atan2 are computed in.
-	int xsize = 2*rmax +1;
-	int ysize = 2*rmax +1;
-	
-	// assign the center coordinates of the atan2 window to yi and xi
-	int yi = rmax;
-	int xi = rmax;
-	
-
-	for (int xt = 0; xt < xsize; xt++){
-
-		for(int yt = 0; yt < ysize; yt++){
-
-			//convert the current 2D coordinates to 1D
-			int id = yt * xsize + xt;
-			// calculate the distance between the pixel and the center of the atan2 window
-			float xd = xi - xt;
-			float yd = yi - yt;
-
-			// calculate the angle between the pixel and the center of the atan2 window and store the result.
-			float atan_2d_vote = atan2(yd, xd);
-			cpuTable[id] = atan_2d_vote;
-		}
-	}
-
-}
-
-void initCuda(unsigned int bytes_table, unsigned int bytes_ds)
-{
-	HANDLE_ERROR(
-		cudaMalloc((void**) &gpuTable, bytes_table)
-		);
-	HANDLE_ERROR(
-		cudaMalloc((void**) &gpuI, bytes_ds)
-		);
-	HANDLE_ERROR(
-		cudaMalloc((void**) &gpuGrad,  bytes_ds*2)
-		);
-	HANDLE_ERROR(
-		cudaMalloc((void**) &gpuVote,  bytes_ds)
-		);
-	HANDLE_ERROR(
-		cudaMalloc((void**) &gpuCenters, bytes_ds)
-		);
-}
-
-void cleanCuda()
-{
-	HANDLE_ERROR(
-		cudaFree(gpuTable)
-	);
-	HANDLE_ERROR(
-		cudaFree(gpuGrad)
-	);
-	HANDLE_ERROR(
-		cudaFree(gpuVote)
-	);
-	HANDLE_ERROR(
-		cudaFree(gpuCenters)
-	);
-	HANDLE_ERROR(
-		cudaFree(gpuI)
-	);
-}
  
 std::vector< stim::vec<float> >
 find_branch(GLint texbufferID, GLenum texType, unsigned int x, unsigned int y)
 {
-	float 		phi	 	= 15.1*M_PI/180;
-	int		iter		= 5;
-	float 		dphi		= phi/iter;
-	float 		rmax 		= 10;
-	float		sigma		= 4;
-	unsigned int 	pixels 		= x * y;
-	unsigned int 	bytes  		= sizeof(float) * pixels;
-	unsigned int 	bytes_table	= sizeof(float) * (2*rmax + 1) * (2*rmax + 1);
-	unsigned int 	x_ds		= (x + (x % 1 == 0 ? 0:1));
-	unsigned int 	y_ds		= (y + (x % 1 == 0 ? 0:1));
-	unsigned int	bytes_ds	= sizeof(float) * x_ds * y_ds;
-	unsigned int	conn		= 10;
-	float		final_t		= 200.0;
-	float*		cpuTable	= (float*) malloc(bytes_table);
-	float*		cpuCenters	= (float*) malloc(bytes_ds);
+	float		sigma		= 2.0;
+	unsigned int	conn		= 7;
+	float		threshold	= 40.0;
+	float*		cpuCenters	= (float*) malloc(x*y*sizeof(float));
+	int		sizek		= 7;
  
 	stringstream name;
  
  
-
-
-//	test(texbufferID, texType, x, y);
-	std::vector<stim::vec<float> >  output;
-	initCuda(bytes_table, bytes_ds); 
-
-	atan_2d(cpuTable, rmax);
-	cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice);
-
-
-	t.MapCudaTexture(texbufferID, texType);
-	cudaDeviceSynchronize();
-	stim::cuda::tex_gaussian_blur2<float>(
-		gpuI, sigma, x, y, t.getTexture(), t.getArray()
-		);
-//	stim::gpu2image<float>(gpuI, "Blur.jpg", x,y , 0, 255);
-//	stim::gpu2image<float>(t.getArray(), "ORIGINAL.jpg", x,y , 0, 255);
+	cpuCenters = stim::cuda::get_centers(texbufferID, texType, x, y, sizek, sigma, conn, threshold);
 	cudaDeviceSynchronize();
  
  
-	stim::cuda::gpu_gradient_2d<float>(
-		gpuGrad, gpuI, x, y
-		);
-	cudaDeviceSynchronize();
-	
-	stim::cuda::gpu_cart2polar<float>(gpuGrad, x, y);
-	cudaDeviceSynchronize();
  
-	cudaDeviceSynchronize();
-	for (int i = 0; i < iter; i++)
-	{
-		stim::cuda::gpu_vote<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);
-	cudaDeviceSynchronize();
-		stim::cuda::gpu_update_dir<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);
-	cudaDeviceSynchronize();
-		phi = phi - dphi;
-	}
+	std::vector<stim::vec<float> >  output;
  
 	cudaDeviceSynchronize();
-	stim::cuda::gpu_local_max<float>(gpuCenters, gpuVote, final_t, conn, x, y);
-//	stim::gpu2image<float>(gpuCenters, "Centers.jpg", x, y, 0, 1);
-	cudaMemcpy(cpuCenters, gpuCenters, bytes_ds, cudaMemcpyDeviceToHost);
-//	stim::cpu2image<float>(cpuCenters, "CentersXPU.jpg", x, y, 0, 1);
-//	std::cerr << pixels << " " << x << " " << y << std::endl;
-//	std::cerr << "y is " << y << ", x is " << x << std::endl; 
-
-//	std::cout << "Before " << output.size() << std::endl;
  
 	for(int i = 0; i < x; i++)
 	{
@@ -166,8 +40,6 @@ find_branch(GLint texbufferID, GLenum texType, unsigned int x, unsigned int y)
 			{
 				float x_v = (float) i;
 				float y_v = (float) j;
-				std::cout << x_v/x*360.0 << std::endl;
-				std::cout << y_v/y << std::endl;
 				output.push_back(stim::vec<float>((x_v/(float)x*360.0),
 								  (y_v), y_v/8));	
 			}
@@ -175,29 +47,6 @@ find_branch(GLint texbufferID, GLenum texType, unsigned int x, unsigned int y)
 		} 
 	}
  
-/*	for(int i = 0; i < pixels; i++)
-	{
-		int ix = (i % x);
-		int iy = (i / x);
-		if(cpuCenters[i] != 0)
-		{
-
-			float x_v = (float) ix;
-			float y_v = (float) iy;
-			std::cout << x_v/x*360 << std::endl;
-			std::cout << y_v/y << std::endl;
-			output.push_back(stim::vec<float>((x_v/(float)x*360),
-							  (y_v/(float)y), 0.0));	
-
-		}
-	} */
-
-//	std::cout << "After " <<  output.size() << std::endl;
-
-
-	t.UnmapCudaTexture();
-	cleanCuda();
-	free(cpuTable);
 	free(cpuCenters);
 	return output;
 }
-#include <stim/cuda/templates/gaussian_blur.cuh>
-#include <stim/cuda/templates/gradient.cuh>
-#include <stim/cuda/arraymath.cuh>
-#include <stim/cuda/ivote.cuh>
-
-
-
-
-
-
-
-
-
-
-void atan_2(float* cpuTable, unsigned int rmax){
-
-	//initialize the width and height of the window which atan2 are computed in.
-	int xsize = 2*rmax +1;
-	int ysize = 2*rmax +1;
-	
-	// assign the center coordinates of the atan2 window to yi and xi
-	int yi = rmax;
-	int xi = rmax;
-	
-
-	for (int xt = 0; xt < xsize; xt++){
-
-		for(int yt = 0; yt < ysize; yt++){
-
-			//convert the current 2D coordinates to 1D
-			int id = yt * xsize + xt;
-			// calculate the distance between the pixel and the center of the atan2 window
-			float xd = xi - xt;
-			float yd = yi - yt;
-
-			// calculate the angle between the pixel and the center of the atan2 window and store the result.
-			float atan_2d_vote = atan2(yd, xd);
-			cpuTable[id] = atan_2d_vote;
-		}
-	}
-
-}
-std::vector<stim::vec<float> > 
-find_branch(GLint texbufferID, GLenum texType, unsigned int x, unsigned int y)
-{
-
-	float* cpuTable		= (float
-
-	unsigned int pixels = x * y;
-	unsigned int bytes = sizeof(float) * pixels;
-
-	//calculate the number of bytes in the atan2 table
-
-	unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(float);
-
-
-
-	//allocate space on the GPU for the atan2 table
-
-	float* gpuTable;
-
-	cudaMalloc(&gpuTable, bytes_table);
-
-
-
-	cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice);
-
-	unsigned int sigma_ds = 1/resize;
-	unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
-	unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
-	unsigned int bytes_ds = sizeof(float) * x_ds * y_ds;
-	
-
-	float* gpuI;
-	cudaMalloc(&gpuI, bytes_ds);
-
-	
-	float* gpuGrad;
-	cudaMalloc(&gpuGrad, bytes_ds*2);
-
-	float* gpuVote;
-	cudaMalloc(&gpuVote, bytes_ds);
-
-	// allocate space on the GPU for the detected cell centes
-
-	float* gpuCenters;
-
-	cudaMalloc(&gpuCenters, bytes_ds);		
-
-
-	stim::cuda::gpu_down_sample<float>(gpuI, gpuI0, resize, x , y);
-	cudaMemcpy(cpuResize, gpuI, bytes_ds, cudaMemcpyDeviceToHost);
-
-x = x_ds;
-	y = y_ds;
-	t = t * resize;
-	//sigma = sigma * resize;
-
-	cudaDeviceSynchronize();
-	stim::cuda::gpu_gaussian_blur2<float>(gpuI,sigma, x, y);
-	cudaDeviceSynchronize();
-	cudaMemcpy(cpuBlur, gpuI, bytes_ds, cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
-	
-	stim::cuda::gpu_gradient_2d<float>(gpuGrad, gpuI, x, y);
-	cudaDeviceSynchronize();
-	cudaMemcpy(cpuGradient, gpuGrad, bytes_ds*2, cudaMemcpyDeviceToHost);
-
-	stim::cuda::gpu_cart2polar<float>(gpuGrad, x, y);
-	cudaDeviceSynchronize();
-	cudaMemcpy(cpuCart2Polar, gpuGrad, bytes_ds*2, cudaMemcpyDeviceToHost);
-	
-
-	//multiply the gradient by a constant and calculate the absolute value (to save an image)	
-
-	stim::cuda::cpu_multiply<float>(cpuCart2Polar, 40, x * y * 2);
-
-	cudaDeviceSynchronize();
-
-	stim::cuda::cpu_abs<float>(cpuCart2Polar, x * y * 2);
-
-	cudaDeviceSynchronize();
-
-		
-	for (int i =0; i<iter; i++){
-		
-		stim::cuda::gpu_vote<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);
-		cudaDeviceSynchronize();
-		stim::cuda::gpu_update_dir<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);
-		cudaDeviceSynchronize();
-		switch (i){
-		case 0 : cudaMemcpy(cpuVote1, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
-			break;
-		case 1 : cudaMemcpy(cpuVote2, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
-			break;
-		case 2 : cudaMemcpy(cpuVote3, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
-			break;
-		case 3 : cudaMemcpy(cpuVote4, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
-			break;
-		case 4 : cudaMemcpy(cpuVote5, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
-			break;
-		default : cudaMemcpy(cpuVote5, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
-			break;
-		}
-		phi = phi - dphi;
-	}
-	
-	stim::cuda::gpu_local_max<float>(gpuCenters, gpuVote, t, conn, x, y);
-	cudaMemcpy(cpuCenters, gpuCenters, bytes_ds, cudaMemcpyDeviceToHost);
-	
-}
@@ -41,6 +41,46 @@ namespace stim
 				texDesc.normalizedCoords	= 0;
 			}
  
+
+			///Turns normalized texture coordinates on or off.
+			///@param val : true stores 1 in texDesc.normalizedCoords, false stores 0.
+			void
+			SetTextureCoordinates(bool val)
+			{
+				texDesc.normalizedCoords = val ? 1 : 0;
+			}
+
+			///Sets the addressing mode used at the borders of the texture along one dimension.
+			///Calls with an unknown mode, or with a dim outside 0-2, leave texDesc untouched.
+			///@param dim : 0-x, 1-y, 2-z
+			///@param mode: cudaAddressModeWrap = 0,
+			///		cudaAddressModeClamp = 1,
+			///		cudaAddressModeMirror = 2,
+			///		cudaAddressModeBorder = 3,
+			void
+			SetAddressMode(int dim, int mode)
+			{
+				//addressMode is indexed by dimension (x, y, z); anything else would
+				//write outside the array. NOTE(review): assumes texDesc is a
+				//cudaTextureDesc, whose addressMode has three entries -- confirm.
+				if(dim < 0 || dim > 2)
+					return;
+
+				switch(mode)
+				{
+					case 0:
+						texDesc.addressMode[dim] = cudaAddressModeWrap;
+						break;
+					case 1:
+						texDesc.addressMode[dim] = cudaAddressModeClamp;
+						break;
+					case 2:
+						texDesc.addressMode[dim] = cudaAddressModeMirror;
+						break;
+					case 3:
+						texDesc.addressMode[dim] = cudaAddressModeBorder;
+						break;
+					default:
+						//unknown mode: keep the current setting
+						break;
+				}
+			}
+
 //-------------------------------------------------------------------------//
 //-------------------------------CUDA_MAPPING------------------------------//
 //-------------------------------------------------------------------------//
+#ifndef STIM_FILTER_H
+#define STIM_FILTER_H
+
+#include <assert.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stim/visualization/colormap.h>
+#include <sstream>
+#include <stim/cuda/cudatools/devices.h>
+#include <stim/cuda/cudatools/threads.h>
+#include <stim/cuda/cuda_texture.cuh>
+#include <stim/cuda/ivote.cuh>
+#include <stim/cuda/arraymath.cuh>
+
+#define IMAD(a,b,c) ( __mul24((a), (b)) + (c) )
+#define M_PI 3.141592654f
+
+
+namespace stim
+{
+	namespace cuda
+	{
+
+	float* gpuLoG;
+	float* LoG;
+	float* res;
+	float* centers;
+	stim::cuda::cuda_texture tx;
+
+
+
+	/// Allocates the buffers shared by the functions in this header:
+	/// LoG (host) and gpuLoG (device) receive kl*kl floats each,
+	/// res and centers (device) receive DIM_Y*DIM_X floats each.
+	///@param DIM_X, DIM_Y : dimensions whose product sizes res and centers.
+	///@param kl : side length of the square filter kernel.
+	void initArray(int DIM_X, int DIM_Y, int kl)
+	{
+		const size_t kernelBytes = kl*kl*sizeof(float);
+		const size_t imageBytes = DIM_Y*DIM_X*sizeof(float);
+
+		//host copy of the kernel
+		LoG = (float*) malloc(kernelBytes);
+
+		//device copy of the kernel
+		HANDLE_ERROR(cudaMalloc( (void**) &gpuLoG, kernelBytes));
+
+		//device buffers for the filter response and the detected centers
+		HANDLE_ERROR(cudaMalloc( (void**) &res, imageBytes));
+		HANDLE_ERROR(cudaMalloc( (void**) &centers, imageBytes));
+	}
+
+	/// Releases the buffers allocated by initArray: the device buffers
+	/// gpuLoG, res and centers, then the host buffer LoG.
+	///@param src : not used by this function.
+	void cleanUp(cudaGraphicsResource_t src)
+	{
+		//device side
+		HANDLE_ERROR(cudaFree(gpuLoG));
+		HANDLE_ERROR(cudaFree(res));
+		HANDLE_ERROR(cudaFree(centers));
+
+		//host side
+		free(LoG);
+	}
+
+	/// Fills LoG with a kl x kl Laplacian of Gaussian kernel sampled on a grid
+	/// centred on the middle of the window, then divides every entry by the
+	/// sum of all entries.
+	///@param kl : side length of the square kernel; the integer part is used.
+	///@param sigma : standard deviation of the Gaussian.
+	///@param LoG : host array with room for kl*kl floats, overwritten in place.
+	void
+	filterKernel (float kl, float sigma, float *LoG)
+	{
+		const int n = (int) kl;			//integer size for loop bounds and indexing
+		//Offsets must be symmetric about zero (-3..3 for n = 7). The previous
+		//i - kl/2 - 0.5 is the 1-based form and shifted the window to -4..2.
+		const float c = (n - 1) / 2.0f;
+		const float s2 = sigma*sigma;
+		float t = 0.0f;				//running sum used for normalization
+
+		for(int i = 0; i < n; i++){
+			for(int j = 0; j < n; j++){
+				int idx = j*n+i;
+				float x = i - c;
+				float y = j - c;
+				float r2 = x*x + y*y;
+				LoG[idx] = (-1.0f/M_PI/(s2*s2)) * (1.0f - r2/2.0f/s2)
+						*expf(-r2/2.0f/s2);
+				t += LoG[idx];
+			}
+		}
+
+		for(int i = 0; i < n*n; i++)
+		{
+			LoG[i] = LoG[i]/t;
+		}
+	}
+
+	/// Computes one filtered pixel per thread block.
+	/// Each thread multiplies one kernel weight by the inverted texture sample
+	/// (255 - value) at its offset, stores the product in shared memory, and the
+	/// block then sums all products into res[y*DIM_X + x].
+	///
+	/// Launch layout: grid = (DIM_X, DIM_Y), block = (kl, kl).
+	/// Precondition: blockDim.x <= 7 and blockDim.y <= 7 (fixed 7x7 shared tile).
+	/// The texture is sampled with coordinates divided by DIM_X / DIM_Y.
+	///@param texIn : texture object holding unsigned char samples.
+	///@param DIM_X, DIM_Y : output dimensions; also the divisors for the texture coordinates.
+	///@param kr : offset subtracted from the block position to reach the first sample.
+	///@param kl : row stride of gpuLoG.
+	///@param res : device output array, DIM_X*DIM_Y floats.
+	///@param gpuLoG : device array holding the filter weights.
+	__global__
+	void
+	applyFilter(cudaTextureObject_t texIn, unsigned int DIM_X, unsigned int DIM_Y, int kr, int kl, float *res, float* gpuLoG){
+
+		__shared__ float shared[7][7];
+		int x = blockIdx.x;
+		int y = blockIdx.y;
+		int xi = threadIdx.x;
+		int yi = threadIdx.y;
+		float tu = (x-kr+xi)/(float)DIM_X;
+		float tv = (y-kr+yi)/(float)DIM_Y;
+		shared[xi][yi] = gpuLoG[yi*kl+xi]*(255.0f-(float)tex2D<unsigned char>(texIn, tu, tv));
+		__syncthreads();
+
+		//Sum along x. The active range n is folded onto its lower ceil(n/2)
+		//entries each pass, so odd sizes keep their middle element. A plain
+		//"step = n/2; step >>= 1" loop drops entries when n is not a power of two.
+		for(unsigned int n = blockDim.x; n > 1; )
+		{
+			unsigned int half = (n + 1) / 2;
+			if(xi + half < n)
+			{
+				shared[xi][yi] += shared[xi + half][yi];
+			}
+			__syncthreads();	//reached by every thread: n and half are block-uniform
+			n = half;
+		}
+
+		//Sum along y with the same folding scheme.
+		for(unsigned int n = blockDim.y; n > 1; )
+		{
+			unsigned int half = (n + 1) / 2;
+			if(yi + half < n)
+			{
+				shared[xi][yi] += shared[xi][yi + half];
+			}
+			__syncthreads();
+			n = half;
+		}
+
+		//thread (0,0) now holds the full sum for this block's pixel
+		if(xi == 0 && yi == 0)
+			res[y*DIM_X+x] = shared[0][0];
+	}
+
+	/// Runs the Laplacian of Gaussian filter over an OpenGL texture and returns
+	/// the output of the local-maximum search on the filter response.
+	///@param texbufferID, texType : texture handed to tx.MapCudaTexture.
+	///@param DIM_X, DIM_Y : dimensions of the filtered image; the launch grid is DIM_X x DIM_Y.
+	///@param sizeK : side length of the filter kernel; must be 1-7 because
+	///		applyFilter stages its products in a fixed 7x7 shared array.
+	///@param sigma : standard deviation passed to filterKernel.
+	///@param conn, threshold : forwarded to gpu_local_max.
+	///@return host array of DIM_X*DIM_Y floats allocated with malloc (the caller
+	///		frees it), or NULL if that allocation fails.
+	extern "C"
+	float *
+	get_centers(GLint texbufferID, GLenum texType, int DIM_X, int DIM_Y, int sizeK, float sigma, float conn, float threshold)
+	{
+		//the kernel launches sizeK x sizeK threads into a 7x7 shared tile
+		assert(sizeK > 0 && sizeK <= 7);
+
+		//allocate the output first so a failure needs no GPU cleanup
+		float* result =  (float*) malloc(DIM_X*DIM_Y*sizeof(float));
+		if(result == NULL)
+			return NULL;
+
+		tx.SetTextureCoordinates(1);
+		tx.SetAddressMode(1, 3);
+		tx.MapCudaTexture(texbufferID, texType);
+
+		initArray(DIM_X, DIM_Y, sizeK);
+
+		//build the kernel on the host and upload it
+		filterKernel(sizeK, sigma, LoG);
+		HANDLE_ERROR(
+			cudaMemcpy(gpuLoG, LoG, sizeK*sizeK*sizeof(float), cudaMemcpyHostToDevice)
+		);
+
+		//one block per output pixel, one thread per kernel tap
+		dim3 numBlocks(DIM_X, DIM_Y);
+		dim3 threadsPerBlock(sizeK, sizeK);
+		applyFilter <<< numBlocks, threadsPerBlock >>> (tx.getTexture(), DIM_X, DIM_Y, sizeK/2, sizeK, res, gpuLoG);
+		HANDLE_ERROR(
+			cudaGetLastError()		//launch errors are not reported by the launch itself
+		);
+
+		stim::cuda::gpu_local_max<float>(centers, res, threshold, conn, DIM_X, DIM_Y);
+
+		HANDLE_ERROR(
+			cudaDeviceSynchronize()		//surfaces errors raised while the kernels ran
+		);
+
+		HANDLE_ERROR(
+			cudaMemcpy(result, centers, DIM_X*DIM_Y*sizeof(float), cudaMemcpyDeviceToHost)
+		);
+
+		tx.UnmapCudaTexture();
+		//was cleanUP(), which is not defined; cleanUp ignores its argument
+		cleanUp(NULL);
+		return result;
+	}
+
+	}
+}
+#endif
-#include <assert.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <stdio.h>
-#include <stim/visualization/colormap.h>
-#include <sstream>
-
-#define IMAD(a,b,c) ( __mul24((a), (b)) + (c) )
-
-int kr;
-int kl;
-float sigma;
-float* LoG;
-float* result;
-cudaArray* srcArray;
-texture<uchar, cudaTextureType2D, cudaReadModeElementType> texIn;
-
-
-__device__ float filterKernel ()
-{
-	float t = 0;
-	idx = j*kl+i;
-	for(int i = 0; i < kl; i++){
-		for(int j = 0; j < kl; j++){
-			x = i - floor(kl);
-			y = j - floor(kl);
-			LoG(idx) = (-1/M_PI/sigma^4)* (1 - (x^2+y^2)/2/sigma^2)
-					*exp(-(x^2+y^2)/2/sigma^2);	
-			t +=LoG(idx);
-		}
-	}
-	LoG =/ t;
-}
-
-void initArray(cudaGraphicsResource_t src, int DIM_X, int DIM_Y)
-{
-	HANDLE_ERROR(
-		cudaGraphicsMapResources(1, &src)
-	);
-	HANDLE_ERROR(
-		cudaGraphicsSubResourceGetMappedArray(&srcArray, src, 0,0)
-		);
-	HANDLE_ERROR(
-		cudaBindTertureToArray(texIn, srcArray)
-		);
-	cudaMalloc( (void**) &LoG, kl*kl*sizeof(float));
-	checkCUDAerrors("Memory Allocation, LoG");
-	cudaMalloc( (void**) &result, DIM_Y*DIM_X*sizeof(float));
-	checkCUDAerrors("Memory Allocation, Result");
-}
-
-void cleanUp(cudaGraphicsResource_t src);
-{
-	HANDLE_ERROR(
-		cudaUnbindTexture(texIn)
-	);
-	HANDLE_ERROR(
-		cudaFree(LoG)
-	);
-	HANDLE_ERROR(
-		cudaFree(result)
-	);
-	HANDLE_ERROR(
-		cudaGraphicsUnmapResources(1, &src)
-	);
-}
-
-//Shared memory would be better.
-__global__
-void
-applyFilter(unsigned int DIM_X, unsigned int DIM_Y){
-//R = floor(size/2)
-//THIS IS A NAIVE WAY TO DO IT, and there is a better way)
-	//__shared__ float shared[(DIM_X+2*R), (DIM_Y+2*R)];
-	
-	const	 int x = IMAD(blockDim.x, blockIdx.x, threadIdx.x);
-	const	 int y = IMAD(blockDim.y, blockIdx.y, threadIdx.y);
-	float val = 0;
-	//x = max(0,x);
-	//x = min(x, width-1);
-	//y = max(y, 0);
-	//y = min(y, height - 1);
-
-	int idx = y*DIM_X+x;
-	//unsigned int bindex = threadIdx.y * blockDim.y + threadIdx.x;
-
-	//float valIn		= tex2D(texIn, x, y);
-	for (int i = -kr; i <= kr; i++){	//rows
-		for (int j = -kr; i <= kr; j++){	//colls
-			k_idx = (j+kr)+(i+kr)*kl;
-			xi = max(0, x+i);
-			xi = min(x+i, DIM_X-1);
-			yj = max(y+j, 0);
-			yj = min(y+j, DIM_Y-1);
-			val += LoG(k_idx)*tex2D(texIn,x+i, y+j);	
-		}
-	}
-
-	result[idx] = val;
-}
@@ -35,34 +35,6 @@ namespace stim{
 			}
 		}
  
-		template<typename T, typename D>
-		__device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
-										 unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
-										 dim3 threadIdx, dim3 blockDim){
-
-			//calculate the number of iterations required for the copy
-			unsigned int xI, yI;
-			xI = X/blockDim.x + 1;				//number of iterations along X
-			yI = Y/blockDim.y + 1;				//number of iterations along Y
-
-			//for each iteration
-			for(unsigned int xi = 0; xi < xI; xi++){
-				for(unsigned int yi = 0; yi < yI; yi++){
-
-					//calculate the index into shared memory
-					unsigned int sx = xi * blockDim.x + threadIdx.x;
-					unsigned int sy = yi * blockDim.y + threadIdx.y;
-
-					//calculate the index into the texture
-					unsigned int tx = x + sx;
-					unsigned int ty = y + sy;
-
-					//perform the copy
-					if(sx < X && sy < Y)
-						dest[sy * X + sx] = abs(255 - tex2D<D>(src, tx, ty));
-				}
-			}
-		}
  
 	}
 }
@@ -96,7 +96,6 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		std::vector< stim::vec<float> > cD;	//Direction of line currently being traced.
 		std::vector< stim::vec<float> > cM;	//Magnitude of line currently being traced.
  
-//		stim::glObj<float> sk;			//object to store the skeleton.
 		stim::glnetwork<float> nt;		//object for storing the network.
  
 		stim::vec<float> rev;			//reverse vector;
@@ -141,7 +140,6 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 			setMatrix();			//create the transformation matrix.
 			glCallList(dList+1);		//move the templates to p, d, m.
 			int best = getCost(ptexbufferID, numSamplesPos);		//find min cost.
-			std::cerr << best << std::endl;
 			stim::vec<float> next(		//find next position.
  				pV[best][0],
 				pV[best][1],
@@ -170,6 +168,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
  
 		///subject to change.
 		///finds branches.
+		///deprecated
 		void
 		branchDetection()
 		{
@@ -197,14 +196,8 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 						   -p[1] + cylp[1]*S[1]*R[1],
 						   -p[2] + cylp[2]*S[2]*R[2]);
 						seeddir = seeddir.norm();
-//					float seedm = m[0]/2.0;
 					float seedm = m[0];
 // Uncomment for global run 
-/*					stim::vec<float> lSeed = getLastSeed();
-					if(sqrt(pow((lSeed[0] - vec[0]),2)
-					 + pow((lSeed[1] - vec[1]),2) + 
-					 pow((lSeed[2] - vec[2]),2)) > m[0]/4.0
-					 &&  */
 					if(
 					 !(vec[0] > size[0] || vec[1] > size[1]
 					 || vec[2] > size[2] || vec[0] < 0
@@ -220,6 +213,8 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 		}
  
  
+		///finds all the branches in a given fiber.
+		///using LoG method.
 		void
 		branchDetection2(int n = 8, int l_template = 8, int l_square = 8)
 		{
@@ -231,7 +226,6 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 			std::vector< stim::vec<float> > result = find_branch(btexbufferID, GL_TEXTURE_2D, n*l_square, (cL.size()-1)*l_template);
 			stim::vec<float> size(S[0]*R[0], S[1]*R[1], S[2]*R[2]);
 			float pval;
-//			std::cerr << "the number of points is " << result.size() << std::endl;
 			if(!result.empty())
 			{
 				for(int i = 0; i < result.size(); i++)
@@ -242,7 +236,6 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
  
 						pval = ((cyl.getl(id+1)-cyl.getl(id))*
 							(fmod(result[i][2], id))+cyl.getl(id))/cyl.getl(cL.size()-1);
-//						std::cout << id << " " <<  cyl.getl(id) << " " << pval << " " << cyl.getl(cL.size()-1) << fmod(result[i][2], id) << std::endl;
 					}
 					else if(id == 0)
 					{
@@ -252,14 +245,9 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 					{
 						pval = (cyl.getl(id)/cyl.getl(cL.size()-1));
 					}
-//					std::cout << "Testing "<< i << ": " << result[i][0] << ", " << result[i][1] << ", " <<  result[i][2] <<  std::endl;
-//					std::cout << "Testing " << pval << std::endl;
 					stim::vec<float> v = cyl.surf(pval, result[i][0]);
-//					std::cout <<  v[0] << " ," << v[1] << " ," << v[2] << std::endl;
 					stim::vec<float> di = cyl.p(pval);
-//					std::cout <<  di[0] <<  " ," << di[1] << " ," << di[2] << std::endl;
 					float rad = cyl.r(pval);
-					std::cout << rad << std::endl;
 					if(
 					 !(v[0] > size[0] || v[1] > size[1]
 					 || v[2] > size[2] || v[0] < 0
@@ -271,7 +259,6 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 					}
 				}
 			}
-//			std::cout << "I ran the new branch detection" << std::endl;
 			}
 		}
  
@@ -1506,10 +1493,10 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 					spos[2] = spos[2]-sdir[2]*smag[0]/2.;
 					int h = selectObject(spos, -sdir, smag[0]);
 					//did start with a fiber?
-					if(h != -1){	
+					if(h != -1 && h != nt.sizeE()){	
 			//			std::cout << "got here double" << smag.str() << std::endl;
 						nt.addEdge(ce,cm, h, in.second);	
-					}
+					} else { nt.addEdge(ce,cm, -1, -1);}
 				}
 			}		
 		}
@@ -1551,7 +1538,6 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 			{
 				int cost = Step();
 				if (cost > min_cost){
-					std::cout << "Cost Limit" << std::endl;
 					running = false;
 //					sk.End();
 		branchDetection2();
@@ -1566,10 +1552,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 					 || pos[2] > size[2] || pos[0] < 0
 					 || pos[1] < 0 || pos[2] < 0)
 					{
-						std::cout << "Edge Limit" << std::endl;
-//					       std::cout << "Found Edge" << std::endl;
 						running = false;
-//						sk.End();
 		branchDetection2();
 						pair<stim::fiber<float>, int> a(stim::fiber<float> (cL, cM), -1);
 						addToNetwork(a, spos, smag, sdir);
@@ -1583,13 +1566,10 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 						rev = -getDirection();
 						started = true;
 					}
-//					std::cout << i << p << std::endl;
 					//Has the template size gotten unreasonable?
 					mag = getMagnitude();
 					if(mag[0] > 75 || mag[0] < 1){
-						std::cout << "Magnitude Limit" << std::endl;
 						running = false;
-//						sk.End();
 		branchDetection2();
 						pair<stim::fiber<float>, int> a(stim::fiber<float> (cL, cM), -1);
 						addToNetwork(a, spos, smag, sdir);
@@ -1601,9 +1581,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 						h = selectObject(p, getDirection(), m[0]);
 						//Have we hit something previously traced?
 						if(h != -1){
-							std::cout << "Hit Limit" << std::endl;
 							running = false;
-//							sk.End();
 		branchDetection2();
 							pair<stim::fiber<float>, int> a(stim::fiber<float> (cL, cM), h);
 							addToNetwork(a, spos, smag, sdir);
@@ -1613,14 +1591,7 @@ class gl_spider : public virtual gl_texture&lt;T&gt;
 						else {  
 							cL.push_back(stim::vec<float>(p[0], p[1],p[2]));
 							cM.push_back(stim::vec<float>(m[0], m[0]));
-//							cM.push_back(m[0]);
-
-//							sk.TexCoord(m[0]);
-//							sk.Vertex(p[0], p[1], p[2]);
 							Bind(btexbufferID, bfboID, 27);
-							CHECK_OPENGL_ERROR
-//							branchDetection();
-							CHECK_OPENGL_ERROR
 							Unbind();
 							CHECK_OPENGL_ERROR