Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib

David Mayerich
2 parents 0129eeb3 df0a759f
Showing 11 changed files with 386 additions and 98 deletions Show diff stats
stim/cuda/branch_detection.cuh
stim/cuda/filter.cuh
stim/cuda/testKernel.cuh
stim/envi/envi_header.h
stim/gl/gl_spider.h
stim/gl/gl_texture.h
stim/grids/image_stack.h
stim/math/filters/gauss3.h
stim/math/filters/sepconv2.h
stim/math/filters/sepconv3.h
stim/math/vec3.h
@@ -11,7 +11,7 @@ typedef unsigned int uint;
 std::vector< stim::vec<float> >
-find_branch(GLint texbufferID, GLenum texType, unsigned int x, unsigned int y)
+find_branch(GLint texbufferID, GLenum texType, unsigned int x, unsigned int y, int iter = 0)
 {
 	float		sigma		= 2.0;
 	unsigned int	conn		= 7;
@@ -22,7 +22,7 @@ find_branch(GLint texbufferID, GLenum texType, unsigned int x, unsigned int y)
 	stringstream name;
-	cpuCenters = stim::cuda::get_centers(texbufferID, texType, x, y, sizek, sigma, conn, threshold);
+	cpuCenters = stim::cuda::get_centers(texbufferID, texType, x, y, sizek, sigma, conn, threshold, iter);
 	cudaDeviceSynchronize();
@@ -26,6 +26,11 @@ namespace stim
 	float* LoG;
 	float* res;
 	float* centers;
+
+//#ifdef DEBUG
+	float* print;
+//#endif
+
 	stim::cuda::cuda_texture tx;
@@ -44,6 +49,13 @@ namespace stim
 		HANDLE_ERROR(
 			cudaMalloc( (void**) &centers, DIM_Y*DIM_X*sizeof(float))
 		);
+
+//#ifdef DEBUG
+		HANDLE_ERROR(
+			cudaMalloc( (void**) &print, DIM_Y*DIM_X*sizeof(float))
+		);
+//#endif
+
 	//	checkCUDAerrors("Memory Allocation, Result");
 	}
@@ -58,6 +70,11 @@ namespace stim
 		HANDLE_ERROR(
 			cudaFree(centers)
 		);
+//#ifdef DEBUG
+		HANDLE_ERROR(
+			cudaFree(print)
+		);
+//#endif
 			free(LoG);
 	}
@@ -89,7 +106,7 @@ namespace stim
 	//Shared memory would be better.
 	__global__
 	void
-	applyFilter(cudaTextureObject_t texIn, unsigned int DIM_X, unsigned int DIM_Y, int kr, int kl, float *res, float* gpuLoG){
+	applyFilter(cudaTextureObject_t texIn, unsigned int DIM_X, unsigned int DIM_Y, int kr, int kl, float *res, float* gpuLoG, float* p){
 	//R = floor(size/2)
 	//THIS IS A NAIVE WAY TO DO IT, and there is a better way)
@@ -101,16 +118,15 @@ namespace stim
 	//	float val = 0;
 		float tu = (x-kr+xi)/(float)DIM_X;
 		float tv = (y-kr+yi)/(float)DIM_Y;
+		int idx = y*DIM_X+x;
 		shared[xi][yi] = gpuLoG[yi*kl+xi]*(255.0-(float)tex2D<unsigned char>(texIn, tu, tv));
 		__syncthreads();
-	
 		//x = max(0,x);
 		//x = min(x, width-1);
 		//y = max(y, 0);
 		//y = min(y, height - 1);
-		int idx = y*DIM_X+x;
 	//	int k_idx;
                 for(unsigned int step = blockDim.x/2; step >= 1; step >>= 1)
                 {
@@ -135,11 +151,12 @@ namespace stim
                 __syncthreads();
                 if(xi == 0 && yi == 0)
                         res[idx] = shared[0][0];
+
 	}
 	extern "C"
 	float *
-	get_centers(GLint texbufferID, GLenum texType, int DIM_X, int DIM_Y, int sizeK, float sigma, float conn, float threshold)
+	get_centers(GLint texbufferID, GLenum texType, int DIM_X, int DIM_Y, int sizeK, float sigma, float conn, float threshold, int iter = 0)
 	{
 		tx.SetTextureCoordinates(1);
 		tx.SetAddressMode(1, 3);
@@ -153,7 +170,14 @@ namespace stim
 		dim3 numBlocks(DIM_X, DIM_Y);
 		dim3 threadsPerBlock(sizeK, sizeK);
-		applyFilter <<< numBlocks, threadsPerBlock >>> (tx.getTexture(), DIM_X, DIM_Y, floor(sizeK/2), sizeK, res, gpuLoG);
+		applyFilter <<< numBlocks, threadsPerBlock >>> (tx.getTexture(), DIM_X, DIM_Y, floor(sizeK/2), sizeK, res, gpuLoG, print);
+
+		#ifdef DEBUG
+			stringstream name;
+			name.str("");
+			name << "Fiber Cylinder " << iter << ".bmp";
+			stim::gpu2image<float>(res, name.str(), DIM_X, DIM_Y, 0, 255);
+		#endif
 		stim::cuda::gpu_local_max<float>(centers, res, threshold, conn, DIM_X, DIM_Y);
@@ -53,11 +53,31 @@
 		float valIn             = tex2D<unsigned char>(texIn, x, y);
 		float templa		= templ(x, 32)*255.0;
-		print[idx]              = abs(valIn-templa);             ///temporary
+		//print[idx]              = abs(valIn-templa);             ///temporary
+		print[idx]		= abs(valIn);
 		//print[idx]              = abs(templa);             ///temporary
 	}
+	///Sample the input texture at each thread's global (x, y) index and store the value in a linear buffer.
+	///There is no bounds check, so every launched thread writes to print.
+	///@param texIn	--texture object that is read with tex2D<unsigned char> at (x, y).
+	///@param print	--output buffer; element y*dx+x receives abs() of the sampled value.
+	///@param dx		--row stride (in elements) used to turn (x, y) into an index of print.
+	__global__
+	//void get_diff (float *result)
+	void get_diff2 (cudaTextureObject_t texIn, float *print, int dx)
+	{       
+		int x   = threadIdx.x + blockIdx.x * blockDim.x;
+		int y   = threadIdx.y + blockIdx.y * blockDim.y;
+		int idx = y*dx+x;
+	//	int idx = y*16+x;
+
+		float valIn             = tex2D<unsigned char>(texIn, x, y);
+		print[idx]              = abs(valIn);             ///temporary
+
+	}
+
 	void test(cudaTextureObject_t tObj, int x, int y, std::string nam)
 	{
@@ -86,3 +106,31 @@
 		cleanUP();
 	}
+
+	///Run get_diff2 over an x-by-y texture and save the resulting buffer as an image.
+	///@param tObj	--texture object handed to the kernel.
+	///@param x	--image width; also the number of threads per block and the row stride.
+	///@param y	--image height; also the number of blocks along the grid's y axis.
+	///@param nam	--name passed to stim::gpu2image for the saved image.
+	///@param iter	--not used by this overload.
+	void test(cudaTextureObject_t tObj, int x, int y, std::string nam, int iter)
+	{
+		//initialize the return arrays.
+		initArray(x,y);
+
+		//launch layout: one block per row, one thread per column.
+		dim3 grid(1, y);
+		dim3 block(x, 1);
+		int max_threads = stim::maxThreadsPerBlock();	//queried, but the launch below does not use it
+
+		get_diff2 <<< grid, block >>> (tObj, print, x);
+		cudaDeviceSynchronize();
+
+		//write the buffer out under the requested name (for debugging)
+		std::string fname(nam.c_str());
+		stim::gpu2image<float>(print, fname,x,y,0,255);
+
+		cleanUP();
+	}
@@ -8,6 +8,7 @@
 #include <vector>
 #include <algorithm>
 #include <stdlib.h>
+#include <cmath>
 //information from an ENVI header file
 //A good resource can be found here: http://www.exelisvis.com/docs/enviheaderfiles.html
-#ifndef STIM_GL_SPIDER_H
+ #ifndef STIM_GL_SPIDER_H
 #define STIM_GL_SPIDER_H
 //#include <GL/glew.h>
@@ -27,7 +27,6 @@
 #include <stim/cuda/branch_detection.cuh>
 #include "../../../volume-spider/glnetwork.h"
 #include <stim/visualization/cylinder.h>
-#include <stim/cuda/testKernel.cuh>
 #include <iostream>
 #include <fstream>
 #ifdef TIMING
@@ -40,6 +39,9 @@
 	#include <ctime>
 #endif
+#ifdef DEBUG
+	#include <stim/cuda/testKernel.cuh>
+#endif
 namespace stim
 {
@@ -138,11 +140,13 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 		stim::cuda::cuda_texture t_pos;				//cuda_texture object used as an interface between OpenGL and cuda for position vectors.
 		stim::cuda::cuda_texture t_mag;				//cuda_texture object used as an interface between OpenGL and cuda for size vectors.
 		stim::cuda::cuda_texture t_len;				//cuda_texture object used as an interface between OpenGL and cuda for size vectors.
-		
+
+		int last_fiber;						//variable that tracks the last fiber hit during tracing. -1 if no fiber was hit.
+
 		#ifdef DEBUG
-			stringstream name;
 			int iter;
+			stringstream name;
 			int iter_pos;
 			int iter_dir;
 			int iter_siz;
@@ -292,6 +296,7 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 			DrawLongCylinder(n, l_template, l_square);	///Draw the cylinder.
 			stim::cylinder<float> cyl(cL, cM);
 			std::vector< stim::vec<float> > result = find_branch(cylinder_texID, GL_TEXTURE_2D, n*l_square, (cL.size()-1)*l_template);		///find all the centers in cuda
+			
 			stim::vec3<float> size(S[0]*R[0], S[1]*R[1], S[2]*R[2]);			///the borders of the texture.
 			float pval;									//pvalue associated with the points on the cylinder.
 			if(!result.empty())								///if we have any points
@@ -315,7 +320,8 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 					}
 					stim::vec3<float> v = cyl.surf(pval, result[i][0]);					///find the coordinates of the point at pval on the surface in tissue space.
 					stim::vec3<float> di = cyl.p(pval);							///find the coord of v in tissue space projected on the centerline.
-					float rad = cyl.r(pval)/2;								///find the radius at the pvalue's location
+					float rad = cyl.r(pval);								///find the radius at the pvalue's location
+				//	float rad = cyl.r(pval)/2;								///find the radius at the pvalue's location
 					if(
 					 !(v[0] > size[0] || v[1] > size[1]
 					 || v[2] > size[2] || v[0] < 0
@@ -372,7 +378,7 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 		///Stored in a display list.
 		///uses the default d vector <0,0,1>
 		void
-		genDirectionVectors(float solidAngle = stim::PI/2)
+		genDirectionVectors(float solidAngle = 3*stim::PI/4)
 		{
 			//Set up the vectors necessary for Rectangle creation.
@@ -954,7 +960,7 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 			iter_dir = 0;
 			iter_siz = 0;
 #endif
-			stepsize = 3.0;
+			stepsize = 6.0;
 			n_pixels = 16.0;
 			srand(100);	
@@ -1316,20 +1322,20 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 		void
 		saveNetwork(std::string name)
 		{
-/*			stim::glObj<float> sk;
+			stim::glObj<float> sk1;
 			for(int i = 0; i < nt.sizeE(); i++)
 			{
-				std::vector<stim::vec< float > > cm = nt.getEdgeCenterLineMag(i);
+				std::vector<float> cm = nt.getEdgeCenterLineMag(i);
                  		std::vector<stim::vec3< float > > ce = nt.getEdgeCenterLine(i);
-				sk.Begin(stim::OBJ_LINE);
+				sk1.Begin(stim::OBJ_LINE);
 				for(int j = 0; j < ce.size(); j++)
 				{
-					sk.TexCoord(cm[j][0]);
-					sk.Vertex(ce[j][0], ce[j][1], ce[j][2]);
+					sk1.TexCoord(cm[j]);
+					sk1.Vertex(ce[j][0], ce[j][1], ce[j][2]);
 				}
-				sk.End();
+				sk1.End();
 			}	
-*/			sk.save(name);
+			sk1.save(name);
 		}
 		///Depreciated, but might be reused later()
@@ -1377,20 +1383,31 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 		Step()
 		{
 			#ifdef DEBUG
-			std::cerr << "Took a step" << std::endl;
+			std::cerr << "Took a step";
 			#endif
 			Bind(direction_texID, direction_buffID, numSamples, n_pixels);
 			CHECK_OPENGL_ERROR
 				findOptimalDirection();
 			Unbind();
+			#ifdef DEBUG
+			std::cerr << " " << current_cost;
+			#endif
 			Bind(position_texID, position_buffID, numSamplesPos, n_pixels);
 				findOptimalPosition();
 			Unbind();
+			#ifdef DEBUG
+			std::cerr << " " << current_cost;
+			#endif
 			Bind(radius_texID, radius_buffID, numSamplesMag, n_pixels);
 				findOptimalRadius();
 			Unbind();
+			#ifdef DEBUG
+			std::cerr << " " << current_cost;
+			#endif
 			CHECK_OPENGL_ERROR
-
+			#ifdef DEBUG
+			std::cerr << std::endl;
+			#endif
 			return current_cost;
 		}
@@ -1517,9 +1534,6 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 			while(!Empty())
 			{
 				//clear the currently traced line and start a new one.
-				cL.clear();
-				cM.clear();
-				cD.clear();
 				curSeed = seeds.top();
 				curSeedVec = seedsvecs.top();
 				curSeedMag = seedsmags.top();
@@ -1539,9 +1553,9 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 //				findOptimalDirection();
 //			Unbind();
 //THIS IS EXPERIMENTAL
-			Bind(radius_texID, radius_buffID, numSamplesMag, n_pixels);
-				findOptimalRadius();
-			Unbind();
+		//	Bind(radius_texID, radius_buffID, numSamplesMag, n_pixels);
+		//		findOptimalRadius();
+		//	Unbind();
 //THIS IS EXPERIMENTAL
 //				cL.push_back(curSeed);
@@ -1593,17 +1607,17 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 					 ds[0], ds[1], ds[2],
 					 ups[0], ups[1], ups[2]);
 				///Set the look at distance
-				sk.Render();	///Render the network
-//				nt.Render();								
+//				sk.Render();	///Render the network
+				nt.Render();								
 				CHECK_OPENGL_ERROR
-				glLoadName((int) sk.numL());		///Load all the names
-//				glLoadName(nt.sizeE());
+//				glLoadName((int) sk.numL());		///Load all the names
+				glLoadName(nt.sizeE());
-				sk.RenderLine(cL);			///Render the current line.
-//				nt.RenderLine(cL);	
+//				sk.RenderLine(cL);			///Render the current line.
+				nt.RenderLine(cL);	
 //				glPopName();
 				glFlush();				///Flush the buffer
@@ -1654,55 +1668,51 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 			cM.clear();
 		}
-/*
+
 		void
-		addToNetwork(pair<stim::fiber<float>, int> in, stim::vec3<float> spos, 
-				stim::vec<float> smag, stim::vec3<float> sdir)
+		addToNetwork(std::vector<stim::vec3<float> > L, std::vector<float > M, stim::vec3<float> spos, stim::vec3<float> sdir, float smag)
 		{
-			#ifdef TIMING
-				 double s = std::clock();
-			#endif
-			
-                        std::vector<stim::vec3<float> > ce = in.first.centerline();                
-                        std::vector<stim::vec<float> > cm = in.first.centerlinemag();
 			//if the fiber is longer than 2 steps (the number it takes to diverge)
-			if(ce.size() > 3)
+			if(L.size() > 3)
 			{	
 				//if we did not hit a fiber
-				if(in.second == -1)
+				if(last_fiber == -1)
 				{
-					spos[0] = spos[0]-sdir[0]*smag[0]/2.;
-					spos[1] = spos[1]-sdir[1]*smag[0]/2.;
-					spos[2] = spos[2]-sdir[2]*smag[0]/2.;
-					int h = selectObject(spos, -sdir, smag[0]);
+					spos[0] = spos[0]-sdir[0]*smag;
+					spos[1] = spos[1]-sdir[1]*smag;
+					spos[2] = spos[2]-sdir[2]*smag;
+					int h = selectObject(spos, -sdir, smag);
 					//did we start with a fiber?
 					if(h != -1 && h < nt.sizeE())
-						nt.addEdge(ce, cm, h, -1);
+						nt.addEdge(L, M, h, -1);
 					else
-						nt.addEdge(ce, cm, -1, -1);
+						nt.addEdge(L, M, -1, -1);
 				}
 				//if we hit a fiber?
-				else if(in.second != -1)
+				else if(last_fiber != -1)
 				{
-					nt.addEdge(ce,cm,-1, in.second);
-					spos[0] = spos[0]-sdir[0]*smag[0]/2.;
-					spos[1] = spos[1]-sdir[1]*smag[0]/2.;
-					spos[2] = spos[2]-sdir[2]*smag[0]/2.;
-					int h = selectObject(spos, -sdir, smag[0]);
+					nt.addEdge(L, M, -1, last_fiber);
+					spos[0] = spos[0]-sdir[0]*smag;
+					spos[1] = spos[1]-sdir[1]*smag;
+					spos[2] = spos[2]-sdir[2]*smag;
+					int h = selectObject(spos, -sdir, smag);
 					//did start with a fiber?
 					if(h != -1 && h < nt.sizeE()){	
 			//			std::cout << "got here double" << smag.str() << std::endl;
-						nt.addEdge(ce,cm, h, in.second);	
-					} else { nt.addEdge(ce,cm, -1, -1);}
+						nt.addEdge(L, M, h, last_fiber);	
+					}
+					else
+					{
+					 nt.addEdge(L, M, -1, -1);
+					}
 				}
 			}		
-			#ifdef TIMING
-				double nt = (std::clock() - s) / (double) CLOCKS_PER_SEC;
-				network_time += nt * 1000.0;
+			#ifdef DEBUG
+				iter++;
 			#endif
 		}
-*/
+/*
 		void
 		addToNetwork(std::vector<stim::vec3<float> > L, std::vector<float > M)
 		{
@@ -1722,7 +1732,7 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 				#endif
 			}
 		}
-
+*/
 		void
 		printSizes()
@@ -1735,22 +1745,31 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 		traceLine(stim::vec3<float> pos, float mag, int min_cost)
 		{
 			//starting (seed) position and magnitude.
+			last_fiber = -1;
+			cL.clear();
+			cM.clear();
+			cD.clear();
+
 			stim::vec3<float> spos = getPosition();
+			stim::vec3<float> sdir = getDirection();
 			float smag = getMagnitude();
-			stim::vec3<float> sdir = getDirection();	
-//			Bind();
-//			sk.Begin(stim::OBJ_LINE);
+			setPosition(pos);
+			setMagnitude(mag);
+			cL.push_back(p);
+			cD.push_back(d);
+			cM.push_back(m);
+//			stim::vec3<float> spos = getPosition();
+//			float smag = getMagnitude();
+//			stim::vec3<float> sdir = getDirection();	
-			sk.createFromSelf(GL_SELECT);
-//			nt.createFromSelf(GL_SELECT);
+//			Bind();
+//			sk.Begin(stim::OBJ_LINE);
-			cL.push_back(pos);
-			cM.push_back(mag);
-//			setPosition(pos);
-//			setMagnitude(mag);
+			//sk.createFromSelf(GL_SELECT);
+			nt.createFromSelf(GL_SELECT);
 			int h;
 			bool started = false;
 			bool running = true;
@@ -1761,7 +1780,7 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 				if (cost > min_cost){
 					running = false;
 					branchDetection2();
-					addToNetwork(cL, cM);
+					addToNetwork(cL, cM, spos, sdir, smag);
 					#ifdef DEBUG
 					std::cerr << "the cost of " << cost << " > " << min_cost << std::endl;
 					#endif
@@ -1769,13 +1788,14 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 				} else {
 					//Have we found the edge of the map?
 					pos = getPosition();
-					if(pos[0] > size[0] || pos[1] > size[1]
-					 || pos[2] > size[2] || pos[0] < 0
-					 || pos[1] < 0 || pos[2] < 0)
+					if(p[0] > size[0] || p[1] > size[1]
+					 || p[2] > size[2] || p[0] < 0
+					 || p[1] < 0 || p[2] < 0)
 					{
 						running = false;
 						branchDetection2();
-						addToNetwork(cL, cM);
+				//		addToNetwork(cL, cM);
+						addToNetwork(cL, cM, spos, sdir, smag);
 						#ifdef DEBUG
 						std::cerr << "I hit and edge" << std::endl;
 						#endif
@@ -1790,10 +1810,11 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 					}
 					//Has the template size gotten unreasonable?
 					mag = getMagnitude();
-					if(mag > 75 || mag < 1){
+					if(m > 75 || m < 1){
 						running = false;
 						branchDetection2();
-						addToNetwork(cL, cM);
+				//		addToNetwork(cL, cM);
+						addToNetwork(cL, cM, spos, sdir, smag);
 						#ifdef DEBUG
 						std::cerr << "The templates are too big" << std::endl;
 						#endif
@@ -1807,13 +1828,16 @@ class gl_spider // : public virtual gl_texture&lt;T&gt;
 							#ifdef DEBUG
 							std::cerr << "I hit the fiber " << h << std::endl;
 							#endif
+							last_fiber = h;
 							running = false;
 							branchDetection2();
-							addToNetwork(cL, cM);
+						//	addToNetwork(cL, cM);
+							addToNetwork(cL, cM, spos, sdir, smag);
 							break;
 						}
 						else {  
-							cL.push_back(stim::vec3<float>(p[0], p[1],p[2]));
+							cL.push_back(p);
+							cD.push_back(d);
 							cM.push_back(m);
 //							Unbind();
 							CHECK_OPENGL_ERROR
@@ -30,9 +30,9 @@ class gl_texture : public virtual image_stack&lt;T, F&gt;
 		GLenum cpu_type;
 		GLenum gpu_type;
 		GLenum format;					//format for the texture (GL_RGBA, GL_LUMINANCE, etc.)
-		using image_stack<T>::R;
+		using image_stack<T,F>::R;
 		//using image_stack<T>::S;
-		using image_stack<T>::ptr;
+		using image_stack<T,F>::ptr;
 		///	Sets the internal texture_type, based on the data dimensions
 		void setTextureType(){
@@ -247,7 +247,7 @@ class gl_texture : public virtual image_stack&lt;T, F&gt;
 		}
 		///returns the dimentions of the data in the x, y, z directions. 
-		vec<int> getSize(){
+		stim::vec<int> getSize(){
 			stim::vec<int> size(R[1], R[2], R[3]);
 			return size;
 		}
@@ -282,7 +282,7 @@ class gl_texture : public virtual image_stack&lt;T, F&gt;
 		///@param file_mask specifies the file(s) to be loaded
 		///	Sets the path and calls the loader on that path.
 		void load_images(std::string file_mask){
-			image_stack<T>::load_images(file_mask);				//load images
+			image_stack<T, F>::load_images(file_mask);				//load images
 			guess_parameters();
 		}
@@ -292,13 +292,18 @@ class gl_texture : public virtual image_stack&lt;T, F&gt;
 			return texID;
 		}
-		
+	
 		T* getData(){
 			return ptr;
 		}
-		
-		
+	
+		void setData(T* rts)
+		{
+			//No-op placeholder: the body is empty, so rts is neither stored nor read.
+		}
+
 	};
+
 }
@@ -17,13 +17,13 @@ class image_stack : public virtual stim::grid&lt;T, 4, F&gt;{
 protected:
 	//using stim::grid<T, 4>::S;
-	using stim::grid<T, 4>::R;
-	using stim::grid<T, 4>::ptr;
-	using stim::grid<T, 4>::read;
+	using stim::grid<T, 4, F>::R;
+	using stim::grid<T, 4, F>::ptr;
+	using stim::grid<T, 4, F>::read;
 public:
 	//default constructor
-	image_stack() : grid<T, 4>() {
+	image_stack() : grid<T, 4, F>() {
 	}
@@ -113,7 +113,8 @@ public:
 	/// @param i is the page to be saved
 	void save_image(std::string file_name, unsigned int i){		
 		stim::image<T> I;											//create an image		
-		I.set_interleaved_rgb(&ptr[ i * R[0] * R[1] * R[2] ], R[1], R[2], R[0]);	//retrieve the interlaced data from the image - store it in the grid
+		I.set_interleaved(&ptr[ i * R[0] * R[1] * R[2] ], R[1], R[2], R[0]);	//retrieve the interlaced data from the image - store it in the grid
+//		I.set_interleaved_rgb(&ptr[ i * R[0] * R[1] * R[2] ], R[1], R[2], R[0]);	//retrieve the interlaced data from the image - store it in the grid
 		I.save(file_name);
 	}
+#ifndef STIM_CUDA_GAUSS3_H
+#define STIM_CUDA_GAUSS3_H
+#include <stim/math/filters/sepconv3.h>
+#include <stim/math/filters/gauss2.h>
+#include <stim/math/constants.h>
+
+namespace stim
+{
+	///Perform a 3D gaussian convolution on an input image.
+	///@param in is a pointer to the input data.
+	///@param dimx is the size of in* in the x direction.
+	///@param dimy is the size of in* in the y direction.
+	///@param dimz is the size of in* in the z direction.
+	///@param stdx is the standard deviation (in pixels) along the x axis.
+	///@param stdy is the standard deviation (in pixels) along the y axis.
+	///@param stdz is the standard deviation (in pixels) along the z axis.
+	///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel.
+	///NOTE(review): the convolved volume is written to a temporary buffer and the signature has no
+	///	output parameter, so the result never reaches the caller -- confirm how it should be returned.
+	template<typename T, typename K>
+	void cpu_gauss3(T* in, K dimx, K dimy, K dimz, K stdx, K stdy, K stdz, size_t nstds = 3)
+	{
+		//Set up the sizes of the gaussian Kernels.
+		size_t kx = stdx * nstds * 2;
+		size_t ky = stdy * nstds * 2;
+		size_t kz = stdz * nstds * 2;
+
+		//Set up the sizes of the new output, which will be kx, ky, kz, smaller than the input.
+		size_t X = dimx - kx +1;
+		size_t Y = dimy - ky +1;
+		size_t Z = dimz - kz +1;
+		T* out = (T*) malloc(X*Y*Z* sizeof(T));
+
+		///Set up the memory that will store the gaussians
+		K* gaussx = (K*)malloc(kx *sizeof(K));
+		K* gaussy = (K*)malloc(ky *sizeof(K));
+		K* gaussz = (K*)malloc(kz *sizeof(K));
+
+		///Set up the midpoints of the gaussians.
+		K midgaussx = (K) kx/ (K)2;
+		K midgaussy = (K) ky/ (K)2;
+		K midgaussz = (K) kz/ (K)2;
+
+		///Evaluate the kernels in each cardinal direction.
+		///Each loop is bounded by the length of its own kernel: reusing kx for all three
+		///would overrun (or only partially fill) gaussy and gaussz whenever the sizes differ.
+		for(size_t i = 0; i < kx; i++)
+			gaussx[i] = gauss1d((K) i, midgaussx, stdx);
+
+		for(size_t i = 0; i < ky; i++)
+			gaussy[i] = gauss1d((K) i, midgaussy, stdy);
+
+		for(size_t i = 0; i < kz; i++)
+			gaussz[i] = gauss1d((K) i, midgaussz, stdz);
+
+		cpu_sepconv3(out, in, gaussx, gaussy, gaussz, dimx, dimy, dimz, kx, ky, kz);
+
+		///Release the temporary buffers allocated above.
+		free(gaussx);
+		free(gaussy);
+		free(gaussz);
+		free(out);
+	}
+}
+#endif
@@ -86,4 +86,4 @@ namespace stim {
 	}
 }
-#endif
 \ No newline at end of file
+#endif
+#ifndef STIM_CUDA_SEPCONV3_H
+#define STIM_CUDA_SEPCONV3_H
+
+#include <stim/math/filters/conv2.h>
+#include <stim/math/filters/sepconv2.h>
+#ifdef __CUDACC__
+	#include <stim/cuda/cudatools.h>
+	#include <stim/cuda/sharedmem.cuh>
+#endif
+
+namespace stim
+{
+#ifdef __CUDACC__
+	///Perform a separable 3D convolution using buffers handed in by the caller
+	///	(cpu_sepconv3 passes cudaMalloc'ed device buffers).
+	///	Every z-slice is first filtered with gpu_sepconv2 using k0 and k1; the intermediate stack is
+	///	then passed to kernel_conv2 as an (X*Y) x dimz image with a 1 x kz kernel.
+	///@param out receives the result of the final kernel_conv2 pass
+	///@param in is the input volume (dimx * dimy * dimz elements)
+	///@param k0 is the x-axis convolution filter
+	///@param k1 is the y-axis convolution filter
+	///@param k2 is the z-axis convolution filter
+	///@param dimx is the size of the input volume along X
+	///@param dimy is the size of the input volume along Y
+	///@param dimz is the size of the input volume along Z
+	///@param kx is the size of the kernel along X
+	///@param ky is the size of the kernel along Y
+	///@param kz is the size of the kernel along Z
+	template<typename T, typename K>
+	void gpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
+	{
+		//size of one slice after the x/y passes
+		size_t X = dimx - kx + 1;
+		size_t Y = dimy - ky + 1;
+
+		T* temp_out;
+		HANDLE_ERROR(cudaMalloc(&temp_out, X*Y*dimz*sizeof(T)));
+
+		//filter each z-slice along x and y; slice i starts exactly i whole slices into each buffer
+		//(size_t indices keep the offsets from overflowing an int on large volumes)
+		for(size_t i = 0; i < dimz; i++)
+		{
+			size_t idx_IN	= (dimx*dimy)*i;
+			size_t idx_OUT	= (X*Y)*i;
+			gpu_sepconv2(&temp_out[idx_OUT], &in[idx_IN], k0, k1, dimx, dimy, kx, ky);
+		}
+
+		cudaDeviceProp p;
+		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
+		size_t tmax = p.maxThreadsPerBlock;
+
+		dim3 numThreads(sqrt(tmax), sqrt(tmax));
+		dim3 numBlocks(X*Y/numThreads.x +1, dimz/numThreads.y + 1);
+		size_t sharedMem = (numThreads.x + kz - 1) * numThreads.y * sizeof(T);
+		if(sharedMem > p.sharedMemPerBlock)
+		{
+			std::cout << "Error in stim::gpu_sepconv3() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<< numBlocks, numThreads, sharedMem >>> (out, temp_out, k2, X*Y, dimz, 1, kz);
+		HANDLE_ERROR(cudaGetLastError());		//a rejected launch configuration is otherwise silent
+		HANDLE_ERROR(cudaFree(temp_out));
+	}
+#endif
+
+	//Performs a separable convolution of a 3D image. Only valid pixels based on the kernel are returned.
+	//      As a result, the output image will be smaller than the input image by (kx-1, ky-1, kz-1)
+	//      When compiled with nvcc (__CUDACC__) the data is copied to the GPU and gpu_sepconv3 is used;
+	//      otherwise the three passes are run on the host with cpu_conv2.
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param k0 is the x-axis convolution filter
+	//@param k1 is the y-axis convolution filter
+	//@param k2 is the z-axis convolution filter
+	//@param dimx is the size of the input image along X
+	//@param dimy is the size of the input image along Y
+	//@param dimz is the size of the input image along Z
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	//@param kz is the size of the kernel along Z
+
+	template <typename T, typename K>
+	void cpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
+	{
+		//Set up the sizes of the new output, which will be kx, ky, kz, smaller than the input.
+		size_t X = dimx - kx + 1;
+		size_t Y = dimy - ky + 1;
+
+#ifdef __CUDACC__
+		size_t Z = dimz - kz + 1;
+
+		///Set up all of the memory on the GPU
+		T* gpu_in;
+		HANDLE_ERROR(cudaMalloc(&gpu_in, dimx*dimy*dimz*sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(gpu_in, in, dimx*dimy*dimz*sizeof(T),cudaMemcpyHostToDevice));
+		K* gpu_kx;
+		HANDLE_ERROR(cudaMalloc(&gpu_kx, kx*sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_kx, k0, kx*sizeof(K),cudaMemcpyHostToDevice));
+		K* gpu_ky;
+		HANDLE_ERROR(cudaMalloc(&gpu_ky, ky*sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_ky, k1, ky*sizeof(K),cudaMemcpyHostToDevice));
+		K* gpu_kz;
+		HANDLE_ERROR(cudaMalloc(&gpu_kz, kz*sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_kz, k2, kz*sizeof(K),cudaMemcpyHostToDevice));
+		T* gpu_out;
+		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * Z*sizeof(T)));
+
+		///run the kernel
+		gpu_sepconv3(gpu_out, gpu_in, gpu_kx, gpu_ky, gpu_kz, dimx, dimy, dimz, kx, ky, kz);
+
+		///Copy the output
+		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X*Y*Z*sizeof(T), cudaMemcpyDeviceToHost));
+
+		///Free all the memory used.
+		HANDLE_ERROR(cudaFree(gpu_in));
+		HANDLE_ERROR(cudaFree(gpu_kx));
+		HANDLE_ERROR(cudaFree(gpu_ky));
+		HANDLE_ERROR(cudaFree(gpu_kz));
+		HANDLE_ERROR(cudaFree(gpu_out));
+#else
+		T* temp = (T*) malloc(X * dimy * sizeof(T));		//scratch: one slice after the x pass
+		T* temp3 = (T*) malloc(X * Y * dimz * sizeof(T));	//every slice after the x and y passes
+
+		//x and y passes, one z-slice at a time; slice i starts exactly i whole slices into each buffer
+		for(size_t i = 0; i < dimz; i++)
+		{
+			size_t idx_IN	= (dimx*dimy)*i;
+			size_t idx_OUT	= (X*Y)*i;
+			cpu_conv2(temp, &in[idx_IN], k0, dimx, dimy, kx, 1);
+			cpu_conv2(&temp3[idx_OUT], temp, k1, X, dimy, 1, ky);
+		}
+
+		//z pass over the full stack (temp3), handed to cpu_conv2 as an (X*Y) x dimz image;
+		//temp only ever holds a single x-filtered slice, so it cannot be the input here
+		cpu_conv2(out, temp3, k2, X*Y, dimz, 1, kz);
+		free(temp);
+		free(temp3);
+#endif
+	}
+}
+
+
+#endif
@@ -68,7 +68,7 @@ public:
 	}
-	/// Convert the vector from cartesian to spherical coordinates (x, y, z -> r, theta, phi where theta = [0, 2*pi])
+	/// Convert the vector from cartesian to spherical coordinates (x, y, z -> r, theta, phi where theta = [-PI, PI])
 	CUDA_CALLABLE vec3<T> cart2sph() const{
 		vec3<T> sph;
 		sph.ptr[0] = len();
@@ -236,6 +236,13 @@ public:
 		return result;
 	}
+	/// Compare two vectors for exact equality of all three components
+	CUDA_CALLABLE bool operator==(vec3<T> rhs) const{
+		return rhs[0] == ptr[0] && rhs[1] == ptr[1] && rhs[2] == ptr[2];
+	}
+
 //#ifndef __NVCC__
 	/// Outputs the vector as a string
 	std::string str() const{