cpp/cpyToshare.cuh

#ifndef STIM_CUDA_cpyToshare_H
#define STIM_CUDA_cpyToshare_H
		//Cooperatively copies a contiguous run of single-channel elements from global memory (src)
		//into shared memory (dest). Every thread of the block must call this function.
		//
		//	dest		destination buffer; element k of the run is written to dest[k]
		//	src			source image, linearized as z*I_x*I_y + y*I_x + x
		//	x, y, z		source position of the first element of the run
		//	X, Y		extent of the copy: the number of passes is derived from X, and no element
		//				with index >= X*Y is ever copied
		//	threadIdx	index of the calling thread inside its block
		//	blockDim	dimensions of the block
		//	I_x, I_y	width and height of the source image, used only to linearize (x, y, z)
		//
		//The source index is NOT checked against the image bounds, and no barrier is issued here:
		//the caller has to __syncthreads() before reading elements written by other threads.
		//
		//NOTE(review): the pass count uses X while the guard uses X*Y. When X*Y is larger than
		//(X/tThreads + 1)*tThreads only the first (X/tThreads + 1)*tThreads elements are copied.
		//That behaviour is preserved here -- confirm the intended extent against the callers.
		template<typename T>
		__device__ void cpyG2S1D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
											  dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
			//number of threads taking part in the copy
			const unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;

			//flattened (1D) index of this thread inside the block
			const unsigned int ti = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

			//number of passes; each pass moves at most tThreads elements
			const unsigned int I = X/tThreads + 1;

			//no element at or beyond this index is copied
			const unsigned int count = X * Y;

			//linear offset of (x, y, z) in the source; evaluated in 64 bits because
			//z*I_x*I_y wraps around in 32-bit arithmetic for large volumes
			const unsigned long long gstart = ((unsigned long long)z * I_y + y) * I_x + x;

			for (unsigned int i = 0; i < I; i++){

				//pass i handles elements [i*tThreads, (i+1)*tThreads); this thread takes slot ti
				const unsigned int sIdx = i * tThreads + ti;
				if (sIdx >= count) return;

				//copy global to share
				dest[sIdx] = src[gstart + sIdx];
			}
		}
		//Cooperatively copies an X-by-Y tile of a single-channel image from global memory (src)
		//into shared memory (dest). Every thread of the block must call this function.
		//
		//	dest		destination tile, stored row by row with a row length of X elements
		//	src			source image, linearized as z*I_x*I_y + y*I_x + x
		//	x, y, z		source position of the tile's first element
		//	X, Y		width and height of the tile in elements
		//	threadIdx	index of the calling thread inside its block
		//	blockDim	dimensions of the block
		//	I_x, I_y	width and height of the source image
		//
		//Tile elements whose source position falls outside [0, I_x) x [0, I_y) are skipped, so the
		//matching dest entries keep whatever value they had before the call.
		//No barrier is issued here: the caller has to __syncthreads() before reading elements
		//written by other threads.
		template<typename T>
		__device__ void cpyG2S2D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
											  dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
			//number of threads taking part in the copy
			const unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;

			//flattened (1D) index of this thread inside the block
			const unsigned int ti = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

			//number of elements in the tile
			const unsigned int count = X * Y;

			//offset of slice z in the source; evaluated in 64 bits because
			//z*I_x*I_y wraps around in 32-bit arithmetic for large volumes
			const unsigned long long gz1 = (unsigned long long)z * I_x * I_y;

			//block-stride loop: this thread handles tile elements ti, ti + tThreads, ti + 2*tThreads, ...
			for (unsigned int sIdx = ti; sIdx < count; sIdx += tThreads){

				//row and column of the element inside the tile
				const unsigned int sy = sIdx / X;
				const unsigned int sx = sIdx - sy * X;

				//matching position in the source image
				const unsigned int gx = x + sx;
				const unsigned int gy = y + sy;

				//only positions inside the image are read
				if (gx < I_x && gy < I_y){
					//copy global to share
					dest[sIdx] = src[gz1 + (unsigned long long)gy * I_x + gx];
				}
			}
		}
		//Cooperatively copies a contiguous run of values from a three-channel image in global memory
		//(src) into shared memory (dest). Every thread of the block must call this function.
		//The source stores three consecutive values per pixel; the run starts at the first value
		//of pixel (x, y, z), i.e. at index 3*(z*I_x*I_y + y*I_x + x).
		//
		//	dest		destination buffer; value k of the run is written to dest[k]
		//	src			three-values-per-pixel source image
		//	x, y, z		pixel position where the run starts
		//	X, Y		extent of the copy, counted in individual values: the number of passes is
		//				derived from X, and no value with index >= X*Y is ever copied
		//	threadIdx	index of the calling thread inside its block
		//	blockDim	dimensions of the block
		//	I_x, I_y	width and height of the source image in pixels, used only to linearize (x, y, z)
		//
		//The source index is NOT checked against the image bounds, and no barrier is issued here:
		//the caller has to __syncthreads() before reading values written by other threads.
		//
		//NOTE(review): the pass count uses X while the guard uses X*Y. When X*Y is larger than
		//(X/tThreads + 1)*tThreads only the first (X/tThreads + 1)*tThreads values are copied.
		//That behaviour is preserved here -- confirm the intended extent against the callers.
		template<typename T>
		__device__ void cpyG2S1D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
											  dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
			//number of threads taking part in the copy
			const unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;

			//flattened (1D) index of this thread inside the block
			const unsigned int ti = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

			//number of passes; each pass moves at most tThreads values
			const unsigned int I = X/tThreads + 1;

			//no value at or beyond this index is copied
			const unsigned int count = X * Y;

			//index of the first value of pixel (x, y, z); evaluated in 64 bits because the
			//pixel offset times three wraps around in 32-bit arithmetic for large volumes
			const unsigned long long gstart3 = (((unsigned long long)z * I_y + y) * I_x + x) * 3;

			for (unsigned int i = 0; i < I; i++){

				//pass i handles values [i*tThreads, (i+1)*tThreads); this thread takes slot ti
				const unsigned int sIdx = i * tThreads + ti;
				if (sIdx >= count) return;

				//copy global to share
				dest[sIdx] = src[gstart3 + sIdx];
			}
		}
		//Cooperatively copies a tile of a three-channel image from global memory (src) into shared
		//memory (dest). Every thread of the block must call this function.
		//The source stores three consecutive values per pixel.
		//
		//	dest		destination tile, stored row by row with a row length of X values
		//	src			three-values-per-pixel source image
		//	x, y, z		pixel position of the tile's first pixel
		//	X			row length of the tile counted in individual values (value sx of a row
		//				belongs to pixel sx/3, channel sx%3)
		//	Y			number of rows in the tile
		//	threadIdx	index of the calling thread inside its block
		//	blockDim	dimensions of the block
		//	I_x, I_y	width and height of the source image in pixels
		//
		//Values whose pixel falls outside [0, I_x) x [0, I_y) are skipped, so the matching dest
		//entries keep whatever value they had before the call.
		//No barrier is issued here: the caller has to __syncthreads() before reading values
		//written by other threads.
		template<typename T>
		__device__ void cpyG2S2D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
											  dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){
			//number of threads taking part in the copy
			const unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;

			//flattened (1D) index of this thread inside the block
			const unsigned int ti = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

			//number of values in the tile
			const unsigned int count = X * Y;

			//pixel offset of slice z in the source; evaluated in 64 bits because the pixel
			//offset (and three times it) wraps around in 32-bit arithmetic for large volumes
			const unsigned long long gz1 = (unsigned long long)z * I_x * I_y;

			//block-stride loop: this thread handles tile values ti, ti + tThreads, ti + 2*tThreads, ...
			for (unsigned int sIdx = ti; sIdx < count; sIdx += tThreads){

				//row and position-in-row of the value inside the tile
				const unsigned int sy = sIdx / X;
				const unsigned int sx = sIdx - sy * X;

				//pixel the value belongs to and its channel within that pixel
				const unsigned int gx = x + sx / 3;
				const unsigned int gy = y + sy;
				const unsigned int ch = sx % 3;

				//only pixels inside the image are read
				if (gx < I_x && gy < I_y){
					const unsigned long long gIdx = (gz1 + (unsigned long long)gy * I_x + gx) * 3 + ch;
					//copy global to share
					dest[sIdx] = src[gIdx];
				}
			}
		}
		//Computes the magnitude of X*Y three-component gradient vectors held in grad and writes
		//the results back into the same array. Every thread of the block must call this function.
		//
		//	grad		array holding the vectors as consecutive (gx, gy, gz) triples starting at
		//				index 0; the magnitude of vector k is written to grad[bs + k]
		//	bs			offset in grad at which the magnitudes are stored
		//				(presumably >= 3*X*Y so that results never overwrite inputs -- confirm with callers)
		//	X, Y		only their product is used: the number of vectors to process
		//	threadIdx	index of the calling thread inside its block
		//	blockDim	dimensions of the block
		//
		//The arithmetic is carried out in float regardless of T. No barrier is issued here.
		template<typename T>
		__device__ void mag_share2D(T* grad, unsigned int bs, unsigned int X, unsigned int Y, dim3 threadIdx, dim3 blockDim){

			//how many threads share the work
			const unsigned int nThreads = blockDim.x * blockDim.y * blockDim.z;
			//position of this thread in the flattened block
			const unsigned int tid = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;
			//total number of vectors
			const unsigned int nVectors = X * Y;

			//this thread handles vectors tid, tid + nThreads, tid + 2*nThreads, ...
			for (unsigned int k = tid; k < nVectors; k += nThreads){

				//the three components of vector k sit next to each other
				const unsigned int base = k * 3;
				float dx = grad[base];
				float dy = grad[base + 1];
				float dz = grad[base + 2];

				//Euclidean length of the vector
				float len = sqrt(dx*dx + dy*dy + dz*dz);
				grad[bs + k] = len;
			}
		}
#endif