// stim/cuda/sharedmem.cuh

#ifndef STIM_CUDA_SHAREDMEM_H
#define STIM_CUDA_SHAREDMEM_H
namespace stim{
	namespace cuda{
		// Cooperatively copies an X-by-Y window of a 2D texture object into a
		// densely packed, row-major destination buffer (row pitch == X elements).
		//
		// The work is split across the calling thread block: the thread at
		// (threadIdx.x, threadIdx.y) handles every window element (sx, sy) with
		//     sx = threadIdx.x + i * blockDim.x   (sx < X)
		//     sy = threadIdx.y + j * blockDim.y   (sy < Y)
		// so each element is written by exactly one thread of a 2D block.
		//
		// Parameters:
		//   dest      - destination buffer holding at least X*Y elements of T
		//               (intended to be a shared-memory tile, per the function name;
		//               nothing in this function requires that)
		//   src       - texture object to read from
		//   x, y      - texel coordinates of the window's first element
		//   X, Y      - window width and height, in elements
		//   threadIdx - thread index to use for the work split; NOTE: this parameter
		//               shadows the CUDA built-in of the same name inside this function,
		//               so the value passed by the caller is the one that is used
		//   blockDim  - block dimensions to use as the stride; shadows the built-in
		//               in the same way. blockDim.x and blockDim.y must be non-zero.
		//
		// This function contains no barrier. If threads go on to read elements that
		// were written by other threads, the caller must synchronize (for example
		// with __syncthreads()) after this call returns.
		//
		// NOTE(review): the integer texel coordinates are converted to float by the
		// tex2D call; this presumably expects a texture set up with unnormalized
		// coordinates and point filtering - confirm against the texture descriptor.
		template<typename T>
		__device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
										 unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
										 dim3 threadIdx, dim3 blockDim){
			//walk the rows owned by this thread, one block-height at a time;
			//the loop bound replaces both the iteration count and the row guard,
			//so no fully-masked extra pass runs when Y is a multiple of blockDim.y
			for(unsigned int sy = threadIdx.y; sy < Y; sy += blockDim.y){
				//row-invariant values are computed once per row
				const unsigned int rowStart = sy * X;	//offset of this row in dest
				const unsigned int ty = y + sy;			//texture row to fetch from
				//walk the columns owned by this thread, one block-width at a time
				for(unsigned int sx = threadIdx.x; sx < X; sx += blockDim.x){
					const unsigned int tx = x + sx;		//texture column to fetch from
					//perform the copy
					dest[rowStart + sx] = tex2D<T>(src, tx, ty);
				}
			}
		}
		
	}
}
#endif