#ifndef STIM_CUDA_SHAREDMEM_H
#define STIM_CUDA_SHAREDMEM_H

namespace stim{
	namespace cuda{

		/// Cooperatively copies an X-by-Y window of a 2D texture into a dense,
		/// row-major buffer (intended to be a shared-memory tile, per the name).
		///
		/// The work is split across the threads described by the threadIdx /
		/// blockDim arguments: a thread writes every element (sx, sy) with
		/// sx = threadIdx.x + i * blockDim.x and sy = threadIdx.y + j * blockDim.y
		/// that lies inside the window, so together the threads of a 2D block
		/// cover each element exactly once.
		///
		/// This function does NOT synchronize. When dest is shared memory, the
		/// caller must issue __syncthreads() after the call and before any thread
		/// reads an element that another thread wrote.
		///
		/// @tparam T        element type fetched from the texture and stored in dest
		/// @param dest      destination buffer holding at least X * Y elements;
		///                  element (sx, sy) is stored at dest[sy * X + sx]
		/// @param src       texture object that is sampled
		/// @param x         texture x coordinate of the window's first column
		/// @param y         texture y coordinate of the window's first row
		/// @param X         window width in elements (also the row pitch of dest)
		/// @param Y         window height in elements
		/// @param threadIdx index of the calling thread within its block; this
		///                  parameter shadows the CUDA built-in of the same name
		/// @param blockDim  block dimensions used as the copy stride; shadows the
		///                  CUDA built-in. blockDim.x and blockDim.y must be non-zero.
		///
		/// NOTE(review): integer texel coordinates are passed straight to tex2D<T>,
		/// which presumably suits unnormalized coordinates with point filtering;
		/// confirm against how the texture object is created.
		template<typename T>
		__device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
			unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
			dim3 threadIdx, dim3 blockDim){

			//stride over the window one block-sized step at a time; the loop
			//bounds keep every access inside the X-by-Y window, so no separate
			//range check or iteration count is needed
			for(unsigned int sx = threadIdx.x; sx < X; sx += blockDim.x){
				for(unsigned int sy = threadIdx.y; sy < Y; sy += blockDim.y){

					//calculate the index into the texture
					unsigned int tx = x + sx;
					unsigned int ty = y + sy;

					//perform the copy
					dest[sy * X + sx] = tex2D<T>(src, tx, ty);
				}
			}
		}
	}
}

#endif