#ifndef STIM_CUDA_SHAREDMEM_H
#define STIM_CUDA_SHAREDMEM_H

namespace stim{
	namespace cuda{

		/// Cooperatively copies an X-by-Y window of a 2D texture into a linear
		/// destination buffer (intended to be shared memory), spreading the work
		/// across the threads described by (threadIdx, blockDim).
		///
		/// The window element at (sx, sy) is fetched from texture coordinate
		/// (x + sx, y + sy) and stored at dest[sy * X + sx].
		///
		/// This function contains no barrier. The caller must issue
		/// __syncthreads() before any thread reads an element that a different
		/// thread wrote.
		///
		/// @param dest			destination buffer holding at least X * Y elements
		/// @param src			texture object to read from
		/// @param x			texture x-coordinate of the window origin
		/// @param y			texture y-coordinate of the window origin
		/// @param X			window width, in elements
		/// @param Y			window height, in elements
		/// @param threadIdx	index of the calling thread (x and y components are used)
		/// @param blockDim		number of cooperating threads along x and y (must be non-zero)
		template<typename T>
		__device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
										   unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
										   dim3 threadIdx, dim3 blockDim){

			//each thread starts at its own (x, y) index and advances by the block
			//	dimensions, so the whole window is covered without a wasted final
			//	pass when X or Y is an exact multiple of the block size
			for(unsigned int sy = threadIdx.y; sy < Y; sy += blockDim.y){

				T* row = dest + sy * X;						//start of this row in the destination
				unsigned int ty = y + sy;					//texture row to fetch from

				for(unsigned int sx = threadIdx.x; sx < X; sx += blockDim.x){
					unsigned int tx = x + sx;				//texture column to fetch from
					row[sx] = tex2D<T>(src, tx, ty);		//perform the copy
				}
			}
		}

		/// Cooperatively copies N elements from src to dest (intended to be a
		/// global-to-shared copy), with the work divided among nt threads.
		///
		/// Thread tid copies elements tid, tid + nt, tid + 2*nt, ... while the
		/// index is below N.
		///
		/// This function contains no barrier. The caller must issue
		/// __syncthreads() before any thread reads an element that a different
		/// thread wrote.
		///
		/// @param dest		destination buffer holding at least N elements
		/// @param src		source buffer holding at least N elements
		/// @param N		number of elements to copy
		/// @param tid		linear index of the calling thread, in [0, nt)
		/// @param nt		total number of cooperating threads (must be non-zero)
		template<typename T>
		__device__ void sharedMemcpy(T* dest, const T* src, size_t N, size_t tid, size_t nt){

			//strided loop: visits exactly the indices owned by this thread, so no
			//	separate iteration count or per-iteration bounds check is required
			for(size_t xi = tid; xi < N; xi += nt)
				dest[xi] = src[xi];							//perform the copy
		}
	}
}

#endif