#ifndef STIM_CUDA_SHAREDMEM_H
#define STIM_CUDA_SHAREDMEM_H

namespace stim{
	namespace cuda{

		/// Cooperative copy of an X-by-Y window of a 2D texture into a linear array.
		/// Each calling thread copies the texels whose window coordinates are
		/// (threadIdx.x + i * blockDim.x, threadIdx.y + j * blockDim.y), so the threads
		/// described by blockDim together cover the whole window.
		///
		/// This function contains no barrier. If dest is memory shared between threads
		/// (as the function name suggests), the caller must synchronize (e.g. __syncthreads())
		/// before reading values written by other threads.

		/// @param dest is a linear destination array of at least X * Y elements (row-major, row length X)
		/// @param src is the texture object that is sampled with tex2D<T>
		/// @param x is the x texture coordinate of the first texel in the window
		/// @param y is the y texture coordinate of the first texel in the window
		/// @param X is the width of the window in elements
		/// @param Y is the height of the window in elements
		/// @param threadIdx is the 2D index of the calling thread (shadows the CUDA built-in of the same name)
		/// @param blockDim is the 2D stride between successive elements handled by one thread (shadows the CUDA built-in)
		template<typename T>
		__device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
										   unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
										   dim3 threadIdx, dim3 blockDim){

			if(blockDim.x == 0 || blockDim.y == 0) return;				//a zero stride would never terminate

			//stride over the window; the loop conditions double as the bounds check
			for(unsigned int sx = threadIdx.x; sx < X; sx += blockDim.x){
				for(unsigned int sy = threadIdx.y; sy < Y; sy += blockDim.y){
					//(sx, sy) indexes the destination window, (x + sx, y + sy) indexes the texture
					//the integer coordinates are converted to float by tex2D
					dest[sy * X + sx] = tex2D<T>(src, x + sx, y + sy);
				}
			}
		}

		/// Threaded copying of data on a CUDA device.
		/// The calling thread copies bytes tid, tid + nt, tid + 2 * nt, ... below N, so
		/// nt threads with ids 0..nt-1 together copy all N bytes. No synchronization is performed.

		/// @param dest is the destination array of at least N bytes
		/// @param src is the source array of at least N bytes
		/// @param N is the number of bytes to copy
		/// @param tid is a 1D id for the current thread
		/// @param nt is the number of threads taking part in the copy
		//	(declared inline because this non-template function is defined in a header)
		__device__ inline void threadedMemcpy(char* dest, char* src, size_t N, size_t tid, size_t nt){
			if(nt == 0) return;											//a zero stride would never terminate

			for(size_t xi = tid; xi < N; xi += nt)						//for each byte assigned to this thread
				dest[xi] = src[xi];										//perform the copy
		}

		/// Threaded copying of 2D data on a CUDA device.
		/// The calling thread copies destination elements tid, tid + nt, tid + 2 * nt, ...
		/// below nx * ny. No synchronization is performed, and the source position is not
		/// checked against sX or sY: the caller has to make sure that the requested region
		/// lies inside src.

		/// @param dest is a linear destination array of size nx * ny
		/// @param nx is the size of the region to be copied along the X dimension (in elements)
		/// @param ny is the size of the region to be copied along the Y dimension (in elements)
		/// @param src is a 2D image stored as a linear array with a pitch of sX elements
		/// @param x is the x position in the source image where the copy is started
		/// @param y is the y position in the source image where the copy is started
		/// @param sX is the number of elements (not bytes) in a row of src
		/// @param sY is not used by this function (presumably the number of rows in src - confirm with callers)
		/// @param tid is a 1D id for the current thread
		/// @param nt is the number of threads in the block
		template<typename T>
		__device__ void threadedMemcpy2D(T* dest, size_t nx, size_t ny,
										 T* src, size_t x, size_t y, size_t sX, size_t sY,
										 size_t tid, size_t nt){

			if(nt == 0) return;											//a zero stride would never terminate

			size_t vals = nx * ny;										//total number of elements to be copied

			//an empty region (nx == 0 or ny == 0) gives vals == 0, so the division below is never reached
			for(size_t dest_i = tid; dest_i < vals; dest_i += nt){		//for each element assigned to this thread
				size_t dest_y = dest_i / nx;							//row inside the copied region
				size_t dest_x = dest_i - dest_y * nx;					//column inside the copied region

				size_t src_i = (y + dest_y) * sX + (x + dest_x);		//linear index into the source image
				dest[dest_i] = src[src_i];
			}
		}
	}
}

#endif