// sharedmem.cuh

#ifndef STIM_CUDA_SHAREDMEM_H
#define STIM_CUDA_SHAREDMEM_H

namespace stim{
	namespace cuda{

		// Cooperatively copies an X-by-Y window of a 2D texture into a dense,
		// row-major buffer (intended to be shared memory, per the function name).
		//
		// Each calling thread copies the elements (sx, sy) with
		//     sx = threadIdx.x, threadIdx.x + blockDim.x, ...  (while sx < X)
		//     sy = threadIdx.y, threadIdx.y + blockDim.y, ...  (while sy < Y)
		// so the whole window is covered once every (threadIdx.x, threadIdx.y)
		// combination of the block has made the call.
		//
		//   dest      - destination buffer holding at least X*Y elements; element
		//               (sx, sy) is written to dest[sy * X + sx]
		//   src       - texture object to read from
		//   x, y      - texture coordinates of the window's first element
		//   X, Y      - width and height of the window, in elements
		//   threadIdx - index of the calling thread (only .x and .y are used)
		//   blockDim  - strides between elements handled by one thread
		//               (only .x and .y are used)
		//
		// Notes:
		//   * No barrier is issued here. The caller must __syncthreads() before any
		//     thread reads an element that a different thread copied.
		//   * The integer coordinates are passed to tex2D<T> unchanged (no half-texel
		//     offset is added).
		//   * Nothing is copied if blockDim.x or blockDim.y is zero.
		template<typename T>
		__device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
										 unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
										 dim3 threadIdx, dim3 blockDim){

			//a zero stride could never advance through the window
			if(blockDim.x == 0 || blockDim.y == 0) return;

			//walk the columns assigned to this thread
			for(unsigned int sx = threadIdx.x; sx < X; sx += blockDim.x){

				//texture column is the same for every row of this pass
				const unsigned int tx = x + sx;

				//walk the rows assigned to this thread
				for(unsigned int sy = threadIdx.y; sy < Y; sy += blockDim.y){

					//matching texture row
					const unsigned int ty = y + sy;

					//fetch the texel and store it in the row-major destination
					dest[sy * X + sx] = tex2D<T>(src, tx, ty);
				}
			}
		}

		// Cooperatively copies N elements from src to dest (intended for staging
		// data into shared memory, per the function name).
		//
		// The calling thread copies the elements with index
		//     tid, tid + nt, tid + 2*nt, ...  (while the index is < N)
		// so the whole range is covered once every tid in [0, nt) has made the call.
		//
		//   dest - destination buffer holding at least N elements
		//   src  - source buffer holding at least N elements (read only)
		//   N    - number of elements to copy
		//   tid  - index of the calling thread among the participating threads
		//   nt   - number of participating threads (stride between the elements
		//          handled by one thread)
		//
		// Notes:
		//   * No barrier is issued here. The caller must __syncthreads() before any
		//     thread reads an element that a different thread copied.
		//   * Nothing is copied if nt is zero.
		template<typename T>
		__device__ void sharedMemcpy(T* dest, const T* src, size_t N, size_t tid, size_t nt){

			//a zero stride could never advance through the range
			if(nt == 0) return;

			//start at this thread's index and step by the number of threads
			for(size_t i = tid; i < N; i += nt)
				dest[i] = src[i];
		}

		
	}
}


#endif