// sharedmem.cuh


namespace stim{
	namespace cuda{

		/// Cooperatively copies an X-by-Y window of a 2D texture into a linear array.
		/// Every thread of the block is expected to call this function; the thread at
		/// (threadIdx.x, threadIdx.y) copies the window elements (sx, sy) with
		/// sx = threadIdx.x + i * blockDim.x and sy = threadIdx.y + j * blockDim.y,
		/// so the threads of the block together cover the whole window.
		/// @param dest is the destination array of X * Y values, written as dest[sy * X + sx]
		///        (presumably shared memory, judging by the function name)
		/// @param src is the texture object that is read
		/// @param x is the texture x coordinate of the first element of the window
		/// @param y is the texture y coordinate of the first element of the window
		/// @param X is the width of the window, in elements
		/// @param Y is the height of the window, in elements
		/// @param threadIdx is the index of the calling thread (only .x and .y are used)
		/// @param blockDim is the size of the thread block (only .x and .y are used; both must be non-zero)
		///
		/// No barrier is issued here: the caller must synchronize (e.g. __syncthreads())
		/// before reading elements of dest that were written by other threads.
		/// NOTE(review): integer texel coordinates are handed to tex2D as-is, which assumes
		/// unnormalized coordinates with point filtering -- confirm against the texture setup.
		template<typename T>
		__device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
										 unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
										 dim3 threadIdx, dim3 blockDim){

			//stride across the window one block width at a time...
			for(unsigned int sx = threadIdx.x; sx < X; sx += blockDim.x){
				//...and down the window one block height at a time
				for(unsigned int sy = threadIdx.y; sy < Y; sy += blockDim.y){

					//(sx, sy) indexes the destination window, (x + sx, y + sy) indexes the texture
					dest[sy * X + sx] = tex2D<T>(src, x + sx, y + sy);
				}
			}
		}

		/// Threaded copying of data on a CUDA device.
		/// The calling thread copies bytes tid, tid + nt, tid + 2 * nt, ... of src to the
		/// same positions in dest, so nt callers with ids 0 .. nt-1 together copy all N bytes.
		/// @param dest is the destination buffer (at least N bytes)
		/// @param src is the source buffer (at least N bytes)
		/// @param N is the number of bytes to copy
		/// @param tid is a 1D id for the current thread
		/// @param nt is the number of threads taking part in the copy (must be non-zero)
		///
		/// No barrier is issued here: the caller must synchronize before reading bytes
		/// of dest that were written by other threads.
		/// Declared inline because this non-template function is defined in a header, and
		/// __host__ as well as __device__ so the indexing can be exercised on the CPU.
		__host__ __device__ inline void threadedMemcpy(char* dest, char* src, size_t N, size_t tid, size_t nt){
			//start at the thread ID and advance by the thread count until the copy region ends
			for(size_t xi = tid; xi < N; xi += nt){
				dest[xi] = src[xi];
			}
		}

		/// Threaded copying of 2D data on a CUDA device
		/// Copies an nx-by-ny window of src, starting at column x and row y, into dest.
		/// The calling thread handles the linear destination indices tid, tid + nt, tid + 2 * nt, ...
		/// @param dest is a linear destination array of size nx * ny, written as dest[dest_y * nx + dest_x]
		/// @param nx is the width of the copied window, in elements of T
		/// @param ny is the height of the copied window, in rows
		/// @param src is a 2D image stored as a linear array with a pitch of sX elements
		/// @param x is the column in src of the first element of the window
		/// @param y is the row in src of the first element of the window
		/// @param sX is the number of elements of T (not bytes) in a row of src; it is the row stride used to index src
		/// @param sY is not referenced in the visible part of this function (presumably the number of rows in src -- TODO confirm)
		/// @param tid is a 1D id for the current thread
		/// @param nt is the number of threads in the block
		/// NOTE(review): no check of the window against sX / sY is visible here, so the caller
		/// appears responsible for keeping the window inside src -- confirm.
		template<typename T>
		__device__ void threadedMemcpy2D(T* dest, size_t nx, size_t ny, 
										 T* src, size_t x, size_t y, size_t sX, size_t sY,
										 size_t tid, size_t nt){

			size_t vals = nx * ny;								//calculate the total number of values (elements of T) to be copied
			size_t I = vals / nt + 1;							//number of passes: one more than vals / nt, so the last pass may be partial or empty (guarded below)

			size_t src_i, dest_i;
			size_t dest_x, dest_y, src_x, src_y;
			for(size_t i = 0; i < I; i++){						//for each iteration
				dest_i = i * nt + tid;						//calculate the index into the destination array
				dest_y = dest_i / nx;						//row of that index within the window
				dest_x = dest_i - dest_y * nx;				//column of that index within the window (remainder of the division)

				//only copy when the index falls inside the nx-by-ny window
				if(dest_y < ny && dest_x < nx){

					//shift the window position by (x, y) to get the position in the source image
					src_x = x + dest_x;
					src_y = y + dest_y;

					src_i = src_y * sX + src_x;				//linear index into src, using sX as the row pitch
					dest[dest_i] = src[src_i];				//perform the copy