Blame view

stim/cuda/sharedmem.cuh 3 KB
13fe3c84   Laila Saadatifard   update the stimli...
1
2
3
4
5
6
7
  
  #ifndef STIM_CUDA_SHAREDMEM_H
  #define STIM_CUDA_SHAREDMEM_H
  
  namespace stim{
  	namespace cuda{
  
8e4f8364   David Mayerich   started a new opt...
8
  		// Copies values from texture memory to shared memory, optimizing threads
13fe3c84   Laila Saadatifard   update the stimli...
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
  		/// Cooperative copy of an X-by-Y window of a 2D texture into a linear array.
  		/// Each calling thread handles the window positions (sx, sy) for which
  		/// sx = threadIdx.x + i * blockDim.x and sy = threadIdx.y + j * blockDim.y,
  		/// so the threads of a block together cover the whole window.
  		/// No barrier is issued by this function.
  		/// @param dest is the destination array, written row-major with a row length of X
  		///        (intended, going by the function name, to live in shared memory)
  		/// @param src is the texture object that is sampled
  		/// @param x is the texture x coordinate of the first column of the window
  		/// @param y is the texture y coordinate of the first row of the window
  		/// @param X is the width of the window (and the row length of dest)
  		/// @param Y is the height of the window
  		/// @param threadIdx is the index of the calling thread (this parameter hides the built-in of the same name)
  		/// @param blockDim is the block size used for striding (this parameter hides the built-in of the same name)
  		template<typename T>
  		__device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
  										 unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
  										 dim3 threadIdx, dim3 blockDim){

  			//number of passes the block makes over the window in each dimension
  			const unsigned int passes_x = X / blockDim.x + 1;
  			const unsigned int passes_y = Y / blockDim.y + 1;

  			for(unsigned int px = 0; px < passes_x; ++px){

  				//column handled by this thread during the current pass
  				const unsigned int col = px * blockDim.x + threadIdx.x;
  				if(col >= X) continue;						//this column lies outside of the window

  				for(unsigned int py = 0; py < passes_y; ++py){

  					//row handled by this thread during the current pass
  					const unsigned int row = py * blockDim.y + threadIdx.y;
  					if(row >= Y) continue;					//this row lies outside of the window

  					//fetch the texel at the window offset and store it row-major
  					//NOTE(review): integer texel coordinates are passed to tex2D without a
  					//half-texel offset - confirm this matches the filter mode of src
  					dest[row * X + col] = tex2D<T>(src, x + col, y + row);
  				}
  			}
  		}
84eff8b1   Pavel Govyadinov   Merged only the n...
37
  
ca99f951   David Mayerich   faster implementa...
38
39
  		// Threaded copying of data on a CUDA device.
  		/// Copies N bytes from src to dest using a strided access pattern: the calling
  		/// thread copies the bytes at indices tid, tid + nt, tid + 2 * nt, ... below N, so
  		/// that calls made with tid = 0 .. nt - 1 together cover the whole range [0, N).
  		/// No barrier is issued by this function.
  		/// Declared inline because it is defined in a header and is not a template.
  		/// @param dest is the destination buffer (at least N bytes)
  		/// @param src is the source buffer (at least N bytes); it is only read
  		/// @param N is the number of bytes to copy
  		/// @param tid is a 1D id for the current thread
  		/// @param nt is the number of threads taking part in the copy; nothing is copied if it is zero
  		__device__ inline void threadedMemcpy(char* dest, const char* src, size_t N, size_t tid, size_t nt){
  			if(nt == 0) return;							//a zero stride would never advance through the buffer
  			for(size_t xi = tid; xi < N; xi += nt)		//start at the thread ID and stride by the thread count
  				dest[xi] = src[xi];						//perform the copy
  		}
  
ca99f951   David Mayerich   faster implementa...
49
50
  		/// Threaded copying of 2D data on a CUDA device
  		/// @param dest is a linear destination array of size nx * ny
dbeb83f2   David Mayerich   added separable c...
51
52
  		/// @param nx is the size of the region to be copied along the X dimension
  		/// @param ny is the size of the region to be copied along the Y dimension
ca99f951   David Mayerich   faster implementa...
53
  		/// @param src is a 2D image stored as a linear array with a row pitch of sX elements
dbeb83f2   David Mayerich   added separable c...
54
55
  		/// @param x is the x position in the source image where the copy is started
  		/// @param y is the y position in the source image where the copy is started
ca99f951   David Mayerich   faster implementa...
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
  		/// @param sX is the number of elements (of type T, not bytes) in a row of src
  		/// @param sY is presumably the number of rows in src; it is currently unused, so
  		///        reads are not checked against the extent of the source image
  		/// @param tid is a 1D id for the current thread
  		/// @param nt is the number of threads in the block; nothing is copied if it is zero
  		template<typename T>
  		__device__ void threadedMemcpy2D(T* dest, size_t nx, size_t ny,
  										 const T* src, size_t x, size_t y, size_t sX, size_t sY,
  										 size_t tid, size_t nt){

  			const size_t vals = nx * ny;						//total number of elements to be copied
  			if(vals == 0 || nt == 0) return;					//nothing to do; also keeps the division below well defined

  			//each thread starts at its own ID and strides by the number of threads
  			for(size_t dest_i = tid; dest_i < vals; dest_i += nt){

  				//convert the linear destination index into a 2D position inside the region
  				const size_t dest_y = dest_i / nx;
  				const size_t dest_x = dest_i - dest_y * nx;

  				//offset by the region origin and index the source using its row pitch
  				const size_t src_i = (y + dest_y) * sX + (x + dest_x);
  				dest[dest_i] = src[src_i];
  			}
  		}
13fe3c84   Laila Saadatifard   update the stimli...
84
85
86
87
  	}
  }
  
  
84eff8b1   Pavel Govyadinov   Merged only the n...
88
  #endif