sharedmem.cuh
3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#ifndef STIM_CUDA_SHAREDMEM_H
#define STIM_CUDA_SHAREDMEM_H
namespace stim{
namespace cuda{
/// Cooperatively copies an X-by-Y window of a 2D texture into a linear tile.
/// Every calling thread makes (X / blockDim.x + 1) passes along X and
/// (Y / blockDim.y + 1) passes along Y; on each pass it handles the tile
/// element offset from its own (threadIdx.x, threadIdx.y) by a whole number of
/// block extents, and skips elements that fall outside the tile.
/// No barrier is issued here, so a caller that goes on to read elements written
/// by other threads presumably needs its own __syncthreads() afterwards.
/// @param dest is the destination tile, written row-major as dest[row * X + col]
///        (shared memory by intent, judging from the function name)
/// @param src is the texture object that is sampled
/// @param x is the texture x coordinate that maps to tile column 0
/// @param y is the texture y coordinate that maps to tile row 0
/// @param X is the width of the tile in elements
/// @param Y is the height of the tile in elements
/// @param threadIdx is the thread index used for the work split (this parameter
///        hides the built-in variable of the same name inside the function)
/// @param blockDim is the block extent used for the work split (this parameter
///        hides the built-in variable of the same name inside the function)
template<typename T>
__device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src,
                                   unsigned int x, unsigned int y, unsigned int X, unsigned int Y,
                                   dim3 threadIdx, dim3 blockDim){

    //number of passes each thread makes along the two axes
    const unsigned int passesX = X / blockDim.x + 1;
    const unsigned int passesY = Y / blockDim.y + 1;

    //tile column handled on the current X pass; advances one block width per pass
    unsigned int col = threadIdx.x;
    for(unsigned int px = 0; px < passesX; ++px, col += blockDim.x){

        //tile row handled on the current Y pass; advances one block height per pass
        unsigned int row = threadIdx.y;
        for(unsigned int py = 0; py < passesY; ++py, row += blockDim.y){

            //nothing to do for positions that fall outside the tile
            if(col >= X || row >= Y)
                continue;

            //fetch the texel at the matching texture position and store it in the tile
            dest[row * X + col] = tex2D<T>(src, x + col, y + row);
        }
    }
}
/// Threaded copying of data on a CUDA device.
/// The N bytes are divided among nt cooperating threads: the calling thread
/// copies bytes tid, tid + nt, tid + 2 * nt, ... for as long as the index is
/// below N. No synchronization is performed here; a caller that needs the
/// complete copy to be visible to every thread has to synchronize itself.
/// Declared inline because this non-template definition lives in a header and
/// would otherwise be defined once per translation unit that includes it.
/// @param dest is the destination buffer (at least N bytes)
/// @param src is the source buffer (at least N bytes)
/// @param N is the number of bytes to copy
/// @param tid is the id of the calling thread among the cooperating threads
///        (presumably a flattened 1D thread id within the block)
/// @param nt is the number of cooperating threads; nothing is copied if it is 0
__device__ inline void threadedMemcpy(char* dest, char* src, size_t N, size_t tid, size_t nt){
    if(nt == 0) return;                          //no stride to advance by: avoid looping forever
    for(size_t xi = tid; xi < N; xi += nt)       //start at the thread id and advance by the thread count
        dest[xi] = src[xi];                      //copy one byte
}
/// Threaded copying of 2D data on a CUDA device.
/// The nx * ny destination elements are divided among nt cooperating threads:
/// the calling thread handles linear destination indices tid, tid + nt,
/// tid + 2 * nt, ... below nx * ny. Each index is split into a (column, row)
/// position inside the copied region, which is then offset by (x, y) to locate
/// the source element. No synchronization is performed here, and the source
/// position is NOT checked against the source image bounds.
/// @param dest is a linear destination array of nx * ny elements (row length nx)
/// @param nx is the size of the region to be copied along the X dimension
/// @param ny is the size of the region to be copied along the Y dimension
/// @param src is a 2D image stored as a linear array whose rows are sX elements apart
/// @param x is the x position in the source image where the copy is started
/// @param y is the y position in the source image where the copy is started
/// @param sX is the number of elements of type T (not bytes) in a row of src
/// @param sY is currently unused (presumably the number of rows in src)
/// @param tid is a 1D id for the current thread
/// @param nt is the number of threads in the block; nothing is copied if it is 0
template<typename T>
__device__ void threadedMemcpy2D(T* dest, size_t nx, size_t ny,
                                 T* src, size_t x, size_t y, size_t sX, size_t sY,
                                 size_t tid, size_t nt){
    (void)sY;                                    //kept for interface compatibility only

    const size_t vals = nx * ny;                 //total number of elements to be copied

    //an empty region (nx == 0 would also divide by zero below) or a zero
    //thread count (the stride would never advance) leaves nothing to do
    if(vals == 0 || nt == 0) return;

    for(size_t dest_i = tid; dest_i < vals; dest_i += nt){
        const size_t dest_y = dest_i / nx;               //row inside the copied region
        const size_t dest_x = dest_i - dest_y * nx;      //column inside the copied region

        const size_t src_i = (y + dest_y) * sX + (x + dest_x);   //matching linear index into src
        dest[dest_i] = src[src_i];
    }
}
}
}
#endif