cpyToshare.cuh
5.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#ifndef STIM_CUDA_cpyToshare_H
#define STIM_CUDA_cpyToshare_H
// Block-cooperative copy of a contiguous run of single-channel elements from src
// (global memory, per the original author) into dest (shared memory, per the original author).
//
//   dest, src            destination and source arrays, indexed in units of T (not bytes)
//   x, y, z              start position in the source, linearized as z*I_x*I_y + y*I_x + x
//   X, Y                 extent arguments: the number of passes is derived from X alone,
//                        while the stop test compares against X*Y
//   threadIdx, blockDim  thread index and block shape supplied by the caller
//                        (these parameters shadow the CUDA built-ins of the same name)
//   I_x, I_y             row length and row count of one z-slice of the source
//
// Element s of dest receives element (start + s) of src; every thread of the block
// handles the indices s = ti, ti + tThreads, ti + 2*tThreads, ...
//
// NOTE(review): the pass count uses X but the bound uses X*Y. With Y == 1 exactly X
// elements are copied; with Y > 1 the number copied is min(X*Y, (X/tThreads + 1)*tThreads),
// which depends on the block size; with Y == 0 nothing is copied. Confirm which extent
// is intended before changing either expression.
// NOTE(review): no source bounds check is made and all index arithmetic is 32-bit unsigned.
// No barrier is issued here; presumably the caller synchronizes before dest is read - confirm.
template<typename T>
__device__ void cpyG2S1D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
	dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){

	//size of the block and flattened (x fastest, then y, then z) ID of this thread
	const unsigned int nThreads = blockDim.x * blockDim.y * blockDim.z;
	const unsigned int tid = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

	//upper limit on the number of passes, and the index at which copying stops
	const unsigned int passes = X / nThreads + 1;
	const unsigned int limit = X * Y;

	//linear source index of the start position (x, y, z)
	const unsigned int base = x + I_x * (y + I_y * z);

	//each pass moves up to nThreads consecutive elements; this thread owns element s
	unsigned int s = tid;
	for (unsigned int pass = 0; pass < passes && s < limit; ++pass, s += nThreads){
		dest[s] = src[base + s];
	}
}
// Block-cooperative copy of an X-by-Y tile of single-channel elements from src
// (global memory, per the original author) into dest (shared memory, per the original author).
//
//   dest                 destination tile, row-major with row length X (X*Y elements of T)
//   src                  source volume, row-major with row length I_x and slice size I_x*I_y
//   x, y, z              source position of the tile's first element
//   X, Y                 tile width and height in elements (not bytes)
//   threadIdx, blockDim  thread index and block shape supplied by the caller
//                        (these parameters shadow the CUDA built-ins of the same name)
//   I_x, I_y             width and height of one z-slice of the source
//
// Tile element (sx, sy) is read from source position (x + sx, y + sy, z). Positions with
// x + sx >= I_x or y + sy >= I_y are skipped and the matching dest element is left
// unwritten. z is not range-checked.
//
// The final source index is computed in 64 bits. The previous 32-bit computation of
// z*I_x*I_y + gy*I_x + gx wrapped silently once the index reached 2^32 and read the
// wrong address.
//
// No barrier is issued here; presumably the caller synchronizes before dest is read - confirm.
template<typename T>
__device__ void cpyG2S2D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
	dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){

	//calculate the total number of threads available
	const unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
	//calculate the current 1D thread ID
	const unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;

	//number of elements in the tile
	const unsigned int tileSize = X * Y;

	//offset of slice z in the source, widened before multiplying so it cannot wrap at 2^32
	const unsigned long long gz1 = (unsigned long long)z * I_x * I_y;

	//this thread handles tile elements ti, ti + tThreads, ... (the same indices as before)
	for (unsigned int sIdx = ti; sIdx < tileSize; sIdx += tThreads){
		//row and column inside the tile
		const unsigned int sy = sIdx / X;
		const unsigned int sx = sIdx - (sy * X);

		//source coordinates; kept as 32-bit unsigned arithmetic, exactly as before, so a
		//caller passing a start coordinate wrapped from a negative value sees no change
		const unsigned int gx = x + sx;
		const unsigned int gy = y + sy;

		if (gx < I_x && gy < I_y){
			const unsigned long long gIdx = gz1 + (unsigned long long)gy * I_x + gx;
			//copy global to share
			dest[sIdx] = src[gIdx];
		}
	}
}
// Block-cooperative copy of a contiguous run of three-channel (interleaved) data from src
// (global memory, per the original author) into dest (shared memory, per the original author).
//
//   dest, src            destination and source arrays, indexed in units of T (not bytes)
//   x, y, z              start position in the source; its linear index z*I_x*I_y + y*I_x + x
//                        is multiplied by 3 to reach the first value of that position
//   X, Y                 extent arguments: the number of passes is derived from X alone,
//                        while the stop test compares against X*Y
//   threadIdx, blockDim  thread index and block shape supplied by the caller
//                        (these parameters shadow the CUDA built-ins of the same name)
//   I_x, I_y             row length and row count of one z-slice of the source
//
// Element s of dest receives element (3*start + s) of src, so s counts individual
// values, not 3-value groups.
//
// NOTE(review): the pass count uses X but the bound uses X*Y. With Y == 1 exactly X
// values are copied; with Y > 1 the number copied is min(X*Y, (X/tThreads + 1)*tThreads),
// which depends on the block size; with Y == 0 nothing is copied. Confirm which extent
// is intended before changing either expression.
// NOTE(review): no source bounds check is made and all index arithmetic is 32-bit unsigned.
// No barrier is issued here; presumably the caller synchronizes before dest is read - confirm.
template<typename T>
__device__ void cpyG2S1D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
	dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){

	//size of the block and flattened (x fastest, then y, then z) ID of this thread
	const unsigned int nThreads = blockDim.x * blockDim.y * blockDim.z;
	const unsigned int tid = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

	//upper limit on the number of passes, and the index at which copying stops
	const unsigned int passes = X / nThreads + 1;
	const unsigned int limit = X * Y;

	//index of the first value belonging to the start position (three values per position)
	const unsigned int base = (x + I_x * (y + I_y * z)) * 3;

	//each pass moves up to nThreads consecutive values; this thread owns value s
	unsigned int s = tid;
	for (unsigned int pass = 0; pass < passes && s < limit; ++pass, s += nThreads){
		dest[s] = src[base + s];
	}
}
// Block-cooperative copy of a tile of three-channel (interleaved) data from src
// (global memory, per the original author) into dest (shared memory, per the original author).
//
//   dest                 destination tile, row-major with row length X (X*Y values of T)
//   src                  source volume with three consecutive values per position, rows of
//                        I_x positions and slices of I_x*I_y positions
//   x, y, z              source position of the tile's first value
//   X, Y                 tile row length in values (three per position) and number of rows
//   threadIdx, blockDim  thread index and block shape supplied by the caller
//                        (these parameters shadow the CUDA built-ins of the same name)
//   I_x, I_y             width and height, in positions, of one z-slice of the source
//
// For tile value (sx, sy), sx/3 selects the position within the row and sx%3 the channel;
// it is read from channel sx%3 of source position (x + sx/3, y + sy, z). Positions with
// x + sx/3 >= I_x or y + sy >= I_y are skipped and the matching dest value is left
// unwritten. z is not range-checked.
//
// The final source index is computed in 64 bits. The previous 32-bit computation of
// (z*I_x*I_y + gy*I_x + gx)*3 wrapped silently once the index reached 2^32 (about
// 1.43e9 positions) and read the wrong address.
//
// No barrier is issued here; presumably the caller synchronizes before dest is read - confirm.
template<typename T>
__device__ void cpyG2S2D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
	dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){

	//calculate the total number of threads available
	const unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;
	//calculate the current 1D thread ID
	const unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;

	//number of values in the tile
	const unsigned int tileSize = X * Y;

	//offset (in positions) of slice z, widened before multiplying so it cannot wrap at 2^32
	const unsigned long long gz1 = (unsigned long long)z * I_x * I_y;

	//this thread handles tile values ti, ti + tThreads, ... (the same indices as before)
	for (unsigned int sIdx = ti; sIdx < tileSize; sIdx += tThreads){
		//row and column inside the tile
		const unsigned int sy = sIdx / X;
		const unsigned int sx = sIdx - (sy * X);

		//source coordinates; kept as 32-bit unsigned arithmetic, exactly as before, so a
		//caller passing a start coordinate wrapped from a negative value sees no change
		const unsigned int gx = x + sx/3;
		const unsigned int gy = y + sy;

		if (gx < I_x && gy < I_y){
			//three values per position, then the channel offset
			const unsigned long long gIdx = (gz1 + (unsigned long long)gy * I_x + gx) * 3 + (sx % 3);
			//copy global to share
			dest[sIdx] = src[gIdx];
		}
	}
}
// Block-cooperative computation of the Euclidean magnitude of X*Y three-component vectors
// held in grad (shared memory, per the original author).
//
//   grad                 on entry, elements [3*k], [3*k + 1], [3*k + 2] hold the components
//                        of vector k for k in [0, X*Y); on exit, element [bs + k] holds the
//                        magnitude of vector k
//   bs                   offset, in elements of T, at which the magnitudes are written
//   X, Y                 only their product is used: the number of vectors to process
//   threadIdx, blockDim  thread index and block shape supplied by the caller
//                        (these parameters shadow the CUDA built-ins of the same name)
//
// Components are converted to float and the magnitude is computed in float, whatever T is.
//
// NOTE(review): if bs < 3*X*Y the output range overlaps components that other threads
// may not have read yet; presumably callers pass bs >= 3*X*Y - confirm.
// No barrier is issued here; presumably the caller synchronizes before and after - confirm.
template<typename T>
__device__ void mag_share2D(T* grad, unsigned int bs, unsigned int X, unsigned int Y, dim3 threadIdx, dim3 blockDim){

	//size of the block and flattened (x fastest, then y, then z) ID of this thread
	const unsigned int nThreads = blockDim.x * blockDim.y * blockDim.z;
	const unsigned int tid = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

	//number of vectors and the upper limit on the number of passes needed to cover them
	const unsigned int count = X * Y;
	const unsigned int passes = count / nThreads + 1;

	//each pass handles up to nThreads vectors; this thread owns vector k
	unsigned int k = tid;
	for (unsigned int pass = 0; pass < passes && k < count; ++pass, k += nThreads){
		const T* v = grad + k*3;
		const float dx = v[0];
		const float dy = v[1];
		const float dz = v[2];
		grad[bs + k] = sqrt(dx*dx + dy*dy + dz*dz);
	}
}
#endif