nfSumUf.cu
3.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#include "nearfield.h"
#include <stdlib.h>
#include "rts/cuda/error.h"
#include "rts/cuda/timer.h"
__global__ void gpuScalarUsp(bsComplex* Ufx, bsComplex* Ufy, bsComplex* Ufz,
bsComplex* Ux, bsComplex* Uy, bsComplex* Uz,
bsPoint* ps, ptype* as, int ns, bsRect ABCD, int uR, int vR)
{
//get the current coordinate in the plane slice
int iu = blockIdx.x * blockDim.x + threadIdx.x;
int iv = blockIdx.y * blockDim.y + threadIdx.y;
//make sure that the thread indices are in-bounds
if(iu >= uR || iv >= vR) return;
//compute the index (easier access to the scalar field array)
int i = iv*uR + iu;
//compute the parameters for u and v
ptype u = (ptype)iu / uR;
ptype v = (ptype)iv / vR;
//get the rtsPoint in world space and then the r vector
bsPoint p = ABCD(u, v);
bsVector r;
ptype d;
//if we are inside of a sphere, return
for(int is=0; is<ns; is++)
{
r = p - ps[is];
d = r.len();
if(d <= as[is])
return;
}
//otherwise add the focused field to the full field
if(Ufx != NULL)
Ux[i] += Ufx[i];
if(Ufy != NULL)
Uy[i] += Ufy[i];
if(Ufz != NULL)
Uz[i] += Ufz[i];
}
void nearfieldStruct::sumUf()
{
//create arrays to store sphere positions and radii
int nSpheres = sVector.size();
//if the number of spheres is zero, just copy the incident field
if(nSpheres == 0)
{
if(U.x_hat != NULL)
HANDLE_ERROR(cudaMemcpy(U.x_hat, Uf.x_hat, sizeof(bsComplex) * U.R[0] * U.R[1], cudaMemcpyDeviceToDevice));
if(U.y_hat != NULL)
HANDLE_ERROR(cudaMemcpy(U.y_hat, Uf.y_hat, sizeof(bsComplex) * U.R[0] * U.R[1], cudaMemcpyDeviceToDevice));
if(U.z_hat != NULL)
HANDLE_ERROR(cudaMemcpy(U.z_hat, Uf.z_hat, sizeof(bsComplex) * U.R[0] * U.R[1], cudaMemcpyDeviceToDevice));
return;
}
//time the calculation of the focused field
//gpuStartTimer();
bsPoint* cpu_p = (bsPoint*)malloc(sizeof(bsPoint) * nSpheres);
ptype* cpu_a = (ptype*)malloc(sizeof(ptype) * nSpheres);
//copy the sphere positions and radii to the new arrays
for(int s=0; s<nSpheres; s++)
{
cpu_p[s] = sVector[s].p;
cpu_a[s] = sVector[s].a;
}
//copy the arrays to the gpu
bsPoint* gpu_p;
HANDLE_ERROR(cudaMalloc( (void**) &gpu_p, sizeof(bsPoint) * nSpheres));
HANDLE_ERROR(cudaMemcpy(gpu_p, cpu_p, sizeof(bsPoint) * nSpheres, cudaMemcpyHostToDevice));
ptype* gpu_a;
HANDLE_ERROR(cudaMalloc( (void**) &gpu_a, sizeof(ptype) * nSpheres));
HANDLE_ERROR(cudaMemcpy(gpu_a, cpu_a, sizeof(ptype) * nSpheres, cudaMemcpyHostToDevice));
//create one thread for each pixel of the field slice
dim3 dimBlock(SQRT_BLOCK, SQRT_BLOCK);
dim3 dimGrid((U.R[0] + SQRT_BLOCK -1)/SQRT_BLOCK, (U.R[1] + SQRT_BLOCK - 1)/SQRT_BLOCK);
//copy the focused field
gpuScalarUsp<<<dimGrid, dimBlock>>>(Uf.x_hat,
Uf.y_hat,
Uf.z_hat,
U.x_hat,
U.y_hat,
U.z_hat,
gpu_p,
gpu_a,
nSpheres,
pos,
U.R[0],
U.R[1]);
//free sphere lists
HANDLE_ERROR(cudaFree(gpu_p));
HANDLE_ERROR(cudaFree(gpu_a));
//float t = gpuStopTimer();
//std::cout<<"Add Us Time: "<<t<<"ms"<<std::endl;
//std::cout<<focus<<std::endl;
}