Commit 4252d8276a60522356cbd52577b4fe5911f7f845
1 parent
b5dd3d3a
ivote3 fixes and new bimsim code
Showing 8 changed files with 419 additions and 119 deletions.
/// Kernel: copy one (dx x dy) sub-region of a (sx x sy) source image into dest.
/// Expected launch: a 2D grid of 2D blocks covering the destination image, one thread per destination pixel.
/// @param dest device pointer to the destination image (dx*dy elements)
/// @param src  device pointer to the source image (sx*sy elements)
/// @param sx   width of the source image
/// @param sy   height of the source image (unused here; kept for interface symmetry)
/// @param x    x-offset of the cropped region within the source image
/// @param y    y-offset of the cropped region within the source image
/// @param dx   width of the destination image
/// @param dy   height of the destination image
template<typename T>
__global__ void cuda_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){
	size_t ix = blockIdx.x * blockDim.x + threadIdx.x;		//destination-image coordinates handled by this thread
	size_t iy = blockIdx.y * blockDim.y + threadIdx.y;
	if(ix >= dx || iy >= dy) return;						//guard the grid tail: threads outside the destination do nothing

	size_t d_idx = iy * dx + ix;							//flat (row-major) index into the destination image
	size_t s_idx = (y + iy) * sx + (x + ix);				//flat index of the corresponding source pixel
	dest[d_idx] = src[s_idx];								//copy the pixel
}
14 | + | ||
/// Crops a 2D image composed of elements of type T (all pointers are device pointers)
/// @param dest is a device pointer to memory of size dx*dy that will store the cropped image
/// @param src is a device pointer to memory of size sx*sy that stores the original image
/// @param sx is the size of the source image along x
/// @param sy is the size of the source image along y
/// @param x is the x-coordinate of the start position of the cropped region within the source image
/// @param y is the y-coordinate of the start position of the cropped region within the source image
/// @param dx is the size of the destination image along x
/// @param dy is the size of the destination image along y
template<typename T>
void gpu_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){
	int max_threads = stim::maxThreadsPerBlock();			//maximum number of threads per block for this CUDA device
	int side = (int)sqrt((double)max_threads);				//use a square 2D block of side x side threads
	dim3 threads(side, side);
	//BUG FIX: the grid must be the ceiling of (image size / block size); the previous code divided
	//by sqrt(threads.x), where threads.x is already sqrt(max_threads), launching a vastly oversized grid
	dim3 blocks( (unsigned)((dx + threads.x - 1) / threads.x),
	             (unsigned)((dy + threads.y - 1) / threads.y) );
	cuda_crop2d<T> <<< blocks, threads >>>(dest, src, sx, sy, x, y, dx, dy);
}
0 | \ No newline at end of file | 31 | \ No newline at end of file |
stim/math/circle.h
@@ -64,7 +64,7 @@ public: | @@ -64,7 +64,7 @@ public: | ||
64 | init(); | 64 | init(); |
65 | } | 65 | } |
66 | 66 | ||
67 | - ///create a rectangle from a center point, normal, and size | 67 | + /*///create a rectangle from a center point, normal, and size |
68 | ///@param c: x,y,z location of the center. | 68 | ///@param c: x,y,z location of the center. |
69 | ///@param s: size of the rectangle. | 69 | ///@param s: size of the rectangle. |
70 | ///@param n: x,y,z direction of the normal. | 70 | ///@param n: x,y,z direction of the normal. |
@@ -76,6 +76,7 @@ public: | @@ -76,6 +76,7 @@ public: | ||
76 | rotate(n, U, Y); | 76 | rotate(n, U, Y); |
77 | scale(s); | 77 | scale(s); |
78 | } | 78 | } |
79 | + */ | ||
79 | 80 | ||
80 | ///create a rectangle from a center point, normal, and size | 81 | ///create a rectangle from a center point, normal, and size |
81 | ///@param c: x,y,z location of the center. | 82 | ///@param c: x,y,z location of the center. |
@@ -164,12 +165,12 @@ public: | @@ -164,12 +165,12 @@ public: | ||
164 | ///returns a vector with the points on the initialized circle. | 165 | ///returns a vector with the points on the initialized circle. |
165 | ///connecting the points results in a circle. | 166 | ///connecting the points results in a circle. |
166 | ///@param n: integer for the number of points representing the circle. | 167 | ///@param n: integer for the number of points representing the circle. |
167 | - stim::vec3<T> | 168 | + CUDA_CALLABLE stim::vec3<T> |
168 | p(T theta) | 169 | p(T theta) |
169 | { | 170 | { |
170 | T x,y; | 171 | T x,y; |
171 | - y = 0.5*cos(theta*stim::TAU/360.0)+0.5; | ||
172 | - x = 0.5*sin(theta*stim::TAU/360.0)+0.5; | 172 | + y = 0.5*cos(theta*STIM_TAU/360.0)+0.5; |
173 | + x = 0.5*sin(theta*STIM_TAU/360.0)+0.5; | ||
173 | return p(x,y); | 174 | return p(x,y); |
174 | } | 175 | } |
175 | 176 |
stim/math/constants.h
1 | #ifndef STIM_CONSTANTS_H | 1 | #ifndef STIM_CONSTANTS_H |
2 | #define STIM_CONSTANTS_H | 2 | #define STIM_CONSTANTS_H |
3 | 3 | ||
#define STIM_PI 3.1415926535897932384626433832795028841971693993751058209749445923078164062862
//BUG FIX: parenthesize so expressions like x / STIM_TAU expand correctly
//(the unparenthesized form expands x / STIM_TAU to x / 2 * STIM_PI, which multiplies by pi)
#define STIM_TAU (2.0 * STIM_PI)
6 | + | ||
4 | #include "stim/cuda/cudatools/callable.h" | 7 | #include "stim/cuda/cudatools/callable.h" |
5 | namespace stim{ | 8 | namespace stim{ |
6 | const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862; | 9 | const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862; |
stim/math/vec3.h
@@ -216,7 +216,7 @@ public: | @@ -216,7 +216,7 @@ public: | ||
216 | return result; | 216 | return result; |
217 | } | 217 | } |
218 | 218 | ||
219 | - | 219 | +#ifndef __NVCC__ |
220 | /// Outputs the vector as a string | 220 | /// Outputs the vector as a string |
221 | std::string str() const{ | 221 | std::string str() const{ |
222 | std::stringstream ss; | 222 | std::stringstream ss; |
@@ -234,6 +234,7 @@ public: | @@ -234,6 +234,7 @@ public: | ||
234 | 234 | ||
235 | return ss.str(); | 235 | return ss.str(); |
236 | } | 236 | } |
237 | +#endif | ||
237 | 238 | ||
238 | size_t size(){ return 3; } | 239 | size_t size(){ return 3; } |
239 | 240 |
stim/optics/scalarbeam.h
@@ -193,6 +193,7 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, | @@ -193,6 +193,7 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, | ||
193 | printf ("CUBLAS Error: failed to calculate maximum r value.\n"); | 193 | printf ("CUBLAS Error: failed to calculate maximum r value.\n"); |
194 | exit(1); | 194 | exit(1); |
195 | } | 195 | } |
196 | + cublasDestroy(handle); | ||
196 | 197 | ||
197 | i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility | 198 | i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility |
198 | i_max--; | 199 | i_max--; |
@@ -212,8 +213,9 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, | @@ -212,8 +213,9 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, | ||
212 | T* j_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table | 213 | T* j_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table |
213 | T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT | 214 | T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT |
214 | T jl; | 215 | T jl; |
216 | + unsigned l; | ||
215 | for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT | 217 | for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT |
216 | - for(unsigned l = 0; l <= (unsigned)Nl; l++){ //for each order | 218 | + for(l = 0; l <= (unsigned)Nl; l++){ //for each order |
217 | jl = boost::math::sph_bessel<T>(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function | 219 | jl = boost::math::sph_bessel<T>(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function |
218 | j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result | 220 | j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result |
219 | } | 221 | } |
@@ -323,6 +325,9 @@ void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lamb | @@ -323,6 +325,9 @@ void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lamb | ||
323 | 325 | ||
324 | gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing); | 326 | gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing); |
325 | 327 | ||
328 | + HANDLE_ERROR( cudaFree(gpu_r) ); | ||
329 | + HANDLE_ERROR( cudaFree(gpu_phi) ); | ||
330 | + | ||
326 | } | 331 | } |
327 | #endif | 332 | #endif |
328 | 333 | ||
@@ -449,17 +454,12 @@ public: | @@ -449,17 +454,12 @@ public: | ||
449 | 454 | ||
450 | /// Evaluate the beam to a scalar field using Debye focusing | 455 | /// Evaluate the beam to a scalar field using Debye focusing |
451 | void eval(stim::scalarfield<T>& E, int order = 500){ | 456 | void eval(stim::scalarfield<T>& E, int order = 500){ |
452 | - size_t array_size = E.grid_bytes(); | ||
453 | - T* X = (T*) malloc( array_size ); //allocate space for the coordinate meshes | ||
454 | - T* Y = (T*) malloc( array_size ); | ||
455 | - T* Z = (T*) malloc( array_size ); | ||
456 | - | ||
457 | - E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes | ||
458 | - eval(E, X, Y, Z, order); | ||
459 | - | ||
460 | - free(X); //free the coordinate meshes | ||
461 | - free(Y); | ||
462 | - free(Z); | 457 | + E.meshgrid(); //calculate a meshgrid if one isn't created |
458 | + if(E.gpu()) | ||
459 | + gpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); | ||
460 | + else | ||
461 | + cpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); | ||
462 | + //eval(E, E.x(), E.y(), E.z(), order); | ||
463 | } | 463 | } |
464 | 464 | ||
465 | /// Calculate the field at a given point | 465 | /// Calculate the field at a given point |
stim/optics/scalarfield.h
@@ -6,8 +6,52 @@ | @@ -6,8 +6,52 @@ | ||
6 | #include "../math/complex.h" | 6 | #include "../math/complex.h" |
7 | #include "../math/fft.h" | 7 | #include "../math/fft.h" |
8 | 8 | ||
9 | +#ifdef CUDA_FOUND | ||
10 | +#include "../cuda/crop.cuh" | ||
11 | +#endif | ||
12 | + | ||
9 | namespace stim{ | 13 | namespace stim{ |
10 | 14 | ||
15 | + template<typename T> | ||
16 | + __global__ void cuda_abs(T* img, stim::complex<T>* field, size_t N){ | ||
17 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | ||
18 | + if(i >= N) return; | ||
19 | + | ||
20 | + img[i] = field[i].abs(); | ||
21 | + } | ||
22 | + | ||
23 | + template<typename T> | ||
24 | + __global__ void cuda_real(T* img, stim::complex<T>* field, size_t N){ | ||
25 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | ||
26 | + if(i >= N) return; | ||
27 | + | ||
28 | + img[i] = field[i].real(); | ||
29 | + } | ||
30 | + | ||
31 | + template<typename T> | ||
32 | + __global__ void cuda_imag(T* img, stim::complex<T>* field, size_t N){ | ||
33 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | ||
34 | + if(i >= N) return; | ||
35 | + | ||
36 | + img[i] = field[i].imag(); | ||
37 | + } | ||
38 | + | ||
39 | + template<typename T> | ||
40 | + __global__ void cuda_intensity(T* img, stim::complex<T>* field, size_t N){ | ||
41 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | ||
42 | + if(i >= N) return; | ||
43 | + | ||
44 | + img[i] = pow(field[i].abs(), 2); | ||
45 | + } | ||
46 | + | ||
47 | + template<typename T> | ||
48 | + __global__ void cuda_sum_intensity(T* img, stim::complex<T>* field, size_t N){ | ||
49 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | ||
50 | + if(i >= N) return; | ||
51 | + | ||
52 | + img[i] += pow(field[i].abs(), 2); | ||
53 | + } | ||
54 | + | ||
11 | /// Perform a k-space transform of a scalar field (FFT). The given field has a width of x and the calculated momentum space has a | 55 | /// Perform a k-space transform of a scalar field (FFT). The given field has a width of x and the calculated momentum space has a |
12 | /// width of kx (in radians). | 56 | /// width of kx (in radians). |
13 | /// @param K is a pointer to the output array of all plane waves in the field | 57 | /// @param K is a pointer to the output array of all plane waves in the field |
@@ -44,12 +88,16 @@ namespace stim{ | @@ -44,12 +88,16 @@ namespace stim{ | ||
44 | std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<<std::endl; | 88 | std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<<std::endl; |
45 | exit(1); | 89 | exit(1); |
46 | } | 90 | } |
91 | + cufftDestroy(plan); | ||
47 | 92 | ||
48 | stim::complex<T>* fft = (stim::complex<T>*) malloc(sizeof(stim::complex<T>) * nx * ny); | 93 | stim::complex<T>* fft = (stim::complex<T>*) malloc(sizeof(stim::complex<T>) * nx * ny); |
49 | HANDLE_ERROR( cudaMemcpy(fft, dev_FFT, sizeof(stim::complex<T>) * nx * ny, cudaMemcpyDeviceToHost) ); | 94 | HANDLE_ERROR( cudaMemcpy(fft, dev_FFT, sizeof(stim::complex<T>) * nx * ny, cudaMemcpyDeviceToHost) ); |
50 | 95 | ||
51 | stim::cpu_fftshift(K, fft, nx, ny); | 96 | stim::cpu_fftshift(K, fft, nx, ny); |
52 | - //memcpy(K, fft, sizeof(stim::complex<T>) * nx * ny); | 97 | + |
98 | + HANDLE_ERROR( cudaFree(dev_FFT) ); //free GPU memory | ||
99 | + HANDLE_ERROR( cudaFree(dev_E) ); | ||
100 | + free(fft); //free CPU memory | ||
53 | } | 101 | } |
54 | 102 | ||
55 | template<typename T> | 103 | template<typename T> |
@@ -82,12 +130,18 @@ namespace stim{ | @@ -82,12 +130,18 @@ namespace stim{ | ||
82 | std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<<std::endl; | 130 | std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<<std::endl; |
83 | exit(1); | 131 | exit(1); |
84 | } | 132 | } |
133 | + cufftDestroy(plan); | ||
85 | 134 | ||
86 | HANDLE_ERROR( cudaMemcpy(E, dev_E, sizeof(stim::complex<T>) * nx * ny, cudaMemcpyDeviceToHost) ); | 135 | HANDLE_ERROR( cudaMemcpy(E, dev_E, sizeof(stim::complex<T>) * nx * ny, cudaMemcpyDeviceToHost) ); |
87 | 136 | ||
137 | + HANDLE_ERROR( cudaFree(dev_FFT) ); //free GPU memory | ||
138 | + HANDLE_ERROR( cudaFree(dev_E) ); | ||
139 | + free(fft); //free CPU memory | ||
88 | 140 | ||
89 | } | 141 | } |
90 | 142 | ||
143 | + | ||
144 | + | ||
91 | /// Propagate a field slice along its orthogonal direction by a given distance z | 145 | /// Propagate a field slice along its orthogonal direction by a given distance z |
92 | /// @param Enew is the resulting propogated field | 146 | /// @param Enew is the resulting propogated field |
93 | /// @param E is the field to be propogated | 147 | /// @param E is the field to be propogated |
@@ -105,9 +159,9 @@ namespace stim{ | @@ -105,9 +159,9 @@ namespace stim{ | ||
105 | T Kx, Ky; //width and height in k space | 159 | T Kx, Ky; //width and height in k space |
106 | cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); | 160 | cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); |
107 | 161 | ||
108 | - T* mag = (T*) malloc( sizeof(T) * nx * ny ); | ||
109 | - stim::abs(mag, K, nx * ny); | ||
110 | - stim::cpu2image<float>(mag, "kspace_pre_shift.bmp", nx, ny, stim::cmBrewer); | 162 | + //T* mag = (T*) malloc( sizeof(T) * nx * ny ); |
163 | + //stim::abs(mag, K, nx * ny); | ||
164 | + //stim::cpu2image<float>(mag, "kspace_pre_shift.bmp", nx, ny, stim::cmBrewer); | ||
111 | 165 | ||
112 | size_t kxi, kyi; | 166 | size_t kxi, kyi; |
113 | size_t i; | 167 | size_t i; |
@@ -143,10 +197,27 @@ namespace stim{ | @@ -143,10 +197,27 @@ namespace stim{ | ||
143 | } | 197 | } |
144 | } | 198 | } |
145 | 199 | ||
146 | - stim::abs(mag, K, nx * ny); | ||
147 | - stim::cpu2image<float>(mag, "kspace_post_shift.bmp", nx, ny, stim::cmBrewer); | 200 | + //stim::abs(mag, K, nx * ny); |
201 | + //stim::cpu2image<float>(mag, "kspace_post_shift.bmp", nx, ny, stim::cmBrewer); | ||
148 | 202 | ||
149 | cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); | 203 | cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); |
204 | + free(K); | ||
205 | + } | ||
206 | + | ||
207 | + template<typename T> | ||
208 | + void gpu_scalar_propagate(stim::complex<T>* Enew, stim::complex<T>* E, T sx, T sy, T z, T k, size_t nx, size_t ny){ | ||
209 | + | ||
210 | + size_t field_bytes = sizeof(stim::complex<T>) * nx * ny; | ||
211 | + stim::complex<T>* host_E = (stim::complex<T>*) malloc( field_bytes); | ||
212 | + HANDLE_ERROR( cudaMemcpy(host_E, E, field_bytes, cudaMemcpyDeviceToHost) ); | ||
213 | + | ||
214 | + stim::complex<T>* host_Enew = (stim::complex<T>*) malloc(field_bytes); | ||
215 | + | ||
216 | + cpu_scalar_propagate(host_Enew, host_E, sx, sy, z, k, nx, ny); | ||
217 | + | ||
218 | + HANDLE_ERROR( cudaMemcpy(Enew, host_Enew, field_bytes, cudaMemcpyHostToDevice) ); | ||
219 | + free(host_E); | ||
220 | + free(host_Enew); | ||
150 | } | 221 | } |
151 | 222 | ||
152 | /// Apply a lowpass filter to a field slice | 223 | /// Apply a lowpass filter to a field slice |
@@ -165,9 +236,9 @@ namespace stim{ | @@ -165,9 +236,9 @@ namespace stim{ | ||
165 | T Kx, Ky; //width and height in k space | 236 | T Kx, Ky; //width and height in k space |
166 | cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); | 237 | cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); |
167 | 238 | ||
168 | - T* mag = (T*) malloc( sizeof(T) * nx * ny ); | ||
169 | - stim::abs(mag, K, nx * ny); | ||
170 | - stim::cpu2image<float>(mag, "kspace_pre_lowpass.bmp", nx, ny, stim::cmBrewer); | 239 | + //T* mag = (T*) malloc( sizeof(T) * nx * ny ); |
240 | + //stim::abs(mag, K, nx * ny); | ||
241 | + //stim::cpu2image<float>(mag, "kspace_pre_lowpass.bmp", nx, ny, stim::cmBrewer); | ||
171 | 242 | ||
172 | size_t kxi, kyi; | 243 | size_t kxi, kyi; |
173 | size_t i; | 244 | size_t i; |
@@ -200,10 +271,11 @@ namespace stim{ | @@ -200,10 +271,11 @@ namespace stim{ | ||
200 | } | 271 | } |
201 | } | 272 | } |
202 | 273 | ||
203 | - stim::abs(mag, K, nx * ny); | ||
204 | - stim::cpu2image<float>(mag, "kspace_post_lowpass.bmp", nx, ny, stim::cmBrewer); | 274 | + //stim::abs(mag, K, nx * ny); |
275 | + //stim::cpu2image<float>(mag, "kspace_post_lowpass.bmp", nx, ny, stim::cmBrewer); | ||
205 | 276 | ||
206 | cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); | 277 | cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); |
278 | + free(K); | ||
207 | } | 279 | } |
208 | 280 | ||
209 | enum locationType {CPUmem, GPUmem}; | 281 | enum locationType {CPUmem, GPUmem}; |
@@ -222,6 +294,8 @@ protected: | @@ -222,6 +294,8 @@ protected: | ||
222 | size_t R[2]; | 294 | size_t R[2]; |
223 | locationType loc; | 295 | locationType loc; |
224 | 296 | ||
297 | + T* p[3]; //scalar position for each point in E | ||
298 | + | ||
225 | /// Convert the field to a k-space representation (do an FFT) | 299 | /// Convert the field to a k-space representation (do an FFT) |
226 | void to_kspace(T& kx, T& ky){ | 300 | void to_kspace(T& kx, T& ky){ |
227 | cpu_scalar_to_kspace(E, kx, ky, E, X.len(), Y.len(), R[0], R[1]); | 301 | cpu_scalar_to_kspace(E, kx, ky, E, X.len(), Y.len(), R[0], R[1]); |
@@ -252,6 +326,9 @@ public: | @@ -252,6 +326,9 @@ public: | ||
252 | E = (stim::complex<T>*) malloc(grid_bytes()); //allocate in CPU memory | 326 | E = (stim::complex<T>*) malloc(grid_bytes()); //allocate in CPU memory |
253 | memset(E, 0, grid_bytes()); | 327 | memset(E, 0, grid_bytes()); |
254 | loc = CPUmem; | 328 | loc = CPUmem; |
329 | + | ||
330 | + p[0] = p[1] = p[2] = NULL; //set the position vector to NULL | ||
331 | + | ||
255 | } | 332 | } |
256 | 333 | ||
257 | ~scalarfield(){ | 334 | ~scalarfield(){ |
@@ -271,11 +348,35 @@ public: | @@ -271,11 +348,35 @@ public: | ||
271 | if(loc == GPUmem) return; | 348 | if(loc == GPUmem) return; |
272 | else{ | 349 | else{ |
273 | stim::complex<T>* dev_E; | 350 | stim::complex<T>* dev_E; |
274 | - HANDLE_ERROR( cudaMalloc(&dev_E, e_bytes()) ); //allocate GPU memory | ||
275 | - HANDLE_ERROR( cudaMemcpy(dev_E, E, e_bytes(), cudaMemcpyHostToDevice) ); //copy the field to the GPU | 351 | + HANDLE_ERROR( cudaMalloc(&dev_E, grid_bytes()) ); //allocate GPU memory |
352 | + HANDLE_ERROR( cudaMemcpy(dev_E, E, grid_bytes(), cudaMemcpyHostToDevice) ); //copy the field to the GPU | ||
276 | free(E); //free the CPU memory | 353 | free(E); //free the CPU memory |
277 | E = dev_E; //swap pointers | 354 | E = dev_E; //swap pointers |
355 | + | ||
356 | + if(p[0]){ | ||
357 | + size_t meshgrid_bytes = size() * sizeof(T); //calculate the number of bytes in each meshgrid | ||
358 | + T* dev_X; //allocate variables to store the device meshgrid | ||
359 | + T* dev_Y; | ||
360 | + T* dev_Z; | ||
361 | + HANDLE_ERROR( cudaMalloc(&dev_X, meshgrid_bytes) ); //allocate space for the meshgrid on the device | ||
362 | + HANDLE_ERROR( cudaMalloc(&dev_Y, meshgrid_bytes) ); | ||
363 | + HANDLE_ERROR( cudaMalloc(&dev_Z, meshgrid_bytes) ); | ||
364 | + | ||
365 | + HANDLE_ERROR( cudaMemcpy(dev_X, p[0], meshgrid_bytes, cudaMemcpyHostToDevice) ); //copy from the host to the device | ||
366 | + HANDLE_ERROR( cudaMemcpy(dev_Y, p[1], meshgrid_bytes, cudaMemcpyHostToDevice) ); | ||
367 | + HANDLE_ERROR( cudaMemcpy(dev_Z, p[2], meshgrid_bytes, cudaMemcpyHostToDevice) ); | ||
368 | + | ||
369 | + free(p[0]); //free device memory | ||
370 | + free(p[1]); | ||
371 | + free(p[2]); | ||
372 | + | ||
373 | + p[0] = dev_X; //swap in the new pointers to device memory | ||
374 | + p[1] = dev_Y; | ||
375 | + p[2] = dev_Z; | ||
376 | + } | ||
377 | + loc = GPUmem; //set the location flag | ||
278 | } | 378 | } |
379 | + | ||
279 | } | 380 | } |
280 | 381 | ||
281 | /// Copy the field array to the CPU, if it isn't already there | 382 | /// Copy the field array to the CPU, if it isn't already there |
@@ -286,14 +387,43 @@ public: | @@ -286,14 +387,43 @@ public: | ||
286 | HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU | 387 | HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU |
287 | HANDLE_ERROR( cudaFree(E) ); //free device memory | 388 | HANDLE_ERROR( cudaFree(E) ); //free device memory |
288 | E = host_E; //swap pointers | 389 | E = host_E; //swap pointers |
390 | + | ||
391 | + //copy a meshgrid has been created | ||
392 | + if(p[0]){ | ||
393 | + size_t meshgrid_bytes = size() * sizeof(T); //move X to the CPU | ||
394 | + T* host_X = (T*) malloc( meshgrid_bytes ); | ||
395 | + T* host_Y = (T*) malloc( meshgrid_bytes ); | ||
396 | + T* host_Z = (T*) malloc( meshgrid_bytes ); | ||
397 | + | ||
398 | + HANDLE_ERROR( cudaMemcpy(host_X, p[0], meshgrid_bytes, cudaMemcpyDeviceToHost) ); | ||
399 | + HANDLE_ERROR( cudaMemcpy(host_Y, p[1], meshgrid_bytes, cudaMemcpyDeviceToHost) ); | ||
400 | + HANDLE_ERROR( cudaMemcpy(host_Z, p[2], meshgrid_bytes, cudaMemcpyDeviceToHost) ); | ||
401 | + | ||
402 | + HANDLE_ERROR( cudaFree(p[0]) ); | ||
403 | + HANDLE_ERROR( cudaFree(p[1]) ); | ||
404 | + HANDLE_ERROR( cudaFree(p[2]) ); | ||
405 | + | ||
406 | + p[0] = host_X; | ||
407 | + p[1] = host_Y; | ||
408 | + p[2] = host_Z; | ||
409 | + } | ||
410 | + loc = CPUmem; | ||
289 | } | 411 | } |
290 | } | 412 | } |
291 | 413 | ||
414 | + bool gpu(){ | ||
415 | + if(loc == GPUmem) return true; | ||
416 | + else return false; | ||
417 | + } | ||
418 | + | ||
292 | /// Propagate the field along its orthogonal direction by a distance d | 419 | /// Propagate the field along its orthogonal direction by a distance d |
293 | void propagate(T d, T k){ | 420 | void propagate(T d, T k){ |
294 | - //X[2] += d; //move the plane position along the propagation direction | ||
295 | - //Y[2] += d; | ||
296 | - cpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); | 421 | + if(loc == CPUmem){ |
422 | + cpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); | ||
423 | + } | ||
424 | + else{ | ||
425 | + gpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); | ||
426 | + } | ||
297 | } | 427 | } |
298 | 428 | ||
299 | /// Propagate the field along its orthogonal direction by a distance d | 429 | /// Propagate the field along its orthogonal direction by a distance d |
@@ -312,6 +442,7 @@ public: | @@ -312,6 +442,7 @@ public: | ||
312 | } | 442 | } |
313 | 443 | ||
314 | if(loc == CPUmem){ | 444 | if(loc == CPUmem){ |
445 | + cropped.to_cpu(); //make sure that the cropped image is on the CPU | ||
315 | size_t x, y; | 446 | size_t x, y; |
316 | size_t sx, sy, si, di; | 447 | size_t sx, sy, si, di; |
317 | for(y = 0; y < Cy; y++){ | 448 | for(y = 0; y < Cy; y++){ |
@@ -325,8 +456,8 @@ public: | @@ -325,8 +456,8 @@ public: | ||
325 | } | 456 | } |
326 | } | 457 | } |
327 | else{ | 458 | else{ |
328 | - std::cout<<"Error: cropping not supported for GPU memory yet."<<std::endl; | ||
329 | - exit(1); | 459 | + cropped.to_gpu(); //make sure that the cropped image is also on the GPU |
460 | + gpu_crop2d<stim::complex<T>>(cropped.E, E, R[0], R[1], Cx * padding, Cy * padding, Cx, Cy); | ||
330 | } | 461 | } |
331 | } | 462 | } |
332 | 463 | ||
@@ -346,6 +477,10 @@ public: | @@ -346,6 +477,10 @@ public: | ||
346 | return E; | 477 | return E; |
347 | } | 478 | } |
348 | 479 | ||
480 | + T* x(){ return p[0]; } | ||
481 | + T* y(){ return p[1]; } | ||
482 | + T* z(){ return p[2]; } | ||
483 | + | ||
349 | /// Evaluate the cartesian coordinates of each point in the field. The resulting arrays are allocated in the same memory where the field is stored. | 484 | /// Evaluate the cartesian coordinates of each point in the field. The resulting arrays are allocated in the same memory where the field is stored. |
350 | void meshgrid(T* X, T* Y, T* Z, locationType location){ | 485 | void meshgrid(T* X, T* Y, T* Z, locationType location){ |
351 | //size_t array_size = sizeof(T) * R[0] * R[1]; | 486 | //size_t array_size = sizeof(T) * R[0] * R[1]; |
@@ -376,6 +511,21 @@ public: | @@ -376,6 +511,21 @@ public: | ||
376 | } | 511 | } |
377 | } | 512 | } |
378 | 513 | ||
514 | + /// Create a local meshgrid | ||
515 | + void meshgrid(){ | ||
516 | + if(p[0]) return; //if the p[0] value is not NULL, a meshgrid has already been created | ||
517 | + if(loc == CPUmem){ | ||
518 | + p[0] = (T*) malloc( size() * sizeof(T) ); | ||
519 | + p[1] = (T*) malloc( size() * sizeof(T) ); | ||
520 | + p[2] = (T*) malloc( size() * sizeof(T) ); | ||
521 | + } | ||
522 | + else{ | ||
523 | + std::cout<<"GPUmem meshgrid isn't implemented yet."<<std::endl; | ||
524 | + exit(1); | ||
525 | + } | ||
526 | + meshgrid(p[0], p[1], p[2], loc); | ||
527 | + } | ||
528 | + | ||
379 | //clear the field, setting all values to zero | 529 | //clear the field, setting all values to zero |
380 | void clear(){ | 530 | void clear(){ |
381 | if(loc == GPUmem) | 531 | if(loc == GPUmem) |
@@ -386,25 +536,49 @@ public: | @@ -386,25 +536,49 @@ public: | ||
386 | 536 | ||
387 | void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer){ | 537 | void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer){ |
388 | 538 | ||
389 | - if(loc == GPUmem) to_cpu(); //if the field is in the GPU, move it to the CPU | ||
390 | - T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image | ||
391 | - | ||
392 | - switch(type){ //get the specified component from the complex value | ||
393 | - case complexMag: | ||
394 | - stim::abs(image, E, size()); | ||
395 | - break; | ||
396 | - case complexReal: | ||
397 | - stim::real(image, E, size()); | ||
398 | - break; | ||
399 | - case complexImaginary: | ||
400 | - stim::imag(image, E, size()); | ||
401 | - break; | ||
402 | - case complexIntensity: | ||
403 | - stim::intensity(image, E, size()); | ||
404 | - break; | 539 | + if(loc == GPUmem){ |
540 | + T* image; | ||
541 | + HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) ); | ||
542 | + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | ||
543 | + dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks | ||
544 | + | ||
545 | + switch(type){ | ||
546 | + case complexMag: | ||
547 | + cuda_abs<T><<< blocks, threads >>>(image, E, size()); | ||
548 | + break; | ||
549 | + case complexReal: | ||
550 | + cuda_real<T><<< blocks, threads >>>(image, E, size()); | ||
551 | + break; | ||
552 | + case complexImaginary: | ||
553 | + cuda_imag<T><<< blocks, threads >>>(image, E, size()); | ||
554 | + break; | ||
555 | + case complexIntensity: | ||
556 | + cuda_intensity<T><<< blocks, threads >>>(image, E, size()); | ||
557 | + break; | ||
558 | + } | ||
559 | + stim::gpu2image<T>(image, filename, R[0], R[1], stim::cmBrewer); | ||
560 | + HANDLE_ERROR( cudaFree(image) ); | ||
561 | + } | ||
562 | + else{ | ||
563 | + T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image | ||
564 | + | ||
565 | + switch(type){ //get the specified component from the complex value | ||
566 | + case complexMag: | ||
567 | + stim::abs(image, E, size()); | ||
568 | + break; | ||
569 | + case complexReal: | ||
570 | + stim::real(image, E, size()); | ||
571 | + break; | ||
572 | + case complexImaginary: | ||
573 | + stim::imag(image, E, size()); | ||
574 | + break; | ||
575 | + case complexIntensity: | ||
576 | + stim::intensity(image, E, size()); | ||
577 | + break; | ||
578 | + } | ||
579 | + stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image | ||
580 | + free(image); //free the real image | ||
405 | } | 581 | } |
406 | - stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image | ||
407 | - free(image); //free the real image | ||
408 | } | 582 | } |
409 | 583 | ||
410 | void image(T* img, stim::complexComponentType type = complexMag){ | 584 | void image(T* img, stim::complexComponentType type = complexMag){ |
@@ -430,15 +604,23 @@ public: | @@ -430,15 +604,23 @@ public: | ||
430 | 604 | ||
431 | //adds the field intensity to the output array (useful for calculating detector response to incoherent fields) | 605 | //adds the field intensity to the output array (useful for calculating detector response to incoherent fields) |
432 | void intensity(T* out){ | 606 | void intensity(T* out){ |
433 | - if(loc == GPUmem) to_cpu(); //if the field is in the GPU, move it to the CPU | ||
434 | - T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image | ||
435 | - stim::intensity(image, E, size()); //calculate the intensity | 607 | + if(loc == GPUmem){ |
608 | + //T* image; | ||
609 | + //HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) ); | ||
610 | + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | ||
611 | + dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks | ||
612 | + cuda_sum_intensity<T><<< blocks, threads >>>(out, E, size()); | ||
613 | + } | ||
614 | + else{ | ||
615 | + T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image | ||
616 | + stim::intensity(image, E, size()); //calculate the intensity | ||
436 | 617 | ||
437 | - size_t N = size(); //calculate the number of elements in the field | ||
438 | - for(size_t n = 0; n < N; n++) //for each point in the field | ||
439 | - out[n] += image[n]; //add the field intensity to the output image | 618 | + size_t N = size(); //calculate the number of elements in the field |
619 | + for(size_t n = 0; n < N; n++) //for each point in the field | ||
620 | + out[n] += image[n]; //add the field intensity to the output image | ||
440 | 621 | ||
441 | - free(image); //free the temporary intensity image | 622 | + free(image); //free the temporary intensity image |
623 | + } | ||
442 | } | 624 | } |
443 | 625 | ||
444 | }; //end class scalarfield | 626 | }; //end class scalarfield |
stim/optics/scalarmie.h
@@ -149,16 +149,16 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti | @@ -149,16 +149,16 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti | ||
149 | 149 | ||
150 | size_t max_shared_mem = stim::sharedMemPerBlock(); | 150 | size_t max_shared_mem = stim::sharedMemPerBlock(); |
151 | size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); | 151 | size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); |
152 | - std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; | ||
153 | - std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | 152 | + //std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; |
153 | + //std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | ||
154 | int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); | 154 | int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); |
155 | - std::cout<<"threads per block: "<<threads<<std::endl; | 155 | + //std::cout<<"threads per block: "<<threads<<std::endl; |
156 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks | 156 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks |
157 | 157 | ||
158 | size_t shared_mem; | 158 | size_t shared_mem; |
159 | if(Nl <= LOCAL_NL) shared_mem = 0; | 159 | if(Nl <= LOCAL_NL) shared_mem = 0; |
160 | else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate | 160 | else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate |
161 | - std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; | 161 | + //std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; |
162 | cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel | 162 | cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel |
163 | } | 163 | } |
164 | 164 | ||
@@ -174,52 +174,20 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ | @@ -174,52 +174,20 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ | ||
174 | 174 | ||
175 | r[i] = p.len(); | 175 | r[i] = p.len(); |
176 | } | 176 | } |
177 | -/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave | ||
178 | - | ||
179 | -/// @param E is a pointer to the destination field values | ||
180 | -/// @param N is the number of points used to calculate the field | ||
181 | -/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) | ||
182 | -/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) | ||
183 | -/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) | ||
184 | -/// @param W is an array of planewaves that will be scattered | ||
185 | -/// @param a is the radius of the sphere | ||
186 | -/// @param n is the complex refractive index of the sphere | ||
187 | template<typename T> | 177 | template<typename T> |
188 | -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ | 178 | +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ |
179 | + | ||
189 | //calculate the necessary number of orders required to represent the scattered field | 180 | //calculate the necessary number of orders required to represent the scattered field |
190 | T k = W[0].kmag(); | 181 | T k = W[0].kmag(); |
191 | 182 | ||
192 | int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); | 183 | int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); |
193 | if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) | 184 | if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) |
194 | - std::cout<<"Nl: "<<Nl<<std::endl; | 185 | + //std::cout<<"Nl: "<<Nl<<std::endl; |
195 | 186 | ||
196 | //calculate the scattering coefficients for the sphere | 187 | //calculate the scattering coefficients for the sphere |
197 | stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients | 188 | stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients |
198 | - B_coefficients(B, a, k, n, Nl); | ||
199 | - | ||
200 | -#ifdef CUDA_FOUND | ||
201 | - stim::complex<T>* dev_E; //allocate space for the field | ||
202 | - cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); | ||
203 | - cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); | ||
204 | - //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used) | ||
205 | - | ||
206 | - // COORDINATES | ||
207 | - T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) | ||
208 | - if(x != NULL){ | ||
209 | - HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); | ||
210 | - HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
211 | - } | ||
212 | - T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) | ||
213 | - if(y != NULL){ | ||
214 | - HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); | ||
215 | - HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
216 | - } | ||
217 | - T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) | ||
218 | - if(z != NULL){ | ||
219 | - HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); | ||
220 | - HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
221 | - } | ||
222 | - | 189 | + B_coefficients(B, a, k, n, Nl); |
190 | + | ||
223 | // PLANE WAVES | 191 | // PLANE WAVES |
224 | stim::scalarwave<T>* dev_W; //allocate space and copy plane waves | 192 | stim::scalarwave<T>* dev_W; //allocate space and copy plane waves |
225 | HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); | 193 | HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); |
@@ -232,7 +200,7 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -232,7 +200,7 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
232 | 200 | ||
233 | int threads = stim::maxThreadsPerBlock(); | 201 | int threads = stim::maxThreadsPerBlock(); |
234 | dim3 blocks((unsigned)(N / threads + 1)); | 202 | dim3 blocks((unsigned)(N / threads + 1)); |
235 | - cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); | 203 | + cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N); |
236 | 204 | ||
237 | //Find the minimum and maximum values of r | 205 | //Find the minimum and maximum values of r |
238 | cublasStatus_t stat; | 206 | cublasStatus_t stat; |
@@ -280,7 +248,7 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -280,7 +248,7 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
280 | size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut; | 248 | size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut; |
281 | stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table | 249 | stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table |
282 | T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT | 250 | T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT |
283 | - std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl; | 251 | + //std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl; |
284 | stim::complex<T> hl; | 252 | stim::complex<T> hl; |
285 | for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT | 253 | for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT |
286 | stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] | 254 | stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] |
@@ -292,18 +260,57 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -292,18 +260,57 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
292 | //std::cout<<hB_lut[ri * (Nl + 1) + l]<<std::endl; | 260 | //std::cout<<hB_lut[ri * (Nl + 1) + l]<<std::endl; |
293 | } | 261 | } |
294 | } | 262 | } |
295 | - T* real_lut = (T*) malloc(hB_bytes/2); | ||
296 | - stim::real(real_lut, hB_lut, N_hB_lut); | ||
297 | - stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); | 263 | + //T* real_lut = (T*) malloc(hB_bytes/2); |
264 | + //stim::real(real_lut, hB_lut, N_hB_lut); | ||
265 | + //stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); | ||
298 | 266 | ||
299 | //Allocate device memory and copy everything to the GPU | 267 | //Allocate device memory and copy everything to the GPU |
300 | stim::complex<T>* dev_hB_lut; | 268 | stim::complex<T>* dev_hB_lut; |
301 | HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); | 269 | HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); |
302 | HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); | 270 | HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); |
303 | 271 | ||
304 | - gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); | 272 | + gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); |
305 | 273 | ||
306 | - cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory | 274 | + cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory |
275 | +} | ||
276 | +/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave | ||
277 | + | ||
278 | +/// @param E is a pointer to the destination field values | ||
279 | +/// @param N is the number of points used to calculate the field | ||
280 | +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) | ||
281 | +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) | ||
282 | +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) | ||
283 | +/// @param W is an array of planewaves that will be scattered | ||
284 | +/// @param a is the radius of the sphere | ||
285 | +/// @param n is the complex refractive index of the sphere | ||
286 | +template<typename T> | ||
287 | +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ | ||
288 | + | ||
289 | + | ||
290 | +#ifdef CUDA_FOUND | ||
291 | + stim::complex<T>* dev_E; //allocate space for the field | ||
292 | + cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); | ||
293 | + cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); | ||
294 | + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used) | ||
295 | + | ||
296 | + // COORDINATES | ||
297 | + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) | ||
298 | + if(x != NULL){ | ||
299 | + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); | ||
300 | + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
301 | + } | ||
302 | + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) | ||
303 | + if(y != NULL){ | ||
304 | + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); | ||
305 | + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
306 | + } | ||
307 | + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) | ||
308 | + if(z != NULL){ | ||
309 | + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); | ||
310 | + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); | ||
311 | + } | ||
312 | + | ||
313 | + gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, r_spacing); | ||
307 | 314 | ||
308 | if(x != NULL) cudaFree(dev_x); //free everything | 315 | if(x != NULL) cudaFree(dev_x); //free everything |
309 | if(y != NULL) cudaFree(dev_y); | 316 | if(y != NULL) cudaFree(dev_y); |
@@ -311,6 +318,16 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | @@ -311,6 +318,16 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std | ||
311 | cudaFree(dev_E); | 318 | cudaFree(dev_E); |
312 | #else | 319 | #else |
313 | 320 | ||
321 | + //calculate the necessary number of orders required to represent the scattered field | ||
322 | + T k = W[0].kmag(); | ||
323 | + | ||
324 | + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); | ||
325 | + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) | ||
326 | + //std::cout<<"Nl: "<<Nl<<std::endl; | ||
327 | + | ||
328 | + //calculate the scattering coefficients for the sphere | ||
329 | + stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients | ||
330 | + B_coefficients(B, a, k, n, Nl); | ||
314 | 331 | ||
315 | //allocate space to store the bessel function call results | 332 | //allocate space to store the bessel function call results |
316 | double vm; | 333 | double vm; |
@@ -422,16 +439,16 @@ void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -422,16 +439,16 @@ void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
422 | 439 | ||
423 | size_t max_shared_mem = stim::sharedMemPerBlock(); | 440 | size_t max_shared_mem = stim::sharedMemPerBlock(); |
424 | size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); | 441 | size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); |
425 | - std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; | ||
426 | - std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | 442 | + //std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; |
443 | + //std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | ||
427 | int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); | 444 | int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); |
428 | - std::cout<<"threads per block: "<<threads<<std::endl; | 445 | + //std::cout<<"threads per block: "<<threads<<std::endl; |
429 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks | 446 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks |
430 | 447 | ||
431 | size_t shared_mem; | 448 | size_t shared_mem; |
432 | if(Nl <= LOCAL_NL) shared_mem = 0; | 449 | if(Nl <= LOCAL_NL) shared_mem = 0; |
433 | else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate | 450 | else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate |
434 | - std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; | 451 | + //std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; |
435 | cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel | 452 | cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel |
436 | } | 453 | } |
437 | 454 | ||
@@ -452,7 +469,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -452,7 +469,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
452 | 469 | ||
453 | int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); | 470 | int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); |
454 | if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) | 471 | if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) |
455 | - std::cout<<"Nl: "<<Nl<<std::endl; | 472 | + //std::cout<<"Nl: "<<Nl<<std::endl; |
456 | 473 | ||
457 | //calculate the scattering coefficients for the sphere | 474 | //calculate the scattering coefficients for the sphere |
458 | stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients | 475 | stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients |
@@ -537,7 +554,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -537,7 +554,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
537 | size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut; | 554 | size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut; |
538 | stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes); //pointer to the look-up table | 555 | stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes); //pointer to the look-up table |
539 | T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT | 556 | T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT |
540 | - std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl; | 557 | + //std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl; |
541 | stim::complex<T> hl; | 558 | stim::complex<T> hl; |
542 | stim::complex<double> nd = (stim::complex<double>)n; | 559 | stim::complex<double> nd = (stim::complex<double>)n; |
543 | for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT | 560 | for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT |
@@ -559,6 +576,10 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | @@ -559,6 +576,10 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st | ||
559 | if(x != NULL) cudaFree(dev_x); //free everything | 576 | if(x != NULL) cudaFree(dev_x); //free everything |
560 | if(y != NULL) cudaFree(dev_y); | 577 | if(y != NULL) cudaFree(dev_y); |
561 | if(z != NULL) cudaFree(dev_z); | 578 | if(z != NULL) cudaFree(dev_z); |
579 | + HANDLE_ERROR( cudaFree(dev_jA_lut) ); | ||
580 | + HANDLE_ERROR( cudaFree(dev_E) ); | ||
581 | + HANDLE_ERROR( cudaFree(dev_W) ); | ||
582 | + HANDLE_ERROR( cudaFree(dev_r) ); | ||
562 | cudaFree(dev_E); | 583 | cudaFree(dev_E); |
563 | #else | 584 | #else |
564 | 585 | ||
@@ -624,19 +645,44 @@ public: | @@ -624,19 +645,44 @@ public: | ||
624 | n = ri; | 645 | n = ri; |
625 | } | 646 | } |
626 | 647 | ||
648 | + void sum_scat(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){ | ||
649 | + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | ||
650 | + stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | ||
651 | + } | ||
652 | + | ||
653 | + void sum_intern(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){ | ||
654 | + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | ||
655 | + stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | ||
656 | + } | ||
657 | + | ||
658 | + void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ | ||
659 | + b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion | ||
660 | + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | ||
661 | + sum_scat(E, X, Y, Z, b, samples); | ||
662 | + sum_intern(E, X, Y, Z, b, samples); | ||
663 | + } | ||
664 | + | ||
627 | void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ | 665 | void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ |
628 | 666 | ||
629 | - size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid | 667 | + /*size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid |
630 | float* X = (float*) malloc( array_size ); //allocate space for the coordinate meshes | 668 | float* X = (float*) malloc( array_size ); //allocate space for the coordinate meshes |
631 | float* Y = (float*) malloc( array_size ); | 669 | float* Y = (float*) malloc( array_size ); |
632 | float* Z = (float*) malloc( array_size ); | 670 | float* Z = (float*) malloc( array_size ); |
633 | E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes | 671 | E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes |
634 | - | ||
635 | - b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion | 672 | + */ |
673 | + E.meshgrid(); | ||
674 | + b.eval(E, order); | ||
636 | 675 | ||
637 | std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | 676 | std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves |
638 | - stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | ||
639 | - stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | 677 | + |
678 | + if(E.gpu()){ | ||
679 | + stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); | ||
680 | + } | ||
681 | + else{ | ||
682 | + stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); | ||
683 | + stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); | ||
684 | + } | ||
685 | + //eval(E, X, Y, Z, b, order, samples); //evaluate the field | ||
640 | } | 686 | } |
641 | 687 | ||
642 | }; //end stim::scalarmie | 688 | }; //end stim::scalarmie |
stim/visualization/colormap.h
@@ -4,6 +4,7 @@ | @@ -4,6 +4,7 @@ | ||
#include <string>
#include <stdlib.h>
#include <cmath>
#include "cublas_v2.h"
#include <thrust/device_ptr.h>
#include <thrust/extrema.h>
#include <thrust/pair.h>
7 | 8 | ||
8 | #ifdef _WIN32 | 9 | #ifdef _WIN32 |
9 | #include <float.h> | 10 | #include <float.h> |
@@ -219,6 +220,42 @@ static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, u | @@ -219,6 +220,42 @@ static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, u | ||
219 | free(cpuBuffer); | 220 | free(cpuBuffer); |
220 | } | 221 | } |
221 | 222 | ||
/// save a GPU image to a file using automatic scaling
/// The cuBLAS-based version used cublasIsamin/cublasIsamax, which select the
/// element of minimum/maximum *absolute* value (and only support float), so the
/// display range was wrong for data containing negative values. thrust::minmax_element
/// finds the true signed extrema on the device for any element type T.
/// @param gpuSource device pointer to x_size*y_size elements of type T
/// @param fileDest path of the output image file
/// @param x_size width of the image in pixels
/// @param y_size height of the image in pixels
/// @param cm colormap used to map scalar values to colors
template<typename T>
static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, unsigned int y_size, colormapType cm = cmGrayscale){
	size_t N = (size_t)x_size * (size_t)y_size;			//calculate the total number of elements in the image
	if(N == 0) return;									//nothing to save (also avoids dereferencing the end of an empty range below)

	thrust::device_ptr<T> p(gpuSource);					//wrap the raw device pointer for use with thrust algorithms
	thrust::pair< thrust::device_ptr<T>, thrust::device_ptr<T> > mm =
		thrust::minmax_element(p, p + N);				//single pass over the image on the GPU, true signed min/max

	T v_min = *mm.first;								//dereferencing a device_ptr copies the value back to the host
	T v_max = *mm.second;

	gpu2image<T>(gpuSource, fileDest, x_size, y_size, v_min, v_max, cm);	//delegate to the fixed-range overload
}
258 | + | ||
222 | #endif | 259 | #endif |
223 | 260 | ||
224 | template<class T> | 261 | template<class T> |