Commit 4252d8276a60522356cbd52577b4fe5911f7f845
1 parent
b5dd3d3a
ivote3 fixes and new bimsim code
Showing
8 changed files
with
419 additions
and
119 deletions
Show diff stats
1 | + | |
2 | +template<typename T> | |
3 | +__global__ void cuda_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){ | |
4 | + | |
5 | + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate the current working position within the destination image | |
6 | + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; | |
7 | + if(xi >= dx || yi >= dy) return; //if this thread is outside of the destination image, return | |
8 | + | |
9 | + size_t di = yi * dx + xi; //calculate the 1D index into the destination image | |
10 | + size_t si = (y + yi) * sx + (x + xi); //calculate the 1D index into the source image | |
11 | + | |
12 | + dest[di] = src[si]; //copy the corresponding source pixel to the destination image | |
13 | +} | |
14 | + | |
15 | +/// Crops a 2D image composed of elements of type T | |
16 | +/// @param dest is a device pointer to memory of size dx*dy that will store the cropped image | |
17 | +/// @param src is a device pointer to memory of size sx*sy that stores the original image | |
18 | +/// @param sx is the size of the source image along x | |
19 | +/// @param sy is the size of the source image along y | |
20 | +/// @param x is the x-coordinate of the start position of the cropped region within the source image | |
21 | +/// @param y is the y-coordinate of the start position of the cropped region within the source image | |
22 | +/// @param dx is the size of the destination image along x | |
23 | +/// @param dy is the size of the destination image along y | |
24 | +template<typename T> | |
25 | +void gpu_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){ | |
26 | + int max_threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | |
27 | + dim3 threads( sqrt(max_threads), sqrt(max_threads) ); | |
28 | + dim3 blocks( dx / sqrt(threads.x) + 1, dy / sqrt(threads.y) + 1); //calculate the optimal number of blocks | |
29 | + cuda_crop2d<T> <<< blocks, threads >>>(dest, src, sx, sy, x, y, dx, dy); | |
30 | +} | |
0 | 31 | \ No newline at end of file | ... | ... |
stim/math/circle.h
... | ... | @@ -64,7 +64,7 @@ public: |
64 | 64 | init(); |
65 | 65 | } |
66 | 66 | |
67 | - ///create a rectangle from a center point, normal, and size | |
67 | + /*///create a rectangle from a center point, normal, and size | |
68 | 68 | ///@param c: x,y,z location of the center. |
69 | 69 | ///@param s: size of the rectangle. |
70 | 70 | ///@param n: x,y,z direction of the normal. |
... | ... | @@ -76,6 +76,7 @@ public: |
76 | 76 | rotate(n, U, Y); |
77 | 77 | scale(s); |
78 | 78 | } |
79 | + */ | |
79 | 80 | |
80 | 81 | ///create a rectangle from a center point, normal, and size |
81 | 82 | ///@param c: x,y,z location of the center. |
... | ... | @@ -164,12 +165,12 @@ public: |
164 | 165 | ///returns a vector with the points on the initialized circle. |
165 | 166 | ///connecting the points results in a circle. |
166 | 167 | ///@param n: integer for the number of points representing the circle. |
167 | - stim::vec3<T> | |
168 | + CUDA_CALLABLE stim::vec3<T> | |
168 | 169 | p(T theta) |
169 | 170 | { |
170 | 171 | T x,y; |
171 | - y = 0.5*cos(theta*stim::TAU/360.0)+0.5; | |
172 | - x = 0.5*sin(theta*stim::TAU/360.0)+0.5; | |
172 | + y = 0.5*cos(theta*STIM_TAU/360.0)+0.5; | |
173 | + x = 0.5*sin(theta*STIM_TAU/360.0)+0.5; | |
173 | 174 | return p(x,y); |
174 | 175 | } |
175 | 176 | ... | ... |
stim/math/constants.h
1 | 1 | #ifndef STIM_CONSTANTS_H |
2 | 2 | #define STIM_CONSTANTS_H |
3 | 3 | |
4 | +#define STIM_PI 3.1415926535897932384626433832795028841971693993751058209749445923078164062862 | |
5 | +#define STIM_TAU 2 * STIM_PI | |
6 | + | |
4 | 7 | #include "stim/cuda/cudatools/callable.h" |
5 | 8 | namespace stim{ |
6 | 9 | const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862; | ... | ... |
stim/math/vec3.h
... | ... | @@ -216,7 +216,7 @@ public: |
216 | 216 | return result; |
217 | 217 | } |
218 | 218 | |
219 | - | |
219 | +#ifndef __NVCC__ | |
220 | 220 | /// Outputs the vector as a string |
221 | 221 | std::string str() const{ |
222 | 222 | std::stringstream ss; |
... | ... | @@ -234,6 +234,7 @@ public: |
234 | 234 | |
235 | 235 | return ss.str(); |
236 | 236 | } |
237 | +#endif | |
237 | 238 | |
238 | 239 | size_t size(){ return 3; } |
239 | 240 | ... | ... |
stim/optics/scalarbeam.h
... | ... | @@ -193,6 +193,7 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, |
193 | 193 | printf ("CUBLAS Error: failed to calculate maximum r value.\n"); |
194 | 194 | exit(1); |
195 | 195 | } |
196 | + cublasDestroy(handle); | |
196 | 197 | |
197 | 198 | i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility |
198 | 199 | i_max--; |
... | ... | @@ -212,8 +213,9 @@ void gpu_scalar_psf_local(stim::complex<T>* E, size_t N, T* r, T* phi, T lambda, |
212 | 213 | T* j_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table |
213 | 214 | T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT |
214 | 215 | T jl; |
216 | + unsigned l; | |
215 | 217 | for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT |
216 | - for(unsigned l = 0; l <= (unsigned)Nl; l++){ //for each order | |
218 | + for(l = 0; l <= (unsigned)Nl; l++){ //for each order | |
217 | 219 | jl = boost::math::sph_bessel<T>(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function |
218 | 220 | j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result |
219 | 221 | } |
... | ... | @@ -323,6 +325,9 @@ void gpu_scalar_psf_cart(stim::complex<T>* E, size_t N, T* x, T* y, T* z, T lamb |
323 | 325 | |
324 | 326 | gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing); |
325 | 327 | |
328 | + HANDLE_ERROR( cudaFree(gpu_r) ); | |
329 | + HANDLE_ERROR( cudaFree(gpu_phi) ); | |
330 | + | |
326 | 331 | } |
327 | 332 | #endif |
328 | 333 | |
... | ... | @@ -449,17 +454,12 @@ public: |
449 | 454 | |
450 | 455 | /// Evaluate the beam to a scalar field using Debye focusing |
451 | 456 | void eval(stim::scalarfield<T>& E, int order = 500){ |
452 | - size_t array_size = E.grid_bytes(); | |
453 | - T* X = (T*) malloc( array_size ); //allocate space for the coordinate meshes | |
454 | - T* Y = (T*) malloc( array_size ); | |
455 | - T* Z = (T*) malloc( array_size ); | |
456 | - | |
457 | - E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes | |
458 | - eval(E, X, Y, Z, order); | |
459 | - | |
460 | - free(X); //free the coordinate meshes | |
461 | - free(Y); | |
462 | - free(Z); | |
457 | + E.meshgrid(); //calculate a meshgrid if one isn't created | |
458 | + if(E.gpu()) | |
459 | + gpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); | |
460 | + else | |
461 | + cpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); | |
462 | + //eval(E, E.x(), E.y(), E.z(), order); | |
463 | 463 | } |
464 | 464 | |
465 | 465 | /// Calculate the field at a given point | ... | ... |
stim/optics/scalarfield.h
... | ... | @@ -6,8 +6,52 @@ |
6 | 6 | #include "../math/complex.h" |
7 | 7 | #include "../math/fft.h" |
8 | 8 | |
9 | +#ifdef CUDA_FOUND | |
10 | +#include "../cuda/crop.cuh" | |
11 | +#endif | |
12 | + | |
9 | 13 | namespace stim{ |
10 | 14 | |
15 | + template<typename T> | |
16 | + __global__ void cuda_abs(T* img, stim::complex<T>* field, size_t N){ | |
17 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | |
18 | + if(i >= N) return; | |
19 | + | |
20 | + img[i] = field[i].abs(); | |
21 | + } | |
22 | + | |
23 | + template<typename T> | |
24 | + __global__ void cuda_real(T* img, stim::complex<T>* field, size_t N){ | |
25 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | |
26 | + if(i >= N) return; | |
27 | + | |
28 | + img[i] = field[i].real(); | |
29 | + } | |
30 | + | |
31 | + template<typename T> | |
32 | + __global__ void cuda_imag(T* img, stim::complex<T>* field, size_t N){ | |
33 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | |
34 | + if(i >= N) return; | |
35 | + | |
36 | + img[i] = field[i].imag(); | |
37 | + } | |
38 | + | |
39 | + template<typename T> | |
40 | + __global__ void cuda_intensity(T* img, stim::complex<T>* field, size_t N){ | |
41 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | |
42 | + if(i >= N) return; | |
43 | + | |
44 | + img[i] = pow(field[i].abs(), 2); | |
45 | + } | |
46 | + | |
47 | + template<typename T> | |
48 | + __global__ void cuda_sum_intensity(T* img, stim::complex<T>* field, size_t N){ | |
49 | + size_t i = blockIdx.x * blockDim.x + threadIdx.x; | |
50 | + if(i >= N) return; | |
51 | + | |
52 | + img[i] += pow(field[i].abs(), 2); | |
53 | + } | |
54 | + | |
11 | 55 | /// Perform a k-space transform of a scalar field (FFT). The given field has a width of x and the calculated momentum space has a |
12 | 56 | /// width of kx (in radians). |
13 | 57 | /// @param K is a pointer to the output array of all plane waves in the field |
... | ... | @@ -44,12 +88,16 @@ namespace stim{ |
44 | 88 | std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<<std::endl; |
45 | 89 | exit(1); |
46 | 90 | } |
91 | + cufftDestroy(plan); | |
47 | 92 | |
48 | 93 | stim::complex<T>* fft = (stim::complex<T>*) malloc(sizeof(stim::complex<T>) * nx * ny); |
49 | 94 | HANDLE_ERROR( cudaMemcpy(fft, dev_FFT, sizeof(stim::complex<T>) * nx * ny, cudaMemcpyDeviceToHost) ); |
50 | 95 | |
51 | 96 | stim::cpu_fftshift(K, fft, nx, ny); |
52 | - //memcpy(K, fft, sizeof(stim::complex<T>) * nx * ny); | |
97 | + | |
98 | + HANDLE_ERROR( cudaFree(dev_FFT) ); //free GPU memory | |
99 | + HANDLE_ERROR( cudaFree(dev_E) ); | |
100 | + free(fft); //free CPU memory | |
53 | 101 | } |
54 | 102 | |
55 | 103 | template<typename T> |
... | ... | @@ -82,12 +130,18 @@ namespace stim{ |
82 | 130 | std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<<std::endl; |
83 | 131 | exit(1); |
84 | 132 | } |
133 | + cufftDestroy(plan); | |
85 | 134 | |
86 | 135 | HANDLE_ERROR( cudaMemcpy(E, dev_E, sizeof(stim::complex<T>) * nx * ny, cudaMemcpyDeviceToHost) ); |
87 | 136 | |
137 | + HANDLE_ERROR( cudaFree(dev_FFT) ); //free GPU memory | |
138 | + HANDLE_ERROR( cudaFree(dev_E) ); | |
139 | + free(fft); //free CPU memory | |
88 | 140 | |
89 | 141 | } |
90 | 142 | |
143 | + | |
144 | + | |
91 | 145 | /// Propagate a field slice along its orthogonal direction by a given distance z |
92 | 146 | /// @param Enew is the resulting propogated field |
93 | 147 | /// @param E is the field to be propogated |
... | ... | @@ -105,9 +159,9 @@ namespace stim{ |
105 | 159 | T Kx, Ky; //width and height in k space |
106 | 160 | cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); |
107 | 161 | |
108 | - T* mag = (T*) malloc( sizeof(T) * nx * ny ); | |
109 | - stim::abs(mag, K, nx * ny); | |
110 | - stim::cpu2image<float>(mag, "kspace_pre_shift.bmp", nx, ny, stim::cmBrewer); | |
162 | + //T* mag = (T*) malloc( sizeof(T) * nx * ny ); | |
163 | + //stim::abs(mag, K, nx * ny); | |
164 | + //stim::cpu2image<float>(mag, "kspace_pre_shift.bmp", nx, ny, stim::cmBrewer); | |
111 | 165 | |
112 | 166 | size_t kxi, kyi; |
113 | 167 | size_t i; |
... | ... | @@ -143,10 +197,27 @@ namespace stim{ |
143 | 197 | } |
144 | 198 | } |
145 | 199 | |
146 | - stim::abs(mag, K, nx * ny); | |
147 | - stim::cpu2image<float>(mag, "kspace_post_shift.bmp", nx, ny, stim::cmBrewer); | |
200 | + //stim::abs(mag, K, nx * ny); | |
201 | + //stim::cpu2image<float>(mag, "kspace_post_shift.bmp", nx, ny, stim::cmBrewer); | |
148 | 202 | |
149 | 203 | cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); |
204 | + free(K); | |
205 | + } | |
206 | + | |
207 | + template<typename T> | |
208 | + void gpu_scalar_propagate(stim::complex<T>* Enew, stim::complex<T>* E, T sx, T sy, T z, T k, size_t nx, size_t ny){ | |
209 | + | |
210 | + size_t field_bytes = sizeof(stim::complex<T>) * nx * ny; | |
211 | + stim::complex<T>* host_E = (stim::complex<T>*) malloc( field_bytes); | |
212 | + HANDLE_ERROR( cudaMemcpy(host_E, E, field_bytes, cudaMemcpyDeviceToHost) ); | |
213 | + | |
214 | + stim::complex<T>* host_Enew = (stim::complex<T>*) malloc(field_bytes); | |
215 | + | |
216 | + cpu_scalar_propagate(host_Enew, host_E, sx, sy, z, k, nx, ny); | |
217 | + | |
218 | + HANDLE_ERROR( cudaMemcpy(Enew, host_Enew, field_bytes, cudaMemcpyHostToDevice) ); | |
219 | + free(host_E); | |
220 | + free(host_Enew); | |
150 | 221 | } |
151 | 222 | |
152 | 223 | /// Apply a lowpass filter to a field slice |
... | ... | @@ -165,9 +236,9 @@ namespace stim{ |
165 | 236 | T Kx, Ky; //width and height in k space |
166 | 237 | cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); |
167 | 238 | |
168 | - T* mag = (T*) malloc( sizeof(T) * nx * ny ); | |
169 | - stim::abs(mag, K, nx * ny); | |
170 | - stim::cpu2image<float>(mag, "kspace_pre_lowpass.bmp", nx, ny, stim::cmBrewer); | |
239 | + //T* mag = (T*) malloc( sizeof(T) * nx * ny ); | |
240 | + //stim::abs(mag, K, nx * ny); | |
241 | + //stim::cpu2image<float>(mag, "kspace_pre_lowpass.bmp", nx, ny, stim::cmBrewer); | |
171 | 242 | |
172 | 243 | size_t kxi, kyi; |
173 | 244 | size_t i; |
... | ... | @@ -200,10 +271,11 @@ namespace stim{ |
200 | 271 | } |
201 | 272 | } |
202 | 273 | |
203 | - stim::abs(mag, K, nx * ny); | |
204 | - stim::cpu2image<float>(mag, "kspace_post_lowpass.bmp", nx, ny, stim::cmBrewer); | |
274 | + //stim::abs(mag, K, nx * ny); | |
275 | + //stim::cpu2image<float>(mag, "kspace_post_lowpass.bmp", nx, ny, stim::cmBrewer); | |
205 | 276 | |
206 | 277 | cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); |
278 | + free(K); | |
207 | 279 | } |
208 | 280 | |
209 | 281 | enum locationType {CPUmem, GPUmem}; |
... | ... | @@ -222,6 +294,8 @@ protected: |
222 | 294 | size_t R[2]; |
223 | 295 | locationType loc; |
224 | 296 | |
297 | + T* p[3]; //scalar position for each point in E | |
298 | + | |
225 | 299 | /// Convert the field to a k-space representation (do an FFT) |
226 | 300 | void to_kspace(T& kx, T& ky){ |
227 | 301 | cpu_scalar_to_kspace(E, kx, ky, E, X.len(), Y.len(), R[0], R[1]); |
... | ... | @@ -252,6 +326,9 @@ public: |
252 | 326 | E = (stim::complex<T>*) malloc(grid_bytes()); //allocate in CPU memory |
253 | 327 | memset(E, 0, grid_bytes()); |
254 | 328 | loc = CPUmem; |
329 | + | |
330 | + p[0] = p[1] = p[2] = NULL; //set the position vector to NULL | |
331 | + | |
255 | 332 | } |
256 | 333 | |
257 | 334 | ~scalarfield(){ |
... | ... | @@ -271,11 +348,35 @@ public: |
271 | 348 | if(loc == GPUmem) return; |
272 | 349 | else{ |
273 | 350 | stim::complex<T>* dev_E; |
274 | - HANDLE_ERROR( cudaMalloc(&dev_E, e_bytes()) ); //allocate GPU memory | |
275 | - HANDLE_ERROR( cudaMemcpy(dev_E, E, e_bytes(), cudaMemcpyHostToDevice) ); //copy the field to the GPU | |
351 | + HANDLE_ERROR( cudaMalloc(&dev_E, grid_bytes()) ); //allocate GPU memory | |
352 | + HANDLE_ERROR( cudaMemcpy(dev_E, E, grid_bytes(), cudaMemcpyHostToDevice) ); //copy the field to the GPU | |
276 | 353 | free(E); //free the CPU memory |
277 | 354 | E = dev_E; //swap pointers |
355 | + | |
356 | + if(p[0]){ | |
357 | + size_t meshgrid_bytes = size() * sizeof(T); //calculate the number of bytes in each meshgrid | |
358 | + T* dev_X; //allocate variables to store the device meshgrid | |
359 | + T* dev_Y; | |
360 | + T* dev_Z; | |
361 | + HANDLE_ERROR( cudaMalloc(&dev_X, meshgrid_bytes) ); //allocate space for the meshgrid on the device | |
362 | + HANDLE_ERROR( cudaMalloc(&dev_Y, meshgrid_bytes) ); | |
363 | + HANDLE_ERROR( cudaMalloc(&dev_Z, meshgrid_bytes) ); | |
364 | + | |
365 | + HANDLE_ERROR( cudaMemcpy(dev_X, p[0], meshgrid_bytes, cudaMemcpyHostToDevice) ); //copy from the host to the device | |
366 | + HANDLE_ERROR( cudaMemcpy(dev_Y, p[1], meshgrid_bytes, cudaMemcpyHostToDevice) ); | |
367 | + HANDLE_ERROR( cudaMemcpy(dev_Z, p[2], meshgrid_bytes, cudaMemcpyHostToDevice) ); | |
368 | + | |
369 | + free(p[0]); //free device memory | |
370 | + free(p[1]); | |
371 | + free(p[2]); | |
372 | + | |
373 | + p[0] = dev_X; //swap in the new pointers to device memory | |
374 | + p[1] = dev_Y; | |
375 | + p[2] = dev_Z; | |
376 | + } | |
377 | + loc = GPUmem; //set the location flag | |
278 | 378 | } |
379 | + | |
279 | 380 | } |
280 | 381 | |
281 | 382 | /// Copy the field array to the CPU, if it isn't already there |
... | ... | @@ -286,14 +387,43 @@ public: |
286 | 387 | HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU |
287 | 388 | HANDLE_ERROR( cudaFree(E) ); //free device memory |
288 | 389 | E = host_E; //swap pointers |
390 | + | |
391 | + //copy a meshgrid has been created | |
392 | + if(p[0]){ | |
393 | + size_t meshgrid_bytes = size() * sizeof(T); //move X to the CPU | |
394 | + T* host_X = (T*) malloc( meshgrid_bytes ); | |
395 | + T* host_Y = (T*) malloc( meshgrid_bytes ); | |
396 | + T* host_Z = (T*) malloc( meshgrid_bytes ); | |
397 | + | |
398 | + HANDLE_ERROR( cudaMemcpy(host_X, p[0], meshgrid_bytes, cudaMemcpyDeviceToHost) ); | |
399 | + HANDLE_ERROR( cudaMemcpy(host_Y, p[1], meshgrid_bytes, cudaMemcpyDeviceToHost) ); | |
400 | + HANDLE_ERROR( cudaMemcpy(host_Z, p[2], meshgrid_bytes, cudaMemcpyDeviceToHost) ); | |
401 | + | |
402 | + HANDLE_ERROR( cudaFree(p[0]) ); | |
403 | + HANDLE_ERROR( cudaFree(p[1]) ); | |
404 | + HANDLE_ERROR( cudaFree(p[2]) ); | |
405 | + | |
406 | + p[0] = host_X; | |
407 | + p[1] = host_Y; | |
408 | + p[2] = host_Z; | |
409 | + } | |
410 | + loc = CPUmem; | |
289 | 411 | } |
290 | 412 | } |
291 | 413 | |
414 | + bool gpu(){ | |
415 | + if(loc == GPUmem) return true; | |
416 | + else return false; | |
417 | + } | |
418 | + | |
292 | 419 | /// Propagate the field along its orthogonal direction by a distance d |
293 | 420 | void propagate(T d, T k){ |
294 | - //X[2] += d; //move the plane position along the propagation direction | |
295 | - //Y[2] += d; | |
296 | - cpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); | |
421 | + if(loc == CPUmem){ | |
422 | + cpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); | |
423 | + } | |
424 | + else{ | |
425 | + gpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); | |
426 | + } | |
297 | 427 | } |
298 | 428 | |
299 | 429 | /// Propagate the field along its orthogonal direction by a distance d |
... | ... | @@ -312,6 +442,7 @@ public: |
312 | 442 | } |
313 | 443 | |
314 | 444 | if(loc == CPUmem){ |
445 | + cropped.to_cpu(); //make sure that the cropped image is on the CPU | |
315 | 446 | size_t x, y; |
316 | 447 | size_t sx, sy, si, di; |
317 | 448 | for(y = 0; y < Cy; y++){ |
... | ... | @@ -325,8 +456,8 @@ public: |
325 | 456 | } |
326 | 457 | } |
327 | 458 | else{ |
328 | - std::cout<<"Error: cropping not supported for GPU memory yet."<<std::endl; | |
329 | - exit(1); | |
459 | + cropped.to_gpu(); //make sure that the cropped image is also on the GPU | |
460 | + gpu_crop2d<stim::complex<T>>(cropped.E, E, R[0], R[1], Cx * padding, Cy * padding, Cx, Cy); | |
330 | 461 | } |
331 | 462 | } |
332 | 463 | |
... | ... | @@ -346,6 +477,10 @@ public: |
346 | 477 | return E; |
347 | 478 | } |
348 | 479 | |
480 | + T* x(){ return p[0]; } | |
481 | + T* y(){ return p[1]; } | |
482 | + T* z(){ return p[2]; } | |
483 | + | |
349 | 484 | /// Evaluate the cartesian coordinates of each point in the field. The resulting arrays are allocated in the same memory where the field is stored. |
350 | 485 | void meshgrid(T* X, T* Y, T* Z, locationType location){ |
351 | 486 | //size_t array_size = sizeof(T) * R[0] * R[1]; |
... | ... | @@ -376,6 +511,21 @@ public: |
376 | 511 | } |
377 | 512 | } |
378 | 513 | |
514 | + /// Create a local meshgrid | |
515 | + void meshgrid(){ | |
516 | + if(p[0]) return; //if the p[0] value is not NULL, a meshgrid has already been created | |
517 | + if(loc == CPUmem){ | |
518 | + p[0] = (T*) malloc( size() * sizeof(T) ); | |
519 | + p[1] = (T*) malloc( size() * sizeof(T) ); | |
520 | + p[2] = (T*) malloc( size() * sizeof(T) ); | |
521 | + } | |
522 | + else{ | |
523 | + std::cout<<"GPUmem meshgrid isn't implemented yet."<<std::endl; | |
524 | + exit(1); | |
525 | + } | |
526 | + meshgrid(p[0], p[1], p[2], loc); | |
527 | + } | |
528 | + | |
379 | 529 | //clear the field, setting all values to zero |
380 | 530 | void clear(){ |
381 | 531 | if(loc == GPUmem) |
... | ... | @@ -386,25 +536,49 @@ public: |
386 | 536 | |
387 | 537 | void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer){ |
388 | 538 | |
389 | - if(loc == GPUmem) to_cpu(); //if the field is in the GPU, move it to the CPU | |
390 | - T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image | |
391 | - | |
392 | - switch(type){ //get the specified component from the complex value | |
393 | - case complexMag: | |
394 | - stim::abs(image, E, size()); | |
395 | - break; | |
396 | - case complexReal: | |
397 | - stim::real(image, E, size()); | |
398 | - break; | |
399 | - case complexImaginary: | |
400 | - stim::imag(image, E, size()); | |
401 | - break; | |
402 | - case complexIntensity: | |
403 | - stim::intensity(image, E, size()); | |
404 | - break; | |
539 | + if(loc == GPUmem){ | |
540 | + T* image; | |
541 | + HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) ); | |
542 | + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | |
543 | + dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks | |
544 | + | |
545 | + switch(type){ | |
546 | + case complexMag: | |
547 | + cuda_abs<T><<< blocks, threads >>>(image, E, size()); | |
548 | + break; | |
549 | + case complexReal: | |
550 | + cuda_real<T><<< blocks, threads >>>(image, E, size()); | |
551 | + break; | |
552 | + case complexImaginary: | |
553 | + cuda_imag<T><<< blocks, threads >>>(image, E, size()); | |
554 | + break; | |
555 | + case complexIntensity: | |
556 | + cuda_intensity<T><<< blocks, threads >>>(image, E, size()); | |
557 | + break; | |
558 | + } | |
559 | + stim::gpu2image<T>(image, filename, R[0], R[1], stim::cmBrewer); | |
560 | + HANDLE_ERROR( cudaFree(image) ); | |
561 | + } | |
562 | + else{ | |
563 | + T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image | |
564 | + | |
565 | + switch(type){ //get the specified component from the complex value | |
566 | + case complexMag: | |
567 | + stim::abs(image, E, size()); | |
568 | + break; | |
569 | + case complexReal: | |
570 | + stim::real(image, E, size()); | |
571 | + break; | |
572 | + case complexImaginary: | |
573 | + stim::imag(image, E, size()); | |
574 | + break; | |
575 | + case complexIntensity: | |
576 | + stim::intensity(image, E, size()); | |
577 | + break; | |
578 | + } | |
579 | + stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image | |
580 | + free(image); //free the real image | |
405 | 581 | } |
406 | - stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image | |
407 | - free(image); //free the real image | |
408 | 582 | } |
409 | 583 | |
410 | 584 | void image(T* img, stim::complexComponentType type = complexMag){ |
... | ... | @@ -430,15 +604,23 @@ public: |
430 | 604 | |
431 | 605 | //adds the field intensity to the output array (useful for calculating detector response to incoherent fields) |
432 | 606 | void intensity(T* out){ |
433 | - if(loc == GPUmem) to_cpu(); //if the field is in the GPU, move it to the CPU | |
434 | - T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image | |
435 | - stim::intensity(image, E, size()); //calculate the intensity | |
607 | + if(loc == GPUmem){ | |
608 | + //T* image; | |
609 | + //HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) ); | |
610 | + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device | |
611 | + dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks | |
612 | + cuda_sum_intensity<T><<< blocks, threads >>>(out, E, size()); | |
613 | + } | |
614 | + else{ | |
615 | + T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image | |
616 | + stim::intensity(image, E, size()); //calculate the intensity | |
436 | 617 | |
437 | - size_t N = size(); //calculate the number of elements in the field | |
438 | - for(size_t n = 0; n < N; n++) //for each point in the field | |
439 | - out[n] += image[n]; //add the field intensity to the output image | |
618 | + size_t N = size(); //calculate the number of elements in the field | |
619 | + for(size_t n = 0; n < N; n++) //for each point in the field | |
620 | + out[n] += image[n]; //add the field intensity to the output image | |
440 | 621 | |
441 | - free(image); //free the temporary intensity image | |
622 | + free(image); //free the temporary intensity image | |
623 | + } | |
442 | 624 | } |
443 | 625 | |
444 | 626 | }; //end class scalarfield | ... | ... |
stim/optics/scalarmie.h
... | ... | @@ -149,16 +149,16 @@ void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, sti |
149 | 149 | |
150 | 150 | size_t max_shared_mem = stim::sharedMemPerBlock(); |
151 | 151 | size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); |
152 | - std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; | |
153 | - std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | |
152 | + //std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; | |
153 | + //std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | |
154 | 154 | int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); |
155 | - std::cout<<"threads per block: "<<threads<<std::endl; | |
155 | + //std::cout<<"threads per block: "<<threads<<std::endl; | |
156 | 156 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks |
157 | 157 | |
158 | 158 | size_t shared_mem; |
159 | 159 | if(Nl <= LOCAL_NL) shared_mem = 0; |
160 | 160 | else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate |
161 | - std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; | |
161 | + //std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; | |
162 | 162 | cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel |
163 | 163 | } |
164 | 164 | |
... | ... | @@ -174,52 +174,20 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ |
174 | 174 | |
175 | 175 | r[i] = p.len(); |
176 | 176 | } |
177 | -/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave | |
178 | - | |
179 | -/// @param E is a pointer to the destination field values | |
180 | -/// @param N is the number of points used to calculate the field | |
181 | -/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) | |
182 | -/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) | |
183 | -/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) | |
184 | -/// @param W is an array of planewaves that will be scattered | |
185 | -/// @param a is the radius of the sphere | |
186 | -/// @param n is the complex refractive index of the sphere | |
187 | 177 | template<typename T> |
188 | -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ | |
178 | +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ | |
179 | + | |
189 | 180 | //calculate the necessary number of orders required to represent the scattered field |
190 | 181 | T k = W[0].kmag(); |
191 | 182 | |
192 | 183 | int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); |
193 | 184 | if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) |
194 | - std::cout<<"Nl: "<<Nl<<std::endl; | |
185 | + //std::cout<<"Nl: "<<Nl<<std::endl; | |
195 | 186 | |
196 | 187 | //calculate the scattering coefficients for the sphere |
197 | 188 | stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients |
198 | - B_coefficients(B, a, k, n, Nl); | |
199 | - | |
200 | -#ifdef CUDA_FOUND | |
201 | - stim::complex<T>* dev_E; //allocate space for the field | |
202 | - cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); | |
203 | - cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); | |
204 | - //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used) | |
205 | - | |
206 | - // COORDINATES | |
207 | - T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) | |
208 | - if(x != NULL){ | |
209 | - HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); | |
210 | - HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); | |
211 | - } | |
212 | - T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) | |
213 | - if(y != NULL){ | |
214 | - HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); | |
215 | - HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); | |
216 | - } | |
217 | - T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) | |
218 | - if(z != NULL){ | |
219 | - HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); | |
220 | - HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); | |
221 | - } | |
222 | - | |
189 | + B_coefficients(B, a, k, n, Nl); | |
190 | + | |
223 | 191 | // PLANE WAVES |
224 | 192 | stim::scalarwave<T>* dev_W; //allocate space and copy plane waves |
225 | 193 | HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) ); |
... | ... | @@ -232,7 +200,7 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std |
232 | 200 | |
233 | 201 | int threads = stim::maxThreadsPerBlock(); |
234 | 202 | dim3 blocks((unsigned)(N / threads + 1)); |
235 | - cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); | |
203 | + cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N); | |
236 | 204 | |
237 | 205 | //Find the minimum and maximum values of r |
238 | 206 | cublasStatus_t stat; |
... | ... | @@ -280,7 +248,7 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std |
280 | 248 | size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut; |
281 | 249 | stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table |
282 | 250 | T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT |
283 | - std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl; | |
251 | + //std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl; | |
284 | 252 | stim::complex<T> hl; |
285 | 253 | for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT |
286 | 254 | stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] |
... | ... | @@ -292,18 +260,57 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std |
292 | 260 | //std::cout<<hB_lut[ri * (Nl + 1) + l]<<std::endl; |
293 | 261 | } |
294 | 262 | } |
295 | - T* real_lut = (T*) malloc(hB_bytes/2); | |
296 | - stim::real(real_lut, hB_lut, N_hB_lut); | |
297 | - stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); | |
263 | + //T* real_lut = (T*) malloc(hB_bytes/2); | |
264 | + //stim::real(real_lut, hB_lut, N_hB_lut); | |
265 | + //stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); | |
298 | 266 | |
299 | 267 | //Allocate device memory and copy everything to the GPU |
300 | 268 | stim::complex<T>* dev_hB_lut; |
301 | 269 | HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); |
302 | 270 | HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); |
303 | 271 | |
304 | - gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); | |
272 | + gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); | |
305 | 273 | |
306 | - cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory | |
274 | + cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory | |
275 | +} | |
276 | +/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave | |
277 | + | |
278 | +/// @param E is a pointer to the destination field values | |
279 | +/// @param N is the number of points used to calculate the field | |
280 | +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) | |
281 | +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) | |
282 | +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) | |
283 | +/// @param W is an array of planewaves that will be scattered | |
284 | +/// @param a is the radius of the sphere | |
285 | +/// @param n is the complex refractive index of the sphere | |
286 | +template<typename T> | |
287 | +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){ | |
288 | + | |
289 | + | |
290 | +#ifdef CUDA_FOUND | |
291 | + stim::complex<T>* dev_E; //allocate space for the field | |
292 | + cudaMalloc(&dev_E, N * sizeof(stim::complex<T>)); | |
293 | + cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice); | |
294 | + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used) | |
295 | + | |
296 | + // COORDINATES | |
297 | + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) | |
298 | + if(x != NULL){ | |
299 | + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); | |
300 | + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); | |
301 | + } | |
302 | + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) | |
303 | + if(y != NULL){ | |
304 | + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); | |
305 | + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); | |
306 | + } | |
307 | + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) | |
308 | + if(z != NULL){ | |
309 | + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); | |
310 | + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); | |
311 | + } | |
312 | + | |
313 | + gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, r_spacing); | |
307 | 314 | |
308 | 315 | if(x != NULL) cudaFree(dev_x); //free everything |
309 | 316 | if(y != NULL) cudaFree(dev_y); |
... | ... | @@ -311,6 +318,16 @@ void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std |
311 | 318 | cudaFree(dev_E); |
312 | 319 | #else |
313 | 320 | |
321 | + //calculate the necessary number of orders required to represent the scattered field | |
322 | + T k = W[0].kmag(); | |
323 | + | |
324 | + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); | |
325 | + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) | |
326 | + //std::cout<<"Nl: "<<Nl<<std::endl; | |
327 | + | |
328 | + //calculate the scattering coefficients for the sphere | |
329 | + stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients | |
330 | + B_coefficients(B, a, k, n, Nl); | |
314 | 331 | |
315 | 332 | //allocate space to store the bessel function call results |
316 | 333 | double vm; |
... | ... | @@ -422,16 +439,16 @@ void gpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st |
422 | 439 | |
423 | 440 | size_t max_shared_mem = stim::sharedMemPerBlock(); |
424 | 441 | size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1); |
425 | - std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; | |
426 | - std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | |
442 | + //std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl; | |
443 | + //std::cout<<"shared memory: "<<max_shared_mem<<std::endl; | |
427 | 444 | int threads = (int)((max_shared_mem / hBl_array) / 32 * 32); |
428 | - std::cout<<"threads per block: "<<threads<<std::endl; | |
445 | + //std::cout<<"threads per block: "<<threads<<std::endl; | |
429 | 446 | dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks |
430 | 447 | |
431 | 448 | size_t shared_mem; |
432 | 449 | if(Nl <= LOCAL_NL) shared_mem = 0; |
433 | 450 | else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate |
434 | - std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; | |
451 | + //std::cout<<"shared memory allocated: "<<shared_mem<<std::endl; | |
435 | 452 | cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel |
436 | 453 | } |
437 | 454 | |
... | ... | @@ -452,7 +469,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st |
452 | 469 | |
453 | 470 | int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); |
454 | 471 | if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) |
455 | - std::cout<<"Nl: "<<Nl<<std::endl; | |
472 | + //std::cout<<"Nl: "<<Nl<<std::endl; | |
456 | 473 | |
457 | 474 | //calculate the scattering coefficients for the sphere |
458 | 475 | stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients |
... | ... | @@ -537,7 +554,7 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st |
537 | 554 | size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut; |
538 | 555 | stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes); //pointer to the look-up table |
539 | 556 | T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT |
540 | - std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl; | |
557 | + //std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl; | |
541 | 558 | stim::complex<T> hl; |
542 | 559 | stim::complex<double> nd = (stim::complex<double>)n; |
543 | 560 | for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT |
... | ... | @@ -559,6 +576,10 @@ void cpu_scalar_mie_internal(stim::complex<T>* E, size_t N, T* x, T* y, T* z, st |
559 | 576 | if(x != NULL) cudaFree(dev_x); //free everything |
560 | 577 | if(y != NULL) cudaFree(dev_y); |
561 | 578 | if(z != NULL) cudaFree(dev_z); |
579 | + HANDLE_ERROR( cudaFree(dev_jA_lut) ); | |
580 | + HANDLE_ERROR( cudaFree(dev_E) ); | |
581 | + HANDLE_ERROR( cudaFree(dev_W) ); | |
582 | + HANDLE_ERROR( cudaFree(dev_r) ); | |
562 | 583 | cudaFree(dev_E); |
563 | 584 | #else |
564 | 585 | |
... | ... | @@ -624,19 +645,44 @@ public: |
624 | 645 | n = ri; |
625 | 646 | } |
626 | 647 | |
648 | + void sum_scat(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){ | |
649 | + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | |
650 | + stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | |
651 | + } | |
652 | + | |
653 | + void sum_intern(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){ | |
654 | + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | |
655 | + stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | |
656 | + } | |
657 | + | |
658 | + void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ | |
659 | + b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion | |
660 | + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves | |
661 | + sum_scat(E, X, Y, Z, b, samples); | |
662 | + sum_intern(E, X, Y, Z, b, samples); | |
663 | + } | |
664 | + | |
627 | 665 | void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){ |
628 | 666 | |
629 | - size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid | |
667 | + /*size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid | |
630 | 668 | float* X = (float*) malloc( array_size ); //allocate space for the coordinate meshes |
631 | 669 | float* Y = (float*) malloc( array_size ); |
632 | 670 | float* Z = (float*) malloc( array_size ); |
633 | 671 | E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes |
634 | - | |
635 | - b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion | |
672 | + */ | |
673 | + E.meshgrid(); | |
674 | + b.eval(E, order); | |
636 | 675 | |
637 | 676 | std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves |
638 | - stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | |
639 | - stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); | |
677 | + | |
678 | + if(E.gpu()){ | |
679 | + stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); | |
680 | + } | |
681 | + else{ | |
682 | + stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); | |
683 | + stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); | |
684 | + } | |
685 | + //eval(E, X, Y, Z, b, order, samples); //evaluate the field | |
640 | 686 | } |
641 | 687 | |
642 | 688 | }; //end stim::scalarmie | ... | ... |
stim/visualization/colormap.h
... | ... | @@ -4,6 +4,7 @@ |
4 | 4 | #include <string> |
5 | 5 | #include <stdlib.h> |
6 | 6 | #include <cmath> |
7 | +#include "cublas_v2.h" | |
7 | 8 | |
8 | 9 | #ifdef _WIN32 |
9 | 10 | #include <float.h> |
... | ... | @@ -219,6 +220,42 @@ static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, u |
219 | 220 | free(cpuBuffer); |
220 | 221 | } |
221 | 222 | |
223 | +/// save a GPU image to a file using automatic scaling | |
224 | +template<typename T> | |
225 | +static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, unsigned int y_size, colormapType cm = cmGrayscale){ | |
226 | + size_t N = x_size * y_size; //calculate the total number of elements in the image | |
227 | + | |
228 | + cublasStatus_t stat; | |
229 | + cublasHandle_t handle; | |
230 | + | |
231 | + stat = cublasCreate(&handle); //create a cuBLAS handle | |
232 | + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure | |
233 | + printf ("CUBLAS initialization failed\n"); | |
234 | + exit(1); | |
235 | + } | |
236 | + | |
237 | + int i_min, i_max; | |
238 | + stat = cublasIsamin(handle, (int)N, gpuSource, 1, &i_min); | |
239 | + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure | |
240 | + printf ("CUBLAS Error: failed to calculate minimum r value.\n"); | |
241 | + exit(1); | |
242 | + } | |
243 | + stat = cublasIsamax(handle, (int)N, gpuSource, 1, &i_max); | |
244 | + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure | |
245 | + printf ("CUBLAS Error: failed to calculate maximum r value.\n"); | |
246 | + exit(1); | |
247 | + } | |
248 | + cublasDestroy(handle); | |
249 | + | |
250 | + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility | |
251 | + i_max--; | |
252 | + T v_min, v_max; //allocate space to store the minimum and maximum values | |
253 | + HANDLE_ERROR( cudaMemcpy(&v_min, gpuSource + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU | |
254 | + HANDLE_ERROR( cudaMemcpy(&v_max, gpuSource + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); | |
255 | + | |
256 | + gpu2image<T>(gpuSource, fileDest, x_size, y_size, v_min, v_max, cm); | |
257 | +} | |
258 | + | |
222 | 259 | #endif |
223 | 260 | |
224 | 261 | template<class T> | ... | ... |