Commit 4252d8276a60522356cbd52577b4fe5911f7f845

Authored by David Mayerich
1 parent b5dd3d3a

ivote3 fixes and new bimsim code

stim/cuda/crop.cuh 0 → 100644
  1 +
  2 +template<typename T>
  3 +__global__ void cuda_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){
  4 +
  5 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate the current working position within the destination image
  6 + size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
  7 + if(xi >= dx || yi >= dy) return; //if this thread is outside of the destination image, return
  8 +
  9 + size_t di = yi * dx + xi; //calculate the 1D index into the destination image
  10 + size_t si = (y + yi) * sx + (x + xi); //calculate the 1D index into the source image
  11 +
  12 + dest[di] = src[si]; //copy the corresponding source pixel to the destination image
  13 +}
  14 +
  15 +/// Crops a 2D image composed of elements of type T
  16 +/// @param dest is a device pointer to memory of size dx*dy that will store the cropped image
  17 +/// @param src is a device pointer to memory of size sx*sy that stores the original image
  18 +/// @param sx is the size of the source image along x
  19 +/// @param sy is the size of the source image along y
  20 +/// @param x is the x-coordinate of the start position of the cropped region within the source image
  21 +/// @param y is the y-coordinate of the start position of the cropped region within the source image
  22 +/// @param dx is the size of the destination image along x
  23 +/// @param dy is the size of the destination image along y
  24 +template<typename T>
  25 +void gpu_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){
  26 + int max_threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  27 + dim3 threads( sqrt(max_threads), sqrt(max_threads) );
  28 + dim3 blocks( dx / sqrt(threads.x) + 1, dy / sqrt(threads.y) + 1); //calculate the optimal number of blocks
  29 + cuda_crop2d<T> <<< blocks, threads >>>(dest, src, sx, sy, x, y, dx, dy);
  30 +}
0 31 \ No newline at end of file
... ...
stim/math/circle.h
... ... @@ -64,7 +64,7 @@ public:
64 64 init();
65 65 }
66 66  
67   - ///create a rectangle from a center point, normal, and size
  67 + /*///create a rectangle from a center point, normal, and size
68 68 ///@param c: x,y,z location of the center.
69 69 ///@param s: size of the rectangle.
70 70 ///@param n: x,y,z direction of the normal.
... ... @@ -76,6 +76,7 @@ public:
76 76 rotate(n, U, Y);
77 77 scale(s);
78 78 }
  79 + */
79 80  
80 81 ///create a rectangle from a center point, normal, and size
81 82 ///@param c: x,y,z location of the center.
... ... @@ -164,12 +165,12 @@ public:
164 165 ///returns a vector with the points on the initialized circle.
165 166 ///connecting the points results in a circle.
166 167 ///@param n: integer for the number of points representing the circle.
167   - stim::vec3<T>
  168 + CUDA_CALLABLE stim::vec3<T>
168 169 p(T theta)
169 170 {
170 171 T x,y;
171   - y = 0.5*cos(theta*stim::TAU/360.0)+0.5;
172   - x = 0.5*sin(theta*stim::TAU/360.0)+0.5;
  172 + y = 0.5*cos(theta*STIM_TAU/360.0)+0.5;
  173 + x = 0.5*sin(theta*STIM_TAU/360.0)+0.5;
173 174 return p(x,y);
174 175 }
175 176  
... ...
stim/math/constants.h
1 1 #ifndef STIM_CONSTANTS_H
2 2 #define STIM_CONSTANTS_H
3 3  
  4 +#define STIM_PI 3.1415926535897932384626433832795028841971693993751058209749445923078164062862
  5 +#define STIM_TAU 2 * STIM_PI
  6 +
4 7 #include "stim/cuda/cudatools/callable.h"
5 8 namespace stim{
6 9 const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862;
... ...
stim/math/vec3.h
... ... @@ -216,7 +216,7 @@ public:
216 216 return result;
217 217 }
218 218  
219   -
  219 +#ifndef __NVCC__
220 220 /// Outputs the vector as a string
221 221 std::string str() const{
222 222 std::stringstream ss;
... ... @@ -234,6 +234,7 @@ public:
234 234  
235 235 return ss.str();
236 236 }
  237 +#endif
237 238  
238 239 size_t size(){ return 3; }
239 240  
... ...
stim/optics/scalarbeam.h
... ... @@ -193,6 +193,7 @@ void gpu_scalar_psf_local(stim::complex&lt;T&gt;* E, size_t N, T* r, T* phi, T lambda,
193 193 printf ("CUBLAS Error: failed to calculate maximum r value.\n");
194 194 exit(1);
195 195 }
  196 + cublasDestroy(handle);
196 197  
197 198 i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility
198 199 i_max--;
... ... @@ -212,8 +213,9 @@ void gpu_scalar_psf_local(stim::complex&lt;T&gt;* E, size_t N, T* r, T* phi, T lambda,
212 213 T* j_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table
213 214 T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT
214 215 T jl;
  216 + unsigned l;
215 217 for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT
216   - for(unsigned l = 0; l <= (unsigned)Nl; l++){ //for each order
  218 + for(l = 0; l <= (unsigned)Nl; l++){ //for each order
217 219 jl = boost::math::sph_bessel<T>(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function
218 220 j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result
219 221 }
... ... @@ -323,6 +325,9 @@ void gpu_scalar_psf_cart(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, T lamb
323 325  
324 326 gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing);
325 327  
  328 + HANDLE_ERROR( cudaFree(gpu_r) );
  329 + HANDLE_ERROR( cudaFree(gpu_phi) );
  330 +
326 331 }
327 332 #endif
328 333  
... ... @@ -449,17 +454,12 @@ public:
449 454  
450 455 /// Evaluate the beam to a scalar field using Debye focusing
451 456 void eval(stim::scalarfield<T>& E, int order = 500){
452   - size_t array_size = E.grid_bytes();
453   - T* X = (T*) malloc( array_size ); //allocate space for the coordinate meshes
454   - T* Y = (T*) malloc( array_size );
455   - T* Z = (T*) malloc( array_size );
456   -
457   - E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes
458   - eval(E, X, Y, Z, order);
459   -
460   - free(X); //free the coordinate meshes
461   - free(Y);
462   - free(Z);
  457 + E.meshgrid(); //calculate a meshgrid if one isn't created
  458 + if(E.gpu())
  459 + gpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing());
  460 + else
  461 + cpu_scalar_psf_cart<T>(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing());
  462 + //eval(E, E.x(), E.y(), E.z(), order);
463 463 }
464 464  
465 465 /// Calculate the field at a given point
... ...
stim/optics/scalarfield.h
... ... @@ -6,8 +6,52 @@
6 6 #include "../math/complex.h"
7 7 #include "../math/fft.h"
8 8  
  9 +#ifdef CUDA_FOUND
  10 +#include "../cuda/crop.cuh"
  11 +#endif
  12 +
9 13 namespace stim{
10 14  
  15 + template<typename T>
  16 + __global__ void cuda_abs(T* img, stim::complex<T>* field, size_t N){
  17 + size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  18 + if(i >= N) return;
  19 +
  20 + img[i] = field[i].abs();
  21 + }
  22 +
  23 + template<typename T>
  24 + __global__ void cuda_real(T* img, stim::complex<T>* field, size_t N){
  25 + size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  26 + if(i >= N) return;
  27 +
  28 + img[i] = field[i].real();
  29 + }
  30 +
  31 + template<typename T>
  32 + __global__ void cuda_imag(T* img, stim::complex<T>* field, size_t N){
  33 + size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  34 + if(i >= N) return;
  35 +
  36 + img[i] = field[i].imag();
  37 + }
  38 +
  39 + template<typename T>
  40 + __global__ void cuda_intensity(T* img, stim::complex<T>* field, size_t N){
  41 + size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  42 + if(i >= N) return;
  43 +
  44 + img[i] = pow(field[i].abs(), 2);
  45 + }
  46 +
  47 + template<typename T>
  48 + __global__ void cuda_sum_intensity(T* img, stim::complex<T>* field, size_t N){
  49 + size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  50 + if(i >= N) return;
  51 +
  52 + img[i] += pow(field[i].abs(), 2);
  53 + }
  54 +
11 55 /// Perform a k-space transform of a scalar field (FFT). The given field has a width of x and the calculated momentum space has a
12 56 /// width of kx (in radians).
13 57 /// @param K is a pointer to the output array of all plane waves in the field
... ... @@ -44,12 +88,16 @@ namespace stim{
44 88 std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<<std::endl;
45 89 exit(1);
46 90 }
  91 + cufftDestroy(plan);
47 92  
48 93 stim::complex<T>* fft = (stim::complex<T>*) malloc(sizeof(stim::complex<T>) * nx * ny);
49 94 HANDLE_ERROR( cudaMemcpy(fft, dev_FFT, sizeof(stim::complex<T>) * nx * ny, cudaMemcpyDeviceToHost) );
50 95  
51 96 stim::cpu_fftshift(K, fft, nx, ny);
52   - //memcpy(K, fft, sizeof(stim::complex<T>) * nx * ny);
  97 +
  98 + HANDLE_ERROR( cudaFree(dev_FFT) ); //free GPU memory
  99 + HANDLE_ERROR( cudaFree(dev_E) );
  100 + free(fft); //free CPU memory
53 101 }
54 102  
55 103 template<typename T>
... ... @@ -82,12 +130,18 @@ namespace stim{
82 130 std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<<std::endl;
83 131 exit(1);
84 132 }
  133 + cufftDestroy(plan);
85 134  
86 135 HANDLE_ERROR( cudaMemcpy(E, dev_E, sizeof(stim::complex<T>) * nx * ny, cudaMemcpyDeviceToHost) );
87 136  
  137 + HANDLE_ERROR( cudaFree(dev_FFT) ); //free GPU memory
  138 + HANDLE_ERROR( cudaFree(dev_E) );
  139 + free(fft); //free CPU memory
88 140  
89 141 }
90 142  
  143 +
  144 +
91 145 /// Propagate a field slice along its orthogonal direction by a given distance z
92 146 /// @param Enew is the resulting propogated field
93 147 /// @param E is the field to be propogated
... ... @@ -105,9 +159,9 @@ namespace stim{
105 159 T Kx, Ky; //width and height in k space
106 160 cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny);
107 161  
108   - T* mag = (T*) malloc( sizeof(T) * nx * ny );
109   - stim::abs(mag, K, nx * ny);
110   - stim::cpu2image<float>(mag, "kspace_pre_shift.bmp", nx, ny, stim::cmBrewer);
  162 + //T* mag = (T*) malloc( sizeof(T) * nx * ny );
  163 + //stim::abs(mag, K, nx * ny);
  164 + //stim::cpu2image<float>(mag, "kspace_pre_shift.bmp", nx, ny, stim::cmBrewer);
111 165  
112 166 size_t kxi, kyi;
113 167 size_t i;
... ... @@ -143,10 +197,27 @@ namespace stim{
143 197 }
144 198 }
145 199  
146   - stim::abs(mag, K, nx * ny);
147   - stim::cpu2image<float>(mag, "kspace_post_shift.bmp", nx, ny, stim::cmBrewer);
  200 + //stim::abs(mag, K, nx * ny);
  201 + //stim::cpu2image<float>(mag, "kspace_post_shift.bmp", nx, ny, stim::cmBrewer);
148 202  
149 203 cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny);
  204 + free(K);
  205 + }
  206 +
  207 + template<typename T>
  208 + void gpu_scalar_propagate(stim::complex<T>* Enew, stim::complex<T>* E, T sx, T sy, T z, T k, size_t nx, size_t ny){
  209 +
  210 + size_t field_bytes = sizeof(stim::complex<T>) * nx * ny;
  211 + stim::complex<T>* host_E = (stim::complex<T>*) malloc( field_bytes);
  212 + HANDLE_ERROR( cudaMemcpy(host_E, E, field_bytes, cudaMemcpyDeviceToHost) );
  213 +
  214 + stim::complex<T>* host_Enew = (stim::complex<T>*) malloc(field_bytes);
  215 +
  216 + cpu_scalar_propagate(host_Enew, host_E, sx, sy, z, k, nx, ny);
  217 +
  218 + HANDLE_ERROR( cudaMemcpy(Enew, host_Enew, field_bytes, cudaMemcpyHostToDevice) );
  219 + free(host_E);
  220 + free(host_Enew);
150 221 }
151 222  
152 223 /// Apply a lowpass filter to a field slice
... ... @@ -165,9 +236,9 @@ namespace stim{
165 236 T Kx, Ky; //width and height in k space
166 237 cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny);
167 238  
168   - T* mag = (T*) malloc( sizeof(T) * nx * ny );
169   - stim::abs(mag, K, nx * ny);
170   - stim::cpu2image<float>(mag, "kspace_pre_lowpass.bmp", nx, ny, stim::cmBrewer);
  239 + //T* mag = (T*) malloc( sizeof(T) * nx * ny );
  240 + //stim::abs(mag, K, nx * ny);
  241 + //stim::cpu2image<float>(mag, "kspace_pre_lowpass.bmp", nx, ny, stim::cmBrewer);
171 242  
172 243 size_t kxi, kyi;
173 244 size_t i;
... ... @@ -200,10 +271,11 @@ namespace stim{
200 271 }
201 272 }
202 273  
203   - stim::abs(mag, K, nx * ny);
204   - stim::cpu2image<float>(mag, "kspace_post_lowpass.bmp", nx, ny, stim::cmBrewer);
  274 + //stim::abs(mag, K, nx * ny);
  275 + //stim::cpu2image<float>(mag, "kspace_post_lowpass.bmp", nx, ny, stim::cmBrewer);
205 276  
206 277 cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny);
  278 + free(K);
207 279 }
208 280  
209 281 enum locationType {CPUmem, GPUmem};
... ... @@ -222,6 +294,8 @@ protected:
222 294 size_t R[2];
223 295 locationType loc;
224 296  
  297 + T* p[3]; //scalar position for each point in E
  298 +
225 299 /// Convert the field to a k-space representation (do an FFT)
226 300 void to_kspace(T& kx, T& ky){
227 301 cpu_scalar_to_kspace(E, kx, ky, E, X.len(), Y.len(), R[0], R[1]);
... ... @@ -252,6 +326,9 @@ public:
252 326 E = (stim::complex<T>*) malloc(grid_bytes()); //allocate in CPU memory
253 327 memset(E, 0, grid_bytes());
254 328 loc = CPUmem;
  329 +
  330 + p[0] = p[1] = p[2] = NULL; //set the position vector to NULL
  331 +
255 332 }
256 333  
257 334 ~scalarfield(){
... ... @@ -271,11 +348,35 @@ public:
271 348 if(loc == GPUmem) return;
272 349 else{
273 350 stim::complex<T>* dev_E;
274   - HANDLE_ERROR( cudaMalloc(&dev_E, e_bytes()) ); //allocate GPU memory
275   - HANDLE_ERROR( cudaMemcpy(dev_E, E, e_bytes(), cudaMemcpyHostToDevice) ); //copy the field to the GPU
  351 + HANDLE_ERROR( cudaMalloc(&dev_E, grid_bytes()) ); //allocate GPU memory
  352 + HANDLE_ERROR( cudaMemcpy(dev_E, E, grid_bytes(), cudaMemcpyHostToDevice) ); //copy the field to the GPU
276 353 free(E); //free the CPU memory
277 354 E = dev_E; //swap pointers
  355 +
  356 + if(p[0]){
  357 + size_t meshgrid_bytes = size() * sizeof(T); //calculate the number of bytes in each meshgrid
  358 + T* dev_X; //allocate variables to store the device meshgrid
  359 + T* dev_Y;
  360 + T* dev_Z;
  361 + HANDLE_ERROR( cudaMalloc(&dev_X, meshgrid_bytes) ); //allocate space for the meshgrid on the device
  362 + HANDLE_ERROR( cudaMalloc(&dev_Y, meshgrid_bytes) );
  363 + HANDLE_ERROR( cudaMalloc(&dev_Z, meshgrid_bytes) );
  364 +
  365 + HANDLE_ERROR( cudaMemcpy(dev_X, p[0], meshgrid_bytes, cudaMemcpyHostToDevice) ); //copy from the host to the device
  366 + HANDLE_ERROR( cudaMemcpy(dev_Y, p[1], meshgrid_bytes, cudaMemcpyHostToDevice) );
  367 + HANDLE_ERROR( cudaMemcpy(dev_Z, p[2], meshgrid_bytes, cudaMemcpyHostToDevice) );
  368 +
  369 + free(p[0]); //free device memory
  370 + free(p[1]);
  371 + free(p[2]);
  372 +
  373 + p[0] = dev_X; //swap in the new pointers to device memory
  374 + p[1] = dev_Y;
  375 + p[2] = dev_Z;
  376 + }
  377 + loc = GPUmem; //set the location flag
278 378 }
  379 +
279 380 }
280 381  
281 382 /// Copy the field array to the CPU, if it isn't already there
... ... @@ -286,14 +387,43 @@ public:
286 387 HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU
287 388 HANDLE_ERROR( cudaFree(E) ); //free device memory
288 389 E = host_E; //swap pointers
  390 +
  391 + //copy a meshgrid has been created
  392 + if(p[0]){
  393 + size_t meshgrid_bytes = size() * sizeof(T); //move X to the CPU
  394 + T* host_X = (T*) malloc( meshgrid_bytes );
  395 + T* host_Y = (T*) malloc( meshgrid_bytes );
  396 + T* host_Z = (T*) malloc( meshgrid_bytes );
  397 +
  398 + HANDLE_ERROR( cudaMemcpy(host_X, p[0], meshgrid_bytes, cudaMemcpyDeviceToHost) );
  399 + HANDLE_ERROR( cudaMemcpy(host_Y, p[1], meshgrid_bytes, cudaMemcpyDeviceToHost) );
  400 + HANDLE_ERROR( cudaMemcpy(host_Z, p[2], meshgrid_bytes, cudaMemcpyDeviceToHost) );
  401 +
  402 + HANDLE_ERROR( cudaFree(p[0]) );
  403 + HANDLE_ERROR( cudaFree(p[1]) );
  404 + HANDLE_ERROR( cudaFree(p[2]) );
  405 +
  406 + p[0] = host_X;
  407 + p[1] = host_Y;
  408 + p[2] = host_Z;
  409 + }
  410 + loc = CPUmem;
289 411 }
290 412 }
291 413  
  414 + bool gpu(){
  415 + if(loc == GPUmem) return true;
  416 + else return false;
  417 + }
  418 +
292 419 /// Propagate the field along its orthogonal direction by a distance d
293 420 void propagate(T d, T k){
294   - //X[2] += d; //move the plane position along the propagation direction
295   - //Y[2] += d;
296   - cpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]);
  421 + if(loc == CPUmem){
  422 + cpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]);
  423 + }
  424 + else{
  425 + gpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]);
  426 + }
297 427 }
298 428  
299 429 /// Propagate the field along its orthogonal direction by a distance d
... ... @@ -312,6 +442,7 @@ public:
312 442 }
313 443  
314 444 if(loc == CPUmem){
  445 + cropped.to_cpu(); //make sure that the cropped image is on the CPU
315 446 size_t x, y;
316 447 size_t sx, sy, si, di;
317 448 for(y = 0; y < Cy; y++){
... ... @@ -325,8 +456,8 @@ public:
325 456 }
326 457 }
327 458 else{
328   - std::cout<<"Error: cropping not supported for GPU memory yet."<<std::endl;
329   - exit(1);
  459 + cropped.to_gpu(); //make sure that the cropped image is also on the GPU
  460 + gpu_crop2d<stim::complex<T>>(cropped.E, E, R[0], R[1], Cx * padding, Cy * padding, Cx, Cy);
330 461 }
331 462 }
332 463  
... ... @@ -346,6 +477,10 @@ public:
346 477 return E;
347 478 }
348 479  
  480 + T* x(){ return p[0]; }
  481 + T* y(){ return p[1]; }
  482 + T* z(){ return p[2]; }
  483 +
349 484 /// Evaluate the cartesian coordinates of each point in the field. The resulting arrays are allocated in the same memory where the field is stored.
350 485 void meshgrid(T* X, T* Y, T* Z, locationType location){
351 486 //size_t array_size = sizeof(T) * R[0] * R[1];
... ... @@ -376,6 +511,21 @@ public:
376 511 }
377 512 }
378 513  
  514 + /// Create a local meshgrid
  515 + void meshgrid(){
  516 + if(p[0]) return; //if the p[0] value is not NULL, a meshgrid has already been created
  517 + if(loc == CPUmem){
  518 + p[0] = (T*) malloc( size() * sizeof(T) );
  519 + p[1] = (T*) malloc( size() * sizeof(T) );
  520 + p[2] = (T*) malloc( size() * sizeof(T) );
  521 + }
  522 + else{
  523 + std::cout<<"GPUmem meshgrid isn't implemented yet."<<std::endl;
  524 + exit(1);
  525 + }
  526 + meshgrid(p[0], p[1], p[2], loc);
  527 + }
  528 +
379 529 //clear the field, setting all values to zero
380 530 void clear(){
381 531 if(loc == GPUmem)
... ... @@ -386,25 +536,49 @@ public:
386 536  
387 537 void image(std::string filename, stim::complexComponentType type = complexMag, stim::colormapType cmap = stim::cmBrewer){
388 538  
389   - if(loc == GPUmem) to_cpu(); //if the field is in the GPU, move it to the CPU
390   - T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image
391   -
392   - switch(type){ //get the specified component from the complex value
393   - case complexMag:
394   - stim::abs(image, E, size());
395   - break;
396   - case complexReal:
397   - stim::real(image, E, size());
398   - break;
399   - case complexImaginary:
400   - stim::imag(image, E, size());
401   - break;
402   - case complexIntensity:
403   - stim::intensity(image, E, size());
404   - break;
  539 + if(loc == GPUmem){
  540 + T* image;
  541 + HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) );
  542 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  543 + dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks
  544 +
  545 + switch(type){
  546 + case complexMag:
  547 + cuda_abs<T><<< blocks, threads >>>(image, E, size());
  548 + break;
  549 + case complexReal:
  550 + cuda_real<T><<< blocks, threads >>>(image, E, size());
  551 + break;
  552 + case complexImaginary:
  553 + cuda_imag<T><<< blocks, threads >>>(image, E, size());
  554 + break;
  555 + case complexIntensity:
  556 + cuda_intensity<T><<< blocks, threads >>>(image, E, size());
  557 + break;
  558 + }
  559 + stim::gpu2image<T>(image, filename, R[0], R[1], stim::cmBrewer);
  560 + HANDLE_ERROR( cudaFree(image) );
  561 + }
  562 + else{
  563 + T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image
  564 +
  565 + switch(type){ //get the specified component from the complex value
  566 + case complexMag:
  567 + stim::abs(image, E, size());
  568 + break;
  569 + case complexReal:
  570 + stim::real(image, E, size());
  571 + break;
  572 + case complexImaginary:
  573 + stim::imag(image, E, size());
  574 + break;
  575 + case complexIntensity:
  576 + stim::intensity(image, E, size());
  577 + break;
  578 + }
  579 + stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image
  580 + free(image); //free the real image
405 581 }
406   - stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image
407   - free(image); //free the real image
408 582 }
409 583  
410 584 void image(T* img, stim::complexComponentType type = complexMag){
... ... @@ -430,15 +604,23 @@ public:
430 604  
431 605 //adds the field intensity to the output array (useful for calculating detector response to incoherent fields)
432 606 void intensity(T* out){
433   - if(loc == GPUmem) to_cpu(); //if the field is in the GPU, move it to the CPU
434   - T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image
435   - stim::intensity(image, E, size()); //calculate the intensity
  607 + if(loc == GPUmem){
  608 + //T* image;
  609 + //HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) );
  610 + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
  611 + dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks
  612 + cuda_sum_intensity<T><<< blocks, threads >>>(out, E, size());
  613 + }
  614 + else{
  615 + T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image
  616 + stim::intensity(image, E, size()); //calculate the intensity
436 617  
437   - size_t N = size(); //calculate the number of elements in the field
438   - for(size_t n = 0; n < N; n++) //for each point in the field
439   - out[n] += image[n]; //add the field intensity to the output image
  618 + size_t N = size(); //calculate the number of elements in the field
  619 + for(size_t n = 0; n < N; n++) //for each point in the field
  620 + out[n] += image[n]; //add the field intensity to the output image
440 621  
441   - free(image); //free the temporary intensity image
  622 + free(image); //free the temporary intensity image
  623 + }
442 624 }
443 625  
444 626 }; //end class scalarfield
... ...
stim/optics/scalarmie.h
... ... @@ -149,16 +149,16 @@ void gpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, sti
149 149  
150 150 size_t max_shared_mem = stim::sharedMemPerBlock();
151 151 size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
152   - std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
153   - std::cout<<"shared memory: "<<max_shared_mem<<std::endl;
  152 + //std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
  153 + //std::cout<<"shared memory: "<<max_shared_mem<<std::endl;
154 154 int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);
155   - std::cout<<"threads per block: "<<threads<<std::endl;
  155 + //std::cout<<"threads per block: "<<threads<<std::endl;
156 156 dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
157 157  
158 158 size_t shared_mem;
159 159 if(Nl <= LOCAL_NL) shared_mem = 0;
160 160 else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate
161   - std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
  161 + //std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
162 162 cuda_scalar_mie_scatter<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel
163 163 }
164 164  
... ... @@ -174,52 +174,20 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){
174 174  
175 175 r[i] = p.len();
176 176 }
177   -/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
178   -
179   -/// @param E is a pointer to the destination field values
180   -/// @param N is the number of points used to calculate the field
181   -/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
182   -/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
183   -/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
184   -/// @param W is an array of planewaves that will be scattered
185   -/// @param a is the radius of the sphere
186   -/// @param n is the complex refractive index of the sphere
187 177 template<typename T>
188   -void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){
  178 +void gpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){
  179 +
189 180 //calculate the necessary number of orders required to represent the scattered field
190 181 T k = W[0].kmag();
191 182  
192 183 int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
193 184 if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization)
194   - std::cout<<"Nl: "<<Nl<<std::endl;
  185 + //std::cout<<"Nl: "<<Nl<<std::endl;
195 186  
196 187 //calculate the scattering coefficients for the sphere
197 188 stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
198   - B_coefficients(B, a, k, n, Nl);
199   -
200   -#ifdef CUDA_FOUND
201   - stim::complex<T>* dev_E; //allocate space for the field
202   - cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
203   - cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
204   - //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
205   -
206   - // COORDINATES
207   - T* dev_x = NULL; //allocate space and copy the X coordinate (if specified)
208   - if(x != NULL){
209   - HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
210   - HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
211   - }
212   - T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified)
213   - if(y != NULL){
214   - HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
215   - HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
216   - }
217   - T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified)
218   - if(z != NULL){
219   - HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
220   - HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
221   - }
222   -
  189 + B_coefficients(B, a, k, n, Nl);
  190 +
223 191 // PLANE WAVES
224 192 stim::scalarwave<T>* dev_W; //allocate space and copy plane waves
225 193 HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave<T>) * W.size()) );
... ... @@ -232,7 +200,7 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
232 200  
233 201 int threads = stim::maxThreadsPerBlock();
234 202 dim3 blocks((unsigned)(N / threads + 1));
235   - cuda_dist<T> <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N);
  203 + cuda_dist<T> <<< blocks, threads >>>(dev_r, x, y, z, N);
236 204  
237 205 //Find the minimum and maximum values of r
238 206 cublasStatus_t stat;
... ... @@ -280,7 +248,7 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
280 248 size_t hB_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_hB_lut;
281 249 stim::complex<T>* hB_lut = (stim::complex<T>*) malloc(hB_bytes); //pointer to the look-up table
282 250 T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT
283   - std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl;
  251 + //std::cout<<"LUT jl bytes: "<<hB_bytes<<std::endl;
284 252 stim::complex<T> hl;
285 253 for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT
286 254 stim::bessjyv_sph<double>(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl]
... ... @@ -292,18 +260,57 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
292 260 //std::cout<<hB_lut[ri * (Nl + 1) + l]<<std::endl;
293 261 }
294 262 }
295   - T* real_lut = (T*) malloc(hB_bytes/2);
296   - stim::real(real_lut, hB_lut, N_hB_lut);
297   - stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer);
  263 + //T* real_lut = (T*) malloc(hB_bytes/2);
  264 + //stim::real(real_lut, hB_lut, N_hB_lut);
  265 + //stim::cpu2image<T>(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer);
298 266  
299 267 //Allocate device memory and copy everything to the GPU
300 268 stim::complex<T>* dev_hB_lut;
301 269 HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) );
302 270 HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) );
303 271  
304   - gpu_scalar_mie_scatter<T>(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl);
  272 + gpu_scalar_mie_scatter<T>(E, N, x, y, z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl);
305 273  
306   - cudaMemcpy(E, dev_E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
  274 + cudaMemcpy(E, E, N * sizeof(stim::complex<T>), cudaMemcpyDeviceToHost); //copy the field from device memory
  275 +}
  276 +/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave
  277 +
  278 +/// @param E is a pointer to the destination field values
  279 +/// @param N is the number of points used to calculate the field
  280 +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros)
  281 +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros)
  282 +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros)
  283 +/// @param W is an array of planewaves that will be scattered
  284 +/// @param a is the radius of the sphere
  285 +/// @param n is the complex refractive index of the sphere
  286 +template<typename T>
  287 +void cpu_scalar_mie_scatter(stim::complex<T>* E, size_t N, T* x, T* y, T* z, std::vector<stim::scalarwave<T>> W, T a, stim::complex<T> n, T r_spacing = 0.1){
  288 +
  289 +
  290 +#ifdef CUDA_FOUND
  291 + stim::complex<T>* dev_E; //allocate space for the field
  292 + cudaMalloc(&dev_E, N * sizeof(stim::complex<T>));
  293 + cudaMemcpy(dev_E, E, N * sizeof(stim::complex<T>), cudaMemcpyHostToDevice);
  294 + //cudaMemset(dev_F, 0, N * sizeof(stim::complex<T>)); //set the field to zero (necessary because a sum is used)
  295 +
  296 + // COORDINATES
  297 + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified)
  298 + if(x != NULL){
  299 + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T)));
  300 + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice));
  301 + }
  302 + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified)
  303 + if(y != NULL){
  304 + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T)));
  305 + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice));
  306 + }
  307 + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified)
  308 + if(z != NULL){
  309 + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T)));
  310 + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice));
  311 + }
  312 +
  313 + gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, r_spacing);
307 314  
308 315 if(x != NULL) cudaFree(dev_x); //free everything
309 316 if(y != NULL) cudaFree(dev_y);
... ... @@ -311,6 +318,16 @@ void cpu_scalar_mie_scatter(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, std
311 318 cudaFree(dev_E);
312 319 #else
313 320  
  321 + //calculate the necessary number of orders required to represent the scattered field
  322 + T k = W[0].kmag();
  323 +
  324 + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
  325 + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization)
  326 + //std::cout<<"Nl: "<<Nl<<std::endl;
  327 +
  328 + //calculate the scattering coefficients for the sphere
  329 + stim::complex<T>* B = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
  330 + B_coefficients(B, a, k, n, Nl);
314 331  
315 332 //allocate space to store the bessel function call results
316 333 double vm;
... ... @@ -422,16 +439,16 @@ void gpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
422 439  
423 440 size_t max_shared_mem = stim::sharedMemPerBlock();
424 441 size_t hBl_array = sizeof(stim::complex<T>) * (Nl + 1);
425   - std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
426   - std::cout<<"shared memory: "<<max_shared_mem<<std::endl;
  442 + //std::cout<<"hl*Bl array size: "<<hBl_array<<std::endl;
  443 + //std::cout<<"shared memory: "<<max_shared_mem<<std::endl;
427 444 int threads = (int)((max_shared_mem / hBl_array) / 32 * 32);
428   - std::cout<<"threads per block: "<<threads<<std::endl;
  445 + //std::cout<<"threads per block: "<<threads<<std::endl;
429 446 dim3 blocks((unsigned)(N / threads + 1)); //calculate the optimal number of blocks
430 447  
431 448 size_t shared_mem;
432 449 if(Nl <= LOCAL_NL) shared_mem = 0;
433 450 else shared_mem = threads * sizeof(stim::complex<T>) * (Nl - LOCAL_NL); //amount of shared memory to allocate
434   - std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
  451 + //std::cout<<"shared memory allocated: "<<shared_mem<<std::endl;
435 452 cuda_scalar_mie_internal<T><<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel
436 453 }
437 454  
... ... @@ -452,7 +469,7 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
452 469  
453 470 int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2);
454 471 if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization)
455   - std::cout<<"Nl: "<<Nl<<std::endl;
  472 + //std::cout<<"Nl: "<<Nl<<std::endl;
456 473  
457 474 //calculate the scattering coefficients for the sphere
458 475 stim::complex<T>* A = (stim::complex<T>*) malloc( sizeof(stim::complex<T>) * (Nl + 1) ); //allocate space for the scattering coefficients
... ... @@ -537,7 +554,7 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
537 554 size_t jA_bytes = sizeof(stim::complex<T>) * (Nl+1) * N_jA_lut;
538 555 stim::complex<T>* jA_lut = (stim::complex<T>*) malloc(jA_bytes); //pointer to the look-up table
539 556 T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT
540   - std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl;
  557 + //std::cout<<"LUT jl bytes: "<<jA_bytes<<std::endl;
541 558 stim::complex<T> hl;
542 559 stim::complex<double> nd = (stim::complex<double>)n;
543 560 for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT
... ... @@ -559,6 +576,10 @@ void cpu_scalar_mie_internal(stim::complex&lt;T&gt;* E, size_t N, T* x, T* y, T* z, st
559 576 if(x != NULL) cudaFree(dev_x); //free everything
560 577 if(y != NULL) cudaFree(dev_y);
561 578 if(z != NULL) cudaFree(dev_z);
  579 + HANDLE_ERROR( cudaFree(dev_jA_lut) );
  580 + HANDLE_ERROR( cudaFree(dev_E) );
  581 + HANDLE_ERROR( cudaFree(dev_W) );
  582 + HANDLE_ERROR( cudaFree(dev_r) );
562 583 cudaFree(dev_E);
563 584 #else
564 585  
... ... @@ -624,19 +645,44 @@ public:
624 645 n = ri;
625 646 }
626 647  
  648 + void sum_scat(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
  649 + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves
  650 + stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
  651 + }
  652 +
  653 + void sum_intern(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int samples = 1000){
  654 + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves
  655 + stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
  656 + }
  657 +
  658 + void eval(stim::scalarfield<T>& E, T* X, T* Y, T* Z, stim::scalarbeam<T> b, int order = 500, int samples = 1000){
  659 + b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion
  660 + std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves
  661 + sum_scat(E, X, Y, Z, b, samples);
  662 + sum_intern(E, X, Y, Z, b, samples);
  663 + }
  664 +
627 665 void eval(stim::scalarfield<T>& E, stim::scalarbeam<T> b, int order = 500, int samples = 1000){
628 666  
629   - size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid
  667 + /*size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid
630 668 float* X = (float*) malloc( array_size ); //allocate space for the coordinate meshes
631 669 float* Y = (float*) malloc( array_size );
632 670 float* Z = (float*) malloc( array_size );
633 671 E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes
634   -
635   - b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion
  672 + */
  673 + E.meshgrid();
  674 + b.eval(E, order);
636 675  
637 676 std::vector< stim::scalarwave<float> > wave_array = b.mc(samples); //decompose the beam into an array of plane waves
638   - stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
639   - stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing());
  677 +
  678 + if(E.gpu()){
  679 + stim::gpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing());
  680 + }
  681 + else{
  682 + stim::cpu_scalar_mie_scatter<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing());
  683 + stim::cpu_scalar_mie_internal<float>(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing());
  684 + }
  685 + //eval(E, X, Y, Z, b, order, samples); //evaluate the field
640 686 }
641 687  
642 688 }; //end stim::scalarmie
... ...
stim/visualization/colormap.h
... ... @@ -4,6 +4,7 @@
4 4 #include <string>
5 5 #include <stdlib.h>
6 6 #include <cmath>
  7 +#include "cublas_v2.h"
7 8  
8 9 #ifdef _WIN32
9 10 #include <float.h>
... ... @@ -219,6 +220,42 @@ static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, u
219 220 free(cpuBuffer);
220 221 }
221 222  
  223 +/// save a GPU image to a file using automatic scaling
  224 +template<typename T>
  225 +static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, unsigned int y_size, colormapType cm = cmGrayscale){
  226 + size_t N = x_size * y_size; //calculate the total number of elements in the image
  227 +
  228 + cublasStatus_t stat;
  229 + cublasHandle_t handle;
  230 +
  231 + stat = cublasCreate(&handle); //create a cuBLAS handle
  232 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  233 + printf ("CUBLAS initialization failed\n");
  234 + exit(1);
  235 + }
  236 +
  237 + int i_min, i_max;
  238 + stat = cublasIsamin(handle, (int)N, gpuSource, 1, &i_min);
  239 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  240 + printf ("CUBLAS Error: failed to calculate minimum r value.\n");
  241 + exit(1);
  242 + }
  243 + stat = cublasIsamax(handle, (int)N, gpuSource, 1, &i_max);
  244 + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure
  245 + printf ("CUBLAS Error: failed to calculate maximum r value.\n");
  246 + exit(1);
  247 + }
  248 + cublasDestroy(handle);
  249 +
  250 + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility
  251 + i_max--;
  252 + T v_min, v_max; //allocate space to store the minimum and maximum values
  253 + HANDLE_ERROR( cudaMemcpy(&v_min, gpuSource + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU
  254 + HANDLE_ERROR( cudaMemcpy(&v_max, gpuSource + i_max, sizeof(T), cudaMemcpyDeviceToHost) );
  255 +
  256 + gpu2image<T>(gpuSource, fileDest, x_size, y_size, v_min, v_max, cm);
  257 +}
  258 +
222 259 #endif
223 260  
224 261 template<class T>
... ...