diff --git a/stim/cuda/crop.cuh b/stim/cuda/crop.cuh new file mode 100644 index 0000000..1f80d65 --- /dev/null +++ b/stim/cuda/crop.cuh @@ -0,0 +1,30 @@ + +template +__global__ void cuda_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){ + + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate the current working position within the destination image + size_t yi = blockIdx.y * blockDim.y + threadIdx.y; + if(xi >= dx || yi >= dy) return; //if this thread is outside of the destination image, return + + size_t di = yi * dx + xi; //calculate the 1D index into the destination image + size_t si = (y + yi) * sx + (x + xi); //calculate the 1D index into the source image + + dest[di] = src[si]; //copy the corresponding source pixel to the destination image +} + +/// Crops a 2D image composed of elements of type T +/// @param dest is a device pointer to memory of size dx*dy that will store the cropped image +/// @param src is a device pointer to memory of size sx*sy that stores the original image +/// @param sx is the size of the source image along x +/// @param sy is the size of the source image along y +/// @param x is the x-coordinate of the start position of the cropped region within the source image +/// @param y is the y-coordinate of the start position of the cropped region within the source image +/// @param dx is the size of the destination image along x +/// @param dy is the size of the destination image along y +template +void gpu_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){ + int max_threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device + dim3 threads( sqrt(max_threads), sqrt(max_threads) ); + dim3 blocks( dx / sqrt(threads.x) + 1, dy / sqrt(threads.y) + 1); //calculate the optimal number of blocks + cuda_crop2d <<< blocks, threads >>>(dest, src, sx, sy, x, y, dx, dy); +} \ No newline at end of file diff --git a/stim/math/circle.h b/stim/math/circle.h index 92f9335..8a8c43c 100644 --- a/stim/math/circle.h +++ b/stim/math/circle.h @@ -64,7 +64,7 @@ public: init(); } - ///create a rectangle from a center point, normal, and size + /*///create a rectangle from a center point, normal, and size ///@param c: x,y,z location of the center. ///@param s: size of the rectangle. ///@param n: x,y,z direction of the normal. @@ -76,6 +76,7 @@ public: rotate(n, U, Y); scale(s); } + */ ///create a rectangle from a center point, normal, and size ///@param c: x,y,z location of the center. @@ -164,12 +165,12 @@ public: ///returns a vector with the points on the initialized circle. ///connecting the points results in a circle. ///@param n: integer for the number of points representing the circle. - stim::vec3 + CUDA_CALLABLE stim::vec3 p(T theta) { T x,y; - y = 0.5*cos(theta*stim::TAU/360.0)+0.5; - x = 0.5*sin(theta*stim::TAU/360.0)+0.5; + y = 0.5*cos(theta*STIM_TAU/360.0)+0.5; + x = 0.5*sin(theta*STIM_TAU/360.0)+0.5; return p(x,y); } diff --git a/stim/math/constants.h b/stim/math/constants.h index 9d93349..a4bcdb9 100644 --- a/stim/math/constants.h +++ b/stim/math/constants.h @@ -1,6 +1,9 @@ #ifndef STIM_CONSTANTS_H #define STIM_CONSTANTS_H +#define STIM_PI 3.1415926535897932384626433832795028841971693993751058209749445923078164062862 +#define STIM_TAU 2 * STIM_PI + #include "stim/cuda/cudatools/callable.h" namespace stim{ const double PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862; diff --git a/stim/math/vec3.h b/stim/math/vec3.h index 3cc2efc..99e3809 100644 --- a/stim/math/vec3.h +++ b/stim/math/vec3.h @@ -216,7 +216,7 @@ public: return result; } - +#ifndef __NVCC__ /// Outputs the vector as a string std::string str() const{ std::stringstream ss; @@ -234,6 +234,7 @@ public: return ss.str(); } +#endif size_t size(){ return 3; } diff --git a/stim/optics/scalarbeam.h b/stim/optics/scalarbeam.h index 77fc289..51b32e6 100644 --- a/stim/optics/scalarbeam.h +++ b/stim/optics/scalarbeam.h @@ -193,6 +193,7 @@ void gpu_scalar_psf_local(stim::complex* E, size_t N, T* r, T* phi, T lambda, printf ("CUBLAS Error: failed to calculate maximum r value.\n"); exit(1); } + cublasDestroy(handle); i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility i_max--; @@ -212,8 +213,9 @@ void gpu_scalar_psf_local(stim::complex* E, size_t N, T* r, T* phi, T lambda, T* j_lut = (T*) malloc(lutj_bytes); //pointer to the look-up table T dr = (r_max - r_min) / (Nlut_j-1); //distance between values in the LUT T jl; + unsigned l; for(size_t ri = 0; ri < Nlut_j; ri++){ //for each value in the LUT - for(unsigned l = 0; l <= (unsigned)Nl; l++){ //for each order + for(l = 0; l <= (unsigned)Nl; l++){ //for each order jl = boost::math::sph_bessel(l, k*(r_min + ri * dr)); //use boost to calculate the spherical bessel function j_lut[ri * (Nl + 1) + l] = jl; //store the bessel function result } @@ -323,6 +325,9 @@ void gpu_scalar_psf_cart(stim::complex* E, size_t N, T* x, T* y, T* z, T lamb gpu_scalar_psf_local(E, N, gpu_r, gpu_phi, lambda, A, NA, NA_in, Nl, r_spacing); + HANDLE_ERROR( cudaFree(gpu_r) ); + HANDLE_ERROR( cudaFree(gpu_phi) ); + } #endif @@ -449,17 +454,12 @@ public: /// Evaluate the beam to a scalar field using Debye focusing void eval(stim::scalarfield& E, int order = 500){ - size_t array_size = E.grid_bytes(); - T* X = (T*) malloc( array_size ); //allocate space for the coordinate meshes - T* Y = (T*) malloc( array_size ); - T* Z = (T*) malloc( array_size ); - - E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes - eval(E, X, Y, Z, order); - - free(X); //free the coordinate meshes - free(Y); - free(Z); + E.meshgrid(); //calculate a meshgrid if one isn't created + if(E.gpu()) + gpu_scalar_psf_cart(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); + else + cpu_scalar_psf_cart(E.ptr(), E.size(), E.x(), E.y(), E.z(), lambda, A, f, d, NA[0], NA[1], order, E.spacing()); + //eval(E, E.x(), E.y(), E.z(), order); } /// Calculate the field at a given point diff --git a/stim/optics/scalarfield.h b/stim/optics/scalarfield.h index 31b74f2..4705a48 100644 --- a/stim/optics/scalarfield.h +++ b/stim/optics/scalarfield.h @@ -6,8 +6,52 @@ #include "../math/complex.h" #include "../math/fft.h" +#ifdef CUDA_FOUND +#include "../cuda/crop.cuh" +#endif + namespace stim{ + template + __global__ void cuda_abs(T* img, stim::complex* field, size_t N){ + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= N) return; + + img[i] = field[i].abs(); + } + + template + __global__ void cuda_real(T* img, stim::complex* field, size_t N){ + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= N) return; + + img[i] = field[i].real(); + } + + template + __global__ void cuda_imag(T* img, stim::complex* field, size_t N){ + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= N) return; + + img[i] = field[i].imag(); + } + + template + __global__ void cuda_intensity(T* img, stim::complex* field, size_t N){ + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= N) return; + + img[i] = pow(field[i].abs(), 2); + } + + template + __global__ void cuda_sum_intensity(T* img, stim::complex* field, size_t N){ + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= N) return; + + img[i] += pow(field[i].abs(), 2); + } + /// Perform a k-space transform of a scalar field (FFT). The given field has a width of x and the calculated momentum space has a /// width of kx (in radians). /// @param K is a pointer to the output array of all plane waves in the field @@ -44,12 +88,16 @@ namespace stim{ std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<* fft = (stim::complex*) malloc(sizeof(stim::complex) * nx * ny); HANDLE_ERROR( cudaMemcpy(fft, dev_FFT, sizeof(stim::complex) * nx * ny, cudaMemcpyDeviceToHost) ); stim::cpu_fftshift(K, fft, nx, ny); - //memcpy(K, fft, sizeof(stim::complex) * nx * ny); + + HANDLE_ERROR( cudaFree(dev_FFT) ); //free GPU memory + HANDLE_ERROR( cudaFree(dev_E) ); + free(fft); //free CPU memory } template @@ -82,12 +130,18 @@ namespace stim{ std::cout<<"Error using cuFFT to perform a forward Fourier transform of the field."<) * nx * ny, cudaMemcpyDeviceToHost) ); + HANDLE_ERROR( cudaFree(dev_FFT) ); //free GPU memory + HANDLE_ERROR( cudaFree(dev_E) ); + free(fft); //free CPU memory } + + /// Propagate a field slice along its orthogonal direction by a given distance z /// @param Enew is the resulting propogated field /// @param E is the field to be propogated @@ -105,9 +159,9 @@ namespace stim{ T Kx, Ky; //width and height in k space cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); - T* mag = (T*) malloc( sizeof(T) * nx * ny ); - stim::abs(mag, K, nx * ny); - stim::cpu2image(mag, "kspace_pre_shift.bmp", nx, ny, stim::cmBrewer); + //T* mag = (T*) malloc( sizeof(T) * nx * ny ); + //stim::abs(mag, K, nx * ny); + //stim::cpu2image(mag, "kspace_pre_shift.bmp", nx, ny, stim::cmBrewer); size_t kxi, kyi; size_t i; @@ -143,10 +197,27 @@ namespace stim{ } } - stim::abs(mag, K, nx * ny); - stim::cpu2image(mag, "kspace_post_shift.bmp", nx, ny, stim::cmBrewer); + //stim::abs(mag, K, nx * ny); + //stim::cpu2image(mag, "kspace_post_shift.bmp", nx, ny, stim::cmBrewer); cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); + free(K); + } + + template + void gpu_scalar_propagate(stim::complex* Enew, stim::complex* E, T sx, T sy, T z, T k, size_t nx, size_t ny){ + + size_t field_bytes = sizeof(stim::complex) * nx * ny; + stim::complex* host_E = (stim::complex*) malloc( field_bytes); + HANDLE_ERROR( cudaMemcpy(host_E, E, field_bytes, cudaMemcpyDeviceToHost) ); + + stim::complex* host_Enew = (stim::complex*) malloc(field_bytes); + + cpu_scalar_propagate(host_Enew, host_E, sx, sy, z, k, nx, ny); + + HANDLE_ERROR( cudaMemcpy(Enew, host_Enew, field_bytes, cudaMemcpyHostToDevice) ); + free(host_E); + free(host_Enew); } /// Apply a lowpass filter to a field slice @@ -165,9 +236,9 @@ namespace stim{ T Kx, Ky; //width and height in k space cpu_scalar_to_kspace(K, Kx, Ky, E ,sx, sy, nx, ny); - T* mag = (T*) malloc( sizeof(T) * nx * ny ); - stim::abs(mag, K, nx * ny); - stim::cpu2image(mag, "kspace_pre_lowpass.bmp", nx, ny, stim::cmBrewer); + //T* mag = (T*) malloc( sizeof(T) * nx * ny ); + //stim::abs(mag, K, nx * ny); + //stim::cpu2image(mag, "kspace_pre_lowpass.bmp", nx, ny, stim::cmBrewer); size_t kxi, kyi; size_t i; @@ -200,10 +271,11 @@ namespace stim{ } } - stim::abs(mag, K, nx * ny); - stim::cpu2image(mag, "kspace_post_lowpass.bmp", nx, ny, stim::cmBrewer); + //stim::abs(mag, K, nx * ny); + //stim::cpu2image(mag, "kspace_post_lowpass.bmp", nx, ny, stim::cmBrewer); cpu_scalar_from_kspace(Enew, sx, sy, K, Kx, Ky, nx, ny); + free(K); } enum locationType {CPUmem, GPUmem}; @@ -222,6 +294,8 @@ protected: size_t R[2]; locationType loc; + T* p[3]; //scalar position for each point in E + /// Convert the field to a k-space representation (do an FFT) void to_kspace(T& kx, T& ky){ cpu_scalar_to_kspace(E, kx, ky, E, X.len(), Y.len(), R[0], R[1]); @@ -252,6 +326,9 @@ public: E = (stim::complex*) malloc(grid_bytes()); //allocate in CPU memory memset(E, 0, grid_bytes()); loc = CPUmem; + + p[0] = p[1] = p[2] = NULL; //set the position vector to NULL + } ~scalarfield(){ @@ -271,11 +348,35 @@ public: if(loc == GPUmem) return; else{ stim::complex* dev_E; - HANDLE_ERROR( cudaMalloc(&dev_E, e_bytes()) ); //allocate GPU memory - HANDLE_ERROR( cudaMemcpy(dev_E, E, e_bytes(), cudaMemcpyHostToDevice) ); //copy the field to the GPU + HANDLE_ERROR( cudaMalloc(&dev_E, grid_bytes()) ); //allocate GPU memory + HANDLE_ERROR( cudaMemcpy(dev_E, E, grid_bytes(), cudaMemcpyHostToDevice) ); //copy the field to the GPU free(E); //free the CPU memory E = dev_E; //swap pointers + + if(p[0]){ + size_t meshgrid_bytes = size() * sizeof(T); //calculate the number of bytes in each meshgrid + T* dev_X; //allocate variables to store the device meshgrid + T* dev_Y; + T* dev_Z; + HANDLE_ERROR( cudaMalloc(&dev_X, meshgrid_bytes) ); //allocate space for the meshgrid on the device + HANDLE_ERROR( cudaMalloc(&dev_Y, meshgrid_bytes) ); + HANDLE_ERROR( cudaMalloc(&dev_Z, meshgrid_bytes) ); + + HANDLE_ERROR( cudaMemcpy(dev_X, p[0], meshgrid_bytes, cudaMemcpyHostToDevice) ); //copy from the host to the device + HANDLE_ERROR( cudaMemcpy(dev_Y, p[1], meshgrid_bytes, cudaMemcpyHostToDevice) ); + HANDLE_ERROR( cudaMemcpy(dev_Z, p[2], meshgrid_bytes, cudaMemcpyHostToDevice) ); + + free(p[0]); //free device memory + free(p[1]); + free(p[2]); + + p[0] = dev_X; //swap in the new pointers to device memory + p[1] = dev_Y; + p[2] = dev_Z; + } + loc = GPUmem; //set the location flag } + } /// Copy the field array to the CPU, if it isn't already there @@ -286,14 +387,43 @@ public: HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU HANDLE_ERROR( cudaFree(E) ); //free device memory E = host_E; //swap pointers + + //copy a meshgrid has been created + if(p[0]){ + size_t meshgrid_bytes = size() * sizeof(T); //move X to the CPU + T* host_X = (T*) malloc( meshgrid_bytes ); + T* host_Y = (T*) malloc( meshgrid_bytes ); + T* host_Z = (T*) malloc( meshgrid_bytes ); + + HANDLE_ERROR( cudaMemcpy(host_X, p[0], meshgrid_bytes, cudaMemcpyDeviceToHost) ); + HANDLE_ERROR( cudaMemcpy(host_Y, p[1], meshgrid_bytes, cudaMemcpyDeviceToHost) ); + HANDLE_ERROR( cudaMemcpy(host_Z, p[2], meshgrid_bytes, cudaMemcpyDeviceToHost) ); + + HANDLE_ERROR( cudaFree(p[0]) ); + HANDLE_ERROR( cudaFree(p[1]) ); + HANDLE_ERROR( cudaFree(p[2]) ); + + p[0] = host_X; + p[1] = host_Y; + p[2] = host_Z; + } + loc = CPUmem; } } + bool gpu(){ + if(loc == GPUmem) return true; + else return false; + } + /// Propagate the field along its orthogonal direction by a distance d void propagate(T d, T k){ - //X[2] += d; //move the plane position along the propagation direction - //Y[2] += d; - cpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); + if(loc == CPUmem){ + cpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); + } + else{ + gpu_scalar_propagate(E, E, X.len(), Y.len(), d, k, R[0], R[1]); + } } /// Propagate the field along its orthogonal direction by a distance d @@ -312,6 +442,7 @@ public: } if(loc == CPUmem){ + cropped.to_cpu(); //make sure that the cropped image is on the CPU size_t x, y; size_t sx, sy, si, di; for(y = 0; y < Cy; y++){ @@ -325,8 +456,8 @@ public: } } else{ - std::cout<<"Error: cropping not supported for GPU memory yet."<>(cropped.E, E, R[0], R[1], Cx * padding, Cy * padding, Cx, Cy); } } @@ -346,6 +477,10 @@ public: return E; } + T* x(){ return p[0]; } + T* y(){ return p[1]; } + T* z(){ return p[2]; } + /// Evaluate the cartesian coordinates of each point in the field. The resulting arrays are allocated in the same memory where the field is stored. void meshgrid(T* X, T* Y, T* Z, locationType location){ //size_t array_size = sizeof(T) * R[0] * R[1]; @@ -376,6 +511,21 @@ public: } } + /// Create a local meshgrid + void meshgrid(){ + if(p[0]) return; //if the p[0] value is not NULL, a meshgrid has already been created + if(loc == CPUmem){ + p[0] = (T*) malloc( size() * sizeof(T) ); + p[1] = (T*) malloc( size() * sizeof(T) ); + p[2] = (T*) malloc( size() * sizeof(T) ); + } + else{ + std::cout<<"GPUmem meshgrid isn't implemented yet."<<<< blocks, threads >>>(image, E, size()); + break; + case complexReal: + cuda_real<<< blocks, threads >>>(image, E, size()); + break; + case complexImaginary: + cuda_imag<<< blocks, threads >>>(image, E, size()); + break; + case complexIntensity: + cuda_intensity<<< blocks, threads >>>(image, E, size()); + break; + } + stim::gpu2image(image, filename, R[0], R[1], stim::cmBrewer); + HANDLE_ERROR( cudaFree(image) ); + } + else{ + T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image + + switch(type){ //get the specified component from the complex value + case complexMag: + stim::abs(image, E, size()); + break; + case complexReal: + stim::real(image, E, size()); + break; + case complexImaginary: + stim::imag(image, E, size()); + break; + case complexIntensity: + stim::intensity(image, E, size()); + break; + } + stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image + free(image); //free the real image } - stim::cpu2image(image, filename, R[0], R[1], cmap); //save the resulting image - free(image); //free the real image } void image(T* img, stim::complexComponentType type = complexMag){ @@ -430,15 +604,23 @@ public: //adds the field intensity to the output array (useful for calculating detector response to incoherent fields) void intensity(T* out){ - if(loc == GPUmem) to_cpu(); //if the field is in the GPU, move it to the CPU - T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image - stim::intensity(image, E, size()); //calculate the intensity + if(loc == GPUmem){ + //T* image; + //HANDLE_ERROR( cudaMalloc(&image, sizeof(T) * size()) ); + int threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device + dim3 blocks( R[0] * R[1] / threads + 1 ); //create a 1D array of blocks + cuda_sum_intensity<<< blocks, threads >>>(out, E, size()); + } + else{ + T* image = (T*) malloc( sizeof(T) * size() ); //allocate space for the real image + stim::intensity(image, E, size()); //calculate the intensity - size_t N = size(); //calculate the number of elements in the field - for(size_t n = 0; n < N; n++) //for each point in the field - out[n] += image[n]; //add the field intensity to the output image + size_t N = size(); //calculate the number of elements in the field + for(size_t n = 0; n < N; n++) //for each point in the field + out[n] += image[n]; //add the field intensity to the output image - free(image); //free the temporary intensity image + free(image); //free the temporary intensity image + } } }; //end class scalarfield diff --git a/stim/optics/scalarmie.h b/stim/optics/scalarmie.h index 3e1fd7a..1ff7f0b 100644 --- a/stim/optics/scalarmie.h +++ b/stim/optics/scalarmie.h @@ -149,16 +149,16 @@ void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, sti size_t max_shared_mem = stim::sharedMemPerBlock(); size_t hBl_array = sizeof(stim::complex) * (Nl + 1); - std::cout<<"hl*Bl array size: "<) * (Nl - LOCAL_NL); //amount of shared memory to allocate - std::cout<<"shared memory allocated: "<<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, hB, kr_min, dkr, N_hB, (int)Nl); //call the kernel } @@ -174,52 +174,20 @@ __global__ void cuda_dist(T* r, T* x, T* y, T* z, size_t N){ r[i] = p.len(); } -/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave - -/// @param E is a pointer to the destination field values -/// @param N is the number of points used to calculate the field -/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) -/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) -/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) -/// @param W is an array of planewaves that will be scattered -/// @param a is the radius of the sphere -/// @param n is the complex refractive index of the sphere template -void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n, T r_spacing = 0.1){ +void gpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n, T r_spacing = 0.1){ + //calculate the necessary number of orders required to represent the scattered field T k = W[0].kmag(); int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) - std::cout<<"Nl: "<* B = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients - B_coefficients(B, a, k, n, Nl); - -#ifdef CUDA_FOUND - stim::complex* dev_E; //allocate space for the field - cudaMalloc(&dev_E, N * sizeof(stim::complex)); - cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); - //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) - - // COORDINATES - T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) - if(x != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); - } - T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) - if(y != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); - } - T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) - if(z != NULL){ - HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); - HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); - } - + B_coefficients(B, a, k, n, Nl); + // PLANE WAVES stim::scalarwave* dev_W; //allocate space and copy plane waves HANDLE_ERROR( cudaMalloc(&dev_W, sizeof(stim::scalarwave) * W.size()) ); @@ -232,7 +200,7 @@ void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std int threads = stim::maxThreadsPerBlock(); dim3 blocks((unsigned)(N / threads + 1)); - cuda_dist <<< blocks, threads >>>(dev_r, dev_x, dev_y, dev_z, N); + cuda_dist <<< blocks, threads >>>(dev_r, x, y, z, N); //Find the minimum and maximum values of r cublasStatus_t stat; @@ -280,7 +248,7 @@ void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std size_t hB_bytes = sizeof(stim::complex) * (Nl+1) * N_hB_lut; stim::complex* hB_lut = (stim::complex*) malloc(hB_bytes); //pointer to the look-up table T dr = (r_max - r_min) / (N_hB_lut-1); //distance between values in the LUT - std::cout<<"LUT jl bytes: "< hl; for(size_t ri = 0; ri < N_hB_lut; ri++){ //for each value in the LUT stim::bessjyv_sph(Nl, k * (r_min + ri * dr), vm, jv, yv, djv, dyv); //compute the list of spherical bessel functions from [0 Nl] @@ -292,18 +260,57 @@ void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std //std::cout<(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); + //T* real_lut = (T*) malloc(hB_bytes/2); + //stim::real(real_lut, hB_lut, N_hB_lut); + //stim::cpu2image(real_lut, "hankel_B.bmp", Nl+1, N_hB_lut, stim::cmBrewer); //Allocate device memory and copy everything to the GPU stim::complex* dev_hB_lut; HANDLE_ERROR( cudaMalloc(&dev_hB_lut, hB_bytes) ); HANDLE_ERROR( cudaMemcpy(dev_hB_lut, hB_lut, hB_bytes, cudaMemcpyHostToDevice) ); - gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); + gpu_scalar_mie_scatter(E, N, x, y, z, dev_W, W.size(), a, n, dev_hB_lut, r_min, dr, N_hB_lut, Nl); - cudaMemcpy(E, dev_E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory + cudaMemcpy(E, E, N * sizeof(stim::complex), cudaMemcpyDeviceToHost); //copy the field from device memory +} +/// Calculate the scalar Mie solution for the scattered field produced by a single plane wave + +/// @param E is a pointer to the destination field values +/// @param N is the number of points used to calculate the field +/// @param x is an array of x coordinates for each point, specified relative to the sphere (x = NULL assumes all zeros) +/// @param y is an array of y coordinates for each point, specified relative to the sphere (y = NULL assumes all zeros) +/// @param z is an array of z coordinates for each point, specified relative to the sphere (z = NULL assumes all zeros) +/// @param W is an array of planewaves that will be scattered +/// @param a is the radius of the sphere +/// @param n is the complex refractive index of the sphere +template +void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std::vector> W, T a, stim::complex n, T r_spacing = 0.1){ + + +#ifdef CUDA_FOUND + stim::complex* dev_E; //allocate space for the field + cudaMalloc(&dev_E, N * sizeof(stim::complex)); + cudaMemcpy(dev_E, E, N * sizeof(stim::complex), cudaMemcpyHostToDevice); + //cudaMemset(dev_F, 0, N * sizeof(stim::complex)); //set the field to zero (necessary because a sum is used) + + // COORDINATES + T* dev_x = NULL; //allocate space and copy the X coordinate (if specified) + if(x != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_x, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_x, x, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_y = NULL; //allocate space and copy the Y coordinate (if specified) + if(y != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_y, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_y, y, N * sizeof(T), cudaMemcpyHostToDevice)); + } + T* dev_z = NULL; //allocate space and copy the Z coordinate (if specified) + if(z != NULL){ + HANDLE_ERROR(cudaMalloc(&dev_z, N * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(dev_z, z, N * sizeof(T), cudaMemcpyHostToDevice)); + } + + gpu_scalar_mie_scatter(dev_E, N, dev_x, dev_y, dev_z, W, a, n, r_spacing); if(x != NULL) cudaFree(dev_x); //free everything if(y != NULL) cudaFree(dev_y); @@ -311,6 +318,16 @@ void cpu_scalar_mie_scatter(stim::complex* E, size_t N, T* x, T* y, T* z, std cudaFree(dev_E); #else + //calculate the necessary number of orders required to represent the scattered field + T k = W[0].kmag(); + + int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); + if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) + //std::cout<<"Nl: "<* B = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients + B_coefficients(B, a, k, n, Nl); //allocate space to store the bessel function call results double vm; @@ -422,16 +439,16 @@ void gpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, st size_t max_shared_mem = stim::sharedMemPerBlock(); size_t hBl_array = sizeof(stim::complex) * (Nl + 1); - std::cout<<"hl*Bl array size: "<) * (Nl - LOCAL_NL); //amount of shared memory to allocate - std::cout<<"shared memory allocated: "<<<< blocks, threads, shared_mem >>>(E, N, x, y, z, W, nW, a, n, jA, r_min, dr, N_jA, (int)Nl); //call the kernel } @@ -452,7 +469,7 @@ void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, st int Nl = (int)ceil(k*a + 4 * cbrt( k * a ) + 2); if(Nl < LOCAL_NL) Nl = LOCAL_NL; //always do at least the minimum number of local operations (kernel optimization) - std::cout<<"Nl: "<* A = (stim::complex*) malloc( sizeof(stim::complex) * (Nl + 1) ); //allocate space for the scattering coefficients @@ -537,7 +554,7 @@ void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, st size_t jA_bytes = sizeof(stim::complex) * (Nl+1) * N_jA_lut; stim::complex* jA_lut = (stim::complex*) malloc(jA_bytes); //pointer to the look-up table T dr = (r_max - r_min) / (N_jA_lut-1); //distance between values in the LUT - std::cout<<"LUT jl bytes: "< hl; stim::complex nd = (stim::complex)n; for(size_t ri = 0; ri < N_jA_lut; ri++){ //for each value in the LUT @@ -559,6 +576,10 @@ void cpu_scalar_mie_internal(stim::complex* E, size_t N, T* x, T* y, T* z, st if(x != NULL) cudaFree(dev_x); //free everything if(y != NULL) cudaFree(dev_y); if(z != NULL) cudaFree(dev_z); + HANDLE_ERROR( cudaFree(dev_jA_lut) ); + HANDLE_ERROR( cudaFree(dev_E) ); + HANDLE_ERROR( cudaFree(dev_W) ); + HANDLE_ERROR( cudaFree(dev_r) ); cudaFree(dev_E); #else @@ -624,19 +645,44 @@ public: n = ri; } + void sum_scat(stim::scalarfield& E, T* X, T* Y, T* Z, stim::scalarbeam b, int samples = 1000){ + std::vector< stim::scalarwave > wave_array = b.mc(samples); //decompose the beam into an array of plane waves + stim::cpu_scalar_mie_scatter(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); + } + + void sum_intern(stim::scalarfield& E, T* X, T* Y, T* Z, stim::scalarbeam b, int samples = 1000){ + std::vector< stim::scalarwave > wave_array = b.mc(samples); //decompose the beam into an array of plane waves + stim::cpu_scalar_mie_internal(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); + } + + void eval(stim::scalarfield& E, T* X, T* Y, T* Z, stim::scalarbeam b, int order = 500, int samples = 1000){ + b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion + std::vector< stim::scalarwave > wave_array = b.mc(samples); //decompose the beam into an array of plane waves + sum_scat(E, X, Y, Z, b, samples); + sum_intern(E, X, Y, Z, b, samples); + } + void eval(stim::scalarfield& E, stim::scalarbeam b, int order = 500, int samples = 1000){ - size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid + /*size_t array_size = E.grid_bytes(); //calculate the number of bytes in the scalar grid float* X = (float*) malloc( array_size ); //allocate space for the coordinate meshes float* Y = (float*) malloc( array_size ); float* Z = (float*) malloc( array_size ); E.meshgrid(X, Y, Z, stim::CPUmem); //calculate the coordinate meshes - - b.eval(E, X, Y, Z, order); //evaluate the incident field using a plane wave expansion + */ + E.meshgrid(); + b.eval(E, order); std::vector< stim::scalarwave > wave_array = b.mc(samples); //decompose the beam into an array of plane waves - stim::cpu_scalar_mie_scatter(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); - stim::cpu_scalar_mie_internal(E.ptr(), E.size(), X, Y, Z, wave_array, radius, n, E.spacing()); + + if(E.gpu()){ + stim::gpu_scalar_mie_scatter(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); + } + else{ + stim::cpu_scalar_mie_scatter(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); + stim::cpu_scalar_mie_internal(E.ptr(), E.size(), E.x(), E.y(), E.z(), wave_array, radius, n, E.spacing()); + } + //eval(E, X, Y, Z, b, order, samples); //evaluate the field } }; //end stim::scalarmie diff --git a/stim/visualization/colormap.h b/stim/visualization/colormap.h index 60d882e..3920cd3 100644 --- a/stim/visualization/colormap.h +++ b/stim/visualization/colormap.h @@ -4,6 +4,7 @@ #include #include #include +#include "cublas_v2.h" #ifdef _WIN32 #include @@ -219,6 +220,42 @@ static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, u free(cpuBuffer); } +/// save a GPU image to a file using automatic scaling +template +static void gpu2image(T* gpuSource, std::string fileDest, unsigned int x_size, unsigned int y_size, colormapType cm = cmGrayscale){ + size_t N = x_size * y_size; //calculate the total number of elements in the image + + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); //create a cuBLAS handle + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS initialization failed\n"); + exit(1); + } + + int i_min, i_max; + stat = cublasIsamin(handle, (int)N, gpuSource, 1, &i_min); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate minimum r value.\n"); + exit(1); + } + stat = cublasIsamax(handle, (int)N, gpuSource, 1, &i_max); + if (stat != CUBLAS_STATUS_SUCCESS){ //test for failure + printf ("CUBLAS Error: failed to calculate maximum r value.\n"); + exit(1); + } + cublasDestroy(handle); + + i_min--; //cuBLAS uses 1-based indexing for Fortran compatibility + i_max--; + T v_min, v_max; //allocate space to store the minimum and maximum values + HANDLE_ERROR( cudaMemcpy(&v_min, gpuSource + i_min, sizeof(T), cudaMemcpyDeviceToHost) ); //copy the min and max values from the device to the CPU + HANDLE_ERROR( cudaMemcpy(&v_max, gpuSource + i_max, sizeof(T), cudaMemcpyDeviceToHost) ); + + gpu2image(gpuSource, fileDest, x_size, y_size, v_min, v_max, cm); +} + #endif template -- libgit2 0.21.4