#ifndef STIM_CUDA_HALF_TO_FLOAT_H #define STIM_CUDA_HALF_TO_FLOAT_H #include #include #include #include #include #include "cuda_fp16.h" __global__ void cuda_h2f(float* gpu_float, half* gpu_half, int x, int y, int z){ //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; //find the grid size along y int grid_y = y / blockDim.y; int blockidx_y = blockIdx.y % grid_y; int yi = blockidx_y * blockDim.y + threadIdx.y; int zi = blockIdx.y / grid_y; int i = zi * x * y + yi * x + xi; if(xi >= x|| yi >= y || zi>= z) return; gpu_float[i] = __half2float(gpu_half[i]); } void gpu_h2f(float* gpu_float, half* gpu_half, unsigned int x, unsigned int y, unsigned int z){ int max_threads = stim::maxThreadsPerBlock(); dim3 threads(sqrt (max_threads),sqrt (max_threads)); dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); //call the GPU kernel to determine the gradient cuda_h2f <<< blocks, threads >>>(gpu_float, gpu_half, x, y, z); } void cpu_f2h(float* f_out, half* h_in, unsigned int x, unsigned int y, unsigned int z){ //calculate the number of pixels in the array unsigned int pix = x* y* z; //allocate memory on the GPU for the input half precision half* gpu_half; cudaMalloc(&gpu_half, pix * sizeof(half)); cudaMemcpy(gpu_half, h_in, pix * sizeof(half), cudaMemcpyHostToDevice); //allocate memory on the GPU for the output float precision. float* gpu_float; cudaMalloc(&gpu_float, pix * sizeof(float)); //call the GPU version of this function gpu_h2f(gpu_float, gpu_half, x, y, z); cudaMemcpy(f_out, gpu_float, pix * sizeof(float), cudaMemcpyDeviceToHost); //free allocated memory cudaFree(gpu_float); cudaFree(gpu_half); } #endif