#ifndef STIM_CUDA_FLOAT_TO_HALF_H
#define STIM_CUDA_FLOAT_TO_HALF_H

// NOTE(review): the targets of the original #include lines were not visible in
// the reviewed text. The list below is reconstructed from what this file uses
// (sqrt, fprintf, the CUDA runtime, half / __float2half and
// stim::maxThreadsPerBlock) -- confirm the project header paths against the
// repository before merging.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include <stim/cuda/cudatools.h>
#include <stim/cuda/cudatools/error.h>
#include <stim/cuda/sharedmem.cuh>

/// Kernel: converts an x-by-y-by-z array of floats to half precision.
///
/// Both arrays are indexed as i = (zi * y + yi) * x + xi.
///
/// Expected launch layout (2D grid, 2D blocks):
///   - grid.x * blockDim.x must cover x
///   - grid.y holds the y and z axes folded together: every z-slice occupies
///     ceil(y / blockDim.y) consecutive values of blockIdx.y, so grid.y must be
///     at least ceil(y / blockDim.y) * z. Surplus blocks exit without touching
///     memory.
///
/// @param gpu_half   device pointer receiving x*y*z half values
/// @param gpu_float  device pointer to x*y*z float values
/// @param x          array size along the fastest-varying axis
/// @param y          array size along the second axis
/// @param z          array size along the slowest-varying axis
__global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){

	//nothing to convert; this also keeps grid_y below from being zero
	if(x <= 0 || y <= 0 || z <= 0) return;

	//number of blocks that cover one z-slice along y (rounded up so that a
	//	partial last block is counted; rounding down would skip trailing rows
	//	and divide by zero whenever y < blockDim.y)
	const unsigned int grid_y = ((unsigned int)y + blockDim.y - 1) / blockDim.y;

	//calculate x,y,z coordinates for this thread
	const unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x;
	const unsigned int yi = (blockIdx.y % grid_y) * blockDim.y + threadIdx.y;
	const unsigned int zi = blockIdx.y / grid_y;

	//threads that fall outside of the array have nothing to do
	if(xi >= (unsigned int)x || yi >= (unsigned int)y || zi >= (unsigned int)z) return;

	//linear index, computed in size_t so that large volumes do not overflow
	const size_t i = ((size_t)zi * (size_t)y + yi) * (size_t)x + xi;

	gpu_half[i] = __float2half(gpu_float[i]);
}

/// Converts a float array that is already on the GPU to half precision.
///
/// The volume is processed in groups of z-slices so that the y-dimension of
/// the launch grid never exceeds the hardware limit. Launch failures are
/// reported on stderr and abort the conversion; the kernels themselves run
/// asynchronously with respect to the host.
///
/// @param gpu_half   device pointer receiving x*y*z half values
/// @param gpu_float  device pointer to x*y*z float values
/// @param x, y, z    array dimensions (x varies fastest)
void gpu_f2h(half* gpu_half, float* gpu_float, unsigned int x, unsigned int y, unsigned int z){

	//an empty array needs no launch (a zero-sized grid is an invalid configuration)
	if(x == 0 || y == 0 || z == 0) return;

	//use the largest square thread block that the device allows
	int max_threads = stim::maxThreadsPerBlock();
	unsigned int side = (unsigned int)sqrt((double)max_threads);
	if(side == 0) side = 1;
	const dim3 threads(side, side);

	//number of blocks required to cover one z-slice (ceiling division)
	const unsigned int blocks_x = x / threads.x + (x % threads.x != 0 ? 1 : 0);
	const unsigned int blocks_y = y / threads.y + (y % threads.y != 0 ? 1 : 0);

	//maximum y-dimension of a grid (CUDA compute-capability limits)
	const unsigned int max_grid_y = 65535;
	if(blocks_y > max_grid_y){
		fprintf(stderr, "gpu_f2h: y = %u is too large for a single kernel launch\n", y);
		return;
	}

	//number of z-slices that fit into the grid's y-dimension (at least one)
	const unsigned int z_per_launch = max_grid_y / blocks_y;

	//number of elements in one z-slice
	const size_t slice = (size_t)x * (size_t)y;

	for(unsigned int z0 = 0; z0 < z; ){

		//number of slices handled by this launch
		const unsigned int nz = (z - z0 < z_per_launch) ? (z - z0) : z_per_launch;
		const dim3 blocks(blocks_x, blocks_y * nz);

		//call the GPU kernel to convert slices [z0, z0 + nz)
		cuda_f2h <<< blocks, threads >>>(gpu_half + z0 * slice, gpu_float + z0 * slice, x, y, nz);

		//a kernel launch returns nothing, so bad configurations are detected here
		const cudaError_t err = cudaGetLastError();
		if(err != cudaSuccess){
			fprintf(stderr, "gpu_f2h: kernel launch failed: %s\n", cudaGetErrorString(err));
			return;
		}

		z0 += nz;
	}
}

/// Converts a float array in host memory to half precision, using the GPU.
///
/// Temporary device buffers are allocated for the input and the output, the
/// conversion runs through gpu_f2h, and the result is copied back to the host.
/// Any CUDA error is reported on stderr; in that case the contents of h_out
/// are unspecified. The device buffers are released on every path.
///
/// @param h_out    host pointer receiving x*y*z half values
/// @param f_in     host pointer to x*y*z float values
/// @param x, y, z  array dimensions (x varies fastest)
void cpu_f2h(half* h_out, float* f_in, unsigned int x, unsigned int y, unsigned int z){

	//calculate the number of pixels in the array (size_t avoids 32-bit overflow)
	const size_t pix = (size_t)x * (size_t)y * (size_t)z;
	if(pix == 0) return;

	float* gpu_float = NULL;
	half* gpu_half = NULL;

	//allocate memory on the GPU for the input float precision and upload it
	cudaError_t err = cudaMalloc(&gpu_float, pix * sizeof(float));
	if(err == cudaSuccess)
		err = cudaMemcpy(gpu_float, f_in, pix * sizeof(float), cudaMemcpyHostToDevice);

	//allocate memory on the GPU for the output half precision
	if(err == cudaSuccess)
		err = cudaMalloc(&gpu_half, pix * sizeof(half));

	if(err == cudaSuccess){
		//call the GPU version of this function
		gpu_f2h(gpu_half, gpu_float, x, y, z);

		//copy the array back to the CPU
		err = cudaMemcpy(h_out, gpu_half, pix * sizeof(half), cudaMemcpyDeviceToHost);
	}

	if(err != cudaSuccess)
		fprintf(stderr, "cpu_f2h: CUDA error: %s\n", cudaGetErrorString(err));

	//free allocated memory (cudaFree on a null pointer performs no operation)
	cudaFree(gpu_float);
	cudaFree(gpu_half);
}

#endif