#ifndef STIM_CUDA_HALF_TO_FLOAT_H
#define STIM_CUDA_HALF_TO_FLOAT_H

#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#include <stim/cuda/cudatools/error.h>
#include "cuda_fp16.h"

		__global__ void cuda_h2f(float* gpu_float, half* gpu_half, int x, int y, int z){

			
			//calculate x,y,z coordinates for this thread
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			//find the grid size along y
			int grid_y = y / blockDim.y;
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;
			int i = zi * x * y + yi * x + xi;

			if(xi >= x|| yi >= y || zi>= z) return;

							
			gpu_float[i] = __half2float(gpu_half[i]);
							
		}


		void gpu_h2f(float* gpu_float, half* gpu_half, unsigned int x, unsigned int y, unsigned int z){

			
			int max_threads = stim::maxThreadsPerBlock();
			dim3 threads(sqrt (max_threads),sqrt (max_threads));
			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);

			//call the GPU kernel to determine the gradient
			cuda_h2f <<< blocks, threads >>>(gpu_float, gpu_half, x, y, z);

		}


		void cpu_f2h(float* f_out, half* h_in, unsigned int x, unsigned int y, unsigned int z){

			//calculate the number of pixels in the array
			unsigned int pix = x* y* z;

			//allocate memory on the GPU for the input half precision
			half* gpu_half;
			cudaMalloc(&gpu_half, pix * sizeof(half));			
			cudaMemcpy(gpu_half, h_in, pix * sizeof(half), cudaMemcpyHostToDevice);

			//allocate memory on the GPU for the output float precision.
			float* gpu_float;
			cudaMalloc(&gpu_float, pix * sizeof(float));
			
			
			//call the GPU version of this function
			gpu_h2f(gpu_float, gpu_half, x, y, z);

			cudaMemcpy(f_out, gpu_float, pix * sizeof(float), cudaMemcpyDeviceToHost);

			
			//free allocated memory
			cudaFree(gpu_float);
			cudaFree(gpu_half);

		}


#endif