#ifndef STIM_CUDA_FLOAT_TO_HALF_H
#define STIM_CUDA_FLOAT_TO_HALF_H

#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#include <stim/cuda/cudatools/error.h>
#include <cuda_fp16.h>
#include <stdio.h>
		/// CUDA kernel: converts one element of an x*y*z float array to half precision per thread.
		///
		/// Expected launch layout (2D grid of 2D blocks):
		///   - gridDim.x * blockDim.x covers the x axis
		///   - gridDim.y packs the y axis and the z axis together: the blocks of z-slice 0 come
		///     first, then those of slice 1, and so on. Each slice uses ceil(y / blockDim.y)
		///     blocks, so gridDim.y must be at least ceil(y / blockDim.y) * z.
		/// Threads that fall outside the volume exit without touching memory, so launching
		/// more blocks than strictly necessary is harmless.
		///
		/// @param gpu_half		device pointer to the output array (x*y*z half values)
		/// @param gpu_float	device pointer to the input array (x*y*z float values)
		/// @param x			size of the fastest-varying dimension
		/// @param y			size of the second dimension
		/// @param z			size of the slowest-varying dimension
		__global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){

			//an empty volume has nothing to convert; this also guarantees grid_y >= 1 below,
			//	so the modulo and division can never divide by zero
			if(x <= 0 || y <= 0 || z <= 0) return;

			//calculate the x coordinate for this thread
			int xi = blockIdx.x * blockDim.x + threadIdx.x;

			//number of blocks covering one z-slice along y, rounded UP so that a partial
			//	final block is counted (rounding down drops the tail rows and yields zero
			//	whenever y < blockDim.y)
			int grid_y = (y + blockDim.y - 1) / blockDim.y;

			//unpack the combined y/z block index into a y coordinate and a slice index
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;

			//threads outside the volume do nothing
			if(xi >= x || yi >= y || zi >= z) return;

			//flat index computed in size_t so volumes with more than 2^31 elements do not overflow
			size_t i = ((size_t)zi * y + yi) * x + xi;

			gpu_half[i] = __float2half(gpu_float[i]);
		}


		/// Converts a float array that already resides on the GPU to half precision.
		///
		/// Launches cuda_f2h with square 2D thread blocks. The grid's x dimension covers the
		/// x axis; the grid's y dimension holds (blocks per slice along y) * z, i.e. the y and
		/// z axes are packed together. The launch is asynchronous: this function does not
		/// wait for the kernel to finish. A failed launch is reported on stderr and the error
		/// is left in place (not cleared) so callers can still query it.
		///
		/// @param gpu_half		device pointer to the output array (x*y*z half values)
		/// @param gpu_float	device pointer to the input array (x*y*z float values)
		/// @param x			size of the fastest-varying dimension
		/// @param y			size of the second dimension
		/// @param z			size of the slowest-varying dimension
		void gpu_f2h(half* gpu_half, float* gpu_float, unsigned int x, unsigned int y, unsigned int z){

			//an empty volume needs no work, and a grid with a zero-sized dimension is an
			//	invalid launch configuration
			if(x == 0 || y == 0 || z == 0) return;

			//use the largest square block that fits within the per-block thread limit
			//	(presumably stim::maxThreadsPerBlock() returns the device limit -- fall back
			//	to a 1x1 block if it reports a non-positive value)
			int max_threads = stim::maxThreadsPerBlock();
			unsigned int side = (max_threads > 0) ? (unsigned int)sqrt((double)max_threads) : 1;
			dim3 threads(side, side);

			//round the block counts up so that partial blocks at the edges are included
			unsigned int blocks_x = x / threads.x + (x % threads.x != 0);
			unsigned int blocks_y = y / threads.y + (y % threads.y != 0);

			//make sure the packed y/z block count did not wrap around in 32 bits
			if(blocks_y > (unsigned int)(-1) / z){
				fprintf(stderr, "gpu_f2h: volume %u x %u x %u is too large for a single launch\n", x, y, z);
				return;
			}
			dim3 blocks(blocks_x, blocks_y * z);

			//call the GPU kernel to perform the float-to-half conversion
			cuda_f2h <<< blocks, threads >>>(gpu_half, gpu_float, x, y, z);

			//kernel launches return no status, so query it explicitly; peek (rather than
			//	get) leaves the error set for any caller that performs its own check
			cudaError_t err = cudaPeekAtLastError();
			if(err != cudaSuccess)
				fprintf(stderr, "gpu_f2h: kernel launch failed: %s\n", cudaGetErrorString(err));

		}


		/// Converts a float array in host memory to half precision using the GPU.
		///
		/// Allocates temporary device buffers, uploads the input, runs gpu_f2h, downloads the
		/// result and releases the buffers. If any CUDA call fails, a message is written to
		/// stderr, the remaining steps are skipped, and h_out is left without a valid result;
		/// the device buffers are released on every path.
		///
		/// @param h_out	host pointer to the output array (room for x*y*z half values)
		/// @param f_in		host pointer to the input array (x*y*z float values)
		/// @param x		size of the fastest-varying dimension
		/// @param y		size of the second dimension
		/// @param z		size of the slowest-varying dimension
		void cpu_f2h(half* h_out, float* f_in, unsigned int x, unsigned int y, unsigned int z){

			//calculate the number of pixels in the array; done in size_t because the product
			//	of three 32-bit sizes can exceed the 32-bit range
			size_t pix = (size_t)x * y * z;

			//nothing to convert for an empty volume
			if(pix == 0) return;

			float* gpu_float = NULL;		//device copy of the input
			half* gpu_half = NULL;			//device buffer for the output

			//allocate memory on the GPU for the input float precision and the output half precision
			cudaError_t err = cudaMalloc(&gpu_float, pix * sizeof(float));
			if(err == cudaSuccess)
				err = cudaMalloc(&gpu_half, pix * sizeof(half));

			//copy the input to the GPU
			if(err == cudaSuccess)
				err = cudaMemcpy(gpu_float, f_in, pix * sizeof(float), cudaMemcpyHostToDevice);

			if(err == cudaSuccess){
				//call the GPU version of this function
				gpu_f2h(gpu_half, gpu_float, x, y, z);

				//pick up a failed kernel launch, which is not reported by the launch itself
				err = cudaGetLastError();
			}

			//copy the array back to the CPU; this copy blocks until the kernel has finished
			if(err == cudaSuccess)
				err = cudaMemcpy(h_out, gpu_half, pix * sizeof(half), cudaMemcpyDeviceToHost);

			if(err != cudaSuccess)
				fprintf(stderr, "cpu_f2h: CUDA error: %s\n", cudaGetErrorString(err));

			//free allocated memory (cudaFree accepts a null pointer, so this is safe even
			//	when an allocation above failed)
			cudaFree(gpu_float);
			cudaFree(gpu_half);

		}


#endif