#ifndef STIM_CUDA_SEPCONV2_H
#define STIM_CUDA_SEPCONV2_H
#include <stim/math/filters/conv2.h>
#ifdef __CUDACC__
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#endif

namespace stim {
#ifdef __CUDACC__
	//Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
	//@param out is a pointer to the output image
	//@param in is a pointer to the input image
	//@param sx is the size of the input image along X
	//@param sy is the size of the input image along Y
	//@param kx is the size of the kernel along X
	//@param ky is the size of the kernel along Y
	template<typename T, typename K = T>
	void gpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
		cudaDeviceProp p;
		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
		size_t tmax = p.maxThreadsPerBlock;
		dim3 nt(sqrt(tmax), sqrt(tmax));						//calculate the block dimensions
		size_t X = sx - kx + 1;									//calculate the x size of the output image
		T* temp;												//declare a temporary variable to store the intermediate image
		HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T)));	//allocate memory for the intermediate image

		dim3 nb(X / nt.x + 1, sy / nt.y + 1);							//calculate the grid dimensions
		size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T);					//shared memory bytes required to store block data
		if (sm > p.sharedMemPerBlock) {
			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
			exit(1);
		}
		kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1);	//launch the kernel to compute the intermediate image

		size_t Y = sy - ky + 1;									//calculate the y size of the output image
		nb.y = Y / nt.y + 1;									//update the grid dimensions to reflect the Y-axis size of the output image
		sm = nt.x * (nt.y + ky - 1) * sizeof(T);				//calculate the amount of shared memory needed for the second pass
		if (sm > p.sharedMemPerBlock) {
			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
			exit(1);
		}
		kernel_conv2 <<<nb, nt, sm>>> (out, temp, k1, X, sy, 1, ky);	//launch the kernel to compute the final image
		HANDLE_ERROR(cudaFree(temp));							//free memory allocated for the intermediate image
	}
#endif
	//Performs a separable convolution of a 2D image. Only valid pixels based on the kernel are returned.
	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
	//@param out is a pointer to the output image
	//@param in is a pointer to the input image
	//@param k0 is the x-axis convolution filter
	//@param k1 is the y-axis convolution filter
	//@param sx is the size of the input image along X
	//@param sy is the size of the input image along Y
	//@param kx is the size of the kernel along X
	//@param ky is the size of the kernel along Y
	template<typename T, typename K = T>
	void cpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
		size_t X = sx - kx + 1;					//x size of the output image
		size_t Y = sy - ky + 1;
#ifdef __CUDACC__
		//allocate memory and copy everything to the GPU
		T* gpu_in;
		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
		K* gpu_k0;
		HANDLE_ERROR(cudaMalloc(&gpu_k0, kx * sizeof(K)));
		HANDLE_ERROR(cudaMemcpy(gpu_k0, k0, kx * sizeof(K), cudaMemcpyHostToDevice));
		K* gpu_k1;
		HANDLE_ERROR(cudaMalloc(&gpu_k1, ky * sizeof(K)));
		HANDLE_ERROR(cudaMemcpy(gpu_k1, k1, ky * sizeof(K), cudaMemcpyHostToDevice));
		T* gpu_out;
		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
		gpu_sepconv2(gpu_out, gpu_in, gpu_k0, gpu_k1, sx, sy, kx, ky);								//execute the GPU kernel
		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
		HANDLE_ERROR(cudaFree(gpu_in));
		HANDLE_ERROR(cudaFree(gpu_k0));
		HANDLE_ERROR(cudaFree(gpu_k1));
		HANDLE_ERROR(cudaFree(gpu_out));
#else
		T* temp = (T*)malloc(X * sy * sizeof(T));		//allocate space for the intermediate image
		cpu_conv2(temp, in, k0, sx, sy, kx, 1);			//evaluate the intermediate image
		cpu_conv2(out, temp, k1, X, sy, 1, ky);			//evaluate the final image
		free(temp);										//free the memory for the intermediate image
#endif
	}
}

#endif