template<typename T>
__global__ void cuda_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){

	size_t xi = blockIdx.x * blockDim.x + threadIdx.x;				//calculate the current working position within the destination image
	size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
	if(xi >= dx || yi >= dy) return;									//if this thread is outside of the destination image, return

	size_t di = yi * dx + xi;										//calculate the 1D index into the destination image
	size_t si = (y + yi) * sx + (x + xi);							//calculate the 1D index into the source image

	dest[di] = src[si];												//copy the corresponding source pixel to the destination image
}

/// Crops a 2D image composed of elements of type T
/// @param dest is a device pointer to memory of size dx*dy that will store the cropped image
/// @param src is a device pointer to memory of size sx*sy that stores the original image
/// @param sx is the size of the source image along x
/// @param sy is the size of the source image along y
/// @param x is the x-coordinate of the start position of the cropped region within the source image
/// @param y is the y-coordinate of the start position of the cropped region within the source image
/// @param dx is the size of the destination image along x
/// @param dy is the size of the destination image along y
template<typename T>
void gpu_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){
	int max_threads = stim::maxThreadsPerBlock();												//get the maximum number of threads per block for the CUDA device
	dim3 threads( sqrt(max_threads), sqrt(max_threads) );
	dim3 blocks( dx / sqrt(threads.x) + 1, dy / sqrt(threads.y) + 1);							//calculate the optimal number of blocks
	cuda_crop2d<T> <<< blocks, threads >>>(dest, src, sx, sy, x, y, dx, dy);
}