template
__global__ void cuda_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){
size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate the current working position within the destination image
size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
if(xi >= dx || yi >= dy) return; //if this thread is outside of the destination image, return
size_t di = yi * dx + xi; //calculate the 1D index into the destination image
size_t si = (y + yi) * sx + (x + xi); //calculate the 1D index into the source image
dest[di] = src[si]; //copy the corresponding source pixel to the destination image
}
/// Crops a 2D image composed of elements of type T
/// @param dest is a device pointer to memory of size dx*dy that will store the cropped image
/// @param src is a device pointer to memory of size sx*sy that stores the original image
/// @param sx is the size of the source image along x
/// @param sy is the size of the source image along y
/// @param x is the x-coordinate of the start position of the cropped region within the source image
/// @param y is the y-coordinate of the start position of the cropped region within the source image
/// @param dx is the size of the destination image along x
/// @param dy is the size of the destination image along y
template
void gpu_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){
int max_threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device
dim3 threads( sqrt(max_threads), sqrt(max_threads) );
dim3 blocks( dx / sqrt(threads.x) + 1, dy / sqrt(threads.y) + 1); //calculate the optimal number of blocks
cuda_crop2d <<< blocks, threads >>>(dest, src, sx, sy, x, y, dx, dy);
}