template __global__ void cuda_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){ size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate the current working position within the destination image size_t yi = blockIdx.y * blockDim.y + threadIdx.y; if(xi >= dx || yi >= dy) return; //if this thread is outside of the destination image, return size_t di = yi * dx + xi; //calculate the 1D index into the destination image size_t si = (y + yi) * sx + (x + xi); //calculate the 1D index into the source image dest[di] = src[si]; //copy the corresponding source pixel to the destination image } /// Crops a 2D image composed of elements of type T /// @param dest is a device pointer to memory of size dx*dy that will store the cropped image /// @param src is a device pointer to memory of size sx*sy that stores the original image /// @param sx is the size of the source image along x /// @param sy is the size of the source image along y /// @param x is the x-coordinate of the start position of the cropped region within the source image /// @param y is the y-coordinate of the start position of the cropped region within the source image /// @param dx is the size of the destination image along x /// @param dy is the size of the destination image along y template void gpu_crop2d(T* dest, T* src, size_t sx, size_t sy, size_t x, size_t y, size_t dx, size_t dy){ int max_threads = stim::maxThreadsPerBlock(); //get the maximum number of threads per block for the CUDA device dim3 threads( sqrt(max_threads), sqrt(max_threads) ); dim3 blocks( dx / sqrt(threads.x) + 1, dy / sqrt(threads.y) + 1); //calculate the optimal number of blocks cuda_crop2d <<< blocks, threads >>>(dest, src, sx, sy, x, y, dx, dy); }