#ifndef STIM_CUDA_RE_SAMPLE_H
#define STIM_CUDA_RE_SAMPLE_H

// NOTE(review): the include targets were missing from the reviewed copy of this
// file (only four bare "#include" tokens survived). The list below is a
// reconstruction based on what the code uses (std::cerr, the CUDA runtime,
// stim::maxThreadsPerBlock, and the commented-out gaussian blur call) --
// confirm against version control before merging.
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/templates/gaussian_blur.cuh>

namespace stim{
	namespace cuda{

		/// Converts the fractional scale factor into the integer pixel stride (1/resize)
		/// between populated samples of the full-resolution image.
		///
		/// A non-positive (or NaN) factor, or any factor whose reciprocal truncates to
		/// zero (resize > 1), yields a stride of 1 so that callers never divide or take
		/// a modulus by zero.
		/// @param resize is the scale factor relating the small image to the large one
		template<typename T>
		__host__ __device__ inline unsigned int re_sample_stride(T resize){
			if(!(resize > 0))
				return 1;
			unsigned int stride = (unsigned int)(1 / resize);
			return stride == 0 ? 1 : stride;
		}

		/// Kernel: scatters a down-sampled image into a full-resolution image.
		///
		/// Output pixel (xi, yi) receives input pixel (xi/s, yi/s) when both coordinates
		/// are multiples of the stride s = 1/resize; every other output pixel is set to 0.
		///
		/// Launch layout: one thread per output pixel; the grid must cover x columns in
		/// the x dimension and y rows in the y dimension. No shared memory is used.
		///
		/// @param gpuI is the output image (device pointer), x * y elements
		/// @param gpuI0 is the down-sampled input image (device pointer),
		///        ceil(x/s) * ceil(y/s) elements, row-major with row length ceil(x/s)
		/// @param resize is the scale factor; the stride is s = 1/resize
		/// @param x is the width of the output image in pixels
		/// @param y is the height of the output image in pixels
		template<typename T>
		__global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){

			unsigned int sigma_ds = re_sample_stride(resize);

			// row length of the down-sampled image (ceiling division)
			unsigned int x_ds = (x / sigma_ds + (x % sigma_ds == 0 ? 0 : 1));

			// calculate the 2D coordinates for this current thread.
			// blockDim.y is 1 for the launch used by gpu_re_sample, so yi == blockIdx.y there.
			unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x;
			unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y;

			// the grid rarely divides the image evenly: drop the surplus threads
			if(xi >= x || yi >= y)
				return;

			// convert 2D coordinates to 1D (size_t avoids overflow on large images)
			size_t i = (size_t)yi * x + xi;

			if(xi % sigma_ds == 0 && yi % sigma_ds == 0){
				size_t i0 = (size_t)(yi / sigma_ds) * x_ds + xi / sigma_ds;
				gpuI[i] = gpuI0[i0];
			}
			else{
				// every pixel that is not on the sample lattice is written explicitly;
				// the output buffer is not assumed to be zero-initialized
				gpuI[i] = 0;
			}
		}

		/// Re-samples a down-sampled 2D image stored on the GPU back onto the
		/// full-resolution pixel grid (samples on the stride lattice, zeros elsewhere).
		///
		/// The kernel is launched on the default stream and this function does not
		/// synchronize. A launch failure is reported on std::cerr.
		///
		/// @param gpuI is the output image (device pointer), x * y elements
		/// @param gpuI0 is the down-sampled input image (device pointer)
		/// @param resize is the scale factor; the sample stride is 1/resize
		/// @param x is the width of the output image in pixels
		/// @param y is the height of the output image in pixels
		template<typename T>
		void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){

			// an empty image would produce a zero-sized grid, which is an invalid launch
			if(x == 0 || y == 0)
				return;

			unsigned int max_threads = stim::maxThreadsPerBlock();
			dim3 threads(max_threads, 1);
			dim3 blocks(x / threads.x + (x % threads.x == 0 ? 0 : 1), y);

			//stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y);

			//resample the image
			cuda_re_sample<T> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);

			// kernel launches do not return a status: query it explicitly
			cudaError_t err = cudaGetLastError();
			if(err != cudaSuccess)
				std::cerr << "stim::cuda::gpu_re_sample: kernel launch failed: "
				          << cudaGetErrorString(err) << std::endl;
		}

		/// Re-samples a down-sampled 2D image stored on the CPU back onto the
		/// full-resolution pixel grid, using the GPU for the computation.
		///
		/// Any CUDA failure is reported on std::cerr; in that case "out" may be left
		/// unmodified. Device buffers are released on every path.
		///
		/// @param out is the output image (host pointer), x * y elements
		/// @param in is the down-sampled input image (host pointer),
		///        ceil(x/s) * ceil(y/s) elements where s = 1/resize
		/// @param resize is the scale factor; the sample stride is 1/resize
		/// @param x is the width of the output image in pixels
		/// @param y is the height of the output image in pixels
		template<typename T>
		void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){

			if(x == 0 || y == 0)
				return;

			//get the number of bytes in the full-resolution image
			//(sizeof(T) comes first so the product is evaluated in size_t)
			size_t bytes = sizeof(T) * x * y;

			//get the number of bytes in the down-sampled image
			unsigned int sigma_ds = re_sample_stride(resize);
			unsigned int x_ds = (x / sigma_ds + (x % sigma_ds == 0 ? 0 : 1));
			unsigned int y_ds = (y / sigma_ds + (y % sigma_ds == 0 ? 0 : 1));
			size_t bytes_ds = sizeof(T) * x_ds * y_ds;

			T* gpuI0 = NULL;	//down-sampled input image on the GPU
			T* gpuI = NULL;		//full-resolution output image on the GPU

			//allocate space on the GPU for the down-sampled input image
			cudaError_t err = cudaMalloc(&gpuI0, bytes_ds);

			//allocate space on the GPU for the re-sampled output image
			if(err == cudaSuccess)
				err = cudaMalloc(&gpuI, bytes);

			//copy the input image data to the GPU
			if(err == cudaSuccess)
				err = cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice);

			if(err == cudaSuccess){
				//run the GPU-based version of the algorithm
				gpu_re_sample<T>(gpuI, gpuI0, resize, x, y);

				//copy the full-resolution result back to the CPU; this blocking copy
				//also waits for the kernel to finish
				err = cudaMemcpy(out, gpuI, bytes, cudaMemcpyDeviceToHost);
			}

			if(err != cudaSuccess)
				std::cerr << "stim::cuda::cpu_re_sample: CUDA error: "
				          << cudaGetErrorString(err) << std::endl;

			//cudaFree accepts a null pointer, so this is safe after a failed allocation
			cudaFree(gpuI0);
			cudaFree(gpuI);
		}

	}
}

#endif