#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
#define STIM_CUDA_GAUSSIAN_BLUR_H

// NOTE(review): the targets of the five original #include directives could not
// be recovered from this copy of the file. Before building, restore the project
// headers that provide HANDLE_ERROR, gpuTimer_start()/gpuTimer_end() and the
// stim::cuda::tex_conv2sep / gpu_conv2sep / cpu_conv2sep routines (the
// GPU-based separable convolution algorithm).
#include <cmath>     // std::exp, std::sqrt
#include <vector>    // host-side storage for the filter taps

namespace stim{
    namespace cuda {

        /// Fills a caller-provided host array with samples of a 1D Gaussian.
        ///
        /// Tap xi holds 1/(sigma*sqrt(2*pi)) * exp(-d^2 / (2*sigma^2)) with
        /// d = xi - width/2 (integer division), i.e. the curve is centred on
        /// element width/2. The taps are NOT re-normalised to sum to one.
        ///
        /// @param out    host array with room for at least `width` elements
        /// @param sigma  standard deviation of the Gaussian, in samples
        /// @param width  number of taps to write; 0 writes nothing
        ///
        /// If sigma is not strictly positive (zero, negative or NaN) the formula
        /// above is undefined, so a unit impulse at element width/2 is written
        /// instead (the limit of a Gaussian as sigma approaches zero).
        template<typename T>
        void gen_gaussian(T* out, T sigma, unsigned int width)
        {
            const unsigned int centre = width / 2;

            // Degenerate sigma: emit an identity (delta) kernel rather than NaNs.
            if (!(sigma > 0)) {
                for (unsigned int xi = 0; xi < width; xi++)
                    out[xi] = (xi == centre) ? static_cast<T>(1) : static_cast<T>(0);
                return;
            }

            // Loop-invariant terms, evaluated once in double precision.
            const double two_pi    = 6.283185307179586;
            const double s         = static_cast<double>(sigma);
            const double norm      = 1.0 / (s * std::sqrt(two_pi));
            const double inv_2var  = 1.0 / (2.0 * s * s);

            for (unsigned int xi = 0; xi < width; xi++) {
                // signed distance of this tap from the centre of the kernel
                const double d = static_cast<double>(xi) - static_cast<double>(centre);
                out[xi] = static_cast<T>(norm * std::exp(-(d * d) * inv_2var));
            }
        }

        /// Builds the host-side Gaussian kernel shared by the blur entry points.
        ///
        /// The kernel has (unsigned int)(sigma * 8 + 1) taps; a sigma that is not
        /// strictly positive yields a single-tap identity kernel. The returned
        /// vector is never empty and owns its storage, so callers cannot leak it.
        template<typename T>
        std::vector<T> make_gaussian_kernel(T sigma)
        {
            unsigned int kwidth = 1;
            if (sigma > 0)
                kwidth = static_cast<unsigned int>(sigma * 8 + 1);

            std::vector<T> kernel(kwidth);
            gen_gaussian(&kernel[0], sigma, kwidth);
            return kernel;
        }

        /// Gaussian-blurs a 2D image that is read through a texture object.
        ///
        /// Builds a Gaussian kernel on the host, uploads it to the device and
        /// passes it as both 1D kernels to stim::cuda::tex_conv2sep.
        ///
        /// @param out      destination buffer handed to tex_conv2sep
        ///                 (presumably device memory - confirm against tex_conv2sep)
        /// @param sigma    standard deviation of the Gaussian
        /// @param x, y     image dimensions forwarded to tex_conv2sep
        ///                 (presumably width and height - confirm)
        /// @param texObj   texture object forwarded to tex_conv2sep
        /// @param cuArray  CUDA array forwarded to tex_conv2sep
        template<typename T>
        void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray)
        {
            // Host kernel stored as T so its byte size matches the device copy below.
            std::vector<T> kernel = make_gaussian_kernel(sigma);
            unsigned int kwidth = static_cast<unsigned int>(kernel.size());

            // copy the kernel to the GPU
            T* gpuKernel0;
            HANDLE_ERROR(cudaMalloc(&gpuKernel0, kwidth * sizeof(T)));
            HANDLE_ERROR(cudaMemcpy(gpuKernel0, &kernel[0], kwidth * sizeof(T), cudaMemcpyHostToDevice));

            // perform the gaussian blur as a separable convolution
            // (the same 1D kernel is used for both passes)
            stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth);

            HANDLE_ERROR(cudaFree(gpuKernel0));
        }

        /// Gaussian-blurs a 2D image via stim::cuda::gpu_conv2sep.
        ///
        /// Builds a Gaussian kernel on the host, uploads it to the device and
        /// passes it as both 1D kernels to gpu_conv2sep.
        ///
        /// @param image  image buffer handed to gpu_conv2sep
        ///               (presumably device memory, filtered in place - confirm)
        /// @param sigma  standard deviation of the Gaussian
        /// @param x, y   image dimensions forwarded to gpu_conv2sep
        ///               (presumably width and height - confirm)
        template<typename T>
        void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y)
        {
            // Host kernel stored as T so its byte size matches the device copy below;
            // the vector releases the host memory on return.
            std::vector<T> kernel = make_gaussian_kernel(sigma);
            unsigned int kwidth = static_cast<unsigned int>(kernel.size());

            // copy the kernel to the GPU
            T* gpuKernel0;
            HANDLE_ERROR(cudaMalloc(&gpuKernel0, kwidth * sizeof(T)));
            HANDLE_ERROR(cudaMemcpy(gpuKernel0, &kernel[0], kwidth * sizeof(T), cudaMemcpyHostToDevice));

            // perform the gaussian blur as a separable convolution
            // (the same 1D kernel is used for both passes)
            stim::cuda::gpu_conv2sep(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth);

            HANDLE_ERROR(cudaFree(gpuKernel0));
        }

        /// Applies a Gaussian blur to a 2D image stored on the CPU
        ///
        /// Builds a Gaussian kernel on the host and passes it as both 1D kernels
        /// to stim::cuda::cpu_conv2sep. The whole call, kernel construction
        /// included, is bracketed by gpuTimer_start() / gpuTimer_end().
        ///
        /// @param image     host image buffer handed to cpu_conv2sep
        /// @param sigma     standard deviation of the Gaussian
        /// @param x, y      image dimensions forwarded to cpu_conv2sep
        ///                  (presumably width and height - confirm)
        /// @param gpu_time  receives the value returned by gpuTimer_end()
        ///                  (units are defined by the timer helper - confirm)
        template<typename T>
        void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y, float &gpu_time)
        {
            gpuTimer_start();

            // The vector owns the host kernel and frees it when the function returns.
            std::vector<T> kernel = make_gaussian_kernel(sigma);
            unsigned int kwidth = static_cast<unsigned int>(kernel.size());
            T* kernel0 = &kernel[0];

            // perform the gaussian blur as a separable convolution
            // (the same 1D kernel is used for both passes)
            stim::cuda::cpu_conv2sep(image, x, y, kernel0, kwidth, kernel0, kwidth);

            gpu_time = gpuTimer_end();
        }

    }
}

#endif