#ifndef STIM_CUDA_GAUSSIAN_BLUR_H #define STIM_CUDA_GAUSSIAN_BLUR_H #include #include #include #include #include #define pi 3.14159 namespace stim{ namespace cuda{ template __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ //generate a pointer to shared memory (size will be specified as a kernel parameter) extern __shared__ T s[]; int kr = sigma * 4; //calculate the kernel radius //get a pointer to the gaussian in memory T* g = (T*)&s[blockDim.x + 2 * kr]; //calculate the start point for this block int bxi = blockIdx.x * blockDim.x; int byi = blockIdx.y; //copy the portion of the image necessary for this block to shared memory stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); //calculate the thread index and block index int ti = threadIdx.x; //calculate the spatial coordinate for this thread int xi = bxi + ti; //pre-compute the gaussian values for each kernel point T a = 1.0 / (sigma * sqrt(2 * pi)); T c = - 1.0 / (2*sigma*sigma); int ki; //use the first 2kr+1 threads to evaluate a gaussian and store the result if(ti <= 2* kr+1){ ki = ti - kr; g[ti] = a * exp((ki*ki) * c); } //make sure that all writing to shared memory is done before continuing __syncthreads(); //if the current pixel is outside of the image if(bxi + ti > x || byi > y) return; //calculate the coordinates of the current thread in shared memory int si = ti + kr; T sum = 0; //running weighted sum across the kernel //for each element of the kernel for(int ki = -kr; ki <= kr; ki++){ sum += g[ki + kr] * s[si + ki]; } //calculate the 1D image index for this thread unsigned int i = byi * x + xi; //output the result to global memory out[i] = sum; } template __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ //generate a pointer to shared memory (size will be specified as a kernel parameter) extern __shared__ T s[]; int kr = sigma * 4; //calculate the kernel radius //get a pointer to the gaussian in memory T* g = (T*)&s[blockDim.y + 2 * kr]; //calculate the start point for this block int bxi = blockIdx.x; int byi = blockIdx.y * blockDim.y; //copy the portion of the image necessary for this block to shared memory stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); //calculate the thread index and block index int ti = threadIdx.y; //calculate the spatial coordinate for this thread int yi = byi + ti; //pre-compute the gaussian values for each kernel point T a = 1.0 / (sigma * sqrt(2 * pi)); T c = - 1.0 / (2*sigma*sigma); int ki; //use the first 2kr+1 threads to evaluate a gaussian and store the result if(ti <= 2* kr+1){ ki = ti - kr; g[ti] = a * exp((ki*ki) * c); } //make sure that all writing to shared memory is done before continuing __syncthreads(); //if the current pixel is outside of the image if(bxi >= x || yi >= y) return; //calculate the coordinates of the current thread in shared memory int si = ti + kr; T sum = 0; //running weighted sum across the kernel //for each element of the kernel for(int ki = -kr; ki <= kr; ki++){ sum += g[ki + kr] * s[si + ki]; } //calculate the 1D image index for this thread unsigned int i = yi * x + bxi; //output the result to global memory out[i] = sum; } /// Applies a Gaussian blur to a 2D image stored on the GPU template void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ //get the number of pixels in the image unsigned int pixels = x * y; unsigned int bytes = sizeof(T) * pixels; // Allocate CUDA array in device memory //define a channel descriptor for a single 32-bit channel cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); cudaArray* cuArray; //declare the cuda array cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array // Copy the image data from global memory to the array cudaMemcpyToArray(cuArray, 0, 0, image, bytes, cudaMemcpyDeviceToDevice); // Specify texture struct cudaResourceDesc resDesc; //create a resource descriptor memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type resDesc.res.array.array = cuArray; //add a pointer to the cuda array // Specify texture object parameters struct cudaTextureDesc texDesc; //create a texture descriptor memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) texDesc.addressMode[1] = cudaAddressModeWrap; texDesc.filterMode = cudaFilterModePoint; //use linear filtering texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) texDesc.normalizedCoords = 0; //not using normalized coordinates // Create texture object cudaTextureObject_t texObj = 0; cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); //get the maximum number of threads per block for the CUDA device int max_threads = stim::maxThreadsPerBlock(); dim3 threads(max_threads, 1); //calculate the number of blocks dim3 blocks(x / threads.x + 1, y); //calculate the shared memory used in the kernel unsigned int pixel_bytes = max_threads * 4; //bytes devoted to pixel data being processed unsigned int apron_bytes = sigma * 8 * 4; //bytes devoted to pixels outside the window unsigned int gaussian_bytes = (sigma * 8 + 1) * 4; //bytes devoted to memory used to store the pre-computed Gaussian window unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes shared memory used //blur the image along the x-axis gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); // Copy the x-blurred data from global memory to the texture cudaMemcpyToArray(cuArray, 0, 0, image, bytes, cudaMemcpyDeviceToDevice); //transpose the block and thread dimensions threads.x = 1; threads.y = max_threads; blocks.x = x; blocks.y = y / threads.y + 1; //blur the image along the y-axis gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); //free allocated memory cudaFree(cuArray); } /// Applies a Gaussian blur to a 2D image stored on the CPU template void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ //get the number of pixels in the image unsigned int pixels = x * y; unsigned int bytes = sizeof(T) * pixels; //allocate space on the GPU T* gpuI0; cudaMalloc(&gpuI0, bytes); //copy the image data to the GPU cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); //run the GPU-based version of the algorithm gpu_gaussian_blur_2d(gpuI0, sigma, x, y); //copy the image data from the device cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); //free allocated memory cudaFree(gpuI0); } } } #endif