#ifndef STIM_CUDA_CONV2SEP_H
#define STIM_CUDA_CONV2SEP_H

#include <iostream>
#include <cstring>
#include <cuda.h>
#include <cuda_runtime.h>
// NOTE(review): the original #include targets were lost (angle-bracket content
// stripped); these stim headers provide sharedMemcpy_tex2D, maxThreadsPerBlock
// and HANDLE_ERROR — confirm the exact paths against the stim library layout.
#include <stim/cuda/cudatools/devices.h>
#include <stim/cuda/cudatools/error.h>
#include <stim/cuda/sharedmem.cuh>

#define pi	3.14159

namespace stim{
namespace cuda{

	/// Convolve each image row with a 1D kernel (x-pass of a separable convolution).
	/// Expected launch layout: blocks = (ceil(x / blockDim.x), y), threads = (blockDim.x, 1).
	/// Shared memory: (blockDim.x + 2*(k0/2) + k0) * sizeof(T) bytes — the pixel
	/// window plus its apron, followed by a copy of the kernel.
	/// Precondition: blockDim.x >= k0, otherwise the kernel copy below is incomplete.
	template<typename T>
	__global__ void conv2sep_0(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
							T* kernel0, unsigned int k0){

		//generate a pointer to shared memory (size will be specified as a kernel parameter)
		extern __shared__ T s[];

		int kr = k0/2;				//calculate the kernel radius

		//get a pointer to the kernel copy in shared memory (placed after the pixel window + apron)
		T* g = (T*)&s[blockDim.x + 2 * kr];

		//calculate the start point for this block
		int bxi = blockIdx.x * blockDim.x;
		int byi = blockIdx.y;

		//copy the portion of the image necessary for this block to shared memory
		stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);

		//calculate the thread index
		int ti = threadIdx.x;

		//calculate the spatial coordinate for this thread
		int xi = bxi + ti;
		int yi = byi;

		//use the first k0 threads to transfer the kernel to shared memory
		if(ti < k0){
			g[ti] = kernel0[ti];
		}

		//make sure that all writing to shared memory is done before continuing
		__syncthreads();

		//if the current pixel is outside of the image, do not write a result
		if(xi >= x || yi >= y)
			return;

		//calculate the coordinates of the current thread in shared memory
		int si = ti + kr;

		T sum = 0;		//running weighted sum across the kernel

		//for each element of the kernel
		for(int ki = -kr; ki <= kr; ki++){
			sum += s[si + ki] * g[ki + kr];
		}

		//calculate the 1D image index for this thread
		unsigned int i = byi * x + xi;

		//output the result to global memory
		out[i] = sum;
	}

	/// Convolve each image column with a 1D kernel (y-pass of a separable convolution).
	/// Expected launch layout: blocks = (x, ceil(y / blockDim.y)), threads = (1, blockDim.y).
	/// Shared memory: (blockDim.y + 2*(k0/2) + k0) * sizeof(T) bytes.
	/// Precondition: blockDim.y >= k0.
	template<typename T>
	__global__ void conv2sep_1(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
							T* kernel0, unsigned int k0){

		//generate a pointer to shared memory (size will be specified as a kernel parameter)
		extern __shared__ T s[];

		int kr = k0/2;				//calculate the kernel radius

		//get a pointer to the kernel copy in shared memory (placed after the pixel window + apron)
		T* g = (T*)&s[blockDim.y + 2 * kr];

		//calculate the start point for this block
		int bxi = blockIdx.x;
		int byi = blockIdx.y * blockDim.y;

		//copy the portion of the image necessary for this block to shared memory
		stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);

		//calculate the thread index
		int ti = threadIdx.y;

		//calculate the spatial coordinate for this thread
		int xi = bxi;
		int yi = byi + ti;

		//use the first k0 threads to transfer the kernel to shared memory
		if(ti < k0){
			g[ti] = kernel0[ti];
		}

		//make sure that all writing to shared memory is done before continuing
		__syncthreads();

		//if the current pixel is outside of the image, do not write a result
		//(fixed off-by-one: the original tested xi > x || yi > y, which allowed an
		// out-of-bounds write at xi == x or yi == y; now matches conv2sep_0)
		if(xi >= x || yi >= y)
			return;

		//calculate the coordinates of the current thread in shared memory
		int si = ti + kr;

		T sum = 0;		//running weighted sum across the kernel

		//for each element of the kernel
		for(int ki = -kr; ki <= kr; ki++){
			sum += g[ki + kr] * s[si + ki];
		}

		//calculate the 1D image index for this thread
		unsigned int i = yi * x + xi;

		//output the result to global memory
		out[i] = sum;
	}

	/// Perform a separable 2D convolution of a device image through a texture object.
	/// out:            device buffer receiving the result (x*y elements)
	/// texObj/cuArray: texture object (and its backing array) holding the input image
	/// kernel0, k0:    device x-axis kernel and its length
	/// kernel1, k1:    device y-axis kernel and its length
	template<typename T>
	void tex_conv2sep(T* out, unsigned int x, unsigned int y,
					cudaTextureObject_t texObj, cudaArray* cuArray,
					T* kernel0, unsigned int k0,
					T* kernel1, unsigned int k1){

		//get the maximum number of threads per block for the CUDA device
		int max_threads = stim::maxThreadsPerBlock();
		dim3 threads(max_threads, 1);

		//calculate the number of blocks
		dim3 blocks(x / threads.x + 1, y);

		//calculate the shared memory used by the x-pass kernel
		unsigned int pixel_bytes = max_threads * sizeof(T);		//bytes devoted to pixel data being processed
		unsigned int apron_bytes = k0/2 * sizeof(T);			//bytes devoted to the apron on each side of the window
		unsigned int gaussian_bytes = k0 * sizeof(T);			//bytes devoted to memory used to store the pre-computed kernel
		unsigned int shared_bytes = pixel_bytes + 2 * apron_bytes + gaussian_bytes;	//total number of bytes of shared memory used

		//blur the image along the x-axis
		conv2sep_0 <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel0, k0);

		// Copy the x-blurred data from global memory back into the texture's array
		// NOTE(review): cudaMemcpyToArray is deprecated since CUDA 10.1 — consider
		// cudaMemcpy2DToArray once the minimum supported toolkit allows it.
		cudaMemcpyToArray(cuArray, 0, 0, out, x * y * sizeof(T), cudaMemcpyDeviceToDevice);

		//transpose the block and thread dimensions
		threads.x = 1;
		threads.y = max_threads;
		blocks.x = x;
		blocks.y = y / threads.y + 1;

		//recompute the shared memory requirement for the y-pass, which uses kernel1
		//(the original reused the k0-based size, under-allocating whenever k1 > k0)
		apron_bytes = k1/2 * sizeof(T);
		gaussian_bytes = k1 * sizeof(T);
		shared_bytes = pixel_bytes + 2 * apron_bytes + gaussian_bytes;

		//blur the image along the y-axis
		conv2sep_1 <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel1, k1);
	}

	/// Applies a separable convolution to a 2D image already stored on the GPU.
	/// image is both the input and the output (convolved in place, x*y elements).
	template<typename T>
	void gpu_conv2sep(T* image, unsigned int x, unsigned int y,
					T* kernel0, unsigned int k0,
					T* kernel1, unsigned int k1){

		//get the number of pixels in the image
		unsigned int pixels = x * y;
		unsigned int bytes = sizeof(T) * pixels;

		// Allocate CUDA array in device memory
		//define a channel descriptor for a single 32-bit float channel
		//NOTE(review): this descriptor hard-codes 32-bit float, so T must be float — confirm callers
		cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
		cudaArray* cuArray;									//declare the cuda array
		cudaMallocArray(&cuArray, &channelDesc, x, y);		//allocate the cuda array

		// Copy the image data from global memory to the array
		cudaMemcpyToArray(cuArray, 0, 0, image, bytes, cudaMemcpyDeviceToDevice);

		// Specify texture
		struct cudaResourceDesc resDesc;				//create a resource descriptor
		memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
		resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
		resDesc.res.array.array = cuArray;				//add a pointer to the cuda array

		// Specify texture object parameters
		struct cudaTextureDesc texDesc;					//create a texture descriptor
		memset(&texDesc, 0, sizeof(texDesc));			//set all values in the texture descriptor to zero
		texDesc.addressMode[0] = cudaAddressModeWrap;	//use wrapping (around the edges)
		texDesc.addressMode[1] = cudaAddressModeWrap;
		texDesc.filterMode = cudaFilterModePoint;		//use point (nearest-neighbor) sampling
		texDesc.readMode = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
		texDesc.normalizedCoords = 0;					//not using normalized coordinates

		// Create texture object
		cudaTextureObject_t texObj = 0;
		cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);

		//call the texture version of the separable convolution function
		tex_conv2sep(image, x, y, texObj, cuArray, kernel0, k0, kernel1, k1);

		//free allocated resources: destroy the texture object first, then release its
		//backing array with cudaFreeArray (the original called cudaFree on a cudaArray*,
		//which is the wrong deallocator for array allocations)
		cudaDestroyTextureObject(texObj);
		cudaFreeArray(cuArray);
	}

	/// Applies a separable convolution to a 2D image stored on the CPU.
	/// image is both the input and the output (convolved in place, x*y elements).
	/// kernel0, k0: host x-axis kernel and its length
	/// kernel1, k1: host y-axis kernel and its length
	template<typename T>
	void cpu_conv2sep(T* image, unsigned int x, unsigned int y,
					T* kernel0, unsigned int k0,
					T* kernel1, unsigned int k1){

		//get the number of pixels in the image
		unsigned int pixels = x * y;
		unsigned int bytes = sizeof(T) * pixels;

		//---------Allocate Image---------
		//allocate space on the GPU for the image
		T* gpuI0;
		HANDLE_ERROR(cudaMalloc(&gpuI0, bytes));

		//copy the image data to the GPU
		HANDLE_ERROR(cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice));

		//---------Allocate Kernel--------
		//allocate and copy the 0 (x) kernel
		T* gpuK0;
		HANDLE_ERROR(cudaMalloc(&gpuK0, k0 * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(gpuK0, kernel0, k0 * sizeof(T), cudaMemcpyHostToDevice));

		//allocate and copy the 1 (y) kernel
		T* gpuK1;
		HANDLE_ERROR(cudaMalloc(&gpuK1, k1 * sizeof(T)));
		HANDLE_ERROR(cudaMemcpy(gpuK1, kernel1, k1 * sizeof(T), cudaMemcpyHostToDevice));

		//run the GPU-based version of the algorithm
		gpu_conv2sep(gpuI0, x, y, gpuK0, k0, gpuK1, k1);

		//copy the image data from the device (now error-checked, consistent with the copies above)
		HANDLE_ERROR(cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost));

		//free allocated memory (the original leaked the two kernel buffers)
		HANDLE_ERROR(cudaFree(gpuI0));
		HANDLE_ERROR(cudaFree(gpuK0));
		HANDLE_ERROR(cudaFree(gpuK1));
	}

}	//end namespace cuda
}	//end namespace stim

#endif