#ifndef STIM_CUDA_SEPCONV2_H #define STIM_CUDA_SEPCONV2_H #include #ifdef __CUDACC__ #include #include #endif namespace stim { #ifdef __CUDACC__ //Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device. //@param out is a pointer to the output image //@param in is a pointer to the input image //@param sx is the size of the input image along X //@param sy is the size of the input image along Y //@param kx is the size of the kernel along X //@param ky is the size of the kernel along Y template void gpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) { cudaDeviceProp p; HANDLE_ERROR(cudaGetDeviceProperties(&p, 0)); size_t tmax = p.maxThreadsPerBlock; dim3 nt(sqrt(tmax), sqrt(tmax)); //calculate the block dimensions size_t X = sx - kx + 1; //calculate the x size of the output image T* temp; //declare a temporary variable to store the intermediate image HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T))); //allocate memory for the intermediate image dim3 nb(X / nt.x + 1, sy / nt.y + 1); //calculate the grid dimensions size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T); //shared memory bytes required to store block data if (sm > p.sharedMemPerBlock) { std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; exit(1); } kernel_conv2 <<>> (temp, in, k0, sx, sy, kx, 1); //launch the kernel to compute the intermediate image size_t Y = sy - ky + 1; //calculate the y size of the output image nb.y = Y / nt.y + 1; //update the grid dimensions to reflect the Y-axis size of the output image sm = nt.x * (nt.y + ky - 1) * sizeof(T); //calculate the amount of shared memory needed for the second pass if (sm > p.sharedMemPerBlock) { std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl; exit(1); } kernel_conv2 <<>> (out, temp, k1, X, sy, 1, ky); //launch the kernel to compute the final image HANDLE_ERROR(cudaFree(temp)); //free memory allocated for the intermediate image } #endif //Performs a separable convolution of a 2D image. Only valid pixels based on the kernel are returned. // As a result, the output image will be smaller than the input image by (kx-1, ky-1) //@param out is a pointer to the output image //@param in is a pointer to the input image //@param k0 is the x-axis convolution filter //@param k1 is the y-axis convolution filter //@param sx is the size of the input image along X //@param sy is the size of the input image along Y //@param kx is the size of the kernel along X //@param ky is the size of the kernel along Y template void cpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) { size_t X = sx - kx + 1; //x size of the output image size_t Y = sy - ky + 1; #ifdef __CUDACC__ //allocate memory and copy everything to the GPU T* gpu_in; HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T))); HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice)); K* gpu_k0; HANDLE_ERROR(cudaMalloc(&gpu_k0, kx * sizeof(K))); HANDLE_ERROR(cudaMemcpy(gpu_k0, k0, kx * sizeof(K), cudaMemcpyHostToDevice)); K* gpu_k1; HANDLE_ERROR(cudaMalloc(&gpu_k1, ky * sizeof(K))); HANDLE_ERROR(cudaMemcpy(gpu_k1, k1, ky * sizeof(K), cudaMemcpyHostToDevice)); T* gpu_out; HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T))); gpu_sepconv2(gpu_out, gpu_in, gpu_k0, gpu_k1, sx, sy, kx, ky); //execute the GPU kernel HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost)); //copy the result to the host HANDLE_ERROR(cudaFree(gpu_in)); HANDLE_ERROR(cudaFree(gpu_k0)); HANDLE_ERROR(cudaFree(gpu_k1)); HANDLE_ERROR(cudaFree(gpu_out)); #else T* temp = (T*)malloc(X * sy * sizeof(T)); //allocate space for the intermediate image cpu_conv2(temp, in, k0, sx, sy, kx, 1); //evaluate the intermediate image cpu_conv2(out, temp, k1, X, sy, 1, ky); //evaluate the final image free(temp); //free the memory for the intermediate image #endif } } #endif