\documentclass[aspectratio=169,obeyspaces,spaces,hyphens,dvipsnames]{beamer}
\usepackage[utf8]{inputenc}
\usepackage{lmodern}% http://ctan.org/pkg/lm
\usepackage{minted}
\usepackage{hyperref}
\usepackage{xcolor}
\usepackage{pgfplots}
\usepackage{tikz}
\usepackage[normalem]{ulem}

\mode<presentation>
\usetheme{Bootlin}

% \signed{<text>}: finish the current paragraph with "(<text>)" pushed to the
% right-hand side by \hfil glue.
% \penalty50 permits a line break in front of the parenthesised text; the
% empty \hbox{} followed by a second \hfil keeps the text pushed right when
% that break is taken.
% \parfillskip=0pt removes the fill glue normally added at the end of a
% paragraph, and \endgraf ends the paragraph while those settings are active.
% The inner brace group keeps the parameter changes local to this macro.
\def\signed #1{{\leavevmode\unskip\nobreak\hfil\penalty50\hskip2em
  \hbox{}\nobreak\hfil(#1)
  \parfillskip=0pt \finalhyphendemerits=0 \endgraf}}

% Box register that carries the attribution from the opening of an aquote
% environment to its closing code.
\newsavebox{\mybox}
% aquote{<attribution>}: a quotation environment whose argument is stored in
% \mybox at \begin and handed to \signed just before the quotation ends.
\newenvironment{aquote}[1]{%
  \savebox{\mybox}{#1}%
  \begin{quotation}%
}{%
  \signed{\usebox{\mybox}}%
  \end{quotation}%
}

% Shared TikZ styles used by the pipeline diagrams.
% Declared with \tikzset and /.style keys; the older \tikzstyle syntax is
% deprecated in TikZ. The option lists are unchanged.
\tikzset{
  % block: drawn, white-filled rectangle with centred text of fixed width
  % and a minimum height.
  block/.style={rectangle, draw, fill=white, text width=4em, text centered,
    minimum height=3em, node distance=10em},
  % blockish: like block, but without the explicit rectangle shape and minimum
  % height, and with a shorter node distance.
  blockish/.style={draw, fill=white, text width=4em, text centered,
    node distance=8em},
  % arrow: thick line ending in a stealth arrow tip.
  arrow/.style={thick, ->, >=stealth},
}

\title{Integrating Hardware-Accelerated Video Decoding with the Display Stack}
\authors{Paul Kocialkowski}
\email{paul@bootlin.com}
\slidesurl{https://bootlin.com/pub/conferences/}
\institute{Bootlin}
\conference{Embedded Linux Conference Europe}

\begin{document}

\addtocontents{toc}{\protect\setcounter{tocdepth}{-1}}
\section{Integrating HW-Accelerated Video Decoding with the Display Stack}
\addtocontents{toc}{\protect\setcounter{tocdepth}{2}}

\begin{frame}{Paul Kocialkowski}
  \begin{itemize}
  \item Embedded Linux engineer at Bootlin
    \begin{itemize}
    \item Embedded Linux \textbf{expertise}
    \item \textbf{Development}, consulting and training
    \item Strong open-source focus
    \end{itemize}
  \item Open-source contributor
    \begin{itemize}
    \item Co-maintainer of the \textbf{cedrus} VPU driver in V4L2
    \item Contributor to the \textbf{sun4i-drm} DRM driver
    \item Developed the \textbf{displaying and rendering graphics with Linux} training
    \end{itemize}
  \item Living in \textbf{Toulouse}, south-west of France
  \end{itemize}
\end{frame}

\subsection{Outline and Introduction}

\begin{frame}{Purpose of this talk}
\begin{itemize}
\item Present our \textbf{specific use case}
  \begin{itemize}
  \item Some basics about video decoding
  \item How Linux supports dedicated hardware for it
  \item Our hardware, driver and constraints
  \end{itemize}
\item Provide an overview of \textbf{video pipeline integration}
  \begin{itemize}
  \item From source to sink
  \item With efficient use of the hardware
  \item Using the existing userspace software components
  \end{itemize}
\item Detail what went \textbf{wrong}
  \begin{itemize}
  \item Things don't always pan out in the graphics world
  \item Sharing the pain points we encountered
  \item Constructive criticism, things could be a lot worse\\
    \textit{Always look on the bright side of life}
  \end{itemize}
\end{itemize}
\end{frame}

\begin{frame}{Purpose of this talk}
\begin{center}
\includegraphics[width=0.6\linewidth]{images/good-pipeline.jpg}\\
\textit{Let's try and build a good pipeline, eh?}
\end{center}
\end{frame}

\begin{frame}{You said video decoding?}
\begin{itemize}
\item Sequences of pictures take a huge load of data to represent...
\item So we compress them using a given codec:
  \begin{itemize}
  \item Color compression: YUV sub-sampling
  \item Spatial compression: frequency-space transform (DCT) and filtering
  \item Temporal compression: multi-directional interpolation
  \item Entropy compression: Huffman coding, Arithmetic coding
  \end{itemize}
\item Add some meta-data to the mix to get the bitstream
\item Encapsulate that bitstream with other things (audio, ...) in a container
\item Then we have a reasonable amount of data for a fair result!
\end{itemize}
\vspace{1em}
\begin{center}
\includegraphics[width=0.7\linewidth]{images/video-file.pdf}
\end{center}
\end{frame}

\begin{frame}{You said hardware video decoding?}
\begin{itemize}
\item So now we need a significant number of operations to get back our frames
\item Embedded systems don't have that much CPU time to spare
\item Hardware to the rescue: fixed-function decoder block implementations
  \begin{itemize}
  \item Digest video bitstream to spit out decoded pictures
  \item Implementations are per-codec (or per-generation)
  \end{itemize}
\item Two distinct types of hardware implementations:
  \begin{itemize}
  \item \textbf{Stateful}: with an MCU to parse raw metadata from the bitstream and keep track of buffers
  \item \textbf{Stateless}: that expect parsed metadata and compressed data only
  \end{itemize}
\end{itemize}
\end{frame}

\begin{frame}{Hardware video decoding in Linux (Media/V4L2)}
  \begin{itemize}
  \item In Linux, hardware video decoders (aka VPUs) are supported in V4L2
  \item Support for stateful VPUs landed with the \textbf{V4L2 M2M} framework
    \begin{itemize}
    \item Adapted to memory-to-memory hardware
    \item Source (output) is bitstream, destination (capture) is a decoded picture
    \end{itemize}
  \item Support for stateless VPUs landed with the \textbf{Media Request API}
    \begin{itemize}
    \item Meta-data is passed in per-codec V4L2 controls
    \item Controls are synchronized with buffers under media requests
    \item Source (output) is compressed data, destination (capture) is a decoded picture
    \end{itemize}
  \item Decoded pictures are accessed:
    \begin{itemize}
    \item By the CPU through \code{mmap} on the destination buffer
    \item By other devices through \code{dma-buf} import of the destination buffer
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}{The kind of expected result}
\begin{center}
\includegraphics[width=0.6\linewidth]{images/vlc-h265-working.jpg}\\
\textit{H.265 hardware video decoding with UI integration}
\end{center}
\end{frame}

\begin{frame}{What to do with decoded pictures}

Video decoding is just the tip of the iceberg...
\begin{itemize}
\item Colorspace conversion (CSC) from YUV is often needed
\item Scaling and composition with UI are also required
\item These are awfully calculation-intensive\\
  \textit{sometimes more than CPU-based video decoding}
\item But hey, we have hardware for that too:
  \begin{itemize}
  \item The display engine usually supports all these operations via overlays/planes
  \item Sometimes there are dedicated hardware blocks too
  \item The GPU can do anything, so it can do that too (right?)
  \end{itemize}
\item Let's avoid copies and share buffers between devices\\
  \textit{full-frame memory copies are just a big no-no for performance}
\end{itemize}
\end{frame}

\subsection{Hardware video decoding on Allwinner platforms and display stack integration}

% Talk about what Allwinner SoCs are used for here.
\begin{frame}{Allwinner platforms}
\begin{center}
\includegraphics[width=0.8\linewidth]{images/sun50i-boards.jpg}\\
\textit{Community Allwinner boards from our friends at Olimex and Libre Computer}
\end{center}
\end{frame}

\begin{frame}{Our situation: the Allwinner side of things}
\begin{itemize}
\item Relevant \textbf{multimedia blocks} on Allwinner hardware:
  \begin{itemize}
  \item \textbf{Video decoder (VPU)}: fixed-function (stateless) implementation,\\
    supports \textbf{MPEG-2/H.263/Xvid/H.264/VP6/VP8}, \textbf{H.265}/VP9 on recent SoCs
  \item \textbf{Display engines}: support multiple input overlays
  \item \textbf{GPU}: Mali 400/450 in most cases
  \end{itemize}
\item First generation of devices (A10-A33) \textbf{comes with constraints}:
  \begin{itemize}
  \item VPU can only map the lowest 256 MiB of RAM
  \item VPU produces pictures in a specific tiled scan order (aka MB32)
  \item Display engine supports MB32 tiling for planes/overlay % say that we added support for it
  \end{itemize}
\item Second generation (A33-A64+) \textbf{doesn't have these constraints}:
  \begin{itemize}
  \item VPU still works with tiling internally, but untiling block is in the VPU
  \end{itemize}
\end{itemize}

\end{frame}

\begin{frame}{Allwinner MB32 tiled video format}
\begin{minipage}[t]{0.4\linewidth}
\centering
\includegraphics[width=\linewidth]{images/linear.pdf}\\
{\small\textit{Linear (raster) scan order}}
\vspace{1em}
\begin{itemize}
\item \(w\): width, \(s\): stride
\item \(h\): height
\end{itemize}
\end{minipage}
\hfill
\begin{minipage}[t]{0.4\linewidth}
\centering
\includegraphics[width=\linewidth]{images/tiled.pdf}\\
{\small\textit{MB32-tiled scan order}}
\vspace{1em}
\begin{itemize}
\item \(w_t\): tile-aligned width (stride)
\item \(h_t\): tile-aligned height
\end{itemize}
\end{minipage}
\end{frame}

\begin{frame}{Bootlin's contribution for hardware video decoding support}
\begin{itemize}
\item On the \textbf{DRM kernel} side:
  \begin{itemize}
  \item \code{DRM_FORMAT_MOD_ALLWINNER_TILED} \textbf{modifier} (merged in 5.1)
  \item sun4i-drm support for linear/tiled YUV formats in \textbf{overlay planes} (merged in 5.1)
  \end{itemize}
\item On the \textbf{V4L2 kernel} side:
  \begin{itemize}
  \item Cedrus base driver (merged in 5.1)
  \item \code{V4L2_PIX_FMT_SUNXI_TILED_NV12} pixel format (merged in 5.1)
  \item Experimental stateless \textbf{MPEG-2} API and cedrus support (merged in 5.1)
  \item Experimental stateless \textbf{H.264} API and cedrus support (merged in 5.3)
  \item Experimental stateless \textbf{H.265} API and cedrus support (to be merged in 5.5)
  \end{itemize}
\item On the \textbf{userspace} side:
  \begin{itemize}
  \item A test utility: \textbf{v4l2-request-test}\\
    \url{https://github.com/bootlin/v4l2-request-test}
  \item A VAAPI back-end: \textbf{libva-v4l2-request}\\
    \url{https://github.com/bootlin/libva-v4l2-request}
  \end{itemize}
\end{itemize}
\end{frame}

\subsection{Investigated and/or implemented setups}

\begin{frame}{Bare-metal pipeline setup}

\begin{itemize}
\item Test scenario: \textbf{standalone dedicated application} (\code{v4l2-request-test})
\item Talks to the kernel directly (both V4L2 and DRM)
\item Uses dma-buf for zero-copy
  \begin{itemize}
  \item Exported from V4L2 with the \code{VIDIOC_EXPBUF} ioctl
  \item Imported to DRM with the \code{DRM_IOCTL_PRIME_FD_TO_HANDLE} ioctl
  \end{itemize}
\item CSC, scaling and composition offloaded using DRM planes
\item Bottom line: \textbf{all is well} but very limited use-case (testing)
\end{itemize}

\begin{center}
Pipeline components overview:\\
\vspace{0.5em}
% Pipeline diagram: V4L2 -> v4l2-request-test -> DRM.
% Each node is placed to the right of the previous one and the arrows follow
% the same order; nodes use the blockish style and arrows the arrow style
% defined in the preamble. The picture and its nodes are scaled to 0.85.
\begin{tikzpicture}[shorten >=1pt, auto, semithick, scale=0.85, every node/.style={scale=0.85}]
\node [blockish] (v4l2) {V4L2};
% Wider text box so the long utility name fits.
\node [blockish] (v4l2-request-test) [right of=v4l2, text width=8em] {v4l2-request-test};
\node [blockish] (drm) [right of=v4l2-request-test] {DRM};

\draw [arrow] (v4l2) -- (v4l2-request-test);
\draw [arrow] (v4l2-request-test) -- (drm);

% Disabled leftover nodes: they refer to nodes (anchor-bitstream, bitstream)
% that this picture does not define.
%\node (anchor-frame) [right of=anchor-bitstream, node distance=10em] {video frame};
%\node (etapes) [text width=6em, text centered, node distance=8em, left of=bitstream] {Steps};
\end{tikzpicture}
\end{center}
\end{frame}

\begin{frame}{X.org pipeline setup (GPU-less): investigation}
\begin{itemize}
\item Scenario: usual media players (using VAAPI) under X
\item Can we use a similar setup (dma-buf to DRM plane) under X?
  \begin{itemize}
  \item X initially only knows about RGB formats
  \item But extensions exist: Xv, DRI3
  \end{itemize}
\item Xv extension allows supporting YUV and scaling, but...
  \begin{itemize}
  \item Requires writing a hardware-specific DDX (e.g. to use planes)
  \item Requires a buffer copy and doesn't support modifiers
  \item Has synchronization issues and is deprecated anyway (in favor of GL)
  \end{itemize}
\item DRI3 supposedly can solve these points:
  \begin{itemize}
  \item Supports dma-buf import (but no modifier support)
  \item Currently apparently only implemented in glamor (GPU-backed)
  \item Doesn't give us access to DRM planes
  \end{itemize}
\end{itemize}
\end{frame}

\begin{frame}{X.org pipeline setup (GPU-less): bottom line}
\begin{itemize}
\item Scenario: usual media players (using VAAPI) under X
\item What worked:
  \begin{itemize}
  \item Software untiling (NEON-accelerated) in VAAPI back-end
  \item Software-based CSC, scaling and composition
  \item Buffer copies through XCB
  \end{itemize}
\item As a result, performance sucks\\
  \textit{still surprisingly good without scaling involved}
\end{itemize}

\begin{center}
Pipeline components overview:\\
\vspace{0.5em}
% Pipeline diagram: V4L2 -> VAAPI -> FFmpeg -> VLC -> X.org (XCB).
% Each node is placed to the right of the previous one and the arrows follow
% the same order; nodes use the blockish style and arrows the arrow style
% defined in the preamble. The picture and its nodes are scaled to 0.85.
\begin{tikzpicture}[shorten >=1pt, auto, semithick, scale=0.85, every node/.style={scale=0.85}]
\node [blockish] (v4l2) {V4L2};
\node [blockish] (vaapi) [right of=v4l2] {VAAPI};
\node [blockish] (ffmpeg) [right of=vaapi] {FFmpeg};
\node [blockish] (vlc) [right of=ffmpeg] {VLC};
\node [blockish] (xorg) [right of=vlc] {X.org (XCB)};

\draw [arrow] (v4l2) -- (vaapi);
\draw [arrow] (vaapi) -- (ffmpeg);
\draw [arrow] (ffmpeg) -- (vlc);
\draw [arrow] (vlc) -- (xorg);

% Disabled leftover nodes: they refer to nodes (anchor-bitstream, bitstream)
% that this picture does not define.
%\node (anchor-frame) [right of=anchor-bitstream, node distance=10em] {video frame};
%\node (etapes) [text width=6em, text centered, node distance=8em, left of=bitstream] {Steps};
\end{tikzpicture}
\end{center}
\end{frame}

\begin{frame}{Improving the X.org pipeline with a GPU in the mix}
\begin{itemize}
\item Using the GPU shall speed things up
  \begin{itemize}
  \item Requires using the \code{xf86-video-armsoc} DDX
  \item Only accelerates rendering, not composition using GL (glamor)
  \end{itemize}
\item First try: importing YUV with the GPU and untiling
  \begin{itemize}
  \item Lack of/undocumented blob support for YUV format
  \item Zero-copy (dma-buf) import supported by the blob only for RGB formats
  \end{itemize}
\item Second try: importing as 8-bit component (luminance) and untiling
  \begin{itemize}
  \item Wrote an untiling shader that just works on Intel GPUs
  \item Zero-copy (dma-buf) not supported for \code{GL_LUMINANCE}
  \item Copy import (\code{glTexImage2D}) for \code{GL_LUMINANCE} failed\\
    \textit{apparently a weird undocumented issue due to Mali constraints}
  \item Untiling shader never worked with the Mali (tl;dr)
  \end{itemize}
\item Bottom line:
  \begin{itemize}
  \item GPU didn't help, for reasons we can't fix
  \item Perhaps a free driver (Lima) would help?
  \end{itemize}
\end{itemize}
\end{frame}

\begin{frame}{But what about Wayland?}

\begin{itemize}
\item Didn't investigate/implement at the time of the project
\item Wayland's relationship with DRM planes:
  \begin{itemize}
  \item Planes are not exposed to applications
  \item But might be used by the compositor internally
  \end{itemize}
\item Zero-copy buffer import from devices:
  \begin{itemize}
  \item Exposed with the \code{linux-dmabuf} extension, \code{zwp_linux_dmabuf_v1} interface
  \item Modifiers are supported by the protocol
  \item \code{libweston} implementation calls \code{EGL_EXT_image_dma_buf_import_modifiers}
  \item Requires GPU hardware support for the modifier
  \end{itemize}
\item Bottom line: unusable for our (GPU-less) use case
\end{itemize}

\end{frame}

\begin{frame}{Kodi pipeline}
\begin{itemize}
\item Kodi (media center) relies on GPU support, compatible with Mali blob
\item Kodi supports the GBM EGL back-end
  \begin{itemize}
  \item Allows using GL with DRM as output surface
  \item Used for drawing the UI
  \item Video CSC/scaling/composition uses a plane directly
  \item Supports dma-buf import from FFmpeg
  \end{itemize}
\item Required plumbing to get it to work:
  \begin{itemize}
  \item FFmpeg hwaccel support to use our V4L2-exposed codec (through VAAPI)
  \end{itemize}
\item Bottom line: it works great!
\end{itemize}

\begin{center}
Pipeline components overview:\\
\vspace{0.5em}
% Pipeline diagram: V4L2 -> VAAPI -> FFmpeg -> Kodi -> DRM.
% Each node is placed to the right of the previous one and the arrows follow
% the same order; nodes use the blockish style and arrows the arrow style
% defined in the preamble. The picture and its nodes are scaled to 0.85.
\begin{tikzpicture}[shorten >=1pt, auto, semithick, scale=0.85, every node/.style={scale=0.85}]
\node [blockish] (v4l2) {V4L2};
\node [blockish] (vaapi) [right of=v4l2] {VAAPI};
\node [blockish] (ffmpeg) [right of=vaapi] {FFmpeg};
\node [blockish] (kodi) [right of=ffmpeg] {Kodi};
\node [blockish] (drm) [right of=kodi] {DRM};

\draw [arrow] (v4l2) -- (vaapi);
\draw [arrow] (vaapi) -- (ffmpeg);
\draw [arrow] (ffmpeg) -- (kodi);
\draw [arrow] (kodi) -- (drm);

% Disabled leftover nodes: they refer to nodes (anchor-bitstream, bitstream)
% that this picture does not define.
%\node (anchor-frame) [right of=anchor-bitstream, node distance=10em] {video frame};
%\node (etapes) [text width=6em, text centered, node distance=8em, left of=bitstream] {Steps};
\end{tikzpicture}
\end{center}

\end{frame}

\begin{frame}{General takeaway}
\begin{itemize}
\item Planes support is never exposed to applications\\
  \textit{at best supported and hidden by the compositor}
\item Modifier support is still very rare in userspace
\item Strong incentive all around the userspace stack to use GL\\
  \textit{the unified way to integrate graphics}
\item But GPU support does not always solve the issue:
  \begin{itemize}
  \item Life's much harder when it's a proprietary blob
  \item Lack of usable dma-buf import support
  \item Bugs and limitations
  \end{itemize}
\item Some projects try to make use of planes easier:
  \begin{itemize}
  \item \textbf{libliftoff}, \textbf{liboutput}
  \item Microchip's \textbf{Ensemble Graphics Toolkit}: \url{https://ensemble.graphics/}
  \end{itemize}
\end{itemize}
\end{frame}

\questionslide

\end{document}
